Merge tag 'xfs-for-linus-v3.12-rc1' of git://oss.sgi.com/xfs/xfs

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c

index f3900427ffab5173ee05041a7375b9633312e835..87ba7cf99cd754590ffaf2f038926ae9098f4ca9 100644 (file)
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -620,12 +620,16 @@ spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
                 case Opt_uid:
                         if (match_int(&args[0], &option))
                                 return 0;
-                       root->i_uid = option;
+                       root->i_uid = make_kuid(current_user_ns(), option);
+                       if (!uid_valid(root->i_uid))
+                               return 0;
                         break;
                 case Opt_gid:
                         if (match_int(&args[0], &option))
                                 return 0;
-                       root->i_gid = option;
+                       root->i_gid = make_kgid(current_user_ns(), option);
+                       if (!gid_valid(root->i_gid))
+                               return 0;
                         break;
                 case Opt_mode:
                         if (match_octal(&args[0], &option))
diff --git a/fs/quota/quota.c b/fs/quota/quota.c

index c7314f1771f5824be95fffb4cd27695494c767ff..dea86e8967ee2c354d489384a83d1492b1dbcba3 100644 (file)
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -27,6 +27,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
         case Q_SYNC:
         case Q_GETINFO:
         case Q_XGETQSTAT:
+       case Q_XGETQSTATV:
         case Q_XQUOTASYNC:
                 break;
         /* allow to query information for dquots we "own" */
@@ -217,6 +218,31 @@ static int quota_getxstate(struct super_block *sb, void __user *addr)
         return ret;
  }
  
+static int quota_getxstatev(struct super_block *sb, void __user *addr)
+{
+       struct fs_quota_statv fqs;
+       int ret;
+
+       if (!sb->s_qcop->get_xstatev)
+               return -ENOSYS;
+
+       memset(&fqs, 0, sizeof(fqs));
+       if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */
+               return -EFAULT;
+
+       /* If this kernel doesn't support user specified version, fail */
+       switch (fqs.qs_version) {
+       case FS_QSTATV_VERSION1:
+               break;
+       default:
+               return -EINVAL;
+       }
+       ret = sb->s_qcop->get_xstatev(sb, &fqs);
+       if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+               return -EFAULT;
+       return ret;
+}
+
  static int quota_setxquota(struct super_block *sb, int type, qid_t id,
                            void __user *addr)
  {
@@ -293,6 +319,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
                 return quota_setxstate(sb, cmd, addr);
         case Q_XGETQSTAT:
                 return quota_getxstate(sb, addr);
+       case Q_XGETQSTATV:
+               return quota_getxstatev(sb, addr);
         case Q_XSETQLIM:
                 return quota_setxquota(sb, type, id, addr);
         case Q_XGETQUOTA:
@@ -317,6 +345,7 @@ static int quotactl_cmd_write(int cmd)
         case Q_GETINFO:
         case Q_SYNC:
         case Q_XGETQSTAT:
+       case Q_XGETQSTATV:
         case Q_XGETQUOTA:
         case Q_XQUOTASYNC:
                 return 0;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile

index 4a4508023a3c15724a2edfd87204550f718c805b..0719e4db93f274de9af844f3ef66ce387b02a716 100644 (file)
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -27,9 +27,12 @@ xfs-y                                += xfs_trace.o
  
  # highlevel code
  xfs-y                          += xfs_aops.o \
+                                  xfs_attr_inactive.o \
+                                  xfs_attr_list.o \
                                    xfs_bit.o \
+                                  xfs_bmap_util.o \
                                    xfs_buf.o \
-                                  xfs_dfrag.o \
+                                  xfs_dir2_readdir.o \
                                    xfs_discard.o \
                                    xfs_error.o \
                                    xfs_export.o \
@@ -44,11 +47,11 @@ xfs-y                               += xfs_aops.o \
                                    xfs_iops.o \
                                    xfs_itable.o \
                                    xfs_message.o \
+                                  xfs_mount.o \
                                    xfs_mru_cache.o \
-                                  xfs_rename.o \
                                    xfs_super.o \
-                                  xfs_utils.o \
-                                  xfs_vnodeops.o \
+                                  xfs_symlink.o \
+                                  xfs_trans.o \
                                    xfs_xattr.o \
                                    kmem.o \
                                    uuid.o
@@ -73,10 +76,13 @@ xfs-y                               += xfs_alloc.o \
                                    xfs_ialloc_btree.o \
                                    xfs_icreate_item.o \
                                    xfs_inode.o \
+                                  xfs_inode_fork.o \
+                                  xfs_inode_buf.o \
                                    xfs_log_recover.o \
-                                  xfs_mount.o \
-                                  xfs_symlink.o \
-                                  xfs_trans.o
+                                  xfs_log_rlimit.o \
+                                  xfs_sb.o \
+                                  xfs_symlink_remote.o \
+                                  xfs_trans_resv.o
  
  # low-level transaction/log code
  xfs-y                          += xfs_log.o \
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c

index 306d883d89bc7d6420ca4b5b8c5f848e573249bf..69518960b2ba17e888d71f75d30e4df57d0cca73 100644 (file)
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,11 +16,13 @@
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
  #include "xfs_acl.h"
  #include "xfs_attr.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_inode.h"
-#include "xfs_vnodeops.h"
+#include "xfs_ag.h"
  #include "xfs_sb.h"
  #include "xfs_mount.h"
  #include "xfs_trace.h"
@@ -68,14 +70,15 @@ xfs_acl_from_disk(
  
                 switch (acl_e->e_tag) {
                 case ACL_USER:
+                       acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+                       break;
                 case ACL_GROUP:
-                       acl_e->e_id = be32_to_cpu(ace->ae_id);
+                       acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
                         break;
                 case ACL_USER_OBJ:
                 case ACL_GROUP_OBJ:
                 case ACL_MASK:
                 case ACL_OTHER:
-                       acl_e->e_id = ACL_UNDEFINED_ID;
                         break;
                 default:
                         goto fail;
@@ -101,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
                 acl_e = &acl->a_entries[i];
  
                 ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-               ace->ae_id = cpu_to_be32(acl_e->e_id);
+               switch (acl_e->e_tag) {
+               case ACL_USER:
+                       ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+                       break;
+               case ACL_GROUP:
+                       ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+                       break;
+               default:
+                       ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+                       break;
+               }
+
                 ace->ae_perm = cpu_to_be16(acl_e->e_perm);
         }
  }
@@ -360,7 +374,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
                 return -EINVAL;
         if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
                 return value ? -EACCES : 0;
-       if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+       if (!inode_owner_or_capable(inode))
                 return -EPERM;
  
         if (!value)
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h

index 317aa86d96ea04925b35e995cc70e1eb024beb28..1cb740afd674e8fd3f5ce0a3f47bfc00f8b36471 100644 (file)
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -226,59 +226,6 @@ typedef struct xfs_agfl {
         __be32          agfl_bno[];     /* actually XFS_AGFL_SIZE(mp) */
  } xfs_agfl_t;
  
-/*
- * Per-ag incore structure, copies of information in agf and agi,
- * to improve the performance of allocation group selection.
- */
-#define XFS_PAGB_NUM_SLOTS     128
-
-typedef struct xfs_perag {
-       struct xfs_mount *pag_mount;    /* owner filesystem */
-       xfs_agnumber_t  pag_agno;       /* AG this structure belongs to */
-       atomic_t        pag_ref;        /* perag reference count */
-       char            pagf_init;      /* this agf's entry is initialized */
-       char            pagi_init;      /* this agi's entry is initialized */
-       char            pagf_metadata;  /* the agf is preferred to be metadata */
-       char            pagi_inodeok;   /* The agi is ok for inodes */
-       __uint8_t       pagf_levels[XFS_BTNUM_AGF];
-                                       /* # of levels in bno & cnt btree */
-       __uint32_t      pagf_flcount;   /* count of blocks in freelist */
-       xfs_extlen_t    pagf_freeblks;  /* total free blocks */
-       xfs_extlen_t    pagf_longest;   /* longest free space */
-       __uint32_t      pagf_btreeblks; /* # of blocks held in AGF btrees */
-       xfs_agino_t     pagi_freecount; /* number of free inodes */
-       xfs_agino_t     pagi_count;     /* number of allocated inodes */
-
-       /*
-        * Inode allocation search lookup optimisation.
-        * If the pagino matches, the search for new inodes
-        * doesn't need to search the near ones again straight away
-        */
-       xfs_agino_t     pagl_pagino;
-       xfs_agino_t     pagl_leftrec;
-       xfs_agino_t     pagl_rightrec;
-#ifdef __KERNEL__
-       spinlock_t      pagb_lock;      /* lock for pagb_tree */
-       struct rb_root  pagb_tree;      /* ordered tree of busy extents */
-
-       atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
-
-       spinlock_t      pag_ici_lock;   /* incore inode cache lock */
-       struct radix_tree_root pag_ici_root;    /* incore inode cache root */
-       int             pag_ici_reclaimable;    /* reclaimable inodes */
-       struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
-       unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
-
-       /* buffer cache index */
-       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
-       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
-
-       /* for rcu-safe freeing */
-       struct rcu_head rcu_head;
-#endif
-       int             pagb_count;     /* pagb slots in use */
-} xfs_perag_t;
-
  /*
   * tags for inode radix tree
   */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c

index 71596e57283ae6b44702f8d6866405de6b911728..5a1393f5e020739a648612002f7ae9a3f704d032 100644 (file)
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -878,7 +878,7 @@ xfs_alloc_ag_vextent_near(
         xfs_agblock_t   ltnew;          /* useful start bno of left side */
         xfs_extlen_t    rlen;           /* length of returned extent */
         int             forced = 0;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
         /*
          * Randomly don't execute the first algorithm.
          */
@@ -938,8 +938,8 @@ restart:
                 xfs_extlen_t    blen=0;
                 xfs_agblock_t   bnew=0;
  
-#if defined(DEBUG) && defined(__KERNEL__)
-               if (!dofirst)
+#ifdef DEBUG
+               if (dofirst)
                         break;
  #endif
                 /*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index e11d654af786b580086f188f72881fe7416ca928..977da0ec66047eedacb3298ebec57de3a2968f0c 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,9 +28,9 @@
  #include "xfs_alloc.h"
  #include "xfs_error.h"
  #include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
  #include "xfs_trace.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include <linux/aio.h>
  #include <linux/gfp.h>
  #include <linux/mpage.h>
@@ -108,7 +108,7 @@ xfs_setfilesize_trans_alloc(
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
  
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
@@ -440,7 +440,7 @@ xfs_start_page_writeback(
                 end_page_writeback(page);
  }
  
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  {
         return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
  }
@@ -514,7 +514,7 @@ xfs_submit_ioend(
                                 goto retry;
                         }
  
-                       if (bio_add_buffer(bio, bh) != bh->b_size) {
+                       if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
                                 xfs_submit_ioend_bio(wbc, ioend, bio);
                                 goto retry;
                         }
@@ -1498,13 +1498,26 @@ xfs_vm_write_failed(
         loff_t                  pos,
         unsigned                len)
  {
-       loff_t                  block_offset = pos & PAGE_MASK;
+       loff_t                  block_offset;
         loff_t                  block_start;
         loff_t                  block_end;
         loff_t                  from = pos & (PAGE_CACHE_SIZE - 1);
         loff_t                  to = from + len;
         struct buffer_head      *bh, *head;
  
+       /*
+        * The request pos offset might be 32 or 64 bit, this is all fine
+        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
+        * platform, the high 32-bit will be masked off if we evaluate the
+        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+        * 0xfffff000 as an unsigned long, hence the result is incorrect
+        * which could cause the following ASSERT failed in most cases.
+        * In order to avoid this, we can evaluate the block_offset of the
+        * start of the page by using shifts rather than masks the mismatch
+        * problem.
+        */
+       block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
         ASSERT(block_offset + from == pos);
  
         head = page_buffers(page);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c

index 20fe3fe9d3417aabcd566ddad7719d3653ad4443..ddcf2267ffa6fdf1bcf33cf7439c6b379472c2b4 100644 (file)
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -17,10 +17,11 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
@@ -32,13 +33,13 @@
  #include "xfs_alloc.h"
  #include "xfs_inode_item.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_attr.h"
  #include "xfs_attr_leaf.h"
  #include "xfs_attr_remote.h"
  #include "xfs_error.h"
  #include "xfs_quota.h"
  #include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
  #include "xfs_trace.h"
  
  /*
@@ -62,7 +63,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
  
  /*
   * Internal routines when attribute list is more than one block.
@@ -70,7 +70,6 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
  STATIC int xfs_attr_node_get(xfs_da_args_t *args);
  STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
  STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
  STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
  STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
  
@@ -90,7 +89,7 @@ xfs_attr_name_to_xname(
         return 0;
  }
  
-STATIC int
+int
  xfs_inode_hasattr(
         struct xfs_inode        *ip)
  {
@@ -227,13 +226,14 @@ xfs_attr_set_int(
         int             valuelen,
         int             flags)
  {
-       xfs_da_args_t   args;
-       xfs_fsblock_t   firstblock;
-       xfs_bmap_free_t flist;
-       int             error, err2, committed;
-       xfs_mount_t     *mp = dp->i_mount;
-       int             rsvd = (flags & ATTR_ROOT) != 0;
-       int             local;
+       xfs_da_args_t           args;
+       xfs_fsblock_t           firstblock;
+       xfs_bmap_free_t         flist;
+       int                     error, err2, committed;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_trans_res    tres;
+       int                     rsvd = (flags & ATTR_ROOT) != 0;
+       int                     local;
  
         /*
          * Attach the dquots to the inode.
@@ -293,11 +293,11 @@ xfs_attr_set_int(
         if (rsvd)
                 args.trans->t_flags |= XFS_TRANS_RESERVE;
  
-       error = xfs_trans_reserve(args.trans, args.total,
-                                 XFS_ATTRSETM_LOG_RES(mp) +
-                                 XFS_ATTRSETRT_LOG_RES(mp) * args.total,
-                                 0, XFS_TRANS_PERM_LOG_RES,
-                                 XFS_ATTRSET_LOG_COUNT);
+       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
         if (error) {
                 xfs_trans_cancel(args.trans, 0);
                 return(error);
@@ -517,11 +517,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
         if (flags & ATTR_ROOT)
                 args.trans->t_flags |= XFS_TRANS_RESERVE;
  
-       if ((error = xfs_trans_reserve(args.trans,
-                                     XFS_ATTRRM_SPACE_RES(mp),
-                                     XFS_ATTRRM_LOG_RES(mp),
-                                     0, XFS_TRANS_PERM_LOG_RES,
-                                     XFS_ATTRRM_LOG_COUNT))) {
+       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+                                 XFS_ATTRRM_SPACE_RES(mp), 0);
+       if (error) {
                 xfs_trans_cancel(args.trans, 0);
                 return(error);
         }
@@ -611,228 +609,6 @@ xfs_attr_remove(
         return xfs_attr_remove_int(dp, &xname, flags);
  }
  
-int
-xfs_attr_list_int(xfs_attr_list_context_t *context)
-{
-       int error;
-       xfs_inode_t *dp = context->dp;
-
-       XFS_STATS_INC(xs_attr_list);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return EIO;
-
-       xfs_ilock(dp, XFS_ILOCK_SHARED);
-
-       /*
-        * Decide on what work routines to call based on the inode size.
-        */
-       if (!xfs_inode_hasattr(dp)) {
-               error = 0;
-       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               error = xfs_attr_shortform_list(context);
-       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               error = xfs_attr_leaf_list(context);
-       } else {
-               error = xfs_attr_node_list(context);
-       }
-
-       xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
-       return error;
-}
-
-#define        ATTR_ENTBASESIZE                /* minimum bytes used by an attr */ \
-       (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define        ATTR_ENTSIZE(namelen)           /* actual bytes used by an attr */ \
-       ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
-        & ~(sizeof(u_int32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_attr_put_listent(
-       xfs_attr_list_context_t *context,
-       int             flags,
-       unsigned char   *name,
-       int             namelen,
-       int             valuelen,
-       unsigned char   *value)
-{
-       struct attrlist *alist = (struct attrlist *)context->alist;
-       attrlist_ent_t *aep;
-       int arraytop;
-
-       ASSERT(!(context->flags & ATTR_KERNOVAL));
-       ASSERT(context->count >= 0);
-       ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
-       ASSERT(context->firstu >= sizeof(*alist));
-       ASSERT(context->firstu <= context->bufsize);
-
-       /*
-        * Only list entries in the right namespace.
-        */
-       if (((context->flags & ATTR_SECURE) == 0) !=
-           ((flags & XFS_ATTR_SECURE) == 0))
-               return 0;
-       if (((context->flags & ATTR_ROOT) == 0) !=
-           ((flags & XFS_ATTR_ROOT) == 0))
-               return 0;
-
-       arraytop = sizeof(*alist) +
-                       context->count * sizeof(alist->al_offset[0]);
-       context->firstu -= ATTR_ENTSIZE(namelen);
-       if (context->firstu < arraytop) {
-               trace_xfs_attr_list_full(context);
-               alist->al_more = 1;
-               context->seen_enough = 1;
-               return 1;
-       }
-
-       aep = (attrlist_ent_t *)&context->alist[context->firstu];
-       aep->a_valuelen = valuelen;
-       memcpy(aep->a_name, name, namelen);
-       aep->a_name[namelen] = 0;
-       alist->al_offset[context->count++] = context->firstu;
-       alist->al_count = context->count;
-       trace_xfs_attr_list_add(context);
-       return 0;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths.  Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
-       xfs_inode_t     *dp,
-       char            *buffer,
-       int             bufsize,
-       int             flags,
-       attrlist_cursor_kern_t *cursor)
-{
-       xfs_attr_list_context_t context;
-       struct attrlist *alist;
-       int error;
-
-       /*
-        * Validate the cursor.
-        */
-       if (cursor->pad1 || cursor->pad2)
-               return(XFS_ERROR(EINVAL));
-       if ((cursor->initted == 0) &&
-           (cursor->hashval || cursor->blkno || cursor->offset))
-               return XFS_ERROR(EINVAL);
-
-       /*
-        * Check for a properly aligned buffer.
-        */
-       if (((long)buffer) & (sizeof(int)-1))
-               return XFS_ERROR(EFAULT);
-       if (flags & ATTR_KERNOVAL)
-               bufsize = 0;
-
-       /*
-        * Initialize the output buffer.
-        */
-       memset(&context, 0, sizeof(context));
-       context.dp = dp;
-       context.cursor = cursor;
-       context.resynch = 1;
-       context.flags = flags;
-       context.alist = buffer;
-       context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
-       context.firstu = context.bufsize;
-       context.put_listent = xfs_attr_put_listent;
-
-       alist = (struct attrlist *)context.alist;
-       alist->al_count = 0;
-       alist->al_more = 0;
-       alist->al_offset[0] = context.bufsize;
-
-       error = xfs_attr_list_int(&context);
-       ASSERT(error >= 0);
-       return error;
-}
-
-int                                                            /* error */
-xfs_attr_inactive(xfs_inode_t *dp)
-{
-       xfs_trans_t *trans;
-       xfs_mount_t *mp;
-       int error;
-
-       mp = dp->i_mount;
-       ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
-
-       xfs_ilock(dp, XFS_ILOCK_SHARED);
-       if (!xfs_inode_hasattr(dp) ||
-           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               xfs_iunlock(dp, XFS_ILOCK_SHARED);
-               return 0;
-       }
-       xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-       if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
-                                     XFS_TRANS_PERM_LOG_RES,
-                                     XFS_ATTRINVAL_LOG_COUNT))) {
-               xfs_trans_cancel(trans, 0);
-               return(error);
-       }
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
-
-       /*
-        * No need to make quota reservations here. We expect to release some
-        * blocks, not allocate, in the common case.
-        */
-       xfs_trans_ijoin(trans, dp, 0);
-
-       /*
-        * Decide on what work routines to call based on the inode size.
-        */
-       if (!xfs_inode_hasattr(dp) ||
-           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               error = 0;
-               goto out;
-       }
-       error = xfs_attr3_root_inactive(&trans, dp);
-       if (error)
-               goto out;
-
-       error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
-       if (error)
-               goto out;
-
-       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-       return(error);
-
-out:
-       xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return(error);
-}
-
-
  
  /*========================================================================
   * External routines when attribute list is inside the inode
@@ -1166,28 +942,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
         return error;
  }
  
-/*
- * Copy out attribute entries for attr_list(), for leaf attribute lists.
- */
-STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
-{
-       int error;
-       struct xfs_buf *bp;
-
-       trace_xfs_attr_leaf_list(context);
-
-       context->cursor->blkno = 0;
-       error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
-       if (error)
-               return XFS_ERROR(error);
-
-       error = xfs_attr3_leaf_list_int(bp, context);
-       xfs_trans_brelse(NULL, bp);
-       return XFS_ERROR(error);
-}
-
-
  /*========================================================================
   * External routines when attribute list size > XFS_LBSIZE(mp).
   *========================================================================*/
@@ -1260,6 +1014,7 @@ restart:
                          * have been a b-tree.
                          */
                         xfs_da_state_free(state);
+                       state = NULL;
                         xfs_bmap_init(args->flist, args->firstblock);
                         error = xfs_attr3_leaf_to_node(args);
                         if (!error) {
@@ -1780,143 +1535,3 @@ xfs_attr_node_get(xfs_da_args_t *args)
         xfs_da_state_free(state);
         return(retval);
  }
-
-STATIC int                                                     /* error */
-xfs_attr_node_list(xfs_attr_list_context_t *context)
-{
-       attrlist_cursor_kern_t *cursor;
-       xfs_attr_leafblock_t *leaf;
-       xfs_da_intnode_t *node;
-       struct xfs_attr3_icleaf_hdr leafhdr;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_node_entry *btree;
-       int error, i;
-       struct xfs_buf *bp;
-
-       trace_xfs_attr_node_list(context);
-
-       cursor = context->cursor;
-       cursor->initted = 1;
-
-       /*
-        * Do all sorts of validation on the passed-in cursor structure.
-        * If anything is amiss, ignore the cursor and look up the hashval
-        * starting from the btree root.
-        */
-       bp = NULL;
-       if (cursor->blkno > 0) {
-               error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
-                                             &bp, XFS_ATTR_FORK);
-               if ((error != 0) && (error != EFSCORRUPTED))
-                       return(error);
-               if (bp) {
-                       struct xfs_attr_leaf_entry *entries;
-
-                       node = bp->b_addr;
-                       switch (be16_to_cpu(node->hdr.info.magic)) {
-                       case XFS_DA_NODE_MAGIC:
-                       case XFS_DA3_NODE_MAGIC:
-                               trace_xfs_attr_list_wrong_blk(context);
-                               xfs_trans_brelse(NULL, bp);
-                               bp = NULL;
-                               break;
-                       case XFS_ATTR_LEAF_MAGIC:
-                       case XFS_ATTR3_LEAF_MAGIC:
-                               leaf = bp->b_addr;
-                               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-                               entries = xfs_attr3_leaf_entryp(leaf);
-                               if (cursor->hashval > be32_to_cpu(
-                                               entries[leafhdr.count - 1].hashval)) {
-                                       trace_xfs_attr_list_wrong_blk(context);
-                                       xfs_trans_brelse(NULL, bp);
-                                       bp = NULL;
-                               } else if (cursor->hashval <= be32_to_cpu(
-                                               entries[0].hashval)) {
-                                       trace_xfs_attr_list_wrong_blk(context);
-                                       xfs_trans_brelse(NULL, bp);
-                                       bp = NULL;
-                               }
-                               break;
-                       default:
-                               trace_xfs_attr_list_wrong_blk(context);
-                               xfs_trans_brelse(NULL, bp);
-                               bp = NULL;
-                       }
-               }
-       }
-
-       /*
-        * We did not find what we expected given the cursor's contents,
-        * so we start from the top and work down based on the hash value.
-        * Note that start of node block is same as start of leaf block.
-        */
-       if (bp == NULL) {
-               cursor->blkno = 0;
-               for (;;) {
-                       __uint16_t magic;
-
-                       error = xfs_da3_node_read(NULL, context->dp,
-                                                     cursor->blkno, -1, &bp,
-                                                     XFS_ATTR_FORK);
-                       if (error)
-                               return(error);
-                       node = bp->b_addr;
-                       magic = be16_to_cpu(node->hdr.info.magic);
-                       if (magic == XFS_ATTR_LEAF_MAGIC ||
-                           magic == XFS_ATTR3_LEAF_MAGIC)
-                               break;
-                       if (magic != XFS_DA_NODE_MAGIC &&
-                           magic != XFS_DA3_NODE_MAGIC) {
-                               XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    context->dp->i_mount,
-                                                    node);
-                               xfs_trans_brelse(NULL, bp);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-
-                       xfs_da3_node_hdr_from_disk(&nodehdr, node);
-                       btree = xfs_da3_node_tree_p(node);
-                       for (i = 0; i < nodehdr.count; btree++, i++) {
-                               if (cursor->hashval
-                                               <= be32_to_cpu(btree->hashval)) {
-                                       cursor->blkno = be32_to_cpu(btree->before);
-                                       trace_xfs_attr_list_node_descend(context,
-                                                                        btree);
-                                       break;
-                               }
-                       }
-                       if (i == nodehdr.count) {
-                               xfs_trans_brelse(NULL, bp);
-                               return 0;
-                       }
-                       xfs_trans_brelse(NULL, bp);
-               }
-       }
-       ASSERT(bp != NULL);
-
-       /*
-        * Roll upward through the blocks, processing each leaf block in
-        * order.  As long as there is space in the result buffer, keep
-        * adding the information.
-        */
-       for (;;) {
-               leaf = bp->b_addr;
-               error = xfs_attr3_leaf_list_int(bp, context);
-               if (error) {
-                       xfs_trans_brelse(NULL, bp);
-                       return error;
-               }
-               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-               if (context->seen_enough || leafhdr.forw == 0)
-                       break;
-               cursor->blkno = leafhdr.forw;
-               xfs_trans_brelse(NULL, bp);
-               error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
-                                          &bp);
-               if (error)
-                       return error;
-       }
-       xfs_trans_brelse(NULL, bp);
-       return 0;
-}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h

index de8dd58da46c28ec078dbd5e6d829ae725e973ab..dd4824589470eb106a2b5a764da6039d56121726 100644 (file)
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -141,5 +141,14 @@ typedef struct xfs_attr_list_context {
   */
  int xfs_attr_inactive(struct xfs_inode *dp);
  int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+                unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+                unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+                 int flags, struct attrlist_cursor_kern *cursor);
+
  
  #endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c

new file mode 100644 (file)

index 0000000..bb24b07
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_remote.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+STATIC int
+xfs_attr3_leaf_freextent(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             blkno,
+       int                     blkcnt)
+{
+       struct xfs_bmbt_irec    map;
+       struct xfs_buf          *bp;
+       xfs_dablk_t             tblkno;
+       xfs_daddr_t             dblkno;
+       int                     tblkcnt;
+       int                     dblkcnt;
+       int                     nmap;
+       int                     error;
+
+       /*
+        * Roll through the "value", invalidating the attribute value's
+        * blocks.
+        */
+       tblkno = blkno;
+       tblkcnt = blkcnt;
+       while (tblkcnt > 0) {
+               /*
+                * Try to remember where we decided to put the value.
+                */
+               nmap = 1;
+               error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+                                      &map, &nmap, XFS_BMAPI_ATTRFORK);
+               if (error) {
+                       return(error);
+               }
+               ASSERT(nmap == 1);
+               ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+               /*
+                * If it's a hole, these are already unmapped
+                * so there's nothing to invalidate.
+                */
+               if (map.br_startblock != HOLESTARTBLOCK) {
+
+                       dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+                                                 map.br_startblock);
+                       dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+                                               map.br_blockcount);
+                       bp = xfs_trans_get_buf(*trans,
+                                       dp->i_mount->m_ddev_targp,
+                                       dblkno, dblkcnt, 0);
+                       if (!bp)
+                               return ENOMEM;
+                       xfs_trans_binval(*trans, bp);
+                       /*
+                        * Roll to next transaction.
+                        */
+                       error = xfs_trans_roll(trans, dp);
+                       if (error)
+                               return (error);
+               }
+
+               tblkno += map.br_blockcount;
+               tblkcnt -= map.br_blockcount;
+       }
+
+       return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_attr_inactive_list *list;
+       struct xfs_attr_inactive_list *lp;
+       int                     error;
+       int                     count;
+       int                     size;
+       int                     tmp;
+       int                     i;
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+       /*
+        * Count the number of "remote" value extents.
+        */
+       count = 0;
+       entry = xfs_attr3_leaf_entryp(leaf);
+       for (i = 0; i < ichdr.count; entry++, i++) {
+               if (be16_to_cpu(entry->nameidx) &&
+                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       if (name_rmt->valueblk)
+                               count++;
+               }
+       }
+
+       /*
+        * If there are no "remote" values, we're done.
+        */
+       if (count == 0) {
+               xfs_trans_brelse(*trans, bp);
+               return 0;
+       }
+
+       /*
+        * Allocate storage for a list of all the "remote" value extents.
+        */
+       size = count * sizeof(xfs_attr_inactive_list_t);
+       list = kmem_alloc(size, KM_SLEEP);
+
+       /*
+        * Identify each of the "remote" value extents.
+        */
+       lp = list;
+       entry = xfs_attr3_leaf_entryp(leaf);
+       for (i = 0; i < ichdr.count; entry++, i++) {
+               if (be16_to_cpu(entry->nameidx) &&
+                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       if (name_rmt->valueblk) {
+                               lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+                               lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+                                                   be32_to_cpu(name_rmt->valuelen));
+                               lp++;
+                       }
+               }
+       }
+       xfs_trans_brelse(*trans, bp);   /* unlock for trans. in freextent() */
+
+       /*
+        * Invalidate each of the "remote" value extents.
+        */
+       error = 0;
+       for (lp = list, i = 0; i < count; i++, lp++) {
+               tmp = xfs_attr3_leaf_freextent(trans, dp,
+                               lp->valueblk, lp->valuelen);
+
+               if (error == 0)
+                       error = tmp;    /* save only the 1st errno */
+       }
+
+       kmem_free(list);
+       return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+       struct xfs_trans **trans,
+       struct xfs_inode *dp,
+       struct xfs_buf  *bp,
+       int             level)
+{
+       xfs_da_blkinfo_t *info;
+       xfs_da_intnode_t *node;
+       xfs_dablk_t child_fsb;
+       xfs_daddr_t parent_blkno, child_blkno;
+       int error, i;
+       struct xfs_buf *child_bp;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr ichdr;
+
+       /*
+        * Since this code is recursive (gasp!) we must protect ourselves.
+        */
+       if (level > XFS_DA_NODE_MAXDEPTH) {
+               xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
+               return XFS_ERROR(EIO);
+       }
+
+       node = bp->b_addr;
+       xfs_da3_node_hdr_from_disk(&ichdr, node);
+       parent_blkno = bp->b_bn;
+       if (!ichdr.count) {
+               xfs_trans_brelse(*trans, bp);
+               return 0;
+       }
+       btree = xfs_da3_node_tree_p(node);
+       child_fsb = be32_to_cpu(btree[0].before);
+       xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
+
+       /*
+        * If this is the node level just above the leaves, simply loop
+        * over the leaves removing all of them.  If this is higher up
+        * in the tree, recurse downward.
+        */
+       for (i = 0; i < ichdr.count; i++) {
+               /*
+                * Read the subsidiary block to see what we have to work with.
+                * Don't do this in a transaction.  This is a depth-first
+                * traversal of the tree so we may deal with many blocks
+                * before we come back to this one.
+                */
+               error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
+                                               XFS_ATTR_FORK);
+               if (error)
+                       return(error);
+               if (child_bp) {
+                                               /* save for re-read later */
+                       child_blkno = XFS_BUF_ADDR(child_bp);
+
+                       /*
+                        * Invalidate the subtree, however we have to.
+                        */
+                       info = child_bp->b_addr;
+                       switch (info->magic) {
+                       case cpu_to_be16(XFS_DA_NODE_MAGIC):
+                       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+                               error = xfs_attr3_node_inactive(trans, dp,
+                                                       child_bp, level + 1);
+                               break;
+                       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+                       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+                               error = xfs_attr3_leaf_inactive(trans, dp,
+                                                       child_bp);
+                               break;
+                       default:
+                               error = XFS_ERROR(EIO);
+                               xfs_trans_brelse(*trans, child_bp);
+                               break;
+                       }
+                       if (error)
+                               return error;
+
+                       /*
+                        * Remove the subsidiary block from the cache
+                        * and from the log.
+                        */
+                       error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+                               &child_bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+                       xfs_trans_binval(*trans, child_bp);
+               }
+
+               /*
+                * If we're not done, re-read the parent to get the next
+                * child block number.
+                */
+               if (i + 1 < ichdr.count) {
+                       error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
+                                                &bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+                       child_fsb = be32_to_cpu(btree[i + 1].before);
+                       xfs_trans_brelse(*trans, bp);
+               }
+               /*
+                * Atomically commit the whole invalidate stuff.
+                */
+               error = xfs_trans_roll(trans, dp);
+               if (error)
+                       return  error;
+       }
+
+       return 0;
+}
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr3_root_inactive(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp)
+{
+       struct xfs_da_blkinfo   *info;
+       struct xfs_buf          *bp;
+       xfs_daddr_t             blkno;
+       int                     error;
+
+       /*
+        * Read block 0 to see what we have to work with.
+        * We only get here if we have extents, since we remove
+        * the extents in reverse order the extent containing
+        * block 0 must still be there.
+        */
+       error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+       if (error)
+               return error;
+       blkno = bp->b_bn;
+
+       /*
+        * Invalidate the tree, even if the "tree" is only a single leaf block.
+        * This is a depth-first traversal!
+        */
+       info = bp->b_addr;
+       switch (info->magic) {
+       case cpu_to_be16(XFS_DA_NODE_MAGIC):
+       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+               error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+               break;
+       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+               error = xfs_attr3_leaf_inactive(trans, dp, bp);
+               break;
+       default:
+               error = XFS_ERROR(EIO);
+               xfs_trans_brelse(*trans, bp);
+               break;
+       }
+       if (error)
+               return error;
+
+       /*
+        * Invalidate the incore copy of the root block.
+        */
+       error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+       if (error)
+               return error;
+       xfs_trans_binval(*trans, bp);   /* remove from cache */
+       /*
+        * Commit the invalidate and start the next transaction.
+        */
+       error = xfs_trans_roll(trans, dp);
+
+       return error;
+}
+
+int
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+       xfs_trans_t *trans;
+       xfs_mount_t *mp;
+       int error;
+
+       mp = dp->i_mount;
+       ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+       xfs_ilock(dp, XFS_ILOCK_SHARED);
+       if (!xfs_inode_hasattr(dp) ||
+           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               xfs_iunlock(dp, XFS_ILOCK_SHARED);
+               return 0;
+       }
+       xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+       /*
+        * Start our first transaction of the day.
+        *
+        * All future transactions during this code must be "chained" off
+        * this one via the trans_dup() call.  All transactions will contain
+        * the inode, and the inode will always be marked with trans_ihold().
+        * Since the inode will be locked in all transactions, we must log
+        * the inode in every transaction to let it float upward through
+        * the log.
+        */
+       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+       error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+       if (error) {
+               xfs_trans_cancel(trans, 0);
+               return(error);
+       }
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+       /*
+        * No need to make quota reservations here. We expect to release some
+        * blocks, not allocate, in the common case.
+        */
+       xfs_trans_ijoin(trans, dp, 0);
+
+       /*
+        * Decide on what work routines to call based on the inode size.
+        */
+       if (!xfs_inode_hasattr(dp) ||
+           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               error = 0;
+               goto out;
+       }
+       error = xfs_attr3_root_inactive(&trans, dp);
+       if (error)
+               goto out;
+
+       error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+       if (error)
+               goto out;
+
+       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+       return(error);
+
+out:
+       xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return(error);
+}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c

index b800fbcafc7f639f05a83fc97fc5e964f77422b0..86db20a9cc02b5df2dd7ecb3c44eca39d7498dd7 100644 (file)
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -22,6 +22,7 @@
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
@@ -77,16 +78,6 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
                         int *number_entries_in_blk1,
                         int *number_usedbytes_in_blk1);
  
-/*
- * Routines used for shrinking the Btree.
- */
-STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
-                                 struct xfs_buf *bp, int level);
-STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
-                                 struct xfs_buf *bp);
-STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
-                                  xfs_dablk_t blkno, int blkcnt);
-
  /*
   * Utility routines.
   */
@@ -635,7 +626,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
         xfs_attr_sf_entry_t *sfe;
         int i;
  
-       ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+       ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
         sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
         sfe = &sf->list[0];
         for (i = 0; i < sf->hdr.count;
@@ -751,182 +742,6 @@ out:
         return(error);
  }
  
-STATIC int
-xfs_attr_shortform_compare(const void *a, const void *b)
-{
-       xfs_attr_sf_sort_t *sa, *sb;
-
-       sa = (xfs_attr_sf_sort_t *)a;
-       sb = (xfs_attr_sf_sort_t *)b;
-       if (sa->hash < sb->hash) {
-               return(-1);
-       } else if (sa->hash > sb->hash) {
-               return(1);
-       } else {
-               return(sa->entno - sb->entno);
-       }
-}
-
-
-#define XFS_ISRESET_CURSOR(cursor) \
-       (!((cursor)->initted) && !((cursor)->hashval) && \
-        !((cursor)->blkno) && !((cursor)->offset))
-/*
- * Copy out entries of shortform attribute lists for attr_list().
- * Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
- * we have to calculate each entries' hashvalue and sort them before
- * we can begin returning them to the user.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
-{
-       attrlist_cursor_kern_t *cursor;
-       xfs_attr_sf_sort_t *sbuf, *sbp;
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       xfs_inode_t *dp;
-       int sbsize, nsbuf, count, i;
-       int error;
-
-       ASSERT(context != NULL);
-       dp = context->dp;
-       ASSERT(dp != NULL);
-       ASSERT(dp->i_afp != NULL);
-       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-       ASSERT(sf != NULL);
-       if (!sf->hdr.count)
-               return(0);
-       cursor = context->cursor;
-       ASSERT(cursor != NULL);
-
-       trace_xfs_attr_list_sf(context);
-
-       /*
-        * If the buffer is large enough and the cursor is at the start,
-        * do not bother with sorting since we will return everything in
-        * one buffer and another call using the cursor won't need to be
-        * made.
-        * Note the generous fudge factor of 16 overhead bytes per entry.
-        * If bufsize is zero then put_listent must be a search function
-        * and can just scan through what we have.
-        */
-       if (context->bufsize == 0 ||
-           (XFS_ISRESET_CURSOR(cursor) &&
-             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
-               for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-                       error = context->put_listent(context,
-                                          sfe->flags,
-                                          sfe->nameval,
-                                          (int)sfe->namelen,
-                                          (int)sfe->valuelen,
-                                          &sfe->nameval[sfe->namelen]);
-
-                       /*
-                        * Either search callback finished early or
-                        * didn't fit it all in the buffer after all.
-                        */
-                       if (context->seen_enough)
-                               break;
-
-                       if (error)
-                               return error;
-                       sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-               }
-               trace_xfs_attr_list_sf_all(context);
-               return(0);
-       }
-
-       /* do no more for a search callback */
-       if (context->bufsize == 0)
-               return 0;
-
-       /*
-        * It didn't all fit, so we have to sort everything on hashval.
-        */
-       sbsize = sf->hdr.count * sizeof(*sbuf);
-       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
-
-       /*
-        * Scan the attribute list for the rest of the entries, storing
-        * the relevant info from only those that match into a buffer.
-        */
-       nsbuf = 0;
-       for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-               if (unlikely(
-                   ((char *)sfe < (char *)sf) ||
-                   ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
-                       XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
-                                            XFS_ERRLEVEL_LOW,
-                                            context->dp->i_mount, sfe);
-                       kmem_free(sbuf);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-
-               sbp->entno = i;
-               sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
-               sbp->name = sfe->nameval;
-               sbp->namelen = sfe->namelen;
-               /* These are bytes, and both on-disk, don't endian-flip */
-               sbp->valuelen = sfe->valuelen;
-               sbp->flags = sfe->flags;
-               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-               sbp++;
-               nsbuf++;
-       }
-
-       /*
-        * Sort the entries on hash then entno.
-        */
-       xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
-
-       /*
-        * Re-find our place IN THE SORTED LIST.
-        */
-       count = 0;
-       cursor->initted = 1;
-       cursor->blkno = 0;
-       for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
-               if (sbp->hash == cursor->hashval) {
-                       if (cursor->offset == count) {
-                               break;
-                       }
-                       count++;
-               } else if (sbp->hash > cursor->hashval) {
-                       break;
-               }
-       }
-       if (i == nsbuf) {
-               kmem_free(sbuf);
-               return(0);
-       }
-
-       /*
-        * Loop putting entries into the user buffer.
-        */
-       for ( ; i < nsbuf; i++, sbp++) {
-               if (cursor->hashval != sbp->hash) {
-                       cursor->hashval = sbp->hash;
-                       cursor->offset = 0;
-               }
-               error = context->put_listent(context,
-                                       sbp->flags,
-                                       sbp->name,
-                                       sbp->namelen,
-                                       sbp->valuelen,
-                                       &sbp->name[sbp->namelen]);
-               if (error)
-                       return error;
-               if (context->seen_enough)
-                       break;
-               cursor->offset++;
-       }
-
-       kmem_free(sbuf);
-       return(0);
-}
-
  /*
   * Check a leaf attribute block to see if all the entries would fit into
   * a shortform attribute list.
@@ -1121,7 +936,6 @@ out:
         return error;
  }
  
-
  /*========================================================================
   * Routines used for growing the Btree.
   *========================================================================*/
@@ -1482,7 +1296,6 @@ xfs_attr3_leaf_compact(
         ichdr_dst->freemap[0].size = ichdr_dst->firstused -
                                                 ichdr_dst->freemap[0].base;
  
-
         /* write the header back to initialise the underlying buffer */
         xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
  
@@ -2643,130 +2456,6 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
         return size;
  }
  
-/*
- * Copy out attribute list entries for attr_list(), for leaf attribute lists.
- */
-int
-xfs_attr3_leaf_list_int(
-       struct xfs_buf                  *bp,
-       struct xfs_attr_list_context    *context)
-{
-       struct attrlist_cursor_kern     *cursor;
-       struct xfs_attr_leafblock       *leaf;
-       struct xfs_attr3_icleaf_hdr     ichdr;
-       struct xfs_attr_leaf_entry      *entries;
-       struct xfs_attr_leaf_entry      *entry;
-       int                             retval;
-       int                             i;
-
-       trace_xfs_attr_list_leaf(context);
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       entries = xfs_attr3_leaf_entryp(leaf);
-
-       cursor = context->cursor;
-       cursor->initted = 1;
-
-       /*
-        * Re-find our place in the leaf block if this is a new syscall.
-        */
-       if (context->resynch) {
-               entry = &entries[0];
-               for (i = 0; i < ichdr.count; entry++, i++) {
-                       if (be32_to_cpu(entry->hashval) == cursor->hashval) {
-                               if (cursor->offset == context->dupcnt) {
-                                       context->dupcnt = 0;
-                                       break;
-                               }
-                               context->dupcnt++;
-                       } else if (be32_to_cpu(entry->hashval) >
-                                       cursor->hashval) {
-                               context->dupcnt = 0;
-                               break;
-                       }
-               }
-               if (i == ichdr.count) {
-                       trace_xfs_attr_list_notfound(context);
-                       return 0;
-               }
-       } else {
-               entry = &entries[0];
-               i = 0;
-       }
-       context->resynch = 0;
-
-       /*
-        * We have found our place, start copying out the new attributes.
-        */
-       retval = 0;
-       for (; i < ichdr.count; entry++, i++) {
-               if (be32_to_cpu(entry->hashval) != cursor->hashval) {
-                       cursor->hashval = be32_to_cpu(entry->hashval);
-                       cursor->offset = 0;
-               }
-
-               if (entry->flags & XFS_ATTR_INCOMPLETE)
-                       continue;               /* skip incomplete entries */
-
-               if (entry->flags & XFS_ATTR_LOCAL) {
-                       xfs_attr_leaf_name_local_t *name_loc =
-                               xfs_attr3_leaf_name_local(leaf, i);
-
-                       retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_loc->nameval,
-                                               (int)name_loc->namelen,
-                                               be16_to_cpu(name_loc->valuelen),
-                                               &name_loc->nameval[name_loc->namelen]);
-                       if (retval)
-                               return retval;
-               } else {
-                       xfs_attr_leaf_name_remote_t *name_rmt =
-                               xfs_attr3_leaf_name_remote(leaf, i);
-
-                       int valuelen = be32_to_cpu(name_rmt->valuelen);
-
-                       if (context->put_value) {
-                               xfs_da_args_t args;
-
-                               memset((char *)&args, 0, sizeof(args));
-                               args.dp = context->dp;
-                               args.whichfork = XFS_ATTR_FORK;
-                               args.valuelen = valuelen;
-                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                       args.dp->i_mount, valuelen);
-                               retval = xfs_attr_rmtval_get(&args);
-                               if (retval)
-                                       return retval;
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               args.value);
-                               kmem_free(args.value);
-                       } else {
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               NULL);
-                       }
-                       if (retval)
-                               return retval;
-               }
-               if (context->seen_enough)
-                       break;
-               cursor->offset++;
-       }
-       trace_xfs_attr_list_leaf_end(context);
-       return retval;
-}
-
  
  /*========================================================================
   * Manage the INCOMPLETE flag in a leaf entry
@@ -3011,345 +2700,3 @@ xfs_attr3_leaf_flipflags(
  
         return error;
  }
-
-/*========================================================================
- * Indiscriminately delete the entire attribute fork
- *========================================================================*/
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-int
-xfs_attr3_root_inactive(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp)
-{
-       struct xfs_da_blkinfo   *info;
-       struct xfs_buf          *bp;
-       xfs_daddr_t             blkno;
-       int                     error;
-
-       /*
-        * Read block 0 to see what we have to work with.
-        * We only get here if we have extents, since we remove
-        * the extents in reverse order the extent containing
-        * block 0 must still be there.
-        */
-       error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
-       if (error)
-               return error;
-       blkno = bp->b_bn;
-
-       /*
-        * Invalidate the tree, even if the "tree" is only a single leaf block.
-        * This is a depth-first traversal!
-        */
-       info = bp->b_addr;
-       switch (info->magic) {
-       case cpu_to_be16(XFS_DA_NODE_MAGIC):
-       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
-               error = xfs_attr3_node_inactive(trans, dp, bp, 1);
-               break;
-       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
-       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
-               error = xfs_attr3_leaf_inactive(trans, dp, bp);
-               break;
-       default:
-               error = XFS_ERROR(EIO);
-               xfs_trans_brelse(*trans, bp);
-               break;
-       }
-       if (error)
-               return error;
-
-       /*
-        * Invalidate the incore copy of the root block.
-        */
-       error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
-       if (error)
-               return error;
-       xfs_trans_binval(*trans, bp);   /* remove from cache */
-       /*
-        * Commit the invalidate and start the next transaction.
-        */
-       error = xfs_trans_roll(trans, dp);
-
-       return error;
-}
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-STATIC int
-xfs_attr3_node_inactive(
-       struct xfs_trans **trans,
-       struct xfs_inode *dp,
-       struct xfs_buf  *bp,
-       int             level)
-{
-       xfs_da_blkinfo_t *info;
-       xfs_da_intnode_t *node;
-       xfs_dablk_t child_fsb;
-       xfs_daddr_t parent_blkno, child_blkno;
-       int error, i;
-       struct xfs_buf *child_bp;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr ichdr;
-
-       /*
-        * Since this code is recursive (gasp!) we must protect ourselves.
-        */
-       if (level > XFS_DA_NODE_MAXDEPTH) {
-               xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
-               return XFS_ERROR(EIO);
-       }
-
-       node = bp->b_addr;
-       xfs_da3_node_hdr_from_disk(&ichdr, node);
-       parent_blkno = bp->b_bn;
-       if (!ichdr.count) {
-               xfs_trans_brelse(*trans, bp);
-               return 0;
-       }
-       btree = xfs_da3_node_tree_p(node);
-       child_fsb = be32_to_cpu(btree[0].before);
-       xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
-
-       /*
-        * If this is the node level just above the leaves, simply loop
-        * over the leaves removing all of them.  If this is higher up
-        * in the tree, recurse downward.
-        */
-       for (i = 0; i < ichdr.count; i++) {
-               /*
-                * Read the subsidiary block to see what we have to work with.
-                * Don't do this in a transaction.  This is a depth-first
-                * traversal of the tree so we may deal with many blocks
-                * before we come back to this one.
-                */
-               error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
-                                               XFS_ATTR_FORK);
-               if (error)
-                       return(error);
-               if (child_bp) {
-                                               /* save for re-read later */
-                       child_blkno = XFS_BUF_ADDR(child_bp);
-
-                       /*
-                        * Invalidate the subtree, however we have to.
-                        */
-                       info = child_bp->b_addr;
-                       switch (info->magic) {
-                       case cpu_to_be16(XFS_DA_NODE_MAGIC):
-                       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
-                               error = xfs_attr3_node_inactive(trans, dp,
-                                                       child_bp, level + 1);
-                               break;
-                       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
-                       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
-                               error = xfs_attr3_leaf_inactive(trans, dp,
-                                                       child_bp);
-                               break;
-                       default:
-                               error = XFS_ERROR(EIO);
-                               xfs_trans_brelse(*trans, child_bp);
-                               break;
-                       }
-                       if (error)
-                               return error;
-
-                       /*
-                        * Remove the subsidiary block from the cache
-                        * and from the log.
-                        */
-                       error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
-                               &child_bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-                       xfs_trans_binval(*trans, child_bp);
-               }
-
-               /*
-                * If we're not done, re-read the parent to get the next
-                * child block number.
-                */
-               if (i + 1 < ichdr.count) {
-                       error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
-                                                &bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-                       child_fsb = be32_to_cpu(btree[i + 1].before);
-                       xfs_trans_brelse(*trans, bp);
-               }
-               /*
-                * Atomically commit the whole invalidate stuff.
-                */
-               error = xfs_trans_roll(trans, dp);
-               if (error)
-                       return  error;
-       }
-
-       return 0;
-}
-
-/*
- * Invalidate all of the "remote" value regions pointed to by a particular
- * leaf block.
- * Note that we must release the lock on the buffer so that we are not
- * caught holding something that the logging code wants to flush to disk.
- */
-STATIC int
-xfs_attr3_leaf_inactive(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_attr_inactive_list *list;
-       struct xfs_attr_inactive_list *lp;
-       int                     error;
-       int                     count;
-       int                     size;
-       int                     tmp;
-       int                     i;
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-       /*
-        * Count the number of "remote" value extents.
-        */
-       count = 0;
-       entry = xfs_attr3_leaf_entryp(leaf);
-       for (i = 0; i < ichdr.count; entry++, i++) {
-               if (be16_to_cpu(entry->nameidx) &&
-                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
-                       if (name_rmt->valueblk)
-                               count++;
-               }
-       }
-
-       /*
-        * If there are no "remote" values, we're done.
-        */
-       if (count == 0) {
-               xfs_trans_brelse(*trans, bp);
-               return 0;
-       }
-
-       /*
-        * Allocate storage for a list of all the "remote" value extents.
-        */
-       size = count * sizeof(xfs_attr_inactive_list_t);
-       list = kmem_alloc(size, KM_SLEEP);
-
-       /*
-        * Identify each of the "remote" value extents.
-        */
-       lp = list;
-       entry = xfs_attr3_leaf_entryp(leaf);
-       for (i = 0; i < ichdr.count; entry++, i++) {
-               if (be16_to_cpu(entry->nameidx) &&
-                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
-                       if (name_rmt->valueblk) {
-                               lp->valueblk = be32_to_cpu(name_rmt->valueblk);
-                               lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
-                                                   be32_to_cpu(name_rmt->valuelen));
-                               lp++;
-                       }
-               }
-       }
-       xfs_trans_brelse(*trans, bp);   /* unlock for trans. in freextent() */
-
-       /*
-        * Invalidate each of the "remote" value extents.
-        */
-       error = 0;
-       for (lp = list, i = 0; i < count; i++, lp++) {
-               tmp = xfs_attr3_leaf_freextent(trans, dp,
-                               lp->valueblk, lp->valuelen);
-
-               if (error == 0)
-                       error = tmp;    /* save only the 1st errno */
-       }
-
-       kmem_free(list);
-       return error;
-}
-
-/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
- */
-STATIC int
-xfs_attr3_leaf_freextent(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             blkno,
-       int                     blkcnt)
-{
-       struct xfs_bmbt_irec    map;
-       struct xfs_buf          *bp;
-       xfs_dablk_t             tblkno;
-       xfs_daddr_t             dblkno;
-       int                     tblkcnt;
-       int                     dblkcnt;
-       int                     nmap;
-       int                     error;
-
-       /*
-        * Roll through the "value", invalidating the attribute value's
-        * blocks.
-        */
-       tblkno = blkno;
-       tblkcnt = blkcnt;
-       while (tblkcnt > 0) {
-               /*
-                * Try to remember where we decided to put the value.
-                */
-               nmap = 1;
-               error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
-                                      &map, &nmap, XFS_BMAPI_ATTRFORK);
-               if (error) {
-                       return(error);
-               }
-               ASSERT(nmap == 1);
-               ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-
-               /*
-                * If it's a hole, these are already unmapped
-                * so there's nothing to invalidate.
-                */
-               if (map.br_startblock != HOLESTARTBLOCK) {
-
-                       dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
-                                                 map.br_startblock);
-                       dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
-                                               map.br_blockcount);
-                       bp = xfs_trans_get_buf(*trans,
-                                       dp->i_mount->m_ddev_targp,
-                                       dblkno, dblkcnt, 0);
-                       if (!bp)
-                               return ENOMEM;
-                       xfs_trans_binval(*trans, bp);
-                       /*
-                        * Roll to next transaction.
-                        */
-                       error = xfs_trans_roll(trans, dp);
-                       if (error)
-                               return (error);
-               }
-
-               tblkno += map.br_blockcount;
-               tblkcnt -= map.br_blockcount;
-       }
-
-       return(0);
-}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h

index 444a7704596c409f43f0ec495c9e6cc9838460c4..c1022138c7e6f3261819a0c52618b6bb1a9cf34f 100644 (file)
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -333,6 +333,8 @@ int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
                         struct xfs_buf **bpp);
  void   xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
                                      struct xfs_attr_leafblock *from);
+void   xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+                                  struct xfs_attr3_icleaf_hdr *from);
  
  extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
  
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c

new file mode 100644 (file)

index 0000000..cbc80d4
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+       xfs_attr_sf_sort_t *sa, *sb;
+
+       sa = (xfs_attr_sf_sort_t *)a;
+       sb = (xfs_attr_sf_sort_t *)b;
+       if (sa->hash < sb->hash) {
+               return(-1);
+       } else if (sa->hash > sb->hash) {
+               return(1);
+       } else {
+               return(sa->entno - sb->entno);
+       }
+}
+
+#define XFS_ISRESET_CURSOR(cursor) \
+       (!((cursor)->initted) && !((cursor)->hashval) && \
+        !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+       attrlist_cursor_kern_t *cursor;
+       xfs_attr_sf_sort_t *sbuf, *sbp;
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       xfs_inode_t *dp;
+       int sbsize, nsbuf, count, i;
+       int error;
+
+       ASSERT(context != NULL);
+       dp = context->dp;
+       ASSERT(dp != NULL);
+       ASSERT(dp->i_afp != NULL);
+       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+       ASSERT(sf != NULL);
+       if (!sf->hdr.count)
+               return(0);
+       cursor = context->cursor;
+       ASSERT(cursor != NULL);
+
+       trace_xfs_attr_list_sf(context);
+
+       /*
+        * If the buffer is large enough and the cursor is at the start,
+        * do not bother with sorting since we will return everything in
+        * one buffer and another call using the cursor won't need to be
+        * made.
+        * Note the generous fudge factor of 16 overhead bytes per entry.
+        * If bufsize is zero then put_listent must be a search function
+        * and can just scan through what we have.
+        */
+       if (context->bufsize == 0 ||
+           (XFS_ISRESET_CURSOR(cursor) &&
+             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+               for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+                       error = context->put_listent(context,
+                                          sfe->flags,
+                                          sfe->nameval,
+                                          (int)sfe->namelen,
+                                          (int)sfe->valuelen,
+                                          &sfe->nameval[sfe->namelen]);
+
+                       /*
+                        * Either search callback finished early or
+                        * didn't fit it all in the buffer after all.
+                        */
+                       if (context->seen_enough)
+                               break;
+
+                       if (error)
+                               return error;
+                       sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+               }
+               trace_xfs_attr_list_sf_all(context);
+               return(0);
+       }
+
+       /* do no more for a search callback */
+       if (context->bufsize == 0)
+               return 0;
+
+       /*
+        * It didn't all fit, so we have to sort everything on hashval.
+        */
+       sbsize = sf->hdr.count * sizeof(*sbuf);
+       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+       /*
+        * Scan the attribute list for the rest of the entries, storing
+        * the relevant info from only those that match into a buffer.
+        */
+       nsbuf = 0;
+       for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+               if (unlikely(
+                   ((char *)sfe < (char *)sf) ||
+                   ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+                       XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+                                            XFS_ERRLEVEL_LOW,
+                                            context->dp->i_mount, sfe);
+                       kmem_free(sbuf);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
+               sbp->entno = i;
+               sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+               sbp->name = sfe->nameval;
+               sbp->namelen = sfe->namelen;
+               /* These are bytes, and both on-disk, don't endian-flip */
+               sbp->valuelen = sfe->valuelen;
+               sbp->flags = sfe->flags;
+               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+               sbp++;
+               nsbuf++;
+       }
+
+       /*
+        * Sort the entries on hash then entno.
+        */
+       xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+       /*
+        * Re-find our place IN THE SORTED LIST.
+        */
+       count = 0;
+       cursor->initted = 1;
+       cursor->blkno = 0;
+       for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+               if (sbp->hash == cursor->hashval) {
+                       if (cursor->offset == count) {
+                               break;
+                       }
+                       count++;
+               } else if (sbp->hash > cursor->hashval) {
+                       break;
+               }
+       }
+       if (i == nsbuf) {
+               kmem_free(sbuf);
+               return(0);
+       }
+
+       /*
+        * Loop putting entries into the user buffer.
+        */
+       for ( ; i < nsbuf; i++, sbp++) {
+               if (cursor->hashval != sbp->hash) {
+                       cursor->hashval = sbp->hash;
+                       cursor->offset = 0;
+               }
+               error = context->put_listent(context,
+                                       sbp->flags,
+                                       sbp->name,
+                                       sbp->namelen,
+                                       sbp->valuelen,
+                                       &sbp->name[sbp->namelen]);
+               if (error)
+                       return error;
+               if (context->seen_enough)
+                       break;
+               cursor->offset++;
+       }
+
+       kmem_free(sbuf);
+       return(0);
+}
+
+STATIC int
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+       attrlist_cursor_kern_t *cursor;
+       xfs_attr_leafblock_t *leaf;
+       xfs_da_intnode_t *node;
+       struct xfs_attr3_icleaf_hdr leafhdr;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_node_entry *btree;
+       int error, i;
+       struct xfs_buf *bp;
+
+       trace_xfs_attr_node_list(context);
+
+       cursor = context->cursor;
+       cursor->initted = 1;
+
+       /*
+        * Do all sorts of validation on the passed-in cursor structure.
+        * If anything is amiss, ignore the cursor and look up the hashval
+        * starting from the btree root.
+        */
+       bp = NULL;
+       if (cursor->blkno > 0) {
+               error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
+                                             &bp, XFS_ATTR_FORK);
+               if ((error != 0) && (error != EFSCORRUPTED))
+                       return(error);
+               if (bp) {
+                       struct xfs_attr_leaf_entry *entries;
+
+                       node = bp->b_addr;
+                       switch (be16_to_cpu(node->hdr.info.magic)) {
+                       case XFS_DA_NODE_MAGIC:
+                       case XFS_DA3_NODE_MAGIC:
+                               trace_xfs_attr_list_wrong_blk(context);
+                               xfs_trans_brelse(NULL, bp);
+                               bp = NULL;
+                               break;
+                       case XFS_ATTR_LEAF_MAGIC:
+                       case XFS_ATTR3_LEAF_MAGIC:
+                               leaf = bp->b_addr;
+                               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+                               entries = xfs_attr3_leaf_entryp(leaf);
+                               if (cursor->hashval > be32_to_cpu(
+                                               entries[leafhdr.count - 1].hashval)) {
+                                       trace_xfs_attr_list_wrong_blk(context);
+                                       xfs_trans_brelse(NULL, bp);
+                                       bp = NULL;
+                               } else if (cursor->hashval <= be32_to_cpu(
+                                               entries[0].hashval)) {
+                                       trace_xfs_attr_list_wrong_blk(context);
+                                       xfs_trans_brelse(NULL, bp);
+                                       bp = NULL;
+                               }
+                               break;
+                       default:
+                               trace_xfs_attr_list_wrong_blk(context);
+                               xfs_trans_brelse(NULL, bp);
+                               bp = NULL;
+                       }
+               }
+       }
+
+       /*
+        * We did not find what we expected given the cursor's contents,
+        * so we start from the top and work down based on the hash value.
+        * Note that start of node block is same as start of leaf block.
+        */
+       if (bp == NULL) {
+               cursor->blkno = 0;
+               for (;;) {
+                       __uint16_t magic;
+
+                       error = xfs_da3_node_read(NULL, context->dp,
+                                                     cursor->blkno, -1, &bp,
+                                                     XFS_ATTR_FORK);
+                       if (error)
+                               return(error);
+                       node = bp->b_addr;
+                       magic = be16_to_cpu(node->hdr.info.magic);
+                       if (magic == XFS_ATTR_LEAF_MAGIC ||
+                           magic == XFS_ATTR3_LEAF_MAGIC)
+                               break;
+                       if (magic != XFS_DA_NODE_MAGIC &&
+                           magic != XFS_DA3_NODE_MAGIC) {
+                               XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    context->dp->i_mount,
+                                                    node);
+                               xfs_trans_brelse(NULL, bp);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       xfs_da3_node_hdr_from_disk(&nodehdr, node);
+                       btree = xfs_da3_node_tree_p(node);
+                       for (i = 0; i < nodehdr.count; btree++, i++) {
+                               if (cursor->hashval
+                                               <= be32_to_cpu(btree->hashval)) {
+                                       cursor->blkno = be32_to_cpu(btree->before);
+                                       trace_xfs_attr_list_node_descend(context,
+                                                                        btree);
+                                       break;
+                               }
+                       }
+                       if (i == nodehdr.count) {
+                               xfs_trans_brelse(NULL, bp);
+                               return 0;
+                       }
+                       xfs_trans_brelse(NULL, bp);
+               }
+       }
+       ASSERT(bp != NULL);
+
+       /*
+        * Roll upward through the blocks, processing each leaf block in
+        * order.  As long as there is space in the result buffer, keep
+        * adding the information.
+        */
+       for (;;) {
+               leaf = bp->b_addr;
+               error = xfs_attr3_leaf_list_int(bp, context);
+               if (error) {
+                       xfs_trans_brelse(NULL, bp);
+                       return error;
+               }
+               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+               if (context->seen_enough || leafhdr.forw == 0)
+                       break;
+               cursor->blkno = leafhdr.forw;
+               xfs_trans_brelse(NULL, bp);
+               error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
+                                          &bp);
+               if (error)
+                       return error;
+       }
+       xfs_trans_brelse(NULL, bp);
+       return 0;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr3_leaf_list_int(
+       struct xfs_buf                  *bp,
+       struct xfs_attr_list_context    *context)
+{
+       struct attrlist_cursor_kern     *cursor;
+       struct xfs_attr_leafblock       *leaf;
+       struct xfs_attr3_icleaf_hdr     ichdr;
+       struct xfs_attr_leaf_entry      *entries;
+       struct xfs_attr_leaf_entry      *entry;
+       int                             retval;
+       int                             i;
+
+       trace_xfs_attr_list_leaf(context);
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       entries = xfs_attr3_leaf_entryp(leaf);
+
+       cursor = context->cursor;
+       cursor->initted = 1;
+
+       /*
+        * Re-find our place in the leaf block if this is a new syscall.
+        */
+       if (context->resynch) {
+               entry = &entries[0];
+               for (i = 0; i < ichdr.count; entry++, i++) {
+                       if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+                               if (cursor->offset == context->dupcnt) {
+                                       context->dupcnt = 0;
+                                       break;
+                               }
+                               context->dupcnt++;
+                       } else if (be32_to_cpu(entry->hashval) >
+                                       cursor->hashval) {
+                               context->dupcnt = 0;
+                               break;
+                       }
+               }
+               if (i == ichdr.count) {
+                       trace_xfs_attr_list_notfound(context);
+                       return 0;
+               }
+       } else {
+               entry = &entries[0];
+               i = 0;
+       }
+       context->resynch = 0;
+
+       /*
+        * We have found our place, start copying out the new attributes.
+        */
+       retval = 0;
+       for (; i < ichdr.count; entry++, i++) {
+               if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+                       cursor->hashval = be32_to_cpu(entry->hashval);
+                       cursor->offset = 0;
+               }
+
+               if (entry->flags & XFS_ATTR_INCOMPLETE)
+                       continue;               /* skip incomplete entries */
+
+               if (entry->flags & XFS_ATTR_LOCAL) {
+                       xfs_attr_leaf_name_local_t *name_loc =
+                               xfs_attr3_leaf_name_local(leaf, i);
+
+                       retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_loc->nameval,
+                                               (int)name_loc->namelen,
+                                               be16_to_cpu(name_loc->valuelen),
+                                               &name_loc->nameval[name_loc->namelen]);
+                       if (retval)
+                               return retval;
+               } else {
+                       xfs_attr_leaf_name_remote_t *name_rmt =
+                               xfs_attr3_leaf_name_remote(leaf, i);
+
+                       int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+                       if (context->put_value) {
+                               xfs_da_args_t args;
+
+                               memset((char *)&args, 0, sizeof(args));
+                               args.dp = context->dp;
+                               args.whichfork = XFS_ATTR_FORK;
+                               args.valuelen = valuelen;
+                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
+                                                       args.dp->i_mount, valuelen);
+                               retval = xfs_attr_rmtval_get(&args);
+                               if (retval)
+                                       return retval;
+                               retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_rmt->name,
+                                               (int)name_rmt->namelen,
+                                               valuelen,
+                                               args.value);
+                               kmem_free(args.value);
+                       } else {
+                               retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_rmt->name,
+                                               (int)name_rmt->namelen,
+                                               valuelen,
+                                               NULL);
+                       }
+                       if (retval)
+                               return retval;
+               }
+               if (context->seen_enough)
+                       break;
+               cursor->offset++;
+       }
+       trace_xfs_attr_list_leaf_end(context);
+       return retval;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+       int error;
+       struct xfs_buf *bp;
+
+       trace_xfs_attr_leaf_list(context);
+
+       context->cursor->blkno = 0;
+       error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
+       if (error)
+               return XFS_ERROR(error);
+
+       error = xfs_attr3_leaf_list_int(bp, context);
+       xfs_trans_brelse(NULL, bp);
+       return XFS_ERROR(error);
+}
+
+int
+xfs_attr_list_int(
+       xfs_attr_list_context_t *context)
+{
+       int error;
+       xfs_inode_t *dp = context->dp;
+
+       XFS_STATS_INC(xs_attr_list);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return EIO;
+
+       xfs_ilock(dp, XFS_ILOCK_SHARED);
+
+       /*
+        * Decide on what work routines to call based on the inode size.
+        */
+       if (!xfs_inode_hasattr(dp)) {
+               error = 0;
+       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               error = xfs_attr_shortform_list(context);
+       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               error = xfs_attr_leaf_list(context);
+       } else {
+               error = xfs_attr_node_list(context);
+       }
+
+       xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+       return error;
+}
+
+#define        ATTR_ENTBASESIZE                /* minimum bytes used by an attr */ \
+       (((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define        ATTR_ENTSIZE(namelen)           /* actual bytes used by an attr */ \
+       ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+        & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+STATIC int
+xfs_attr_put_listent(
+       xfs_attr_list_context_t *context,
+       int             flags,
+       unsigned char   *name,
+       int             namelen,
+       int             valuelen,
+       unsigned char   *value)
+{
+       struct attrlist *alist = (struct attrlist *)context->alist;
+       attrlist_ent_t *aep;
+       int arraytop;
+
+       ASSERT(!(context->flags & ATTR_KERNOVAL));
+       ASSERT(context->count >= 0);
+       ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+       ASSERT(context->firstu >= sizeof(*alist));
+       ASSERT(context->firstu <= context->bufsize);
+
+       /*
+        * Only list entries in the right namespace.
+        */
+       if (((context->flags & ATTR_SECURE) == 0) !=
+           ((flags & XFS_ATTR_SECURE) == 0))
+               return 0;
+       if (((context->flags & ATTR_ROOT) == 0) !=
+           ((flags & XFS_ATTR_ROOT) == 0))
+               return 0;
+
+       arraytop = sizeof(*alist) +
+                       context->count * sizeof(alist->al_offset[0]);
+       context->firstu -= ATTR_ENTSIZE(namelen);
+       if (context->firstu < arraytop) {
+               trace_xfs_attr_list_full(context);
+               alist->al_more = 1;
+               context->seen_enough = 1;
+               return 1;
+       }
+
+       aep = (attrlist_ent_t *)&context->alist[context->firstu];
+       aep->a_valuelen = valuelen;
+       memcpy(aep->a_name, name, namelen);
+       aep->a_name[namelen] = 0;
+       alist->al_offset[context->count++] = context->firstu;
+       alist->al_count = context->count;
+       trace_xfs_attr_list_add(context);
+       return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.  Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(
+       xfs_inode_t     *dp,
+       char            *buffer,
+       int             bufsize,
+       int             flags,
+       attrlist_cursor_kern_t *cursor)
+{
+       xfs_attr_list_context_t context;
+       struct attrlist *alist;
+       int error;
+
+       /*
+        * Validate the cursor.
+        */
+       if (cursor->pad1 || cursor->pad2)
+               return(XFS_ERROR(EINVAL));
+       if ((cursor->initted == 0) &&
+           (cursor->hashval || cursor->blkno || cursor->offset))
+               return XFS_ERROR(EINVAL);
+
+       /*
+        * Check for a properly aligned buffer.
+        */
+       if (((long)buffer) & (sizeof(int)-1))
+               return XFS_ERROR(EFAULT);
+       if (flags & ATTR_KERNOVAL)
+               bufsize = 0;
+
+       /*
+        * Initialize the output buffer.
+        */
+       memset(&context, 0, sizeof(context));
+       context.dp = dp;
+       context.cursor = cursor;
+       context.resynch = 1;
+       context.flags = flags;
+       context.alist = buffer;
+       context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
+       context.firstu = context.bufsize;
+       context.put_listent = xfs_attr_put_listent;
+
+       alist = (struct attrlist *)context.alist;
+       alist->al_count = 0;
+       alist->al_more = 0;
+       alist->al_offset[0] = context.bufsize;
+
+       error = xfs_attr_list_int(&context);
+       ASSERT(error >= 0);
+       return error;
+}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c

index ef6b0c124528f6bff8d59c0fee5fa31a1d5dcc8b..712a502de619b097df202f744ba481bd3471847d 100644 (file)
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
@@ -33,6 +34,7 @@
  #include "xfs_alloc.h"
  #include "xfs_inode_item.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_attr.h"
  #include "xfs_attr_leaf.h"
  #include "xfs_attr_remote.h"
@@ -237,7 +239,7 @@ xfs_attr_rmtval_copyout(
         xfs_ino_t       ino,
         int             *offset,
         int             *valuelen,
-       char            **dst)
+       __uint8_t       **dst)
  {
         char            *src = bp->b_addr;
         xfs_daddr_t     bno = bp->b_bn;
@@ -249,7 +251,7 @@ xfs_attr_rmtval_copyout(
                 int hdr_size = 0;
                 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
  
-               byte_cnt = min_t(int, *valuelen, byte_cnt);
+               byte_cnt = min(*valuelen, byte_cnt);
  
                 if (xfs_sb_version_hascrc(&mp->m_sb)) {
                         if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
@@ -284,7 +286,7 @@ xfs_attr_rmtval_copyin(
         xfs_ino_t       ino,
         int             *offset,
         int             *valuelen,
-       char            **src)
+       __uint8_t       **src)
  {
         char            *dst = bp->b_addr;
         xfs_daddr_t     bno = bp->b_bn;
@@ -337,7 +339,7 @@ xfs_attr_rmtval_get(
         struct xfs_mount        *mp = args->dp->i_mount;
         struct xfs_buf          *bp;
         xfs_dablk_t             lblkno = args->rmtblkno;
-       char                    *dst = args->value;
+       __uint8_t               *dst = args->value;
         int                     valuelen = args->valuelen;
         int                     nmap;
         int                     error;
@@ -401,7 +403,7 @@ xfs_attr_rmtval_set(
         struct xfs_bmbt_irec    map;
         xfs_dablk_t             lblkno;
         xfs_fileoff_t           lfileoff = 0;
-       char                    *src = args->value;
+       __uint8_t               *src = args->value;
         int                     blkcnt;
         int                     valuelen;
         int                     nmap;
@@ -543,11 +545,6 @@ xfs_attr_rmtval_remove(
  
         /*
          * Roll through the "value", invalidating the attribute value's blocks.
-        * Note that args->rmtblkcnt is the minimum number of data blocks we'll
-        * see for a CRC enabled remote attribute. Each extent will have a
-        * header, and so we may have more blocks than we realise here.  If we
-        * fail to map the blocks correctly, we'll have problems with the buffer
-        * lookups.
          */
         lblkno = args->rmtblkno;
         blkcnt = args->rmtblkcnt;
@@ -628,4 +625,3 @@ xfs_attr_rmtval_remove(
         }
         return(0);
  }
-
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c

index 05c698ccb238f4fa9eef9a1844683fe24f62bd13..92b830901d60bcf2b662315bb1625b2944964ddf 100644 (file)
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,16 +17,17 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
  #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_ialloc_btree.h"
@@ -39,6 +40,7 @@
  #include "xfs_extfree_item.h"
  #include "xfs_alloc.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_rtalloc.h"
  #include "xfs_error.h"
  #include "xfs_attr_leaf.h"
@@ -46,7 +48,6 @@
  #include "xfs_trans_space.h"
  #include "xfs_buf_item.h"
  #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
  #include "xfs_trace.h"
  #include "xfs_symlink.h"
  
@@ -108,19 +109,6 @@ xfs_bmap_compute_maxlevels(
         mp->m_bm_maxlevels[whichfork] = level;
  }
  
-/*
- * Convert the given file system block to a disk block.  We have to treat it
- * differently based on whether the file is a real time file or not, because the
- * bmap code does.
- */
-xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
-       return (XFS_IS_REALTIME_INODE(ip) ? \
-                (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
-                XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
-
  STATIC int                             /* error */
  xfs_bmbt_lookup_eq(
         struct xfs_btree_cur    *cur,
@@ -262,173 +250,6 @@ xfs_bmap_forkoff_reset(
         }
  }
  
-/*
- * Extent tree block counting routines.
- */
-
-/*
- * Count leaf blocks given a range of extent records.
- */
-STATIC void
-xfs_bmap_count_leaves(
-       xfs_ifork_t             *ifp,
-       xfs_extnum_t            idx,
-       int                     numrecs,
-       int                     *count)
-{
-       int             b;
-
-       for (b = 0; b < numrecs; b++) {
-               xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
-               *count += xfs_bmbt_get_blockcount(frp);
-       }
-}
-
-/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
-       struct xfs_mount        *mp,
-       struct xfs_btree_block  *block,
-       int                     numrecs,
-       int                     *count)
-{
-       int             b;
-       xfs_bmbt_rec_t  *frp;
-
-       for (b = 1; b <= numrecs; b++) {
-               frp = XFS_BMBT_REC_ADDR(mp, block, b);
-               *count += xfs_bmbt_disk_get_blockcount(frp);
-       }
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-STATIC int                                     /* error */
-xfs_bmap_count_tree(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fsblock_t   blockno,        /* file system block number */
-       int             levelin,        /* level in btree */
-       int             *count)         /* Count of blocks */
-{
-       int                     error;
-       xfs_buf_t               *bp, *nbp;
-       int                     level = levelin;
-       __be64                  *pp;
-       xfs_fsblock_t           bno = blockno;
-       xfs_fsblock_t           nextbno;
-       struct xfs_btree_block  *block, *nextblock;
-       int                     numrecs;
-
-       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-       if (error)
-               return error;
-       *count += 1;
-       block = XFS_BUF_TO_BLOCK(bp);
-
-       if (--level) {
-               /* Not at node above leaves, count this level of nodes */
-               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-               while (nextbno != NULLFSBLOCK) {
-                       error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               return error;
-                       *count += 1;
-                       nextblock = XFS_BUF_TO_BLOCK(nbp);
-                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
-                       xfs_trans_brelse(tp, nbp);
-               }
-
-               /* Dive to the next level */
-               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-               bno = be64_to_cpu(*pp);
-               if (unlikely((error =
-                    xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
-                       xfs_trans_brelse(tp, bp);
-                       XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               xfs_trans_brelse(tp, bp);
-       } else {
-               /* count all level 1 nodes and their leaves */
-               for (;;) {
-                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-                       numrecs = be16_to_cpu(block->bb_numrecs);
-                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
-                       xfs_trans_brelse(tp, bp);
-                       if (nextbno == NULLFSBLOCK)
-                               break;
-                       bno = nextbno;
-                       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               return error;
-                       *count += 1;
-                       block = XFS_BUF_TO_BLOCK(bp);
-               }
-       }
-       return 0;
-}
-
-/*
- * Count fsblocks of the given fork.
- */
-int                                            /* error */
-xfs_bmap_count_blocks(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode */
-       int                     whichfork,      /* data or attr fork */
-       int                     *count)         /* out: count of blocks */
-{
-       struct xfs_btree_block  *block; /* current btree block */
-       xfs_fsblock_t           bno;    /* block # of "block" */
-       xfs_ifork_t             *ifp;   /* fork structure */
-       int                     level;  /* btree level, for checking */
-       xfs_mount_t             *mp;    /* file system mount structure */
-       __be64                  *pp;    /* pointer to block address */
-
-       bno = NULLFSBLOCK;
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-               xfs_bmap_count_leaves(ifp, 0,
-                       ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-                       count);
-               return 0;
-       }
-
-       /*
-        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-        */
-       block = ifp->if_broot;
-       level = be16_to_cpu(block->bb_level);
-       ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-       bno = be64_to_cpu(*pp);
-       ASSERT(bno != NULLDFSBNO);
-       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-       if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
-               XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
-                                mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       return 0;
-}
-
  /*
   * Debug/sanity checking code
   */
@@ -724,8 +545,8 @@ xfs_bmap_trace_exlist(
  
  /*
   * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters.  Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
   * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
   */
  STATIC void
@@ -823,7 +644,7 @@ xfs_bmap_add_free(
   * Remove the entry "free" from the free item list.  Prev points to the
   * previous entry, unless "free" is the head of the list.
   */
-STATIC void
+void
  xfs_bmap_del_free(
         xfs_bmap_free_t         *flist, /* free item list header */
         xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
@@ -837,92 +658,6 @@ xfs_bmap_del_free(
         kmem_zone_free(xfs_bmap_free_item_zone, free);
  }
  
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.  We never free any extents in
- * the first transaction.
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-int                                            /* error */
-xfs_bmap_finish(
-       xfs_trans_t             **tp,           /* transaction pointer addr */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *committed)     /* xact committed or not */
-{
-       xfs_efd_log_item_t      *efd;           /* extent free data */
-       xfs_efi_log_item_t      *efi;           /* extent free intention */
-       int                     error;          /* error return value */
-       xfs_bmap_free_item_t    *free;          /* free extent item */
-       unsigned int            logres;         /* new log reservation */
-       unsigned int            logcount;       /* new log count */
-       xfs_mount_t             *mp;            /* filesystem mount structure */
-       xfs_bmap_free_item_t    *next;          /* next item on free list */
-       xfs_trans_t             *ntp;           /* new transaction pointer */
-
-       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-       if (flist->xbf_count == 0) {
-               *committed = 0;
-               return 0;
-       }
-       ntp = *tp;
-       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
-       for (free = flist->xbf_first; free; free = free->xbfi_next)
-               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
-                       free->xbfi_blockcount);
-       logres = ntp->t_log_res;
-       logcount = ntp->t_log_count;
-       ntp = xfs_trans_dup(*tp);
-       error = xfs_trans_commit(*tp, 0);
-       *tp = ntp;
-       *committed = 1;
-       /*
-        * We have a new transaction, so we should return committed=1,
-        * even though we're returning an error.
-        */
-       if (error)
-               return error;
-
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(ntp->t_ticket);
-
-       if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
-                       logcount)))
-               return error;
-       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
-       for (free = flist->xbf_first; free != NULL; free = next) {
-               next = free->xbfi_next;
-               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
-                               free->xbfi_blockcount))) {
-                       /*
-                        * The bmap free list will be cleaned up at a
-                        * higher level.  The EFI will be canceled when
-                        * this transaction is aborted.
-                        * Need to force shutdown here to make sure it
-                        * happens, since this transaction may not be
-                        * dirty yet.
-                        */
-                       mp = ntp->t_mountp;
-                       if (!XFS_FORCED_SHUTDOWN(mp))
-                               xfs_force_shutdown(mp,
-                                                  (error == EFSCORRUPTED) ?
-                                                  SHUTDOWN_CORRUPT_INCORE :
-                                                  SHUTDOWN_META_IO_ERROR);
-                       return error;
-               }
-               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
-                       free->xbfi_blockcount);
-               xfs_bmap_del_free(flist, NULL, free);
-       }
-       return 0;
-}
-
  /*
   * Free up any items left in the list.
   */
@@ -1413,8 +1148,8 @@ xfs_bmap_add_attrfork(
         blks = XFS_ADDAFORK_SPACE_RES(mp);
         if (rsvd)
                 tp->t_flags |= XFS_TRANS_RESERVE;
-       if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+       if (error)
                 goto error0;
         xfs_ilock(ip, XFS_ILOCK_EXCL);
         error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
@@ -1815,7 +1550,7 @@ xfs_bmap_first_unused(
  }
  
  /*
- * Returns the file-relative block number of the last block + 1 before
+ * Returns the file-relative block number of the last block - 1 before
   * last_block (input value) in the file.
   * This is not based on i_size, it is based on the extent records.
   * Returns 0 for local files, as they do not have extent records.
@@ -1863,7 +1598,7 @@ xfs_bmap_last_before(
         return 0;
  }
  
-STATIC int
+int
  xfs_bmap_last_extent(
         struct xfs_trans        *tp,
         struct xfs_inode        *ip,
@@ -1926,29 +1661,6 @@ xfs_bmap_isaeof(
         return 0;
  }
  
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.  All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           endoff,
-       int                     whichfork,
-       int                     *eof)
-{
-       struct xfs_bmbt_irec    rec;
-       int                     error;
-
-       error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
-       if (error || *eof)
-               return error;
-
-       *eof = endoff >= rec.br_startoff + rec.br_blockcount;
-       return 0;
-}
-
  /*
   * Returns the file-relative block number of the first block past eof in
   * the file.  This is not based on i_size, it is based on the extent records.
@@ -3488,7 +3200,7 @@ done:
  /*
   * Adjust the size of the new extent based on di_extsize and rt extsize.
   */
-STATIC int
+int
  xfs_bmap_extsize_align(
         xfs_mount_t     *mp,
         xfs_bmbt_irec_t *gotp,          /* next extent pointer */
@@ -3650,9 +3362,9 @@ xfs_bmap_extsize_align(
  
  #define XFS_ALLOC_GAP_UNITS    4
  
-STATIC void
+void
  xfs_bmap_adjacent(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
  {
         xfs_fsblock_t   adjust;         /* adjustment to block numbers */
         xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
@@ -3798,109 +3510,6 @@ xfs_bmap_adjacent(
  #undef ISVALID
  }
  
-STATIC int
-xfs_bmap_rtalloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
-{
-       xfs_alloctype_t atype = 0;      /* type for allocation routines */
-       int             error;          /* error return value */
-       xfs_mount_t     *mp;            /* mount point structure */
-       xfs_extlen_t    prod = 0;       /* product factor for allocators */
-       xfs_extlen_t    ralen = 0;      /* realtime allocation length */
-       xfs_extlen_t    align;          /* minimum allocation alignment */
-       xfs_rtblock_t   rtb;
-
-       mp = ap->ip->i_mount;
-       align = xfs_get_extsz_hint(ap->ip);
-       prod = align / mp->m_sb.sb_rextsize;
-       error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
-                                       align, 1, ap->eof, 0,
-                                       ap->conv, &ap->offset, &ap->length);
-       if (error)
-               return error;
-       ASSERT(ap->length);
-       ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
-
-       /*
-        * If the offset & length are not perfectly aligned
-        * then kill prod, it will just get us in trouble.
-        */
-       if (do_mod(ap->offset, align) || ap->length % align)
-               prod = 1;
-       /*
-        * Set ralen to be the actual requested length in rtextents.
-        */
-       ralen = ap->length / mp->m_sb.sb_rextsize;
-       /*
-        * If the old value was close enough to MAXEXTLEN that
-        * we rounded up to it, cut it back so it's valid again.
-        * Note that if it's a really large request (bigger than
-        * MAXEXTLEN), we don't hear about that number, and can't
-        * adjust the starting point to match it.
-        */
-       if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
-               ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
-
-       /*
-        * Lock out other modifications to the RT bitmap inode.
-        */
-       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-
-       /*
-        * If it's an allocation to an empty file at offset 0,
-        * pick an extent that will space things out in the rt area.
-        */
-       if (ap->eof && ap->offset == 0) {
-               xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
-
-               error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
-               if (error)
-                       return error;
-               ap->blkno = rtx * mp->m_sb.sb_rextsize;
-       } else {
-               ap->blkno = 0;
-       }
-
-       xfs_bmap_adjacent(ap);
-
-       /*
-        * Realtime allocation, done through xfs_rtallocate_extent.
-        */
-       atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
-       do_div(ap->blkno, mp->m_sb.sb_rextsize);
-       rtb = ap->blkno;
-       ap->length = ralen;
-       if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
-                               &ralen, atype, ap->wasdel, prod, &rtb)))
-               return error;
-       if (rtb == NULLFSBLOCK && prod > 1 &&
-           (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
-                                          ap->length, &ralen, atype,
-                                          ap->wasdel, 1, &rtb)))
-               return error;
-       ap->blkno = rtb;
-       if (ap->blkno != NULLFSBLOCK) {
-               ap->blkno *= mp->m_sb.sb_rextsize;
-               ralen *= mp->m_sb.sb_rextsize;
-               ap->length = ralen;
-               ap->ip->i_d.di_nblocks += ralen;
-               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-               if (ap->wasdel)
-                       ap->ip->i_delayed_blks -= ralen;
-               /*
-                * Adjust the disk quota also. This was reserved
-                * earlier.
-                */
-               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
-                       ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
-                                       XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
-       } else {
-               ap->length = 0;
-       }
-       return 0;
-}
-
  STATIC int
  xfs_bmap_btalloc_nullfb(
         struct xfs_bmalloca     *ap,
@@ -4018,7 +3627,7 @@ xfs_bmap_btalloc_nullfb(
  
  STATIC int
  xfs_bmap_btalloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
  {
         xfs_mount_t     *mp;            /* mount point structure */
         xfs_alloctype_t atype = 0;      /* type for allocation routines */
@@ -4250,7 +3859,7 @@ xfs_bmap_btalloc(
   */
  STATIC int
  xfs_bmap_alloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
  {
         if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
                 return xfs_bmap_rtalloc(ap);
@@ -4638,7 +4247,7 @@ xfs_bmapi_delay(
  }
  
  
-STATIC int
+int
  __xfs_bmapi_allocate(
         struct xfs_bmalloca     *bma)
  {
@@ -4648,12 +4257,9 @@ __xfs_bmapi_allocate(
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
         int                     tmp_logflags = 0;
         int                     error;
-       int                     rt;
  
         ASSERT(bma->length > 0);
  
-       rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
-
         /*
          * For the wasdelay case, we could also just allocate the stuff asked
          * for in this bmap call but that wouldn't be as good.
@@ -4756,45 +4362,6 @@ __xfs_bmapi_allocate(
         return 0;
  }
  
-static void
-xfs_bmapi_allocate_worker(
-       struct work_struct      *work)
-{
-       struct xfs_bmalloca     *args = container_of(work,
-                                               struct xfs_bmalloca, work);
-       unsigned long           pflags;
-
-       /* we are in a transaction context here */
-       current_set_flags_nested(&pflags, PF_FSTRANS);
-
-       args->result = __xfs_bmapi_allocate(args);
-       complete(args->done);
-
-       current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Some allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Otherwise just
- * call directly to avoid the context switch overhead here.
- */
-int
-xfs_bmapi_allocate(
-       struct xfs_bmalloca     *args)
-{
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       if (!args->stack_switch)
-               return __xfs_bmapi_allocate(args);
-
-
-       args->done = &done;
-       INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
-       queue_work(xfs_alloc_wq, &args->work);
-       wait_for_completion(&done);
-       return args->result;
-}
-
  STATIC int
  xfs_bmapi_convert_unwritten(
         struct xfs_bmalloca     *bma,
@@ -5789,359 +5356,3 @@ error0:
         }
         return error;
  }
-
-/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
-       xfs_inode_t             *ip,            /* xfs incore inode pointer */
-       struct getbmapx         *out,           /* output structure */
-       int                     prealloced,     /* this is a file with
-                                                * preallocated data space */
-       __int64_t               end,            /* last block requested */
-       xfs_fsblock_t           startblock)
-{
-       __int64_t               fixlen;
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_ifork_t             *ifp;           /* inode fork pointer */
-       xfs_extnum_t            lastx;          /* last extent pointer */
-       xfs_fileoff_t           fileblock;
-
-       if (startblock == HOLESTARTBLOCK) {
-               mp = ip->i_mount;
-               out->bmv_block = -1;
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
-               fixlen -= out->bmv_offset;
-               if (prealloced && out->bmv_offset + out->bmv_length == end) {
-                       /* Came to hole at EOF. Trim it. */
-                       if (fixlen <= 0)
-                               return 0;
-                       out->bmv_length = fixlen;
-               }
-       } else {
-               if (startblock == DELAYSTARTBLOCK)
-                       out->bmv_block = -2;
-               else
-                       out->bmv_block = xfs_fsb_to_db(ip, startblock);
-               fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
-               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-               if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
-                  (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
-                       out->bmv_oflags |= BMV_OF_LAST;
-       }
-
-       return 1;
-}
-
-/*
- * Get inode's extents as described in bmv, and format for output.
- * Calls formatter to fill the user's buffer until all extents
- * are mapped, until the passed-in bmv->bmv_count slots have
- * been filled, or until the formatter short-circuits the loop,
- * if it is tracking filled-in extents on its own.
- */
-int                                            /* error code */
-xfs_getbmap(
-       xfs_inode_t             *ip,
-       struct getbmapx         *bmv,           /* user bmap structure */
-       xfs_bmap_format_t       formatter,      /* format to user */
-       void                    *arg)           /* formatter arg */
-{
-       __int64_t               bmvend;         /* last block requested */
-       int                     error = 0;      /* return value */
-       __int64_t               fixlen;         /* length for -1 case */
-       int                     i;              /* extent number */
-       int                     lock;           /* lock state */
-       xfs_bmbt_irec_t         *map;           /* buffer for user's data */
-       xfs_mount_t             *mp;            /* file system mount point */
-       int                     nex;            /* # of user extents can do */
-       int                     nexleft;        /* # of user extents left */
-       int                     subnex;         /* # of bmapi's can do */
-       int                     nmap;           /* number of map entries */
-       struct getbmapx         *out;           /* output structure */
-       int                     whichfork;      /* data or attr fork */
-       int                     prealloced;     /* this is a file with
-                                                * preallocated data space */
-       int                     iflags;         /* interface flags */
-       int                     bmapi_flags;    /* flags for xfs_bmapi */
-       int                     cur_ext = 0;
-
-       mp = ip->i_mount;
-       iflags = bmv->bmv_iflags;
-       whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-
-       if (whichfork == XFS_ATTR_FORK) {
-               if (XFS_IFORK_Q(ip)) {
-                       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
-                           ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
-                           ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
-                               return XFS_ERROR(EINVAL);
-               } else if (unlikely(
-                          ip->i_d.di_aformat != 0 &&
-                          ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
-                       XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
-                                        ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-
-               prealloced = 0;
-               fixlen = 1LL << 32;
-       } else {
-               if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-                   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
-                   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-                       return XFS_ERROR(EINVAL);
-
-               if (xfs_get_extsz_hint(ip) ||
-                   ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
-                       prealloced = 1;
-                       fixlen = mp->m_super->s_maxbytes;
-               } else {
-                       prealloced = 0;
-                       fixlen = XFS_ISIZE(ip);
-               }
-       }
-
-       if (bmv->bmv_length == -1) {
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-               bmv->bmv_length =
-                       max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
-       } else if (bmv->bmv_length == 0) {
-               bmv->bmv_entries = 0;
-               return 0;
-       } else if (bmv->bmv_length < 0) {
-               return XFS_ERROR(EINVAL);
-       }
-
-       nex = bmv->bmv_count - 1;
-       if (nex <= 0)
-               return XFS_ERROR(EINVAL);
-       bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
-       if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
-               return XFS_ERROR(ENOMEM);
-       out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-       if (!out) {
-               out = kmem_zalloc_large(bmv->bmv_count *
-                                       sizeof(struct getbmapx));
-               if (!out)
-                       return XFS_ERROR(ENOMEM);
-       }
-
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
-                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
-                       if (error)
-                               goto out_unlock_iolock;
-               }
-               /*
-                * even after flushing the inode, there can still be delalloc
-                * blocks on the inode beyond EOF due to speculative
-                * preallocation. These are not removed until the release
-                * function is called or the inode is inactivated. Hence we
-                * cannot assert here that ip->i_delayed_blks == 0.
-                */
-       }
-
-       lock = xfs_ilock_map_shared(ip);
-
-       /*
-        * Don't let nex be bigger than the number of extents
-        * we can have assuming alternating holes and real extents.
-        */
-       if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
-               nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
-       bmapi_flags = xfs_bmapi_aflag(whichfork);
-       if (!(iflags & BMV_IF_PREALLOC))
-               bmapi_flags |= XFS_BMAPI_IGSTATE;
-
-       /*
-        * Allocate enough space to handle "subnex" maps at a time.
-        */
-       error = ENOMEM;
-       subnex = 16;
-       map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
-       if (!map)
-               goto out_unlock_ilock;
-
-       bmv->bmv_entries = 0;
-
-       if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
-           (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
-               error = 0;
-               goto out_free_map;
-       }
-
-       nexleft = nex;
-
-       do {
-               nmap = (nexleft > subnex) ? subnex : nexleft;
-               error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
-                                      XFS_BB_TO_FSB(mp, bmv->bmv_length),
-                                      map, &nmap, bmapi_flags);
-               if (error)
-                       goto out_free_map;
-               ASSERT(nmap <= subnex);
-
-               for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-                       out[cur_ext].bmv_oflags = 0;
-                       if (map[i].br_state == XFS_EXT_UNWRITTEN)
-                               out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
-                       else if (map[i].br_startblock == DELAYSTARTBLOCK)
-                               out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
-                       out[cur_ext].bmv_offset =
-                               XFS_FSB_TO_BB(mp, map[i].br_startoff);
-                       out[cur_ext].bmv_length =
-                               XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-                       out[cur_ext].bmv_unused1 = 0;
-                       out[cur_ext].bmv_unused2 = 0;
-
-                       /*
-                        * delayed allocation extents that start beyond EOF can
-                        * occur due to speculative EOF allocation when the
-                        * delalloc extent is larger than the largest freespace
-                        * extent at conversion time. These extents cannot be
-                        * converted by data writeback, so can exist here even
-                        * if we are not supposed to be finding delalloc
-                        * extents.
-                        */
-                       if (map[i].br_startblock == DELAYSTARTBLOCK &&
-                           map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
-                               ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
-                        if (map[i].br_startblock == HOLESTARTBLOCK &&
-                           whichfork == XFS_ATTR_FORK) {
-                               /* came to the end of attribute fork */
-                               out[cur_ext].bmv_oflags |= BMV_OF_LAST;
-                               goto out_free_map;
-                       }
-
-                       if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
-                                       prealloced, bmvend,
-                                       map[i].br_startblock))
-                               goto out_free_map;
-
-                       bmv->bmv_offset =
-                               out[cur_ext].bmv_offset +
-                               out[cur_ext].bmv_length;
-                       bmv->bmv_length =
-                               max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
-
-                       /*
-                        * In case we don't want to return the hole,
-                        * don't increase cur_ext so that we can reuse
-                        * it in the next loop.
-                        */
-                       if ((iflags & BMV_IF_NO_HOLES) &&
-                           map[i].br_startblock == HOLESTARTBLOCK) {
-                               memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
-                               continue;
-                       }
-
-                       nexleft--;
-                       bmv->bmv_entries++;
-                       cur_ext++;
-               }
-       } while (nmap && nexleft && bmv->bmv_length);
-
- out_free_map:
-       kmem_free(map);
- out_unlock_ilock:
-       xfs_iunlock_map_shared(ip, lock);
- out_unlock_iolock:
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-       for (i = 0; i < cur_ext; i++) {
-               int full = 0;   /* user array is full */
-
-               /* format results & advance arg */
-               error = formatter(&arg, &out[i], &full);
-               if (error || full)
-                       break;
-       }
-
-       if (is_vmalloc_addr(out))
-               kmem_free_large(out);
-       else
-               kmem_free(out);
-       return error;
-}
-
-/*
- * dead simple method of punching delalyed allocation blocks from a range in
- * the inode. Walks a block at a time so will be slow, but is only executed in
- * rare error cases so the overhead is not critical. This will alays punch out
- * both the start and end blocks, even if the ranges only partially overlap
- * them, so it is up to the caller to ensure that partial blocks are not
- * passed in.
- */
-int
-xfs_bmap_punch_delalloc_range(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           start_fsb,
-       xfs_fileoff_t           length)
-{
-       xfs_fileoff_t           remaining = length;
-       int                     error = 0;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       do {
-               int             done;
-               xfs_bmbt_irec_t imap;
-               int             nimaps = 1;
-               xfs_fsblock_t   firstblock;
-               xfs_bmap_free_t flist;
-
-               /*
-                * Map the range first and check that it is a delalloc extent
-                * before trying to unmap the range. Otherwise we will be
-                * trying to remove a real extent (which requires a
-                * transaction) or a hole, which is probably a bad idea...
-                */
-               error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
-                                      XFS_BMAPI_ENTIRE);
-
-               if (error) {
-                       /* something screwed, just bail */
-                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                               xfs_alert(ip->i_mount,
-                       "Failed delalloc mapping lookup ino %lld fsb %lld.",
-                                               ip->i_ino, start_fsb);
-                       }
-                       break;
-               }
-               if (!nimaps) {
-                       /* nothing there */
-                       goto next_block;
-               }
-               if (imap.br_startblock != DELAYSTARTBLOCK) {
-                       /* been converted, ignore */
-                       goto next_block;
-               }
-               WARN_ON(imap.br_blockcount == 0);
-
-               /*
-                * Note: while we initialise the firstblock/flist pair, they
-                * should never be used because blocks should never be
-                * allocated or freed for a delalloc extent and hence we need
-                * don't cancel or finish them after the xfs_bunmapi() call.
-                */
-               xfs_bmap_init(&flist, &firstblock);
-               error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
-                                       &flist, &done);
-               if (error)
-                       break;
-
-               ASSERT(!flist.xbf_count && !flist.xbf_first);
-next_block:
-               start_fsb++;
-               remaining--;
-       } while(remaining > 0);
-
-       return error;
-}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h

index 1cf1292d29b70cdbee6168c9a530d3a891547d7f..33b41f35122574e0b1cf7ad7a2a9ae23ecfadddb 100644 (file)
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -107,41 +107,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
                 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
  }
  
-/*
- * Argument structure for xfs_bmap_alloc.
- */
-typedef struct xfs_bmalloca {
-       xfs_fsblock_t           *firstblock; /* i/o first block allocated */
-       struct xfs_bmap_free    *flist; /* bmap freelist */
-       struct xfs_trans        *tp;    /* transaction pointer */
-       struct xfs_inode        *ip;    /* incore inode pointer */
-       struct xfs_bmbt_irec    prev;   /* extent before the new one */
-       struct xfs_bmbt_irec    got;    /* extent after, or delayed */
-
-       xfs_fileoff_t           offset; /* offset in file filling in */
-       xfs_extlen_t            length; /* i/o length asked/allocated */
-       xfs_fsblock_t           blkno;  /* starting block of new extent */
-
-       struct xfs_btree_cur    *cur;   /* btree cursor */
-       xfs_extnum_t            idx;    /* current extent index */
-       int                     nallocs;/* number of extents alloc'd */
-       int                     logflags;/* flags for transaction logging */
-
-       xfs_extlen_t            total;  /* total blocks needed for xaction */
-       xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
-       xfs_extlen_t            minleft; /* amount must be left after alloc */
-       char                    eof;    /* set if allocating past last extent */
-       char                    wasdel; /* replacing a delayed allocation */
-       char                    userdata;/* set if is user data */
-       char                    aeof;   /* allocated space at eof */
-       char                    conv;   /* overwriting unwritten extents */
-       char                    stack_switch;
-       int                     flags;
-       struct completion       *done;
-       struct work_struct      work;
-       int                     result;
-} xfs_bmalloca_t;
-
  /*
   * Flags for xfs_bmap_add_extent*.
   */
@@ -162,7 +127,7 @@ typedef struct xfs_bmalloca {
         { BMAP_RIGHT_FILLING,   "RF" }, \
         { BMAP_ATTRFORK,        "ATTR" }
  
-#if defined(__KERNEL) && defined(DEBUG)
+#ifdef DEBUG
  void   xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
                 int whichfork, unsigned long caller_ip);
  #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
@@ -205,23 +170,4 @@ int        xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
                 xfs_extnum_t num);
  uint   xfs_default_attroffset(struct xfs_inode *ip);
  
-#ifdef __KERNEL__
-/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
-
-int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-               int *committed);
-int    xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
-               xfs_bmap_format_t formatter, void *arg);
-int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
-               int whichfork, int *eof);
-int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
-               int whichfork, int *count);
-int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
-               xfs_fileoff_t start_fsb, xfs_fileoff_t length);
-
-xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
-
-#endif /* __KERNEL__ */
-
  #endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c

index 0c61a22be6fd630668a16d0f92b3db625fa03173..cf3bc76710c3de6e021b37ccc275894458f8c931 100644 (file)
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,7 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
@@ -722,7 +722,7 @@ xfs_bmbt_key_diff(
                                       cur->bc_rec.b.br_startoff;
  }
  
-static int
+static bool
  xfs_bmbt_verify(
         struct xfs_buf          *bp)
  {
@@ -775,7 +775,6 @@ xfs_bmbt_verify(
                 return false;
  
         return true;
-
  }
  
  static void
@@ -789,7 +788,6 @@ xfs_bmbt_read_verify(
                                      bp->b_target->bt_mount, bp->b_addr);
                 xfs_buf_ioerror(bp, EFSCORRUPTED);
         }
-
  }
  
  static void
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c

new file mode 100644 (file)

index 0000000..541d59f
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,2026 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block.  We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+       return (XFS_IS_REALTIME_INODE(ip) ? \
+                (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+                XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int                                            /* error */
+xfs_bmap_finish(
+       xfs_trans_t             **tp,           /* transaction pointer addr */
+       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       int                     *committed)     /* xact committed or not */
+{
+       xfs_efd_log_item_t      *efd;           /* extent free data */
+       xfs_efi_log_item_t      *efi;           /* extent free intention */
+       int                     error;          /* error return value */
+       xfs_bmap_free_item_t    *free;          /* free extent item */
+       struct xfs_trans_res    tres;           /* new log reservation */
+       xfs_mount_t             *mp;            /* filesystem mount structure */
+       xfs_bmap_free_item_t    *next;          /* next item on free list */
+       xfs_trans_t             *ntp;           /* new transaction pointer */
+
+       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+       if (flist->xbf_count == 0) {
+               *committed = 0;
+               return 0;
+       }
+       ntp = *tp;
+       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+       for (free = flist->xbf_first; free; free = free->xbfi_next)
+               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+                       free->xbfi_blockcount);
+
+       tres.tr_logres = ntp->t_log_res;
+       tres.tr_logcount = ntp->t_log_count;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       ntp = xfs_trans_dup(*tp);
+       error = xfs_trans_commit(*tp, 0);
+       *tp = ntp;
+       *committed = 1;
+       /*
+        * We have a new transaction, so we should return committed=1,
+        * even though we're returning an error.
+        */
+       if (error)
+               return error;
+
+       /*
+        * transaction commit worked ok so we can drop the extra ticket
+        * reference that we gained in xfs_trans_dup()
+        */
+       xfs_log_ticket_put(ntp->t_ticket);
+
+       error = xfs_trans_reserve(ntp, &tres, 0, 0);
+       if (error)
+               return error;
+       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+       for (free = flist->xbf_first; free != NULL; free = next) {
+               next = free->xbfi_next;
+               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+                               free->xbfi_blockcount))) {
+                       /*
+                        * The bmap free list will be cleaned up at a
+                        * higher level.  The EFI will be canceled when
+                        * this transaction is aborted.
+                        * Need to force shutdown here to make sure it
+                        * happens, since this transaction may not be
+                        * dirty yet.
+                        */
+                       mp = ntp->t_mountp;
+                       if (!XFS_FORCED_SHUTDOWN(mp))
+                               xfs_force_shutdown(mp,
+                                                  (error == EFSCORRUPTED) ?
+                                                  SHUTDOWN_CORRUPT_INCORE :
+                                                  SHUTDOWN_META_IO_ERROR);
+                       return error;
+               }
+               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+                       free->xbfi_blockcount);
+               xfs_bmap_del_free(flist, NULL, free);
+       }
+       return 0;
+}
+
+int
+xfs_bmap_rtalloc(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+       xfs_alloctype_t atype = 0;      /* type for allocation routines */
+       int             error;          /* error return value */
+       xfs_mount_t     *mp;            /* mount point structure */
+       xfs_extlen_t    prod = 0;       /* product factor for allocators */
+       xfs_extlen_t    ralen = 0;      /* realtime allocation length */
+       xfs_extlen_t    align;          /* minimum allocation alignment */
+       xfs_rtblock_t   rtb;
+
+       mp = ap->ip->i_mount;
+       align = xfs_get_extsz_hint(ap->ip);
+       prod = align / mp->m_sb.sb_rextsize;
+       error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+                                       align, 1, ap->eof, 0,
+                                       ap->conv, &ap->offset, &ap->length);
+       if (error)
+               return error;
+       ASSERT(ap->length);
+       ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+       /*
+        * If the offset & length are not perfectly aligned
+        * then kill prod, it will just get us in trouble.
+        */
+       if (do_mod(ap->offset, align) || ap->length % align)
+               prod = 1;
+       /*
+        * Set ralen to be the actual requested length in rtextents.
+        */
+       ralen = ap->length / mp->m_sb.sb_rextsize;
+       /*
+        * If the old value was close enough to MAXEXTLEN that
+        * we rounded up to it, cut it back so it's valid again.
+        * Note that if it's a really large request (bigger than
+        * MAXEXTLEN), we don't hear about that number, and can't
+        * adjust the starting point to match it.
+        */
+       if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+               ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+       /*
+        * Lock out other modifications to the RT bitmap inode.
+        */
+       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+       /*
+        * If it's an allocation to an empty file at offset 0,
+        * pick an extent that will space things out in the rt area.
+        */
+       if (ap->eof && ap->offset == 0) {
+               xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+               error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+               if (error)
+                       return error;
+               ap->blkno = rtx * mp->m_sb.sb_rextsize;
+       } else {
+               ap->blkno = 0;
+       }
+
+       xfs_bmap_adjacent(ap);
+
+       /*
+        * Realtime allocation, done through xfs_rtallocate_extent.
+        */
+       atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+       do_div(ap->blkno, mp->m_sb.sb_rextsize);
+       rtb = ap->blkno;
+       ap->length = ralen;
+       if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+                               &ralen, atype, ap->wasdel, prod, &rtb)))
+               return error;
+       if (rtb == NULLFSBLOCK && prod > 1 &&
+           (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+                                          ap->length, &ralen, atype,
+                                          ap->wasdel, 1, &rtb)))
+               return error;
+       ap->blkno = rtb;
+       if (ap->blkno != NULLFSBLOCK) {
+               ap->blkno *= mp->m_sb.sb_rextsize;
+               ralen *= mp->m_sb.sb_rextsize;
+               ap->length = ralen;
+               ap->ip->i_d.di_nblocks += ralen;
+               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+               if (ap->wasdel)
+                       ap->ip->i_delayed_blks -= ralen;
+               /*
+                * Adjust the disk quota also. This was reserved
+                * earlier.
+                */
+               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+                       ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+                                       XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+       } else {
+               ap->length = 0;
+       }
+       return 0;
+}
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_bmapi_allocate_worker(
+       struct work_struct      *work)
+{
+       struct xfs_bmalloca     *args = container_of(work,
+                                               struct xfs_bmalloca, work);
+       unsigned long           pflags;
+
+       /* we are in a transaction context here */
+       current_set_flags_nested(&pflags, PF_FSTRANS);
+
+       args->result = __xfs_bmapi_allocate(args);
+       complete(args->done);
+
+       current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Some allocation requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. Otherwise just
+ * call directly to avoid the context switch overhead here.
+ */
+int
+xfs_bmapi_allocate(
+       struct xfs_bmalloca     *args)
+{
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       if (!args->stack_switch)
+               return __xfs_bmapi_allocate(args);
+
+
+       args->done = &done;
+       INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+       queue_work(xfs_alloc_wq, &args->work);
+       wait_for_completion(&done);
+       return args->result;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           endoff,
+       int                     whichfork,
+       int                     *eof)
+{
+       struct xfs_bmbt_irec    rec;
+       int                     error;
+
+       error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+       if (error || *eof)
+               return error;
+
+       *eof = endoff >= rec.br_startoff + rec.br_blockcount;
+       return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+       xfs_ifork_t             *ifp,
+       xfs_extnum_t            idx,
+       int                     numrecs,
+       int                     *count)
+{
+       int             b;
+
+       for (b = 0; b < numrecs; b++) {
+               xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+               *count += xfs_bmbt_get_blockcount(frp);
+       }
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
+       int                     numrecs,
+       int                     *count)
+{
+       int             b;
+       xfs_bmbt_rec_t  *frp;
+
+       for (b = 1; b <= numrecs; b++) {
+               frp = XFS_BMBT_REC_ADDR(mp, block, b);
+               *count += xfs_bmbt_disk_get_blockcount(frp);
+       }
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+STATIC int                                     /* error */
+xfs_bmap_count_tree(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fsblock_t   blockno,        /* file system block number */
+       int             levelin,        /* level in btree */
+       int             *count)         /* Count of blocks */
+{
+       int                     error;
+       xfs_buf_t               *bp, *nbp;
+       int                     level = levelin;
+       __be64                  *pp;
+       xfs_fsblock_t           bno = blockno;
+       xfs_fsblock_t           nextbno;
+       struct xfs_btree_block  *block, *nextblock;
+       int                     numrecs;
+
+       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+       if (error)
+               return error;
+       *count += 1;
+       block = XFS_BUF_TO_BLOCK(bp);
+
+       if (--level) {
+               /* Not at node above leaves, count this level of nodes */
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+               while (nextbno != NULLFSBLOCK) {
+                       error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               return error;
+                       *count += 1;
+                       nextblock = XFS_BUF_TO_BLOCK(nbp);
+                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+                       xfs_trans_brelse(tp, nbp);
+               }
+
+               /* Dive to the next level */
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+               bno = be64_to_cpu(*pp);
+               if (unlikely((error =
+                    xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+                       xfs_trans_brelse(tp, bp);
+                       XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+               xfs_trans_brelse(tp, bp);
+       } else {
+               /* count all level 1 nodes and their leaves */
+               for (;;) {
+                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+                       numrecs = be16_to_cpu(block->bb_numrecs);
+                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+                       xfs_trans_brelse(tp, bp);
+                       if (nextbno == NULLFSBLOCK)
+                               break;
+                       bno = nextbno;
+                       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               return error;
+                       *count += 1;
+                       block = XFS_BUF_TO_BLOCK(bp);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int                                            /* error */
+xfs_bmap_count_blocks(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode */
+       int                     whichfork,      /* data or attr fork */
+       int                     *count)         /* out: count of blocks */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       xfs_fsblock_t           bno;    /* block # of "block" */
+       xfs_ifork_t             *ifp;   /* fork structure */
+       int                     level;  /* btree level, for checking */
+       xfs_mount_t             *mp;    /* file system mount structure */
+       __be64                  *pp;    /* pointer to block address */
+
+       bno = NULLFSBLOCK;
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+               xfs_bmap_count_leaves(ifp, 0,
+                       ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+                       count);
+               return 0;
+       }
+
+       /*
+        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+        */
+       block = ifp->if_broot;
+       level = be16_to_cpu(block->bb_level);
+       ASSERT(level > 0);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+       bno = be64_to_cpu(*pp);
+       ASSERT(bno != NULLDFSBNO);
+       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+       if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+               XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+                                mp);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       return 0;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+       xfs_inode_t             *ip,            /* xfs incore inode pointer */
+       struct getbmapx         *out,           /* output structure */
+       int                     prealloced,     /* this is a file with
+                                                * preallocated data space */
+       __int64_t               end,            /* last block requested */
+       xfs_fsblock_t           startblock)
+{
+       __int64_t               fixlen;
+       xfs_mount_t             *mp;            /* file system mount point */
+       xfs_ifork_t             *ifp;           /* inode fork pointer */
+       xfs_extnum_t            lastx;          /* last extent pointer */
+       xfs_fileoff_t           fileblock;
+
+       if (startblock == HOLESTARTBLOCK) {
+               mp = ip->i_mount;
+               out->bmv_block = -1;
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+               fixlen -= out->bmv_offset;
+               if (prealloced && out->bmv_offset + out->bmv_length == end) {
+                       /* Came to hole at EOF. Trim it. */
+                       if (fixlen <= 0)
+                               return 0;
+                       out->bmv_length = fixlen;
+               }
+       } else {
+               if (startblock == DELAYSTARTBLOCK)
+                       out->bmv_block = -2;
+               else
+                       out->bmv_block = xfs_fsb_to_db(ip, startblock);
+               fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+               if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+                  (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+                       out->bmv_oflags |= BMV_OF_LAST;
+       }
+
+       return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int                                            /* error code */
+xfs_getbmap(
+       xfs_inode_t             *ip,
+       struct getbmapx         *bmv,           /* user bmap structure */
+       xfs_bmap_format_t       formatter,      /* format to user */
+       void                    *arg)           /* formatter arg */
+{
+       __int64_t               bmvend;         /* last block requested */
+       int                     error = 0;      /* return value */
+       __int64_t               fixlen;         /* length for -1 case */
+       int                     i;              /* extent number */
+       int                     lock;           /* lock state */
+       xfs_bmbt_irec_t         *map;           /* buffer for user's data */
+       xfs_mount_t             *mp;            /* file system mount point */
+       int                     nex;            /* # of user extents can do */
+       int                     nexleft;        /* # of user extents left */
+       int                     subnex;         /* # of bmapi's can do */
+       int                     nmap;           /* number of map entries */
+       struct getbmapx         *out;           /* output structure */
+       int                     whichfork;      /* data or attr fork */
+       int                     prealloced;     /* this is a file with
+                                                * preallocated data space */
+       int                     iflags;         /* interface flags */
+       int                     bmapi_flags;    /* flags for xfs_bmapi */
+       int                     cur_ext = 0;
+
+       mp = ip->i_mount;
+       iflags = bmv->bmv_iflags;
+       whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+       if (whichfork == XFS_ATTR_FORK) {
+               if (XFS_IFORK_Q(ip)) {
+                       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+                           ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+                           ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+                               return XFS_ERROR(EINVAL);
+               } else if (unlikely(
+                          ip->i_d.di_aformat != 0 &&
+                          ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+                       XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
+               prealloced = 0;
+               fixlen = 1LL << 32;
+       } else {
+               if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+                   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+                   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+                       return XFS_ERROR(EINVAL);
+
+               if (xfs_get_extsz_hint(ip) ||
+                   ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+                       prealloced = 1;
+                       fixlen = mp->m_super->s_maxbytes;
+               } else {
+                       prealloced = 0;
+                       fixlen = XFS_ISIZE(ip);
+               }
+       }
+
+       if (bmv->bmv_length == -1) {
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+               bmv->bmv_length =
+                       max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+       } else if (bmv->bmv_length == 0) {
+               bmv->bmv_entries = 0;
+               return 0;
+       } else if (bmv->bmv_length < 0) {
+               return XFS_ERROR(EINVAL);
+       }
+
+       nex = bmv->bmv_count - 1;
+       if (nex <= 0)
+               return XFS_ERROR(EINVAL);
+       bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+       if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+               return XFS_ERROR(ENOMEM);
+       out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+       if (!out) {
+               out = kmem_zalloc_large(bmv->bmv_count *
+                                       sizeof(struct getbmapx));
+               if (!out)
+                       return XFS_ERROR(ENOMEM);
+       }
+
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
+                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+                       if (error)
+                               goto out_unlock_iolock;
+               }
+               /*
+                * even after flushing the inode, there can still be delalloc
+                * blocks on the inode beyond EOF due to speculative
+                * preallocation. These are not removed until the release
+                * function is called or the inode is inactivated. Hence we
+                * cannot assert here that ip->i_delayed_blks == 0.
+                */
+       }
+
+       lock = xfs_ilock_map_shared(ip);
+
+       /*
+        * Don't let nex be bigger than the number of extents
+        * we can have assuming alternating holes and real extents.
+        */
+       if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+               nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+       bmapi_flags = xfs_bmapi_aflag(whichfork);
+       if (!(iflags & BMV_IF_PREALLOC))
+               bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+       /*
+        * Allocate enough space to handle "subnex" maps at a time.
+        */
+       error = ENOMEM;
+       subnex = 16;
+       map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+       if (!map)
+               goto out_unlock_ilock;
+
+       bmv->bmv_entries = 0;
+
+       if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+           (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+               error = 0;
+               goto out_free_map;
+       }
+
+       nexleft = nex;
+
+       do {
+               nmap = (nexleft > subnex) ? subnex : nexleft;
+               error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+                                      XFS_BB_TO_FSB(mp, bmv->bmv_length),
+                                      map, &nmap, bmapi_flags);
+               if (error)
+                       goto out_free_map;
+               ASSERT(nmap <= subnex);
+
+               for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+                       out[cur_ext].bmv_oflags = 0;
+                       if (map[i].br_state == XFS_EXT_UNWRITTEN)
+                               out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+                       else if (map[i].br_startblock == DELAYSTARTBLOCK)
+                               out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+                       out[cur_ext].bmv_offset =
+                               XFS_FSB_TO_BB(mp, map[i].br_startoff);
+                       out[cur_ext].bmv_length =
+                               XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+                       out[cur_ext].bmv_unused1 = 0;
+                       out[cur_ext].bmv_unused2 = 0;
+
+                       /*
+                        * delayed allocation extents that start beyond EOF can
+                        * occur due to speculative EOF allocation when the
+                        * delalloc extent is larger than the largest freespace
+                        * extent at conversion time. These extents cannot be
+                        * converted by data writeback, so can exist here even
+                        * if we are not supposed to be finding delalloc
+                        * extents.
+                        */
+                       if (map[i].br_startblock == DELAYSTARTBLOCK &&
+                           map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+                               ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+                        if (map[i].br_startblock == HOLESTARTBLOCK &&
+                           whichfork == XFS_ATTR_FORK) {
+                               /* came to the end of attribute fork */
+                               out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+                               goto out_free_map;
+                       }
+
+                       if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+                                       prealloced, bmvend,
+                                       map[i].br_startblock))
+                               goto out_free_map;
+
+                       bmv->bmv_offset =
+                               out[cur_ext].bmv_offset +
+                               out[cur_ext].bmv_length;
+                       bmv->bmv_length =
+                               max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+                       /*
+                        * In case we don't want to return the hole,
+                        * don't increase cur_ext so that we can reuse
+                        * it in the next loop.
+                        */
+                       if ((iflags & BMV_IF_NO_HOLES) &&
+                           map[i].br_startblock == HOLESTARTBLOCK) {
+                               memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+                               continue;
+                       }
+
+                       nexleft--;
+                       bmv->bmv_entries++;
+                       cur_ext++;
+               }
+       } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+       kmem_free(map);
+ out_unlock_ilock:
+       xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       for (i = 0; i < cur_ext; i++) {
+               int full = 0;   /* user array is full */
+
+               /* format results & advance arg */
+               error = formatter(&arg, &out[i], &full);
+               if (error || full)
+                       break;
+       }
+
+       if (is_vmalloc_addr(out))
+               kmem_free_large(out);
+       else
+               kmem_free(out);
+       return error;
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           start_fsb,
+       xfs_fileoff_t           length)
+{
+       xfs_fileoff_t           remaining = length;
+       int                     error = 0;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       do {
+               int             done;
+               xfs_bmbt_irec_t imap;
+               int             nimaps = 1;
+               xfs_fsblock_t   firstblock;
+               xfs_bmap_free_t flist;
+
+               /*
+                * Map the range first and check that it is a delalloc extent
+                * before trying to unmap the range. Otherwise we will be
+                * trying to remove a real extent (which requires a
+                * transaction) or a hole, which is probably a bad idea...
+                */
+               error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+                                      XFS_BMAPI_ENTIRE);
+
+               if (error) {
+                       /* something screwed, just bail */
+                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                               xfs_alert(ip->i_mount,
+                       "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                                               ip->i_ino, start_fsb);
+                       }
+                       break;
+               }
+               if (!nimaps) {
+                       /* nothing there */
+                       goto next_block;
+               }
+               if (imap.br_startblock != DELAYSTARTBLOCK) {
+                       /* been converted, ignore */
+                       goto next_block;
+               }
+               WARN_ON(imap.br_blockcount == 0);
+
+               /*
+                * Note: while we initialise the firstblock/flist pair, they
+                * should never be used because blocks should never be
+                * allocated or freed for a delalloc extent and hence we need
+                * don't cancel or finish them after the xfs_bunmapi() call.
+                */
+               xfs_bmap_init(&flist, &firstblock);
+               error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                                       &flist, &done);
+               if (error)
+                       break;
+
+               ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+               start_fsb++;
+               remaining--;
+       } while(remaining > 0);
+
+       return error;
+}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+       /* prealloc/delalloc exists only on regular files */
+       if (!S_ISREG(ip->i_d.di_mode))
+               return false;
+
+       /*
+        * Zero sized files with no cached pages and delalloc blocks will not
+        * have speculative prealloc/delalloc blocks to remove.
+        */
+       if (VFS_I(ip)->i_size == 0 &&
+           VN_CACHED(VFS_I(ip)) == 0 &&
+           ip->i_delayed_blks == 0)
+               return false;
+
+       /* If we haven't read in the extent list, then don't do it now. */
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+               return false;
+
+       /*
+        * Do not free real preallocated or append-only files unless the file
+        * has delalloc blocks and we are forced to remove them.
+        */
+       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+               if (!force || ip->i_delayed_blks == 0)
+                       return false;
+
+       return true;
+}
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ */
+int
+xfs_free_eofblocks(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *ip,
+       bool            need_iolock)
+{
+       xfs_trans_t     *tp;
+       int             error;
+       xfs_fileoff_t   end_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_filblks_t   map_len;
+       int             nimaps;
+       xfs_bmbt_irec_t imap;
+
+       /*
+        * Figure out if there are any blocks beyond the end
+        * of the file.  If not, then there is nothing to do.
+        */
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+       last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+       if (last_fsb <= end_fsb)
+               return 0;
+       map_len = last_fsb - end_fsb;
+
+       nimaps = 1;
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       if (!error && (nimaps != 0) &&
+           (imap.br_startblock != HOLESTARTBLOCK ||
+            ip->i_delayed_blks)) {
+               /*
+                * Attach the dquots to the inode up front.
+                */
+               error = xfs_qm_dqattach(ip, 0);
+               if (error)
+                       return error;
+
+               /*
+                * There are blocks after the end of file.
+                * Free them up now by truncating the file to
+                * its current size.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+               if (need_iolock) {
+                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+                               xfs_trans_cancel(tp, 0);
+                               return EAGAIN;
+                       }
+               }
+
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               if (error) {
+                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       if (need_iolock)
+                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                       return error;
+               }
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+
+               /*
+                * Do not update the on-disk file size.  If we update the
+                * on-disk file size and then the system crashes before the
+                * contents of the file are flushed to disk then the files
+                * may be full of holes (ie NULL files bug).
+                */
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+                                             XFS_ISIZE(ip));
+               if (error) {
+                       /*
+                        * If we get an error at this point we simply don't
+                        * bother truncating the file.
+                        */
+                       xfs_trans_cancel(tp,
+                                        (XFS_TRANS_RELEASE_LOG_RES |
+                                         XFS_TRANS_ABORT));
+               } else {
+                       error = xfs_trans_commit(tp,
+                                               XFS_TRANS_RELEASE_LOG_RES);
+                       if (!error)
+                               xfs_inode_clear_eofblocks_tag(ip);
+               }
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (need_iolock)
+                       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       }
+       return error;
+}
+
+/*
+ * xfs_alloc_file_space()
+ *      This routine allocates disk space for the given file.
+ *
+ *     If alloc_type == 0, this request is for an ALLOCSP type
+ *     request which will change the file size.  In this case, no
+ *     DMAPI event will be generated by the call.  A TRUNCATE event
+ *     will be generated later by xfs_setattr.
+ *
+ *     If alloc_type != 0, this request is for a RESVSP type
+ *     request, and a DMAPI DM_EVENT_WRITE will be generated if the
+ *     lower block boundary byte address is less than the file's
+ *     length.
+ *
+ * RETURNS:
+ *       0 on success
+ *      errno on error
+ *
+ */
+STATIC int
+xfs_alloc_file_space(
+       xfs_inode_t             *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     alloc_type,
+       int                     attr_flags)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       xfs_off_t               count;
+       xfs_filblks_t           allocated_fsb;
+       xfs_filblks_t           allocatesize_fsb;
+       xfs_extlen_t            extsz, temp;
+       xfs_fileoff_t           startoffset_fsb;
+       xfs_fsblock_t           firstfsb;
+       int                     nimaps;
+       int                     quota_flag;
+       int                     rt;
+       xfs_trans_t             *tp;
+       xfs_bmbt_irec_t         imaps[1], *imapp;
+       xfs_bmap_free_t         free_list;
+       uint                    qblocks, resblks, resrtextents;
+       int                     committed;
+       int                     error;
+
+       trace_xfs_alloc_file_space(ip);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return error;
+
+       if (len <= 0)
+               return XFS_ERROR(EINVAL);
+
+       rt = XFS_IS_REALTIME_INODE(ip);
+       extsz = xfs_get_extsz_hint(ip);
+
+       count = len;
+       imapp = &imaps[0];
+       nimaps = 1;
+       startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+       allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+       /*
+        * Allocate file space until done or until there is an error
+        */
+       while (allocatesize_fsb && !error) {
+               xfs_fileoff_t   s, e;
+
+               /*
+                * Determine space reservations for data/realtime.
+                */
+               if (unlikely(extsz)) {
+                       s = startoffset_fsb;
+                       do_div(s, extsz);
+                       s *= extsz;
+                       e = startoffset_fsb + allocatesize_fsb;
+                       if ((temp = do_mod(startoffset_fsb, extsz)))
+                               e += temp;
+                       if ((temp = do_mod(e, extsz)))
+                               e += extsz - temp;
+               } else {
+                       s = 0;
+                       e = allocatesize_fsb;
+               }
+
+               /*
+                * The transaction reservation is limited to a 32-bit block
+                * count, hence we need to limit the number of blocks we are
+                * trying to reserve to avoid an overflow. We can't allocate
+                * more than @nimaps extents, and an extent is limited on disk
+                * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+                */
+               resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+               if (unlikely(rt)) {
+                       resrtextents = qblocks = resblks;
+                       resrtextents /= mp->m_sb.sb_rextsize;
+                       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+                       quota_flag = XFS_QMOPT_RES_RTBLKS;
+               } else {
+                       resrtextents = 0;
+                       resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+                       quota_flag = XFS_QMOPT_RES_REGBLKS;
+               }
+
+               /*
+                * Allocate and setup the transaction.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                         resblks, resrtextents);
+               /*
+                * Check for running out of space
+                */
+               if (error) {
+                       /*
+                        * Free the transaction structure.
+                        */
+                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       break;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+                                                     0, quota_flag);
+               if (error)
+                       goto error1;
+
+               xfs_trans_ijoin(tp, ip, 0);
+
+               xfs_bmap_init(&free_list, &firstfsb);
+               error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+                                       allocatesize_fsb, alloc_type, &firstfsb,
+                                       0, imapp, &nimaps, &free_list);
+               if (error) {
+                       goto error0;
+               }
+
+               /*
+                * Complete the transaction
+                */
+               error = xfs_bmap_finish(&tp, &free_list, &committed);
+               if (error) {
+                       goto error0;
+               }
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error) {
+                       break;
+               }
+
+               allocated_fsb = imapp->br_blockcount;
+
+               if (nimaps == 0) {
+                       error = XFS_ERROR(ENOSPC);
+                       break;
+               }
+
+               startoffset_fsb += allocated_fsb;
+               allocatesize_fsb -= allocated_fsb;
+       }
+
+       return error;
+
+error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+
+error1:        /* Just cancel transaction */
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+       xfs_inode_t             *ip,
+       xfs_off_t               startoff,
+       xfs_off_t               endoff)
+{
+       xfs_bmbt_irec_t         imap;
+       xfs_fileoff_t           offset_fsb;
+       xfs_off_t               lastoffset;
+       xfs_off_t               offset;
+       xfs_buf_t               *bp;
+       xfs_mount_t             *mp = ip->i_mount;
+       int                     nimap;
+       int                     error = 0;
+
+       /*
+        * Avoid doing I/O beyond eof - it's not necessary
+        * since nothing can read beyond eof.  The space will
+        * be zeroed when the file is extended anyway.
+        */
+       if (startoff >= XFS_ISIZE(ip))
+               return 0;
+
+       if (endoff > XFS_ISIZE(ip))
+               endoff = XFS_ISIZE(ip);
+
+       bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+                                       mp->m_rtdev_targp : mp->m_ddev_targp,
+                                 BTOBB(mp->m_sb.sb_blocksize), 0);
+       if (!bp)
+               return XFS_ERROR(ENOMEM);
+
+       xfs_buf_unlock(bp);
+
+       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+               offset_fsb = XFS_B_TO_FSBT(mp, offset);
+               nimap = 1;
+               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+               if (error || nimap < 1)
+                       break;
+               ASSERT(imap.br_blockcount >= 1);
+               ASSERT(imap.br_startoff == offset_fsb);
+               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+               if (lastoffset > endoff)
+                       lastoffset = endoff;
+               if (imap.br_startblock == HOLESTARTBLOCK)
+                       continue;
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               if (imap.br_state == XFS_EXT_UNWRITTEN)
+                       continue;
+               XFS_BUF_UNDONE(bp);
+               XFS_BUF_UNWRITE(bp);
+               XFS_BUF_READ(bp);
+               XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
+               xfsbdstrat(mp, bp);
+               error = xfs_buf_iowait(bp);
+               if (error) {
+                       xfs_buf_ioerror_alert(bp,
+                                       "xfs_zero_remaining_bytes(read)");
+                       break;
+               }
+               memset(bp->b_addr +
+                       (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+                     0, lastoffset - offset + 1);
+               XFS_BUF_UNDONE(bp);
+               XFS_BUF_UNREAD(bp);
+               XFS_BUF_WRITE(bp);
+               xfsbdstrat(mp, bp);
+               error = xfs_buf_iowait(bp);
+               if (error) {
+                       xfs_buf_ioerror_alert(bp,
+                                       "xfs_zero_remaining_bytes(write)");
+                       break;
+               }
+       }
+       xfs_buf_free(bp);
+       return error;
+}
+
+/*
+ * xfs_free_file_space()
+ *      This routine frees disk space for the given file.
+ *
+ *     This routine is only called by xfs_change_file_space
+ *     for an UNRESVSP type call.
+ *
+ * RETURNS:
+ *       0 on success
+ *      errno on error
+ *
+ */
+STATIC int
+xfs_free_file_space(
+       xfs_inode_t             *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     attr_flags)
+{
+       int                     committed;
+       int                     done;
+       xfs_fileoff_t           endoffset_fsb;
+       int                     error;
+       xfs_fsblock_t           firstfsb;
+       xfs_bmap_free_t         free_list;
+       xfs_bmbt_irec_t         imap;
+       xfs_off_t               ioffset;
+       xfs_extlen_t            mod=0;
+       xfs_mount_t             *mp;
+       int                     nimap;
+       uint                    resblks;
+       xfs_off_t               rounding;
+       int                     rt;
+       xfs_fileoff_t           startoffset_fsb;
+       xfs_trans_t             *tp;
+       int                     need_iolock = 1;
+
+       mp = ip->i_mount;
+
+       trace_xfs_free_file_space(ip);
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return error;
+
+       error = 0;
+       if (len <= 0)   /* if nothing being freed */
+               return error;
+       rt = XFS_IS_REALTIME_INODE(ip);
+       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+       if (attr_flags & XFS_ATTR_NOLOCK)
+               need_iolock = 0;
+       if (need_iolock) {
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+               /* wait for the completion of any pending DIOs */
+               inode_dio_wait(VFS_I(ip));
+       }
+
+       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+       ioffset = offset & ~(rounding - 1);
+       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+                                             ioffset, -1);
+       if (error)
+               goto out_unlock_iolock;
+       truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+
+       /*
+        * Need to zero the stuff we're not freeing, on disk.
+        * If it's a realtime file & can't use unwritten extents then we
+        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
+        * will take care of it for us.
+        */
+       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+               nimap = 1;
+               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+                                       &imap, &nimap, 0);
+               if (error)
+                       goto out_unlock_iolock;
+               ASSERT(nimap == 0 || nimap == 1);
+               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+                       xfs_daddr_t     block;
+
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       block = imap.br_startblock;
+                       mod = do_div(block, mp->m_sb.sb_rextsize);
+                       if (mod)
+                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+               }
+               nimap = 1;
+               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+                                       &imap, &nimap, 0);
+               if (error)
+                       goto out_unlock_iolock;
+               ASSERT(nimap == 0 || nimap == 1);
+               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       mod++;
+                       if (mod && (mod != mp->m_sb.sb_rextsize))
+                               endoffset_fsb -= mod;
+               }
+       }
+       if ((done = (endoffset_fsb <= startoffset_fsb)))
+               /*
+                * One contiguous piece to clear
+                */
+               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+       else {
+               /*
+                * Some full blocks, possibly two pieces to clear
+                */
+               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+                       error = xfs_zero_remaining_bytes(ip, offset,
+                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+               if (!error &&
+                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+                       error = xfs_zero_remaining_bytes(ip,
+                               XFS_FSB_TO_B(mp, endoffset_fsb),
+                               offset + len - 1);
+       }
+
+       /*
+        * free file space until done or until there is an error
+        */
+       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+       while (!error && !done) {
+
+               /*
+                * allocate and setup the transaction. Allow this
+                * transaction to dip into the reserve blocks to ensure
+                * the freeing of the space succeeds at ENOSPC.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+               tp->t_flags |= XFS_TRANS_RESERVE;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+
+               /*
+                * check for running out of space
+                */
+               if (error) {
+                       /*
+                        * Free the transaction structure.
+                        */
+                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       break;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_trans_reserve_quota(tp, mp,
+                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
+               if (error)
+                       goto error1;
+
+               xfs_trans_ijoin(tp, ip, 0);
+
+               /*
+                * issue the bunmapi() call to free the blocks
+                */
+               xfs_bmap_init(&free_list, &firstfsb);
+               error = xfs_bunmapi(tp, ip, startoffset_fsb,
+                                 endoffset_fsb - startoffset_fsb,
+                                 0, 2, &firstfsb, &free_list, &done);
+               if (error) {
+                       goto error0;
+               }
+
+               /*
+                * complete the transaction
+                */
+               error = xfs_bmap_finish(&tp, &free_list, &committed);
+               if (error) {
+                       goto error0;
+               }
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+
+ out_unlock_iolock:
+       if (need_iolock)
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+
+ error0:
+       xfs_bmap_cancel(&free_list);
+ error1:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
+                   XFS_ILOCK_EXCL);
+       return error;
+}
+
+
+STATIC int
+xfs_zero_file_space(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     attr_flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       uint                    granularity;
+       xfs_off_t               start_boundary;
+       xfs_off_t               end_boundary;
+       int                     error;
+
+       granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+       /*
+        * Round the range of extents we are going to convert inwards.  If the
+        * offset is aligned, then it doesn't get changed so we zero from the
+        * start of the block offset points to.
+        */
+       start_boundary = round_up(offset, granularity);
+       end_boundary = round_down(offset + len, granularity);
+
+       ASSERT(start_boundary >= offset);
+       ASSERT(end_boundary <= offset + len);
+
+       if (!(attr_flags & XFS_ATTR_NOLOCK))
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       if (start_boundary < end_boundary - 1) {
+               /* punch out the page cache over the conversion range */
+               truncate_pagecache_range(VFS_I(ip), start_boundary,
+                                        end_boundary - 1);
+               /* convert the blocks */
+               error = xfs_alloc_file_space(ip, start_boundary,
+                                       end_boundary - start_boundary - 1,
+                                       XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+                                       attr_flags);
+               if (error)
+                       goto out_unlock;
+
+               /* We've handled the interior of the range, now for the edges */
+               if (start_boundary != offset)
+                       error = xfs_iozero(ip, offset, start_boundary - offset);
+               if (error)
+                       goto out_unlock;
+
+               if (end_boundary != offset + len)
+                       error = xfs_iozero(ip, end_boundary,
+                                          offset + len - end_boundary);
+
+       } else {
+               /*
+                * It's either a sub-granularity range or the range spanned lies
+                * partially across two adjacent blocks.
+                */
+               error = xfs_iozero(ip, offset, len);
+       }
+
+out_unlock:
+       if (!(attr_flags & XFS_ATTR_NOLOCK))
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+
+}
+
+/*
+ * xfs_change_file_space()
+ *      This routine allocates or frees disk space for the given file.
+ *      The user specified parameters are checked for alignment and size
+ *      limitations.
+ *
+ * RETURNS:
+ *       0 on success
+ *      errno on error
+ *
+ */
+int
+xfs_change_file_space(
+       xfs_inode_t     *ip,
+       int             cmd,
+       xfs_flock64_t   *bf,
+       xfs_off_t       offset,
+       int             attr_flags)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       int             clrprealloc;
+       int             error;
+       xfs_fsize_t     fsize;
+       int             setprealloc;
+       xfs_off_t       startoffset;
+       xfs_trans_t     *tp;
+       struct iattr    iattr;
+
+       if (!S_ISREG(ip->i_d.di_mode))
+               return XFS_ERROR(EINVAL);
+
+       switch (bf->l_whence) {
+       case 0: /*SEEK_SET*/
+               break;
+       case 1: /*SEEK_CUR*/
+               bf->l_start += offset;
+               break;
+       case 2: /*SEEK_END*/
+               bf->l_start += XFS_ISIZE(ip);
+               break;
+       default:
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * length of <= 0 for resv/unresv/zero is invalid.  length for
+        * alloc/free is ignored completely and we have no idea what userspace
+        * might have set it to, so set it to zero to allow range
+        * checks to pass.
+        */
+       switch (cmd) {
+       case XFS_IOC_ZERO_RANGE:
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_RESVSP64:
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_UNRESVSP64:
+               if (bf->l_len <= 0)
+                       return XFS_ERROR(EINVAL);
+               break;
+       default:
+               bf->l_len = 0;
+               break;
+       }
+
+       if (bf->l_start < 0 ||
+           bf->l_start > mp->m_super->s_maxbytes ||
+           bf->l_start + bf->l_len < 0 ||
+           bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
+               return XFS_ERROR(EINVAL);
+
+       bf->l_whence = 0;
+
+       startoffset = bf->l_start;
+       fsize = XFS_ISIZE(ip);
+
+       setprealloc = clrprealloc = 0;
+       switch (cmd) {
+       case XFS_IOC_ZERO_RANGE:
+               error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+                                               attr_flags);
+               if (error)
+                       return error;
+               setprealloc = 1;
+               break;
+
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_RESVSP64:
+               error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+                                               XFS_BMAPI_PREALLOC, attr_flags);
+               if (error)
+                       return error;
+               setprealloc = 1;
+               break;
+
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_UNRESVSP64:
+               if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+                                                               attr_flags)))
+                       return error;
+               break;
+
+       case XFS_IOC_ALLOCSP:
+       case XFS_IOC_ALLOCSP64:
+       case XFS_IOC_FREESP:
+       case XFS_IOC_FREESP64:
+               /*
+                * These operations actually do IO when extending the file, but
+                * the allocation is done seperately to the zeroing that is
+                * done. This set of operations need to be serialised against
+                * other IO operations, such as truncate and buffered IO. We
+                * need to take the IOLOCK here to serialise the allocation and
+                * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
+                * truncate, direct IO) from racing against the transient
+                * allocated but not written state we can have here.
+                */
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+               if (startoffset > fsize) {
+                       error = xfs_alloc_file_space(ip, fsize,
+                                       startoffset - fsize, 0,
+                                       attr_flags | XFS_ATTR_NOLOCK);
+                       if (error) {
+                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                               break;
+                       }
+               }
+
+               iattr.ia_valid = ATTR_SIZE;
+               iattr.ia_size = startoffset;
+
+               error = xfs_setattr_size(ip, &iattr,
+                                        attr_flags | XFS_ATTR_NOLOCK);
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+               if (error)
+                       return error;
+
+               clrprealloc = 1;
+               break;
+
+       default:
+               ASSERT(0);
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * update the inode timestamp, mode, and prealloc flag bits
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       if ((attr_flags & XFS_ATTR_DMI) == 0) {
+               ip->i_d.di_mode &= ~S_ISUID;
+
+               /*
+                * Note that we don't have to worry about mandatory
+                * file locking being disabled here because we only
+                * clear the S_ISGID bit if the Group execute bit is
+                * on, but if it was on then mandatory locking wouldn't
+                * have been enabled.
+                */
+               if (ip->i_d.di_mode & S_IXGRP)
+                       ip->i_d.di_mode &= ~S_ISGID;
+
+               xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       }
+       if (setprealloc)
+               ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+       else if (clrprealloc)
+               ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       if (attr_flags & XFS_ATTR_SYNC)
+               xfs_trans_set_sync(tp);
+       return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. basically we are putting the responsibility on
+ * userspace to get this right.
+ */
+static int
+xfs_swap_extents_check_format(
+       xfs_inode_t     *ip,    /* target inode */
+       xfs_inode_t     *tip)   /* tmp inode */
+{
+
+       /* Should never get a local format */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+           tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+               return EINVAL;
+
+       /*
+        * if the target inode has less extents that then temporary inode then
+        * why did userspace call us?
+        */
+       if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+               return EINVAL;
+
+       /*
+        * if the target inode is in extent form and the temp inode is in btree
+        * form then we will end up with the target inode in the wrong format
+        * as we already know there are less extents in the temp inode.
+        */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               return EINVAL;
+
+       /* Check temp in extent form to max in target */
+       if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+               return EINVAL;
+
+       /* Check target in extent form to max in temp */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+               return EINVAL;
+
+       /*
+        * If we are in a btree format, check that the temp root block will fit
+        * in the target and that it has enough extents to be in btree format
+        * in the target.
+        *
+        * Note that we have to be careful to allow btree->extent conversions
+        * (a common defrag case) which will occur when the temp inode is in
+        * extent format...
+        */
+       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(ip) &&
+                   XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
+
+       /* Reciprocal target->temp btree format checks */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(tip) &&
+                   XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
+
+       return 0;
+}
+
+int
+xfs_swap_extents(
+       xfs_inode_t     *ip,    /* target inode */
+       xfs_inode_t     *tip,   /* tmp inode */
+       xfs_swapext_t   *sxp)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_trans_t     *tp;
+       xfs_bstat_t     *sbp = &sxp->sx_stat;
+       xfs_ifork_t     *tempifp, *ifp, *tifp;
+       int             src_log_flags, target_log_flags;
+       int             error = 0;
+       int             aforkblks = 0;
+       int             taforkblks = 0;
+       __uint64_t      tmp;
+
+       /*
+        * We have no way of updating owner information in the BMBT blocks for
+        * each inode on CRC enabled filesystems, so to avoid corrupting the
+        * this metadata we simply don't allow extent swaps to occur.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return XFS_ERROR(EINVAL);
+
+       tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+       if (!tempifp) {
+               error = XFS_ERROR(ENOMEM);
+               goto out;
+       }
+
+       /*
+        * we have to do two separate lock calls here to keep lockdep
+        * happy. If we try to get all the locks in one call, lock will
+        * report false positives when we drop the ILOCK and regain them
+        * below.
+        */
+       xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+       /* Verify that both files have the same format */
+       if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       /* Verify both files are either real-time or non-realtime */
+       if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+       if (error)
+               goto out_unlock;
+       truncate_pagecache_range(VFS_I(tip), 0, -1);
+
+       /* Verify O_DIRECT for ftmp */
+       if (VN_CACHED(VFS_I(tip)) != 0) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       /* Verify all data are being swapped */
+       if (sxp->sx_offset != 0 ||
+           sxp->sx_length != ip->i_d.di_size ||
+           sxp->sx_length != tip->i_d.di_size) {
+               error = XFS_ERROR(EFAULT);
+               goto out_unlock;
+       }
+
+       trace_xfs_swap_extent_before(ip, 0);
+       trace_xfs_swap_extent_before(tip, 1);
+
+       /* check inode formats now that data is flushed */
+       error = xfs_swap_extents_check_format(ip, tip);
+       if (error) {
+               xfs_notice(mp,
+                   "%s: inode 0x%llx format is incompatible for exchanging.",
+                               __func__, ip->i_ino);
+               goto out_unlock;
+       }
+
+       /*
+        * Compare the current change & modify times with that
+        * passed in.  If they differ, we abort this swap.
+        * This is the mechanism used to ensure the calling
+        * process that the file was not changed out from
+        * under it.
+        */
+       if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+           (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+           (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+           (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+               error = XFS_ERROR(EBUSY);
+               goto out_unlock;
+       }
+
+       /* We need to fail if the file is memory mapped.  Once we have tossed
+        * all existing pages, the page fault will have no option
+        * but to go to the filesystem for pages. By making the page fault call
+        * vop_read (or write in the case of autogrow) they block on the iolock
+        * until we have switched the extents.
+        */
+       if (VN_MAPPED(VFS_I(ip))) {
+               error = XFS_ERROR(EBUSY);
+               goto out_unlock;
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+       /*
+        * There is a race condition here since we gave up the
+        * ilock.  However, the data fork will not change since
+        * we have the iolock (locked for truncation too) so we
+        * are safe.  We don't really care if non-io related
+        * fields change.
+        */
+       truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       if (error) {
+               xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
+               xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+               xfs_trans_cancel(tp, 0);
+               goto out;
+       }
+       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+       /*
+        * Count the number of extended attribute blocks
+        */
+       if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+            (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+               error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+               if (error)
+                       goto out_trans_cancel;
+       }
+       if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+            (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+               error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+                       &taforkblks);
+               if (error)
+                       goto out_trans_cancel;
+       }
+
+       /*
+        * Swap the data forks of the inodes
+        */
+       ifp = &ip->i_df;
+       tifp = &tip->i_df;
+       *tempifp = *ifp;        /* struct copy */
+       *ifp = *tifp;           /* struct copy */
+       *tifp = *tempifp;       /* struct copy */
+
+       /*
+        * Fix the on-disk inode values
+        */
+       tmp = (__uint64_t)ip->i_d.di_nblocks;
+       ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+       tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+       tmp = (__uint64_t) ip->i_d.di_nextents;
+       ip->i_d.di_nextents = tip->i_d.di_nextents;
+       tip->i_d.di_nextents = tmp;
+
+       tmp = (__uint64_t) ip->i_d.di_format;
+       ip->i_d.di_format = tip->i_d.di_format;
+       tip->i_d.di_format = tmp;
+
+       /*
+        * The extents in the source inode could still contain speculative
+        * preallocation beyond EOF (e.g. the file is open but not modified
+        * while defrag is in progress). In that case, we need to copy over the
+        * number of delalloc blocks the data fork in the source inode is
+        * tracking beyond EOF so that when the fork is truncated away when the
+        * temporary inode is unlinked we don't underrun the i_delayed_blks
+        * counter on that inode.
+        */
+       ASSERT(tip->i_delayed_blks == 0);
+       tip->i_delayed_blks = ip->i_delayed_blks;
+       ip->i_delayed_blks = 0;
+
+       src_log_flags = XFS_ILOG_CORE;
+       switch (ip->i_d.di_format) {
+       case XFS_DINODE_FMT_EXTENTS:
+               /* If the extents fit in the inode, fix the
+                * pointer.  Otherwise it's already NULL or
+                * pointing to the extent.
+                */
+               if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+                       ifp->if_u1.if_extents =
+                               ifp->if_u2.if_inline_ext;
+               }
+               src_log_flags |= XFS_ILOG_DEXT;
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               src_log_flags |= XFS_ILOG_DBROOT;
+               break;
+       }
+
+       target_log_flags = XFS_ILOG_CORE;
+       switch (tip->i_d.di_format) {
+       case XFS_DINODE_FMT_EXTENTS:
+               /* If the extents fit in the inode, fix the
+                * pointer.  Otherwise it's already NULL or
+                * pointing to the extent.
+                */
+               if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+                       tifp->if_u1.if_extents =
+                               tifp->if_u2.if_inline_ext;
+               }
+               target_log_flags |= XFS_ILOG_DEXT;
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               target_log_flags |= XFS_ILOG_DBROOT;
+               break;
+       }
+
+
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+       xfs_trans_log_inode(tp, ip,  src_log_flags);
+       xfs_trans_log_inode(tp, tip, target_log_flags);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        */
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(tp);
+
+       error = xfs_trans_commit(tp, 0);
+
+       trace_xfs_swap_extent_after(ip, 0);
+       trace_xfs_swap_extent_after(tip, 1);
+out:
+       kmem_free(tempifp);
+       return error;
+
+out_unlock:
+       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       goto out;
+
+out_trans_cancel:
+       xfs_trans_cancel(tp, 0);
+       goto out_unlock;
+}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h

new file mode 100644 (file)

index 0000000..0612609
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define        __XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_bmap_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+       xfs_fsblock_t           *firstblock; /* i/o first block allocated */
+       struct xfs_bmap_free    *flist; /* bmap freelist */
+       struct xfs_trans        *tp;    /* transaction pointer */
+       struct xfs_inode        *ip;    /* incore inode pointer */
+       struct xfs_bmbt_irec    prev;   /* extent before the new one */
+       struct xfs_bmbt_irec    got;    /* extent after, or delayed */
+
+       xfs_fileoff_t           offset; /* offset in file filling in */
+       xfs_extlen_t            length; /* i/o length asked/allocated */
+       xfs_fsblock_t           blkno;  /* starting block of new extent */
+
+       struct xfs_btree_cur    *cur;   /* btree cursor */
+       xfs_extnum_t            idx;    /* current extent index */
+       int                     nallocs;/* number of extents alloc'd */
+       int                     logflags;/* flags for transaction logging */
+
+       xfs_extlen_t            total;  /* total blocks needed for xaction */
+       xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
+       xfs_extlen_t            minleft; /* amount must be left after alloc */
+       char                    eof;    /* set if allocating past last extent */
+       char                    wasdel; /* replacing a delayed allocation */
+       char                    userdata;/* set if is user data */
+       char                    aeof;   /* allocated space at eof */
+       char                    conv;   /* overwriting unwritten extents */
+       char                    stack_switch;
+       int                     flags;
+       struct completion       *done;
+       struct work_struct      work;
+       int                     result;
+};
+
+int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+                       int *committed);
+int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+int    xfs_bmapi_allocate(struct xfs_bmalloca *args);
+int    __xfs_bmapi_allocate(struct xfs_bmalloca *args);
+int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+                    int whichfork, int *eof);
+int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+                             int whichfork, int *count);
+int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+               xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+int    xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+               xfs_bmap_format_t formatter, void *arg);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+void   xfs_bmap_del_free(struct xfs_bmap_free *flist,
+                         struct xfs_bmap_free_item *prev,
+                         struct xfs_bmap_free_item *free);
+int    xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+                              struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+                              int rt, int eof, int delay, int convert,
+                              xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void   xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int    xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+                            int whichfork, struct xfs_bmbt_irec *rec,
+                            int *is_empty);
+
+/* preallocation and hole punch interface */
+int    xfs_change_file_space(struct xfs_inode *ip, int cmd,
+                             xfs_flock64_t *bf, xfs_off_t offset,
+                             int attr_flags);
+
+/* EOF block manipulation functions */
+bool   xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int    xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+                          bool need_iolock);
+
+int    xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+                        struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c

index 0903960410a255c171a7b663ffb3ba4af9137074..7a2b4da3c0db9a0f77f19a30cea966848ed60cef 100644 (file)
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -510,7 +510,7 @@ xfs_btree_ptr_addr(
  }
  
  /*
- * Get a the root block which is stored in the inode.
+ * Get the root block which is stored in the inode.
   *
   * For now this btree implementation assumes the btree root is always
   * stored in the if_broot field of an inode fork.
@@ -978,6 +978,7 @@ xfs_btree_init_block_int(
                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
                         buf->bb_u.l.bb_pad = 0;
+                       buf->bb_u.l.bb_lsn = 0;
                 }
         } else {
                 /* owner is a 32 bit value on short blocks */
@@ -989,6 +990,7 @@ xfs_btree_init_block_int(
                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+                       buf->bb_u.s.bb_lsn = 0;
                 }
         }
  }
@@ -1684,7 +1686,7 @@ xfs_lookup_get_search_key(
  
  /*
   * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * stat is set to 0 if can't find any such record, 1 for success.
   */
  int                                    /* error */
  xfs_btree_lookup(
@@ -2756,7 +2758,6 @@ xfs_btree_make_block_unfull(
  
                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
                         /* A root block that can be made bigger. */
-
                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
                 } else {
                         /* A root block that needs replacing */
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h

index 55e3c7cc3c3d3f22178fb1feb8aab44679821172..c8473c7ef45e4c764fd61eb1bf6419cb1d98f4ea 100644 (file)
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -88,13 +88,11 @@ struct xfs_btree_block {
  #define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
  #define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
  
-
  #define XFS_BTREE_SBLOCK_CRC_OFF \
         offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
  #define XFS_BTREE_LBLOCK_CRC_OFF \
         offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
  
-
  /*
   * Generic key, ptr and record wrapper structures.
   *
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c

index 1b2472a46e46b96e31e0615f670120218ed7cf24..c06823fe10d3559c143c3608b04815efd6252631 100644 (file)
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,6 +35,7 @@
  #include <linux/freezer.h>
  
  #include "xfs_sb.h"
+#include "xfs_trans_resv.h"
  #include "xfs_log.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
@@ -303,7 +304,7 @@ _xfs_buf_free_pages(
   *     Releases the specified buffer.
   *
   *     The modification state of any associated pages is left unchanged.
- *     The buffer most not be on any hash - use xfs_buf_rele instead for
+ *     The buffer must not be on any hash - use xfs_buf_rele instead for
   *     hashed and refcounted buffers
   */
  void
@@ -1621,7 +1622,7 @@ xfs_setsize_buftarg_flags(
  /*
   *     When allocating the initial buffer target we have not yet
   *     read in the superblock, so don't know what sized sectors
- *     are being used is at this early stage.  Play safe.
+ *     are being used at this early stage.  Play safe.
   */
  STATIC int
  xfs_setsize_buftarg_early(
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c

index bfc4e0c26fd3404fb36f007da344be79543aea4c..3a944b198e35a0fbfc758a84fd39a2e0674d360f 100644 (file)
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -39,6 +39,14 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
  
  STATIC void    xfs_buf_do_callbacks(struct xfs_buf *bp);
  
+static inline int
+xfs_buf_log_format_size(
+       struct xfs_buf_log_format *blfp)
+{
+       return offsetof(struct xfs_buf_log_format, blf_data_map) +
+                       (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+}
+
  /*
   * This returns the number of log iovecs needed to log the
   * given buf log item.
@@ -49,25 +57,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
   *
   * If the XFS_BLI_STALE flag has been set, then log nothing.
   */
-STATIC uint
+STATIC void
  xfs_buf_item_size_segment(
         struct xfs_buf_log_item *bip,
-       struct xfs_buf_log_format *blfp)
+       struct xfs_buf_log_format *blfp,
+       int                     *nvecs,
+       int                     *nbytes)
  {
         struct xfs_buf          *bp = bip->bli_buf;
-       uint                    nvecs;
         int                     next_bit;
         int                     last_bit;
  
         last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
         if (last_bit == -1)
-               return 0;
+               return;
  
         /*
          * initial count for a dirty buffer is 2 vectors - the format structure
          * and the first dirty region.
          */
-       nvecs = 2;
+       *nvecs += 2;
+       *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
  
         while (last_bit != -1) {
                 /*
@@ -87,18 +97,17 @@ xfs_buf_item_size_segment(
                         break;
                 } else if (next_bit != last_bit + 1) {
                         last_bit = next_bit;
-                       nvecs++;
+                       (*nvecs)++;
                 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
                            (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
                             XFS_BLF_CHUNK)) {
                         last_bit = next_bit;
-                       nvecs++;
+                       (*nvecs)++;
                 } else {
                         last_bit++;
                 }
+               *nbytes += XFS_BLF_CHUNK;
         }
-
-       return nvecs;
  }
  
  /*
@@ -118,12 +127,13 @@ xfs_buf_item_size_segment(
   * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
   * format structures.
   */
-STATIC uint
+STATIC void
  xfs_buf_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
         struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-       uint                    nvecs;
         int                     i;
  
         ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -135,7 +145,11 @@ xfs_buf_item_size(
                  */
                 trace_xfs_buf_item_size_stale(bip);
                 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
-               return bip->bli_format_count;
+               *nvecs += bip->bli_format_count;
+               for (i = 0; i < bip->bli_format_count; i++) {
+                       *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+               }
+               return;
         }
  
         ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
@@ -147,7 +161,8 @@ xfs_buf_item_size(
                  * commit, so no vectors are used at all.
                  */
                 trace_xfs_buf_item_size_ordered(bip);
-               return XFS_LOG_VEC_ORDERED;
+               *nvecs = XFS_LOG_VEC_ORDERED;
+               return;
         }
  
         /*
@@ -159,13 +174,11 @@ xfs_buf_item_size(
          * count for the extra buf log format structure that will need to be
          * written.
          */
-       nvecs = 0;
         for (i = 0; i < bip->bli_format_count; i++) {
-               nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]);
+               xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
+                                         nvecs, nbytes);
         }
-
         trace_xfs_buf_item_size(bip);
-       return nvecs;
  }
  
  static struct xfs_log_iovec *
@@ -192,8 +205,7 @@ xfs_buf_item_format_segment(
          * the actual size of the dirty bitmap rather than the size of the in
          * memory structure.
          */
-       base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
-                       (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+       base_size = xfs_buf_log_format_size(blfp);
  
         nvecs = 0;
         first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
@@ -601,11 +613,9 @@ xfs_buf_item_unlock(
                         }
                 }
         }
-       if (clean)
-               xfs_buf_item_relse(bp);
-       else if (aborted) {
+       if (clean || aborted) {
                 if (atomic_dec_and_test(&bip->bli_refcount)) {
-                       ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                       ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp));
                         xfs_buf_item_relse(bp);
                 }
         } else
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h

index 0f1c247dc680031fe06554a4bd41f6f962290a53..db6371087fe8ea9b786f82189b387c55979a2460 100644 (file)
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -18,101 +18,9 @@
  #ifndef        __XFS_BUF_ITEM_H__
  #define        __XFS_BUF_ITEM_H__
  
-extern kmem_zone_t     *xfs_buf_item_zone;
-
-/*
- * This flag indicates that the buffer contains on disk inodes
- * and requires special recovery handling.
- */
-#define        XFS_BLF_INODE_BUF       (1<<0)
-/*
- * This flag indicates that the buffer should not be replayed
- * during recovery because its blocks are being freed.
- */
-#define        XFS_BLF_CANCEL          (1<<1)
-
-/*
- * This flag indicates that the buffer contains on disk
- * user or group dquots and may require special recovery handling.
- */
-#define        XFS_BLF_UDQUOT_BUF      (1<<2)
-#define XFS_BLF_PDQUOT_BUF     (1<<3)
-#define        XFS_BLF_GDQUOT_BUF      (1<<4)
-
-#define        XFS_BLF_CHUNK           128
-#define        XFS_BLF_SHIFT           7
-#define        BIT_TO_WORD_SHIFT       5
-#define        NBWORD                  (NBBY * sizeof(unsigned int))
-
-/*
- * This is the structure used to lay out a buf log item in the
- * log.  The data map describes which 128 byte chunks of the buffer
- * have been logged.
- */
-#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+/* kernel only definitions */
  
-typedef struct xfs_buf_log_format {
-       unsigned short  blf_type;       /* buf log item type indicator */
-       unsigned short  blf_size;       /* size of this item */
-       ushort          blf_flags;      /* misc state */
-       ushort          blf_len;        /* number of blocks in this buf */
-       __int64_t       blf_blkno;      /* starting blkno of this buf */
-       unsigned int    blf_map_size;   /* used size of data bitmap in words */
-       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
-
-/*
- * All buffers now need to tell recovery where the magic number
- * is so that it can verify and calculate the CRCs on the buffer correctly
- * once the changes have been replayed into the buffer.
- *
- * The type value is held in the upper 5 bits of the blf_flags field, which is
- * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
- */
-#define XFS_BLFT_BITS  5
-#define XFS_BLFT_SHIFT 11
-#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
-
-enum xfs_blft {
-       XFS_BLFT_UNKNOWN_BUF = 0,
-       XFS_BLFT_UDQUOT_BUF,
-       XFS_BLFT_PDQUOT_BUF,
-       XFS_BLFT_GDQUOT_BUF,
-       XFS_BLFT_BTREE_BUF,
-       XFS_BLFT_AGF_BUF,
-       XFS_BLFT_AGFL_BUF,
-       XFS_BLFT_AGI_BUF,
-       XFS_BLFT_DINO_BUF,
-       XFS_BLFT_SYMLINK_BUF,
-       XFS_BLFT_DIR_BLOCK_BUF,
-       XFS_BLFT_DIR_DATA_BUF,
-       XFS_BLFT_DIR_FREE_BUF,
-       XFS_BLFT_DIR_LEAF1_BUF,
-       XFS_BLFT_DIR_LEAFN_BUF,
-       XFS_BLFT_DA_NODE_BUF,
-       XFS_BLFT_ATTR_LEAF_BUF,
-       XFS_BLFT_ATTR_RMT_BUF,
-       XFS_BLFT_SB_BUF,
-       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
-};
-
-static inline void
-xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
-{
-       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
-       blf->blf_flags &= ~XFS_BLFT_MASK;
-       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
-}
-
-static inline __uint16_t
-xfs_blft_from_flags(struct xfs_buf_log_format *blf)
-{
-       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
-}
-
-/*
- * buf log item flags
- */
+/* buf log item flags */
  #define        XFS_BLI_HOLD            0x01
  #define        XFS_BLI_DIRTY           0x02
  #define        XFS_BLI_STALE           0x04
@@ -133,8 +41,6 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
         { XFS_BLI_ORDERED,      "ORDERED" }
  
  
-#ifdef __KERNEL__
-
  struct xfs_buf;
  struct xfs_mount;
  struct xfs_buf_log_item;
@@ -169,6 +75,6 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
                                enum xfs_blft);
  void   xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
  
-#endif /* __KERNEL__ */
+extern kmem_zone_t     *xfs_buf_item_zone;
  
  #endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c

index 0b8b2a13cd24debe493c8982679a2c565ebae5a1..d4e59a4ff59ff1600cf8c9a83ce4b36f47ddfcd0 100644 (file)
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -27,8 +27,8 @@
  #include "xfs_mount.h"
  #include "xfs_da_btree.h"
  #include "xfs_bmap_btree.h"
-#include "xfs_dir2.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
@@ -399,7 +399,7 @@ xfs_da3_split(
         struct xfs_da_intnode   *node;
         struct xfs_buf          *bp;
         int                     max;
-       int                     action;
+       int                     action = 0;
         int                     error;
         int                     i;
  
@@ -2454,9 +2454,9 @@ static int
  xfs_buf_map_from_irec(
         struct xfs_mount        *mp,
         struct xfs_buf_map      **mapp,
-       unsigned int            *nmaps,
+       int                     *nmaps,
         struct xfs_bmbt_irec    *irecs,
-       unsigned int            nirecs)
+       int                     nirecs)
  {
         struct xfs_buf_map      *map;
         int                     i;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h

index 6fb3371c63cf3db535ea84cd6d62c31445b8c4f2..b1f267995dea32e97dc041c7dd14af77e1630dc8 100644 (file)
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -133,12 +133,19 @@ extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
                                      struct xfs_da3_icnode_hdr *from);
  
  static inline int
-xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+__xfs_da3_node_hdr_size(bool v3)
  {
-       if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC))
+       if (v3)
                 return sizeof(struct xfs_da3_node_hdr);
         return sizeof(struct xfs_da_node_hdr);
  }
+static inline int
+xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+{
+       bool    v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
+
+       return __xfs_da3_node_hdr_size(v3);
+}
  
  static inline struct xfs_da_node_entry *
  xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
@@ -176,6 +183,7 @@ enum xfs_dacmp {
  typedef struct xfs_da_args {
         const __uint8_t *name;          /* string (maybe not NULL terminated) */
         int             namelen;        /* length of string (maybe no NULL) */
+       __uint8_t       filetype;       /* filetype of inode for directories */
         __uint8_t       *value;         /* set of bytes (maybe contain NULLs) */
         int             valuelen;       /* length of value */
         int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c

deleted file mode 100644 (file)

index e36445c..0000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_itable.h"
-#include "xfs_dfrag.h"
-#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-static int xfs_swap_extents(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip,   /* tmp inode */
-       xfs_swapext_t   *sxp);
-
-/*
- * ioctl interface for swapext
- */
-int
-xfs_swapext(
-       xfs_swapext_t   *sxp)
-{
-       xfs_inode_t     *ip, *tip;
-       struct fd       f, tmp;
-       int             error = 0;
-
-       /* Pull information for the target fd */
-       f = fdget((int)sxp->sx_fdtarget);
-       if (!f.file) {
-               error = XFS_ERROR(EINVAL);
-               goto out;
-       }
-
-       if (!(f.file->f_mode & FMODE_WRITE) ||
-           !(f.file->f_mode & FMODE_READ) ||
-           (f.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
-               goto out_put_file;
-       }
-
-       tmp = fdget((int)sxp->sx_fdtmp);
-       if (!tmp.file) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_file;
-       }
-
-       if (!(tmp.file->f_mode & FMODE_WRITE) ||
-           !(tmp.file->f_mode & FMODE_READ) ||
-           (tmp.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
-               goto out_put_tmp_file;
-       }
-
-       if (IS_SWAPFILE(file_inode(f.file)) ||
-           IS_SWAPFILE(file_inode(tmp.file))) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       ip = XFS_I(file_inode(f.file));
-       tip = XFS_I(file_inode(tmp.file));
-
-       if (ip->i_mount != tip->i_mount) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       if (ip->i_ino == tip->i_ino) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               error = XFS_ERROR(EIO);
-               goto out_put_tmp_file;
-       }
-
-       error = xfs_swap_extents(ip, tip, sxp);
-
- out_put_tmp_file:
-       fdput(tmp);
- out_put_file:
-       fdput(f);
- out:
-       return error;
-}
-
-/*
- * We need to check that the format of the data fork in the temporary inode is
- * valid for the target inode before doing the swap. This is not a problem with
- * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
- * data fork depending on the space the attribute fork is taking so we can get
- * invalid formats on the target inode.
- *
- * E.g. target has space for 7 extents in extent format, temp inode only has
- * space for 6.  If we defragment down to 7 extents, then the tmp format is a
- * btree, but when swapped it needs to be in extent format. Hence we can't just
- * blindly swap data forks on attr2 filesystems.
- *
- * Note that we check the swap in both directions so that we don't end up with
- * a corrupt temporary inode, either.
- *
- * Note that fixing the way xfs_fsr sets up the attribute fork in the source
- * inode will prevent this situation from occurring, so all we do here is
- * reject and log the attempt. basically we are putting the responsibility on
- * userspace to get this right.
- */
-static int
-xfs_swap_extents_check_format(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip)   /* tmp inode */
-{
-
-       /* Should never get a local format */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-           tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-               return EINVAL;
-
-       /*
-        * if the target inode has less extents that then temporary inode then
-        * why did userspace call us?
-        */
-       if (ip->i_d.di_nextents < tip->i_d.di_nextents)
-               return EINVAL;
-
-       /*
-        * if the target inode is in extent form and the temp inode is in btree
-        * form then we will end up with the target inode in the wrong format
-        * as we already know there are less extents in the temp inode.
-        */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-               return EINVAL;
-
-       /* Check temp in extent form to max in target */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
-                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-               return EINVAL;
-
-       /* Check target in extent form to max in temp */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
-                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-               return EINVAL;
-
-       /*
-        * If we are in a btree format, check that the temp root block will fit
-        * in the target and that it has enough extents to be in btree format
-        * in the target.
-        *
-        * Note that we have to be careful to allow btree->extent conversions
-        * (a common defrag case) which will occur when the temp inode is in
-        * extent format...
-        */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-               if (XFS_IFORK_BOFF(ip) &&
-                   XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
-                       return EINVAL;
-               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
-                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-                       return EINVAL;
-       }
-
-       /* Reciprocal target->temp btree format checks */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-               if (XFS_IFORK_BOFF(tip) &&
-                   XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
-                       return EINVAL;
-               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
-                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-                       return EINVAL;
-       }
-
-       return 0;
-}
-
-static int
-xfs_swap_extents(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip,   /* tmp inode */
-       xfs_swapext_t   *sxp)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_trans_t     *tp;
-       xfs_bstat_t     *sbp = &sxp->sx_stat;
-       xfs_ifork_t     *tempifp, *ifp, *tifp;
-       int             src_log_flags, target_log_flags;
-       int             error = 0;
-       int             aforkblks = 0;
-       int             taforkblks = 0;
-       __uint64_t      tmp;
-
-       /*
-        * We have no way of updating owner information in the BMBT blocks for
-        * each inode on CRC enabled filesystems, so to avoid corrupting the
-        * this metadata we simply don't allow extent swaps to occur.
-        */
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return XFS_ERROR(EINVAL);
-
-       tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-       if (!tempifp) {
-               error = XFS_ERROR(ENOMEM);
-               goto out;
-       }
-
-       /*
-        * we have to do two separate lock calls here to keep lockdep
-        * happy. If we try to get all the locks in one call, lock will
-        * report false positives when we drop the ILOCK and regain them
-        * below.
-        */
-       xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
-       /* Verify that both files have the same format */
-       if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       /* Verify both files are either real-time or non-realtime */
-       if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
-       if (error)
-               goto out_unlock;
-       truncate_pagecache_range(VFS_I(tip), 0, -1);
-
-       /* Verify O_DIRECT for ftmp */
-       if (VN_CACHED(VFS_I(tip)) != 0) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       /* Verify all data are being swapped */
-       if (sxp->sx_offset != 0 ||
-           sxp->sx_length != ip->i_d.di_size ||
-           sxp->sx_length != tip->i_d.di_size) {
-               error = XFS_ERROR(EFAULT);
-               goto out_unlock;
-       }
-
-       trace_xfs_swap_extent_before(ip, 0);
-       trace_xfs_swap_extent_before(tip, 1);
-
-       /* check inode formats now that data is flushed */
-       error = xfs_swap_extents_check_format(ip, tip);
-       if (error) {
-               xfs_notice(mp,
-                   "%s: inode 0x%llx format is incompatible for exchanging.",
-                               __func__, ip->i_ino);
-               goto out_unlock;
-       }
-
-       /*
-        * Compare the current change & modify times with that
-        * passed in.  If they differ, we abort this swap.
-        * This is the mechanism used to ensure the calling
-        * process that the file was not changed out from
-        * under it.
-        */
-       if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
-           (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
-           (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-           (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       /* We need to fail if the file is memory mapped.  Once we have tossed
-        * all existing pages, the page fault will have no option
-        * but to go to the filesystem for pages. By making the page fault call
-        * vop_read (or write in the case of autogrow) they block on the iolock
-        * until we have switched the extents.
-        */
-       if (VN_MAPPED(VFS_I(ip))) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
-       /*
-        * There is a race condition here since we gave up the
-        * ilock.  However, the data fork will not change since
-        * we have the iolock (locked for truncation too) so we
-        * are safe.  We don't really care if non-io related
-        * fields change.
-        */
-       truncate_pagecache_range(VFS_I(ip), 0, -1);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-       if ((error = xfs_trans_reserve(tp, 0,
-                                    XFS_ICHANGE_LOG_RES(mp), 0,
-                                    0, 0))) {
-               xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
-               xfs_iunlock(tip, XFS_IOLOCK_EXCL);
-               xfs_trans_cancel(tp, 0);
-               goto out;
-       }
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
-       /*
-        * Count the number of extended attribute blocks
-        */
-       if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
-            (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-               error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
-               if (error)
-                       goto out_trans_cancel;
-       }
-       if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
-            (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-               error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
-                       &taforkblks);
-               if (error)
-                       goto out_trans_cancel;
-       }
-
-       /*
-        * Swap the data forks of the inodes
-        */
-       ifp = &ip->i_df;
-       tifp = &tip->i_df;
-       *tempifp = *ifp;        /* struct copy */
-       *ifp = *tifp;           /* struct copy */
-       *tifp = *tempifp;       /* struct copy */
-
-       /*
-        * Fix the on-disk inode values
-        */
-       tmp = (__uint64_t)ip->i_d.di_nblocks;
-       ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
-       tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
-       tmp = (__uint64_t) ip->i_d.di_nextents;
-       ip->i_d.di_nextents = tip->i_d.di_nextents;
-       tip->i_d.di_nextents = tmp;
-
-       tmp = (__uint64_t) ip->i_d.di_format;
-       ip->i_d.di_format = tip->i_d.di_format;
-       tip->i_d.di_format = tmp;
-
-       /*
-        * The extents in the source inode could still contain speculative
-        * preallocation beyond EOF (e.g. the file is open but not modified
-        * while defrag is in progress). In that case, we need to copy over the
-        * number of delalloc blocks the data fork in the source inode is
-        * tracking beyond EOF so that when the fork is truncated away when the
-        * temporary inode is unlinked we don't underrun the i_delayed_blks
-        * counter on that inode.
-        */
-       ASSERT(tip->i_delayed_blks == 0);
-       tip->i_delayed_blks = ip->i_delayed_blks;
-       ip->i_delayed_blks = 0;
-
-       src_log_flags = XFS_ILOG_CORE;
-       switch (ip->i_d.di_format) {
-       case XFS_DINODE_FMT_EXTENTS:
-               /* If the extents fit in the inode, fix the
-                * pointer.  Otherwise it's already NULL or
-                * pointing to the extent.
-                */
-               if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-                       ifp->if_u1.if_extents =
-                               ifp->if_u2.if_inline_ext;
-               }
-               src_log_flags |= XFS_ILOG_DEXT;
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               src_log_flags |= XFS_ILOG_DBROOT;
-               break;
-       }
-
-       target_log_flags = XFS_ILOG_CORE;
-       switch (tip->i_d.di_format) {
-       case XFS_DINODE_FMT_EXTENTS:
-               /* If the extents fit in the inode, fix the
-                * pointer.  Otherwise it's already NULL or
-                * pointing to the extent.
-                */
-               if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-                       tifp->if_u1.if_extents =
-                               tifp->if_u2.if_inline_ext;
-               }
-               target_log_flags |= XFS_ILOG_DEXT;
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               target_log_flags |= XFS_ILOG_DBROOT;
-               break;
-       }
-
-
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-       xfs_trans_log_inode(tp, ip,  src_log_flags);
-       xfs_trans_log_inode(tp, tip, target_log_flags);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * transaction goes to disk before returning to the user.
-        */
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(tp);
-
-       error = xfs_trans_commit(tp, 0);
-
-       trace_xfs_swap_extent_after(ip, 0);
-       trace_xfs_swap_extent_after(tip, 1);
-out:
-       kmem_free(tempifp);
-       return error;
-
-out_unlock:
-       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       goto out;
-
-out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
-       goto out_unlock;
-}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h

deleted file mode 100644 (file)

index 20bdd93..0000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DFRAG_H__
-#define        __XFS_DFRAG_H__
-
-/*
- * Structure passed to xfs_swapext
- */
-
-typedef struct xfs_swapext
-{
-       __int64_t       sx_version;     /* version */
-       __int64_t       sx_fdtarget;    /* fd of target file */
-       __int64_t       sx_fdtmp;       /* fd of tmp file */
-       xfs_off_t       sx_offset;      /* offset into file */
-       xfs_off_t       sx_length;      /* leng from offset */
-       char            sx_pad[16];     /* pad space, unused */
-       xfs_bstat_t     sx_stat;        /* stat of target b4 copy */
-} xfs_swapext_t;
-
-/*
- * Version flag
- */
-#define XFS_SX_VERSION         0
-
-#ifdef __KERNEL__
-/*
- * Prototypes for visible xfs_dfrag.c routines.
- */
-
-/*
- * Syscall interface for xfs_swapext
- */
-int    xfs_swapext(struct xfs_swapext *sx);
-
-#endif /* __KERNEL__ */
-
-#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c

index 8f023dee404da0da9c2092ba15d5ad904588d5e4..edf203ab50afa734a74faaace8e626721e7fc1bb 100644 (file)
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -31,14 +31,14 @@
  #include "xfs_inode.h"
  #include "xfs_inode_item.h"
  #include "xfs_bmap.h"
-#include "xfs_dir2.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_error.h"
-#include "xfs_vnodeops.h"
  #include "xfs_trace.h"
  
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+
  
  /*
   * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -90,6 +90,9 @@ void
  xfs_dir_mount(
         xfs_mount_t     *mp)
  {
+       int     nodehdr_size;
+
+
         ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
         ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
                XFS_MAX_BLOCKSIZE);
@@ -98,12 +101,13 @@ xfs_dir_mount(
         mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
         mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
         mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
-       mp->m_attr_node_ents =
-               (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
-               (uint)sizeof(xfs_da_node_entry_t);
-       mp->m_dir_node_ents =
-               (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
-               (uint)sizeof(xfs_da_node_entry_t);
+
+       nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+       mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+       mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+
         mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
         if (xfs_sb_version_hasasciici(&mp->m_sb))
                 mp->m_dirnameops = &xfs_ascii_ci_nameops;
@@ -209,6 +213,7 @@ xfs_dir_createname(
         memset(&args, 0, sizeof(xfs_da_args_t));
         args.name = name->name;
         args.namelen = name->len;
+       args.filetype = name->type;
         args.hashval = dp->i_mount->m_dirnameops->hashname(name);
         args.inumber = inum;
         args.dp = dp;
@@ -283,6 +288,7 @@ xfs_dir_lookup(
         memset(&args, 0, sizeof(xfs_da_args_t));
         args.name = name->name;
         args.namelen = name->len;
+       args.filetype = name->type;
         args.hashval = dp->i_mount->m_dirnameops->hashname(name);
         args.dp = dp;
         args.whichfork = XFS_DATA_FORK;
@@ -338,6 +344,7 @@ xfs_dir_removename(
         memset(&args, 0, sizeof(xfs_da_args_t));
         args.name = name->name;
         args.namelen = name->len;
+       args.filetype = name->type;
         args.hashval = dp->i_mount->m_dirnameops->hashname(name);
         args.inumber = ino;
         args.dp = dp;
@@ -362,37 +369,6 @@ xfs_dir_removename(
         return rval;
  }
  
-/*
- * Read a directory.
- */
-int
-xfs_readdir(
-       xfs_inode_t     *dp,
-       struct dir_context *ctx,
-       size_t          bufsize)
-{
-       int             rval;           /* return value */
-       int             v;              /* type-checking value */
-
-       trace_xfs_readdir(dp);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       XFS_STATS_INC(xs_dir_getdents);
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-               rval = xfs_dir2_sf_getdents(dp, ctx);
-       else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
-               ;
-       else if (v)
-               rval = xfs_dir2_block_getdents(dp, ctx);
-       else
-               rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
-       return rval;
-}
-
  /*
   * Replace the inode number of a directory entry.
   */
@@ -418,6 +394,7 @@ xfs_dir_replace(
         memset(&args, 0, sizeof(xfs_da_args_t));
         args.name = name->name;
         args.namelen = name->len;
+       args.filetype = name->type;
         args.hashval = dp->i_mount->m_dirnameops->hashname(name);
         args.inumber = inum;
         args.dp = dp;
@@ -465,6 +442,7 @@ xfs_dir_canenter(
         memset(&args, 0, sizeof(xfs_da_args_t));
         args.name = name->name;
         args.namelen = name->len;
+       args.filetype = name->type;
         args.hashval = dp->i_mount->m_dirnameops->hashname(name);
         args.dp = dp;
         args.whichfork = XFS_DATA_FORK;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h

index e937d9991c1850c5427ecae7419011c8395d09c3..9910401327d45c788586b2e0953309b81c8c7c6d 100644 (file)
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -23,6 +23,11 @@ struct xfs_da_args;
  struct xfs_inode;
  struct xfs_mount;
  struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
  
  extern struct xfs_name xfs_name_dotdot;
  
@@ -57,4 +62,45 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
   */
  extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
  
+/*
+ * Interface routines used by userspace utilities
+ */
+extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
+extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
+               xfs_ino_t ino);
+extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
+               struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
+extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
+               struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
+               xfs_ino_t ino);
+
+extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+                               struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+               struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
+               struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
+               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+               int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
+               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+               struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
  #endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c

index 5e7fbd72cf5255c53a8494a991986c339ea5293a..0957aa98b6c0d6bb1fdc3fa23161d761aeb58567 100644 (file)
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -31,8 +31,8 @@
  #include "xfs_inode_item.h"
  #include "xfs_bmap.h"
  #include "xfs_buf_item.h"
-#include "xfs_dir2.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_error.h"
  #include "xfs_trace.h"
@@ -126,7 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
         .verify_write = xfs_dir3_block_write_verify,
  };
  
-static int
+int
  xfs_dir3_block_read(
         struct xfs_trans        *tp,
         struct xfs_inode        *dp,
@@ -369,7 +369,7 @@ xfs_dir2_block_addname(
         if (error)
                 return error;
  
-       len = xfs_dir2_data_entsize(args->namelen);
+       len = xfs_dir3_data_entsize(mp, args->namelen);
  
         /*
          * Set up pointers to parts of the block.
@@ -549,7 +549,8 @@ xfs_dir2_block_addname(
         dep->inumber = cpu_to_be64(args->inumber);
         dep->namelen = args->namelen;
         memcpy(dep->name, args->name, args->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
         *tagp = cpu_to_be16((char *)dep - (char *)hdr);
         /*
          * Clean up the bestfree array and log the header, tail, and entry.
@@ -564,101 +565,6 @@ xfs_dir2_block_addname(
         return 0;
  }
  
-/*
- * Readdir for block directories.
- */
-int                                            /* error */
-xfs_dir2_block_getdents(
-       xfs_inode_t             *dp,            /* incore inode */
-       struct dir_context      *ctx)
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       struct xfs_buf          *bp;            /* buffer for block */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
-       char                    *endptr;        /* end of the data entries */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       char                    *ptr;           /* current data entry */
-       int                     wantoff;        /* starting block offset */
-       xfs_off_t               cook;
-
-       mp = dp->i_mount;
-       /*
-        * If the block number in the offset is out of range, we're done.
-        */
-       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
-               return 0;
-
-       error = xfs_dir3_block_read(NULL, dp, &bp);
-       if (error)
-               return error;
-
-       /*
-        * Extract the byte offset we start at from the seek pointer.
-        * We'll skip entries before this.
-        */
-       wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
-       hdr = bp->b_addr;
-       xfs_dir3_data_check(dp, bp);
-       /*
-        * Set up values for the loop.
-        */
-       btp = xfs_dir2_block_tail_p(mp, hdr);
-       ptr = (char *)xfs_dir3_data_entry_p(hdr);
-       endptr = (char *)xfs_dir2_block_leaf_p(btp);
-
-       /*
-        * Loop over the data portion of the block.
-        * Each object is a real entry (dep) or an unused one (dup).
-        */
-       while (ptr < endptr) {
-               dup = (xfs_dir2_data_unused_t *)ptr;
-               /*
-                * Unused, skip it.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       ptr += be16_to_cpu(dup->length);
-                       continue;
-               }
-
-               dep = (xfs_dir2_data_entry_t *)ptr;
-
-               /*
-                * Bump pointer for the next iteration.
-                */
-               ptr += xfs_dir2_data_entsize(dep->namelen);
-               /*
-                * The entry is before the desired starting point, skip it.
-                */
-               if ((char *)dep - (char *)hdr < wantoff)
-                       continue;
-
-               cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                           (char *)dep - (char *)hdr);
-
-               ctx->pos = cook & 0x7fffffff;
-               /*
-                * If it didn't fit, set the final offset to here & return.
-                */
-               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
-                           be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
-                       xfs_trans_brelse(NULL, bp);
-                       return 0;
-               }
-       }
-
-       /*
-        * Reached the end of the block.
-        * Set the offset to a non-existent block 1 and return.
-        */
-       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
-                       0x7fffffff;
-       xfs_trans_brelse(NULL, bp);
-       return 0;
-}
-
  /*
   * Log leaf entries from the block.
   */
@@ -736,6 +642,7 @@ xfs_dir2_block_lookup(
          * Fill in inode number, CI name if appropriate, release the block.
          */
         args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
         error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
         xfs_trans_brelse(args->trans, bp);
         return XFS_ERROR(error);
@@ -894,7 +801,7 @@ xfs_dir2_block_removename(
         needlog = needscan = 0;
         xfs_dir2_data_make_free(tp, bp,
                 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
         /*
          * Fix up the block tail.
          */
@@ -968,6 +875,7 @@ xfs_dir2_block_replace(
          * Change the inode number to the new value.
          */
         dep->inumber = cpu_to_be64(args->inumber);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
         xfs_dir2_data_log_entry(args->trans, bp, dep);
         xfs_dir3_data_check(dp, bp);
         return 0;
@@ -1254,7 +1162,8 @@ xfs_dir2_sf_to_block(
         dep->inumber = cpu_to_be64(dp->i_ino);
         dep->namelen = 1;
         dep->name[0] = '.';
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
         *tagp = cpu_to_be16((char *)dep - (char *)hdr);
         xfs_dir2_data_log_entry(tp, bp, dep);
         blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
@@ -1267,7 +1176,8 @@ xfs_dir2_sf_to_block(
         dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
         dep->namelen = 2;
         dep->name[0] = dep->name[1] = '.';
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
         *tagp = cpu_to_be16((char *)dep - (char *)hdr);
         xfs_dir2_data_log_entry(tp, bp, dep);
         blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
@@ -1312,10 +1222,12 @@ xfs_dir2_sf_to_block(
                  * Copy a real entry.
                  */
                 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
-               dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep));
+               dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
                 dep->namelen = sfep->namelen;
+               xfs_dir3_dirent_put_ftype(mp, dep,
+                                       xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
                 memcpy(dep->name, sfep->name, dep->namelen);
-               tagp = xfs_dir2_data_entry_tag_p(dep);
+               tagp = xfs_dir3_data_entry_tag_p(mp, dep);
                 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
                 xfs_dir2_data_log_entry(tp, bp, dep);
                 name.name = sfep->name;
@@ -1328,7 +1240,7 @@ xfs_dir2_sf_to_block(
                 if (++i == sfp->count)
                         sfep = NULL;
                 else
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
         }
         /* Done with the temporary buffer */
         kmem_free(sfp);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c

index c2930238005c6605c1e69e4dc455b9ed7267f3da..47e1326c169a08c71d8d21ac51e8d113444d3295 100644 (file)
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -29,14 +29,12 @@
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_error.h"
  #include "xfs_buf_item.h"
  #include "xfs_cksum.h"
  
-STATIC xfs_dir2_data_free_t *
-xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
-
  /*
   * Check the consistency of the data block.
   * The input can also be a block-format directory.
@@ -149,8 +147,10 @@ __xfs_dir3_data_check(
                 XFS_WANT_CORRUPTED_RETURN(
                         !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
                 XFS_WANT_CORRUPTED_RETURN(
-                       be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+                       be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
                                                (char *)dep - (char *)hdr);
+               XFS_WANT_CORRUPTED_RETURN(
+                       xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
                 count++;
                 lastfree = 0;
                 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -168,7 +168,7 @@ __xfs_dir3_data_check(
                         }
                         XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
                 }
-               p += xfs_dir2_data_entsize(dep->namelen);
+               p += xfs_dir3_data_entsize(mp, dep->namelen);
         }
         /*
          * Need to have seen all the entries and all the bestfree slots.
@@ -325,7 +325,7 @@ xfs_dir3_data_readahead(
   * Given a data block and an unused entry from that block,
   * return the bestfree entry if any that corresponds to it.
   */
-STATIC xfs_dir2_data_free_t *
+xfs_dir2_data_free_t *
  xfs_dir2_data_freefind(
         xfs_dir2_data_hdr_t     *hdr,           /* data block */
         xfs_dir2_data_unused_t  *dup)           /* data unused entry */
@@ -333,7 +333,7 @@ xfs_dir2_data_freefind(
         xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
         xfs_dir2_data_aoff_t    off;            /* offset value needed */
         struct xfs_dir2_data_free *bf;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
         int                     matched;        /* matched the value */
         int                     seenzero;       /* saw a 0 bestfree entry */
  #endif
@@ -341,7 +341,7 @@ xfs_dir2_data_freefind(
         off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
         bf = xfs_dir3_data_bestfree_p(hdr);
  
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
         /*
          * Validate some consistency in the bestfree table.
          * Check order, non-overlapping entries, and if we find the
@@ -538,8 +538,8 @@ xfs_dir2_data_freescan(
                 else {
                         dep = (xfs_dir2_data_entry_t *)p;
                         ASSERT((char *)dep - (char *)hdr ==
-                              be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)));
-                       p += xfs_dir2_data_entsize(dep->namelen);
+                              be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
+                       p += xfs_dir3_data_entsize(mp, dep->namelen);
                 }
         }
  }
@@ -629,7 +629,8 @@ xfs_dir2_data_log_entry(
         struct xfs_buf          *bp,
         xfs_dir2_data_entry_t   *dep)           /* data entry pointer */
  {
-       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+       struct xfs_mount        *mp = tp->t_mountp;
  
         ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
                hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -637,7 +638,7 @@ xfs_dir2_data_log_entry(
                hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
  
         xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
-               (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
+               (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
                        (char *)hdr - 1));
  }
  
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h

index 7826782b8d789461eef5a5443ae1d29944887815..a0961a61ac1a9a419fd537cd47dbb7337fbbc3a0 100644 (file)
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -68,6 +68,23 @@
  #define        XFS_DIR3_DATA_MAGIC     0x58444433      /* XDD3: multiblock dirs */
  #define        XFS_DIR3_FREE_MAGIC     0x58444633      /* XDF3: free index blocks */
  
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN            0
+#define XFS_DIR3_FT_REG_FILE           1
+#define XFS_DIR3_FT_DIR                        2
+#define XFS_DIR3_FT_CHRDEV             3
+#define XFS_DIR3_FT_BLKDEV             4
+#define XFS_DIR3_FT_FIFO               5
+#define XFS_DIR3_FT_SOCK               6
+#define XFS_DIR3_FT_SYMLINK            7
+#define XFS_DIR3_FT_WHT                        8
+
+#define XFS_DIR3_FT_MAX                        9
+
  /*
   * Byte offset in data block and shortform entry.
   */
@@ -138,6 +155,9 @@ typedef struct xfs_dir2_sf_entry {
         xfs_dir2_sf_off_t       offset;         /* saved offset */
         __u8                    name[];         /* name, variable size */
         /*
+        * A single byte containing the file type field follows the inode
+        * number for version 3 directory entries.
+        *
          * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
          * variable offset after the name.
          */
@@ -162,16 +182,6 @@ xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
         put_unaligned_be16(off, &sfep->offset.i);
  }
  
-static inline int
-xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len)
-{
-       return sizeof(struct xfs_dir2_sf_entry) +       /* namelen + offset */
-               len +                                   /* name */
-               (hdr->i8count ?                         /* ino */
-                sizeof(xfs_dir2_ino8_t) :
-                sizeof(xfs_dir2_ino4_t));
-}
-
  static inline struct xfs_dir2_sf_entry *
  xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
  {
@@ -179,14 +189,78 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
                 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
  }
  
+static inline int
+xfs_dir3_sf_entsize(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       int                     len)
+{
+       int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
+
+       count += len;                                   /* name */
+       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+                               sizeof(xfs_dir2_ino4_t); /* ino # */
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               count += sizeof(__uint8_t);             /* file type */
+       return count;
+}
+
  static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr,
-               struct xfs_dir2_sf_entry *sfep)
+xfs_dir3_sf_nextentry(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
  {
         return (struct xfs_dir2_sf_entry *)
-               ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+               ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
  }
  
+/*
+ * in dir3 shortform directories, the file type field is stored at a variable
+ * offset after the inode number. Because it's only a single byte, endian
+ * conversion is not necessary.
+ */
+static inline __uint8_t *
+xfs_dir3_sfe_ftypep(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return (__uint8_t *)&sfep->name[sfep->namelen];
+}
+
+static inline __uint8_t
+xfs_dir3_sfe_get_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       __uint8_t       *ftp;
+
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return XFS_DIR3_FT_UNKNOWN;
+
+       ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+       if (*ftp >= XFS_DIR3_FT_MAX)
+               return XFS_DIR3_FT_UNKNOWN;
+       return *ftp;
+}
+
+static inline void
+xfs_dir3_sfe_put_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep,
+       __uint8_t               ftype)
+{
+       __uint8_t       *ftp;
+
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return;
+       ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+       *ftp = ftype;
+}
  
  /*
   * Data block structures.
@@ -286,12 +360,18 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
   * Active entry in a data block.
   *
   * Aligned to 8 bytes.  After the variable length name field there is a
- * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p.
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is file type field between the name and the tag.
+ * This can only be manipulated by helper functions. It is packed hard against
+ * the end of the name so any padding for rounding is between the file type and
+ * the tag.
   */
  typedef struct xfs_dir2_data_entry {
         __be64                  inumber;        /* inode number */
         __u8                    namelen;        /* name length */
         __u8                    name[];         /* name bytes, no null */
+     /* __u8                   filetype; */    /* type of inode we point to */
       /*        __be16                  tag; */         /* starting offset of us */
  } xfs_dir2_data_entry_t;
  
@@ -311,20 +391,67 @@ typedef struct xfs_dir2_data_unused {
  /*
   * Size of a data entry.
   */
-static inline int xfs_dir2_data_entsize(int n)
+static inline int
+__xfs_dir3_data_entsize(
+       bool    ftype,
+       int     n)
  {
-       return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n +
-                (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN);
+       int     size = offsetof(struct xfs_dir2_data_entry, name[0]);
+
+       size += n;
+       size += sizeof(xfs_dir2_data_off_t);
+       if (ftype)
+               size += sizeof(__uint8_t);
+       return roundup(size, XFS_DIR2_DATA_ALIGN);
+}
+static inline int
+xfs_dir3_data_entsize(
+       struct xfs_mount        *mp,
+       int                     n)
+{
+       bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
+       return __xfs_dir3_data_entsize(ftype, n);
+}
+
+static inline __uint8_t
+xfs_dir3_dirent_get_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep)
+{
+       if (xfs_sb_version_hasftype(&mp->m_sb)) {
+               __uint8_t       type = dep->name[dep->namelen];
+
+               ASSERT(type < XFS_DIR3_FT_MAX);
+               if (type < XFS_DIR3_FT_MAX)
+                       return type;
+
+       }
+       return XFS_DIR3_FT_UNKNOWN;
+}
+
+static inline void
+xfs_dir3_dirent_put_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep,
+       __uint8_t               type)
+{
+       ASSERT(type < XFS_DIR3_FT_MAX);
+       ASSERT(dep->namelen != 0);
+
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               dep->name[dep->namelen] = type;
  }
  
  /*
   * Pointer to an entry's tag word.
   */
  static inline __be16 *
-xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep)
+xfs_dir3_data_entry_tag_p(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep)
  {
         return (__be16 *)((char *)dep +
-               xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+               xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
  }
  
  /*
@@ -375,13 +502,17 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
   * data block header because the sfe embeds the block offset of the entry into
   * it so that it doesn't change when format conversion occurs. Bad Things Happen
   * if we don't follow this rule.
+ *
+ * XXX: there is scope for significant optimisation of the logic here. Right
+ * now we are checking for "dir3 format" over and over again. Ideally we should
+ * only do it once for each operation.
   */
  #define        XFS_DIR3_DATA_DOT_OFFSET(mp)    \
         xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
  #define        XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
-       (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1))
+       (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
  #define        XFS_DIR3_DATA_FIRST_OFFSET(mp)          \
-       (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2))
+       (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
  
  static inline xfs_dir2_data_aoff_t
  xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
@@ -392,13 +523,19 @@ xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
  static inline xfs_dir2_data_aoff_t
  xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
  {
-       return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1);
+       bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+       return xfs_dir3_data_dot_offset(hdr) +
+               __xfs_dir3_data_entsize(dir3, 1);
  }
  
  static inline xfs_dir2_data_aoff_t
  xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
  {
-       return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2);
+       bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+       return xfs_dir3_data_dotdot_offset(hdr) +
+               __xfs_dir3_data_entsize(dir3, 2);
  }
  
  /*
@@ -519,6 +656,9 @@ struct xfs_dir3_leaf {
  
  #define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
  
+extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
+                                       struct xfs_dir2_leaf *from);
+
  static inline int
  xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
  {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c

index 2aed25cae04d9f265df00aba8e5d63a624350a0f..08984eeee159c5d790bdce648caf4fe8cb4ea676 100644 (file)
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -31,6 +31,7 @@
  #include "xfs_inode.h"
  #include "xfs_bmap.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_error.h"
  #include "xfs_trace.h"
@@ -695,7 +696,7 @@ xfs_dir2_leaf_addname(
         ents = xfs_dir3_leaf_ents_p(leaf);
         xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
         bestsp = xfs_dir2_leaf_bests_p(ltp);
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
  
         /*
          * See if there are any entries with the same hash value
@@ -896,7 +897,8 @@ xfs_dir2_leaf_addname(
         dep->inumber = cpu_to_be64(args->inumber);
         dep->namelen = args->namelen;
         memcpy(dep->name, args->name, dep->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
         *tagp = cpu_to_be16((char *)dep - (char *)hdr);
         /*
          * Need to scan fix up the bestfree table.
@@ -1083,396 +1085,6 @@ xfs_dir3_leaf_compact_x1(
         *highstalep = highstale;
  }
  
-struct xfs_dir2_leaf_map_info {
-       xfs_extlen_t    map_blocks;     /* number of fsbs in map */
-       xfs_dablk_t     map_off;        /* last mapped file offset */
-       int             map_size;       /* total entries in *map */
-       int             map_valid;      /* valid entries in *map */
-       int             nmap;           /* mappings to ask xfs_bmapi */
-       xfs_dir2_db_t   curdb;          /* db for current block */
-       int             ra_current;     /* number of read-ahead blks */
-       int             ra_index;       /* *map index for read-ahead */
-       int             ra_offset;      /* map entry offset for ra */
-       int             ra_want;        /* readahead count wanted */
-       struct xfs_bmbt_irec map[];     /* map vector for blocks */
-};
-
-STATIC int
-xfs_dir2_leaf_readbuf(
-       struct xfs_inode        *dp,
-       size_t                  bufsize,
-       struct xfs_dir2_leaf_map_info *mip,
-       xfs_dir2_off_t          *curoff,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp = *bpp;
-       struct xfs_bmbt_irec    *map = mip->map;
-       struct blk_plug         plug;
-       int                     error = 0;
-       int                     length;
-       int                     i;
-       int                     j;
-
-       /*
-        * If we have a buffer, we need to release it and
-        * take it out of the mapping.
-        */
-
-       if (bp) {
-               xfs_trans_brelse(NULL, bp);
-               bp = NULL;
-               mip->map_blocks -= mp->m_dirblkfsbs;
-               /*
-                * Loop to get rid of the extents for the
-                * directory block.
-                */
-               for (i = mp->m_dirblkfsbs; i > 0; ) {
-                       j = min_t(int, map->br_blockcount, i);
-                       map->br_blockcount -= j;
-                       map->br_startblock += j;
-                       map->br_startoff += j;
-                       /*
-                        * If mapping is done, pitch it from
-                        * the table.
-                        */
-                       if (!map->br_blockcount && --mip->map_valid)
-                               memmove(&map[0], &map[1],
-                                       sizeof(map[0]) * mip->map_valid);
-                       i -= j;
-               }
-       }
-
-       /*
-        * Recalculate the readahead blocks wanted.
-        */
-       mip->ra_want = howmany(bufsize + mp->m_dirblksize,
-                              mp->m_sb.sb_blocksize) - 1;
-       ASSERT(mip->ra_want >= 0);
-
-       /*
-        * If we don't have as many as we want, and we haven't
-        * run out of data blocks, get some more mappings.
-        */
-       if (1 + mip->ra_want > mip->map_blocks &&
-           mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
-               /*
-                * Get more bmaps, fill in after the ones
-                * we already have in the table.
-                */
-               mip->nmap = mip->map_size - mip->map_valid;
-               error = xfs_bmapi_read(dp, mip->map_off,
-                               xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
-                                                               mip->map_off,
-                               &map[mip->map_valid], &mip->nmap, 0);
-
-               /*
-                * Don't know if we should ignore this or try to return an
-                * error.  The trouble with returning errors is that readdir
-                * will just stop without actually passing the error through.
-                */
-               if (error)
-                       goto out;       /* XXX */
-
-               /*
-                * If we got all the mappings we asked for, set the final map
-                * offset based on the last bmap value received.  Otherwise,
-                * we've reached the end.
-                */
-               if (mip->nmap == mip->map_size - mip->map_valid) {
-                       i = mip->map_valid + mip->nmap - 1;
-                       mip->map_off = map[i].br_startoff + map[i].br_blockcount;
-               } else
-                       mip->map_off = xfs_dir2_byte_to_da(mp,
-                                                       XFS_DIR2_LEAF_OFFSET);
-
-               /*
-                * Look for holes in the mapping, and eliminate them.  Count up
-                * the valid blocks.
-                */
-               for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
-                       if (map[i].br_startblock == HOLESTARTBLOCK) {
-                               mip->nmap--;
-                               length = mip->map_valid + mip->nmap - i;
-                               if (length)
-                                       memmove(&map[i], &map[i + 1],
-                                               sizeof(map[i]) * length);
-                       } else {
-                               mip->map_blocks += map[i].br_blockcount;
-                               i++;
-                       }
-               }
-               mip->map_valid += mip->nmap;
-       }
-
-       /*
-        * No valid mappings, so no more data blocks.
-        */
-       if (!mip->map_valid) {
-               *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
-               goto out;
-       }
-
-       /*
-        * Read the directory block starting at the first mapping.
-        */
-       mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
-       error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
-                       map->br_blockcount >= mp->m_dirblkfsbs ?
-                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
-
-       /*
-        * Should just skip over the data block instead of giving up.
-        */
-       if (error)
-               goto out;       /* XXX */
-
-       /*
-        * Adjust the current amount of read-ahead: we just read a block that
-        * was previously ra.
-        */
-       if (mip->ra_current)
-               mip->ra_current -= mp->m_dirblkfsbs;
-
-       /*
-        * Do we need more readahead?
-        */
-       blk_start_plug(&plug);
-       for (mip->ra_index = mip->ra_offset = i = 0;
-            mip->ra_want > mip->ra_current && i < mip->map_blocks;
-            i += mp->m_dirblkfsbs) {
-               ASSERT(mip->ra_index < mip->map_valid);
-               /*
-                * Read-ahead a contiguous directory block.
-                */
-               if (i > mip->ra_current &&
-                   map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
-                       xfs_dir3_data_readahead(NULL, dp,
-                               map[mip->ra_index].br_startoff + mip->ra_offset,
-                               XFS_FSB_TO_DADDR(mp,
-                                       map[mip->ra_index].br_startblock +
-                                                       mip->ra_offset));
-                       mip->ra_current = i;
-               }
-
-               /*
-                * Read-ahead a non-contiguous directory block.  This doesn't
-                * use our mapping, but this is a very rare case.
-                */
-               else if (i > mip->ra_current) {
-                       xfs_dir3_data_readahead(NULL, dp,
-                                       map[mip->ra_index].br_startoff +
-                                                       mip->ra_offset, -1);
-                       mip->ra_current = i;
-               }
-
-               /*
-                * Advance offset through the mapping table.
-                */
-               for (j = 0; j < mp->m_dirblkfsbs; j++) {
-                       /*
-                        * The rest of this extent but not more than a dir
-                        * block.
-                        */
-                       length = min_t(int, mp->m_dirblkfsbs,
-                                       map[mip->ra_index].br_blockcount -
-                                                       mip->ra_offset);
-                       j += length;
-                       mip->ra_offset += length;
-
-                       /*
-                        * Advance to the next mapping if this one is used up.
-                        */
-                       if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
-                               mip->ra_offset = 0;
-                               mip->ra_index++;
-                       }
-               }
-       }
-       blk_finish_plug(&plug);
-
-out:
-       *bpp = bp;
-       return error;
-}
-
-/*
- * Getdents (readdir) for leaf and node directories.
- * This reads the data blocks only, so is the same for both forms.
- */
-int                                            /* error */
-xfs_dir2_leaf_getdents(
-       xfs_inode_t             *dp,            /* incore directory inode */
-       struct dir_context      *ctx,
-       size_t                  bufsize)
-{
-       struct xfs_buf          *bp = NULL;     /* data block buffer */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_data_entry_t   *dep;           /* data entry */
-       xfs_dir2_data_unused_t  *dup;           /* unused entry */
-       int                     error = 0;      /* error return value */
-       int                     length;         /* temporary length value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     byteoff;        /* offset in current block */
-       xfs_dir2_off_t          curoff;         /* current overall offset */
-       xfs_dir2_off_t          newoff;         /* new curoff after new blk */
-       char                    *ptr = NULL;    /* pointer to current data */
-       struct xfs_dir2_leaf_map_info *map_info;
-
-       /*
-        * If the offset is at or past the largest allowed value,
-        * give up right away.
-        */
-       if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
-               return 0;
-
-       mp = dp->i_mount;
-
-       /*
-        * Set up to bmap a number of blocks based on the caller's
-        * buffer size, the directory block size, and the filesystem
-        * block size.
-        */
-       length = howmany(bufsize + mp->m_dirblksize,
-                                    mp->m_sb.sb_blocksize);
-       map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
-                               (length * sizeof(struct xfs_bmbt_irec)),
-                              KM_SLEEP | KM_NOFS);
-       map_info->map_size = length;
-
-       /*
-        * Inside the loop we keep the main offset value as a byte offset
-        * in the directory file.
-        */
-       curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
-
-       /*
-        * Force this conversion through db so we truncate the offset
-        * down to get the start of the data block.
-        */
-       map_info->map_off = xfs_dir2_db_to_da(mp,
-                                             xfs_dir2_byte_to_db(mp, curoff));
-
-       /*
-        * Loop over directory entries until we reach the end offset.
-        * Get more blocks and readahead as necessary.
-        */
-       while (curoff < XFS_DIR2_LEAF_OFFSET) {
-               /*
-                * If we have no buffer, or we're off the end of the
-                * current buffer, need to get another one.
-                */
-               if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
-
-                       error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
-                                                     &curoff, &bp);
-                       if (error || !map_info->map_valid)
-                               break;
-
-                       /*
-                        * Having done a read, we need to set a new offset.
-                        */
-                       newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
-                       /*
-                        * Start of the current block.
-                        */
-                       if (curoff < newoff)
-                               curoff = newoff;
-                       /*
-                        * Make sure we're in the right block.
-                        */
-                       else if (curoff > newoff)
-                               ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
-                                      map_info->curdb);
-                       hdr = bp->b_addr;
-                       xfs_dir3_data_check(dp, bp);
-                       /*
-                        * Find our position in the block.
-                        */
-                       ptr = (char *)xfs_dir3_data_entry_p(hdr);
-                       byteoff = xfs_dir2_byte_to_off(mp, curoff);
-                       /*
-                        * Skip past the header.
-                        */
-                       if (byteoff == 0)
-                               curoff += xfs_dir3_data_entry_offset(hdr);
-                       /*
-                        * Skip past entries until we reach our offset.
-                        */
-                       else {
-                               while ((char *)ptr - (char *)hdr < byteoff) {
-                                       dup = (xfs_dir2_data_unused_t *)ptr;
-
-                                       if (be16_to_cpu(dup->freetag)
-                                                 == XFS_DIR2_DATA_FREE_TAG) {
-
-                                               length = be16_to_cpu(dup->length);
-                                               ptr += length;
-                                               continue;
-                                       }
-                                       dep = (xfs_dir2_data_entry_t *)ptr;
-                                       length =
-                                          xfs_dir2_data_entsize(dep->namelen);
-                                       ptr += length;
-                               }
-                               /*
-                                * Now set our real offset.
-                                */
-                               curoff =
-                                       xfs_dir2_db_off_to_byte(mp,
-                                           xfs_dir2_byte_to_db(mp, curoff),
-                                           (char *)ptr - (char *)hdr);
-                               if (ptr >= (char *)hdr + mp->m_dirblksize) {
-                                       continue;
-                               }
-                       }
-               }
-               /*
-                * We have a pointer to an entry.
-                * Is it a live one?
-                */
-               dup = (xfs_dir2_data_unused_t *)ptr;
-               /*
-                * No, it's unused, skip over it.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       length = be16_to_cpu(dup->length);
-                       ptr += length;
-                       curoff += length;
-                       continue;
-               }
-
-               dep = (xfs_dir2_data_entry_t *)ptr;
-               length = xfs_dir2_data_entsize(dep->namelen);
-
-               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
-                           be64_to_cpu(dep->inumber), DT_UNKNOWN))
-                       break;
-
-               /*
-                * Advance to next entry in the block.
-                */
-               ptr += length;
-               curoff += length;
-               /* bufsize may have just been a guess; don't go negative */
-               bufsize = bufsize > length ? bufsize - length : 0;
-       }
-
-       /*
-        * All done.  Set output offset value to current offset.
-        */
-       if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-               ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
-       else
-               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-       kmem_free(map_info);
-       if (bp)
-               xfs_trans_brelse(NULL, bp);
-       return error;
-}
-
-
  /*
   * Log the bests entries indicated from a leaf1 block.
   */
@@ -1614,6 +1226,7 @@ xfs_dir2_leaf_lookup(
          * Return the found inode number & CI name if appropriate
          */
         args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
         error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
         xfs_trans_brelse(tp, dbp);
         xfs_trans_brelse(tp, lbp);
@@ -1816,7 +1429,7 @@ xfs_dir2_leaf_removename(
          */
         xfs_dir2_data_make_free(tp, dbp,
                 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
         /*
          * We just mark the leaf entry stale by putting a null in it.
          */
@@ -1944,6 +1557,7 @@ xfs_dir2_leaf_replace(
          * Put the new inode number in, log it.
          */
         dep->inumber = cpu_to_be64(args->inumber);
+       xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
         tp = args->trans;
         xfs_dir2_data_log_entry(tp, dbp, dep);
         xfs_dir3_leaf_check(dp->i_mount, lbp);
@@ -1975,10 +1589,6 @@ xfs_dir2_leaf_search_hash(
         ents = xfs_dir3_leaf_ents_p(leaf);
         xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
  
-#ifndef __KERNEL__
-       if (!leafhdr.count)
-               return 0;
-#endif
         /*
          * Note, the table cannot be empty, so we have to go through the loop.
          * Binary search the leaf entries looking for our hash value.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c

index 2226a00acd156118a2998ce37c2c95ae628503d9..4c3dba7ffb7439d250b0af44e4de237a9359a795 100644 (file)
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -30,6 +30,7 @@
  #include "xfs_inode.h"
  #include "xfs_bmap.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_error.h"
  #include "xfs_trace.h"
@@ -312,11 +313,13 @@ xfs_dir2_free_log_header(
         struct xfs_trans        *tp,
         struct xfs_buf          *bp)
  {
+#ifdef DEBUG
         xfs_dir2_free_t         *free;          /* freespace structure */
  
         free = bp->b_addr;
         ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
                free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
         xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
  }
  
@@ -602,7 +605,7 @@ xfs_dir2_leafn_lookup_for_addname(
                 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
                        free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
         }
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
         /*
          * Loop over leaf entries with the right hash value.
          */
@@ -813,6 +816,7 @@ xfs_dir2_leafn_lookup_for_entry(
                                 xfs_trans_brelse(tp, state->extrablk.bp);
                         args->cmpresult = cmp;
                         args->inumber = be64_to_cpu(dep->inumber);
+                       args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
                         *indexp = index;
                         state->extravalid = 1;
                         state->extrablk.bp = curbp;
@@ -1256,7 +1260,7 @@ xfs_dir2_leafn_remove(
         longest = be16_to_cpu(bf[0].length);
         needlog = needscan = 0;
         xfs_dir2_data_make_free(tp, dbp, off,
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
         /*
          * Rescan the data block freespaces for bestfree.
          * Log the data block header if needed.
@@ -1708,7 +1712,7 @@ xfs_dir2_node_addname_int(
         dp = args->dp;
         mp = dp->i_mount;
         tp = args->trans;
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
         /*
          * If we came in with a freespace block that means that lookup
          * found an entry with our hash value.  This is the freespace
@@ -2004,7 +2008,8 @@ xfs_dir2_node_addname_int(
         dep->inumber = cpu_to_be64(args->inumber);
         dep->namelen = args->namelen;
         memcpy(dep->name, args->name, dep->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
         *tagp = cpu_to_be16((char *)dep - (char *)hdr);
         xfs_dir2_data_log_entry(tp, dbp, dep);
         /*
@@ -2224,6 +2229,7 @@ xfs_dir2_node_replace(
                  * Fill in the new inode number and log the entry.
                  */
                 dep->inumber = cpu_to_be64(inum);
+               xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
                 xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
                 rval = 0;
         }
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h

index 0511cda4a712a480682b946829c5587cf37c4070..1bad84c408295ae4db2de71d96c74b18fff5b2b2 100644 (file)
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -18,23 +18,26 @@
  #ifndef __XFS_DIR2_PRIV_H__
  #define __XFS_DIR2_PRIV_H__
  
+struct dir_context;
+
  /* xfs_dir2.c */
  extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
  extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
                                 xfs_dir2_db_t *dbp);
-extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
-                               struct xfs_buf *bp);
  extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
                                 const unsigned char *name, int len);
  
-/* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+                                       __uint8_t filetype);
  
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                              struct xfs_buf **bpp);
  extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
-               struct dir_context *ctx);
  extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
  extern int xfs_dir2_block_removename(struct xfs_da_args *args);
  extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -48,9 +51,6 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
  #define        xfs_dir3_data_check(dp,bp)
  #endif
  
-extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
-
  extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
  extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
                 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -60,27 +60,10 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
  extern struct xfs_dir2_data_free *
  xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
                 struct xfs_dir2_data_unused *dup, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
-               struct xfs_dir2_data_hdr *hdr, int *loghead);
  extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
                 struct xfs_buf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
-               struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
-               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
-               int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
-               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
  
  /* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
-
  extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
                 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
  extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
@@ -91,8 +74,6 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
  extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
                 struct xfs_dir2_leaf_entry *ents, int *indexp,
                 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
-               size_t bufsize);
  extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
                 struct xfs_buf **bpp, __uint16_t magic);
  extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -144,18 +125,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
                 xfs_dablk_t fbno, struct xfs_buf **bpp);
  
  /* xfs_dir2_sf.c */
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
-               struct xfs_dir2_sf_entry *sfep);
  extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
                 struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
  extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
                 int size, xfs_dir2_sf_hdr_t *sfhp);
  extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
  extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
  extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
  extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
  extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
  
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+                      size_t bufsize);
+
  #endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c

new file mode 100644 (file)

index 0000000..8993ec1
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+
+/*
+ * Directory file type support functions
+ */
+static unsigned char xfs_dir3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+       DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+unsigned char
+xfs_dir3_get_dtype(
+       struct xfs_mount        *mp,
+       __uint8_t               filetype)
+{
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return DT_UNKNOWN;
+
+       if (filetype >= XFS_DIR3_FT_MAX)
+               return DT_UNKNOWN;
+
+       return xfs_dir3_filetype_table[filetype];
+}
+/*
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+       [0]                     = XFS_DIR3_FT_UNKNOWN,
+       [S_IFREG >> S_SHIFT]    = XFS_DIR3_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    = XFS_DIR3_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    = XFS_DIR3_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    = XFS_DIR3_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    = XFS_DIR3_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   = XFS_DIR3_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    = XFS_DIR3_FT_SYMLINK,
+};
+
+STATIC int
+xfs_dir2_sf_getdents(
+       xfs_inode_t             *dp,            /* incore directory inode */
+       struct dir_context      *ctx)
+{
+       int                     i;              /* shortform entry number */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_dataptr_t      off;            /* current entry's offset */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       xfs_dir2_dataptr_t      dot_offset;
+       xfs_dir2_dataptr_t      dotdot_offset;
+       xfs_ino_t               ino;
+
+       mp = dp->i_mount;
+
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       /*
+        * Give up if the directory is way too short.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               return XFS_ERROR(EIO);
+       }
+
+       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+       /*
+        * If the block number in the offset is out of range, we're done.
+        */
+       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+               return 0;
+
+       /*
+        * Precalculate offsets for . and .. as we will always need them.
+        *
+        * XXX(hch): the second argument is sometimes 0 and sometimes
+        * mp->m_dirdatablk.
+        */
+       dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                            XFS_DIR3_DATA_DOT_OFFSET(mp));
+       dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                               XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
+
+       /*
+        * Put . entry unless we're starting past it.
+        */
+       if (ctx->pos <= dot_offset) {
+               ctx->pos = dot_offset & 0x7fffffff;
+               if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+                       return 0;
+       }
+
+       /*
+        * Put .. entry unless we're starting past it.
+        */
+       if (ctx->pos <= dotdot_offset) {
+               ino = xfs_dir2_sf_get_parent_ino(sfp);
+               ctx->pos = dotdot_offset & 0x7fffffff;
+               if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+                       return 0;
+       }
+
+       /*
+        * Loop while there are more entries and put'ing works.
+        */
+       sfep = xfs_dir2_sf_firstentry(sfp);
+       for (i = 0; i < sfp->count; i++) {
+               __uint8_t filetype;
+
+               off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                               xfs_dir2_sf_get_offset(sfep));
+
+               if (ctx->pos > off) {
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+                       continue;
+               }
+
+               ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
+               filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
+               ctx->pos = off & 0x7fffffff;
+               if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+                           xfs_dir3_get_dtype(mp, filetype)))
+                       return 0;
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+       }
+
+       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                       0x7fffffff;
+       return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+       xfs_inode_t             *dp,            /* incore inode */
+       struct dir_context      *ctx)
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       struct xfs_buf          *bp;            /* buffer for block */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
+       char                    *endptr;        /* end of the data entries */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       char                    *ptr;           /* current data entry */
+       int                     wantoff;        /* starting block offset */
+       xfs_off_t               cook;
+
+       mp = dp->i_mount;
+       /*
+        * If the block number in the offset is out of range, we're done.
+        */
+       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+               return 0;
+
+       error = xfs_dir3_block_read(NULL, dp, &bp);
+       if (error)
+               return error;
+
+       /*
+        * Extract the byte offset we start at from the seek pointer.
+        * We'll skip entries before this.
+        */
+       wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
+       hdr = bp->b_addr;
+       xfs_dir3_data_check(dp, bp);
+       /*
+        * Set up values for the loop.
+        */
+       btp = xfs_dir2_block_tail_p(mp, hdr);
+       ptr = (char *)xfs_dir3_data_entry_p(hdr);
+       endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+       /*
+        * Loop over the data portion of the block.
+        * Each object is a real entry (dep) or an unused one (dup).
+        */
+       while (ptr < endptr) {
+               __uint8_t filetype;
+
+               dup = (xfs_dir2_data_unused_t *)ptr;
+               /*
+                * Unused, skip it.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       ptr += be16_to_cpu(dup->length);
+                       continue;
+               }
+
+               dep = (xfs_dir2_data_entry_t *)ptr;
+
+               /*
+                * Bump pointer for the next iteration.
+                */
+               ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+               /*
+                * The entry is before the desired starting point, skip it.
+                */
+               if ((char *)dep - (char *)hdr < wantoff)
+                       continue;
+
+               cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                           (char *)dep - (char *)hdr);
+
+               ctx->pos = cook & 0x7fffffff;
+               filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+               /*
+                * If it didn't fit, set the final offset to here & return.
+                */
+               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+                           be64_to_cpu(dep->inumber),
+                           xfs_dir3_get_dtype(mp, filetype))) {
+                       xfs_trans_brelse(NULL, bp);
+                       return 0;
+               }
+       }
+
+       /*
+        * Reached the end of the block.
+        * Set the offset to a non-existent block 1 and return.
+        */
+       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                       0x7fffffff;
+       xfs_trans_brelse(NULL, bp);
+       return 0;
+}
+
+struct xfs_dir2_leaf_map_info {
+       xfs_extlen_t    map_blocks;     /* number of fsbs in map */
+       xfs_dablk_t     map_off;        /* last mapped file offset */
+       int             map_size;       /* total entries in *map */
+       int             map_valid;      /* valid entries in *map */
+       int             nmap;           /* mappings to ask xfs_bmapi */
+       xfs_dir2_db_t   curdb;          /* db for current block */
+       int             ra_current;     /* number of read-ahead blks */
+       int             ra_index;       /* *map index for read-ahead */
+       int             ra_offset;      /* map entry offset for ra */
+       int             ra_want;        /* readahead count wanted */
+       struct xfs_bmbt_irec map[];     /* map vector for blocks */
+};
+
+STATIC int
+xfs_dir2_leaf_readbuf(
+       struct xfs_inode        *dp,
+       size_t                  bufsize,
+       struct xfs_dir2_leaf_map_info *mip,
+       xfs_dir2_off_t          *curoff,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp = *bpp;
+       struct xfs_bmbt_irec    *map = mip->map;
+       struct blk_plug         plug;
+       int                     error = 0;
+       int                     length;
+       int                     i;
+       int                     j;
+
+       /*
+        * If we have a buffer, we need to release it and
+        * take it out of the mapping.
+        */
+
+       if (bp) {
+               xfs_trans_brelse(NULL, bp);
+               bp = NULL;
+               mip->map_blocks -= mp->m_dirblkfsbs;
+               /*
+                * Loop to get rid of the extents for the
+                * directory block.
+                */
+               for (i = mp->m_dirblkfsbs; i > 0; ) {
+                       j = min_t(int, map->br_blockcount, i);
+                       map->br_blockcount -= j;
+                       map->br_startblock += j;
+                       map->br_startoff += j;
+                       /*
+                        * If mapping is done, pitch it from
+                        * the table.
+                        */
+                       if (!map->br_blockcount && --mip->map_valid)
+                               memmove(&map[0], &map[1],
+                                       sizeof(map[0]) * mip->map_valid);
+                       i -= j;
+               }
+       }
+
+       /*
+        * Recalculate the readahead blocks wanted.
+        */
+       mip->ra_want = howmany(bufsize + mp->m_dirblksize,
+                              mp->m_sb.sb_blocksize) - 1;
+       ASSERT(mip->ra_want >= 0);
+
+       /*
+        * If we don't have as many as we want, and we haven't
+        * run out of data blocks, get some more mappings.
+        */
+       if (1 + mip->ra_want > mip->map_blocks &&
+           mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+               /*
+                * Get more bmaps, fill in after the ones
+                * we already have in the table.
+                */
+               mip->nmap = mip->map_size - mip->map_valid;
+               error = xfs_bmapi_read(dp, mip->map_off,
+                               xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
+                                                               mip->map_off,
+                               &map[mip->map_valid], &mip->nmap, 0);
+
+               /*
+                * Don't know if we should ignore this or try to return an
+                * error.  The trouble with returning errors is that readdir
+                * will just stop without actually passing the error through.
+                */
+               if (error)
+                       goto out;       /* XXX */
+
+               /*
+                * If we got all the mappings we asked for, set the final map
+                * offset based on the last bmap value received.  Otherwise,
+                * we've reached the end.
+                */
+               if (mip->nmap == mip->map_size - mip->map_valid) {
+                       i = mip->map_valid + mip->nmap - 1;
+                       mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+               } else
+                       mip->map_off = xfs_dir2_byte_to_da(mp,
+                                                       XFS_DIR2_LEAF_OFFSET);
+
+               /*
+                * Look for holes in the mapping, and eliminate them.  Count up
+                * the valid blocks.
+                */
+               for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+                       if (map[i].br_startblock == HOLESTARTBLOCK) {
+                               mip->nmap--;
+                               length = mip->map_valid + mip->nmap - i;
+                               if (length)
+                                       memmove(&map[i], &map[i + 1],
+                                               sizeof(map[i]) * length);
+                       } else {
+                               mip->map_blocks += map[i].br_blockcount;
+                               i++;
+                       }
+               }
+               mip->map_valid += mip->nmap;
+       }
+
+       /*
+        * No valid mappings, so no more data blocks.
+        */
+       if (!mip->map_valid) {
+               *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
+               goto out;
+       }
+
+       /*
+        * Read the directory block starting at the first mapping.
+        */
+       mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+       error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
+                       map->br_blockcount >= mp->m_dirblkfsbs ?
+                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
+
+       /*
+        * Should just skip over the data block instead of giving up.
+        */
+       if (error)
+               goto out;       /* XXX */
+
+       /*
+        * Adjust the current amount of read-ahead: we just read a block that
+        * was previously ra.
+        */
+       if (mip->ra_current)
+               mip->ra_current -= mp->m_dirblkfsbs;
+
+       /*
+        * Do we need more readahead?
+        */
+       blk_start_plug(&plug);
+       for (mip->ra_index = mip->ra_offset = i = 0;
+            mip->ra_want > mip->ra_current && i < mip->map_blocks;
+            i += mp->m_dirblkfsbs) {
+               ASSERT(mip->ra_index < mip->map_valid);
+               /*
+                * Read-ahead a contiguous directory block.
+                */
+               if (i > mip->ra_current &&
+                   map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
+                       xfs_dir3_data_readahead(NULL, dp,
+                               map[mip->ra_index].br_startoff + mip->ra_offset,
+                               XFS_FSB_TO_DADDR(mp,
+                                       map[mip->ra_index].br_startblock +
+                                                       mip->ra_offset));
+                       mip->ra_current = i;
+               }
+
+               /*
+                * Read-ahead a non-contiguous directory block.  This doesn't
+                * use our mapping, but this is a very rare case.
+                */
+               else if (i > mip->ra_current) {
+                       xfs_dir3_data_readahead(NULL, dp,
+                                       map[mip->ra_index].br_startoff +
+                                                       mip->ra_offset, -1);
+                       mip->ra_current = i;
+               }
+
+               /*
+                * Advance offset through the mapping table.
+                */
+               for (j = 0; j < mp->m_dirblkfsbs; j++) {
+                       /*
+                        * The rest of this extent but not more than a dir
+                        * block.
+                        */
+                       length = min_t(int, mp->m_dirblkfsbs,
+                                       map[mip->ra_index].br_blockcount -
+                                                       mip->ra_offset);
+                       j += length;
+                       mip->ra_offset += length;
+
+                       /*
+                        * Advance to the next mapping if this one is used up.
+                        */
+                       if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+                               mip->ra_offset = 0;
+                               mip->ra_index++;
+                       }
+               }
+       }
+       blk_finish_plug(&plug);
+
+out:
+       *bpp = bp;
+       return error;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+       xfs_inode_t             *dp,            /* incore directory inode */
+       struct dir_context      *ctx,
+       size_t                  bufsize)
+{
+       struct xfs_buf          *bp = NULL;     /* data block buffer */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_data_entry_t   *dep;           /* data entry */
+       xfs_dir2_data_unused_t  *dup;           /* unused entry */
+       int                     error = 0;      /* error return value */
+       int                     length;         /* temporary length value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     byteoff;        /* offset in current block */
+       xfs_dir2_off_t          curoff;         /* current overall offset */
+       xfs_dir2_off_t          newoff;         /* new curoff after new blk */
+       char                    *ptr = NULL;    /* pointer to current data */
+       struct xfs_dir2_leaf_map_info *map_info;
+
+       /*
+        * If the offset is at or past the largest allowed value,
+        * give up right away.
+        */
+       if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+               return 0;
+
+       mp = dp->i_mount;
+
+       /*
+        * Set up to bmap a number of blocks based on the caller's
+        * buffer size, the directory block size, and the filesystem
+        * block size.
+        */
+       length = howmany(bufsize + mp->m_dirblksize,
+                                    mp->m_sb.sb_blocksize);
+       map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+                               (length * sizeof(struct xfs_bmbt_irec)),
+                              KM_SLEEP | KM_NOFS);
+       map_info->map_size = length;
+
+       /*
+        * Inside the loop we keep the main offset value as a byte offset
+        * in the directory file.
+        */
+       curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
+
+       /*
+        * Force this conversion through db so we truncate the offset
+        * down to get the start of the data block.
+        */
+       map_info->map_off = xfs_dir2_db_to_da(mp,
+                                             xfs_dir2_byte_to_db(mp, curoff));
+
+       /*
+        * Loop over directory entries until we reach the end offset.
+        * Get more blocks and readahead as necessary.
+        */
+       while (curoff < XFS_DIR2_LEAF_OFFSET) {
+               __uint8_t filetype;
+
+               /*
+                * If we have no buffer, or we're off the end of the
+                * current buffer, need to get another one.
+                */
+               if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
+
+                       error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
+                                                     &curoff, &bp);
+                       if (error || !map_info->map_valid)
+                               break;
+
+                       /*
+                        * Having done a read, we need to set a new offset.
+                        */
+                       newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
+                       /*
+                        * Start of the current block.
+                        */
+                       if (curoff < newoff)
+                               curoff = newoff;
+                       /*
+                        * Make sure we're in the right block.
+                        */
+                       else if (curoff > newoff)
+                               ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
+                                      map_info->curdb);
+                       hdr = bp->b_addr;
+                       xfs_dir3_data_check(dp, bp);
+                       /*
+                        * Find our position in the block.
+                        */
+                       ptr = (char *)xfs_dir3_data_entry_p(hdr);
+                       byteoff = xfs_dir2_byte_to_off(mp, curoff);
+                       /*
+                        * Skip past the header.
+                        */
+                       if (byteoff == 0)
+                               curoff += xfs_dir3_data_entry_offset(hdr);
+                       /*
+                        * Skip past entries until we reach our offset.
+                        */
+                       else {
+                               while ((char *)ptr - (char *)hdr < byteoff) {
+                                       dup = (xfs_dir2_data_unused_t *)ptr;
+
+                                       if (be16_to_cpu(dup->freetag)
+                                                 == XFS_DIR2_DATA_FREE_TAG) {
+
+                                               length = be16_to_cpu(dup->length);
+                                               ptr += length;
+                                               continue;
+                                       }
+                                       dep = (xfs_dir2_data_entry_t *)ptr;
+                                       length =
+                                          xfs_dir3_data_entsize(mp, dep->namelen);
+                                       ptr += length;
+                               }
+                               /*
+                                * Now set our real offset.
+                                */
+                               curoff =
+                                       xfs_dir2_db_off_to_byte(mp,
+                                           xfs_dir2_byte_to_db(mp, curoff),
+                                           (char *)ptr - (char *)hdr);
+                               if (ptr >= (char *)hdr + mp->m_dirblksize) {
+                                       continue;
+                               }
+                       }
+               }
+               /*
+                * We have a pointer to an entry.
+                * Is it a live one?
+                */
+               dup = (xfs_dir2_data_unused_t *)ptr;
+               /*
+                * No, it's unused, skip over it.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       length = be16_to_cpu(dup->length);
+                       ptr += length;
+                       curoff += length;
+                       continue;
+               }
+
+               dep = (xfs_dir2_data_entry_t *)ptr;
+               length = xfs_dir3_data_entsize(mp, dep->namelen);
+               filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+
+               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+                           be64_to_cpu(dep->inumber),
+                           xfs_dir3_get_dtype(mp, filetype)))
+                       break;
+
+               /*
+                * Advance to next entry in the block.
+                */
+               ptr += length;
+               curoff += length;
+               /* bufsize may have just been a guess; don't go negative */
+               bufsize = bufsize > length ? bufsize - length : 0;
+       }
+
+       /*
+        * All done.  Set output offset value to current offset.
+        */
+       if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
+               ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+       else
+               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+       kmem_free(map_info);
+       if (bp)
+               xfs_trans_brelse(NULL, bp);
+       return error;
+}
+
+/*
+ * Read a directory.
+ */
+int
+xfs_readdir(
+       xfs_inode_t     *dp,
+       struct dir_context *ctx,
+       size_t          bufsize)
+{
+       int             rval;           /* return value */
+       int             v;              /* type-checking value */
+
+       trace_xfs_readdir(dp);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return XFS_ERROR(EIO);
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       XFS_STATS_INC(xs_dir_getdents);
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+               rval = xfs_dir2_sf_getdents(dp, ctx);
+       else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
+               ;
+       else if (v)
+               rval = xfs_dir2_block_getdents(dp, ctx);
+       else
+               rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+       return rval;
+}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c

index 97676a347da166e5d843db8ab2052666390e018d..bb6e2848f473d024d5ff1dd70a1232443901ea9f 100644 (file)
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -29,8 +29,8 @@
  #include "xfs_inode.h"
  #include "xfs_inode_item.h"
  #include "xfs_error.h"
-#include "xfs_dir2.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_trace.h"
  
@@ -95,7 +95,7 @@ xfs_dir2_sf_get_parent_ino(
         return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
  }
  
-static void
+void
  xfs_dir2_sf_put_parent_ino(
         struct xfs_dir2_sf_hdr  *hdr,
         xfs_ino_t               ino)
@@ -105,31 +105,38 @@ xfs_dir2_sf_put_parent_ino(
  
  /*
   * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name.  The inode numbers may only be accessed
- * through the helpers below.
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
   */
  static xfs_dir2_inou_t *
-xfs_dir2_sfe_inop(
+xfs_dir3_sfe_inop(
+       struct xfs_mount        *mp,
         struct xfs_dir2_sf_entry *sfep)
  {
-       return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
+       __uint8_t       *ptr = &sfep->name[sfep->namelen];
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               ptr++;
+       return (xfs_dir2_inou_t *)ptr;
  }
  
  xfs_ino_t
-xfs_dir2_sfe_get_ino(
+xfs_dir3_sfe_get_ino(
+       struct xfs_mount        *mp,
         struct xfs_dir2_sf_hdr  *hdr,
         struct xfs_dir2_sf_entry *sfep)
  {
-       return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep));
+       return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
  }
  
-static void
-xfs_dir2_sfe_put_ino(
+void
+xfs_dir3_sfe_put_ino(
+       struct xfs_mount        *mp,
         struct xfs_dir2_sf_hdr  *hdr,
         struct xfs_dir2_sf_entry *sfep,
         xfs_ino_t               ino)
  {
-       xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino);
+       xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
  }
  
  /*
@@ -157,9 +164,16 @@ xfs_dir2_block_sfsize(
         int                     namelen;        /* total name bytes */
         xfs_ino_t               parent = 0;     /* parent inode number */
         int                     size=0;         /* total computed size */
+       int                     has_ftype;
  
         mp = dp->i_mount;
  
+       /*
+        * if there is a filetype field, add the extra byte to the namelen
+        * for each entry that we see.
+        */
+       has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
         count = i8count = namelen = 0;
         btp = xfs_dir2_block_tail_p(mp, hdr);
         blp = xfs_dir2_block_leaf_p(btp);
@@ -188,9 +202,10 @@ xfs_dir2_block_sfsize(
                 if (!isdot)
                         i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
  #endif
+               /* take into account the file type field */
                 if (!isdot && !isdotdot) {
                         count++;
-                       namelen += dep->namelen;
+                       namelen += dep->namelen + has_ftype;
                 } else if (isdotdot)
                         parent = be64_to_cpu(dep->inumber);
                 /*
@@ -316,12 +331,14 @@ xfs_dir2_block_to_sf(
                                 (xfs_dir2_data_aoff_t)
                                 ((char *)dep - (char *)hdr));
                         memcpy(sfep->name, dep->name, dep->namelen);
-                       xfs_dir2_sfe_put_ino(sfp, sfep,
+                       xfs_dir3_sfe_put_ino(mp, sfp, sfep,
                                              be64_to_cpu(dep->inumber));
+                       xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                                       xfs_dir3_dirent_get_ftype(mp, dep));
  
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
                 }
-               ptr += xfs_dir2_data_entsize(dep->namelen);
+               ptr += xfs_dir3_data_entsize(mp, dep->namelen);
         }
         ASSERT((char *)sfep - (char *)sfp == size);
         xfs_dir2_sf_check(args);
@@ -372,7 +389,7 @@ xfs_dir2_sf_addname(
         /*
          * Compute entry (and change in) size.
          */
-       add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+       add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
         incr_isize = add_entsize;
         objchange = 0;
  #if XFS_BIG_INUMS
@@ -466,8 +483,9 @@ xfs_dir2_sf_addname_easy(
         /*
          * Grow the in-inode space.
          */
-       xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen),
-               XFS_DATA_FORK);
+       xfs_idata_realloc(dp,
+                         xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
+                         XFS_DATA_FORK);
         /*
          * Need to set up again due to realloc of the inode data.
          */
@@ -479,7 +497,9 @@ xfs_dir2_sf_addname_easy(
         sfep->namelen = args->namelen;
         xfs_dir2_sf_put_offset(sfep, offset);
         memcpy(sfep->name, args->name, sfep->namelen);
-       xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
+
         /*
          * Update the header and inode.
          */
@@ -519,11 +539,13 @@ xfs_dir2_sf_addname_hard(
         xfs_dir2_sf_hdr_t       *oldsfp;        /* original shortform dir */
         xfs_dir2_sf_entry_t     *sfep;          /* entry in new dir */
         xfs_dir2_sf_hdr_t       *sfp;           /* new shortform dir */
+       struct xfs_mount        *mp;
  
         /*
          * Copy the old directory to the stack buffer.
          */
         dp = args->dp;
+       mp = dp->i_mount;
  
         sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
         old_isize = (int)dp->i_d.di_size;
@@ -535,13 +557,13 @@ xfs_dir2_sf_addname_hard(
          * to insert the new entry.
          * If it's going to end up at the end then oldsfep will point there.
          */
-       for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount),
+       for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
               oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-             add_datasize = xfs_dir2_data_entsize(args->namelen),
+             add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
               eof = (char *)oldsfep == &buf[old_isize];
              !eof;
-            offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
-             oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
+            offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
+             oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
               eof = (char *)oldsfep == &buf[old_isize]) {
                 new_offset = xfs_dir2_sf_get_offset(oldsfep);
                 if (offset + add_datasize <= new_offset)
@@ -570,7 +592,8 @@ xfs_dir2_sf_addname_hard(
         sfep->namelen = args->namelen;
         xfs_dir2_sf_put_offset(sfep, offset);
         memcpy(sfep->name, args->name, sfep->namelen);
-       xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
         sfp->count++;
  #if XFS_BIG_INUMS
         if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -580,7 +603,7 @@ xfs_dir2_sf_addname_hard(
          * If there's more left to copy, do that.
          */
         if (!eof) {
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
                 memcpy(sfep, oldsfep, old_isize - nbytes);
         }
         kmem_free(buf);
@@ -616,7 +639,7 @@ xfs_dir2_sf_addname_pick(
         mp = dp->i_mount;
  
         sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       size = xfs_dir2_data_entsize(args->namelen);
+       size = xfs_dir3_data_entsize(mp, args->namelen);
         offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
         sfep = xfs_dir2_sf_firstentry(sfp);
         holefit = 0;
@@ -629,8 +652,8 @@ xfs_dir2_sf_addname_pick(
                 if (!holefit)
                         holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
                 offset = xfs_dir2_sf_get_offset(sfep) +
-                        xfs_dir2_data_entsize(sfep->namelen);
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                        xfs_dir3_data_entsize(mp, sfep->namelen);
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
         }
         /*
          * Calculate data bytes used excluding the new entry, if this
@@ -684,31 +707,34 @@ xfs_dir2_sf_check(
         int                     offset;         /* data offset */
         xfs_dir2_sf_entry_t     *sfep;          /* shortform dir entry */
         xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       struct xfs_mount        *mp;
  
         dp = args->dp;
+       mp = dp->i_mount;
  
         sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount);
+       offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
         ino = xfs_dir2_sf_get_parent_ino(sfp);
         i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
  
         for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
              i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
                 ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+               ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
                 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
                 offset =
                         xfs_dir2_sf_get_offset(sfep) +
-                       xfs_dir2_data_entsize(sfep->namelen);
+                       xfs_dir3_data_entsize(mp, sfep->namelen);
+               ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
+                                                       XFS_DIR3_FT_MAX);
         }
         ASSERT(i8count == sfp->i8count);
         ASSERT(XFS_BIG_INUMS || i8count == 0);
         ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
         ASSERT(offset +
                (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-              (uint)sizeof(xfs_dir2_block_tail_t) <=
-              dp->i_mount->m_dirblksize);
+              (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
  }
  #endif /* DEBUG */
  
@@ -765,100 +791,6 @@ xfs_dir2_sf_create(
         return 0;
  }
  
-int                                            /* error */
-xfs_dir2_sf_getdents(
-       xfs_inode_t             *dp,            /* incore directory inode */
-       struct dir_context      *ctx)
-{
-       int                     i;              /* shortform entry number */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_dataptr_t      off;            /* current entry's offset */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       xfs_dir2_dataptr_t      dot_offset;
-       xfs_dir2_dataptr_t      dotdot_offset;
-       xfs_ino_t               ino;
-
-       mp = dp->i_mount;
-
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       /*
-        * Give up if the directory is way too short.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               return XFS_ERROR(EIO);
-       }
-
-       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-
-       /*
-        * If the block number in the offset is out of range, we're done.
-        */
-       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
-               return 0;
-
-       /*
-        * Precalculate offsets for . and .. as we will always need them.
-        *
-        * XXX(hch): the second argument is sometimes 0 and sometimes
-        * mp->m_dirdatablk.
-        */
-       dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                            XFS_DIR3_DATA_DOT_OFFSET(mp));
-       dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                               XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
-
-       /*
-        * Put . entry unless we're starting past it.
-        */
-       if (ctx->pos <= dot_offset) {
-               ctx->pos = dot_offset & 0x7fffffff;
-               if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
-                       return 0;
-       }
-
-       /*
-        * Put .. entry unless we're starting past it.
-        */
-       if (ctx->pos <= dotdot_offset) {
-               ino = xfs_dir2_sf_get_parent_ino(sfp);
-               ctx->pos = dotdot_offset & 0x7fffffff;
-               if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
-                       return 0;
-       }
-
-       /*
-        * Loop while there are more entries and put'ing works.
-        */
-       sfep = xfs_dir2_sf_firstentry(sfp);
-       for (i = 0; i < sfp->count; i++) {
-               off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                               xfs_dir2_sf_get_offset(sfep));
-
-               if (ctx->pos > off) {
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-                       continue;
-               }
-
-               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
-               ctx->pos = off & 0x7fffffff;
-               if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
-                           ino, DT_UNKNOWN))
-                       return 0;
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-       }
-
-       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
-                       0x7fffffff;
-       return 0;
-}
-
  /*
   * Lookup an entry in a shortform directory.
   * Returns EEXIST if found, ENOENT if not found.
@@ -898,6 +830,7 @@ xfs_dir2_sf_lookup(
         if (args->namelen == 1 && args->name[0] == '.') {
                 args->inumber = dp->i_ino;
                 args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
                 return XFS_ERROR(EEXIST);
         }
         /*
@@ -907,6 +840,7 @@ xfs_dir2_sf_lookup(
             args->name[0] == '.' && args->name[1] == '.') {
                 args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
                 args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
                 return XFS_ERROR(EEXIST);
         }
         /*
@@ -914,7 +848,7 @@ xfs_dir2_sf_lookup(
          */
         ci_sfep = NULL;
         for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                 /*
                  * Compare name and if it's an exact match, return the inode
                  * number. If it's the first case-insensitive match, store the
@@ -924,7 +858,10 @@ xfs_dir2_sf_lookup(
                                                                 sfep->namelen);
                 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
                         args->cmpresult = cmp;
-                       args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep);
+                       args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
+                                                            sfp, sfep);
+                       args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
+                                                               sfp, sfep);
                         if (cmp == XFS_CMP_EXACT)
                                 return XFS_ERROR(EEXIST);
                         ci_sfep = sfep;
@@ -980,10 +917,10 @@ xfs_dir2_sf_removename(
          * Find the one we're deleting.
          */
         for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
                                                                 XFS_CMP_EXACT) {
-                       ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) ==
+                       ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
                                args->inumber);
                         break;
                 }
@@ -997,7 +934,7 @@ xfs_dir2_sf_removename(
          * Calculate sizes.
          */
         byteoff = (int)((char *)sfep - (char *)sfp);
-       entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+       entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
         newsize = oldsize - entsize;
         /*
          * Copy the part if any after the removed entry, sliding it down.
@@ -1113,16 +1050,19 @@ xfs_dir2_sf_replace(
          * Normal entry, look for the name.
          */
         else {
-               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-                               i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+                    i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                         if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
                                                                 XFS_CMP_EXACT) {
  #if XFS_BIG_INUMS || defined(DEBUG)
-                               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+                               ino = xfs_dir3_sfe_get_ino(dp->i_mount,
+                                                          sfp, sfep);
                                 ASSERT(args->inumber != ino);
  #endif
-                               xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+                               xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
+                                                    args->inumber);
+                               xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
+                                                      args->filetype);
                                 break;
                         }
                 }
@@ -1189,10 +1129,12 @@ xfs_dir2_sf_toino4(
         int                     oldsize;        /* old inode size */
         xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
         xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
  
         trace_xfs_dir2_sf_toino4(args);
  
         dp = args->dp;
+       mp = dp->i_mount;
  
         /*
          * Copy the old directory to the buffer.
@@ -1230,13 +1172,15 @@ xfs_dir2_sf_toino4(
         for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
                     oldsfep = xfs_dir2_sf_firstentry(oldsfp);
              i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-                 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+                 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
                 sfep->namelen = oldsfep->namelen;
                 sfep->offset = oldsfep->offset;
                 memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               xfs_dir2_sfe_put_ino(sfp, sfep,
-                       xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
         }
         /*
          * Clean up the inode.
@@ -1264,10 +1208,12 @@ xfs_dir2_sf_toino8(
         int                     oldsize;        /* old inode size */
         xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
         xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
  
         trace_xfs_dir2_sf_toino8(args);
  
         dp = args->dp;
+       mp = dp->i_mount;
  
         /*
          * Copy the old directory to the buffer.
@@ -1305,13 +1251,15 @@ xfs_dir2_sf_toino8(
         for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
                     oldsfep = xfs_dir2_sf_firstentry(oldsfp);
              i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-                 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+                 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
                 sfep->namelen = oldsfep->namelen;
                 sfep->offset = oldsfep->offset;
                 memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               xfs_dir2_sfe_put_ino(sfp, sfep,
-                       xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
         }
         /*
          * Clean up the inode.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c

index 69cf4fcde03e2d31266f70f6dfee1b73fe71b4a7..45560ee1a4ba8b1ccfdc36f9616cc5355e03558d 100644 (file)
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -16,12 +16,13 @@
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
  #include "xfs_quota.h"
-#include "xfs_trans.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c

index 0adf27ecf3f1cd4e98d8fcc06a732e4d476437db..251c66632e5e7d926e92942d764e49220237dd11 100644 (file)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
@@ -28,6 +29,7 @@
  #include "xfs_bmap_btree.h"
  #include "xfs_inode.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_rtalloc.h"
  #include "xfs_error.h"
  #include "xfs_itable.h"
@@ -710,10 +712,8 @@ xfs_qm_dqread(
  
         if (flags & XFS_QMOPT_DQALLOC) {
                 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-               error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
-                                         XFS_QM_DQALLOC_LOG_RES(mp), 0,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
+                                         XFS_QM_DQALLOC_SPACE_RES(mp), 0);
                 if (error)
                         goto error1;
                 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c

index 57aa4b03720cb7440acef44a920d18f81955ba72..60c6e1f126952acc43e1bbe2a1d065f304ed484d 100644 (file)
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
@@ -43,14 +44,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
  /*
   * returns the number of iovecs needed to log the given dquot item.
   */
-STATIC uint
+STATIC void
  xfs_qm_dquot_logitem_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
-       /*
-        * we need only two iovecs, one for the format, one for the real thing
-        */
-       return 2;
+       *nvecs += 2;
+       *nbytes += sizeof(struct xfs_dq_logformat) +
+                  sizeof(struct xfs_disk_dquot);
  }
  
  /*
@@ -285,11 +287,14 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
   * We only need 1 iovec for an quotaoff item.  It just logs the
   * quotaoff_log_format structure.
   */
-STATIC uint
+STATIC void
  xfs_qm_qoff_logitem_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
-       return 1;
+       *nvecs += 1;
+       *nbytes += sizeof(struct xfs_qoff_logitem);
  }
  
  /*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c

index 35d3f5b041ddc0977f47981cb88991edee857245..1123d93ff79546efe3a9d962460ab1e44e2e1bac 100644 (file)
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -26,7 +26,6 @@
  #include "xfs_bmap_btree.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
-#include "xfs_utils.h"
  #include "xfs_error.h"
  
  #ifdef DEBUG
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c

index c585bc646395e04c5497621eeb4ec34bd1f262e7..066df425c14ffca5b4dacb20f7b3c6fcdd139acb 100644 (file)
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -21,10 +21,11 @@
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_export.h"
-#include "xfs_vnodeops.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_inode.h"
  #include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c

index 85e9f87a1a7ce7945da58e305a21818262542cfa..86f559f6e5d3c3f825df5aeffbf6967a7654f055 100644 (file)
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -147,7 +147,7 @@ xfs_extent_busy_search(
   * extent.  If the overlap covers the beginning, the end, or all of the busy
   * extent, the overlapping portion can be made unbusy and used for the
   * allocation.  We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
+ * transaction/CIL context busy list, but we can update an entry's block
   * number or length.
   *
   * Returns true if the extent can safely be reused, or false if the search
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c

index 452920a3f03fb2e4405ce52e34587e55acfb7abe..dc53e8febbbeaa54812b4e72dc25938718da69c1 100644 (file)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -73,11 +73,22 @@ __xfs_efi_release(
   * We only need 1 iovec for an efi item.  It just logs the efi_log_format
   * structure.
   */
-STATIC uint
+static inline int
+xfs_efi_item_sizeof(
+       struct xfs_efi_log_item *efip)
+{
+       return sizeof(struct xfs_efi_log_format) +
+              (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
  xfs_efi_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
-       return 1;
+       *nvecs += 1;
+       *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
  }
  
  /*
@@ -93,21 +104,17 @@ xfs_efi_item_format(
         struct xfs_log_iovec    *log_vector)
  {
         struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-       uint                    size;
  
         ASSERT(atomic_read(&efip->efi_next_extent) ==
                                 efip->efi_format.efi_nextents);
  
         efip->efi_format.efi_type = XFS_LI_EFI;
-
-       size = sizeof(xfs_efi_log_format_t);
-       size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
         efip->efi_format.efi_size = 1;
  
         log_vector->i_addr = &efip->efi_format;
-       log_vector->i_len = size;
+       log_vector->i_len = xfs_efi_item_sizeof(efip);
         log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
-       ASSERT(size >= sizeof(xfs_efi_log_format_t));
+       ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
  }
  
  
@@ -333,11 +340,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
   * We only need 1 iovec for an efd item.  It just logs the efd_log_format
   * structure.
   */
-STATIC uint
+static inline int
+xfs_efd_item_sizeof(
+       struct xfs_efd_log_item *efdp)
+{
+       return sizeof(xfs_efd_log_format_t) +
+              (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
  xfs_efd_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
-       return 1;
+       *nvecs += 1;
+       *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
  }
  
  /*
@@ -353,20 +371,16 @@ xfs_efd_item_format(
         struct xfs_log_iovec    *log_vector)
  {
         struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
-       uint                    size;
  
         ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
  
         efdp->efd_format.efd_type = XFS_LI_EFD;
-
-       size = sizeof(xfs_efd_log_format_t);
-       size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
         efdp->efd_format.efd_size = 1;
  
         log_vector->i_addr = &efdp->efd_format;
-       log_vector->i_len = size;
+       log_vector->i_len = xfs_efd_item_sizeof(efdp);
         log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
-       ASSERT(size >= sizeof(xfs_efd_log_format_t));
+       ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
  }
  
  /*
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h

index 432222418c566f6d7f30bed9951289ea9c5f1d10..0ffbce32d5693e05e042de8983c8274942616196 100644 (file)
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -18,93 +18,11 @@
  #ifndef        __XFS_EXTFREE_ITEM_H__
  #define        __XFS_EXTFREE_ITEM_H__
  
+/* kernel only EFI/EFD definitions */
+
  struct xfs_mount;
  struct kmem_zone;
  
-typedef struct xfs_extent {
-       xfs_dfsbno_t    ext_start;
-       xfs_extlen_t    ext_len;
-} xfs_extent_t;
-
-/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
- */
-
-typedef struct xfs_extent_32 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
-
-typedef struct xfs_extent_64 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-       __uint32_t      ext_pad;
-} xfs_extent_64_t;
-
-/*
- * This is the structure used to lay out an efi log item in the
- * log.  The efi_extents field is a variable size array whose
- * size is given by efi_nextents.
- */
-typedef struct xfs_efi_log_format {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_t            efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_t;
-
-typedef struct xfs_efi_log_format_32 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
-
-typedef struct xfs_efi_log_format_64 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
-
-/*
- * This is the structure used to lay out an efd log item in the
- * log.  The efd_extents array is a variable size array whose
- * size is given by efd_nextents;
- */
-typedef struct xfs_efd_log_format {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_t            efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_t;
-
-typedef struct xfs_efd_log_format_32 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
-
-typedef struct xfs_efd_log_format_64 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
-
-
-#ifdef __KERNEL__
-
  /*
   * Max number of extents in fast allocation path.
   */
@@ -160,6 +78,4 @@ int                  xfs_efi_copy_format(xfs_log_iovec_t *buf,
                                             xfs_efi_log_format_t *dst_efi_fmt);
  void                   xfs_efi_item_free(xfs_efi_log_item_t *);
  
-#endif /* __KERNEL__ */
-
  #endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index de3dc98f4e8f76067c1e7d0ee4631a8638d87988..4c749ab543d0de17646993a282f925ebd0314ccf 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,10 +28,11 @@
  #include "xfs_inode.h"
  #include "xfs_inode_item.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_error.h"
-#include "xfs_vnodeops.h"
  #include "xfs_da_btree.h"
  #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_dir2_priv.h"
  #include "xfs_ioctl.h"
  #include "xfs_trace.h"
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c

index 5170306a1009e22e287c425b1968d97b7a885a82..ce78e654d37b73693aa4c637e021dda9154ad5d6 100644 (file)
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -16,18 +16,18 @@
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include "xfs.h"
+#include "xfs_log.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_inum.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_ag.h"
-#include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_mount.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_alloc.h"
-#include "xfs_utils.h"
  #include "xfs_mru_cache.h"
  #include "xfs_filestream.h"
  #include "xfs_trace.h"
@@ -668,8 +668,8 @@ exit:
   */
  int
  xfs_filestream_new_ag(
-       xfs_bmalloca_t  *ap,
-       xfs_agnumber_t  *agp)
+       struct xfs_bmalloca     *ap,
+       xfs_agnumber_t          *agp)
  {
         int             flags, err;
         xfs_inode_t     *ip, *pip = NULL;
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h

index 09dd9af454349b57fe2a8a7bec0f3c5c3c7e4e3e..6d61dbee8564b12cca254721bf78685975799a74 100644 (file)
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -18,8 +18,6 @@
  #ifndef __XFS_FILESTREAM_H__
  #define __XFS_FILESTREAM_H__
  
-#ifdef __KERNEL__
-
  struct xfs_mount;
  struct xfs_inode;
  struct xfs_perag;
@@ -69,6 +67,4 @@ xfs_inode_is_filestream(
                 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
  }
  
-#endif /* __KERNEL__ */
-
  #endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h

new file mode 100644 (file)

index 0000000..35c08ff
--- /dev/null
+++ b/fs/xfs/xfs_format.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for 
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, which log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
+#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
+#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
+
+#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
+#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
+#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
+#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
+       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define        XFS_SUMPTR(mp,bp,so)    \
+       ((xfs_suminfo_t *)((bp)->b_addr + \
+               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
+#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
+#define        XFS_BITTOWORD(mp,bi)    \
+       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
+#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
+
+#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
+#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
+
+#if XFS_BIG_BLKNOS
+#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
+#else
+#define        XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
+#endif
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
+#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
+       __u8            d_version;      /* dquot version */
+       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
+       __be32          d_id;           /* user,project,group id */
+       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
+       __be64          d_blk_softlimit;/* preferred limit on disk blks */
+       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
+       __be64          d_ino_softlimit;/* preferred inode limit */
+       __be64          d_bcount;       /* disk blocks owned by the user */
+       __be64          d_icount;       /* inodes owned by the user */
+       __be32          d_itimer;       /* zero if within inode limits if not,
+                                          this is when we refuse service */
+       __be32          d_btimer;       /* similar to above; for disk blocks */
+       __be16          d_iwarns;       /* warnings issued wrt num inodes */
+       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
+       __be32          d_pad0;         /* 64 bit align */
+       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
+       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
+       __be64          d_rtbcount;     /* realtime blocks owned */
+       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
+       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
+       __be16          d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
+       char              dd_fill[4];   /* filling for posterity */
+
+       /*
+        * These two are only present on filesystems with the CRC bits set.
+        */
+       __be32            dd_crc;       /* checksum */
+       __be64            dd_lsn;       /* last modification in log */
+       uuid_t            dd_uuid;      /* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
+
+struct xfs_dsymlink_hdr {
+       __be32  sl_magic;
+       __be32  sl_offset;
+       __be32  sl_bytes;
+       __be32  sl_crc;
+       uuid_t  sl_uuid;
+       __be64  sl_owner;
+       __be64  sl_blkno;
+       __be64  sl_lsn;
+};
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
+       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                       sizeof(struct xfs_dsymlink_hdr) : 0))
+
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+                                struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h

index d04695545397308a6596f5109a72f8923fd79419..1edb5cc3e5f495fdca3059054d1f2e7bd5d9fd58 100644 (file)
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -240,7 +240,9 @@ typedef struct xfs_fsop_resblks {
  
  
  /*
- * Minimum and maximum sizes need for growth checks
+ * Minimum and maximum sizes need for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
   */
  #define XFS_MIN_AG_BLOCKS      64
  #define XFS_MIN_LOG_BLOCKS     512ULL
@@ -310,6 +312,17 @@ typedef struct xfs_bstat {
         __u16           bs_aextents;    /* attribute number of extents  */
  } xfs_bstat_t;
  
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was choosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline __uint32_t
+bstat_get_projid(struct xfs_bstat *bs)
+{
+       return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
  /*
   * The user-level BulkStat Request interface structure.
   */
@@ -344,7 +357,7 @@ typedef struct xfs_error_injection {
   * Speculative preallocation trimming.
   */
  #define XFS_EOFBLOCKS_VERSION          1
-struct xfs_eofblocks {
+struct xfs_fs_eofblocks {
         __u32           eof_version;
         __u32           eof_flags;
         uid_t           eof_uid;
@@ -449,6 +462,21 @@ typedef struct xfs_handle {
                                  - (char *) &(handle))                    \
                                  + (handle).ha_fid.fid_len)
  
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+       __int64_t       sx_version;     /* version */
+#define XFS_SX_VERSION         0
+       __int64_t       sx_fdtarget;    /* fd of target file */
+       __int64_t       sx_fdtmp;       /* fd of tmp file */
+       xfs_off_t       sx_offset;      /* offset into file */
+       xfs_off_t       sx_length;      /* leng from offset */
+       char            sx_pad[16];     /* pad space, unused */
+       xfs_bstat_t     sx_stat;        /* stat of target b4 copy */
+} xfs_swapext_t;
+
  /*
   * Flags for going down operation
   */
@@ -511,8 +539,14 @@ typedef struct xfs_handle {
  #define XFS_IOC_ERROR_INJECTION             _IOW ('X', 116, struct xfs_error_injection)
  #define XFS_IOC_ERROR_CLEARALL      _IOW ('X', 117, struct xfs_error_injection)
  /*     XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118      */
+
  /*     XFS_IOC_FREEZE            -- FIFREEZE   119      */
  /*     XFS_IOC_THAW              -- FITHAW     120      */
+#ifndef FIFREEZE
+#define XFS_IOC_FREEZE              _IOWR('X', 119, int)
+#define XFS_IOC_THAW                _IOWR('X', 120, int)
+#endif
+
  #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
  #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
  #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c

index 614eb0cc360860214ce08443ff84993afa531143..e64ee5288b86be2d0c0267b383d3f6f9297e60a1 100644 (file)
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -203,8 +203,9 @@ xfs_growfs_data_private(
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
         tp->t_flags |= XFS_TRANS_RESERVE;
-       if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
-                       XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+                                 XFS_GROWFS_SPACE_RES(mp), 0);
+       if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
         }
@@ -739,8 +740,7 @@ xfs_fs_log_dummy(
         int             error;
  
         tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c

index 7a0c17d7ec0974354cfd645695e9f4f7778704fa..ccf2fb1439629fae273a239625f97f6879192878 100644 (file)
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -39,6 +39,7 @@
  #include "xfs_cksum.h"
  #include "xfs_buf_item.h"
  #include "xfs_icreate_item.h"
+#include "xfs_icache.h"
  
  
  /*
@@ -506,7 +507,7 @@ xfs_ialloc_next_ag(
  
  /*
   * Select an allocation group to look for a free inode in, based on the parent
- * inode and then mode.  Return the allocation group buffer.
+ * inode and the mode.  Return the allocation group buffer.
   */
  STATIC xfs_agnumber_t
  xfs_ialloc_ag_select(
@@ -728,7 +729,7 @@ xfs_dialloc_ag(
                 error = xfs_inobt_get_rec(cur, &rec, &j);
                 if (error)
                         goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
  
                 if (rec.ir_freecount > 0) {
                         /*
@@ -1341,7 +1342,7 @@ xfs_imap(
         xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
         int             error;  /* error code */
         int             offset; /* index of inode in its buffer */
-       int             offset_agbno;   /* blks from chunk start to inode */
+       xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
  
         ASSERT(ino != NULLFSINO);
  
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c

index 3f90e1ceb8d68c4655bb033da592f495b91e9772..16219b9c67909a6a483678fb761db701a8cb00e6 100644 (file)
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_types.h"
  #include "xfs_log.h"
  #include "xfs_log_priv.h"
@@ -31,12 +32,12 @@
  #include "xfs_dinode.h"
  #include "xfs_error.h"
  #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
  #include "xfs_inode_item.h"
  #include "xfs_quota.h"
  #include "xfs_trace.h"
  #include "xfs_fsops.h"
  #include "xfs_icache.h"
+#include "xfs_bmap_util.h"
  
  #include <linux/kthread.h>
  #include <linux/freezer.h>
@@ -619,7 +620,7 @@ restart:
  
  /*
   * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
   */
  STATIC void
  xfs_queue_eofblocks(
@@ -1203,15 +1204,15 @@ xfs_inode_match_id(
         struct xfs_inode        *ip,
         struct xfs_eofblocks    *eofb)
  {
-       if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
-           ip->i_d.di_uid != eofb->eof_uid)
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+           !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
                 return 0;
  
-       if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
-           ip->i_d.di_gid != eofb->eof_gid)
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+           !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
                 return 0;
  
-       if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
             xfs_get_projid(ip) != eofb->eof_prid)
                 return 0;
  
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h

index a01afbb3909a465a6e94f23bdc819530a3f22140..8a89f7d791bd9df3184fd9f66a15467f0f30f8dc 100644 (file)
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -21,9 +21,24 @@
  struct xfs_mount;
  struct xfs_perag;
  
+struct xfs_eofblocks {
+       __u32           eof_flags;
+       kuid_t          eof_uid;
+       kgid_t          eof_gid;
+       prid_t          eof_prid;
+       __u64           eof_min_file_size;
+};
+
  #define SYNC_WAIT              0x0001  /* wait for i/o to complete */
  #define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
  
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE                0x1
+#define XFS_IGET_UNTRUSTED     0x2
+#define XFS_IGET_DONTCACHE     0x4
+
  int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
              uint flags, uint lock_flags, xfs_inode_t **ipp);
  
@@ -49,4 +64,39 @@ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
                 int flags, void *args),
         int flags, void *args, int tag);
  
+static inline int
+xfs_fs_eofblocks_from_user(
+       struct xfs_fs_eofblocks         *src,
+       struct xfs_eofblocks            *dst)
+{
+       if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+               return EINVAL;
+
+       if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+               return EINVAL;
+
+       if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+           memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+               return EINVAL;
+
+       dst->eof_flags = src->eof_flags;
+       dst->eof_prid = src->eof_prid;
+       dst->eof_min_file_size = src->eof_min_file_size;
+
+       dst->eof_uid = INVALID_UID;
+       if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+               dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+               if (!uid_valid(dst->eof_uid))
+                       return EINVAL;
+       }
+
+       dst->eof_gid = INVALID_GID;
+       if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+               dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+               if (!gid_valid(dst->eof_gid))
+                       return EINVAL;
+       }
+       return 0;
+}
+
  #endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c

index 7716a4e7375e296e926ef402f8687169dcc295c3..5a5a593994d4196d3b18c2e959df90dc28c7588f 100644 (file)
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,23 +20,11 @@
  #include "xfs_types.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
-#include "xfs_inum.h"
  #include "xfs_trans.h"
-#include "xfs_buf_item.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
  #include "xfs_trans_priv.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
  #include "xfs_error.h"
  #include "xfs_icreate_item.h"
  
@@ -52,11 +40,14 @@ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
   *
   * We only need one iovec for the icreate log structure.
   */
-STATIC uint
+STATIC void
  xfs_icreate_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
-       return 1;
+       *nvecs += 1;
+       *nbytes += sizeof(struct xfs_icreate_log);
  }
  
  /*
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h

index 88ba8aa0bc41c0f3aa291da6b621dfea7445f56b..59e89f87c09b3fb8483326a30a9fcb879a4c4665 100644 (file)
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -18,24 +18,6 @@
  #ifndef XFS_ICREATE_ITEM_H
  #define XFS_ICREATE_ITEM_H     1
  
-/*
- * on disk log item structure
- *
- * Log recovery assumes the first two entries are the type and size and they fit
- * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
- * decoding can be done correctly.
- */
-struct xfs_icreate_log {
-       __uint16_t      icl_type;       /* type of log format structure */
-       __uint16_t      icl_size;       /* size of log format structure */
-       __be32          icl_ag;         /* ag being allocated in */
-       __be32          icl_agbno;      /* start block of inode range */
-       __be32          icl_count;      /* number of inodes to initialise */
-       __be32          icl_isize;      /* size of inodes */
-       __be32          icl_length;     /* length of extent to initialise */
-       __be32          icl_gen;        /* inode generation number to use */
-};
-
  /* in memory log item structure */
  struct xfs_icreate_item {
         struct xfs_log_item     ic_item;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index bb262c25c8de463276e9282d64b299c975eceb7b..e3d75385aa76a6e45b7711a65bb39c268c9f689b 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,18 +19,23 @@
  
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
  #include "xfs_trans.h"
+#include "xfs_trans_space.h"
  #include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_ialloc_btree.h"
  #include "xfs_attr_sf.h"
+#include "xfs_attr.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_buf_item.h"
@@ -39,16 +44,15 @@
  #include "xfs_alloc.h"
  #include "xfs_ialloc.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_error.h"
-#include "xfs_utils.h"
  #include "xfs_quota.h"
  #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
  #include "xfs_cksum.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
+#include "xfs_symlink.h"
  
-kmem_zone_t *xfs_ifork_zone;
  kmem_zone_t *xfs_inode_zone;
  
  /*
@@ -58,9 +62,6 @@ kmem_zone_t *xfs_inode_zone;
  #define        XFS_ITRUNC_MAX_EXTENTS  2
  
  STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  
  /*
   * helper function to extract extent size hint from inode
@@ -310,623 +311,202 @@ xfs_isilocked(
  }
  #endif
  
-void
-__xfs_iflock(
-       struct xfs_inode        *ip)
-{
-       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-       do {
-               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-               if (xfs_isiflocked(ip))
-                       io_schedule();
-       } while (!xfs_iflock_nowait(ip));
-
-       finish_wait(wq, &wait.wait);
-}
-
  #ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
  /*
- * Make sure that the extents in the given memory buffer
- * are valid.
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
   */
-STATIC void
-xfs_validate_extents(
-       xfs_ifork_t             *ifp,
-       int                     nrecs,
-       xfs_exntfmt_t           fmt)
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
  {
-       xfs_bmbt_irec_t         irec;
-       xfs_bmbt_rec_host_t     rec;
-       int                     i;
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
  
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               rec.l0 = get_unaligned(&ep->l0);
-               rec.l1 = get_unaligned(&ep->l1);
-               xfs_bmbt_get_all(&rec, &irec);
-               if (fmt == XFS_EXTFMT_NOSTATE)
-                       ASSERT(irec.br_state == XFS_EXT_NORM);
-       }
+       return lock_mode;
  }
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
  
  /*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
   */
-#if defined(DEBUG)
  void
-xfs_inobp_check(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp)
-{
-       int             i;
-       int             j;
-       xfs_dinode_t    *dip;
-
-       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
-       for (i = 0; i < j; i++) {
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       i * mp->m_sb.sb_inodesize);
-               if (!dip->di_next_unlinked)  {
-                       xfs_alert(mp,
-       "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
-                               bp);
-                       ASSERT(dip->di_next_unlinked);
-               }
-       }
-}
-#endif
-
-static void
-xfs_inode_buf_verify(
-       struct xfs_buf  *bp)
+xfs_lock_inodes(
+       xfs_inode_t     **ips,
+       int             inodes,
+       uint            lock_mode)
  {
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       int             i;
-       int             ni;
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        */
-       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
-
-               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       xfs_buf_ioerror(bp, EFSCORRUPTED);
-                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-                                            mp, dip);
-#ifdef DEBUG
-                       xfs_emerg(mp,
-                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
-                               (unsigned long long)bp->b_bn, i,
-                               be16_to_cpu(dip->di_magic));
-                       ASSERT(0);
-#endif
-               }
-       }
-       xfs_inobp_check(mp, bp);
-}
+       int             attempts = 0, i, j, try_lock;
+       xfs_log_item_t  *lp;
  
+       ASSERT(ips && (inodes >= 2)); /* we need at least two */
  
-static void
-xfs_inode_buf_read_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp);
-}
-
-static void
-xfs_inode_buf_write_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp);
-}
+       try_lock = 0;
+       i = 0;
  
-const struct xfs_buf_ops xfs_inode_buf_ops = {
-       .verify_read = xfs_inode_buf_read_verify,
-       .verify_write = xfs_inode_buf_write_verify,
-};
+again:
+       for (; i < inodes; i++) {
+               ASSERT(ips[i]);
  
+               if (i && (ips[i] == ips[i-1]))  /* Already locked */
+                       continue;
  
-/*
- * This routine is called to map an inode to the buffer containing the on-disk
- * version of the inode.  It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
- */
-int
-xfs_imap_to_bp(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_imap         *imap,
-       struct xfs_dinode       **dipp,
-       struct xfs_buf          **bpp,
-       uint                    buf_flags,
-       uint                    iget_flags)
-{
-       struct xfs_buf          *bp;
-       int                     error;
+               /*
+                * If try_lock is not set yet, make sure all locked inodes
+                * are not in the AIL.
+                * If any are, set try_lock to be used later.
+                */
  
-       buf_flags |= XBF_UNMAPPED;
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, buf_flags, &bp,
-                                  &xfs_inode_buf_ops);
-       if (error) {
-               if (error == EAGAIN) {
-                       ASSERT(buf_flags & XBF_TRYLOCK);
-                       return error;
+               if (!try_lock) {
+                       for (j = (i - 1); j >= 0 && !try_lock; j--) {
+                               lp = (xfs_log_item_t *)ips[j]->i_itemp;
+                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+                                       try_lock++;
+                               }
+                       }
                 }
  
-               if (error == EFSCORRUPTED &&
-                   (iget_flags & XFS_IGET_UNTRUSTED))
-                       return XFS_ERROR(EINVAL);
-
-               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
-                       __func__, error);
-               return error;
-       }
-
-       *bpp = bp;
-       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
-       return 0;
-}
-
-/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode.  For fifos, devs, and sockets
- * this means set if_rdev to the proper value.  For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers.  For a file in B-tree format, only the root is immediately
- * brought in-core.  The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
- */
-STATIC int
-xfs_iformat(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip)
-{
-       xfs_attr_shortform_t    *atp;
-       int                     size;
-       int                     error = 0;
-       xfs_fsize_t             di_size;
-
-       if (unlikely(be32_to_cpu(dip->di_nextents) +
-                    be16_to_cpu(dip->di_anextents) >
-                    be64_to_cpu(dip->di_nblocks))) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
-                       (unsigned long long)ip->i_ino,
-                       (int)(be32_to_cpu(dip->di_nextents) +
-                             be16_to_cpu(dip->di_anextents)),
-                       (unsigned long long)
-                               be64_to_cpu(dip->di_nblocks));
-               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
-               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
-                       (unsigned long long)ip->i_ino,
-                       dip->di_forkoff);
-               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
-                    !ip->i_mount->m_rtdev_targp)) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, has realtime flag set.",
-                       ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
-                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFIFO:
-       case S_IFCHR:
-       case S_IFBLK:
-       case S_IFSOCK:
-               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
-                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
-                                             ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               ip->i_d.di_size = 0;
-               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
-               break;
+               /*
+                * If any of the previous locks we have locked is in the AIL,
+                * we must TRY to get the second and subsequent locks. If
+                * we can't get any, we must release all we have
+                * and try again.
+                */
  
-       case S_IFREG:
-       case S_IFLNK:
-       case S_IFDIR:
-               switch (dip->di_format) {
-               case XFS_DINODE_FMT_LOCAL:
+               if (try_lock) {
+                       /* try_lock must be 0 if i is 0. */
                         /*
-                        * no local regular files yet
+                        * try_lock means we have an inode locked
+                        * that is in the AIL.
                          */
-                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (local format for regular file).",
-                                       (unsigned long long) ip->i_ino);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
+                       ASSERT(i != 0);
+                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+                               attempts++;
+
+                               /*
+                                * Unlock all previous guys and try again.
+                                * xfs_iunlock will try to push the tail
+                                * if the inode is in the AIL.
+                                */
+
+                               for(j = i - 1; j >= 0; j--) {
+
+                                       /*
+                                        * Check to see if we've already
+                                        * unlocked this one.
+                                        * Not the first one going back,
+                                        * and the inode ptr is the same.
+                                        */
+                                       if ((j != (i - 1)) && ips[j] ==
+                                                               ips[j+1])
+                                               continue;
+
+                                       xfs_iunlock(ips[j], lock_mode);
+                               }
  
-                       di_size = be64_to_cpu(dip->di_size);
-                       if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (bad size %Ld for local inode).",
-                                       (unsigned long long) ip->i_ino,
-                                       (long long) di_size);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
+                               if ((attempts % 5) == 0) {
+                                       delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+                                       xfs_lock_delays++;
+#endif
+                               }
+                               i = 0;
+                               try_lock = 0;
+                               goto again;
                         }
-
-                       size = (int)di_size;
-                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
-                       break;
-               case XFS_DINODE_FMT_EXTENTS:
-                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
-                       break;
-               case XFS_DINODE_FMT_BTREE:
-                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
-                       break;
-               default:
-                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
-                                        ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
+               } else {
+                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                 }
-               break;
-
-       default:
-               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       if (error) {
-               return error;
         }
-       if (!XFS_DFORK_Q(dip))
-               return 0;
-
-       ASSERT(ip->i_afp == NULL);
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-
-       switch (dip->di_aformat) {
-       case XFS_DINODE_FMT_LOCAL:
-               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
-               size = be16_to_cpu(atp->hdr.totsize);
-
-               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
-                       xfs_warn(ip->i_mount,
-                               "corrupt inode %Lu (bad attr fork size %Ld).",
-                               (unsigned long long) ip->i_ino,
-                               (long long) size);
-                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
-                                            XFS_ERRLEVEL_LOW,
-                                            ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
  
-               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
-               break;
-       case XFS_DINODE_FMT_EXTENTS:
-               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
-               break;
-       default:
-               error = XFS_ERROR(EFSCORRUPTED);
-               break;
-       }
-       if (error) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+#ifdef DEBUG
+       if (attempts) {
+               if (attempts < 5) xfs_small_retries++;
+               else if (attempts < 100) xfs_middle_retries++;
+               else xfs_lots_retries++;
+       } else {
+               xfs_locked_n++;
         }
-       return error;
+#endif
  }
  
  /*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there.  Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
   */
-STATIC int
-xfs_iformat_local(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork,
-       int             size)
+void
+xfs_lock_two_inodes(
+       xfs_inode_t             *ip0,
+       xfs_inode_t             *ip1,
+       uint                    lock_mode)
  {
-       xfs_ifork_t     *ifp;
-       int             real_size;
-
-       /*
-        * If the size is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount,
-       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
-                       (unsigned long long) ip->i_ino, size,
-                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
-               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       real_size = 0;
-       if (size == 0)
-               ifp->if_u1.if_data = NULL;
-       else if (size <= sizeof(ifp->if_u2.if_inline_data))
-               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-       else {
-               real_size = roundup(size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-       }
-       ifp->if_bytes = size;
-       ifp->if_real_bytes = real_size;
-       if (size)
-               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFINLINE;
-       return 0;
-}
+       xfs_inode_t             *temp;
+       int                     attempts = 0;
+       xfs_log_item_t          *lp;
  
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it.  Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork)
-{
-       xfs_bmbt_rec_t  *dp;
-       xfs_ifork_t     *ifp;
-       int             nex;
-       int             size;
-       int             i;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
-       size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
-       /*
-        * If the number of extents is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
-                       (unsigned long long) ip->i_ino, nex);
-               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_real_bytes = 0;
-       if (nex == 0)
-               ifp->if_u1.if_extents = NULL;
-       else if (nex <= XFS_INLINE_EXTS)
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       else
-               xfs_iext_add(ifp, 0, nex);
-
-       ifp->if_bytes = size;
-       if (size) {
-               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
-               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
-               for (i = 0; i < nex; i++, dp++) {
-                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-                       ep->l0 = get_unaligned_be64(&dp->l0);
-                       ep->l1 = get_unaligned_be64(&dp->l1);
-               }
-               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
-               if (whichfork != XFS_DATA_FORK ||
-                       XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
-                               if (unlikely(xfs_check_nostate_extents(
-                                   ifp, 0, nex))) {
-                                       XFS_ERROR_REPORT("xfs_iformat_extents(2)",
-                                                        XFS_ERRLEVEL_LOW,
-                                                        ip->i_mount);
-                                       return XFS_ERROR(EFSCORRUPTED);
-                               }
-       }
-       ifp->if_flags |= XFS_IFEXTENTS;
-       return 0;
-}
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+       ASSERT(ip0->i_ino != ip1->i_ino);
  
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it.  The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_bmdr_block_t        *dfp;
-       xfs_ifork_t             *ifp;
-       /* REFERENCED */
-       int                     nrecs;
-       int                     size;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
-       nrecs = be16_to_cpu(dfp->bb_numrecs);
-
-       /*
-        * blow out if -- fork has less extents than can fit in
-        * fork (fork shouldn't be a btree format), root btree
-        * block has more records than can fit into the fork,
-        * or the number of extents is greater than the number of
-        * blocks.
-        */
-       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
-                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
-                    XFS_BMDR_SPACE_CALC(nrecs) >
-                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
-                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-               xfs_warn(mp, "corrupt inode %Lu (btree).",
-                                       (unsigned long long) ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-                                        mp, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_broot_bytes = size;
-       ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
-       ASSERT(ifp->if_broot != NULL);
-       /*
-        * Copy and convert from the on-disk structure
-        * to the in-memory structure.
-        */
-       xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-                        ifp->if_broot, size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFBROOT;
+       if (ip0->i_ino > ip1->i_ino) {
+               temp = ip0;
+               ip0 = ip1;
+               ip1 = temp;
+       }
  
-       return 0;
-}
+ again:
+       xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
  
-STATIC void
-xfs_dinode_from_disk(
-       xfs_icdinode_t          *to,
-       xfs_dinode_t            *from)
-{
-       to->di_magic = be16_to_cpu(from->di_magic);
-       to->di_mode = be16_to_cpu(from->di_mode);
-       to->di_version = from ->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = be16_to_cpu(from->di_onlink);
-       to->di_uid = be32_to_cpu(from->di_uid);
-       to->di_gid = be32_to_cpu(from->di_gid);
-       to->di_nlink = be32_to_cpu(from->di_nlink);
-       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
-       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_flushiter = be16_to_cpu(from->di_flushiter);
-       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
-       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
-       to->di_size = be64_to_cpu(from->di_size);
-       to->di_nblocks = be64_to_cpu(from->di_nblocks);
-       to->di_extsize = be32_to_cpu(from->di_extsize);
-       to->di_nextents = be32_to_cpu(from->di_nextents);
-       to->di_anextents = be16_to_cpu(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat  = from->di_aformat;
-       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
-       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
-       to->di_flags    = be16_to_cpu(from->di_flags);
-       to->di_gen      = be32_to_cpu(from->di_gen);
-
-       if (to->di_version == 3) {
-               to->di_changecount = be64_to_cpu(from->di_changecount);
-               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
-               to->di_flags2 = be64_to_cpu(from->di_flags2);
-               to->di_ino = be64_to_cpu(from->di_ino);
-               to->di_lsn = be64_to_cpu(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
+       /*
+        * If the first lock we have locked is in the AIL, we must TRY to get
+        * the second lock. If we can't get it, we must release the first one
+        * and try again.
+        */
+       lp = (xfs_log_item_t *)ip0->i_itemp;
+       if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+               if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+                       xfs_iunlock(ip0, lock_mode);
+                       if ((++attempts % 5) == 0)
+                               delay(1); /* Don't just spin the CPU */
+                       goto again;
+               }
+       } else {
+               xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
         }
  }
  
+
  void
-xfs_dinode_to_disk(
-       xfs_dinode_t            *to,
-       xfs_icdinode_t          *from)
+__xfs_iflock(
+       struct xfs_inode        *ip)
  {
-       to->di_magic = cpu_to_be16(from->di_magic);
-       to->di_mode = cpu_to_be16(from->di_mode);
-       to->di_version = from ->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = cpu_to_be16(from->di_onlink);
-       to->di_uid = cpu_to_be32(from->di_uid);
-       to->di_gid = cpu_to_be32(from->di_gid);
-       to->di_nlink = cpu_to_be32(from->di_nlink);
-       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
-       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
-       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-       to->di_size = cpu_to_be64(from->di_size);
-       to->di_nblocks = cpu_to_be64(from->di_nblocks);
-       to->di_extsize = cpu_to_be32(from->di_extsize);
-       to->di_nextents = cpu_to_be32(from->di_nextents);
-       to->di_anextents = cpu_to_be16(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat = from->di_aformat;
-       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
-       to->di_dmstate = cpu_to_be16(from->di_dmstate);
-       to->di_flags = cpu_to_be16(from->di_flags);
-       to->di_gen = cpu_to_be32(from->di_gen);
-
-       if (from->di_version == 3) {
-               to->di_changecount = cpu_to_be64(from->di_changecount);
-               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
-               to->di_flags2 = cpu_to_be64(from->di_flags2);
-               to->di_ino = cpu_to_be64(from->di_ino);
-               to->di_lsn = cpu_to_be64(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
-               to->di_flushiter = 0;
-       } else {
-               to->di_flushiter = cpu_to_be16(from->di_flushiter);
-       }
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
  }
  
  STATIC uint
@@ -987,234 +567,49 @@ xfs_dic2xflags(
                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
  }
  
-static bool
-xfs_dinode_verify(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *ip,
-       struct xfs_dinode       *dip)
-{
-       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
-               return false;
-
-       /* only version 3 or greater inodes are extensively verified here */
-       if (dip->di_version < 3)
-               return true;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             offsetof(struct xfs_dinode, di_crc)))
-               return false;
-       if (be64_to_cpu(dip->di_ino) != ip->i_ino)
-               return false;
-       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       return true;
-}
-
-void
-xfs_dinode_calc_crc(
-       struct xfs_mount        *mp,
-       struct xfs_dinode       *dip)
-{
-       __uint32_t              crc;
-
-       if (dip->di_version < 3)
-               return;
-
-       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             offsetof(struct xfs_dinode, di_crc));
-       dip->di_crc = xfs_end_cksum(crc);
-}
-
  /*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
   */
  int
-xfs_iread(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       uint            iget_flags)
+xfs_lookup(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       xfs_inode_t             **ipp,
+       struct xfs_name         *ci_name)
  {
-       xfs_buf_t       *bp;
-       xfs_dinode_t    *dip;
-       int             error;
-
-       /*
-        * Fill in the location information in the in-core inode.
-        */
-       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
-       if (error)
-               return error;
-
-       /* shortcut IO on inode allocation if possible */
-       if ((iget_flags & XFS_IGET_CREATE) &&
-           xfs_sb_version_hascrc(&mp->m_sb) &&
-           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-               /* initialise the on-disk inode core */
-               memset(&ip->i_d, 0, sizeof(ip->i_d));
-               ip->i_d.di_magic = XFS_DINODE_MAGIC;
-               ip->i_d.di_gen = prandom_u32();
-               if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                       ip->i_d.di_version = 3;
-                       ip->i_d.di_ino = ip->i_ino;
-                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
-               } else
-                       ip->i_d.di_version = 2;
-               return 0;
-       }
-
-       /*
-        * Get pointers to the on-disk inode and the buffer containing it.
-        */
-       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
-       if (error)
-               return error;
-
-       /* even unallocated inodes are verified */
-       if (!xfs_dinode_verify(mp, ip, dip)) {
-               xfs_alert(mp, "%s: validation failed for inode %lld failed",
-                               __func__, ip->i_ino);
+       xfs_ino_t               inum;
+       int                     error;
+       uint                    lock_mode;
  
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-               error = XFS_ERROR(EFSCORRUPTED);
-               goto out_brelse;
-       }
+       trace_xfs_lookup(dp, name);
  
-       /*
-        * If the on-disk inode is already linked to a directory
-        * entry, copy all of the inode into the in-core inode.
-        * xfs_iformat() handles copying in the inode format
-        * specific information.
-        * Otherwise, just get the truly permanent information.
-        */
-       if (dip->di_mode) {
-               xfs_dinode_from_disk(&ip->i_d, dip);
-               error = xfs_iformat(ip, dip);
-               if (error)  {
-#ifdef DEBUG
-                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
-                               __func__, error);
-#endif /* DEBUG */
-                       goto out_brelse;
-               }
-       } else {
-               /*
-                * Partial initialisation of the in-core inode. Just the bits
-                * that xfs_ialloc won't overwrite or relies on being correct.
-                */
-               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
-               ip->i_d.di_version = dip->di_version;
-               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
-               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
-               if (dip->di_version == 3) {
-                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
-                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
-               }
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return XFS_ERROR(EIO);
  
-               /*
-                * Make sure to pull in the mode here as well in
-                * case the inode is released without being used.
-                * This ensures that xfs_inactive() will see that
-                * the inode is already free and not try to mess
-                * with the uninitialized part of it.
-                */
-               ip->i_d.di_mode = 0;
-       }
+       lock_mode = xfs_ilock_map_shared(dp);
+       error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+       xfs_iunlock_map_shared(dp, lock_mode);
  
-       /*
-        * The inode format changed when we moved the link count and
-        * made it 32 bits long.  If this is an old format inode,
-        * convert it in memory to look like a new one.  If it gets
-        * flushed to disk we will convert back before flushing or
-        * logging it.  We zero out the new projid field and the old link
-        * count field.  We'll handle clearing the pad field (the remains
-        * of the old uuid field) when we actually convert the inode to
-        * the new format. We don't change the version number so that we
-        * can distinguish this from a real new format inode.
-        */
-       if (ip->i_d.di_version == 1) {
-               ip->i_d.di_nlink = ip->i_d.di_onlink;
-               ip->i_d.di_onlink = 0;
-               xfs_set_projid(ip, 0);
-       }
+       if (error)
+               goto out;
  
-       ip->i_delayed_blks = 0;
+       error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+       if (error)
+               goto out_free_name;
  
-       /*
-        * Mark the buffer containing the inode as something to keep
-        * around for a while.  This helps to keep recently accessed
-        * meta-data in-core longer.
-        */
-       xfs_buf_set_ref(bp, XFS_INO_REF);
+       return 0;
  
-       /*
-        * Use xfs_trans_brelse() to release the buffer containing the on-disk
-        * inode, because it was acquired with xfs_trans_read_buf() in
-        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
-        * brelse().  If we're within a transaction, then xfs_trans_brelse()
-        * will only release the buffer if it is not dirty within the
-        * transaction.  It will be OK to release the buffer in this case,
-        * because inodes on disk are never destroyed and we will be locking the
-        * new in-core inode before putting it in the cache where other
-        * processes can find it.  Thus we don't have to worry about the inode
-        * being changed just because we released the buffer.
-        */
- out_brelse:
-       xfs_trans_brelse(tp, bp);
+out_free_name:
+       if (ci_name)
+               kmem_free(ci_name->name);
+out:
+       *ipp = NULL;
         return error;
  }
  
-/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       int             error;
-       xfs_ifork_t     *ifp;
-       xfs_extnum_t    nextents;
-
-       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
-                                ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       /*
-        * We know that the size is valid (it's checked in iformat_btree)
-        */
-       ifp->if_bytes = ifp->if_real_bytes = 0;
-       ifp->if_flags |= XFS_IFEXTENTS;
-       xfs_iext_add(ifp, 0, nextents);
-       error = xfs_bmap_read_extents(tp, ip, whichfork);
-       if (error) {
-               xfs_iext_destroy(ifp);
-               ifp->if_flags &= ~XFS_IFEXTENTS;
-               return error;
-       }
-       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
-       return 0;
-}
-
  /*
   * Allocate an inode on disk and return a copy of its in-core version.
   * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
@@ -1295,8 +690,8 @@ xfs_ialloc(
         ip->i_d.di_onlink = 0;
         ip->i_d.di_nlink = nlink;
         ASSERT(ip->i_d.di_nlink == nlink);
-       ip->i_d.di_uid = current_fsuid();
-       ip->i_d.di_gid = current_fsgid();
+       ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+       ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
         xfs_set_projid(ip, prid);
         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
  
@@ -1335,7 +730,7 @@ xfs_ialloc(
          */
         if ((irix_sgid_inherit) &&
             (ip->i_d.di_mode & S_ISGID) &&
-           (!in_group_p((gid_t)ip->i_d.di_gid))) {
+           (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
                 ip->i_d.di_mode &= ~S_ISGID;
         }
  
@@ -1467,31 +862,608 @@ xfs_ialloc(
  }
  
  /*
- * Free up the underlying blocks past new_size.  The new size must be smaller
- * than the current size.  This routine can be used both for the attribute and
- * data fork, and does not modify the inode size, which is left to the caller.
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
   *
- * The transaction passed to this routine must have made a permanent log
- * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
- * given transaction and start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.  Some transaction will be
- * returned to the caller to be committed.  The incoming transaction must
- * already include the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction.  On return the inode
- * will be "held" within the returned transaction.  This routine does NOT
- * require any disk space to be reserved for it within the transaction.
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
   *
- * If we get an error, we must return with the inode locked and linked into the
- * current transaction. This keeps things simple for the higher level code,
- * because it always knows that the inode is locked and held in the transaction
- * that returns to it whether errors occur or not.  We don't mark the inode
- * dirty on error so that transactions can be easily aborted if possible.
   */
  int
-xfs_itruncate_extents(
-       struct xfs_trans        **tpp,
-       struct xfs_inode        *ip,
-       int                     whichfork,
+xfs_dir_ialloc(
+       xfs_trans_t     **tpp,          /* input: current transaction;
+                                          output: may be a new transaction. */
+       xfs_inode_t     *dp,            /* directory within whose allocate
+                                          the inode. */
+       umode_t         mode,
+       xfs_nlink_t     nlink,
+       xfs_dev_t       rdev,
+       prid_t          prid,           /* project id */
+       int             okalloc,        /* ok to allocate new space */
+       xfs_inode_t     **ipp,          /* pointer to inode; it will be
+                                          locked. */
+       int             *committed)
+
+{
+       xfs_trans_t     *tp;
+       xfs_trans_t     *ntp;
+       xfs_inode_t     *ip;
+       xfs_buf_t       *ialloc_context = NULL;
+       int             code;
+       void            *dqinfo;
+       uint            tflags;
+
+       tp = *tpp;
+       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+       /*
+        * xfs_ialloc will return a pointer to an incore inode if
+        * the Space Manager has an available inode on the free
+        * list. Otherwise, it will do an allocation and replenish
+        * the freelist.  Since we can only do one allocation per
+        * transaction without deadlocks, we will need to commit the
+        * current transaction and start a new one.  We will then
+        * need to call xfs_ialloc again to get the inode.
+        *
+        * If xfs_ialloc did an allocation to replenish the freelist,
+        * it returns the bp containing the head of the freelist as
+        * ialloc_context. We will hold a lock on it across the
+        * transaction commit so that no other process can steal
+        * the inode(s) that we've just allocated.
+        */
+       code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+                         &ialloc_context, &ip);
+
+       /*
+        * Return an error if we were unable to allocate a new inode.
+        * This should only happen if we run out of space on disk or
+        * encounter a disk error.
+        */
+       if (code) {
+               *ipp = NULL;
+               return code;
+       }
+       if (!ialloc_context && !ip) {
+               *ipp = NULL;
+               return XFS_ERROR(ENOSPC);
+       }
+
+       /*
+        * If the AGI buffer is non-NULL, then we were unable to get an
+        * inode in one operation.  We need to commit the current
+        * transaction and call xfs_ialloc() again.  It is guaranteed
+        * to succeed the second time.
+        */
+       if (ialloc_context) {
+               struct xfs_trans_res tres;
+
+               /*
+                * Normally, xfs_trans_commit releases all the locks.
+                * We call bhold to hang on to the ialloc_context across
+                * the commit.  Holding this buffer prevents any other
+                * processes from doing any allocations in this
+                * allocation group.
+                */
+               xfs_trans_bhold(tp, ialloc_context);
+               /*
+                * Save the log reservation so we can use
+                * them in the next transaction.
+                */
+               tres.tr_logres = xfs_trans_get_log_res(tp);
+               tres.tr_logcount = xfs_trans_get_log_count(tp);
+
+               /*
+                * We want the quota changes to be associated with the next
+                * transaction, NOT this one. So, detach the dqinfo from this
+                * and attach it to the next transaction.
+                */
+               dqinfo = NULL;
+               tflags = 0;
+               if (tp->t_dqinfo) {
+                       dqinfo = (void *)tp->t_dqinfo;
+                       tp->t_dqinfo = NULL;
+                       tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+                       tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+               }
+
+               ntp = xfs_trans_dup(tp);
+               code = xfs_trans_commit(tp, 0);
+               tp = ntp;
+               if (committed != NULL) {
+                       *committed = 1;
+               }
+               /*
+                * If we get an error during the commit processing,
+                * release the buffer that is still held and return
+                * to the caller.
+                */
+               if (code) {
+                       xfs_buf_relse(ialloc_context);
+                       if (dqinfo) {
+                               tp->t_dqinfo = dqinfo;
+                               xfs_trans_free_dqinfo(tp);
+                       }
+                       *tpp = ntp;
+                       *ipp = NULL;
+                       return code;
+               }
+
+               /*
+                * transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(tp->t_ticket);
+               tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+               code = xfs_trans_reserve(tp, &tres, 0, 0);
+
+               /*
+                * Re-attach the quota info that we detached from prev trx.
+                */
+               if (dqinfo) {
+                       tp->t_dqinfo = dqinfo;
+                       tp->t_flags |= tflags;
+               }
+
+               if (code) {
+                       xfs_buf_relse(ialloc_context);
+                       *tpp = ntp;
+                       *ipp = NULL;
+                       return code;
+               }
+               xfs_trans_bjoin(tp, ialloc_context);
+
+               /*
+                * Call ialloc again. Since we've locked out all
+                * other allocations in this allocation group,
+                * this call should always succeed.
+                */
+               code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+                                 okalloc, &ialloc_context, &ip);
+
+               /*
+                * If we get an error at this point, return to the caller
+                * so that the current transaction can be aborted.
+                */
+               if (code) {
+                       *tpp = tp;
+                       *ipp = NULL;
+                       return code;
+               }
+               ASSERT(!ialloc_context && ip);
+
+       } else {
+               if (committed != NULL)
+                       *committed = 0;
+       }
+
+       *ipp = ip;
+       *tpp = tp;
+
+       return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int                            /* error */
+xfs_droplink(
+       xfs_trans_t *tp,
+       xfs_inode_t *ip)
+{
+       int     error;
+
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+       ASSERT (ip->i_d.di_nlink > 0);
+       ip->i_d.di_nlink--;
+       drop_nlink(VFS_I(ip));
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       error = 0;
+       if (ip->i_d.di_nlink == 0) {
+               /*
+                * We're dropping the last link to this file.
+                * Move the on-disk inode to the AGI unlinked list.
+                * From xfs_inactive() we will pull the inode from
+                * the list and free it.
+                */
+               error = xfs_iunlink(tp, ip);
+       }
+       return error;
+}
+
+/*
+ * This gets called when the inode's version needs to be changed from 1 to 2.
+ * Currently this happens when the nlink field overflows the old 16-bit value
+ * or when chproj is called to change the project for the first time.
+ * As a side effect the superblock version will also get rev'd
+ * to contain the NLINK bit.
+ */
+void
+xfs_bump_ino_vers2(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(ip->i_d.di_version == 1);
+
+       ip->i_d.di_version = 2;
+       ip->i_d.di_onlink = 0;
+       memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+       mp = tp->t_mountp;
+       if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+               spin_lock(&mp->m_sb_lock);
+               if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+                       xfs_sb_version_addnlink(&mp->m_sb);
+                       spin_unlock(&mp->m_sb_lock);
+                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+               } else {
+                       spin_unlock(&mp->m_sb_lock);
+               }
+       }
+       /* Caller must log the inode */
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+       xfs_trans_t *tp,
+       xfs_inode_t *ip)
+{
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+       ASSERT(ip->i_d.di_nlink > 0);
+       ip->i_d.di_nlink++;
+       inc_nlink(VFS_I(ip));
+       if ((ip->i_d.di_version == 1) &&
+           (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
+               /*
+                * The inode has increased its number of links beyond
+                * what can fit in an old format inode.  It now needs
+                * to be converted to a version 2 inode with a 32 bit
+                * link count.  If this is the first inode in the file
+                * system to do this, then we need to bump the superblock
+                * version number as well.
+                */
+               xfs_bump_ino_vers2(tp, ip);
+       }
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       return 0;
+}
+
+int
+xfs_create(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       umode_t                 mode,
+       xfs_dev_t               rdev,
+       xfs_inode_t             **ipp)
+{
+       int                     is_dir = S_ISDIR(mode);
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_inode        *ip = NULL;
+       struct xfs_trans        *tp = NULL;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       bool                    unlock_dp_on_error = false;
+       uint                    cancel_flags;
+       int                     committed;
+       prid_t                  prid;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
+       struct xfs_dquot        *pdqp = NULL;
+       struct xfs_trans_res    tres;
+       uint                    resblks;
+
+       trace_xfs_create(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+               prid = xfs_get_projid(dp);
+       else
+               prid = XFS_PROJID_DEFAULT;
+
+       /*
+        * Make sure that we have allocated dquot(s) on disk.
+        */
+       error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+                                       xfs_kgid_to_gid(current_fsgid()), prid,
+                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                                       &udqp, &gdqp, &pdqp);
+       if (error)
+               return error;
+
+       if (is_dir) {
+               rdev = 0;
+               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+               tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
+               tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+       } else {
+               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+               tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
+               tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+       }
+
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+       /*
+        * Initially assume that the file does not exist and
+        * reserve the resources for that case.  If that is not
+        * the case we'll drop the one we have and get a more
+        * appropriate transaction later.
+        */
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(tp, &tres, resblks, 0);
+       if (error == ENOSPC) {
+               /* flush outstanding delalloc blocks and retry */
+               xfs_flush_inodes(mp);
+               error = xfs_trans_reserve(tp, &tres, resblks, 0);
+       }
+       if (error == ENOSPC) {
+               /* No space at all so try a "no-allocation" reservation */
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &tres, 0, 0);
+       }
+       if (error) {
+               cancel_flags = 0;
+               goto out_trans_cancel;
+       }
+
+       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       unlock_dp_on_error = true;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       /*
+        * Reserve disk quota and the inode.
+        */
+       error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+                                               pdqp, resblks, 1, 0);
+       if (error)
+               goto out_trans_cancel;
+
+       error = xfs_dir_canenter(tp, dp, name, resblks);
+       if (error)
+               goto out_trans_cancel;
+
+       /*
+        * A newly created regular or special file just has one directory
+        * entry pointing to them, but a directory also the "." entry
+        * pointing to itself.
+        */
+       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+                              prid, resblks > 0, &ip, &committed);
+       if (error) {
+               if (error == ENOSPC)
+                       goto out_trans_cancel;
+               goto out_trans_abort;
+       }
+
+       /*
+        * Now we join the directory inode to the transaction.  We do not do it
+        * earlier because xfs_dir_ialloc might commit the previous transaction
+        * (and release all the locks).  An error from here on will result in
+        * the transaction cancel unlocking dp so don't do it explicitly in the
+        * error path.
+        */
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       unlock_dp_on_error = false;
+
+       error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks ?
+                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+       if (error) {
+               ASSERT(error != ENOSPC);
+               goto out_trans_abort;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+       if (is_dir) {
+               error = xfs_dir_init(tp, ip, dp);
+               if (error)
+                       goto out_bmap_cancel;
+
+               error = xfs_bumplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
+       }
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * create transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       /*
+        * Attach the dquot(s) to the inodes and modify them incore.
+        * These ids of the inode couldn't have changed since the new
+        * inode has been locked ever since it was created.
+        */
+       xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto out_release_inode;
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       *ipp = ip;
+       return 0;
+
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+       cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+       /*
+        * Wait until after the current transaction is aborted to
+        * release the inode.  This prevents recursive transactions
+        * and deadlocks from xfs_inactive.
+        */
+       if (ip)
+               IRELE(ip);
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       if (unlock_dp_on_error)
+               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return error;
+}
+
+int
+xfs_link(
+       xfs_inode_t             *tdp,
+       xfs_inode_t             *sip,
+       struct xfs_name         *target_name)
+{
+       xfs_mount_t             *mp = tdp->i_mount;
+       xfs_trans_t             *tp;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;
+       int                     committed;
+       int                     resblks;
+
+       trace_xfs_link(tdp, target_name);
+
+       ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(sip, 0);
+       if (error)
+               goto std_return;
+
+       error = xfs_qm_dqattach(tdp, 0);
+       if (error)
+               goto std_return;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+       if (error == ENOSPC) {
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+       }
+       if (error) {
+               cancel_flags = 0;
+               goto error_return;
+       }
+
+       xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+       /*
+        * If we are using project inheritance, we only allow hard link
+        * creation in our tree when the project IDs are the same; else
+        * the tree quota mechanism could be circumvented.
+        */
+       if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
+       error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+       if (error)
+               goto error_return;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error)
+               goto abort_return;
+       xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+       error = xfs_bumplink(tp, sip);
+       if (error)
+               goto abort_return;
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * link transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+               xfs_trans_set_sync(tp);
+       }
+
+       error = xfs_bmap_finish (&tp, &free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(&free_list);
+               goto abort_return;
+       }
+
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+       cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
+}
+
+/*
+ * Free up the underlying blocks past new_size.  The new size must be smaller
+ * than the current size.  This routine can be used both for the attribute and
+ * data fork, and does not modify the inode size, which is left to the caller.
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
+ * returned to the caller to be committed.  The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On return the inode
+ * will be "held" within the returned transaction.  This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_extents(
+       struct xfs_trans        **tpp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
         xfs_fsize_t             new_size)
  {
         struct xfs_mount        *mp = ip->i_mount;
@@ -1572,37 +1544,299 @@ xfs_itruncate_extents(
                         goto out;
  
                 /*
-                * Transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
+                * Transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(tp->t_ticket);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               if (error)
+                       goto out;
+       }
+
+       /*
+        * Always re-log the inode so that our permanent transaction can keep
+        * on rolling it forward in the log.
+        */
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       trace_xfs_itruncate_extents_end(ip, new_size);
+
+out:
+       *tpp = tp;
+       return error;
+out_bmap_cancel:
+       /*
+        * If the bunmapi call encounters an error, return to the caller where
+        * the transaction can be properly aborted.  We just need to make sure
+        * we're not holding any resources that we were not when we came in.
+        */
+       xfs_bmap_cancel(&free_list);
+       goto out;
+}
+
+int
+xfs_release(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       int             error;
+
+       if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+               return 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return 0;
+
+       if (!XFS_FORCED_SHUTDOWN(mp)) {
+               int truncated;
+
+               /*
+                * If we are using filestreams, and we have an unlinked
+                * file that we are processing the last close on, then nothing
+                * will be able to reopen and write to this file. Purge this
+                * inode from the filestreams cache so that it doesn't delay
+                * teardown of the inode.
+                */
+               if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
+                       xfs_filestream_deassociate(ip);
+
+               /*
+                * If we previously truncated this file and removed old data
+                * in the process, we want to initiate "early" writeout on
+                * the last close.  This is an attempt to combat the notorious
+                * NULL files problem which is particularly noticeable from a
+                * truncate down, buffered (re-)write (delalloc), followed by
+                * a crash.  What we are effectively doing here is
+                * significantly reducing the time window where we'd otherwise
+                * be exposed to that problem.
+                */
+               truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+               if (truncated) {
+                       xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+                               error = -filemap_flush(VFS_I(ip)->i_mapping);
+                               if (error)
+                                       return error;
+                       }
+               }
+       }
+
+       if (ip->i_d.di_nlink == 0)
+               return 0;
+
+       if (xfs_can_free_eofblocks(ip, false)) {
+
+               /*
+                * If we can't get the iolock just skip truncating the blocks
+                * past EOF because we could deadlock with the mmap_sem
+                * otherwise.  We'll get another chance to drop them once the
+                * last reference to the inode is dropped, so we'll never leak
+                * blocks permanently.
+                *
+                * Further, check if the inode is being opened, written and
+                * closed frequently and we have delayed allocation blocks
+                * outstanding (e.g. streaming writes from the NFS server),
+                * truncating the blocks past EOF will cause fragmentation to
+                * occur.
+                *
+                * In this case don't do the truncation, either, but we have to
+                * be careful how we detect this case. Blocks beyond EOF show
+                * up as i_delayed_blks even when the inode is clean, so we
+                * need to truncate them away first before checking for a dirty
+                * release. Hence on the first dirty close we will still remove
+                * the speculative allocation, but after that we will leave it
+                * in place.
+                */
+               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                       return 0;
+
+               error = xfs_free_eofblocks(mp, ip, true);
+               if (error && error != EAGAIN)
+                       return error;
+
+               /* delalloc blocks after truncation means it really is dirty */
+               if (ip->i_delayed_blks)
+                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+       }
+       return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero.  If the file has been unlinked, then it must
+ * now be truncated.  Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+int
+xfs_inactive(
+       xfs_inode_t     *ip)
+{
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     committed;
+       struct xfs_trans        *tp;
+       struct xfs_mount        *mp;
+       struct xfs_trans_res    *resp;
+       int                     error;
+       int                     truncate = 0;
+
+       /*
+        * If the inode is already free, then there can be nothing
+        * to clean up here.
+        */
+       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+               ASSERT(ip->i_df.if_real_bytes == 0);
+               ASSERT(ip->i_df.if_broot_bytes == 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       mp = ip->i_mount;
+
+       error = 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               goto out;
+
+       if (ip->i_d.di_nlink != 0) {
+               /*
+                * force is true because we are evicting an inode from the
+                * cache. Post-eof blocks must be freed, lest we end up with
+                * broken free space accounting.
+                */
+               if (xfs_can_free_eofblocks(ip, true)) {
+                       error = xfs_free_eofblocks(mp, ip, false);
+                       if (error)
+                               return VN_INACTIVE_CACHE;
+               }
+               goto out;
+       }
+
+       if (S_ISREG(ip->i_d.di_mode) &&
+           (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+            ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+               truncate = 1;
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return VN_INACTIVE_CACHE;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+       resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
+               &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
+
+       error = xfs_trans_reserve(tp, resp, 0, 0);
+       if (error) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               xfs_trans_cancel(tp, 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       if (S_ISLNK(ip->i_d.di_mode)) {
+               error = xfs_inactive_symlink(ip, &tp);
+               if (error)
+                       goto out_cancel;
+       } else if (truncate) {
+               ip->i_d.di_size = 0;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+               if (error)
+                       goto out_cancel;
+
+               ASSERT(ip->i_d.di_nextents == 0);
+       }
+
+       /*
+        * If there are attributes associated with the file then blow them away
+        * now.  The code calls a routine that recursively deconstructs the
+        * attribute fork.  We need to just commit the current transaction
+        * because we can't use it for xfs_attr_inactive().
+        */
+       if (ip->i_d.di_anextents > 0) {
+               ASSERT(ip->i_d.di_forkoff != 0);
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       goto out_unlock;
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               error = xfs_attr_inactive(ip);
+               if (error)
+                       goto out;
+
+               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       goto out;
+               }
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+       }
+
+       if (ip->i_afp)
+               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+       ASSERT(ip->i_d.di_anextents == 0);
+
+       /*
+        * Free the inode.
+        */
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_ifree(tp, ip, &free_list);
+       if (error) {
+               /*
+                * If we fail to free the inode, shut down.  The cancel
+                * might do that, we need to make sure.  Otherwise the
+                * inode might be lost for a long time or forever.
+                */
+               if (!XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_notice(mp, "%s: xfs_ifree returned error %d",
+                               __func__, error);
+                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               }
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       } else {
+               /*
+                * Credit the quota account(s). The inode is gone.
+                */
+               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+               /*
+                * Just ignore errors at this point.  There is nothing we can
+                * do except to try to keep going. Make sure it's not a silent
+                * error.
                  */
-               xfs_log_ticket_put(tp->t_ticket);
-               error = xfs_trans_reserve(tp, 0,
-                                       XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                       XFS_TRANS_PERM_LOG_RES,
-                                       XFS_ITRUNCATE_LOG_COUNT);
+               error = xfs_bmap_finish(&tp,  &free_list, &committed);
                 if (error)
-                       goto out;
+                       xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+                               __func__, error);
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+                               __func__, error);
         }
  
         /*
-        * Always re-log the inode so that our permanent transaction can keep
-        * on rolling it forward in the log.
+        * Release the dquots held by inode, if any.
          */
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       trace_xfs_itruncate_extents_end(ip, new_size);
-
+       xfs_qm_dqdetach(ip);
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
  out:
-       *tpp = tp;
-       return error;
-out_bmap_cancel:
-       /*
-        * If the bunmapi call encounters an error, return to the caller where
-        * the transaction can be properly aborted.  We just need to make sure
-        * we're not holding any resources that we were not when we came in.
-        */
-       xfs_bmap_cancel(&free_list);
-       goto out;
+       return VN_INACTIVE_CACHE;
+out_cancel:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       goto out_unlock;
  }
  
  /*
@@ -1861,7 +2095,7 @@ xfs_iunlink_remove(
  }
  
  /*
- * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
   * inodes that are in memory - they all must be marked stale and attached to
   * the cluster buffer.
   */
@@ -2093,272 +2327,6 @@ xfs_ifree(
         return error;
  }
  
-/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff.  Move the records
- * and pointers in if_broot to fit the new size.  When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller.  When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root.  If the if_broot is currently NULL, then
- * if we adding records one will be allocated.  The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- *      requested for the if_broot array.
- */
-void
-xfs_iroot_realloc(
-       xfs_inode_t             *ip,
-       int                     rec_diff,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     cur_max;
-       xfs_ifork_t             *ifp;
-       struct xfs_btree_block  *new_broot;
-       int                     new_max;
-       size_t                  new_size;
-       char                    *np;
-       char                    *op;
-
-       /*
-        * Handle the degenerate case quietly.
-        */
-       if (rec_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (rec_diff > 0) {
-               /*
-                * If there wasn't any memory allocated before, just
-                * allocate it now and get out.
-                */
-               if (ifp->if_broot_bytes == 0) {
-                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-                       ifp->if_broot_bytes = (int)new_size;
-                       return;
-               }
-
-               /*
-                * If there is already an existing if_broot, then we need
-                * to realloc() it and shift the pointers to their new
-                * location.  The records don't change location because
-                * they are kept butted up against the btree block header.
-                */
-               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-               new_max = cur_max + rec_diff;
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
-                               KM_SLEEP | KM_NOFS);
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    (int)new_size);
-               ifp->if_broot_bytes = (int)new_size;
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-               memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
-               return;
-       }
-
-       /*
-        * rec_diff is less than 0.  In this case, we are shrinking the
-        * if_broot buffer.  It must already exist.  If we go to zero
-        * records, just get rid of the root and clear the status bit.
-        */
-       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-       new_max = cur_max + rec_diff;
-       ASSERT(new_max >= 0);
-       if (new_max > 0)
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-       else
-               new_size = 0;
-       if (new_size > 0) {
-               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-               /*
-                * First copy over the btree block header.
-                */
-               memcpy(new_broot, ifp->if_broot,
-                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
-       } else {
-               new_broot = NULL;
-               ifp->if_flags &= ~XFS_IFBROOT;
-       }
-
-       /*
-        * Only copy the records and pointers if there are any.
-        */
-       if (new_max > 0) {
-               /*
-                * First copy the records.
-                */
-               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
-
-               /*
-                * Then copy the pointers.
-                */
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
-                                                    (int)new_size);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
-       }
-       kmem_free(ifp->if_broot);
-       ifp->if_broot = new_broot;
-       ifp->if_broot_bytes = (int)new_size;
-       if (ifp->if_broot)
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-       return;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased.  The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer.  Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- *      requested for the if_data array.
- */
-void
-xfs_idata_realloc(
-       xfs_inode_t     *ip,
-       int             byte_diff,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-       int             new_size;
-       int             real_size;
-
-       if (byte_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       new_size = (int)ifp->if_bytes + byte_diff;
-       ASSERT(new_size >= 0);
-
-       if (new_size == 0) {
-               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       kmem_free(ifp->if_u1.if_data);
-               }
-               ifp->if_u1.if_data = NULL;
-               real_size = 0;
-       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
-               /*
-                * If the valid extents/data can fit in if_inline_ext/data,
-                * copy them from the malloc'd vector and free it.
-                */
-               if (ifp->if_u1.if_data == NULL) {
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
-                             new_size);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               }
-               real_size = 0;
-       } else {
-               /*
-                * Stuck with malloc/realloc.
-                * For inline data, the underlying buffer must be
-                * a multiple of 4 bytes in size so that it can be
-                * logged and stay on word boundaries.  We enforce
-                * that here.
-                */
-               real_size = roundup(new_size, 4);
-               if (ifp->if_u1.if_data == NULL) {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       /*
-                        * Only do the realloc if the underlying size
-                        * is really changing.
-                        */
-                       if (ifp->if_real_bytes != real_size) {
-                               ifp->if_u1.if_data =
-                                       kmem_realloc(ifp->if_u1.if_data,
-                                                       real_size,
-                                                       ifp->if_real_bytes,
-                                                       KM_SLEEP | KM_NOFS);
-                       }
-               } else {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
-                               ifp->if_bytes);
-               }
-       }
-       ifp->if_real_bytes = real_size;
-       ifp->if_bytes = new_size;
-       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
-void
-xfs_idestroy_fork(
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (ifp->if_broot != NULL) {
-               kmem_free(ifp->if_broot);
-               ifp->if_broot = NULL;
-       }
-
-       /*
-        * If the format is local, then we can't have an extents
-        * array so just look for an inline data array.  If we're
-        * not local then we may or may not have an extents list,
-        * so check and free it up if we do.
-        */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
-                   (ifp->if_u1.if_data != NULL)) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = NULL;
-                       ifp->if_real_bytes = 0;
-               }
-       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
-                  ((ifp->if_flags & XFS_IFEXTIREC) ||
-                   ((ifp->if_u1.if_extents != NULL) &&
-                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
-               ASSERT(ifp->if_real_bytes != 0);
-               xfs_iext_destroy(ifp);
-       }
-       ASSERT(ifp->if_u1.if_extents == NULL ||
-              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
-       ASSERT(ifp->if_real_bytes == 0);
-       if (whichfork == XFS_ATTR_FORK) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-       }
-}
-
  /*
   * This is called to unpin an inode.  The caller must have the inode locked
   * in at least shared mode so that the buffer cannot be subsequently pinned
@@ -2402,162 +2370,471 @@ xfs_iunpin_wait(
                 __xfs_iunpin_wait(ip);
  }
  
-/*
- * xfs_iextents_copy()
- *
- * This is called to copy the REAL extents (as opposed to the delayed
- * allocation extents) from the inode into the given buffer.  It
- * returns the number of bytes copied into the buffer.
- *
- * If there are no delayed allocation extents, then we can just
- * memcpy() the extents into the buffer.  Otherwise, we need to
- * examine each extent in turn and skip those which are delayed.
- */
  int
-xfs_iextents_copy(
-       xfs_inode_t             *ip,
-       xfs_bmbt_rec_t          *dp,
-       int                     whichfork)
+xfs_remove(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       xfs_inode_t             *ip)
  {
-       int                     copied;
-       int                     i;
-       xfs_ifork_t             *ifp;
-       int                     nrecs;
-       xfs_fsblock_t           start_block;
+       xfs_mount_t             *mp = dp->i_mount;
+       xfs_trans_t             *tp = NULL;
+       int                     is_dir = S_ISDIR(ip->i_d.di_mode);
+       int                     error = 0;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;
+       int                     committed;
+       int                     link_zero;
+       uint                    resblks;
+       uint                    log_count;
  
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(ifp->if_bytes > 0);
+       trace_xfs_remove(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(dp, 0);
+       if (error)
+               goto std_return;
  
-       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
-       ASSERT(nrecs > 0);
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               goto std_return;
+
+       if (is_dir) {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+               log_count = XFS_DEFAULT_LOG_COUNT;
+       } else {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+               log_count = XFS_REMOVE_LOG_COUNT;
+       }
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
  
         /*
-        * There are some delayed allocation extents in the
-        * inode, so copy the extents one at a time and skip
-        * the delayed ones.  There must be at least one
-        * non-delayed extent.
+        * We try to get the real space reservation first,
+        * allowing for directory btree deletion(s) implying
+        * possible bmap insert(s).  If we can't get the space
+        * reservation then we use 0 instead, and avoid the bmap
+        * btree insert(s) in the directory code by, if the bmap
+        * insert tries to happen, instead trimming the LAST
+        * block from the directory.
          */
-       copied = 0;
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               start_block = xfs_bmbt_get_startblock(ep);
-               if (isnullstartblock(start_block)) {
-                       /*
-                        * It's a delayed allocation extent, so skip it.
-                        */
-                       continue;
+       resblks = XFS_REMOVE_SPACE_RES(mp);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+       if (error == ENOSPC) {
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+       }
+       if (error) {
+               ASSERT(error != ENOSPC);
+               cancel_flags = 0;
+               goto out_trans_cancel;
+       }
+
+       xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we're removing a directory perform some additional validation.
+        */
+       if (is_dir) {
+               ASSERT(ip->i_d.di_nlink >= 2);
+               if (ip->i_d.di_nlink != 2) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
                 }
+               if (!xfs_dir_isempty(ip)) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
+               }
+       }
+
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error) {
+               ASSERT(error != ENOENT);
+               goto out_bmap_cancel;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+       if (is_dir) {
+               /*
+                * Drop the link from ip's "..".
+                */
+               error = xfs_droplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
  
-               /* Translate to on disk format */
-               put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
-               put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
-               dp++;
-               copied++;
+               /*
+                * Drop the "." link from ip to self.
+                */
+               error = xfs_droplink(tp, ip);
+               if (error)
+                       goto out_bmap_cancel;
+       } else {
+               /*
+                * When removing a non-directory we need to log the parent
+                * inode here.  For a directory this is done implicitly
+                * by the xfs_droplink call for the ".." entry.
+                */
+               xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
         }
-       ASSERT(copied != 0);
-       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
  
-       return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+       /*
+        * Drop the link from dp to ip.
+        */
+       error = xfs_droplink(tp, ip);
+       if (error)
+               goto out_bmap_cancel;
+
+       /*
+        * Determine if this is the last link while
+        * we are in the transaction.
+        */
+       link_zero = (ip->i_d.di_nlink == 0);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * remove transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto std_return;
+
+       /*
+        * If we are using filestreams, kill the stream association.
+        * If the file is still open it may get a new one but that
+        * will get killed on last close in xfs_close() so we don't
+        * have to worry about that.
+        */
+       if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+               xfs_filestream_deassociate(ip);
+
+       return 0;
+
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+       cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
  }
  
  /*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
+ * Enter all inodes for a rename transaction into a sorted array.
   */
-/*ARGSUSED*/
  STATIC void
-xfs_iflush_fork(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       xfs_inode_log_item_t    *iip,
-       int                     whichfork,
-       xfs_buf_t               *bp)
-{
-       char                    *cp;
-       xfs_ifork_t             *ifp;
-       xfs_mount_t             *mp;
-       static const short      brootflag[2] =
-               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
-       static const short      dataflag[2] =
-               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
-       static const short      extflag[2] =
-               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
-       if (!iip)
-               return;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       /*
-        * This can happen if we gave up in iformat in an error path,
-        * for the attribute fork.
-        */
-       if (!ifp) {
-               ASSERT(whichfork == XFS_ATTR_FORK);
-               return;
-       }
-       cp = XFS_DFORK_PTR(dip, whichfork);
-       mp = ip->i_mount;
-       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
-       case XFS_DINODE_FMT_LOCAL:
-               if ((iip->ili_fields & dataflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(ifp->if_u1.if_data != NULL);
-                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+xfs_sort_for_rename(
+       xfs_inode_t     *dp1,   /* in: old (source) directory inode */
+       xfs_inode_t     *dp2,   /* in: new (target) directory inode */
+       xfs_inode_t     *ip1,   /* in: inode of old entry */
+       xfs_inode_t     *ip2,   /* in: inode of new entry, if it
+                                  already exists, NULL otherwise. */
+       xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
+       int             *num_inodes)  /* out: number of inodes in array */
+{
+       xfs_inode_t             *temp;
+       int                     i, j;
+
+       /*
+        * i_tab contains a list of pointers to inodes.  We initialize
+        * the table here & we'll sort it.  We will then use it to
+        * order the acquisition of the inode locks.
+        *
+        * Note that the table may contain duplicates.  e.g., dp1 == dp2.
+        */
+       i_tab[0] = dp1;
+       i_tab[1] = dp2;
+       i_tab[2] = ip1;
+       if (ip2) {
+               *num_inodes = 4;
+               i_tab[3] = ip2;
+       } else {
+               *num_inodes = 3;
+               i_tab[3] = NULL;
+       }
+
+       /*
+        * Sort the elements via bubble sort.  (Remember, there are at
+        * most 4 elements to sort, so this is adequate.)
+        */
+       for (i = 0; i < *num_inodes; i++) {
+               for (j = 1; j < *num_inodes; j++) {
+                       if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+                               temp = i_tab[j];
+                               i_tab[j] = i_tab[j-1];
+                               i_tab[j-1] = temp;
+                       }
                 }
-               break;
+       }
+}
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+       xfs_inode_t     *src_dp,
+       struct xfs_name *src_name,
+       xfs_inode_t     *src_ip,
+       xfs_inode_t     *target_dp,
+       struct xfs_name *target_name,
+       xfs_inode_t     *target_ip)
+{
+       xfs_trans_t     *tp = NULL;
+       xfs_mount_t     *mp = src_dp->i_mount;
+       int             new_parent;             /* moving to a new dir */
+       int             src_is_directory;       /* src_name is a directory */
+       int             error;
+       xfs_bmap_free_t free_list;
+       xfs_fsblock_t   first_block;
+       int             cancel_flags;
+       int             committed;
+       xfs_inode_t     *inodes[4];
+       int             spaceres;
+       int             num_inodes;
+
+       trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+       new_parent = (src_dp != target_dp);
+       src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+
+       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+                               inodes, &num_inodes);
+
+       xfs_bmap_init(&free_list, &first_block);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+       if (error == ENOSPC) {
+               spaceres = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+       }
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               goto std_return;
+       }
+
+       /*
+        * Attach the dquots to the inodes
+        */
+       error = xfs_qm_vop_rename_dqattach(inodes);
+       if (error) {
+               xfs_trans_cancel(tp, cancel_flags);
+               goto std_return;
+       }
+
+       /*
+        * Lock all the participating inodes. Depending upon whether
+        * the target_name exists in the target directory, and
+        * whether the target directory is the same as the source
+        * directory, we can lock from 2 to 4 inodes.
+        */
+       xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+       /*
+        * Join all the inodes to the transaction. From this point on,
+        * we can rely on either trans_commit or trans_cancel to unlock
+        * them.
+        */
+       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+       if (new_parent)
+               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+       if (target_ip)
+               xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we are using project inheritance, we only allow renames
+        * into our tree when the project IDs are the same; else the
+        * tree quota mechanism would be circumvented.
+        */
+       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
+       /*
+        * Set up the target.
+        */
+       if (target_ip == NULL) {
+               /*
+                * If there's no space reservation, check the entry will
+                * fit before actually inserting it.
+                */
+               error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+               if (error)
+                       goto error_return;
+               /*
+                * If target does not exist and the rename crosses
+                * directories, adjust the target directory link count
+                * to account for the ".." reference from the new entry.
+                */
+               error = xfs_dir_createname(tp, target_dp, target_name,
+                                               src_ip->i_ino, &first_block,
+                                               &free_list, spaceres);
+               if (error == ENOSPC)
+                       goto error_return;
+               if (error)
+                       goto abort_return;
+
+               xfs_trans_ichgtime(tp, target_dp,
+                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
  
-       case XFS_DINODE_FMT_EXTENTS:
-               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-                      !(iip->ili_fields & extflag[whichfork]));
-               if ((iip->ili_fields & extflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(xfs_iext_get_ext(ifp, 0));
-                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
-                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
-                               whichfork);
+               if (new_parent && src_is_directory) {
+                       error = xfs_bumplink(tp, target_dp);
+                       if (error)
+                               goto abort_return;
                 }
-               break;
+       } else { /* target_ip != NULL */
+               /*
+                * If target exists and it's a directory, check that both
+                * target and source are directories and that target can be
+                * destroyed, or that neither is a directory.
+                */
+               if (S_ISDIR(target_ip->i_d.di_mode)) {
+                       /*
+                        * Make sure target dir is empty.
+                        */
+                       if (!(xfs_dir_isempty(target_ip)) ||
+                           (target_ip->i_d.di_nlink > 2)) {
+                               error = XFS_ERROR(EEXIST);
+                               goto error_return;
+                       }
+               }
+
+               /*
+                * Link the source inode under the target name.
+                * If the source inode is a directory and we are moving
+                * it across directories, its ".." entry will be
+                * inconsistent until we replace that down below.
+                *
+                * In case there is already an entry with the same
+                * name at the destination directory, remove it first.
+                */
+               error = xfs_dir_replace(tp, target_dp, target_name,
+                                       src_ip->i_ino,
+                                       &first_block, &free_list, spaceres);
+               if (error)
+                       goto abort_return;
+
+               xfs_trans_ichgtime(tp, target_dp,
+                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+               /*
+                * Decrement the link count on the target since the target
+                * dir no longer points to it.
+                */
+               error = xfs_droplink(tp, target_ip);
+               if (error)
+                       goto abort_return;
+
+               if (src_is_directory) {
+                       /*
+                        * Drop the link from the old "." entry.
+                        */
+                       error = xfs_droplink(tp, target_ip);
+                       if (error)
+                               goto abort_return;
+               }
+       } /* target_ip != NULL */
+
+       /*
+        * Remove the source.
+        */
+       if (new_parent && src_is_directory) {
+               /*
+                * Rewrite the ".." entry to point to the new
+                * directory.
+                */
+               error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+                                       target_dp->i_ino,
+                                       &first_block, &free_list, spaceres);
+               ASSERT(error != EEXIST);
+               if (error)
+                       goto abort_return;
+       }
+
+       /*
+        * We always want to hit the ctime on the source inode.
+        *
+        * This isn't strictly required by the standards since the source
+        * inode isn't really being changed, but old unix file systems did
+        * it and some incremental backup programs won't work without it.
+        */
+       xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+       /*
+        * Adjust the link count on src_dp.  This is necessary when
+        * renaming a directory, either within one parent when
+        * the target existed, or across two parent directories.
+        */
+       if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+               /*
+                * Decrement link count on src_directory since the
+                * entry that's moved no longer points to it.
+                */
+               error = xfs_droplink(tp, src_dp);
+               if (error)
+                       goto abort_return;
+       }
  
-       case XFS_DINODE_FMT_BTREE:
-               if ((iip->ili_fields & brootflag[whichfork]) &&
-                   (ifp->if_broot_bytes > 0)) {
-                       ASSERT(ifp->if_broot != NULL);
-                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                               XFS_IFORK_SIZE(ip, whichfork));
-                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
-                               (xfs_bmdr_block_t *)cp,
-                               XFS_DFORK_SIZE(dip, mp, whichfork));
-               }
-               break;
+       error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+                                       &first_block, &free_list, spaceres);
+       if (error)
+               goto abort_return;
  
-       case XFS_DINODE_FMT_DEV:
-               if (iip->ili_fields & XFS_ILOG_DEV) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
-               }
-               break;
+       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+       if (new_parent)
+               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
  
-       case XFS_DINODE_FMT_UUID:
-               if (iip->ili_fields & XFS_ILOG_UUID) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       memcpy(XFS_DFORK_DPTR(dip),
-                              &ip->i_df.if_u2.if_uuid,
-                              sizeof(uuid_t));
-               }
-               break;
+       /*
+        * If this is a synchronous mount, make sure that the
+        * rename transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+               xfs_trans_set_sync(tp);
+       }
  
-       default:
-               ASSERT(0);
-               break;
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(&free_list);
+               xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+                                XFS_TRANS_ABORT));
+               goto std_return;
         }
+
+       /*
+        * trans_commit will unlock src_ip, target_ip & decrement
+        * the vnode references.
+        */
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+       cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
  }
  
  STATIC int
@@ -2816,7 +3093,6 @@ abort_out:
         return error;
  }
  
-
  STATIC int
  xfs_iflush_int(
         struct xfs_inode        *ip,
@@ -3004,1072 +3280,3 @@ xfs_iflush_int(
  corrupt_out:
         return XFS_ERROR(EFSCORRUPTED);
  }
-
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx)            /* index of target extent */
-{
-       ASSERT(idx >= 0);
-       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-
-       if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
-               return ifp->if_u1.if_ext_irec->er_extbuf;
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_ext_irec_t  *erp;           /* irec pointer */
-               int             erp_idx = 0;    /* irec index */
-               xfs_extnum_t    page_idx = idx; /* ext index in target list */
-
-               erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
-               return &erp->er_extbuf[page_idx];
-       } else if (ifp->if_bytes) {
-               return &ifp->if_u1.if_extents[idx];
-       } else {
-               return NULL;
-       }
-}
-
-/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'.  'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* starting index of new items */
-       xfs_extnum_t    count,          /* number of inserted items */
-       xfs_bmbt_irec_t *new,           /* items to insert */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    i;              /* extent record index */
-
-       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       xfs_iext_add(ifp, idx, count);
-       for (i = idx; i < idx + count; i++, new++)
-               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin adding exts */
-       int             ext_diff)       /* number of extents to add */
-{
-       int             byte_diff;      /* new bytes being added */
-       int             new_size;       /* size of extents after adding */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT((idx >= 0) && (idx <= nextents));
-       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
-       new_size = ifp->if_bytes + byte_diff;
-       /*
-        * If the new number of extents (nextents + ext_diff)
-        * fits inside the inode, then continue to use the inline
-        * extent buffer.
-        */
-       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
-               if (idx < nextents) {
-                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
-                               &ifp->if_u2.if_inline_ext[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
-               }
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-               ifp->if_real_bytes = 0;
-       }
-       /*
-        * Otherwise use a linear (direct) extent list.
-        * If the extents are currently inside the inode,
-        * xfs_iext_realloc_direct will switch us from
-        * inline to direct extent allocation mode.
-        */
-       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, new_size);
-               if (idx < nextents) {
-                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
-                               &ifp->if_u1.if_extents[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
-               }
-       }
-       /* Indirection array */
-       else {
-               xfs_ext_irec_t  *erp;
-               int             erp_idx = 0;
-               int             page_idx = idx;
-
-               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
-               if (ifp->if_flags & XFS_IFEXTIREC) {
-                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
-               } else {
-                       xfs_iext_irec_init(ifp);
-                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-                       erp = ifp->if_u1.if_ext_irec;
-               }
-               /* Extents fit in target extent page */
-               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
-                       if (page_idx < erp->er_extcount) {
-                               memmove(&erp->er_extbuf[page_idx + ext_diff],
-                                       &erp->er_extbuf[page_idx],
-                                       (erp->er_extcount - page_idx) *
-                                       sizeof(xfs_bmbt_rec_t));
-                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
-                       }
-                       erp->er_extcount += ext_diff;
-                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               }
-               /* Insert a new extent page */
-               else if (erp) {
-                       xfs_iext_add_indirect_multi(ifp,
-                               erp_idx, page_idx, ext_diff);
-               }
-               /*
-                * If extent(s) are being appended to the last page in
-                * the indirection array and the new extent(s) don't fit
-                * in the page, then erp is NULL and erp_idx is set to
-                * the next index needed in the indirection array.
-                */
-               else {
-                       int     count = ext_diff;
-
-                       while (count) {
-                               erp = xfs_iext_irec_new(ifp, erp_idx);
-                               erp->er_extcount = count;
-                               count -= MIN(count, (int)XFS_LINEAR_EXTS);
-                               if (count) {
-                                       erp_idx++;
-                               }
-                       }
-               }
-       }
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- *    |-------|   |-------|
- *    |       |   |       |    idx - number of extents before idx
- *    |  idx  |   | count |
- *    |       |   |       |    count - number of extents being inserted at idx
- *    |-------|   |-------|
- *    | count |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_add_indirect_multi(
-       xfs_ifork_t     *ifp,                   /* inode fork pointer */
-       int             erp_idx,                /* target extent irec index */
-       xfs_extnum_t    idx,                    /* index within target list */
-       int             count)                  /* new extents being added */
-{
-       int             byte_diff;              /* new bytes being added */
-       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
-       xfs_extnum_t    ext_diff;               /* number of extents to add */
-       xfs_extnum_t    ext_cnt;                /* new extents still needed */
-       xfs_extnum_t    nex2;                   /* extents after idx + count */
-       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
-       int             nlists;                 /* number of irec's (lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       nex2 = erp->er_extcount - idx;
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /*
-        * Save second part of target extent list
-        * (all extents past */
-       if (nex2) {
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
-               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
-               erp->er_extcount -= nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
-               memset(&erp->er_extbuf[idx], 0, byte_diff);
-       }
-
-       /*
-        * Add the new extents to the end of the target
-        * list, then allocate new irec record(s) and
-        * extent buffer(s) as needed to store the rest
-        * of the new extents.
-        */
-       ext_cnt = count;
-       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
-       if (ext_diff) {
-               erp->er_extcount += ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-       while (ext_cnt) {
-               erp_idx++;
-               erp = xfs_iext_irec_new(ifp, erp_idx);
-               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
-               erp->er_extcount = ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-
-       /* Add nex2 extents back to indirection array */
-       if (nex2) {
-               xfs_extnum_t    ext_avail;
-               int             i;
-
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-               i = 0;
-               /*
-                * If nex2 extents fit in the current page, append
-                * nex2_ep after the new extents.
-                */
-               if (nex2 <= ext_avail) {
-                       i = erp->er_extcount;
-               }
-               /*
-                * Otherwise, check if space is available in the
-                * next page.
-                */
-               else if ((erp_idx < nlists - 1) &&
-                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
-                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
-                       erp_idx++;
-                       erp++;
-                       /* Create a hole for nex2 extents */
-                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
-                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
-               }
-               /*
-                * Final choice, create a new extent page for
-                * nex2 extents.
-                */
-               else {
-                       erp_idx++;
-                       erp = xfs_iext_irec_new(ifp, erp_idx);
-               }
-               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
-               kmem_free(nex2_ep);
-               erp->er_extcount += nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
-       }
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array.  Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff,       /* number of extents to remove */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
-       ASSERT(ext_diff > 0);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_iext_remove_indirect(ifp, idx, ext_diff);
-       } else if (ifp->if_real_bytes) {
-               xfs_iext_remove_direct(ifp, idx, ext_diff);
-       } else {
-               xfs_iext_remove_inline(ifp, idx, ext_diff);
-       }
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       int             nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       ASSERT(idx < XFS_INLINE_EXTS);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(((nextents - ext_diff) > 0) &&
-               (nextents - ext_diff) < XFS_INLINE_EXTS);
-
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u2.if_inline_ext[idx],
-                       &ifp->if_u2.if_inline_ext[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-               memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
-                       0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       } else {
-               memset(&ifp->if_u2.if_inline_ext[idx], 0,
-                       ext_diff * sizeof(xfs_bmbt_rec_t));
-       }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       new_size = ifp->if_bytes -
-               (ext_diff * sizeof(xfs_bmbt_rec_t));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-               return;
-       }
-       /* Move extents up in the list (if needed) */
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u1.if_extents[idx],
-                       &ifp->if_u1.if_extents[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-       }
-       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
-               0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       /*
-        * Reallocate the direct extent list. If the extents
-        * will fit inside the inode then xfs_iext_realloc_direct
-        * will switch from direct to inline extent allocation
-        * mode for us.
-        */
-       xfs_iext_realloc_direct(ifp, new_size);
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- *    |-------|   |-------|
- *    | nex1  |   |       |    nex1 - number of extents before idx
- *    |-------|   | count |
- *    |       |   |       |    count - number of extents being removed at idx
- *    | count |   |-------|
- *    |       |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_remove_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing extents */
-       int             count)          /* number of extents to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             erp_idx = 0;    /* indirection array index */
-       xfs_extnum_t    ext_cnt;        /* extents left to remove */
-       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
-       xfs_extnum_t    nex1;           /* number of extents before idx */
-       xfs_extnum_t    nex2;           /* extents after idx + count */
-       int             page_idx = idx; /* index in target extent list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
-       ASSERT(erp != NULL);
-       nex1 = page_idx;
-       ext_cnt = count;
-       while (ext_cnt) {
-               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
-               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
-               /*
-                * Check for deletion of entire list;
-                * xfs_iext_irec_remove() updates extent offsets.
-                */
-               if (ext_diff == erp->er_extcount) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-                       ext_cnt -= ext_diff;
-                       nex1 = 0;
-                       if (ext_cnt) {
-                               ASSERT(erp_idx < ifp->if_real_bytes /
-                                       XFS_IEXT_BUFSZ);
-                               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-                               nex1 = 0;
-                               continue;
-                       } else {
-                               break;
-                       }
-               }
-               /* Move extents up (if needed) */
-               if (nex2) {
-                       memmove(&erp->er_extbuf[nex1],
-                               &erp->er_extbuf[nex1 + ext_diff],
-                               nex2 * sizeof(xfs_bmbt_rec_t));
-               }
-               /* Zero out rest of page */
-               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
-                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
-               /* Update remaining counters */
-               erp->er_extcount -= ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
-               ext_cnt -= ext_diff;
-               nex1 = 0;
-               erp_idx++;
-               erp++;
-       }
-       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
-       xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new size of extents */
-{
-       int             rnew_size;      /* real new size of extents */
-
-       rnew_size = new_size;
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
-               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
-                (new_size != ifp->if_real_bytes)));
-
-       /* Free extent records */
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       }
-       /* Resize direct extent list and zero any new bytes */
-       else if (ifp->if_real_bytes) {
-               /* Check if extents will fit inside the inode */
-               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
-                       xfs_iext_direct_to_inline(ifp, new_size /
-                               (uint)sizeof(xfs_bmbt_rec_t));
-                       ifp->if_bytes = new_size;
-                       return;
-               }
-               if (!is_power_of_2(new_size)){
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               if (rnew_size != ifp->if_real_bytes) {
-                       ifp->if_u1.if_extents =
-                               kmem_realloc(ifp->if_u1.if_extents,
-                                               rnew_size,
-                                               ifp->if_real_bytes, KM_NOFS);
-               }
-               if (rnew_size > ifp->if_real_bytes) {
-                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
-                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
-                               rnew_size - ifp->if_real_bytes);
-               }
-       }
-       /*
-        * Switch from the inline extent buffer to a direct
-        * extent list. Be sure to include the inline extent
-        * bytes in new_size.
-        */
-       else {
-               new_size += ifp->if_bytes;
-               if (!is_power_of_2(new_size)) {
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               xfs_iext_inline_to_direct(ifp, rnew_size);
-       }
-       ifp->if_real_bytes = rnew_size;
-       ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    nextents)       /* number of extents in file */
-{
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       ASSERT(nextents <= XFS_INLINE_EXTS);
-       /*
-        * The inline buffer was zeroed when we switched
-        * from inline to direct extent allocation mode,
-        * so we don't need to clear it here.
-        */
-       memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
-               nextents * sizeof(xfs_bmbt_rec_t));
-       kmem_free(ifp->if_u1.if_extents);
-       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* number of extents in file */
-{
-       ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
-       memset(ifp->if_u1.if_extents, 0, new_size);
-       if (ifp->if_bytes) {
-               memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
-                       ifp->if_bytes);
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new indirection array size */
-{
-       int             nlists;         /* number of irec's (ex lists) */
-       int             size;           /* current indirection array size */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       size = nlists * sizeof(xfs_ext_irec_t);
-       ASSERT(ifp->if_real_bytes);
-       ASSERT((new_size >= 0) && (new_size != size));
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else {
-               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-                       kmem_realloc(ifp->if_u1.if_ext_irec,
-                               new_size, size, KM_NOFS);
-       }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
-        xfs_ifork_t    *ifp)           /* inode fork pointer */
-{
-       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             size;           /* size of file extents */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-       size = nextents * sizeof(xfs_bmbt_rec_t);
-
-       xfs_iext_irec_compact_pages(ifp);
-       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
-       ep = ifp->if_u1.if_ext_irec->er_extbuf;
-       kmem_free(ifp->if_u1.if_ext_irec);
-       ifp->if_flags &= ~XFS_IFEXTIREC;
-       ifp->if_u1.if_extents = ep;
-       ifp->if_bytes = size;
-       if (nextents < XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, size);
-       }
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               int     erp_idx;
-               int     nlists;
-
-               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-               }
-               ifp->if_flags &= ~XFS_IFEXTIREC;
-       } else if (ifp->if_real_bytes) {
-               kmem_free(ifp->if_u1.if_extents);
-       } else if (ifp->if_bytes) {
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_u1.if_extents = NULL;
-       ifp->if_real_bytes = 0;
-       ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
-xfs_iext_bno_to_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       xfs_extnum_t    *idxp)          /* index of target extent */
-{
-       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
-       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
-       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       int             high;           /* upper boundary in search */
-       xfs_extnum_t    idx = 0;        /* index of target extent */
-       int             low;            /* lower boundary in search */
-       xfs_extnum_t    nextents;       /* number of file extents */
-       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       if (nextents == 0) {
-               *idxp = 0;
-               return NULL;
-       }
-       low = 0;
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               /* Find target extent list */
-               int     erp_idx = 0;
-               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
-               base = erp->er_extbuf;
-               high = erp->er_extcount - 1;
-       } else {
-               base = ifp->if_u1.if_extents;
-               high = nextents - 1;
-       }
-       /* Binary search extent records */
-       while (low <= high) {
-               idx = (low + high) >> 1;
-               ep = base + idx;
-               startoff = xfs_bmbt_get_startoff(ep);
-               blockcount = xfs_bmbt_get_blockcount(ep);
-               if (bno < startoff) {
-                       high = idx - 1;
-               } else if (bno >= startoff + blockcount) {
-                       low = idx + 1;
-               } else {
-                       /* Convert back to file-based extent index */
-                       if (ifp->if_flags & XFS_IFEXTIREC) {
-                               idx += erp->er_extoff;
-                       }
-                       *idxp = idx;
-                       return ep;
-               }
-       }
-       /* Convert back to file-based extent index */
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               idx += erp->er_extoff;
-       }
-       if (bno >= startoff + blockcount) {
-               if (++idx == nextents) {
-                       ep = NULL;
-               } else {
-                       ep = xfs_iext_get_ext(ifp, idx);
-               }
-       }
-       *idxp = idx;
-       return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t *                       /* pointer to found extent record */
-xfs_iext_bno_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       int             *erp_idxp)      /* irec index of target ext list */
-{
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of extent irec's (lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
-               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
-                       high = erp_idx - 1;
-               } else if (erp_next && bno >=
-                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
-                       low = erp_idx + 1;
-               } else {
-                       break;
-               }
-       }
-       *erp_idxp = erp_idx;
-       return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
-       int             *erp_idxp,      /* pointer to target irec */
-       int             realloc)        /* new bytes were just added */
-{
-       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
-       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       ASSERT(page_idx >= 0);
-       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
-
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-
-       /* Binary search extent irec's */
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               prev = erp_idx > 0 ? erp - 1 : NULL;
-               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
-                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
-                       high = erp_idx - 1;
-               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
-                          (page_idx == erp->er_extoff + erp->er_extcount &&
-                           !realloc)) {
-                       low = erp_idx + 1;
-               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
-                          erp->er_extcount == XFS_LINEAR_EXTS) {
-                       ASSERT(realloc);
-                       page_idx = 0;
-                       erp_idx++;
-                       erp = erp_idx < nlists ? erp + 1 : NULL;
-                       break;
-               } else {
-                       page_idx -= erp->er_extoff;
-                       break;
-               }
-       }
-       *idxp = page_idx;
-       *erp_idxp = erp_idx;
-       return(erp);
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-
-       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
-       if (nextents == 0) {
-               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       } else if (!ifp->if_real_bytes) {
-               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
-       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
-               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
-       }
-       erp->er_extbuf = ifp->if_u1.if_extents;
-       erp->er_extcount = nextents;
-       erp->er_extoff = 0;
-
-       ifp->if_flags |= XFS_IFEXTIREC;
-       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
-       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
-       ifp->if_u1.if_ext_irec = erp;
-
-       return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* index for new irec */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /* Resize indirection array */
-       xfs_iext_realloc_indirect(ifp, ++nlists *
-                                 sizeof(xfs_ext_irec_t));
-       /*
-        * Move records down in the array so the
-        * new page can use erp_idx.
-        */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = nlists - 1; i > erp_idx; i--) {
-               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
-       }
-       ASSERT(i == erp_idx);
-
-       /* Initialize new extent record */
-       erp = ifp->if_u1.if_ext_irec;
-       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
-       erp[erp_idx].er_extcount = 0;
-       erp[erp_idx].er_extoff = erp_idx > 0 ?
-               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
-       return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* irec index to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       if (erp->er_extbuf) {
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
-                       -erp->er_extcount);
-               kmem_free(erp->er_extbuf);
-       }
-       /* Compact extent records */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = erp_idx; i < nlists - 1; i++) {
-               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
-       }
-       /*
-        * Manually free the last extent record from the indirection
-        * array.  A call to xfs_iext_realloc_indirect() with a size
-        * of zero would result in a call to xfs_iext_destroy() which
-        * would in turn call this function again, creating a nasty
-        * infinite loop.
-        */
-       if (--nlists) {
-               xfs_iext_realloc_indirect(ifp,
-                       nlists * sizeof(xfs_ext_irec_t));
-       } else {
-               kmem_free(ifp->if_u1.if_ext_irec);
-       }
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array.  Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible.  The
- * compaction policy is as follows:
- *
- *    Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- *      No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (nextents == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (nextents <= XFS_INLINE_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-               xfs_iext_direct_to_inline(ifp, nextents);
-       } else if (nextents <= XFS_LINEAR_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
-               xfs_iext_irec_compact_pages(ifp);
-       }
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
-       int             erp_idx = 0;    /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       while (erp_idx < nlists - 1) {
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp + 1;
-               if (erp_next->er_extcount <=
-                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
-                       memcpy(&erp->er_extbuf[erp->er_extcount],
-                               erp_next->er_extbuf, erp_next->er_extcount *
-                               sizeof(xfs_bmbt_rec_t));
-                       erp->er_extcount += erp_next->er_extcount;
-                       /*
-                        * Free page before removing extent record
-                        * so er_extoffs don't get modified in
-                        * xfs_iext_irec_remove.
-                        */
-                       kmem_free(erp_next->er_extbuf);
-                       erp_next->er_extbuf = NULL;
-                       xfs_iext_irec_remove(ifp, erp_idx + 1);
-                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               } else {
-                       erp_idx++;
-               }
-       }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx,        /* irec index to update */
-       int             ext_diff)       /* number of new extents */
-{
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       for (i = erp_idx; i < nlists; i++) {
-               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
-       }
-}
-
-/*
- * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
- */
-bool
-xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
-{
-       /* prealloc/delalloc exists only on regular files */
-       if (!S_ISREG(ip->i_d.di_mode))
-               return false;
-
-       /*
-        * Zero sized files with no cached pages and delalloc blocks will not
-        * have speculative prealloc/delalloc blocks to remove.
-        */
-       if (VFS_I(ip)->i_size == 0 &&
-           VN_CACHED(VFS_I(ip)) == 0 &&
-           ip->i_delayed_blks == 0)
-               return false;
-
-       /* If we haven't read in the extent list, then don't do it now. */
-       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
-               return false;
-
-       /*
-        * Do not free real preallocated or append-only files unless the file
-        * has delalloc blocks and we are forced to remove them.
-        */
-       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-               if (!force || ip->i_delayed_blks == 0)
-                       return false;
-
-       return true;
-}
-
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h

index b55fd347ab5b9b9ff51fd555115c854d7cfd8084..4a91358c1470b9ac029d451dd38c3fa77daf6fc0 100644 (file)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,225 +18,15 @@
  #ifndef        __XFS_INODE_H__
  #define        __XFS_INODE_H__
  
-struct posix_acl;
-struct xfs_dinode;
-struct xfs_inode;
-
-/*
- * Fork identifiers.
- */
-#define        XFS_DATA_FORK   0
-#define        XFS_ATTR_FORK   1
-
-/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
-       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
-       xfs_extnum_t    er_extoff;      /* extent offset in file */
-       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
-} xfs_ext_irec_t;
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
  
  /*
- * File incore extent information, present for each of data & attr forks.
+ * Kernel only inode definitions
   */
-#define        XFS_IEXT_BUFSZ          4096
-#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define        XFS_INLINE_EXTS         2
-#define        XFS_INLINE_DATA         32
-typedef struct xfs_ifork {
-       int                     if_bytes;       /* bytes in if_u1 */
-       int                     if_real_bytes;  /* bytes allocated in if_u1 */
-       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
-       short                   if_broot_bytes; /* bytes allocated for root */
-       unsigned char           if_flags;       /* per-fork flags */
-       union {
-               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
-               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
-               char            *if_data;       /* inline file data */
-       } if_u1;
-       union {
-               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
-                                               /* very small file extents */
-               char            if_inline_data[XFS_INLINE_DATA];
-                                               /* very small file data */
-               xfs_dev_t       if_rdev;        /* dev number if special */
-               uuid_t          if_uuid;        /* mount point value */
-       } if_u2;
-} xfs_ifork_t;
-
-/*
- * Inode location information.  Stored in the inode and passed to
- * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
- */
-struct xfs_imap {
-       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
-       ushort          im_len;         /* length in BBs of inode chunk */
-       ushort          im_boffset;     /* inode offset in block in bytes */
-};
-
-/*
- * This is the xfs in-core inode structure.
- * Most of the on-disk inode is embedded in the i_d field.
- *
- * The extent pointers/inline file space, however, are managed
- * separately.  The memory for this information is pointed to by
- * the if_u1 unions depending on the type of the data.
- * This is used to linearize the array of extents for fast in-core
- * access.  This is used until the file's number of extents
- * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
- * are accessed through the buffer cache.
- *
- * Other state kept in the in-core inode is used for identification,
- * locking, transactional updating, etc of the inode.
- *
- * Generally, we do not want to hold the i_rlock while holding the
- * i_ilock. Hierarchy is i_iolock followed by i_rlock.
- *
- * xfs_iptr_t contains all the inode fields up to and including the
- * i_mnext and i_mprev fields, it is used as a marker in the inode
- * chain off the mount structure by xfs_sync calls.
- */
-
-typedef struct xfs_ictimestamp {
-       __int32_t       t_sec;          /* timestamp seconds */
-       __int32_t       t_nsec;         /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
-
-/*
- * NOTE:  This structure must be kept identical to struct xfs_dinode
- *       in xfs_dinode.h except for the endianness annotations.
- */
-typedef struct xfs_icdinode {
-       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
-       __uint16_t      di_mode;        /* mode and type of file */
-       __int8_t        di_version;     /* inode version */
-       __int8_t        di_format;      /* format of di_c data */
-       __uint16_t      di_onlink;      /* old number of links to file */
-       __uint32_t      di_uid;         /* owner's user id */
-       __uint32_t      di_gid;         /* owner's group id */
-       __uint32_t      di_nlink;       /* number of links to file */
-       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
-       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
-       __uint8_t       di_pad[6];      /* unused, zeroed space */
-       __uint16_t      di_flushiter;   /* incremented on flush */
-       xfs_ictimestamp_t di_atime;     /* time last accessed */
-       xfs_ictimestamp_t di_mtime;     /* time last modified */
-       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
-       xfs_fsize_t     di_size;        /* number of bytes in file */
-       xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
-       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
-       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
-       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
-       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
-       __int8_t        di_aformat;     /* format of attr fork's data */
-       __uint32_t      di_dmevmask;    /* DMIG event mask */
-       __uint16_t      di_dmstate;     /* DMIG state info */
-       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
-       __uint32_t      di_gen;         /* generation number */
-
-       /* di_next_unlinked is the only non-core field in the old dinode */
-       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
-
-       /* start of the extended dinode, writable fields */
-       __uint32_t      di_crc;         /* CRC of the inode */
-       __uint64_t      di_changecount; /* number of attribute changes */
-       xfs_lsn_t       di_lsn;         /* flush sequence */
-       __uint64_t      di_flags2;      /* more random flags */
-       __uint8_t       di_pad2[16];    /* more padding for future expansion */
-
-       /* fields only written to during inode creation */
-       xfs_ictimestamp_t di_crtime;    /* time created */
-       xfs_ino_t       di_ino;         /* inode number */
-       uuid_t          di_uuid;        /* UUID of the filesystem */
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_icdinode_t;
-
-static inline uint xfs_icdinode_size(int version)
-{
-       if (version == 3)
-               return sizeof(struct xfs_icdinode);
-       return offsetof(struct xfs_icdinode, di_next_unlinked);
-}
-
-/*
- * Flags for xfs_ichgtime().
- */
-#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
-
-/*
- * Per-fork incore inode flags.
- */
-#define        XFS_IFINLINE    0x01    /* Inline data is read in */
-#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
-#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
-#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
-
-/*
- * Fork handling.
- */
-
-#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)            \
-       ((w) == XFS_DATA_FORK ? \
-               &(ip)->i_df : \
-               (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_IFORK_BOFF(ip) : \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
-#define XFS_IFORK_ASIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
-                       XFS_IFORK_BOFF(ip) : \
-               0)
-#define XFS_IFORK_SIZE(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_IFORK_DSIZE(ip) : \
-               XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_format : \
-               (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_format = (n)) : \
-               ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_nextents : \
-               (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_nextents = (n)) : \
-               ((ip)->i_d.di_anextents = (n)))
-#define XFS_IFORK_MAXEXT(ip, w) \
-       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
-
-
-#ifdef __KERNEL__
  
+struct xfs_dinode;
+struct xfs_inode;
  struct xfs_buf;
  struct xfs_bmap_free;
  struct xfs_bmbt_irec;
@@ -525,9 +315,21 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
          ((pip)->i_d.di_mode & S_ISGID))
  
  
-/*
- * xfs_inode.c prototypes.
- */
+int            xfs_release(struct xfs_inode *ip);
+int            xfs_inactive(struct xfs_inode *ip);
+int            xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+                          struct xfs_inode **ipp, struct xfs_name *ci_name);
+int            xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+                          umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int            xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+                          struct xfs_inode *ip);
+int            xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+                        struct xfs_name *target_name);
+int            xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+                          struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+                          struct xfs_name *target_name,
+                          struct xfs_inode *target_ip);
+
  void           xfs_ilock(xfs_inode_t *, uint);
  int            xfs_ilock_nowait(xfs_inode_t *, uint);
  void           xfs_iunlock(xfs_inode_t *, uint);
@@ -548,13 +350,28 @@ int               xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
  int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
  
  void           xfs_iext_realloc(xfs_inode_t *, int, int);
+
  void           xfs_iunpin_wait(xfs_inode_t *);
+#define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
+
  int            xfs_iflush(struct xfs_inode *, struct xfs_buf **);
  void           xfs_lock_inodes(xfs_inode_t **, int, uint);
  void           xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
  
  xfs_extlen_t   xfs_get_extsz_hint(struct xfs_inode *ip);
  
+int            xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
+                              xfs_nlink_t, xfs_dev_t, prid_t, int,
+                              struct xfs_inode **, int *);
+int            xfs_droplink(struct xfs_trans *, struct xfs_inode *);
+int            xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
+void           xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
+
+/* from xfs_file.c */
+int            xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int            xfs_iozero(struct xfs_inode *, loff_t, size_t);
+
+
  #define IHOLD(ip) \
  do { \
         ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -568,65 +385,6 @@ do { \
         iput(VFS_I(ip)); \
  } while (0)
  
-#endif /* __KERNEL__ */
-
-/*
- * Flags for xfs_iget()
- */
-#define XFS_IGET_CREATE                0x1
-#define XFS_IGET_UNTRUSTED     0x2
-#define XFS_IGET_DONTCACHE     0x4
-
-int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-                              struct xfs_imap *, struct xfs_dinode **,
-                              struct xfs_buf **, uint, uint);
-int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
-                         struct xfs_inode *, uint);
-void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void           xfs_dinode_to_disk(struct xfs_dinode *,
-                                  struct xfs_icdinode *);
-void           xfs_idestroy_fork(struct xfs_inode *, int);
-void           xfs_idata_realloc(struct xfs_inode *, int, int);
-void           xfs_iroot_realloc(struct xfs_inode *, int, int);
-int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
-int            xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
-
-xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
-void           xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
-                               xfs_bmbt_irec_t *, int);
-void           xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
-void           xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
-void           xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void           xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
-void           xfs_iext_inline_to_direct(xfs_ifork_t *, int);
-void           xfs_iext_destroy(xfs_ifork_t *);
-xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
-void           xfs_iext_irec_init(xfs_ifork_t *);
-xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
-void           xfs_iext_irec_remove(xfs_ifork_t *, int);
-void           xfs_iext_irec_compact(xfs_ifork_t *);
-void           xfs_iext_irec_compact_pages(xfs_ifork_t *);
-void           xfs_iext_irec_compact_full(xfs_ifork_t *);
-void           xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
-bool           xfs_can_free_eofblocks(struct xfs_inode *, bool);
-
-#define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
-
-#if defined(DEBUG)
-void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define        xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
-
-extern struct kmem_zone        *xfs_ifork_zone;
  extern struct kmem_zone        *xfs_inode_zone;
-extern struct kmem_zone        *xfs_ili_zone;
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
  
  #endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c

new file mode 100644 (file)

index 0000000..e011d59
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.c
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+       xfs_mount_t     *mp,
+       xfs_buf_t       *bp)
+{
+       int             i;
+       int             j;
+       xfs_dinode_t    *dip;
+
+       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+       for (i = 0; i < j; i++) {
+               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                       i * mp->m_sb.sb_inodesize);
+               if (!dip->di_next_unlinked)  {
+                       xfs_alert(mp,
+       "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
+                               bp);
+                       ASSERT(dip->di_next_unlinked);
+               }
+       }
+}
+#endif
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get an unnecessary
+ * warnings during log recovery and we don't get unnecssary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+       struct xfs_buf  *bp,
+       bool            readahead)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       int             i;
+       int             ni;
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        */
+       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       if (readahead) {
+                               bp->b_flags &= ~XBF_DONE;
+                               return;
+                       }
+
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+                                            mp, dip);
+#ifdef DEBUG
+                       xfs_emerg(mp,
+                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                               (unsigned long long)bp->b_bn, i,
+                               be16_to_cpu(dip->di_magic));
+                       ASSERT(0);
+#endif
+               }
+       }
+       xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, true);
+}
+
+static void
+xfs_inode_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+       .verify_read = xfs_inode_buf_read_verify,
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+       .verify_read = xfs_inode_buf_readahead_verify,
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp,
+       uint                    buf_flags,
+       uint                    iget_flags)
+{
+       struct xfs_buf          *bp;
+       int                     error;
+
+       buf_flags |= XBF_UNMAPPED;
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                  (int)imap->im_len, buf_flags, &bp,
+                                  &xfs_inode_buf_ops);
+       if (error) {
+               if (error == EAGAIN) {
+                       ASSERT(buf_flags & XBF_TRYLOCK);
+                       return error;
+               }
+
+               if (error == EFSCORRUPTED &&
+                   (iget_flags & XFS_IGET_UNTRUSTED))
+                       return XFS_ERROR(EINVAL);
+
+               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                       __func__, error);
+               return error;
+       }
+
+       *bpp = bp;
+       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+       return 0;
+}
+
+STATIC void
+xfs_dinode_from_disk(
+       xfs_icdinode_t          *to,
+       xfs_dinode_t            *from)
+{
+       to->di_magic = be16_to_cpu(from->di_magic);
+       to->di_mode = be16_to_cpu(from->di_mode);
+       to->di_version = from ->di_version;
+       to->di_format = from->di_format;
+       to->di_onlink = be16_to_cpu(from->di_onlink);
+       to->di_uid = be32_to_cpu(from->di_uid);
+       to->di_gid = be32_to_cpu(from->di_gid);
+       to->di_nlink = be32_to_cpu(from->di_nlink);
+       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_flushiter = be16_to_cpu(from->di_flushiter);
+       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+       to->di_size = be64_to_cpu(from->di_size);
+       to->di_nblocks = be64_to_cpu(from->di_nblocks);
+       to->di_extsize = be32_to_cpu(from->di_extsize);
+       to->di_nextents = be32_to_cpu(from->di_nextents);
+       to->di_anextents = be16_to_cpu(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat  = from->di_aformat;
+       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
+       to->di_flags    = be16_to_cpu(from->di_flags);
+       to->di_gen      = be32_to_cpu(from->di_gen);
+
+       if (to->di_version == 3) {
+               to->di_changecount = be64_to_cpu(from->di_changecount);
+               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+               to->di_flags2 = be64_to_cpu(from->di_flags2);
+               to->di_ino = be64_to_cpu(from->di_ino);
+               to->di_lsn = be64_to_cpu(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+       }
+}
+
+void
+xfs_dinode_to_disk(
+       xfs_dinode_t            *to,
+       xfs_icdinode_t          *from)
+{
+       to->di_magic = cpu_to_be16(from->di_magic);
+       to->di_mode = cpu_to_be16(from->di_mode);
+       to->di_version = from ->di_version;
+       to->di_format = from->di_format;
+       to->di_onlink = cpu_to_be16(from->di_onlink);
+       to->di_uid = cpu_to_be32(from->di_uid);
+       to->di_gid = cpu_to_be32(from->di_gid);
+       to->di_nlink = cpu_to_be32(from->di_nlink);
+       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+       to->di_size = cpu_to_be64(from->di_size);
+       to->di_nblocks = cpu_to_be64(from->di_nblocks);
+       to->di_extsize = cpu_to_be32(from->di_extsize);
+       to->di_nextents = cpu_to_be32(from->di_nextents);
+       to->di_anextents = cpu_to_be16(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat = from->di_aformat;
+       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+       to->di_dmstate = cpu_to_be16(from->di_dmstate);
+       to->di_flags = cpu_to_be16(from->di_flags);
+       to->di_gen = cpu_to_be32(from->di_gen);
+
+       if (from->di_version == 3) {
+               to->di_changecount = cpu_to_be64(from->di_changecount);
+               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+               to->di_flags2 = cpu_to_be64(from->di_flags2);
+               to->di_ino = cpu_to_be64(from->di_ino);
+               to->di_lsn = cpu_to_be64(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+               to->di_flushiter = 0;
+       } else {
+               to->di_flushiter = cpu_to_be16(from->di_flushiter);
+       }
+}
+
+static bool
+xfs_dinode_verify(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *ip,
+       struct xfs_dinode       *dip)
+{
+       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+               return false;
+
+       /* only version 3 or greater inodes are extensively verified here */
+       if (dip->di_version < 3)
+               return true;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             offsetof(struct xfs_dinode, di_crc)))
+               return false;
+       if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+               return false;
+       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       return true;
+}
+
+void
+xfs_dinode_calc_crc(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip)
+{
+       __uint32_t              crc;
+
+       if (dip->di_version < 3)
+               return;
+
+       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             offsetof(struct xfs_dinode, di_crc));
+       dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       uint            iget_flags)
+{
+       xfs_buf_t       *bp;
+       xfs_dinode_t    *dip;
+       int             error;
+
+       /*
+        * Fill in the location information in the in-core inode.
+        */
+       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+       if (error)
+               return error;
+
+       /* shortcut IO on inode allocation if possible */
+       if ((iget_flags & XFS_IGET_CREATE) &&
+           xfs_sb_version_hascrc(&mp->m_sb) &&
+           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+               /* initialise the on-disk inode core */
+               memset(&ip->i_d, 0, sizeof(ip->i_d));
+               ip->i_d.di_magic = XFS_DINODE_MAGIC;
+               ip->i_d.di_gen = prandom_u32();
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       ip->i_d.di_version = 3;
+                       ip->i_d.di_ino = ip->i_ino;
+                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+               } else
+                       ip->i_d.di_version = 2;
+               return 0;
+       }
+
+       /*
+        * Get pointers to the on-disk inode and the buffer containing it.
+        */
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+       if (error)
+               return error;
+
+       /* even unallocated inodes are verified */
+       if (!xfs_dinode_verify(mp, ip, dip)) {
+               xfs_alert(mp, "%s: validation failed for inode %lld failed",
+                               __func__, ip->i_ino);
+
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+               error = XFS_ERROR(EFSCORRUPTED);
+               goto out_brelse;
+       }
+
+       /*
+        * If the on-disk inode is already linked to a directory
+        * entry, copy all of the inode into the in-core inode.
+        * xfs_iformat_fork() handles copying in the inode format
+        * specific information.
+        * Otherwise, just get the truly permanent information.
+        */
+       if (dip->di_mode) {
+               xfs_dinode_from_disk(&ip->i_d, dip);
+               error = xfs_iformat_fork(ip, dip);
+               if (error)  {
+#ifdef DEBUG
+                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+                               __func__, error);
+#endif /* DEBUG */
+                       goto out_brelse;
+               }
+       } else {
+               /*
+                * Partial initialisation of the in-core inode. Just the bits
+                * that xfs_ialloc won't overwrite or relies on being correct.
+                */
+               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+               ip->i_d.di_version = dip->di_version;
+               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+               if (dip->di_version == 3) {
+                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+               }
+
+               /*
+                * Make sure to pull in the mode here as well in
+                * case the inode is released without being used.
+                * This ensures that xfs_inactive() will see that
+                * the inode is already free and not try to mess
+                * with the uninitialized part of it.
+                */
+               ip->i_d.di_mode = 0;
+       }
+
+       /*
+        * The inode format changed when we moved the link count and
+        * made it 32 bits long.  If this is an old format inode,
+        * convert it in memory to look like a new one.  If it gets
+        * flushed to disk we will convert back before flushing or
+        * logging it.  We zero out the new projid field and the old link
+        * count field.  We'll handle clearing the pad field (the remains
+        * of the old uuid field) when we actually convert the inode to
+        * the new format. We don't change the version number so that we
+        * can distinguish this from a real new format inode.
+        */
+       if (ip->i_d.di_version == 1) {
+               ip->i_d.di_nlink = ip->i_d.di_onlink;
+               ip->i_d.di_onlink = 0;
+               xfs_set_projid(ip, 0);
+       }
+
+       ip->i_delayed_blks = 0;
+
+       /*
+        * Mark the buffer containing the inode as something to keep
+        * around for a while.  This helps to keep recently accessed
+        * meta-data in-core longer.
+        */
+       xfs_buf_set_ref(bp, XFS_INO_REF);
+
+       /*
+        * Use xfs_trans_brelse() to release the buffer containing the on-disk
+        * inode, because it was acquired with xfs_trans_read_buf() in
+        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+        * brelse().  If we're within a transaction, then xfs_trans_brelse()
+        * will only release the buffer if it is not dirty within the
+        * transaction.  It will be OK to release the buffer in this case,
+        * because inodes on disk are never destroyed and we will be locking the
+        * new in-core inode before putting it in the cache where other
+        * processes can find it.  Thus we don't have to worry about the inode
+        * being changed just because we released the buffer.
+        */
+ out_brelse:
+       xfs_trans_brelse(tp, bp);
+       return error;
+}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h

new file mode 100644 (file)

index 0000000..599e6c0
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_BUF_H__
+#define        __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+
+/*
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
+       ushort          im_len;         /* length in BBs of inode chunk */
+       ushort          im_boffset;     /* inode offset in block in bytes */
+};
+
+int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                              struct xfs_imap *, struct xfs_dinode **,
+                              struct xfs_buf **, uint, uint);
+int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                         struct xfs_inode *, uint);
+void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void           xfs_dinode_to_disk(struct xfs_dinode *,
+                                  struct xfs_icdinode *);
+
+#if defined(DEBUG)
+void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define        xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c

new file mode 100644 (file)

index 0000000..02f1083
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.c
@@ -0,0 +1,1920 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_filestream.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+kmem_zone_t *xfs_ifork_zone;
+
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+void
+xfs_validate_extents(
+       xfs_ifork_t             *ifp,
+       int                     nrecs,
+       xfs_exntfmt_t           fmt)
+{
+       xfs_bmbt_irec_t         irec;
+       xfs_bmbt_rec_host_t     rec;
+       int                     i;
+
+       for (i = 0; i < nrecs; i++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+               rec.l0 = get_unaligned(&ep->l0);
+               rec.l1 = get_unaligned(&ep->l1);
+               xfs_bmbt_get_all(&rec, &irec);
+               if (fmt == XFS_EXTFMT_NOSTATE)
+                       ASSERT(irec.br_state == XFS_EXT_NORM);
+       }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+int
+xfs_iformat_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip)
+{
+       xfs_attr_shortform_t    *atp;
+       int                     size;
+       int                     error = 0;
+       xfs_fsize_t             di_size;
+
+       if (unlikely(be32_to_cpu(dip->di_nextents) +
+                    be16_to_cpu(dip->di_anextents) >
+                    be64_to_cpu(dip->di_nblocks))) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+                       (unsigned long long)ip->i_ino,
+                       (int)(be32_to_cpu(dip->di_nextents) +
+                             be16_to_cpu(dip->di_anextents)),
+                       (unsigned long long)
+                               be64_to_cpu(dip->di_nblocks));
+               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+                       (unsigned long long)ip->i_ino,
+                       dip->di_forkoff);
+               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                    !ip->i_mount->m_rtdev_targp)) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, has realtime flag set.",
+                       ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       switch (ip->i_d.di_mode & S_IFMT) {
+       case S_IFIFO:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFSOCK:
+               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+                                             ip->i_mount, dip);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+               ip->i_d.di_size = 0;
+               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+               break;
+
+       case S_IFREG:
+       case S_IFLNK:
+       case S_IFDIR:
+               switch (dip->di_format) {
+               case XFS_DINODE_FMT_LOCAL:
+                       /*
+                        * no local regular files yet
+                        */
+                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (local format for regular file).",
+                                       (unsigned long long) ip->i_ino);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       di_size = be64_to_cpu(dip->di_size);
+                       if (unlikely(di_size < 0 ||
+                                    di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (bad size %Ld for local inode).",
+                                       (unsigned long long) ip->i_ino,
+                                       (long long) di_size);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       size = (int)di_size;
+                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+                       break;
+               case XFS_DINODE_FMT_EXTENTS:
+                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+                       break;
+               case XFS_DINODE_FMT_BTREE:
+                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+                       break;
+               default:
+                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+               break;
+
+       default:
+               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+       if (error) {
+               return error;
+       }
+       if (!XFS_DFORK_Q(dip))
+               return 0;
+
+       ASSERT(ip->i_afp == NULL);
+       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+       switch (dip->di_aformat) {
+       case XFS_DINODE_FMT_LOCAL:
+               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+               size = be16_to_cpu(atp->hdr.totsize);
+
+               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                       xfs_warn(ip->i_mount,
+                               "corrupt inode %Lu (bad attr fork size %Ld).",
+                               (unsigned long long) ip->i_ino,
+                               (long long) size);
+                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                            XFS_ERRLEVEL_LOW,
+                                            ip->i_mount, dip);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
+               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+               break;
+       case XFS_DINODE_FMT_EXTENTS:
+               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+               break;
+       default:
+               error = XFS_ERROR(EFSCORRUPTED);
+               break;
+       }
+       if (error) {
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+       }
+       return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork,
+       int             size)
+{
+       xfs_ifork_t     *ifp;
+       int             real_size;
+
+       /*
+        * If the size is unreasonable, then something
+        * is wrong and we just bail out rather than crash in
+        * kmem_alloc() or memcpy() below.
+        */
+       if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+               xfs_warn(ip->i_mount,
+       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+                       (unsigned long long) ip->i_ino, size,
+                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
+               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       real_size = 0;
+       if (size == 0)
+               ifp->if_u1.if_data = NULL;
+       else if (size <= sizeof(ifp->if_u2.if_inline_data))
+               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+       else {
+               real_size = roundup(size, 4);
+               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+       }
+       ifp->if_bytes = size;
+       ifp->if_real_bytes = real_size;
+       if (size)
+               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
+       ifp->if_flags &= ~XFS_IFEXTENTS;
+       ifp->if_flags |= XFS_IFINLINE;
+       return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork)
+{
+       xfs_bmbt_rec_t  *dp;
+       xfs_ifork_t     *ifp;
+       int             nex;
+       int             size;
+       int             i;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+       size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+       /*
+        * If the number of extents is unreasonable, then something
+        * is wrong and we just bail out rather than crash in
+        * kmem_alloc() or memcpy() below.
+        */
+       if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+                       (unsigned long long) ip->i_ino, nex);
+               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       ifp->if_real_bytes = 0;
+       if (nex == 0)
+               ifp->if_u1.if_extents = NULL;
+       else if (nex <= XFS_INLINE_EXTS)
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       else
+               xfs_iext_add(ifp, 0, nex);
+
+       ifp->if_bytes = size;
+       if (size) {
+               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+               for (i = 0; i < nex; i++, dp++) {
+                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+                       ep->l0 = get_unaligned_be64(&dp->l0);
+                       ep->l1 = get_unaligned_be64(&dp->l1);
+               }
+               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+               if (whichfork != XFS_DATA_FORK ||
+                       XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+                               if (unlikely(xfs_check_nostate_extents(
+                                   ifp, 0, nex))) {
+                                       XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+                                                        XFS_ERRLEVEL_LOW,
+                                                        ip->i_mount);
+                                       return XFS_ERROR(EFSCORRUPTED);
+                               }
+       }
+       ifp->if_flags |= XFS_IFEXTENTS;
+       return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_bmdr_block_t        *dfp;
+       xfs_ifork_t             *ifp;
+       /* REFERENCED */
+       int                     nrecs;
+       int                     size;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+       nrecs = be16_to_cpu(dfp->bb_numrecs);
+
+       /*
+        * blow out if -- fork has less extents than can fit in
+        * fork (fork shouldn't be a btree format), root btree
+        * block has more records than can fit into the fork,
+        * or the number of extents is greater than the number of
+        * blocks.
+        */
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
+                    XFS_BMDR_SPACE_CALC(nrecs) >
+                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
+                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+               xfs_warn(mp, "corrupt inode %Lu (btree).",
+                                       (unsigned long long) ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+                                        mp, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       ifp->if_broot_bytes = size;
+       ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+       ASSERT(ifp->if_broot != NULL);
+       /*
+        * Copy and convert from the on-disk structure
+        * to the in-memory structure.
+        */
+       xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+                        ifp->if_broot, size);
+       ifp->if_flags &= ~XFS_IFEXTENTS;
+       ifp->if_flags |= XFS_IFBROOT;
+
+       return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       int             error;
+       xfs_ifork_t     *ifp;
+       xfs_extnum_t    nextents;
+
+       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+                                ip->i_mount);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+       nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+
+       /*
+        * We know that the size is valid (it's checked in iformat_btree)
+        */
+       ifp->if_bytes = ifp->if_real_bytes = 0;
+       ifp->if_flags |= XFS_IFEXTENTS;
+       xfs_iext_add(ifp, 0, nextents);
+       error = xfs_bmap_read_extents(tp, ip, whichfork);
+       if (error) {
+               xfs_iext_destroy(ifp);
+               ifp->if_flags &= ~XFS_IFEXTENTS;
+               return error;
+       }
+       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+       return 0;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ *      requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+       xfs_inode_t             *ip,
+       int                     rec_diff,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     cur_max;
+       xfs_ifork_t             *ifp;
+       struct xfs_btree_block  *new_broot;
+       int                     new_max;
+       size_t                  new_size;
+       char                    *np;
+       char                    *op;
+
+       /*
+        * Handle the degenerate case quietly.
+        */
+       if (rec_diff == 0) {
+               return;
+       }
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (rec_diff > 0) {
+               /*
+                * If there wasn't any memory allocated before, just
+                * allocate it now and get out.
+                */
+               if (ifp->if_broot_bytes == 0) {
+                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+                       ifp->if_broot_bytes = (int)new_size;
+                       return;
+               }
+
+               /*
+                * If there is already an existing if_broot, then we need
+                * to realloc() it and shift the pointers to their new
+                * location.  The records don't change location because
+                * they are kept butted up against the btree block header.
+                */
+               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+               new_max = cur_max + rec_diff;
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+                               KM_SLEEP | KM_NOFS);
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    (int)new_size);
+               ifp->if_broot_bytes = (int)new_size;
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+               memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+               return;
+       }
+
+       /*
+        * rec_diff is less than 0.  In this case, we are shrinking the
+        * if_broot buffer.  It must already exist.  If we go to zero
+        * records, just get rid of the root and clear the status bit.
+        */
+       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+       new_max = cur_max + rec_diff;
+       ASSERT(new_max >= 0);
+       if (new_max > 0)
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+       else
+               new_size = 0;
+       if (new_size > 0) {
+               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+               /*
+                * First copy over the btree block header.
+                */
+               memcpy(new_broot, ifp->if_broot,
+                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
+       } else {
+               new_broot = NULL;
+               ifp->if_flags &= ~XFS_IFBROOT;
+       }
+
+       /*
+        * Only copy the records and pointers if there are any.
+        */
+       if (new_max > 0) {
+               /*
+                * First copy the records.
+                */
+               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+               /*
+                * Then copy the pointers.
+                */
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+                                                    (int)new_size);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+       }
+       kmem_free(ifp->if_broot);
+       ifp->if_broot = new_broot;
+       ifp->if_broot_bytes = (int)new_size;
+       if (ifp->if_broot)
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+       return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *      requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+       xfs_inode_t     *ip,
+       int             byte_diff,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+       int             new_size;
+       int             real_size;
+
+       if (byte_diff == 0) {
+               return;
+       }
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       new_size = (int)ifp->if_bytes + byte_diff;
+       ASSERT(new_size >= 0);
+
+       if (new_size == 0) {
+               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       kmem_free(ifp->if_u1.if_data);
+               }
+               ifp->if_u1.if_data = NULL;
+               real_size = 0;
+       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+               /*
+                * If the valid extents/data can fit in if_inline_ext/data,
+                * copy them from the malloc'd vector and free it.
+                */
+               if (ifp->if_u1.if_data == NULL) {
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+                             new_size);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               }
+               real_size = 0;
+       } else {
+               /*
+                * Stuck with malloc/realloc.
+                * For inline data, the underlying buffer must be
+                * a multiple of 4 bytes in size so that it can be
+                * logged and stay on word boundaries.  We enforce
+                * that here.
+                */
+               real_size = roundup(new_size, 4);
+               if (ifp->if_u1.if_data == NULL) {
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       /*
+                        * Only do the realloc if the underlying size
+                        * is really changing.
+                        */
+                       if (ifp->if_real_bytes != real_size) {
+                               ifp->if_u1.if_data =
+                                       kmem_realloc(ifp->if_u1.if_data,
+                                                       real_size,
+                                                       ifp->if_real_bytes,
+                                                       KM_SLEEP | KM_NOFS);
+                       }
+               } else {
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+                               ifp->if_bytes);
+               }
+       }
+       ifp->if_real_bytes = real_size;
+       ifp->if_bytes = new_size;
+       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+void
+xfs_idestroy_fork(
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (ifp->if_broot != NULL) {
+               kmem_free(ifp->if_broot);
+               ifp->if_broot = NULL;
+       }
+
+       /*
+        * If the format is local, then we can't have an extents
+        * array so just look for an inline data array.  If we're
+        * not local then we may or may not have an extents list,
+        * so check and free it up if we do.
+        */
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+                   (ifp->if_u1.if_data != NULL)) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = NULL;
+                       ifp->if_real_bytes = 0;
+               }
+       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+                  ((ifp->if_flags & XFS_IFEXTIREC) ||
+                   ((ifp->if_u1.if_extents != NULL) &&
+                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+               ASSERT(ifp->if_real_bytes != 0);
+               xfs_iext_destroy(ifp);
+       }
+       ASSERT(ifp->if_u1.if_extents == NULL ||
+              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+       ASSERT(ifp->if_real_bytes == 0);
+       if (whichfork == XFS_ATTR_FORK) {
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+       }
+}
+
+/*
+ * xfs_iextents_copy()
+ *
+ * This is called to copy the REAL extents (as opposed to the delayed
+ * allocation extents) from the inode into the given buffer.  It
+ * returns the number of bytes copied into the buffer.
+ *
+ * If there are no delayed allocation extents, then we can just
+ * memcpy() the extents into the buffer.  Otherwise, we need to
+ * examine each extent in turn and skip those which are delayed.
+ */
+int
+xfs_iextents_copy(
+       xfs_inode_t             *ip,
+       xfs_bmbt_rec_t          *dp,
+       int                     whichfork)
+{
+       int                     copied;
+       int                     i;
+       xfs_ifork_t             *ifp;
+       int                     nrecs;
+       xfs_fsblock_t           start_block;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+       ASSERT(ifp->if_bytes > 0);
+
+       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+       ASSERT(nrecs > 0);
+
+       /*
+        * There are some delayed allocation extents in the
+        * inode, so copy the extents one at a time and skip
+        * the delayed ones.  There must be at least one
+        * non-delayed extent.
+        */
+       copied = 0;
+       for (i = 0; i < nrecs; i++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+               start_block = xfs_bmbt_get_startblock(ep);
+               if (isnullstartblock(start_block)) {
+                       /*
+                        * It's a delayed allocation extent, so skip it.
+                        */
+                       continue;
+               }
+
+               /* Translate to on disk format */
+               put_unaligned_be64(ep->l0, &dp->l0);
+               put_unaligned_be64(ep->l1, &dp->l1);
+               dp++;
+               copied++;
+       }
+       ASSERT(copied != 0);
+       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+       return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       xfs_inode_log_item_t    *iip,
+       int                     whichfork,
+       xfs_buf_t               *bp)
+{
+       char                    *cp;
+       xfs_ifork_t             *ifp;
+       xfs_mount_t             *mp;
+       static const short      brootflag[2] =
+               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+       static const short      dataflag[2] =
+               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+       static const short      extflag[2] =
+               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+       if (!iip)
+               return;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       /*
+        * This can happen if we gave up in iformat in an error path,
+        * for the attribute fork.
+        */
+       if (!ifp) {
+               ASSERT(whichfork == XFS_ATTR_FORK);
+               return;
+       }
+       cp = XFS_DFORK_PTR(dip, whichfork);
+       mp = ip->i_mount;
+       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+       case XFS_DINODE_FMT_LOCAL:
+               if ((iip->ili_fields & dataflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(ifp->if_u1.if_data != NULL);
+                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+               }
+               break;
+
+       case XFS_DINODE_FMT_EXTENTS:
+               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+                      !(iip->ili_fields & extflag[whichfork]));
+               if ((iip->ili_fields & extflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(xfs_iext_get_ext(ifp, 0));
+                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+                               whichfork);
+               }
+               break;
+
+       case XFS_DINODE_FMT_BTREE:
+               if ((iip->ili_fields & brootflag[whichfork]) &&
+                   (ifp->if_broot_bytes > 0)) {
+                       ASSERT(ifp->if_broot != NULL);
+                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                               XFS_IFORK_SIZE(ip, whichfork));
+                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+                               (xfs_bmdr_block_t *)cp,
+                               XFS_DFORK_SIZE(dip, mp, whichfork));
+               }
+               break;
+
+       case XFS_DINODE_FMT_DEV:
+               if (iip->ili_fields & XFS_ILOG_DEV) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+               }
+               break;
+
+       case XFS_DINODE_FMT_UUID:
+               if (iip->ili_fields & XFS_ILOG_UUID) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       memcpy(XFS_DFORK_DPTR(dip),
+                              &ip->i_df.if_u2.if_uuid,
+                              sizeof(uuid_t));
+               }
+               break;
+
+       default:
+               ASSERT(0);
+               break;
+       }
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx)            /* index of target extent */
+{
+       ASSERT(idx >= 0);
+       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+       if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+               return ifp->if_u1.if_ext_irec->er_extbuf;
+       } else if (ifp->if_flags & XFS_IFEXTIREC) {
+               xfs_ext_irec_t  *erp;           /* irec pointer */
+               int             erp_idx = 0;    /* irec index */
+               xfs_extnum_t    page_idx = idx; /* ext index in target list */
+
+               erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+               return &erp->er_extbuf[page_idx];
+       } else if (ifp->if_bytes) {
+               return &ifp->if_u1.if_extents[idx];
+       } else {
+               return NULL;
+       }
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'.  'count' new items are inserted at index 'idx'.
+ */
+void
+xfs_iext_insert(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* starting index of new items */
+       xfs_extnum_t    count,          /* number of inserted items */
+       xfs_bmbt_irec_t *new,           /* items to insert */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+       xfs_extnum_t    i;              /* extent record index */
+
+       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       xfs_iext_add(ifp, idx, count);
+       for (i = idx; i < idx + count; i++, new++)
+               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin adding exts */
+       int             ext_diff)       /* number of extents to add */
+{
+       int             byte_diff;      /* new bytes being added */
+       int             new_size;       /* size of extents after adding */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT((idx >= 0) && (idx <= nextents));
+       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+       new_size = ifp->if_bytes + byte_diff;
+       /*
+        * If the new number of extents (nextents + ext_diff)
+        * fits inside the inode, then continue to use the inline
+        * extent buffer.
+        */
+       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+               if (idx < nextents) {
+                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+                               &ifp->if_u2.if_inline_ext[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+               }
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+               ifp->if_real_bytes = 0;
+       }
+       /*
+        * Otherwise use a linear (direct) extent list.
+        * If the extents are currently inside the inode,
+        * xfs_iext_realloc_direct will switch us from
+        * inline to direct extent allocation mode.
+        */
+       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+               xfs_iext_realloc_direct(ifp, new_size);
+               if (idx < nextents) {
+                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+                               &ifp->if_u1.if_extents[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+               }
+       }
+       /* Indirection array */
+       else {
+               xfs_ext_irec_t  *erp;
+               int             erp_idx = 0;
+               int             page_idx = idx;
+
+               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+               if (ifp->if_flags & XFS_IFEXTIREC) {
+                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+               } else {
+                       xfs_iext_irec_init(ifp);
+                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+                       erp = ifp->if_u1.if_ext_irec;
+               }
+               /* Extents fit in target extent page */
+               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+                       if (page_idx < erp->er_extcount) {
+                               memmove(&erp->er_extbuf[page_idx + ext_diff],
+                                       &erp->er_extbuf[page_idx],
+                                       (erp->er_extcount - page_idx) *
+                                       sizeof(xfs_bmbt_rec_t));
+                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+                       }
+                       erp->er_extcount += ext_diff;
+                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               }
+               /* Insert a new extent page */
+               else if (erp) {
+                       xfs_iext_add_indirect_multi(ifp,
+                               erp_idx, page_idx, ext_diff);
+               }
+               /*
+                * If extent(s) are being appended to the last page in
+                * the indirection array and the new extent(s) don't fit
+                * in the page, then erp is NULL and erp_idx is set to
+                * the next index needed in the indirection array.
+                */
+               else {
+                       int     count = ext_diff;
+
+                       while (count) {
+                               erp = xfs_iext_irec_new(ifp, erp_idx);
+                               erp->er_extcount = count;
+                               count -= MIN(count, (int)XFS_LINEAR_EXTS);
+                               if (count) {
+                                       erp_idx++;
+                               }
+                       }
+               }
+       }
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+       xfs_ifork_t     *ifp,                   /* inode fork pointer */
+       int             erp_idx,                /* target extent irec index */
+       xfs_extnum_t    idx,                    /* index within target list */
+       int             count)                  /* new extents being added */
+{
+       int             byte_diff;              /* new bytes being added */
+       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
+       xfs_extnum_t    ext_diff;               /* number of extents to add */
+       xfs_extnum_t    ext_cnt;                /* new extents still needed */
+       xfs_extnum_t    nex2;                   /* extents after idx + count */
+       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
+       int             nlists;                 /* number of irec's (lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       nex2 = erp->er_extcount - idx;
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+       /*
+        * Save second part of target extent list
+        * (all extents past */
+       if (nex2) {
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+               erp->er_extcount -= nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+               memset(&erp->er_extbuf[idx], 0, byte_diff);
+       }
+
+       /*
+        * Add the new extents to the end of the target
+        * list, then allocate new irec record(s) and
+        * extent buffer(s) as needed to store the rest
+        * of the new extents.
+        */
+       ext_cnt = count;
+       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+       if (ext_diff) {
+               erp->er_extcount += ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+       while (ext_cnt) {
+               erp_idx++;
+               erp = xfs_iext_irec_new(ifp, erp_idx);
+               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+               erp->er_extcount = ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+
+       /* Add nex2 extents back to indirection array */
+       if (nex2) {
+               xfs_extnum_t    ext_avail;
+               int             i;
+
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+               i = 0;
+               /*
+                * If nex2 extents fit in the current page, append
+                * nex2_ep after the new extents.
+                */
+               if (nex2 <= ext_avail) {
+                       i = erp->er_extcount;
+               }
+               /*
+                * Otherwise, check if space is available in the
+                * next page.
+                */
+               else if ((erp_idx < nlists - 1) &&
+                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+                       erp_idx++;
+                       erp++;
+                       /* Create a hole for nex2 extents */
+                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+               }
+               /*
+                * Final choice, create a new extent page for
+                * nex2 extents.
+                */
+               else {
+                       erp_idx++;
+                       erp = xfs_iext_irec_new(ifp, erp_idx);
+               }
+               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+               kmem_free(nex2_ep);
+               erp->er_extcount += nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+       }
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array.  Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff,       /* number of extents to remove */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             new_size;       /* size of extents after removal */
+
+       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+       ASSERT(ext_diff > 0);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       } else if (ifp->if_flags & XFS_IFEXTIREC) {
+               xfs_iext_remove_indirect(ifp, idx, ext_diff);
+       } else if (ifp->if_real_bytes) {
+               xfs_iext_remove_direct(ifp, idx, ext_diff);
+       } else {
+               xfs_iext_remove_inline(ifp, idx, ext_diff);
+       }
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ */
+void
+xfs_iext_remove_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       int             nextents;       /* number of extents in file */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       ASSERT(idx < XFS_INLINE_EXTS);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(((nextents - ext_diff) > 0) &&
+               (nextents - ext_diff) < XFS_INLINE_EXTS);
+
+       if (idx + ext_diff < nextents) {
+               memmove(&ifp->if_u2.if_inline_ext[idx],
+                       &ifp->if_u2.if_inline_ext[idx + ext_diff],
+                       (nextents - (idx + ext_diff)) *
+                        sizeof(xfs_bmbt_rec_t));
+               memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+                       0, ext_diff * sizeof(xfs_bmbt_rec_t));
+       } else {
+               memset(&ifp->if_u2.if_inline_ext[idx], 0,
+                       ext_diff * sizeof(xfs_bmbt_rec_t));
+       }
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx. If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             new_size;       /* size of extents after removal */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       new_size = ifp->if_bytes -
+               (ext_diff * sizeof(xfs_bmbt_rec_t));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+               return;
+       }
+       /* Move extents up in the list (if needed) */
+       if (idx + ext_diff < nextents) {
+               memmove(&ifp->if_u1.if_extents[idx],
+                       &ifp->if_u1.if_extents[idx + ext_diff],
+                       (nextents - (idx + ext_diff)) *
+                        sizeof(xfs_bmbt_rec_t));
+       }
+       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+               0, ext_diff * sizeof(xfs_bmbt_rec_t));
+       /*
+        * Reallocate the direct extent list. If the extents
+        * will fit inside the inode then xfs_iext_realloc_direct
+        * will switch from direct to inline extent allocation
+        * mode for us.
+        */
+       xfs_iext_realloc_direct(ifp, new_size);
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing extents */
+       int             count)          /* number of extents to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             erp_idx = 0;    /* indirection array index */
+       xfs_extnum_t    ext_cnt;        /* extents left to remove */
+       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
+       xfs_extnum_t    nex1;           /* number of extents before idx */
+       xfs_extnum_t    nex2;           /* extents after idx + count */
+       int             page_idx = idx; /* index in target extent list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
+       ASSERT(erp != NULL);
+       nex1 = page_idx;
+       ext_cnt = count;
+       while (ext_cnt) {
+               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+               /*
+                * Check for deletion of entire list;
+                * xfs_iext_irec_remove() updates extent offsets.
+                */
+               if (ext_diff == erp->er_extcount) {
+                       xfs_iext_irec_remove(ifp, erp_idx);
+                       ext_cnt -= ext_diff;
+                       nex1 = 0;
+                       if (ext_cnt) {
+                               ASSERT(erp_idx < ifp->if_real_bytes /
+                                       XFS_IEXT_BUFSZ);
+                               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+                               nex1 = 0;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+               /* Move extents up (if needed) */
+               if (nex2) {
+                       memmove(&erp->er_extbuf[nex1],
+                               &erp->er_extbuf[nex1 + ext_diff],
+                               nex2 * sizeof(xfs_bmbt_rec_t));
+               }
+               /* Zero out rest of page */
+               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+               /* Update remaining counters */
+               erp->er_extcount -= ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+               ext_cnt -= ext_diff;
+               nex1 = 0;
+               erp_idx++;
+               erp++;
+       }
+       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+       xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ */
+void
+xfs_iext_realloc_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new size of extents */
+{
+       int             rnew_size;      /* real new size of extents */
+
+       rnew_size = new_size;
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+                (new_size != ifp->if_real_bytes)));
+
+       /* Free extent records */
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       }
+       /* Resize direct extent list and zero any new bytes */
+       else if (ifp->if_real_bytes) {
+               /* Check if extents will fit inside the inode */
+               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+                       xfs_iext_direct_to_inline(ifp, new_size /
+                               (uint)sizeof(xfs_bmbt_rec_t));
+                       ifp->if_bytes = new_size;
+                       return;
+               }
+               if (!is_power_of_2(new_size)){
+                       rnew_size = roundup_pow_of_two(new_size);
+               }
+               if (rnew_size != ifp->if_real_bytes) {
+                       ifp->if_u1.if_extents =
+                               kmem_realloc(ifp->if_u1.if_extents,
+                                               rnew_size,
+                                               ifp->if_real_bytes, KM_NOFS);
+               }
+               if (rnew_size > ifp->if_real_bytes) {
+                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
+                               rnew_size - ifp->if_real_bytes);
+               }
+       }
+       /*
+        * Switch from the inline extent buffer to a direct
+        * extent list. Be sure to include the inline extent
+        * bytes in new_size.
+        */
+       else {
+               new_size += ifp->if_bytes;
+               if (!is_power_of_2(new_size)) {
+                       rnew_size = roundup_pow_of_two(new_size);
+               }
+               xfs_iext_inline_to_direct(ifp, rnew_size);
+       }
+       ifp->if_real_bytes = rnew_size;
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ */
+void
+xfs_iext_direct_to_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    nextents)       /* number of extents in file */
+{
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       ASSERT(nextents <= XFS_INLINE_EXTS);
+       /*
+        * The inline buffer was zeroed when we switched
+        * from inline to direct extent allocation mode,
+        * so we don't need to clear it here.
+        */
+       memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+               nextents * sizeof(xfs_bmbt_rec_t));
+       kmem_free(ifp->if_u1.if_extents);
+       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* number of extents in file */
+{
+       ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+       memset(ifp->if_u1.if_extents, 0, new_size);
+       if (ifp->if_bytes) {
+               memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+                       ifp->if_bytes);
+               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                       sizeof(xfs_bmbt_rec_t));
+       }
+       ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new indirection array size */
+{
+       int             nlists;         /* number of irec's (ex lists) */
+       int             size;           /* current indirection array size */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       size = nlists * sizeof(xfs_ext_irec_t);
+       ASSERT(ifp->if_real_bytes);
+       ASSERT((new_size >= 0) && (new_size != size));
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       } else {
+               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+                       kmem_realloc(ifp->if_u1.if_ext_irec,
+                               new_size, size, KM_NOFS);
+       }
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+        xfs_ifork_t    *ifp)           /* inode fork pointer */
+{
+       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             size;           /* size of file extents */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+       size = nextents * sizeof(xfs_bmbt_rec_t);
+
+       xfs_iext_irec_compact_pages(ifp);
+       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+       ep = ifp->if_u1.if_ext_irec->er_extbuf;
+       kmem_free(ifp->if_u1.if_ext_irec);
+       ifp->if_flags &= ~XFS_IFEXTIREC;
+       ifp->if_u1.if_extents = ep;
+       ifp->if_bytes = size;
+       if (nextents < XFS_LINEAR_EXTS) {
+               xfs_iext_realloc_direct(ifp, size);
+       }
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               int     erp_idx;
+               int     nlists;
+
+               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+                       xfs_iext_irec_remove(ifp, erp_idx);
+               }
+               ifp->if_flags &= ~XFS_IFEXTIREC;
+       } else if (ifp->if_real_bytes) {
+               kmem_free(ifp->if_u1.if_extents);
+       } else if (ifp->if_bytes) {
+               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+                       sizeof(xfs_bmbt_rec_t));
+       }
+       ifp->if_u1.if_extents = NULL;
+       ifp->if_real_bytes = 0;
+       ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ */
+xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       xfs_extnum_t    *idxp)          /* index of target extent */
+{
+       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
+       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
+       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       int             high;           /* upper boundary in search */
+       xfs_extnum_t    idx = 0;        /* index of target extent */
+       int             low;            /* lower boundary in search */
+       xfs_extnum_t    nextents;       /* number of file extents */
+       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       if (nextents == 0) {
+               *idxp = 0;
+               return NULL;
+       }
+       low = 0;
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               /* Find target extent list */
+               int     erp_idx = 0;
+               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+               base = erp->er_extbuf;
+               high = erp->er_extcount - 1;
+       } else {
+               base = ifp->if_u1.if_extents;
+               high = nextents - 1;
+       }
+       /* Binary search extent records */
+       while (low <= high) {
+               idx = (low + high) >> 1;
+               ep = base + idx;
+               startoff = xfs_bmbt_get_startoff(ep);
+               blockcount = xfs_bmbt_get_blockcount(ep);
+               if (bno < startoff) {
+                       high = idx - 1;
+               } else if (bno >= startoff + blockcount) {
+                       low = idx + 1;
+               } else {
+                       /* Convert back to file-based extent index */
+                       if (ifp->if_flags & XFS_IFEXTIREC) {
+                               idx += erp->er_extoff;
+                       }
+                       *idxp = idx;
+                       return ep;
+               }
+       }
+       /* Convert back to file-based extent index */
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               idx += erp->er_extoff;
+       }
+       if (bno >= startoff + blockcount) {
+               if (++idx == nextents) {
+                       ep = NULL;
+               } else {
+                       ep = xfs_iext_get_ext(ifp, idx);
+               }
+       }
+       *idxp = idx;
+       return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ */
+xfs_ext_irec_t *                       /* pointer to found extent record */
+xfs_iext_bno_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       int             *erp_idxp)      /* irec index of target ext list */
+{
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of extent irec's (lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+                       high = erp_idx - 1;
+               } else if (erp_next && bno >=
+                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+                       low = erp_idx + 1;
+               } else {
+                       break;
+               }
+       }
+       *erp_idxp = erp_idx;
+       return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
+       int             *erp_idxp,      /* pointer to target irec */
+       int             realloc)        /* new bytes were just added */
+{
+       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
+       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       ASSERT(page_idx >= 0);
+       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+
+       /* Binary search extent irec's */
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               prev = erp_idx > 0 ? erp - 1 : NULL;
+               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+                       high = erp_idx - 1;
+               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+                          (page_idx == erp->er_extoff + erp->er_extcount &&
+                           !realloc)) {
+                       low = erp_idx + 1;
+               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+                          erp->er_extcount == XFS_LINEAR_EXTS) {
+                       ASSERT(realloc);
+                       page_idx = 0;
+                       erp_idx++;
+                       erp = erp_idx < nlists ? erp + 1 : NULL;
+                       break;
+               } else {
+                       page_idx -= erp->er_extoff;
+                       break;
+               }
+       }
+       *idxp = page_idx;
+       *erp_idxp = erp_idx;
+       return(erp);
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+       if (nextents == 0) {
+               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       } else if (!ifp->if_real_bytes) {
+               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+       }
+       erp->er_extbuf = ifp->if_u1.if_extents;
+       erp->er_extcount = nextents;
+       erp->er_extoff = 0;
+
+       ifp->if_flags |= XFS_IFEXTIREC;
+       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+       ifp->if_u1.if_ext_irec = erp;
+
+       return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* index for new irec */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+       /* Resize indirection array */
+       xfs_iext_realloc_indirect(ifp, ++nlists *
+                                 sizeof(xfs_ext_irec_t));
+       /*
+        * Move records down in the array so the
+        * new page can use erp_idx.
+        */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = nlists - 1; i > erp_idx; i--) {
+               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+       }
+       ASSERT(i == erp_idx);
+
+       /* Initialize new extent record */
+       erp = ifp->if_u1.if_ext_irec;
+       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+       erp[erp_idx].er_extcount = 0;
+       erp[erp_idx].er_extoff = erp_idx > 0 ?
+               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+       return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* irec index to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       if (erp->er_extbuf) {
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+                       -erp->er_extcount);
+               kmem_free(erp->er_extbuf);
+       }
+       /* Compact extent records */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = erp_idx; i < nlists - 1; i++) {
+               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+       }
+       /*
+        * Manually free the last extent record from the indirection
+        * array.  A call to xfs_iext_realloc_indirect() with a size
+        * of zero would result in a call to xfs_iext_destroy() which
+        * would in turn call this function again, creating a nasty
+        * infinite loop.
+        */
+       if (--nlists) {
+               xfs_iext_realloc_indirect(ifp,
+                       nlists * sizeof(xfs_ext_irec_t));
+       } else {
+               kmem_free(ifp->if_u1.if_ext_irec);
+       }
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.  Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible.  The
+ * compaction policy is as follows:
+ *
+ *    Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ *      No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+       if (nextents == 0) {
+               xfs_iext_destroy(ifp);
+       } else if (nextents <= XFS_INLINE_EXTS) {
+               xfs_iext_indirect_to_direct(ifp);
+               xfs_iext_direct_to_inline(ifp, nextents);
+       } else if (nextents <= XFS_LINEAR_EXTS) {
+               xfs_iext_indirect_to_direct(ifp);
+       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+               xfs_iext_irec_compact_pages(ifp);
+       }
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
+       int             erp_idx = 0;    /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       while (erp_idx < nlists - 1) {
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               erp_next = erp + 1;
+               if (erp_next->er_extcount <=
+                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
+                       memcpy(&erp->er_extbuf[erp->er_extcount],
+                               erp_next->er_extbuf, erp_next->er_extcount *
+                               sizeof(xfs_bmbt_rec_t));
+                       erp->er_extcount += erp_next->er_extcount;
+                       /*
+                        * Free page before removing extent record
+                        * so er_extoffs don't get modified in
+                        * xfs_iext_irec_remove.
+                        */
+                       kmem_free(erp_next->er_extbuf);
+                       erp_next->er_extbuf = NULL;
+                       xfs_iext_irec_remove(ifp, erp_idx + 1);
+                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+               } else {
+                       erp_idx++;
+               }
+       }
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx,        /* irec index to update */
+       int             ext_diff)       /* number of new extents */
+{
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       for (i = erp_idx; i < nlists; i++) {
+               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+       }
+}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h

new file mode 100644 (file)

index 0000000..28661a0
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_FORK_H__
+#define        __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+       xfs_extnum_t    er_extoff;      /* extent offset in file */
+       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
+} xfs_ext_irec_t;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define        XFS_IEXT_BUFSZ          4096
+#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define        XFS_INLINE_EXTS         2
+#define        XFS_INLINE_DATA         32
+typedef struct xfs_ifork {
+       int                     if_bytes;       /* bytes in if_u1 */
+       int                     if_real_bytes;  /* bytes allocated in if_u1 */
+       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
+       short                   if_broot_bytes; /* bytes allocated for root */
+       unsigned char           if_flags;       /* per-fork flags */
+       union {
+               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
+               char            *if_data;       /* inline file data */
+       } if_u1;
+       union {
+               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+                                               /* very small file extents */
+               char            if_inline_data[XFS_INLINE_DATA];
+                                               /* very small file data */
+               xfs_dev_t       if_rdev;        /* dev number if special */
+               uuid_t          if_uuid;        /* mount point value */
+       } if_u2;
+} xfs_ifork_t;
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define        XFS_IFINLINE    0x01    /* Inline data is read in */
+#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
+#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w)            \
+       ((w) == XFS_DATA_FORK ? \
+               &(ip)->i_df : \
+               (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_IFORK_BOFF(ip) : \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+#define XFS_IFORK_ASIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+                       XFS_IFORK_BOFF(ip) : \
+               0)
+#define XFS_IFORK_SIZE(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_IFORK_DSIZE(ip) : \
+               XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_format : \
+               (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_format = (n)) : \
+               ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_nextents : \
+               (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_nextents = (n)) : \
+               ((ip)->i_d.di_anextents = (n)))
+#define XFS_IFORK_MAXEXT(ip, w) \
+       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+int            xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void           xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+                               struct xfs_inode_log_item *, int,
+                               struct xfs_buf *);
+void           xfs_idestroy_fork(struct xfs_inode *, int);
+void           xfs_idata_realloc(struct xfs_inode *, int, int);
+void           xfs_iroot_realloc(struct xfs_inode *, int, int);
+int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+                                 int);
+
+struct xfs_bmbt_rec_host *
+               xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+                               struct xfs_bmbt_irec *, int);
+void           xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+                                           xfs_extnum_t, int);
+void           xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void           xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void           xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void           xfs_iext_destroy(struct xfs_ifork *);
+struct xfs_bmbt_rec_host *
+               xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+                                    int);
+void           xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+               xfs_iext_irec_new(struct xfs_ifork *, int);
+void           xfs_iext_irec_remove(struct xfs_ifork *, int);
+void           xfs_iext_irec_compact(struct xfs_ifork *);
+void           xfs_iext_irec_compact_pages(struct xfs_ifork *);
+void           xfs_iext_irec_compact_full(struct xfs_ifork *);
+void           xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+
+extern struct kmem_zone        *xfs_ifork_zone;
+
+#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index f76ff52e43c0a4f5536163a61230f1ac89f77358..378081109844b09b2bbfd07dcb4214027fefe2c2 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -47,32 +47,44 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
   * inode core, and possibly one for the inode data/extents/b-tree root
   * and one for the inode attribute data/extents/b-tree root.
   */
-STATIC uint
+STATIC void
  xfs_inode_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
  {
         struct xfs_inode_log_item *iip = INODE_ITEM(lip);
         struct xfs_inode        *ip = iip->ili_inode;
-       uint                    nvecs = 2;
+
+       *nvecs += 2;
+       *nbytes += sizeof(struct xfs_inode_log_format) +
+                  xfs_icdinode_size(ip->i_d.di_version);
  
         switch (ip->i_d.di_format) {
         case XFS_DINODE_FMT_EXTENTS:
                 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
                     ip->i_d.di_nextents > 0 &&
-                   ip->i_df.if_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_bytes > 0) {
+                       /* worst case, doesn't subtract delalloc extents */
+                       *nbytes += XFS_IFORK_DSIZE(ip);
+                       *nvecs += 1;
+               }
                 break;
  
         case XFS_DINODE_FMT_BTREE:
                 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
-                   ip->i_df.if_broot_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_broot_bytes > 0) {
+                       *nbytes += ip->i_df.if_broot_bytes;
+                       *nvecs += 1;
+               }
                 break;
  
         case XFS_DINODE_FMT_LOCAL:
                 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
-                   ip->i_df.if_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_bytes > 0) {
+                       *nbytes += roundup(ip->i_df.if_bytes, 4);
+                       *nvecs += 1;
+               }
                 break;
  
         case XFS_DINODE_FMT_DEV:
@@ -85,7 +97,7 @@ xfs_inode_item_size(
         }
  
         if (!XFS_IFORK_Q(ip))
-               return nvecs;
+               return;
  
  
         /*
@@ -95,28 +107,33 @@ xfs_inode_item_size(
         case XFS_DINODE_FMT_EXTENTS:
                 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
                     ip->i_d.di_anextents > 0 &&
-                   ip->i_afp->if_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_bytes > 0) {
+                       /* worst case, doesn't subtract unused space */
+                       *nbytes += XFS_IFORK_ASIZE(ip);
+                       *nvecs += 1;
+               }
                 break;
  
         case XFS_DINODE_FMT_BTREE:
                 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
-                   ip->i_afp->if_broot_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_broot_bytes > 0) {
+                       *nbytes += ip->i_afp->if_broot_bytes;
+                       *nvecs += 1;
+               }
                 break;
  
         case XFS_DINODE_FMT_LOCAL:
                 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
-                   ip->i_afp->if_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_bytes > 0) {
+                       *nbytes += roundup(ip->i_afp->if_bytes, 4);
+                       *nvecs += 1;
+               }
                 break;
  
         default:
                 ASSERT(0);
                 break;
         }
-
-       return nvecs;
  }
  
  /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h

index 779812fb3d80b27c94d97f288fea9195db8be4fb..dce4d656768c32888521dc0c9007c6c2961b7c4e 100644 (file)
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -18,123 +18,13 @@
  #ifndef        __XFS_INODE_ITEM_H__
  #define        __XFS_INODE_ITEM_H__
  
-/*
- * This is the structure used to lay out an inode log item in the
- * log.  The size of the inline data/extents/b-tree root to be logged
- * (if any) is indicated in the ilf_dsize field.  Changes to this structure
- * must be added on to the end.
- */
-typedef struct xfs_inode_log_format {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
-
-/*
- * Flags for xfs_trans_log_inode flags field.
- */
-#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
-#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
-#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
-#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
-#define        XFS_ILOG_DEV    0x010   /* log the dev field */
-#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
-#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
-#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
-#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
-
-
-/*
- * The timestamps are dirty, but not necessarily anything else in the inode
- * core.  Unlike the other fields above this one must never make it to disk
- * in the ilf_fields of the inode_log_format, but is purely store in-memory in
- * ili_fields in the inode_log_item.
- */
-#define XFS_ILOG_TIMESTAMP     0x4000
-
-#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
-                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
-
-#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT)
-
-#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT)
-
-#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
-                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
-                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
-                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
-
-static inline int xfs_ilog_fbroot(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-static inline int xfs_ilog_fext(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
-static inline int xfs_ilog_fdata(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#ifdef __KERNEL__
+/* kernel only definitions */
  
  struct xfs_buf;
  struct xfs_bmbt_rec;
  struct xfs_inode;
  struct xfs_mount;
  
-
  typedef struct xfs_inode_log_item {
         xfs_log_item_t          ili_item;          /* common portion */
         struct xfs_inode        *ili_inode;        /* inode ptr */
@@ -151,7 +41,6 @@ typedef struct xfs_inode_log_item {
         xfs_inode_log_format_t  ili_format;        /* logged structure */
  } xfs_inode_log_item_t;
  
-
  static inline int xfs_inode_clean(xfs_inode_t *ip)
  {
         return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
@@ -165,6 +54,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool);
  extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
                                          xfs_inode_log_format_t *);
  
-#endif /* __KERNEL__ */
+extern struct kmem_zone        *xfs_ili_zone;
  
  #endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c

index 6e2bca5d44d67acb52a58115a6b9482fc04a5bc1..bdebc21078d7e83bac4347ad13a5f569ed4d6a0f 100644 (file)
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
@@ -32,17 +33,16 @@
  #include "xfs_error.h"
  #include "xfs_attr.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
  #include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
  #include "xfs_discard.h"
  #include "xfs_quota.h"
  #include "xfs_inode_item.h"
  #include "xfs_export.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
+#include "xfs_symlink.h"
  
  #include <linux/capability.h>
  #include <linux/dcache.h>
@@ -350,6 +350,40 @@ xfs_readlink_by_handle(
         return error;
  }
  
+int
+xfs_set_dmattrs(
+       xfs_inode_t     *ip,
+       u_int           evmask,
+       u_int16_t       state)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_trans_t     *tp;
+       int             error;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return XFS_ERROR(EPERM);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       ip->i_d.di_dmevmask = evmask;
+       ip->i_d.di_dmstate  = state;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       error = xfs_trans_commit(tp, 0);
+
+       return error;
+}
+
  STATIC int
  xfs_fssetdm_by_handle(
         struct file             *parfilp,
@@ -967,7 +1001,7 @@ xfs_ioctl_setattr(
          * first do an error checking pass.
          */
         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
         if (code)
                 goto error_return;
  
@@ -981,15 +1015,22 @@ xfs_ioctl_setattr(
          * to the file owner ID, except in cases where the
          * CAP_FSETID capability is applicable.
          */
-       if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+       if (!inode_owner_or_capable(VFS_I(ip))) {
                 code = XFS_ERROR(EPERM);
                 goto error_return;
         }
  
         /*
          * Do a quota reservation only if projid is actually going to change.
+        * Only allow changing of projid from init_user_ns since it is a
+        * non user namespace aware identifier.
          */
         if (mask & FSX_PROJID) {
+               if (current_user_ns() != &init_user_ns) {
+                       code = XFS_ERROR(EINVAL);
+                       goto error_return;
+               }
+
                 if (XFS_IS_QUOTA_RUNNING(mp) &&
                     XFS_IS_PQUOTA_ON(mp) &&
                     xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1103,7 +1144,7 @@ xfs_ioctl_setattr(
                  * cleared upon successful return from chown()
                  */
                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-                   !capable(CAP_FSETID))
+                   !inode_capable(VFS_I(ip), CAP_FSETID))
                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
  
                 /*
@@ -1328,6 +1369,75 @@ xfs_ioc_getbmapx(
         return 0;
  }
  
+int
+xfs_ioc_swapext(
+       xfs_swapext_t   *sxp)
+{
+       xfs_inode_t     *ip, *tip;
+       struct fd       f, tmp;
+       int             error = 0;
+
+       /* Pull information for the target fd */
+       f = fdget((int)sxp->sx_fdtarget);
+       if (!f.file) {
+               error = XFS_ERROR(EINVAL);
+               goto out;
+       }
+
+       if (!(f.file->f_mode & FMODE_WRITE) ||
+           !(f.file->f_mode & FMODE_READ) ||
+           (f.file->f_flags & O_APPEND)) {
+               error = XFS_ERROR(EBADF);
+               goto out_put_file;
+       }
+
+       tmp = fdget((int)sxp->sx_fdtmp);
+       if (!tmp.file) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_file;
+       }
+
+       if (!(tmp.file->f_mode & FMODE_WRITE) ||
+           !(tmp.file->f_mode & FMODE_READ) ||
+           (tmp.file->f_flags & O_APPEND)) {
+               error = XFS_ERROR(EBADF);
+               goto out_put_tmp_file;
+       }
+
+       if (IS_SWAPFILE(file_inode(f.file)) ||
+           IS_SWAPFILE(file_inode(tmp.file))) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       ip = XFS_I(file_inode(f.file));
+       tip = XFS_I(file_inode(tmp.file));
+
+       if (ip->i_mount != tip->i_mount) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       if (ip->i_ino == tip->i_ino) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               error = XFS_ERROR(EIO);
+               goto out_put_tmp_file;
+       }
+
+       error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+       fdput(tmp);
+ out_put_file:
+       fdput(f);
+ out:
+       return error;
+}
+
  /*
   * Note: some of the ioctl's return positive numbers as a
   * byte count indicating success, such as readlink_by_handle.
@@ -1472,7 +1582,7 @@ xfs_file_ioctl(
                 error = mnt_want_write_file(filp);
                 if (error)
                         return error;
-               error = xfs_swapext(&sxp);
+               error = xfs_ioc_swapext(&sxp);
                 mnt_drop_write_file(filp);
                 return -error;
         }
@@ -1610,23 +1720,23 @@ xfs_file_ioctl(
                 return -error;
  
         case XFS_IOC_FREE_EOFBLOCKS: {
-               struct xfs_eofblocks eofb;
+               struct xfs_fs_eofblocks eofb;
+               struct xfs_eofblocks keofb;
  
-               if (copy_from_user(&eofb, arg, sizeof(eofb)))
-                       return -XFS_ERROR(EFAULT);
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
  
-               if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
-                       return -XFS_ERROR(EINVAL);
+               if (mp->m_flags & XFS_MOUNT_RDONLY)
+                       return -XFS_ERROR(EROFS);
  
-               if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
-                       return -XFS_ERROR(EINVAL);
+               if (copy_from_user(&eofb, arg, sizeof(eofb)))
+                       return -XFS_ERROR(EFAULT);
  
-               if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
-                   memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
-                       return -XFS_ERROR(EINVAL);
+               error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+               if (error)
+                       return -error;
  
-               error = xfs_icache_free_eofblocks(mp, &eofb);
-               return -error;
+               return -xfs_icache_free_eofblocks(mp, &keofb);
         }
  
         default:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h

index d56173b34a2a55662575f79a4ba5426662b647c9..77c02c7900b6eb8d78276abdd7d0b6ec03c12e29 100644 (file)
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -27,6 +27,10 @@ xfs_ioc_space(
         unsigned int            cmd,
         xfs_flock64_t           *bf);
  
+int
+xfs_ioc_swapext(
+       xfs_swapext_t   *sxp);
+
  extern int
  xfs_find_handle(
         unsigned int            cmd,
@@ -82,4 +86,10 @@ xfs_file_compat_ioctl(
         unsigned int            cmd,
         unsigned long           arg);
  
+extern int
+xfs_set_dmattrs(
+       struct xfs_inode        *ip,
+       u_int                   evmask,
+       u_int16_t               state);
+
  #endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c

index c0c66259cc913d3a19df61efeab3bb86cb028aaf..d3ab9534307fcaa7e863be8965f80311090ce499 100644 (file)
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -33,8 +33,6 @@
  #include "xfs_inode.h"
  #include "xfs_itable.h"
  #include "xfs_error.h"
-#include "xfs_dfrag.h"
-#include "xfs_vnodeops.h"
  #include "xfs_fsops.h"
  #include "xfs_alloc.h"
  #include "xfs_rtalloc.h"
@@ -644,7 +642,7 @@ xfs_file_compat_ioctl(
                 error = mnt_want_write_file(filp);
                 if (error)
                         return error;
-               error = xfs_swapext(&sxp);
+               error = xfs_ioc_swapext(&sxp);
                 mnt_drop_write_file(filp);
                 return -error;
         }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index 6a7096422295d1d821f1e0bab397041c42f53de1..8d4d49b6fbf347b3add01ed4675b489a653a6dfb 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
@@ -32,13 +33,13 @@
  #include "xfs_inode_item.h"
  #include "xfs_btree.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_rtalloc.h"
  #include "xfs_error.h"
  #include "xfs_itable.h"
  #include "xfs_attr.h"
  #include "xfs_buf_item.h"
  #include "xfs_trans_space.h"
-#include "xfs_utils.h"
  #include "xfs_iomap.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
@@ -187,10 +188,8 @@ xfs_iomap_write_direct(
          * Allocate and setup the transaction
          */
         tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-       error = xfs_trans_reserve(tp, resblks,
-                       XFS_WRITE_LOG_RES(mp), resrtextents,
-                       XFS_TRANS_PERM_LOG_RES,
-                       XFS_WRITE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                 resblks, resrtextents);
         /*
          * Check for running out of space, note: need lock to return
          */
@@ -698,10 +697,8 @@ xfs_iomap_write_allocate(
                         tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
                         tp->t_flags |= XFS_TRANS_RESERVE;
                         nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       error = xfs_trans_reserve(tp, nres,
-                                       XFS_WRITE_LOG_RES(mp),
-                                       0, XFS_TRANS_PERM_LOG_RES,
-                                       XFS_WRITE_LOG_COUNT);
+                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                                 nres, 0);
                         if (error) {
                                 xfs_trans_cancel(tp, 0);
                                 return XFS_ERROR(error);
@@ -864,10 +861,8 @@ xfs_iomap_write_unwritten(
                 sb_start_intwrite(mp->m_super);
                 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
                 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
-               error = xfs_trans_reserve(tp, resblks,
-                               XFS_WRITE_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_WRITE_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                         resblks, 0);
                 if (error) {
                         xfs_trans_cancel(tp, 0);
                         return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c

index 96dda62d497b7e04a68a1a6ddfc579aececf8374..2b8952d9199bbd145473a48b326120b8d43ed9b6 100644 (file)
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_acl.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
@@ -29,16 +30,19 @@
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_rtalloc.h"
  #include "xfs_error.h"
  #include "xfs_itable.h"
  #include "xfs_attr.h"
  #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
  #include "xfs_inode_item.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
  
  #include <linux/capability.h>
  #include <linux/xattr.h>
@@ -87,10 +91,12 @@ xfs_init_security(
  static void
  xfs_dentry_to_name(
         struct xfs_name *namep,
-       struct dentry   *dentry)
+       struct dentry   *dentry,
+       int             mode)
  {
         namep->name = dentry->d_name.name;
         namep->len = dentry->d_name.len;
+       namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
  }
  
  STATIC void
@@ -106,7 +112,7 @@ xfs_cleanup_inode(
          * xfs_init_security we must back out.
          * ENOSPC can hit here, among other things.
          */
-       xfs_dentry_to_name(&teardown, dentry);
+       xfs_dentry_to_name(&teardown, dentry, 0);
  
         xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
         iput(inode);
@@ -146,7 +152,7 @@ xfs_vn_mknod(
                         mode &= ~current_umask();
         }
  
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, mode);
         error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
         if (unlikely(error))
                 goto out_free_acl;
@@ -207,7 +213,7 @@ xfs_vn_lookup(
         if (dentry->d_name.len >= MAXNAMELEN)
                 return ERR_PTR(-ENAMETOOLONG);
  
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, 0);
         error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
         if (unlikely(error)) {
                 if (unlikely(error != ENOENT))
@@ -234,7 +240,7 @@ xfs_vn_ci_lookup(
         if (dentry->d_name.len >= MAXNAMELEN)
                 return ERR_PTR(-ENAMETOOLONG);
  
-       xfs_dentry_to_name(&xname, dentry);
+       xfs_dentry_to_name(&xname, dentry, 0);
         error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
         if (unlikely(error)) {
                 if (unlikely(error != ENOENT))
@@ -269,7 +275,7 @@ xfs_vn_link(
         struct xfs_name name;
         int             error;
  
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, inode->i_mode);
  
         error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
         if (unlikely(error))
@@ -288,7 +294,7 @@ xfs_vn_unlink(
         struct xfs_name name;
         int             error;
  
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, 0);
  
         error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
         if (error)
@@ -318,7 +324,7 @@ xfs_vn_symlink(
  
         mode = S_IFLNK |
                 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, mode);
  
         error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
         if (unlikely(error))
@@ -350,12 +356,12 @@ xfs_vn_rename(
         struct xfs_name oname;
         struct xfs_name nname;
  
-       xfs_dentry_to_name(&oname, odentry);
-       xfs_dentry_to_name(&nname, ndentry);
+       xfs_dentry_to_name(&oname, odentry, 0);
+       xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
  
         return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
                            XFS_I(ndir), &nname, new_inode ?
-                                               XFS_I(new_inode) : NULL);
+                                               XFS_I(new_inode) : NULL);
  }
  
  /*
@@ -420,8 +426,8 @@ xfs_vn_getattr(
         stat->dev = inode->i_sb->s_dev;
         stat->mode = ip->i_d.di_mode;
         stat->nlink = ip->i_d.di_nlink;
-       stat->uid = ip->i_d.di_uid;
-       stat->gid = ip->i_d.di_gid;
+       stat->uid = inode->i_uid;
+       stat->gid = inode->i_gid;
         stat->ino = ip->i_ino;
         stat->atime = inode->i_atime;
         stat->mtime = inode->i_mtime;
@@ -485,8 +491,8 @@ xfs_setattr_nonsize(
         int                     mask = iattr->ia_valid;
         xfs_trans_t             *tp;
         int                     error;
-       uid_t                   uid = 0, iuid = 0;
-       gid_t                   gid = 0, igid = 0;
+       kuid_t                  uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+       kgid_t                  gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
         struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
         struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
  
@@ -522,13 +528,13 @@ xfs_setattr_nonsize(
                         uid = iattr->ia_uid;
                         qflags |= XFS_QMOPT_UQUOTA;
                 } else {
-                       uid = ip->i_d.di_uid;
+                       uid = inode->i_uid;
                 }
                 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
                         gid = iattr->ia_gid;
                         qflags |= XFS_QMOPT_GQUOTA;
                 }  else {
-                       gid = ip->i_d.di_gid;
+                       gid = inode->i_gid;
                 }
  
                 /*
@@ -538,14 +544,16 @@ xfs_setattr_nonsize(
                  */
                 ASSERT(udqp == NULL);
                 ASSERT(gdqp == NULL);
-               error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-                                        qflags, &udqp, &gdqp, NULL);
+               error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
+                                          xfs_kgid_to_gid(gid),
+                                          xfs_get_projid(ip),
+                                          qflags, &udqp, &gdqp, NULL);
                 if (error)
                         return error;
         }
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
         if (error)
                 goto out_dqrele;
  
@@ -561,8 +569,8 @@ xfs_setattr_nonsize(
                  * while we didn't have the inode locked, inode's dquot(s)
                  * would have changed also.
                  */
-               iuid = ip->i_d.di_uid;
-               igid = ip->i_d.di_gid;
+               iuid = inode->i_uid;
+               igid = inode->i_gid;
                 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
                 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
  
@@ -571,8 +579,8 @@ xfs_setattr_nonsize(
                  * going to change.
                  */
                 if (XFS_IS_QUOTA_RUNNING(mp) &&
-                   ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+                   ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+                    (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
                         ASSERT(tp);
                         error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                 NULL, capable(CAP_FOWNER) ?
@@ -602,17 +610,17 @@ xfs_setattr_nonsize(
                  * Change the ownerships and register quota modifications
                  * in the transaction.
                  */
-               if (iuid != uid) {
+               if (!uid_eq(iuid, uid)) {
                         if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
                                 ASSERT(mask & ATTR_UID);
                                 ASSERT(udqp);
                                 olddquot1 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_udquot, udqp);
                         }
-                       ip->i_d.di_uid = uid;
+                       ip->i_d.di_uid = xfs_kuid_to_uid(uid);
                         inode->i_uid = uid;
                 }
-               if (igid != gid) {
+               if (!gid_eq(igid, gid)) {
                         if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                 ASSERT(mask & ATTR_GID);
@@ -620,7 +628,7 @@ xfs_setattr_nonsize(
                                 olddquot2 = xfs_qm_vop_chown(tp, ip,
                                                         &ip->i_gdquot, gdqp);
                         }
-                       ip->i_d.di_gid = gid;
+                       ip->i_d.di_gid = xfs_kgid_to_gid(gid);
                         inode->i_gid = gid;
                 }
         }
@@ -807,9 +815,7 @@ xfs_setattr_size(
                 goto out_unlock;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                XFS_TRANS_PERM_LOG_RES,
-                                XFS_ITRUNCATE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error)
                 goto out_trans_cancel;
  
@@ -932,7 +938,7 @@ xfs_vn_update_time(
         trace_xfs_update_time(ip);
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return -error;
@@ -1173,8 +1179,8 @@ xfs_setup_inode(
  
         inode->i_mode   = ip->i_d.di_mode;
         set_nlink(inode, ip->i_d.di_nlink);
-       inode->i_uid    = ip->i_d.di_uid;
-       inode->i_gid    = ip->i_d.di_gid;
+       inode->i_uid    = xfs_uid_to_kuid(ip->i_d.di_uid);
+       inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
  
         switch (inode->i_mode & S_IFMT) {
         case S_IFBLK:
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h

index ef41c92ce66e9e8159a2c3b1319774e589877ce6..d81fb41205ec97b9a00ecc0789e99763adc5af71 100644 (file)
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -27,4 +27,17 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
  
  extern void xfs_setup_inode(struct xfs_inode *);
  
+/*
+ * Internal setattr interfaces.
+ */
+#define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
+#define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if op would block */
+#define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL         0x08    /* Don't call xfs_acl_chmod */
+#define XFS_ATTR_SYNC          0x10    /* synchronous operation required */
+
+extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
+                              int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
+
  #endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h

index 800f896a6cc48cdffc19f85d6a9dd4c30ef897d5..f9bb590acc0ebfd38a4aaaee3baaa5b0bf6ec505 100644 (file)
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,6 +32,38 @@
  # define XFS_BIG_INUMS 0
  #endif
  
+/*
+ * Kernel specific type declarations for XFS
+ */
+typedef signed char            __int8_t;
+typedef unsigned char          __uint8_t;
+typedef signed short int       __int16_t;
+typedef unsigned short int     __uint16_t;
+typedef signed int             __int32_t;
+typedef unsigned int           __uint32_t;
+typedef signed long long int   __int64_t;
+typedef unsigned long long int __uint64_t;
+
+typedef __uint32_t             inst_t;         /* an instruction */
+
+typedef __s64                  xfs_off_t;      /* <file offset> type */
+typedef unsigned long long     xfs_ino_t;      /* <inode> type */
+typedef __s64                  xfs_daddr_t;    /* <disk address> type */
+typedef char *                 xfs_caddr_t;    /* <core address> type */
+typedef __u32                  xfs_dev_t;
+typedef __u32                  xfs_nlink_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
  #include "xfs_types.h"
  
  #include "kmem.h"
@@ -114,8 +146,6 @@
  #define xfs_inherit_sync       xfs_params.inherit_sync.val
  #define xfs_inherit_nodump     xfs_params.inherit_nodump.val
  #define xfs_inherit_noatime    xfs_params.inherit_noatim.val
-#define xfs_buf_timer_centisecs        xfs_params.xfs_buf_timer.val
-#define xfs_buf_age_centisecs  xfs_params.xfs_buf_age.val
  #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
  #define xfs_rotorstep          xfs_params.rotorstep.val
  #define xfs_inherit_nodefrag   xfs_params.inherit_nodfrg.val
@@ -159,6 +189,32 @@
  #define MAX(a,b)       (max(a,b))
  #define howmany(x, y)  (((x)+((y)-1))/(y))
  
+/* Kernel uid/gid conversion. These are used to convert to/from the on disk
+ * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only, the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+{
+       return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+{
+       return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+{
+       return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+{
+       return make_kgid(&init_user_ns, gid);
+}
+
  /*
   * Various platform dependent calls that don't fit anywhere else
   */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index d852a2b3e1fdfae0c4fb5bf18452ec79fd03ab01..5372d58ef93a26220f0d916763e7f37de63d0050 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -614,7 +614,8 @@ xfs_log_mount(
         xfs_daddr_t     blk_offset,
         int             num_bblks)
  {
-       int             error;
+       int             error = 0;
+       int             min_logfsbs;
  
         if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
                 xfs_notice(mp, "Mounting Filesystem");
@@ -630,6 +631,50 @@ xfs_log_mount(
                 goto out;
         }
  
+       /*
+        * Validate the given log space and drop a critical message via syslog
+        * if the log size is too small that would lead to some unexpected
+        * situations in transaction log space reservation stage.
+        *
+        * Note: we can't just reject the mount if the validation fails.  This
+        * would mean that people would have to downgrade their kernel just to
+        * remedy the situation as there is no way to grow the log (short of
+        * black magic surgery with xfs_db).
+        *
+        * We can, however, reject mounts for CRC format filesystems, as the
+        * mkfs binary being used to make the filesystem should never create a
+        * filesystem with a log that is too small.
+        */
+       min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+       if (mp->m_sb.sb_logblocks < min_logfsbs) {
+               xfs_warn(mp,
+               "Log size %d blocks too small, minimum size is %d blocks",
+                        mp->m_sb.sb_logblocks, min_logfsbs);
+               error = EINVAL;
+       } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+               xfs_warn(mp,
+               "Log size %d blocks too large, maximum size is %lld blocks",
+                        mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+               error = EINVAL;
+       } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+               xfs_warn(mp,
+               "log size %lld bytes too large, maximum size is %lld bytes",
+                        XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+                        XFS_MAX_LOG_BYTES);
+               error = EINVAL;
+       }
+       if (error) {
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+                       ASSERT(0);
+                       goto out_free_log;
+               }
+               xfs_crit(mp,
+"Log size out of supported range. Continuing onwards, but if log hangs are\n"
+"experienced then please report this message in the bug report.");
+       }
+
         /*
          * Initialize the AIL now we have a log.
          */
@@ -720,7 +765,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
   * Unmount record used to have a string "Unmount filesystem--" in the
   * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
   * We just write the magic number now since that particular field isn't
- * currently architecture converted and "nUmount" is a bit foo.
+ * currently architecture converted and "Unmount" is a bit foo.
   * As far as I know, there weren't any dependencies on the old behaviour.
   */
  
@@ -1941,7 +1986,7 @@ xlog_print_tic_res(
  
         xfs_alert_tag(mp, XFS_PTAG_LOGRES,
                 "xlog_write: reservation ran out. Need to up reservation");
-       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+       xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
  }
  
  /*
@@ -2044,7 +2089,7 @@ xlog_write_setup_ophdr(
   * Set up the parameters of the region copy into the log. This has
   * to handle region write split across multiple log buffers - this
   * state is kept external to this function so that this code can
- * can be written in an obvious, self documenting manner.
+ * be written in an obvious, self documenting manner.
   */
  static int
  xlog_write_setup_copy(
@@ -3391,24 +3436,17 @@ xfs_log_ticket_get(
  }
  
  /*
- * Allocate and initialise a new log ticket.
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
   */
-struct xlog_ticket *
-xlog_ticket_alloc(
-       struct xlog     *log,
-       int             unit_bytes,
-       int             cnt,
-       char            client,
-       bool            permanent,
-       xfs_km_flags_t  alloc_flags)
+int
+xfs_log_calc_unit_res(
+       struct xfs_mount        *mp,
+       int                     unit_bytes)
  {
-       struct xlog_ticket *tic;
-       uint            num_headers;
-       int             iclog_space;
-
-       tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
-       if (!tic)
-               return NULL;
+       struct xlog             *log = mp->m_log;
+       int                     iclog_space;
+       uint                    num_headers;
  
         /*
          * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3483,20 +3521,43 @@ xlog_ticket_alloc(
         unit_bytes += log->l_iclog_hsize;
  
         /* for roundoff padding for transaction data and one for commit record */
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
-           log->l_mp->m_sb.sb_logsunit > 1) {
+       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
                 /* log su roundoff */
-               unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
+               unit_bytes += 2 * mp->m_sb.sb_logsunit;
         } else {
                 /* BB roundoff */
-               unit_bytes += 2*BBSIZE;
+               unit_bytes += 2 * BBSIZE;
          }
  
+       return unit_bytes;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+       struct xlog             *log,
+       int                     unit_bytes,
+       int                     cnt,
+       char                    client,
+       bool                    permanent,
+       xfs_km_flags_t          alloc_flags)
+{
+       struct xlog_ticket      *tic;
+       int                     unit_res;
+
+       tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+       if (!tic)
+               return NULL;
+
+       unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+
         atomic_set(&tic->t_ref, 1);
         tic->t_task             = current;
         INIT_LIST_HEAD(&tic->t_queue);
-       tic->t_unit_res         = unit_bytes;
-       tic->t_curr_res         = unit_bytes;
+       tic->t_unit_res         = unit_res;
+       tic->t_curr_res         = unit_res;
         tic->t_cnt              = cnt;
         tic->t_ocnt             = cnt;
         tic->t_tid              = prandom_u32();
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h

index fb630e496c12406c558b7cc53854bf0cd123ccaf..1c458487f000a42306cb44f14509d80fea0c2f02 100644 (file)
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,14 +18,30 @@
  #ifndef        __XFS_LOG_H__
  #define __XFS_LOG_H__
  
-/* get lsn fields */
-#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
-#define BLOCK_LSN(lsn) ((uint)(lsn))
+#include "xfs_log_format.h"
  
-/* this is used in a spot where we might otherwise double-endian-flip */
-#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+struct xfs_log_vec {
+       struct xfs_log_vec      *lv_next;       /* next lv in build list */
+       int                     lv_niovecs;     /* number of iovecs in lv */
+       struct xfs_log_iovec    *lv_iovecp;     /* iovec array */
+       struct xfs_log_item     *lv_item;       /* owner */
+       char                    *lv_buf;        /* formatted buffer */
+       int                     lv_buf_len;     /* size of formatted buffer */
+       int                     lv_size;        /* size of allocated lv */
+};
+
+#define XFS_LOG_VEC_ORDERED    (-1)
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+       struct xfs_log_callback *cb_next;
+       void                    (*cb_func)(void *, int);
+       void                    *cb_arg;
+} xfs_log_callback_t;
  
-#ifdef __KERNEL__
  /*
   * By comparing each component, we don't have to worry about extra
   * endian issues in treating two 32 bit numbers as one 64 bit number
@@ -59,67 +75,6 @@ static inline xfs_lsn_t      _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
   */
  #define XFS_LOG_SYNC           0x1
  
-#endif /* __KERNEL__ */
-
-
-/* Log Clients */
-#define XFS_TRANSACTION                0x69
-#define XFS_VOLUME             0x2
-#define XFS_LOG                        0xaa
-
-
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT          1
-#define XLOG_REG_TYPE_BCHUNK           2
-#define XLOG_REG_TYPE_EFI_FORMAT       3
-#define XLOG_REG_TYPE_EFD_FORMAT       4
-#define XLOG_REG_TYPE_IFORMAT          5
-#define XLOG_REG_TYPE_ICORE            6
-#define XLOG_REG_TYPE_IEXT             7
-#define XLOG_REG_TYPE_IBROOT           8
-#define XLOG_REG_TYPE_ILOCAL           9
-#define XLOG_REG_TYPE_IATTR_EXT                10
-#define XLOG_REG_TYPE_IATTR_BROOT      11
-#define XLOG_REG_TYPE_IATTR_LOCAL      12
-#define XLOG_REG_TYPE_QFORMAT          13
-#define XLOG_REG_TYPE_DQUOT            14
-#define XLOG_REG_TYPE_QUOTAOFF         15
-#define XLOG_REG_TYPE_LRHEADER         16
-#define XLOG_REG_TYPE_UNMOUNT          17
-#define XLOG_REG_TYPE_COMMIT           18
-#define XLOG_REG_TYPE_TRANSHDR         19
-#define XLOG_REG_TYPE_ICREATE          20
-#define XLOG_REG_TYPE_MAX              20
-
-typedef struct xfs_log_iovec {
-       void            *i_addr;        /* beginning address of region */
-       int             i_len;          /* length in bytes of region */
-       uint            i_type;         /* type of region */
-} xfs_log_iovec_t;
-
-struct xfs_log_vec {
-       struct xfs_log_vec      *lv_next;       /* next lv in build list */
-       int                     lv_niovecs;     /* number of iovecs in lv */
-       struct xfs_log_iovec    *lv_iovecp;     /* iovec array */
-       struct xfs_log_item     *lv_item;       /* owner */
-       char                    *lv_buf;        /* formatted buffer */
-       int                     lv_buf_len;     /* size of formatted buffer */
-};
-
-#define XFS_LOG_VEC_ORDERED    (-1)
-
-/*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
-       struct xfs_log_callback *cb_next;
-       void                    (*cb_func)(void *, int);
-       void                    *cb_arg;
-} xfs_log_callback_t;
-
-
-#ifdef __KERNEL__
  /* Log manager interfaces */
  struct xfs_mount;
  struct xlog_in_core;
@@ -188,5 +143,4 @@ void        xfs_log_work_queue(struct xfs_mount *mp);
  void   xfs_log_worker(struct work_struct *work);
  void   xfs_log_quiesce(struct xfs_mount *mp);
  
-#endif
  #endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index 02b9cf3f8252baeade5d4e99b3e88853a7b50b98..cfe97973ba36d1d586c3704b536aebce2e391af1 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -80,6 +80,83 @@ xlog_cil_init_post_recovery(
                                                                 log->l_curr_block);
  }
  
+STATIC int
+xlog_cil_lv_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_vec      *lv)
+{
+       int     index;
+       char    *ptr;
+
+       /* format new vectors into array */
+       lip->li_ops->iop_format(lip, lv->lv_iovecp);
+
+       /* copy data into existing array */
+       ptr = lv->lv_buf;
+       for (index = 0; index < lv->lv_niovecs; index++) {
+               struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+               memcpy(ptr, vec->i_addr, vec->i_len);
+               vec->i_addr = ptr;
+               ptr += vec->i_len;
+       }
+
+       /*
+        * some size calculations for log vectors over-estimate, so the caller
+        * doesn't know the amount of space actually used by the item. Return
+        * the byte count to the caller so they can check and store it
+        * appropriately.
+        */
+       return ptr - lv->lv_buf;
+}
+
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+       struct xlog             *log,
+       struct xfs_log_vec      *lv,
+       struct xfs_log_vec      *old_lv,
+       int                     *diff_len,
+       int                     *diff_iovecs)
+{
+       /* Account for the new LV being passed in */
+       if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+               *diff_len += lv->lv_buf_len;
+               *diff_iovecs += lv->lv_niovecs;
+       }
+
+       /*
+        * If there is no old LV, this is the first time we've seen the item in
+        * this CIL context and so we need to pin it. If we are replacing the
+        * old_lv, then remove the space it accounts for and free it.
+        */
+       if (!old_lv)
+               lv->lv_item->li_ops->iop_pin(lv->lv_item);
+       else if (old_lv != lv) {
+               ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+
+               *diff_len -= old_lv->lv_buf_len;
+               *diff_iovecs -= old_lv->lv_niovecs;
+               kmem_free(old_lv);
+       }
+
+       /* attach new log vector to log item */
+       lv->lv_item->li_lv = lv;
+
+       /*
+        * If this is the first time the item is being committed to the
+        * CIL, store the sequence number on the log item so we can
+        * tell in future commits whether this is the first checkpoint
+        * the item is being committed into.
+        */
+       if (!lv->lv_item->li_seq)
+               lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
  /*
   * Format log item into a flat buffers
   *
@@ -106,35 +183,39 @@ xlog_cil_init_post_recovery(
   * format the regions into the iclog as though they are being formatted
   * directly out of the objects themselves.
   */
-static struct xfs_log_vec *
-xlog_cil_prepare_log_vecs(
-       struct xfs_trans        *tp)
+static void
+xlog_cil_insert_format_items(
+       struct xlog             *log,
+       struct xfs_trans        *tp,
+       int                     *diff_len,
+       int                     *diff_iovecs)
  {
         struct xfs_log_item_desc *lidp;
-       struct xfs_log_vec      *lv = NULL;
-       struct xfs_log_vec      *ret_lv = NULL;
  
  
         /* Bail out if we didn't find a log item.  */
         if (list_empty(&tp->t_items)) {
                 ASSERT(0);
-               return NULL;
+               return;
         }
  
         list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-               struct xfs_log_vec *new_lv;
-               void    *ptr;
-               int     index;
-               int     len = 0;
-               uint    niovecs;
+               struct xfs_log_item *lip = lidp->lid_item;
+               struct xfs_log_vec *lv;
+               struct xfs_log_vec *old_lv;
+               int     niovecs = 0;
+               int     nbytes = 0;
+               int     buf_size;
                 bool    ordered = false;
  
                 /* Skip items which aren't dirty in this transaction. */
                 if (!(lidp->lid_flags & XFS_LID_DIRTY))
                         continue;
  
+               /* get number of vecs and size of data to be stored */
+               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
                 /* Skip items that do not have any vectors for writing */
-               niovecs = IOP_SIZE(lidp->lid_item);
                 if (!niovecs)
                         continue;
  
@@ -146,109 +227,63 @@ xlog_cil_prepare_log_vecs(
                 if (niovecs == XFS_LOG_VEC_ORDERED) {
                         ordered = true;
                         niovecs = 0;
+                       nbytes = 0;
                 }
  
-               new_lv = kmem_zalloc(sizeof(*new_lv) +
-                               niovecs * sizeof(struct xfs_log_iovec),
-                               KM_SLEEP|KM_NOFS);
-
-               new_lv->lv_item = lidp->lid_item;
-               new_lv->lv_niovecs = niovecs;
-               if (ordered) {
-                       /* track as an ordered logvec */
-                       new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
-                       goto next;
-               }
-
-               /* The allocated iovec region lies beyond the log vector. */
-               new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+               /* grab the old item if it exists for reservation accounting */
+               old_lv = lip->li_lv;
  
-               /* build the vector array and calculate it's length */
-               IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
-               for (index = 0; index < new_lv->lv_niovecs; index++)
-                       len += new_lv->lv_iovecp[index].i_len;
+               /* calc buffer size */
+               buf_size = sizeof(struct xfs_log_vec) + nbytes +
+                               niovecs * sizeof(struct xfs_log_iovec);
  
-               new_lv->lv_buf_len = len;
-               new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
-                               KM_SLEEP|KM_NOFS);
-               ptr = new_lv->lv_buf;
+               /* compare to existing item size */
+               if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+                       /* same or smaller, optimise common overwrite case */
+                       lv = lip->li_lv;
+                       lv->lv_next = NULL;
  
-               for (index = 0; index < new_lv->lv_niovecs; index++) {
-                       struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
+                       if (ordered)
+                               goto insert;
  
-                       memcpy(ptr, vec->i_addr, vec->i_len);
-                       vec->i_addr = ptr;
-                       ptr += vec->i_len;
-               }
-               ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
-
-next:
-               if (!ret_lv)
-                       ret_lv = new_lv;
-               else
-                       lv->lv_next = new_lv;
-               lv = new_lv;
-       }
-
-       return ret_lv;
-}
-
-/*
- * Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
- */
-STATIC void
-xfs_cil_prepare_item(
-       struct xlog             *log,
-       struct xfs_log_vec      *lv,
-       int                     *len,
-       int                     *diff_iovecs)
-{
-       struct xfs_log_vec      *old = lv->lv_item->li_lv;
+                       /*
+                        * set the item up as though it is a new insertion so
+                        * that the space reservation accounting is correct.
+                        */
+                       *diff_iovecs -= lv->lv_niovecs;
+                       *diff_len -= lv->lv_buf_len;
  
-       if (old) {
-               /* existing lv on log item, space used is a delta */
-               ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
-                       old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+                       /* Ensure the lv is set up according to ->iop_size */
+                       lv->lv_niovecs = niovecs;
+                       lv->lv_buf = (char *)lv + buf_size - nbytes;
  
-               /*
-                * If the new item is ordered, keep the old one that is already
-                * tracking dirty or ordered regions
-                */
-               if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
-                       ASSERT(!lv->lv_buf);
-                       kmem_free(lv);
-                       return;
+                       lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+                       goto insert;
                 }
  
-               *len += lv->lv_buf_len - old->lv_buf_len;
-               *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
-               kmem_free(old->lv_buf);
-               kmem_free(old);
-       } else {
-               /* new lv, must pin the log item */
-               ASSERT(!lv->lv_item->li_lv);
-
-               if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
-                       *len += lv->lv_buf_len;
-                       *diff_iovecs += lv->lv_niovecs;
+               /* allocate new data chunk */
+               lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+               lv->lv_item = lip;
+               lv->lv_size = buf_size;
+               lv->lv_niovecs = niovecs;
+               if (ordered) {
+                       /* track as an ordered logvec */
+                       ASSERT(lip->li_lv == NULL);
+                       lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       goto insert;
                 }
-               IOP_PIN(lv->lv_item);
  
-       }
+               /* The allocated iovec region lies beyond the log vector. */
+               lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
  
-       /* attach new log vector to log item */
-       lv->lv_item->li_lv = lv;
+               /* The allocated data region lies beyond the iovec region */
+               lv->lv_buf = (char *)lv + buf_size - nbytes;
  
-       /*
-        * If this is the first time the item is being committed to the
-        * CIL, store the sequence number on the log item so we can
-        * tell in future commits whether this is the first checkpoint
-        * the item is being committed into.
-        */
-       if (!lv->lv_item->li_seq)
-               lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+               lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+insert:
+               ASSERT(lv->lv_buf_len <= nbytes);
+               xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+       }
  }
  
  /*
@@ -261,53 +296,47 @@ xfs_cil_prepare_item(
  static void
  xlog_cil_insert_items(
         struct xlog             *log,
-       struct xfs_log_vec      *log_vector,
-       struct xlog_ticket      *ticket)
+       struct xfs_trans        *tp)
  {
         struct xfs_cil          *cil = log->l_cilp;
         struct xfs_cil_ctx      *ctx = cil->xc_ctx;
-       struct xfs_log_vec      *lv;
+       struct xfs_log_item_desc *lidp;
         int                     len = 0;
         int                     diff_iovecs = 0;
         int                     iclog_space;
  
-       ASSERT(log_vector);
+       ASSERT(tp);
  
         /*
-        * Do all the accounting aggregation and switching of log vectors
-        * around in a separate loop to the insertion of items into the CIL.
-        * Then we can do a separate loop to update the CIL within a single
-        * lock/unlock pair. This reduces the number of round trips on the CIL
-        * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
-        * hold time for the transaction commit.
-        *
-        * If this is the first time the item is being placed into the CIL in
-        * this context, pin it so it can't be written to disk until the CIL is
-        * flushed to the iclog and the iclog written to disk.
-        *
          * We can do this safely because the context can't checkpoint until we
          * are done so it doesn't matter exactly how we update the CIL.
          */
+       xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+
+       /*
+        * Now (re-)position everything modified at the tail of the CIL.
+        * We do this here so we only need to take the CIL lock once during
+        * the transaction commit.
+        */
         spin_lock(&cil->xc_cil_lock);
-       for (lv = log_vector; lv; ) {
-               struct xfs_log_vec *next = lv->lv_next;
+       list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+               struct xfs_log_item     *lip = lidp->lid_item;
  
-               ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
-               lv->lv_next = NULL;
+               /* Skip items which aren't dirty in this transaction. */
+               if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                       continue;
  
-               /*
-                * xfs_cil_prepare_item() may free the lv, so move the item on
-                * the CIL first.
-                */
-               list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
-               xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-               lv = next;
+               list_move_tail(&lip->li_cil, &cil->xc_cil);
         }
  
         /* account for space used by new iovec headers  */
         len += diff_iovecs * sizeof(xlog_op_header_t);
         ctx->nvecs += diff_iovecs;
  
+       /* attach the transaction to the CIL if it has any busy extents */
+       if (!list_empty(&tp->t_busy))
+               list_splice_init(&tp->t_busy, &ctx->busy_extents);
+
         /*
          * Now transfer enough transaction reservation to the context ticket
          * for the checkpoint. The context ticket is special - the unit
@@ -316,10 +345,8 @@ xlog_cil_insert_items(
          * during the transaction commit.
          */
         if (ctx->ticket->t_curr_res == 0) {
-               /* first commit in checkpoint, steal the header reservation */
-               ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
                 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-               ticket->t_curr_res -= ctx->ticket->t_unit_res;
+               tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
         }
  
         /* do we need space for more log record headers? */
@@ -333,10 +360,10 @@ xlog_cil_insert_items(
                 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
                 ctx->ticket->t_unit_res += hdrs;
                 ctx->ticket->t_curr_res += hdrs;
-               ticket->t_curr_res -= hdrs;
-               ASSERT(ticket->t_curr_res >= len);
+               tp->t_ticket->t_curr_res -= hdrs;
+               ASSERT(tp->t_ticket->t_curr_res >= len);
         }
-       ticket->t_curr_res -= len;
+       tp->t_ticket->t_curr_res -= len;
         ctx->space_used += len;
  
         spin_unlock(&cil->xc_cil_lock);
@@ -350,7 +377,6 @@ xlog_cil_free_logvec(
  
         for (lv = log_vector; lv; ) {
                 struct xfs_log_vec *next = lv->lv_next;
-               kmem_free(lv->lv_buf);
                 kmem_free(lv);
                 lv = next;
         }
@@ -376,9 +402,9 @@ xlog_cil_committed(
         xfs_extent_busy_clear(mp, &ctx->busy_extents,
                              (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
  
-       spin_lock(&ctx->cil->xc_cil_lock);
+       spin_lock(&ctx->cil->xc_push_lock);
         list_del(&ctx->committing);
-       spin_unlock(&ctx->cil->xc_cil_lock);
+       spin_unlock(&ctx->cil->xc_push_lock);
  
         xlog_cil_free_logvec(ctx->lv_chain);
  
@@ -433,7 +459,7 @@ xlog_cil_push(
         down_write(&cil->xc_ctx_lock);
         ctx = cil->xc_ctx;
  
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         push_seq = cil->xc_push_seq;
         ASSERT(push_seq <= ctx->sequence);
  
@@ -444,10 +470,10 @@ xlog_cil_push(
          */
         if (list_empty(&cil->xc_cil)) {
                 cil->xc_push_seq = 0;
-               spin_unlock(&cil->xc_cil_lock);
+               spin_unlock(&cil->xc_push_lock);
                 goto out_skip;
         }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
  
  
         /* check for a previously pushed seqeunce */
@@ -515,9 +541,9 @@ xlog_cil_push(
          * that higher sequences will wait for us to write out a commit record
          * before they do.
          */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         list_add(&ctx->committing, &cil->xc_committing);
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
         up_write(&cil->xc_ctx_lock);
  
         /*
@@ -552,7 +578,7 @@ xlog_cil_push(
          * order the commit records so replay will get them in the right order.
          */
  restart:
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
                 /*
                  * Higher sequences will wait for this one so skip them.
@@ -565,11 +591,11 @@ restart:
                          * It is still being pushed! Wait for the push to
                          * complete, then start again from the beginning.
                          */
-                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
                         goto restart;
                 }
         }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
  
         /* xfs_log_done always frees the ticket on error. */
         commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
@@ -588,10 +614,10 @@ restart:
          * callbacks to the iclog we can assign the commit LSN to the context
          * and wake up anyone who is waiting for the commit to complete.
          */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         ctx->commit_lsn = commit_lsn;
         wake_up_all(&cil->xc_commit_wait);
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
  
         /* release the hounds! */
         return xfs_log_release_iclog(log->l_mp, commit_iclog);
@@ -644,12 +670,12 @@ xlog_cil_push_background(
         if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
                 return;
  
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         if (cil->xc_push_seq < cil->xc_current_sequence) {
                 cil->xc_push_seq = cil->xc_current_sequence;
                 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
         }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
  
  }
  
@@ -672,14 +698,14 @@ xlog_cil_push_foreground(
          * If the CIL is empty or we've already pushed the sequence then
          * there's no work we need to do.
          */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
-               spin_unlock(&cil->xc_cil_lock);
+               spin_unlock(&cil->xc_push_lock);
                 return;
         }
  
         cil->xc_push_seq = push_seq;
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
  
         /* do the push now */
         xlog_cil_push(log);
@@ -706,43 +732,25 @@ xfs_log_commit_cil(
         int                     flags)
  {
         struct xlog             *log = mp->m_log;
+       struct xfs_cil          *cil = log->l_cilp;
         int                     log_flags = 0;
-       struct xfs_log_vec      *log_vector;
  
         if (flags & XFS_TRANS_RELEASE_LOG_RES)
                 log_flags = XFS_LOG_REL_PERM_RESERV;
  
-       /*
-        * Do all the hard work of formatting items (including memory
-        * allocation) outside the CIL context lock. This prevents stalling CIL
-        * pushes when we are low on memory and a transaction commit spends a
-        * lot of time in memory reclaim.
-        */
-       log_vector = xlog_cil_prepare_log_vecs(tp);
-       if (!log_vector)
-               return ENOMEM;
-
         /* lock out background commit */
-       down_read(&log->l_cilp->xc_ctx_lock);
-       if (commit_lsn)
-               *commit_lsn = log->l_cilp->xc_ctx->sequence;
+       down_read(&cil->xc_ctx_lock);
  
-       /* xlog_cil_insert_items() destroys log_vector list */
-       xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+       xlog_cil_insert_items(log, tp);
  
         /* check we didn't blow the reservation */
         if (tp->t_ticket->t_curr_res < 0)
-               xlog_print_tic_res(log->l_mp, tp->t_ticket);
+               xlog_print_tic_res(mp, tp->t_ticket);
  
-       /* attach the transaction to the CIL if it has any busy extents */
-       if (!list_empty(&tp->t_busy)) {
-               spin_lock(&log->l_cilp->xc_cil_lock);
-               list_splice_init(&tp->t_busy,
-                                       &log->l_cilp->xc_ctx->busy_extents);
-               spin_unlock(&log->l_cilp->xc_cil_lock);
-       }
+       tp->t_commit_lsn = cil->xc_ctx->sequence;
+       if (commit_lsn)
+               *commit_lsn = tp->t_commit_lsn;
  
-       tp->t_commit_lsn = *commit_lsn;
         xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
         xfs_trans_unreserve_and_mod_sb(tp);
  
@@ -757,11 +765,11 @@ xfs_log_commit_cil(
          * the log items. This affects (at least) processing of stale buffers,
          * inodes and EFIs.
          */
-       xfs_trans_free_items(tp, *commit_lsn, 0);
+       xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
  
         xlog_cil_push_background(log);
  
-       up_read(&log->l_cilp->xc_ctx_lock);
+       up_read(&cil->xc_ctx_lock);
         return 0;
  }
  
@@ -800,7 +808,7 @@ xlog_cil_force_lsn(
          * on commits for those as well.
          */
  restart:
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
         list_for_each_entry(ctx, &cil->xc_committing, committing) {
                 if (ctx->sequence > sequence)
                         continue;
@@ -809,7 +817,7 @@ restart:
                          * It is still being pushed! Wait for the push to
                          * complete, then start again from the beginning.
                          */
-                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
                         goto restart;
                 }
                 if (ctx->sequence != sequence)
@@ -817,7 +825,7 @@ restart:
                 /* found it! */
                 commit_lsn = ctx->commit_lsn;
         }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
         return commit_lsn;
  }
  
@@ -875,6 +883,7 @@ xlog_cil_init(
         INIT_LIST_HEAD(&cil->xc_cil);
         INIT_LIST_HEAD(&cil->xc_committing);
         spin_lock_init(&cil->xc_cil_lock);
+       spin_lock_init(&cil->xc_push_lock);
         init_rwsem(&cil->xc_ctx_lock);
         init_waitqueue_head(&cil->xc_commit_wait);
  
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h

new file mode 100644 (file)

index 0000000..31e3a06
--- /dev/null
+++ b/fs/xfs/xfs_log_format.h
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and intepreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+typedef __uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS                2
+#define XLOG_MAX_ICLOGS                8
+#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
+#define XLOG_VERSION_1         1
+#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE  (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE       512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR     3
+
+#define XLOG_REC_SHIFT(log) \
+       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+       return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+               return be32_to_cpu(*((__be32 *)ptr + 1));
+       else
+               return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION                0x69
+#define XFS_VOLUME             0x2
+#define XFS_LOG                        0xaa
+
+#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT          1
+#define XLOG_REG_TYPE_BCHUNK           2
+#define XLOG_REG_TYPE_EFI_FORMAT       3
+#define XLOG_REG_TYPE_EFD_FORMAT       4
+#define XLOG_REG_TYPE_IFORMAT          5
+#define XLOG_REG_TYPE_ICORE            6
+#define XLOG_REG_TYPE_IEXT             7
+#define XLOG_REG_TYPE_IBROOT           8
+#define XLOG_REG_TYPE_ILOCAL           9
+#define XLOG_REG_TYPE_IATTR_EXT                10
+#define XLOG_REG_TYPE_IATTR_BROOT      11
+#define XLOG_REG_TYPE_IATTR_LOCAL      12
+#define XLOG_REG_TYPE_QFORMAT          13
+#define XLOG_REG_TYPE_DQUOT            14
+#define XLOG_REG_TYPE_QUOTAOFF         15
+#define XLOG_REG_TYPE_LRHEADER         16
+#define XLOG_REG_TYPE_UNMOUNT          17
+#define XLOG_REG_TYPE_COMMIT           18
+#define XLOG_REG_TYPE_TRANSHDR         19
+#define XLOG_REG_TYPE_ICREATE          20
+#define XLOG_REG_TYPE_MAX              20
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS       0x01    /* Start a new transaction */
+#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS    0x08    /* Cont this trans into new region */
+#define XLOG_END_TRANS         0x10    /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+       __be32     oh_tid;      /* transaction id of operation  :  4 b */
+       __be32     oh_len;      /* bytes in data region         :  4 b */
+       __u8       oh_clientid; /* who sent me this             :  1 b */
+       __u8       oh_flags;    /*                              :  1 b */
+       __u16      oh_res2;     /* 32 bit align                 :  2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
+       __be32    h_cycle;      /* write cycle of log                   :  4 */
+       __be32    h_version;    /* LR version                           :  4 */
+       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
+       __be64    h_lsn;        /* lsn of this LR                       :  8 */
+       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
+       __le32    h_crc;        /* crc of log record                    :  4 */
+       __be32    h_prev_block; /* block number to previous LR          :  4 */
+       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
+       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+       /* new fields */
+       __be32    h_fmt;        /* format of log record                 :  4 */
+       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
+       __be32    h_size;       /* iclog size                           :  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+       __be32    xh_cycle;     /* write cycle of log                   : 4 */
+       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+       xlog_rec_header_t       hic_header;
+       xlog_rec_ext_header_t   hic_xheader;
+       char                    hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+       void            *i_addr;        /* beginning address of region */
+       int             i_len;          /* length in bytes of region */
+       uint            i_type;         /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+       uint            th_magic;               /* magic number */
+       uint            th_type;                /* transaction type */
+       __int32_t       th_tid;                 /* transaction id (unused) */
+       uint            th_num_items;           /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
+
+/*
+ * Log item types.
+ */
+#define        XFS_LI_EFI              0x1236
+#define        XFS_LI_EFD              0x1237
+#define        XFS_LI_IUNLINK          0x1238
+#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
+#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
+#define        XFS_LI_DQUOT            0x123d
+#define        XFS_LI_QUOTAOFF         0x123e
+#define        XFS_LI_ICREATE          0x123f
+
+#define XFS_LI_TYPE_DESC \
+       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
+       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
+       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
+       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
+       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
+       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
+       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
+       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
+
+/*
+ * Transaction types.  Used to distinguish types of buffers.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE     1
+#define XFS_TRANS_SETATTR_SIZE         2
+#define XFS_TRANS_INACTIVE             3
+#define XFS_TRANS_CREATE               4
+#define XFS_TRANS_CREATE_TRUNC         5
+#define XFS_TRANS_TRUNCATE_FILE                6
+#define XFS_TRANS_REMOVE               7
+#define XFS_TRANS_LINK                 8
+#define XFS_TRANS_RENAME               9
+#define XFS_TRANS_MKDIR                        10
+#define XFS_TRANS_RMDIR                        11
+#define XFS_TRANS_SYMLINK              12
+#define XFS_TRANS_SET_DMATTRS          13
+#define XFS_TRANS_GROWFS               14
+#define XFS_TRANS_STRAT_WRITE          15
+#define XFS_TRANS_DIOSTRAT             16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define        XFS_TRANS_WRITEID               18
+#define        XFS_TRANS_ADDAFORK              19
+#define        XFS_TRANS_ATTRINVAL             20
+#define        XFS_TRANS_ATRUNCATE             21
+#define        XFS_TRANS_ATTR_SET              22
+#define        XFS_TRANS_ATTR_RM               23
+#define        XFS_TRANS_ATTR_FLAG             24
+#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
+#define XFS_TRANS_QM_SBCHANGE          26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1               27
+#define XFS_TRANS_DUMMY2               28
+#define XFS_TRANS_QM_QUOTAOFF          29
+#define XFS_TRANS_QM_DQALLOC           30
+#define XFS_TRANS_QM_SETQLIM           31
+#define XFS_TRANS_QM_DQCLUSTER         32
+#define XFS_TRANS_QM_QINOCREATE                33
+#define XFS_TRANS_QM_QUOTAOFF_END      34
+#define XFS_TRANS_SB_UNIT              35
+#define XFS_TRANS_FSYNC_TS             36
+#define        XFS_TRANS_GROWFSRT_ALLOC        37
+#define        XFS_TRANS_GROWFSRT_ZERO         38
+#define        XFS_TRANS_GROWFSRT_FREE         39
+#define        XFS_TRANS_SWAPEXT               40
+#define        XFS_TRANS_SB_COUNT              41
+#define        XFS_TRANS_CHECKPOINT            42
+#define        XFS_TRANS_ICREATE               43
+#define        XFS_TRANS_TYPE_MAX              43
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
+       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
+       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
+       { XFS_TRANS_CREATE,             "CREATE" }, \
+       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
+       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
+       { XFS_TRANS_REMOVE,             "REMOVE" }, \
+       { XFS_TRANS_LINK,               "LINK" }, \
+       { XFS_TRANS_RENAME,             "RENAME" }, \
+       { XFS_TRANS_MKDIR,              "MKDIR" }, \
+       { XFS_TRANS_RMDIR,              "RMDIR" }, \
+       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
+       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
+       { XFS_TRANS_GROWFS,             "GROWFS" }, \
+       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
+       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
+       { XFS_TRANS_WRITEID,            "WRITEID" }, \
+       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
+       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
+       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
+       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
+       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
+       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
+       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
+       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
+       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
+       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
+       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
+       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
+       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
+       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
+       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
+       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
+       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
+       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
+       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
+       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
+       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
+       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
+       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
+       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+       struct xfs_log_item     *lid_item;
+       struct list_head        lid_trans;
+       unsigned char           lid_flags;
+};
+
+#define XFS_LID_DIRTY          0x1
+
+/*
+ * Values for t_flags.
+ */
+#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
+#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
+#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
+#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
+                                          count in superblock */
+
+/*
+ * Values for call flags parameter.
+ */
+#define        XFS_TRANS_RELEASE_LOG_RES       0x4
+#define        XFS_TRANS_ABORT                 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define        XFS_TRANS_SB_ICOUNT             0x00000001
+#define        XFS_TRANS_SB_IFREE              0x00000002
+#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
+#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
+#define        XFS_TRANS_SB_FREXTENTS          0x00000010
+#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
+#define        XFS_TRANS_SB_DBLOCKS            0x00000040
+#define        XFS_TRANS_SB_AGCOUNT            0x00000080
+#define        XFS_TRANS_SB_IMAXPCT            0x00000100
+#define        XFS_TRANS_SB_REXTSIZE           0x00000200
+#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
+#define        XFS_TRANS_SB_RBLOCKS            0x00000800
+#define        XFS_TRANS_SB_REXTENTS           0x00001000
+#define        XFS_TRANS_SB_REXTSLOG           0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer
+ * reference count values.  This determine how hard the buffer
+ * cache tries to hold onto the buffer.
+ */
+#define        XFS_AGF_REF             4
+#define        XFS_AGI_REF             4
+#define        XFS_AGFL_REF            3
+#define        XFS_INO_BTREE_REF       3
+#define        XFS_ALLOC_BTREE_REF     2
+#define        XFS_BMAP_BTREE_REF      2
+#define        XFS_DIR_BTREE_REF       2
+#define        XFS_INO_REF             2
+#define        XFS_ATTR_BTREE_REF      1
+#define        XFS_DQUOT_REF           1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
+
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log.  The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
+#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
+#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
+#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
+#define        XFS_ILOG_DEV    0x010   /* log the dev field */
+#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
+#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
+#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
+#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely store in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP     0x4000
+
+#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+
+#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT)
+
+#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT)
+
+#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
+                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+typedef struct xfs_ictimestamp {
+       __int32_t       t_sec;          /* timestamp seconds */
+       __int32_t       t_nsec;         /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
+ *       in xfs_dinode.h except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+       __uint16_t      di_mode;        /* mode and type of file */
+       __int8_t        di_version;     /* inode version */
+       __int8_t        di_format;      /* format of di_c data */
+       __uint16_t      di_onlink;      /* old number of links to file */
+       __uint32_t      di_uid;         /* owner's user id */
+       __uint32_t      di_gid;         /* owner's group id */
+       __uint32_t      di_nlink;       /* number of links to file */
+       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
+       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
+       __uint8_t       di_pad[6];      /* unused, zeroed space */
+       __uint16_t      di_flushiter;   /* incremented on flush */
+       xfs_ictimestamp_t di_atime;     /* time last accessed */
+       xfs_ictimestamp_t di_mtime;     /* time last modified */
+       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+       xfs_fsize_t     di_size;        /* number of bytes in file */
+       xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
+       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
+       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
+       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
+       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
+       __int8_t        di_aformat;     /* format of attr fork's data */
+       __uint32_t      di_dmevmask;    /* DMIG event mask */
+       __uint16_t      di_dmstate;     /* DMIG state info */
+       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
+       __uint32_t      di_gen;         /* generation number */
+
+       /* di_next_unlinked is the only non-core field in the old dinode */
+       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
+
+       /* start of the extended dinode, writable fields */
+       __uint32_t      di_crc;         /* CRC of the inode */
+       __uint64_t      di_changecount; /* number of attribute changes */
+       xfs_lsn_t       di_lsn;         /* flush sequence */
+       __uint64_t      di_flags2;      /* more random flags */
+       __uint8_t       di_pad2[16];    /* more padding for future expansion */
+
+       /* fields only written to during inode creation */
+       xfs_ictimestamp_t di_crtime;    /* time created */
+       xfs_ino_t       di_ino;         /* inode number */
+       uuid_t          di_uuid;        /* UUID of the filesystem */
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+
+static inline uint xfs_icdinode_size(int version)
+{
+       if (version == 3)
+               return sizeof(struct xfs_icdinode);
+       return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
+/*
+ * Buffer Log Format defintions
+ *
+ * These are the physical dirty bitmap defintions for the log format structure.
+ */
+#define        XFS_BLF_CHUNK           128
+#define        XFS_BLF_SHIFT           7
+#define        BIT_TO_WORD_SHIFT       5
+#define        NBWORD                  (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define        XFS_BLF_INODE_BUF       (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define        XFS_BLF_CANCEL          (1<<1)
+
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define        XFS_BLF_UDQUOT_BUF      (1<<2)
+#define XFS_BLF_PDQUOT_BUF     (1<<3)
+#define        XFS_BLF_GDQUOT_BUF      (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.  The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+       unsigned short  blf_type;       /* buf log item type indicator */
+       unsigned short  blf_size;       /* size of this item */
+       ushort          blf_flags;      /* misc state */
+       ushort          blf_len;        /* number of blocks in this buf */
+       __int64_t       blf_blkno;      /* starting blkno of this buf */
+       unsigned int    blf_map_size;   /* used size of data bitmap in words */
+       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS  5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+       XFS_BLFT_UNKNOWN_BUF = 0,
+       XFS_BLFT_UDQUOT_BUF,
+       XFS_BLFT_PDQUOT_BUF,
+       XFS_BLFT_GDQUOT_BUF,
+       XFS_BLFT_BTREE_BUF,
+       XFS_BLFT_AGF_BUF,
+       XFS_BLFT_AGFL_BUF,
+       XFS_BLFT_AGI_BUF,
+       XFS_BLFT_DINO_BUF,
+       XFS_BLFT_SYMLINK_BUF,
+       XFS_BLFT_DIR_BLOCK_BUF,
+       XFS_BLFT_DIR_DATA_BUF,
+       XFS_BLFT_DIR_FREE_BUF,
+       XFS_BLFT_DIR_LEAF1_BUF,
+       XFS_BLFT_DIR_LEAFN_BUF,
+       XFS_BLFT_DA_NODE_BUF,
+       XFS_BLFT_ATTR_LEAF_BUF,
+       XFS_BLFT_ATTR_RMT_BUF,
+       XFS_BLFT_SB_BUF,
+       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+       blf->blf_flags &= ~XFS_BLFT_MASK;
+       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+       xfs_dfsbno_t    ext_start;
+       xfs_extlen_t    ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+       __uint32_t      ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.  The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_t            efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.  The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_t            efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+       __uint16_t              qlf_type;      /* dquot log item type */
+       __uint16_t              qlf_size;      /* size of this item */
+       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
+       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
+       __int32_t               qlf_len;       /* len of dquot buffer */
+       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+       unsigned short          qf_type;        /* quotaoff log item type */
+       unsigned short          qf_size;        /* size of this item */
+       unsigned int            qf_flags;       /* USR and/or GRP */
+       char                    qf_pad[12];     /* padding for future */
+} xfs_qoff_logformat_t;
+
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT     \
+               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD     \
+               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD     \
+               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+                                XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+       __uint16_t      icl_type;       /* type of log format structure */
+       __uint16_t      icl_size;       /* size of log format structure */
+       __be32          icl_ag;         /* ag being allocated in */
+       __be32          icl_agbno;      /* start block of inode range */
+       __be32          icl_count;      /* number of inodes to initialise */
+       __be32          icl_isize;      /* size of inodes */
+       __be32          icl_length;     /* length of extent to initialise */
+       __be32          icl_gen;        /* inode generation number to use */
+};
+
+int    xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int    xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index b9ea262dd1c2c7575fee738fba6e71575180924a..136654b9400df9b28a40415b1fbca3c9b7d6a958 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -24,51 +24,13 @@ struct xlog_ticket;
  struct xfs_mount;
  
  /*
- * Macros, structures, prototypes for internal log manager use.
+ * Flags for log structure
   */
-
-#define XLOG_MIN_ICLOGS                2
-#define XLOG_MAX_ICLOGS                8
-#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
-#define XLOG_VERSION_1         1
-#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
-#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
-#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
-#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
-#define XLOG_MAX_RECORD_BSIZE  (256*1024)
-#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
-#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
-#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
-#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
-                                 (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
-
-#define XLOG_HEADER_SIZE       512
-
-#define XLOG_REC_SHIFT(log) \
-       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-#define XLOG_TOTAL_REC_SHIFT(log) \
-       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-
-static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
-{
-       return ((xfs_lsn_t)cycle << 32) | block;
-}
-
-static inline uint xlog_get_cycle(char *ptr)
-{
-       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
-               return be32_to_cpu(*((__be32 *)ptr + 1));
-       else
-               return be32_to_cpu(*(__be32 *)ptr);
-}
-
-#define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
-
-#ifdef __KERNEL__
+#define XLOG_ACTIVE_RECOVERY   0x2     /* in the middle of recovery */
+#define        XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
+#define XLOG_IO_ERROR          0x8     /* log hit an I/O error, and being
+                                          shutdown */
+#define XLOG_TAIL_WARN         0x10    /* log tail verify warning issued */
  
  /*
   * get client id from packed copy.
@@ -101,27 +63,7 @@ static inline uint xlog_get_client_id(__be32 i)
  #define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
  #define XLOG_STATE_ALL      0x7FFF /* All possible valid flags */
  #define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
-#endif /* __KERNEL__ */
  
-/*
- * Flags to log operation header
- *
- * The first write of a new transaction will be preceded with a start
- * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
- * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
- * the remainder of the current active in-core log, it is split up into
- * multiple regions.  Each partial region will be marked with a
- * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
- *
- */
-#define XLOG_START_TRANS       0x01    /* Start a new transaction */
-#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
-#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
-#define XLOG_WAS_CONT_TRANS    0x08    /* Cont this trans into new region */
-#define XLOG_END_TRANS         0x10    /* End a continued transaction */
-#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
-
-#ifdef __KERNEL__
  /*
   * Flags to log ticket
   */
@@ -132,22 +74,6 @@ static inline uint xlog_get_client_id(__be32 i)
         { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
         { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
  
-#endif /* __KERNEL__ */
-
-#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
-
-/*
- * Flags for log structure
- */
-#define XLOG_ACTIVE_RECOVERY   0x2     /* in the middle of recovery */
-#define        XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
-#define XLOG_IO_ERROR          0x8     /* log hit an I/O error, and being
-                                          shutdown */
-#define XLOG_TAIL_WARN         0x10    /* log tail verify warning issued */
-
-typedef __uint32_t xlog_tid_t;
-
-#ifdef __KERNEL__
  /*
   * Below are states for covering allocation transactions.
   * By covering, we mean changing the h_tail_lsn in the last on-disk
@@ -223,7 +149,6 @@ typedef __uint32_t xlog_tid_t;
  
  #define XLOG_COVER_OPS         5
  
-
  /* Ticket reservation region accounting */ 
  #define XLOG_TIC_LEN_MAX       15
  
@@ -258,64 +183,6 @@ typedef struct xlog_ticket {
         xlog_res_t         t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : 8 * 15 */ 
  } xlog_ticket_t;
  
-#endif
-
-
-typedef struct xlog_op_header {
-       __be32     oh_tid;      /* transaction id of operation  :  4 b */
-       __be32     oh_len;      /* bytes in data region         :  4 b */
-       __u8       oh_clientid; /* who sent me this             :  1 b */
-       __u8       oh_flags;    /*                              :  1 b */
-       __u16      oh_res2;     /* 32 bit align                 :  2 b */
-} xlog_op_header_t;
-
-
-/* valid values for h_fmt */
-#define XLOG_FMT_UNKNOWN  0
-#define XLOG_FMT_LINUX_LE 1
-#define XLOG_FMT_LINUX_BE 2
-#define XLOG_FMT_IRIX_BE  3
-
-/* our fmt */
-#ifdef XFS_NATIVE_HOST
-#define XLOG_FMT XLOG_FMT_LINUX_BE
-#else
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#endif
-
-typedef struct xlog_rec_header {
-       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
-       __be32    h_cycle;      /* write cycle of log                   :  4 */
-       __be32    h_version;    /* LR version                           :  4 */
-       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
-       __be64    h_lsn;        /* lsn of this LR                       :  8 */
-       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
-       __le32    h_crc;        /* crc of log record                    :  4 */
-       __be32    h_prev_block; /* block number to previous LR          :  4 */
-       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
-       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
-       /* new fields */
-       __be32    h_fmt;        /* format of log record                 :  4 */
-       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
-       __be32    h_size;       /* iclog size                           :  4 */
-} xlog_rec_header_t;
-
-typedef struct xlog_rec_ext_header {
-       __be32    xh_cycle;     /* write cycle of log                   : 4 */
-       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
-} xlog_rec_ext_header_t;
-
-#ifdef __KERNEL__
-
-/*
- * Quite misnamed, because this union lays out the actual on-disk log buffer.
- */
-typedef union xlog_in_core2 {
-       xlog_rec_header_t       hic_header;
-       xlog_rec_ext_header_t   hic_xheader;
-       char                    hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
-
  /*
   * - A log record header is 512 bytes.  There is plenty of room to grow the
   *     xlog_rec_header_t into the reserved space.
@@ -411,14 +278,17 @@ struct xfs_cil {
         struct xlog             *xc_log;
         struct list_head        xc_cil;
         spinlock_t              xc_cil_lock;
+
+       struct rw_semaphore     xc_ctx_lock ____cacheline_aligned_in_smp;
         struct xfs_cil_ctx      *xc_ctx;
-       struct rw_semaphore     xc_ctx_lock;
+
+       spinlock_t              xc_push_lock ____cacheline_aligned_in_smp;
+       xfs_lsn_t               xc_push_seq;
         struct list_head        xc_committing;
         wait_queue_head_t       xc_commit_wait;
         xfs_lsn_t               xc_current_sequence;
         struct work_struct      xc_push_work;
-       xfs_lsn_t               xc_push_seq;
-};
+} ____cacheline_aligned_in_smp;
  
  /*
   * The amount of log space we allow the CIL to aggregate is difficult to size.
@@ -686,6 +556,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
         schedule();
         remove_wait_queue(wq, &wait);
  }
-#endif /* __KERNEL__ */
  
  #endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c

index 7681b19aa5dc565a9807005ff3f1d6428a22f016..7c0c1fdc728b4ff6e18da1a4f0b46edde2337e4e 100644 (file)
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,7 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
@@ -41,7 +41,6 @@
  #include "xfs_extfree_item.h"
  #include "xfs_trans_priv.h"
  #include "xfs_quota.h"
-#include "xfs_utils.h"
  #include "xfs_cksum.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
@@ -51,10 +50,12 @@
  #include "xfs_symlink.h"
  #include "xfs_da_btree.h"
  #include "xfs_dir2_format.h"
-#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
  #include "xfs_attr_leaf.h"
  #include "xfs_attr_remote.h"
  
+#define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
+
  STATIC int
  xlog_find_zeroed(
         struct xlog     *,
@@ -607,7 +608,7 @@ out:
  
  /*
   * Head is defined to be the point of the log where the next log write
- * write could go.  This means that incomplete LR writes at the end are
+ * could go.  This means that incomplete LR writes at the end are
   * eliminated when calculating the head.  We aren't guaranteed that previous
   * LR have complete transactions.  We only know that a cycle number of
   * current cycle number -1 won't be present in the log if we start writing
@@ -963,6 +964,7 @@ xlog_find_tail(
         }
         if (!found) {
                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               xlog_put_bp(bp);
                 ASSERT(0);
                 return XFS_ERROR(EIO);
         }
@@ -1144,7 +1146,8 @@ xlog_find_zeroed(
                  */
                 xfs_warn(log->l_mp,
                         "Log inconsistent or not a log (last==0, first!=1)");
-               return XFS_ERROR(EINVAL);
+               error = XFS_ERROR(EINVAL);
+               goto bp_err;
         }
  
         /* we have a partially zeroed log */
@@ -1766,19 +1769,11 @@ xlog_recover_buffer_pass1(
  
  /*
   * Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table.  If it does then return 1
- * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it is, return the cancel
+ * buffer structure to the caller.
   */
-STATIC int
-xlog_check_buffer_cancelled(
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
         struct xlog             *log,
         xfs_daddr_t             blkno,
         uint                    len,
@@ -1787,22 +1782,16 @@ xlog_check_buffer_cancelled(
         struct list_head        *bucket;
         struct xfs_buf_cancel   *bcp;
  
-       if (log->l_buf_cancel_table == NULL) {
-               /*
-                * There is nothing in the table built in pass one,
-                * so this buffer must not be cancelled.
-                */
+       if (!log->l_buf_cancel_table) {
+               /* empty table means no cancelled buffers in the log */
                 ASSERT(!(flags & XFS_BLF_CANCEL));
-               return 0;
+               return NULL;
         }
  
-       /*
-        * Search for an entry in the  cancel table that matches our buffer.
-        */
         bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
         list_for_each_entry(bcp, bucket, bc_list) {
                 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
-                       goto found;
+                       return bcp;
         }
  
         /*
@@ -1810,9 +1799,32 @@ xlog_check_buffer_cancelled(
          * that the buffer is NOT cancelled.
          */
         ASSERT(!(flags & XFS_BLF_CANCEL));
-       return 0;
+       return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0.  If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+       struct xlog             *log,
+       xfs_daddr_t             blkno,
+       uint                    len,
+       ushort                  flags)
+{
+       struct xfs_buf_cancel   *bcp;
+
+       bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+       if (!bcp)
+               return 0;
  
-found:
         /*
          * We've go a match, so return 1 so that the recovery of this buffer
          * is cancelled.  If this buffer is actually a buffer cancel log
@@ -1946,6 +1958,104 @@ xlog_recover_do_inode_buffer(
         return 0;
  }
  
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
+ * extract the LSN of the existing object in the buffer based on it's current
+ * magic number.  If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       __uint32_t              magic32;
+       __uint16_t              magic16;
+       __uint16_t              magicda;
+       void                    *blk = bp->b_addr;
+
+       /* v4 filesystems always recover immediately */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               goto recover_immediately;
+
+       magic32 = be32_to_cpu(*(__be32 *)blk);
+       switch (magic32) {
+       case XFS_ABTB_CRC_MAGIC:
+       case XFS_ABTC_CRC_MAGIC:
+       case XFS_ABTB_MAGIC:
+       case XFS_ABTC_MAGIC:
+       case XFS_IBT_CRC_MAGIC:
+       case XFS_IBT_MAGIC:
+               return be64_to_cpu(
+                               ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+       case XFS_BMAP_CRC_MAGIC:
+       case XFS_BMAP_MAGIC:
+               return be64_to_cpu(
+                               ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+       case XFS_AGF_MAGIC:
+               return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+       case XFS_AGFL_MAGIC:
+               return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+       case XFS_AGI_MAGIC:
+               return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+       case XFS_SYMLINK_MAGIC:
+               return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+       case XFS_DIR3_BLOCK_MAGIC:
+       case XFS_DIR3_DATA_MAGIC:
+       case XFS_DIR3_FREE_MAGIC:
+               return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+       case XFS_ATTR3_RMT_MAGIC:
+               return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+       case XFS_SB_MAGIC:
+               return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+       default:
+               break;
+       }
+
+       magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+       switch (magicda) {
+       case XFS_DIR3_LEAF1_MAGIC:
+       case XFS_DIR3_LEAFN_MAGIC:
+       case XFS_DA3_NODE_MAGIC:
+               return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+       default:
+               break;
+       }
+
+       /*
+        * We do individual object checks on dquot and inode buffers as they
+        * have their own individual LSN records. Also, we could have a stale
+        * buffer here, so we have to at least recognise these buffer types.
+        *
+        * A notd complexity here is inode unlinked list processing - it logs
+        * the inode directly in the buffer, but we don't know which inodes have
+        * been modified, and there is no global buffer LSN. Hence we need to
+        * recover all inode buffer types immediately. This problem will be
+        * fixed by logical logging of the unlinked list modifications.
+        */
+       magic16 = be16_to_cpu(*(__be16 *)blk);
+       switch (magic16) {
+       case XFS_DQUOT_MAGIC:
+       case XFS_DINODE_MAGIC:
+               goto recover_immediately;
+       default:
+               break;
+       }
+
+       /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+       return (xfs_lsn_t)-1;
+
+}
+
  /*
   * Validate the recovered buffer is of the correct type and attach the
   * appropriate buffer operations to them for writeback. Magic numbers are in a
@@ -1955,7 +2065,7 @@ xlog_recover_do_inode_buffer(
   *     inside a struct xfs_da_blkinfo at the start of the buffer.
   */
  static void
-xlog_recovery_validate_buf_type(
+xlog_recover_validate_buf_type(
         struct xfs_mount        *mp,
         struct xfs_buf          *bp,
         xfs_buf_log_format_t    *buf_f)
@@ -2234,7 +2344,7 @@ xlog_recover_do_reg_buffer(
          * just avoid the verification stage for non-crc filesystems
          */
         if (xfs_sb_version_hascrc(&mp->m_sb))
-               xlog_recovery_validate_buf_type(mp, bp, buf_f);
+               xlog_recover_validate_buf_type(mp, bp, buf_f);
  }
  
  /*
@@ -2366,7 +2476,7 @@ xfs_qm_dqcheck(
  
  /*
   * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
   * (ie. USR or GRP), then just toss this buffer away; don't recover it.
   * Else, treat it as a regular buffer and do recovery.
   */
@@ -2425,20 +2535,22 @@ xlog_recover_do_dquot_buffer(
   * over the log during recovery.  During the first we build a table of
   * those buffers which have been cancelled, and during the second we
   * only replay those buffers which do not have corresponding cancel
- * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table.  See xlog_recover_buffer_pass[1,2] above
   * for more details on the implementation of the table of cancel records.
   */
  STATIC int
  xlog_recover_buffer_pass2(
         struct xlog                     *log,
         struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
  {
         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
         xfs_mount_t             *mp = log->l_mp;
         xfs_buf_t               *bp;
         int                     error;
         uint                    buf_flags;
+       xfs_lsn_t               lsn;
  
         /*
          * In this pass we only want to recover all the buffers which have
@@ -2463,10 +2575,17 @@ xlog_recover_buffer_pass2(
         error = bp->b_error;
         if (error) {
                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
-               xfs_buf_relse(bp);
-               return error;
+               goto out_release;
         }
  
+       /*
+        * recover the buffer only if we get an LSN from it and it's less than
+        * the lsn of the transaction we are replaying.
+        */
+       lsn = xlog_recover_get_buf_lsn(mp, bp);
+       if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+               goto out_release;
+
         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
         } else if (buf_f->blf_flags &
@@ -2476,7 +2595,7 @@ xlog_recover_buffer_pass2(
                 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
         }
         if (error)
-               return XFS_ERROR(error);
+               goto out_release;
  
         /*
          * Perform delayed write on the buffer.  Asynchronous writes will be
@@ -2505,6 +2624,7 @@ xlog_recover_buffer_pass2(
                 xfs_buf_delwri_queue(bp, buffer_list);
         }
  
+out_release:
         xfs_buf_relse(bp);
         return error;
  }
@@ -2513,7 +2633,8 @@ STATIC int
  xlog_recover_inode_pass2(
         struct xlog                     *log,
         struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
  {
         xfs_inode_log_format_t  *in_f;
         xfs_mount_t             *mp = log->l_mp;
@@ -2592,6 +2713,20 @@ xlog_recover_inode_pass2(
                 goto error;
         }
  
+       /*
+        * If the inode has an LSN in it, recover the inode only if it's less
+        * than the lsn of the transaction we are replaying.
+        */
+       if (dip->di_version >= 3) {
+               xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       trace_xfs_log_recover_inode_skip(log, in_f);
+                       error = 0;
+                       goto out_release;
+               }
+       }
+
         /*
          * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
          * are transactional and if ordering is necessary we can determine that
@@ -2781,6 +2916,8 @@ write_inode_buffer:
         ASSERT(bp->b_target->bt_mount == mp);
         bp->b_iodone = xlog_recover_iodone;
         xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
         xfs_buf_relse(bp);
  error:
         if (need_free)
@@ -2822,7 +2959,8 @@ STATIC int
  xlog_recover_dquot_pass2(
         struct xlog                     *log,
         struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
  {
         xfs_mount_t             *mp = log->l_mp;
         xfs_buf_t               *bp;
@@ -2896,6 +3034,19 @@ xlog_recover_dquot_pass2(
                 return XFS_ERROR(EIO);
         }
  
+       /*
+        * If the dquot has an LSN in it, recover the dquot only if it's less
+        * than the lsn of the transaction we are replaying.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+               xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       goto out_release;
+               }
+       }
+
         memcpy(ddq, recddq, item->ri_buf[1].i_len);
         if (xfs_sb_version_hascrc(&mp->m_sb)) {
                 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2906,9 +3057,10 @@ xlog_recover_dquot_pass2(
         ASSERT(bp->b_target->bt_mount == mp);
         bp->b_iodone = xlog_recover_iodone;
         xfs_buf_delwri_queue(bp, buffer_list);
-       xfs_buf_relse(bp);
  
-       return (0);
+out_release:
+       xfs_buf_relse(bp);
+       return 0;
  }
  
  /*
@@ -3116,6 +3268,106 @@ xlog_recover_free_trans(
         kmem_free(trans);
  }
  
+STATIC void
+xlog_recover_buffer_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
+       struct xfs_mount                *mp = log->l_mp;
+
+       if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+                       buf_f->blf_len, buf_f->blf_flags)) {
+               return;
+       }
+
+       xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+                               buf_f->blf_len, NULL);
+}
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_inode_log_format     ilf_buf;
+       struct xfs_inode_log_format     *ilfp;
+       struct xfs_mount                *mp = log->l_mp;
+       int                     error;
+
+       if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+               ilfp = item->ri_buf[0].i_addr;
+       } else {
+               ilfp = &ilf_buf;
+               memset(ilfp, 0, sizeof(*ilfp));
+               error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+               if (error)
+                       return;
+       }
+
+       if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+               return;
+
+       xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+                               ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_mount        *mp = log->l_mp;
+       struct xfs_disk_dquot   *recddq;
+       struct xfs_dq_logformat *dq_f;
+       uint                    type;
+
+
+       if (mp->m_qflags == 0)
+               return;
+
+       recddq = item->ri_buf[1].i_addr;
+       if (recddq == NULL)
+               return;
+       if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+               return;
+
+       type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+       ASSERT(type);
+       if (log->l_quotaoffs_flag & type)
+               return;
+
+       dq_f = item->ri_buf[0].i_addr;
+       ASSERT(dq_f);
+       ASSERT(dq_f->qlf_len == 1);
+
+       xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+                         XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       switch (ITEM_TYPE(item)) {
+       case XFS_LI_BUF:
+               xlog_recover_buffer_ra_pass2(log, item);
+               break;
+       case XFS_LI_INODE:
+               xlog_recover_inode_ra_pass2(log, item);
+               break;
+       case XFS_LI_DQUOT:
+               xlog_recover_dquot_ra_pass2(log, item);
+               break;
+       case XFS_LI_EFI:
+       case XFS_LI_EFD:
+       case XFS_LI_QUOTAOFF:
+       default:
+               break;
+       }
+}
+
  STATIC int
  xlog_recover_commit_pass1(
         struct xlog                     *log,
@@ -3155,15 +3407,18 @@ xlog_recover_commit_pass2(
  
         switch (ITEM_TYPE(item)) {
         case XFS_LI_BUF:
-               return xlog_recover_buffer_pass2(log, buffer_list, item);
+               return xlog_recover_buffer_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
         case XFS_LI_INODE:
-               return xlog_recover_inode_pass2(log, buffer_list, item);
+               return xlog_recover_inode_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
         case XFS_LI_EFI:
                 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
         case XFS_LI_EFD:
                 return xlog_recover_efd_pass2(log, item);
         case XFS_LI_DQUOT:
-               return xlog_recover_dquot_pass2(log, buffer_list, item);
+               return xlog_recover_dquot_pass2(log, buffer_list, item,
+                                               trans->r_lsn);
         case XFS_LI_ICREATE:
                 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
         case XFS_LI_QUOTAOFF:
@@ -3177,6 +3432,26 @@ xlog_recover_commit_pass2(
         }
  }
  
+STATIC int
+xlog_recover_items_pass2(
+       struct xlog                     *log,
+       struct xlog_recover             *trans,
+       struct list_head                *buffer_list,
+       struct list_head                *item_list)
+{
+       struct xlog_recover_item        *item;
+       int                             error = 0;
+
+       list_for_each_entry(item, item_list, ri_list) {
+               error = xlog_recover_commit_pass2(log, trans,
+                                         buffer_list, item);
+               if (error)
+                       return error;
+       }
+
+       return error;
+}
+
  /*
   * Perform the transaction.
   *
@@ -3189,9 +3464,16 @@ xlog_recover_commit_trans(
         struct xlog_recover     *trans,
         int                     pass)
  {
-       int                     error = 0, error2;
-       xlog_recover_item_t     *item;
-       LIST_HEAD               (buffer_list);
+       int                             error = 0;
+       int                             error2;
+       int                             items_queued = 0;
+       struct xlog_recover_item        *item;
+       struct xlog_recover_item        *next;
+       LIST_HEAD                       (buffer_list);
+       LIST_HEAD                       (ra_list);
+       LIST_HEAD                       (done_list);
+
+       #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
  
         hlist_del(&trans->r_list);
  
@@ -3199,14 +3481,22 @@ xlog_recover_commit_trans(
         if (error)
                 return error;
  
-       list_for_each_entry(item, &trans->r_itemq, ri_list) {
+       list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
                 switch (pass) {
                 case XLOG_RECOVER_PASS1:
                         error = xlog_recover_commit_pass1(log, trans, item);
                         break;
                 case XLOG_RECOVER_PASS2:
-                       error = xlog_recover_commit_pass2(log, trans,
-                                                         &buffer_list, item);
+                       xlog_recover_ra_pass2(log, item);
+                       list_move_tail(&item->ri_list, &ra_list);
+                       items_queued++;
+                       if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+                               error = xlog_recover_items_pass2(log, trans,
+                                               &buffer_list, &ra_list);
+                               list_splice_tail_init(&ra_list, &done_list);
+                               items_queued = 0;
+                       }
+
                         break;
                 default:
                         ASSERT(0);
@@ -3216,9 +3506,19 @@ xlog_recover_commit_trans(
                         goto out;
         }
  
+out:
+       if (!list_empty(&ra_list)) {
+               if (!error)
+                       error = xlog_recover_items_pass2(log, trans,
+                                       &buffer_list, &ra_list);
+               list_splice_tail_init(&ra_list, &done_list);
+       }
+
+       if (!list_empty(&done_list))
+               list_splice_init(&done_list, &trans->r_itemq);
+
         xlog_recover_free_trans(trans);
  
-out:
         error2 = xfs_buf_delwri_submit(&buffer_list);
         return error ? error : error2;
  }
@@ -3376,7 +3676,7 @@ xlog_recover_process_efi(
         }
  
         tp = xfs_trans_alloc(mp, 0);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error)
                 goto abort_error;
         efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3482,8 +3782,7 @@ xlog_recover_clear_agi_bucket(
         int             error;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
-                                 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
         if (error)
                 goto out_abort;
  
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c

new file mode 100644 (file)

index 0000000..bbcec0b
--- /dev/null
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value as large attributes out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+       struct xfs_mount        *mp)
+{
+       int                     size;
+       int                     nblks;
+
+       size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
+              MAXNAMELEN - 1;
+       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+       nblks += XFS_B_TO_FSB(mp, size);
+       nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+       return  M_RES(mp)->tr_attrsetm.tr_logres +
+               M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table to figure out and return
+ * the maximum one in terms of the pre-calculated values which were done
+ * at mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+       struct xfs_mount        *mp,
+       struct xfs_trans_res    *max_resp)
+{
+       struct xfs_trans_res    *resp;
+       struct xfs_trans_res    *end_resp;
+       int                     log_space = 0;
+       int                     attr_space;
+
+       attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+       resp = (struct xfs_trans_res *)M_RES(mp);
+       end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+       for (; resp < end_resp; resp++) {
+               int             tmp = resp->tr_logcount > 1 ?
+                                     resp->tr_logres * resp->tr_logcount :
+                                     resp->tr_logres;
+               if (log_space < tmp) {
+                       log_space = tmp;
+                       *max_resp = *resp;              /* struct copy */
+               }
+       }
+
+       if (attr_space > log_space) {
+               *max_resp = M_RES(mp)->tr_attrsetm;     /* struct copy */
+               max_resp->tr_logres = attr_space;
+       }
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+       struct xfs_mount        *mp)
+{
+       struct xfs_trans_res    tres = {0};
+       int                     max_logres;
+       int                     min_logblks = 0;
+       int                     lsunit = 0;
+
+       xfs_log_get_max_trans_res(mp, &tres);
+
+       max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+       if (tres.tr_logcount > 1)
+               max_logres *= tres.tr_logcount;
+
+       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+               lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+       /*
+        * Two factors should be taken into account for calculating the minimum
+        * log space.
+        * 1) The fundamental limitation is that no single transaction can be
+        *    larger than half size of the log.
+        *
+        *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+        *    define, which is set to 3. That means we can definitely fit
+        *    maximally sized 2 transactions in the log. We'll use this same
+        *    value here.
+        *
+        * 2) If the lsunit option is specified, a transaction requires 2 LSU
+        *    for the reservation because there are two log writes that can
+        *    require padding - the transaction data and the commit record which
+        *    are written separately and both can require padding to the LSU.
+        *    Consider that we can have an active CIL reservation holding 2*LSU,
+        *    but the CIL is not over a push threshold, in this case, if we
+        *    don't have enough log space for at one new transaction, which
+        *    includes another 2*LSU in the reservation, we will run into dead
+        *    loop situation in log space grant procedure. i.e.
+        *    xlog_grant_head_wait().
+        *
+        *    Hence the log size needs to be able to contain two maximally sized
+        *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
+        *
+        * Also, the log size should be a multiple of the log stripe unit, round
+        * it up to lsunit boundary if lsunit is specified.
+        */
+       if (lsunit) {
+               min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+                             2 * lsunit;
+       } else
+               min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+       min_logblks *= XFS_MIN_LOG_FACTOR;
+
+       return XFS_BB_TO_FSB(mp, min_logblks);
+}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c

index 2b0ba358165619b87523315f1ca3940b4e2606c0..5dcc68019d1bc8c49a799695436045d69d49167f 100644 (file)
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,7 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
@@ -25,8 +25,10 @@
  #include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_ialloc_btree.h"
@@ -40,7 +42,6 @@
  #include "xfs_error.h"
  #include "xfs_quota.h"
  #include "xfs_fsops.h"
-#include "xfs_utils.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
  #include "xfs_cksum.h"
@@ -59,69 +60,6 @@ STATIC void  xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
  #define xfs_icsb_balance_counter_locked(mp, a, b)      do { } while (0)
  #endif
  
-static const struct {
-       short offset;
-       short type;     /* 0 = integer
-                        * 1 = binary / string (no translation)
-                        */
-} xfs_sb_info[] = {
-    { offsetof(xfs_sb_t, sb_magicnum),   0 },
-    { offsetof(xfs_sb_t, sb_blocksize),  0 },
-    { offsetof(xfs_sb_t, sb_dblocks),    0 },
-    { offsetof(xfs_sb_t, sb_rblocks),    0 },
-    { offsetof(xfs_sb_t, sb_rextents),   0 },
-    { offsetof(xfs_sb_t, sb_uuid),       1 },
-    { offsetof(xfs_sb_t, sb_logstart),   0 },
-    { offsetof(xfs_sb_t, sb_rootino),    0 },
-    { offsetof(xfs_sb_t, sb_rbmino),     0 },
-    { offsetof(xfs_sb_t, sb_rsumino),    0 },
-    { offsetof(xfs_sb_t, sb_rextsize),   0 },
-    { offsetof(xfs_sb_t, sb_agblocks),   0 },
-    { offsetof(xfs_sb_t, sb_agcount),    0 },
-    { offsetof(xfs_sb_t, sb_rbmblocks),  0 },
-    { offsetof(xfs_sb_t, sb_logblocks),  0 },
-    { offsetof(xfs_sb_t, sb_versionnum), 0 },
-    { offsetof(xfs_sb_t, sb_sectsize),   0 },
-    { offsetof(xfs_sb_t, sb_inodesize),  0 },
-    { offsetof(xfs_sb_t, sb_inopblock),  0 },
-    { offsetof(xfs_sb_t, sb_fname[0]),   1 },
-    { offsetof(xfs_sb_t, sb_blocklog),   0 },
-    { offsetof(xfs_sb_t, sb_sectlog),    0 },
-    { offsetof(xfs_sb_t, sb_inodelog),   0 },
-    { offsetof(xfs_sb_t, sb_inopblog),   0 },
-    { offsetof(xfs_sb_t, sb_agblklog),   0 },
-    { offsetof(xfs_sb_t, sb_rextslog),   0 },
-    { offsetof(xfs_sb_t, sb_inprogress), 0 },
-    { offsetof(xfs_sb_t, sb_imax_pct),   0 },
-    { offsetof(xfs_sb_t, sb_icount),     0 },
-    { offsetof(xfs_sb_t, sb_ifree),      0 },
-    { offsetof(xfs_sb_t, sb_fdblocks),   0 },
-    { offsetof(xfs_sb_t, sb_frextents),  0 },
-    { offsetof(xfs_sb_t, sb_uquotino),   0 },
-    { offsetof(xfs_sb_t, sb_gquotino),   0 },
-    { offsetof(xfs_sb_t, sb_qflags),     0 },
-    { offsetof(xfs_sb_t, sb_flags),      0 },
-    { offsetof(xfs_sb_t, sb_shared_vn),  0 },
-    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
-    { offsetof(xfs_sb_t, sb_unit),      0 },
-    { offsetof(xfs_sb_t, sb_width),     0 },
-    { offsetof(xfs_sb_t, sb_dirblklog),         0 },
-    { offsetof(xfs_sb_t, sb_logsectlog), 0 },
-    { offsetof(xfs_sb_t, sb_logsectsize),0 },
-    { offsetof(xfs_sb_t, sb_logsunit),  0 },
-    { offsetof(xfs_sb_t, sb_features2),         0 },
-    { offsetof(xfs_sb_t, sb_bad_features2), 0 },
-    { offsetof(xfs_sb_t, sb_features_compat), 0 },
-    { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
-    { offsetof(xfs_sb_t, sb_features_incompat), 0 },
-    { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
-    { offsetof(xfs_sb_t, sb_crc),       0 },
-    { offsetof(xfs_sb_t, sb_pad),       0 },
-    { offsetof(xfs_sb_t, sb_pquotino),  0 },
-    { offsetof(xfs_sb_t, sb_lsn),       0 },
-    { sizeof(xfs_sb_t),                         0 }
-};
-
  static DEFINE_MUTEX(xfs_uuid_table_mutex);
  static int xfs_uuid_table_size;
  static uuid_t *xfs_uuid_table;
@@ -197,64 +135,6 @@ xfs_uuid_unmount(
  }
  
  
-/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
- */
-struct xfs_perag *
-xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
-{
-       struct xfs_perag        *pag;
-       int                     ref = 0;
-
-       rcu_read_lock();
-       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
-       if (pag) {
-               ASSERT(atomic_read(&pag->pag_ref) >= 0);
-               ref = atomic_inc_return(&pag->pag_ref);
-       }
-       rcu_read_unlock();
-       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
-       return pag;
-}
-
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          first,
-       int                     tag)
-{
-       struct xfs_perag        *pag;
-       int                     found;
-       int                     ref;
-
-       rcu_read_lock();
-       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-                                       (void **)&pag, first, 1, tag);
-       if (found <= 0) {
-               rcu_read_unlock();
-               return NULL;
-       }
-       ref = atomic_inc_return(&pag->pag_ref);
-       rcu_read_unlock();
-       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
-       return pag;
-}
-
-void
-xfs_perag_put(struct xfs_perag *pag)
-{
-       int     ref;
-
-       ASSERT(atomic_read(&pag->pag_ref) > 0);
-       ref = atomic_dec_return(&pag->pag_ref);
-       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
-}
-
  STATIC void
  __xfs_free_perag(
         struct rcu_head *head)
@@ -307,184 +187,6 @@ xfs_sb_validate_fsb_count(
         return 0;
  }
  
-/*
- * Check the validity of the SB found.
- */
-STATIC int
-xfs_mount_validate_sb(
-       xfs_mount_t     *mp,
-       xfs_sb_t        *sbp,
-       bool            check_inprogress,
-       bool            check_version)
-{
-
-       /*
-        * If the log device and data device have the
-        * same device number, the log is internal.
-        * Consequently, the sb_logstart should be non-zero.  If
-        * we have a zero sb_logstart in this case, we may be trying to mount
-        * a volume filesystem in a non-volume manner.
-        */
-       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-               xfs_warn(mp, "bad magic number");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-
-       if (!xfs_sb_good_version(sbp)) {
-               xfs_warn(mp, "bad version");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-       if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
-                       (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
-                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
-               xfs_notice(mp,
-"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Version 5 superblock feature mask validation. Reject combinations the
-        * kernel cannot support up front before checking anything else. For
-        * write validation, we don't need to check feature masks.
-        */
-       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
-               xfs_alert(mp,
-"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
-"Use of these features in this kernel is at your own risk!");
-
-               if (xfs_sb_has_compat_feature(sbp,
-                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
-                               (sbp->sb_features_compat &
-                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
-               }
-
-               if (xfs_sb_has_ro_compat_feature(sbp,
-                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
-                       xfs_alert(mp,
-"Superblock has unknown read-only compatible features (0x%x) enabled.",
-                               (sbp->sb_features_ro_compat &
-                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
-                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                               xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
-"Filesystem can only be safely mounted read only.");
-                               return XFS_ERROR(EINVAL);
-                       }
-               }
-               if (xfs_sb_has_incompat_feature(sbp,
-                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
-                               (sbp->sb_features_incompat &
-                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
-                       return XFS_ERROR(EINVAL);
-               }
-       }
-
-       if (unlikely(
-           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an external log; "
-               "specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       if (unlikely(
-           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an internal log; "
-               "do not specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * More sanity checking.  Most of these were stolen directly from
-        * xfs_repair.
-        */
-       if (unlikely(
-           sbp->sb_agcount <= 0                                        ||
-           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
-           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
-           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
-           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
-           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
-           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
-           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
-           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
-           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
-           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
-           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
-           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
-           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
-           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
-           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
-           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
-           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
-           sbp->sb_dblocks == 0                                        ||
-           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
-           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-               XFS_CORRUPTION_ERROR("SB sanity check failed",
-                               XFS_ERRLEVEL_LOW, mp, sbp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Until this is fixed only page-sized or smaller data blocks work.
-        */
-       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-               xfs_warn(mp,
-               "File system with blocksize %d bytes. "
-               "Only pagesize (%ld) or less will currently work.",
-                               sbp->sb_blocksize, PAGE_SIZE);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       /*
-        * Currently only very few inode sizes are supported.
-        */
-       switch (sbp->sb_inodesize) {
-       case 256:
-       case 512:
-       case 1024:
-       case 2048:
-               break;
-       default:
-               xfs_warn(mp, "inode size of %d bytes not supported",
-                               sbp->sb_inodesize);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
-           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-               xfs_warn(mp,
-               "file system too large to be mounted on this system.");
-               return XFS_ERROR(EFBIG);
-       }
-
-       if (check_inprogress && sbp->sb_inprogress) {
-               xfs_warn(mp, "Offline file system operation in progress!");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Version 1 directory format has never worked on Linux.
-        */
-       if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
-               xfs_warn(mp, "file system using version 1 directory format");
-               return XFS_ERROR(ENOSYS);
-       }
-
-       return 0;
-}
-
  int
  xfs_initialize_perag(
         xfs_mount_t     *mp,
@@ -569,283 +271,15 @@ out_unwind:
         return error;
  }
  
-static void
-xfs_sb_quota_from_disk(struct xfs_sb *sbp)
-{
-       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
-       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
-       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
-}
-
-void
-xfs_sb_from_disk(
-       struct xfs_sb   *to,
-       xfs_dsb_t       *from)
-{
-       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
-       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
-       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
-       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
-       to->sb_rextents = be64_to_cpu(from->sb_rextents);
-       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
-       to->sb_logstart = be64_to_cpu(from->sb_logstart);
-       to->sb_rootino = be64_to_cpu(from->sb_rootino);
-       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
-       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
-       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
-       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
-       to->sb_agcount = be32_to_cpu(from->sb_agcount);
-       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
-       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
-       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
-       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
-       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
-       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
-       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
-       to->sb_blocklog = from->sb_blocklog;
-       to->sb_sectlog = from->sb_sectlog;
-       to->sb_inodelog = from->sb_inodelog;
-       to->sb_inopblog = from->sb_inopblog;
-       to->sb_agblklog = from->sb_agblklog;
-       to->sb_rextslog = from->sb_rextslog;
-       to->sb_inprogress = from->sb_inprogress;
-       to->sb_imax_pct = from->sb_imax_pct;
-       to->sb_icount = be64_to_cpu(from->sb_icount);
-       to->sb_ifree = be64_to_cpu(from->sb_ifree);
-       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
-       to->sb_frextents = be64_to_cpu(from->sb_frextents);
-       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
-       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
-       to->sb_qflags = be16_to_cpu(from->sb_qflags);
-       to->sb_flags = from->sb_flags;
-       to->sb_shared_vn = from->sb_shared_vn;
-       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
-       to->sb_unit = be32_to_cpu(from->sb_unit);
-       to->sb_width = be32_to_cpu(from->sb_width);
-       to->sb_dirblklog = from->sb_dirblklog;
-       to->sb_logsectlog = from->sb_logsectlog;
-       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
-       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
-       to->sb_features2 = be32_to_cpu(from->sb_features2);
-       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
-       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
-       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
-       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
-       to->sb_features_log_incompat =
-                               be32_to_cpu(from->sb_features_log_incompat);
-       to->sb_pad = 0;
-       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
-       to->sb_lsn = be64_to_cpu(from->sb_lsn);
-}
-
-static inline void
-xfs_sb_quota_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       *fields)
-{
-       __uint16_t      qflags = from->sb_qflags;
-
-       if (*fields & XFS_SB_QFLAGS) {
-               /*
-                * The in-core version of sb_qflags do not have
-                * XFS_OQUOTA_* flags, whereas the on-disk version
-                * does.  So, convert incore XFS_{PG}QUOTA_* flags
-                * to on-disk XFS_OQUOTA_* flags.
-                */
-               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-                       qflags |= XFS_OQUOTA_ENFD;
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-                       qflags |= XFS_OQUOTA_CHKD;
-               to->sb_qflags = cpu_to_be16(qflags);
-               *fields &= ~XFS_SB_QFLAGS;
-       }
-}
-
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
-void
-xfs_sb_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       fields)
-{
-       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
-       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
-       xfs_sb_field_t  f;
-       int             first;
-       int             size;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-
-       xfs_sb_quota_to_disk(to, from, &fields);
-       while (fields) {
-               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-               first = xfs_sb_info[f].offset;
-               size = xfs_sb_info[f + 1].offset - first;
-
-               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
-               if (size == 1 || xfs_sb_info[f].type == 1) {
-                       memcpy(to_ptr + first, from_ptr + first, size);
-               } else {
-                       switch (size) {
-                       case 2:
-                               *(__be16 *)(to_ptr + first) =
-                                       cpu_to_be16(*(__u16 *)(from_ptr + first));
-                               break;
-                       case 4:
-                               *(__be32 *)(to_ptr + first) =
-                                       cpu_to_be32(*(__u32 *)(from_ptr + first));
-                               break;
-                       case 8:
-                               *(__be64 *)(to_ptr + first) =
-                                       cpu_to_be64(*(__u64 *)(from_ptr + first));
-                               break;
-                       default:
-                               ASSERT(0);
-                       }
-               }
-
-               fields &= ~(1LL << f);
-       }
-}
-
-static int
-xfs_sb_verify(
-       struct xfs_buf  *bp,
-       bool            check_version)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_sb   sb;
-
-       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
-
-       /*
-        * Only check the in progress field for the primary superblock as
-        * mkfs.xfs doesn't clear it from secondary superblocks.
-        */
-       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
-                                    check_version);
-}
-
-/*
- * If the superblock has the CRC feature bit set or the CRC field is non-null,
- * check that the CRC is valid.  We check the CRC field is non-null because a
- * single bit error could clear the feature bit and unused parts of the
- * superblock are supposed to be zero. Hence a non-null crc field indicates that
- * we've potentially lost a feature bit and we should check it anyway.
- */
-static void
-xfs_sb_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-       int             error;
-
-       /*
-        * open code the version check to avoid needing to convert the entire
-        * superblock from disk order just to check the version number
-        */
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
-           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
-                                               XFS_SB_VERSION_5) ||
-            dsb->sb_crc != 0)) {
-
-               if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
-                                     offsetof(struct xfs_sb, sb_crc))) {
-                       error = EFSCORRUPTED;
-                       goto out_error;
-               }
-       }
-       error = xfs_sb_verify(bp, true);
-
-out_error:
-       if (error) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, error);
-       }
-}
-
-/*
- * We may be probed for a filesystem match, so we may not want to emit
- * messages when the superblock buffer is not actually an XFS superblock.
- * If we find an XFS superblock, the run a normal, noisy mount because we are
- * really going to mount it and want to know about errors.
- */
-static void
-xfs_sb_quiet_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-
-
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
-               /* XFS filesystem, verify noisily! */
-               xfs_sb_read_verify(bp);
-               return;
-       }
-       /* quietly fail */
-       xfs_buf_ioerror(bp, EWRONGFS);
-}
-
-static void
-xfs_sb_write_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       int                     error;
-
-       error = xfs_sb_verify(bp, false);
-       if (error) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, error);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-                        offsetof(struct xfs_sb, sb_crc));
-}
-
-const struct xfs_buf_ops xfs_sb_buf_ops = {
-       .verify_read = xfs_sb_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
-static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
-       .verify_read = xfs_sb_quiet_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
  /*
   * xfs_readsb
   *
   * Does the initial read of the superblock.
   */
  int
-xfs_readsb(xfs_mount_t *mp, int flags)
+xfs_readsb(
+       struct xfs_mount *mp,
+       int             flags)
  {
         unsigned int    sector_size;
         struct xfs_buf  *bp;
@@ -884,8 +318,8 @@ reread:
          * Initialize the mount structure from the superblock.
          */
         xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
-
         xfs_sb_quota_from_disk(&mp->m_sb);
+
         /*
          * We must be able to do sector-sized and sector-aligned IO.
          */
@@ -922,107 +356,6 @@ release_buf:
         return error;
  }
  
-
-/*
- * xfs_mount_common
- *
- * Mount initialization code establishing various mount
- * fields from the superblock associated with the given
- * mount structure
- */
-STATIC void
-xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
-{
-       mp->m_agfrotor = mp->m_agirotor = 0;
-       spin_lock_init(&mp->m_agirotor_lock);
-       mp->m_maxagi = mp->m_sb.sb_agcount;
-       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
-       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
-       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
-       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
-       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-       mp->m_blockmask = sbp->sb_blocksize - 1;
-       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
-       mp->m_blockwmask = mp->m_blockwsize - 1;
-
-       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
-       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
-
-       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
-       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
-       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
-       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
-
-       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
-       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
-                                       sbp->sb_inopblock);
-       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-}
-
-/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-STATIC int
-xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
-{
-       xfs_agnumber_t  index;
-       xfs_perag_t     *pag;
-       xfs_sb_t        *sbp = &mp->m_sb;
-       uint64_t        ifree = 0;
-       uint64_t        ialloc = 0;
-       uint64_t        bfree = 0;
-       uint64_t        bfreelst = 0;
-       uint64_t        btree = 0;
-       int             error;
-
-       for (index = 0; index < agcount; index++) {
-               /*
-                * read the agf, then the agi. This gets us
-                * all the information we need and populates the
-                * per-ag structures for us.
-                */
-               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
-               if (error)
-                       return error;
-
-               error = xfs_ialloc_pagi_init(mp, NULL, index);
-               if (error)
-                       return error;
-               pag = xfs_perag_get(mp, index);
-               ifree += pag->pagi_freecount;
-               ialloc += pag->pagi_count;
-               bfree += pag->pagf_freeblks;
-               bfreelst += pag->pagf_flcount;
-               btree += pag->pagf_btreeblks;
-               xfs_perag_put(pag);
-       }
-       /*
-        * Overwrite incore superblock counters with just-read data
-        */
-       spin_lock(&mp->m_sb_lock);
-       sbp->sb_ifree = ifree;
-       sbp->sb_icount = ialloc;
-       sbp->sb_fdblocks = bfree + bfreelst + btree;
-       spin_unlock(&mp->m_sb_lock);
-
-       /* Fixup the per-cpu counters as well. */
-       xfs_icsb_reinit_counters(mp);
-
-       return 0;
-}
-
  /*
   * Update alignment values based on mount options and sb values
   */
@@ -1194,7 +527,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
  }
  
  /*
- * Check that the data (and log if separate) are an ok size.
+ * Check that the data (and log if separate) is an ok size.
   */
  STATIC int
  xfs_check_sizes(xfs_mount_t *mp)
@@ -1264,8 +597,7 @@ xfs_mount_reset_sbqflags(
                 return 0;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1315,7 +647,7 @@ xfs_mountfs(
         uint            quotaflags = 0;
         int             error = 0;
  
-       xfs_mount_common(mp, sbp);
+       xfs_sb_mount_common(mp, sbp);
  
         /*
          * Check for a mismatched features2 values.  Older kernels
@@ -1400,7 +732,7 @@ xfs_mountfs(
         xfs_set_inoalignment(mp);
  
         /*
-        * Check that the data (and log if separate) are an ok size.
+        * Check that the data (and log if separate) is an ok size.
          */
         error = xfs_check_sizes(mp);
         if (error)
@@ -1738,8 +1070,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
                 return 0;
  
         tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
@@ -1752,49 +1083,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
  }
  
  /*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
- */
-void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
-{
-       xfs_buf_t       *bp;
-       int             first;
-       int             last;
-       xfs_mount_t     *mp;
-       xfs_sb_field_t  f;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-       mp = tp->t_mountp;
-       bp = xfs_trans_getsb(tp, mp, 0);
-       first = sizeof(xfs_sb_t);
-       last = 0;
-
-       /* translate/copy */
-
-       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
-
-       /* find modified range */
-       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       last = xfs_sb_info[f + 1].offset - 1;
-
-       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       first = xfs_sb_info[f].offset;
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-       xfs_trans_log_buf(tp, bp, first, last);
-}
-
-
-/*
- * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
+ * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
   * a delta to a specified field in the in-core superblock.  Simply
   * switch on the field indicated and apply the delta to that field.
   * Fields are not allowed to dip below zero, so if the delta would
@@ -2101,8 +1390,7 @@ xfs_mount_log_sb(
                          XFS_SB_VERSIONNUM));
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
@@ -2260,12 +1548,6 @@ xfs_icsb_init_counters(
         if (mp->m_sb_cnts == NULL)
                 return -ENOMEM;
  
-#ifdef CONFIG_HOTPLUG_CPU
-       mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
-       mp->m_icsb_notifier.priority = 0;
-       register_hotcpu_notifier(&mp->m_icsb_notifier);
-#endif /* CONFIG_HOTPLUG_CPU */
-
         for_each_online_cpu(i) {
                 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
@@ -2278,6 +1560,13 @@ xfs_icsb_init_counters(
          * initial balance kicks us off correctly
          */
         mp->m_icsb_counters = -1;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
+       mp->m_icsb_notifier.priority = 0;
+       register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
+
         return 0;
  }
  
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h

index 4e374d4a9189622bccb2b56aa331c7218567d1f2..1fa0584b5627830c77e4bf0fa02db9c55c066a2d 100644 (file)
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,45 +18,7 @@
  #ifndef __XFS_MOUNT_H__
  #define        __XFS_MOUNT_H__
  
-typedef struct xfs_trans_reservations {
-       uint    tr_write;       /* extent alloc trans */
-       uint    tr_itruncate;   /* truncate trans */
-       uint    tr_rename;      /* rename trans */
-       uint    tr_link;        /* link trans */
-       uint    tr_remove;      /* unlink trans */
-       uint    tr_symlink;     /* symlink trans */
-       uint    tr_create;      /* create trans */
-       uint    tr_mkdir;       /* mkdir trans */
-       uint    tr_ifree;       /* inode free trans */
-       uint    tr_ichange;     /* inode update trans */
-       uint    tr_growdata;    /* fs data section grow trans */
-       uint    tr_swrite;      /* sync write inode trans */
-       uint    tr_addafork;    /* cvt inode to attributed trans */
-       uint    tr_writeid;     /* write setuid/setgid file */
-       uint    tr_attrinval;   /* attr fork buffer invalidation */
-       uint    tr_attrsetm;    /* set/create an attribute at mount time */
-       uint    tr_attrsetrt;   /* set/create an attribute at runtime */
-       uint    tr_attrrm;      /* remove an attribute */
-       uint    tr_clearagi;    /* clear bad agi unlinked ino bucket */
-       uint    tr_growrtalloc; /* grow realtime allocations */
-       uint    tr_growrtzero;  /* grow realtime zeroing */
-       uint    tr_growrtfree;  /* grow realtime freeing */
-       uint    tr_qm_sbchange; /* change quota flags */
-       uint    tr_qm_setqlim;  /* adjust quota limits */
-       uint    tr_qm_dqalloc;  /* allocate quota on disk */
-       uint    tr_qm_quotaoff; /* turn quota off */
-       uint    tr_qm_equotaoff;/* end of turn quota off */
-       uint    tr_sb;          /* modify superblock */
-} xfs_trans_reservations_t;
-
-#ifndef __KERNEL__
-
-#define xfs_daddr_to_agno(mp,d) \
-       ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define xfs_daddr_to_agbno(mp,d) \
-       ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-
-#else /* __KERNEL__ */
+#ifdef __KERNEL__
  
  struct xlog;
  struct xfs_inode;
@@ -174,7 +136,7 @@ typedef struct xfs_mount {
         int                     m_ialloc_blks;  /* blocks in inode allocation */
         int                     m_inoalign_mask;/* mask sb_inoalignmt if used */
         uint                    m_qflags;       /* quota status flags */
-       xfs_trans_reservations_t m_reservations;/* precomputed res values */
+       struct xfs_trans_resv   m_resv;         /* precomputed res values */
         __uint64_t              m_maxicount;    /* maximum inode count */
         __uint64_t              m_resblks;      /* total reserved blocks */
         __uint64_t              m_resblks_avail;/* available reserved blocks */
@@ -329,14 +291,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
         return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
  }
  
-/*
- * perag get/put wrappers for ref counting
- */
-struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
-                                       int tag);
-void   xfs_perag_put(struct xfs_perag *pag);
-
  /*
   * Per-cpu superblock locking functions
   */
@@ -366,9 +320,63 @@ typedef struct xfs_mod_sb {
         int64_t         msb_delta;      /* Change to make to specified field */
  } xfs_mod_sb_t;
  
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection. This is defined for the kernel
+ * only, and hence is defined here instead of in xfs_ag.h. You need the struct
+ * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
+ * so this doesn't introduce any strange header file dependencies.
+ */
+typedef struct xfs_perag {
+       struct xfs_mount *pag_mount;    /* owner filesystem */
+       xfs_agnumber_t  pag_agno;       /* AG this structure belongs to */
+       atomic_t        pag_ref;        /* perag reference count */
+       char            pagf_init;      /* this agf's entry is initialized */
+       char            pagi_init;      /* this agi's entry is initialized */
+       char            pagf_metadata;  /* the agf is preferred to be metadata */
+       char            pagi_inodeok;   /* The agi is ok for inodes */
+       __uint8_t       pagf_levels[XFS_BTNUM_AGF];
+                                       /* # of levels in bno & cnt btree */
+       __uint32_t      pagf_flcount;   /* count of blocks in freelist */
+       xfs_extlen_t    pagf_freeblks;  /* total free blocks */
+       xfs_extlen_t    pagf_longest;   /* longest free space */
+       __uint32_t      pagf_btreeblks; /* # of blocks held in AGF btrees */
+       xfs_agino_t     pagi_freecount; /* number of free inodes */
+       xfs_agino_t     pagi_count;     /* number of allocated inodes */
+
+       /*
+        * Inode allocation search lookup optimisation.
+        * If the pagino matches, the search for new inodes
+        * doesn't need to search the near ones again straight away
+        */
+       xfs_agino_t     pagl_pagino;
+       xfs_agino_t     pagl_leftrec;
+       xfs_agino_t     pagl_rightrec;
+       spinlock_t      pagb_lock;      /* lock for pagb_tree */
+       struct rb_root  pagb_tree;      /* ordered tree of busy extents */
+
+       atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
+
+       spinlock_t      pag_ici_lock;   /* incore inode cache lock */
+       struct radix_tree_root pag_ici_root;    /* incore inode cache root */
+       int             pag_ici_reclaimable;    /* reclaimable inodes */
+       struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
+       unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
+
+       /* buffer cache index */
+       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
+       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
+
+       /* for rcu-safe freeing */
+       struct rcu_head rcu_head;
+       int             pagb_count;     /* pagb slots in use */
+} xfs_perag_t;
+
  extern int     xfs_log_sbcount(xfs_mount_t *);
  extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
  extern int     xfs_mountfs(xfs_mount_t *mp);
+extern int     xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
+                                    xfs_agnumber_t *maxagi);
  
  extern void    xfs_unmountfs(xfs_mount_t *);
  extern int     xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -387,13 +395,4 @@ extern void        xfs_set_low_space_thresholds(struct xfs_mount *);
  
  #endif /* __KERNEL__ */
  
-extern void    xfs_sb_calc_crc(struct xfs_buf  *);
-extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern int     xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
-                                       xfs_agnumber_t *);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
-
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-
  #endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c

index d320794d03ce233d93f7ccfcd0a6c2c3f186e9c2..6218a0aeeeea88449c4a1e29e54c0297ae8de6fb 100644 (file)
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
@@ -37,7 +38,6 @@
  #include "xfs_attr.h"
  #include "xfs_buf_item.h"
  #include "xfs_trans_space.h"
-#include "xfs_utils.h"
  #include "xfs_qm.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
@@ -834,21 +834,52 @@ xfs_qm_qino_alloc(
         int             error;
         int             committed;
  
+       *ip = NULL;
+       /*
+        * With superblock that doesn't have separate pquotino, we
+        * share an inode between gquota and pquota. If the on-disk
+        * superblock has GQUOTA and the filesystem is now mounted
+        * with PQUOTA, just use sb_gquotino for sb_pquotino and
+        * vice-versa.
+        */
+       if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+                       (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+               xfs_ino_t ino = NULLFSINO;
+
+               if ((flags & XFS_QMOPT_PQUOTA) &&
+                            (mp->m_sb.sb_gquotino != NULLFSINO)) {
+                       ino = mp->m_sb.sb_gquotino;
+                       ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+               } else if ((flags & XFS_QMOPT_GQUOTA) &&
+                            (mp->m_sb.sb_pquotino != NULLFSINO)) {
+                       ino = mp->m_sb.sb_pquotino;
+                       ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+               }
+               if (ino != NULLFSINO) {
+                       error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+                       if (error)
+                               return error;
+                       mp->m_sb.sb_gquotino = NULLFSINO;
+                       mp->m_sb.sb_pquotino = NULLFSINO;
+               }
+       }
+
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-       if ((error = xfs_trans_reserve(tp,
-                                     XFS_QM_QINOCREATE_SPACE_RES(mp),
-                                     XFS_CREATE_LOG_RES(mp), 0,
-                                     XFS_TRANS_PERM_LOG_RES,
-                                     XFS_CREATE_LOG_COUNT))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
+                                 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
+       if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
         }
  
-       error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
-       if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT);
-               return error;
+       if (!*ip) {
+               error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
+                                                               &committed);
+               if (error) {
+                       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+                                        XFS_TRANS_ABORT);
+                       return error;
+               }
         }
  
         /*
@@ -860,21 +891,25 @@ xfs_qm_qino_alloc(
         if (flags & XFS_QMOPT_SBVERSION) {
                 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
                 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                                  XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
-                      (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                       XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+                       XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
+                               (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+                                XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+                                XFS_SB_QFLAGS));
  
                 xfs_sb_version_addquota(&mp->m_sb);
                 mp->m_sb.sb_uquotino = NULLFSINO;
                 mp->m_sb.sb_gquotino = NULLFSINO;
+               mp->m_sb.sb_pquotino = NULLFSINO;
  
-               /* qflags will get updated _after_ quotacheck */
-               mp->m_sb.sb_qflags = 0;
+               /* qflags will get updated fully _after_ quotacheck */
+               mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
         }
         if (flags & XFS_QMOPT_UQUOTA)
                 mp->m_sb.sb_uquotino = (*ip)->i_ino;
-       else
+       else if (flags & XFS_QMOPT_GQUOTA)
                 mp->m_sb.sb_gquotino = (*ip)->i_ino;
+       else
+               mp->m_sb.sb_pquotino = (*ip)->i_ino;
         spin_unlock(&mp->m_sb_lock);
         xfs_mod_sb(tp, sbfields);
  
@@ -1484,11 +1519,10 @@ xfs_qm_init_quotainos(
                         if (error)
                                 goto error_rele;
                 }
-               /* XXX: Use gquotino for now */
                 if (XFS_IS_PQUOTA_ON(mp) &&
-                   mp->m_sb.sb_gquotino != NULLFSINO) {
-                       ASSERT(mp->m_sb.sb_gquotino > 0);
-                       error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                   mp->m_sb.sb_pquotino != NULLFSINO) {
+                       ASSERT(mp->m_sb.sb_pquotino > 0);
+                       error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
                                              0, 0, &pip);
                         if (error)
                                 goto error_rele;
@@ -1496,7 +1530,8 @@ xfs_qm_init_quotainos(
         } else {
                 flags |= XFS_QMOPT_SBVERSION;
                 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                           XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+                           XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+                           XFS_SB_QFLAGS);
         }
  
         /*
@@ -1524,9 +1559,8 @@ xfs_qm_init_quotainos(
                 flags &= ~XFS_QMOPT_SBVERSION;
         }
         if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
-               /* XXX: Use XFS_SB_GQUOTINO for now */
                 error = xfs_qm_qino_alloc(mp, &pip,
-                                         sbflags | XFS_SB_GQUOTINO,
+                                         sbflags | XFS_SB_PQUOTINO,
                                           flags | XFS_QMOPT_PQUOTA);
                 if (error)
                         goto error_rele;
@@ -1704,8 +1738,7 @@ xfs_qm_write_sb_changes(
         int             error;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return error;
@@ -1734,8 +1767,8 @@ xfs_qm_write_sb_changes(
  int
  xfs_qm_vop_dqalloc(
         struct xfs_inode        *ip,
-       uid_t                   uid,
-       gid_t                   gid,
+       xfs_dqid_t              uid,
+       xfs_dqid_t              gid,
         prid_t                  prid,
         uint                    flags,
         struct xfs_dquot        **O_udqpp,
@@ -1782,7 +1815,7 @@ xfs_qm_vop_dqalloc(
                          * holding ilock.
                          */
                         xfs_iunlock(ip, lockflags);
-                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+                       error = xfs_qm_dqget(mp, NULL, uid,
                                                  XFS_DQ_USER,
                                                  XFS_QMOPT_DQALLOC |
                                                  XFS_QMOPT_DOWARN,
@@ -1809,7 +1842,7 @@ xfs_qm_vop_dqalloc(
         if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
                 if (ip->i_d.di_gid != gid) {
                         xfs_iunlock(ip, lockflags);
-                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+                       error = xfs_qm_dqget(mp, NULL, gid,
                                                  XFS_DQ_GROUP,
                                                  XFS_QMOPT_DQALLOC |
                                                  XFS_QMOPT_DOWARN,
@@ -1943,7 +1976,7 @@ xfs_qm_vop_chown_reserve(
                         XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
  
         if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-           ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+           ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
                 udq_delblks = udqp;
                 /*
                  * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h

index 579d6a02a5b6ec5fd2e21c503f4ada8b6dfa54d6..670cd44640704eb4899f960045cb14f8f582acce 100644 (file)
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -160,6 +160,8 @@ extern int          xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
                                         struct fs_disk_quota *);
  extern int             xfs_qm_scall_getqstat(struct xfs_mount *,
                                         struct fs_quota_stat *);
+extern int             xfs_qm_scall_getqstatv(struct xfs_mount *,
+                                       struct fs_quota_statv *);
  extern int             xfs_qm_scall_quotaon(struct xfs_mount *, uint);
  extern int             xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
  
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c

index 437a52d91f6d91ce9f48acbfc5a9928a9ee366ed..3af50ccdfac1a10da858ef26e80b040ad4a474f1 100644 (file)
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c

index e4f8b2d6f38ba1960beefe35cbd15b0066f530cf..8174aad0b38813ec836ea044d9a1b7b4dc06f300 100644 (file)
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,6 +20,7 @@
  
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
@@ -37,7 +38,6 @@
  #include "xfs_error.h"
  #include "xfs_attr.h"
  #include "xfs_buf_item.h"
-#include "xfs_utils.h"
  #include "xfs_qm.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
@@ -247,9 +247,7 @@ xfs_qm_scall_trunc_qfile(
         xfs_ilock(ip, XFS_IOLOCK_EXCL);
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES,
-                                 XFS_ITRUNCATE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -296,8 +294,10 @@ xfs_qm_scall_trunc_qfiles(
  
         if (flags & XFS_DQ_USER)
                 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-       if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+       if (flags & XFS_DQ_GROUP)
                 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+       if (flags & XFS_DQ_PROJ)
+               error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
  
         return error ? error : error2;
  }
@@ -404,6 +404,7 @@ xfs_qm_scall_quotaon(
  
  /*
   * Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTAT command.
   */
  int
  xfs_qm_scall_getqstat(
@@ -413,8 +414,10 @@ xfs_qm_scall_getqstat(
         struct xfs_quotainfo    *q = mp->m_quotainfo;
         struct xfs_inode        *uip = NULL;
         struct xfs_inode        *gip = NULL;
+       struct xfs_inode        *pip = NULL;
         bool                    tempuqip = false;
         bool                    tempgqip = false;
+       bool                    temppqip = false;
  
         memset(out, 0, sizeof(fs_quota_stat_t));
  
@@ -424,16 +427,106 @@ xfs_qm_scall_getqstat(
                 out->qs_gquota.qfs_ino = NULLFSINO;
                 return (0);
         }
+
+       out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+                                                       (XFS_ALL_QUOTA_ACCT|
+                                                        XFS_ALL_QUOTA_ENFD));
+       if (q) {
+               uip = q->qi_uquotaip;
+               gip = q->qi_gquotaip;
+               pip = q->qi_pquotaip;
+       }
+       if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+                                       0, 0, &uip) == 0)
+                       tempuqip = true;
+       }
+       if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                                       0, 0, &gip) == 0)
+                       tempgqip = true;
+       }
+       /*
+        * Q_XGETQSTAT doesn't have room for both group and project quotas.
+        * So, allow the project quota values to be copied out only if
+        * there is no group quota information available.
+        */
+       if (!gip) {
+               if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+                       if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+                                               0, 0, &pip) == 0)
+                               temppqip = true;
+               }
+       } else
+               pip = NULL;
+       if (uip) {
+               out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+               out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+               out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+               if (tempuqip)
+                       IRELE(uip);
+       }
+
+       if (gip) {
+               out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+               out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+               out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+               if (tempgqip)
+                       IRELE(gip);
+       }
+       if (pip) {
+               out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+               out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
+               out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
+               if (temppqip)
+                       IRELE(pip);
+       }
+       if (q) {
+               out->qs_incoredqs = q->qi_dquots;
+               out->qs_btimelimit = q->qi_btimelimit;
+               out->qs_itimelimit = q->qi_itimelimit;
+               out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+               out->qs_bwarnlimit = q->qi_bwarnlimit;
+               out->qs_iwarnlimit = q->qi_iwarnlimit;
+       }
+       return 0;
+}
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTATV command, to support separate project quota field.
+ */
+int
+xfs_qm_scall_getqstatv(
+       struct xfs_mount        *mp,
+       struct fs_quota_statv   *out)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       struct xfs_inode        *uip = NULL;
+       struct xfs_inode        *gip = NULL;
+       struct xfs_inode        *pip = NULL;
+       bool                    tempuqip = false;
+       bool                    tempgqip = false;
+       bool                    temppqip = false;
+
+       if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+               out->qs_uquota.qfs_ino = NULLFSINO;
+               out->qs_gquota.qfs_ino = NULLFSINO;
+               out->qs_pquota.qfs_ino = NULLFSINO;
+               return (0);
+       }
+
         out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
                                                         (XFS_ALL_QUOTA_ACCT|
                                                          XFS_ALL_QUOTA_ENFD));
-       out->qs_pad = 0;
         out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
         out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+       out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
  
         if (q) {
                 uip = q->qi_uquotaip;
                 gip = q->qi_gquotaip;
+               pip = q->qi_pquotaip;
         }
         if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
                 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -445,18 +538,30 @@ xfs_qm_scall_getqstat(
                                         0, 0, &gip) == 0)
                         tempgqip = true;
         }
+       if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+                                       0, 0, &pip) == 0)
+                       temppqip = true;
+       }
         if (uip) {
                 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
                 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
                 if (tempuqip)
                         IRELE(uip);
         }
+
         if (gip) {
                 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
                 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
                 if (tempgqip)
                         IRELE(gip);
         }
+       if (pip) {
+               out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
+               out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
+               if (temppqip)
+                       IRELE(pip);
+       }
         if (q) {
                 out->qs_incoredqs = q->qi_dquots;
                 out->qs_btimelimit = q->qi_btimelimit;
@@ -515,8 +620,7 @@ xfs_qm_scall_setqlim(
         xfs_dqunlock(dqp);
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 goto out_rele;
@@ -650,8 +754,7 @@ xfs_qm_log_quotaoff_end(
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
  
-       error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
         if (error) {
                 xfs_trans_cancel(tp, 0);
                 return (error);
@@ -684,8 +787,7 @@ xfs_qm_log_quotaoff(
         uint                    oldsbqflag=0;
  
         tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
         if (error)
                 goto error0;
  
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h

index b14f42c714b609b95eab2b993805a7e4e4e73c24..e7d84d2d86830a25a5cd9778521766d4a592c45b 100644 (file)
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,267 +18,14 @@
  #ifndef __XFS_QUOTA_H__
  #define __XFS_QUOTA_H__
  
-struct xfs_trans;
-
-/*
- * The ondisk form of a dquot structure.
- */
-#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
-#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
-
-/*
- * uid_t and gid_t are hard-coded to 32 bits in the inode.
- * Hence, an 'id' in a dquot is 32 bits..
- */
-typedef __uint32_t     xfs_dqid_t;
-
-/*
- * Even though users may not have quota limits occupying all 64-bits,
- * they may need 64-bit accounting. Hence, 64-bit quota-counters,
- * and quota-limits. This is a waste in the common case, but hey ...
- */
-typedef __uint64_t     xfs_qcnt_t;
-typedef __uint16_t     xfs_qwarncnt_t;
-
-/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
- */
-typedef struct xfs_disk_dquot {
-       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
-       __u8            d_version;      /* dquot version */
-       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
-       __be32          d_id;           /* user,project,group id */
-       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
-       __be64          d_blk_softlimit;/* preferred limit on disk blks */
-       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
-       __be64          d_ino_softlimit;/* preferred inode limit */
-       __be64          d_bcount;       /* disk blocks owned by the user */
-       __be64          d_icount;       /* inodes owned by the user */
-       __be32          d_itimer;       /* zero if within inode limits if not,
-                                          this is when we refuse service */
-       __be32          d_btimer;       /* similar to above; for disk blocks */
-       __be16          d_iwarns;       /* warnings issued wrt num inodes */
-       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
-       __be32          d_pad0;         /* 64 bit align */
-       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
-       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
-       __be64          d_rtbcount;     /* realtime blocks owned */
-       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
-       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
-       __be16          d_pad;
-} xfs_disk_dquot_t;
-
-/*
- * This is what goes on disk. This is separated from the xfs_disk_dquot because
- * carrying the unnecessary padding would be a waste of memory.
- */
-typedef struct xfs_dqblk {
-       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
-       char              dd_fill[4];   /* filling for posterity */
-
-       /*
-        * These two are only present on filesystems with the CRC bits set.
-        */
-       __be32            dd_crc;       /* checksum */
-       __be64            dd_lsn;       /* last modification in log */
-       uuid_t            dd_uuid;      /* location information */
-} xfs_dqblk_t;
-
-#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
-
-/*
- * flags for q_flags field in the dquot.
- */
-#define XFS_DQ_USER            0x0001          /* a user quota */
-#define XFS_DQ_PROJ            0x0002          /* project quota */
-#define XFS_DQ_GROUP           0x0004          /* a group quota */
-#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
-#define XFS_DQ_FREEING         0x0010          /* dquot is beeing torn down */
-
-#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
-
-#define XFS_DQ_FLAGS \
-       { XFS_DQ_USER,          "USER" }, \
-       { XFS_DQ_PROJ,          "PROJ" }, \
-       { XFS_DQ_GROUP,         "GROUP" }, \
-       { XFS_DQ_DIRTY,         "DIRTY" }, \
-       { XFS_DQ_FREEING,       "FREEING" }
-
-/*
- * We have the possibility of all three quota types being active at once, and
- * hence free space modification requires modification of all three current
- * dquots in a single transaction. For this case we need to have a reservation
- * of at least 3 dquots.
- *
- * However, a chmod operation can change both UID and GID in a single
- * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
- * modified. Hence for this case we need to reserve space for at least 4 dquots.
- *
- * And in the worst case, there's a rename operation that can be modifying up to
- * 4 inodes with dquots attached to them. In reality, the only inodes that can
- * have their dquots modified are the source and destination directory inodes
- * due to directory name creation and removal. That can require space allocation
- * and/or freeing on both directory inodes, and hence all three dquots on each
- * inode can be modified. And if the directories are world writeable, all the
- * dquots can be unique and so 6 dquots can be modified....
- *
- * And, of course, we also need to take into account the dquot log format item
- * used to describe each dquot.
- */
-#define XFS_DQUOT_LOGRES(mp)   \
-       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-
-/*
- * These are the structures used to lay out dquots and quotaoff
- * records on the log. Quite similar to those of inodes.
- */
-
-/*
- * log format struct for dquots.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- */
-typedef struct xfs_dq_logformat {
-       __uint16_t              qlf_type;      /* dquot log item type */
-       __uint16_t              qlf_size;      /* size of this item */
-       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
-       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
-       __int32_t               qlf_len;       /* len of dquot buffer */
-       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
-} xfs_dq_logformat_t;
-
-/*
- * log format struct for QUOTAOFF records.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
- * to the first and ensures that the first logitem is taken out of the AIL
- * only when the last one is securely committed.
- */
-typedef struct xfs_qoff_logformat {
-       unsigned short          qf_type;        /* quotaoff log item type */
-       unsigned short          qf_size;        /* size of this item */
-       unsigned int            qf_flags;       /* USR and/or GRP */
-       char                    qf_pad[12];     /* padding for future */
-} xfs_qoff_logformat_t;
-
-
-/*
- * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
- */
-#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
-#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
-#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
-#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
-#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
-#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
-#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
-
-/*
- * Conversion to and from the combined OQUOTA flag (if necessary)
- * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
- */
-#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
-#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
-#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
-#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
-
-/*
- * Quota Accounting/Enforcement flags
- */
-#define XFS_ALL_QUOTA_ACCT     \
-               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD     \
-               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD     \
-               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
-
-#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
-#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
-#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
-
-/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE   \
-       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+#include "xfs_quota_defs.h"
  
  /*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will be not be switched off as long as that inode lock is held.
+ * Kernel only quota definitions and functions
   */
-#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
-                                                  XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp)   ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
  
-/*
- * Flags to tell various functions what to do. Not all of these are meaningful
- * to a single function. None of these XFS_QMOPT_* flags are meant to have
- * persistent values (ie. their values can and will change between versions)
- */
-#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
-#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
-#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
-#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
-
-/*
- * flags to xfs_trans_mod_dquot to indicate which field needs to be
- * modified.
- */
-#define XFS_QMOPT_RES_REGBLKS  0x0010000
-#define XFS_QMOPT_RES_RTBLKS   0x0020000
-#define XFS_QMOPT_BCOUNT       0x0040000
-#define XFS_QMOPT_ICOUNT       0x0080000
-#define XFS_QMOPT_RTBCOUNT     0x0100000
-#define XFS_QMOPT_DELBCOUNT    0x0200000
-#define XFS_QMOPT_DELRTBCOUNT  0x0400000
-#define XFS_QMOPT_RES_INOS     0x0800000
-
-/*
- * flags for dqalloc.
- */
-#define XFS_QMOPT_INHERIT      0x1000000
-
-/*
- * flags to xfs_trans_mod_dquot.
- */
-#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
-#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
-#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
-#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
-#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
-#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
-#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
-#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
-
-
-#define XFS_QMOPT_QUOTALL      \
-               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
-#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+struct xfs_trans;
  
-#ifdef __KERNEL__
  /*
   * This check is done typically without holding the inode lock;
   * that may seem racy, but it is harmless in the context that it is used.
@@ -301,13 +48,6 @@ typedef struct xfs_qoff_logformat {
          (XFS_IS_PQUOTA_ON(mp) && \
                 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
  
-#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
-                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
-                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
-                                XFS_PQUOTA_CHKD)
-
-
  /*
   * The structure kept inside the xfs_trans_t keep track of dquot changes
   * within a transaction and apply them later.
@@ -340,8 +80,9 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
                 struct xfs_mount *, struct xfs_dquot *,
                 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
  
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-               struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+               prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
+               struct xfs_dquot **);
  extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
                 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
  extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -362,9 +103,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
  
  #else
  static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-               uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
-               struct xfs_dquot **pdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+               prid_t prid, uint flags, struct xfs_dquot **udqp,
+               struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
  {
         *udqp = NULL;
         *gdqp = NULL;
@@ -415,5 +156,4 @@ extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
  
  extern const struct xfs_buf_ops xfs_dquot_buf_ops;
  
-#endif /* __KERNEL__ */
  #endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h

new file mode 100644 (file)

index 0000000..e6b0d6e
--- /dev/null
+++ b/fs/xfs/xfs_quota_defs.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t     xfs_qcnt_t;
+typedef __uint16_t     xfs_qwarncnt_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER            0x0001          /* a user quota */
+#define XFS_DQ_PROJ            0x0002          /* project quota */
+#define XFS_DQ_GROUP           0x0004          /* a group quota */
+#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
+#define XFS_DQ_FREEING         0x0010          /* dquot is beeing torn down */
+
+#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+       { XFS_DQ_USER,          "USER" }, \
+       { XFS_DQ_PROJ,          "PROJ" }, \
+       { XFS_DQ_GROUP,         "GROUP" }, \
+       { XFS_DQ_DIRTY,         "DIRTY" }, \
+       { XFS_DQ_FREEING,       "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chmod operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp)   \
+       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE   \
+       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will be not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+                                                  XFS_GQUOTA_ACTIVE | \
+                                                  XFS_PQUOTA_ACTIVE))
+#define XFS_IS_OQUOTA_ON(mp)   ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
+                                                  XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS  0x0010000
+#define XFS_QMOPT_RES_RTBLKS   0x0020000
+#define XFS_QMOPT_BCOUNT       0x0040000
+#define XFS_QMOPT_ICOUNT       0x0080000
+#define XFS_QMOPT_RTBCOUNT     0x0100000
+#define XFS_QMOPT_DELBCOUNT    0x0200000
+#define XFS_QMOPT_DELRTBCOUNT  0x0400000
+#define XFS_QMOPT_RES_INOS     0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT      0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL      \
+               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c

index 20e30f93b0c7dab8b548527c46634c49486e02cd..1326d81596c2920b27f45021b67b76979e5ef387 100644 (file)
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -16,8 +16,10 @@
   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   */
  #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
  #include "xfs_log.h"
+#include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
  #include "xfs_quota.h"
@@ -53,6 +55,18 @@ xfs_fs_get_xstate(
         return -xfs_qm_scall_getqstat(mp, fqs);
  }
  
+STATIC int
+xfs_fs_get_xstatev(
+       struct super_block      *sb,
+       struct fs_quota_statv   *fqs)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       return -xfs_qm_scall_getqstatv(mp, fqs);
+}
+
  STATIC int
  xfs_fs_set_xstate(
         struct super_block      *sb,
@@ -133,6 +147,7 @@ xfs_fs_set_dqblk(
  }
  
  const struct quotactl_ops xfs_quotactl_operations = {
+       .get_xstatev            = xfs_fs_get_xstatev,
         .get_xstate             = xfs_fs_get_xstate,
         .set_xstate             = xfs_fs_set_xstate,
         .get_dqblk              = xfs_fs_get_dqblk,
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c

deleted file mode 100644 (file)

index 30ff5f4..0000000
--- a/fs/xfs/xfs_rename.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-/*
- * Enter all inodes for a rename transaction into a sorted array.
- */
-STATIC void
-xfs_sort_for_rename(
-       xfs_inode_t     *dp1,   /* in: old (source) directory inode */
-       xfs_inode_t     *dp2,   /* in: new (target) directory inode */
-       xfs_inode_t     *ip1,   /* in: inode of old entry */
-       xfs_inode_t     *ip2,   /* in: inode of new entry, if it
-                                  already exists, NULL otherwise. */
-       xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
-       int             *num_inodes)  /* out: number of inodes in array */
-{
-       xfs_inode_t             *temp;
-       int                     i, j;
-
-       /*
-        * i_tab contains a list of pointers to inodes.  We initialize
-        * the table here & we'll sort it.  We will then use it to
-        * order the acquisition of the inode locks.
-        *
-        * Note that the table may contain duplicates.  e.g., dp1 == dp2.
-        */
-       i_tab[0] = dp1;
-       i_tab[1] = dp2;
-       i_tab[2] = ip1;
-       if (ip2) {
-               *num_inodes = 4;
-               i_tab[3] = ip2;
-       } else {
-               *num_inodes = 3;
-               i_tab[3] = NULL;
-       }
-
-       /*
-        * Sort the elements via bubble sort.  (Remember, there are at
-        * most 4 elements to sort, so this is adequate.)
-        */
-       for (i = 0; i < *num_inodes; i++) {
-               for (j = 1; j < *num_inodes; j++) {
-                       if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
-                               temp = i_tab[j];
-                               i_tab[j] = i_tab[j-1];
-                               i_tab[j-1] = temp;
-                       }
-               }
-       }
-}
-
-/*
- * xfs_rename
- */
-int
-xfs_rename(
-       xfs_inode_t     *src_dp,
-       struct xfs_name *src_name,
-       xfs_inode_t     *src_ip,
-       xfs_inode_t     *target_dp,
-       struct xfs_name *target_name,
-       xfs_inode_t     *target_ip)
-{
-       xfs_trans_t     *tp = NULL;
-       xfs_mount_t     *mp = src_dp->i_mount;
-       int             new_parent;             /* moving to a new dir */
-       int             src_is_directory;       /* src_name is a directory */
-       int             error;
-       xfs_bmap_free_t free_list;
-       xfs_fsblock_t   first_block;
-       int             cancel_flags;
-       int             committed;
-       xfs_inode_t     *inodes[4];
-       int             spaceres;
-       int             num_inodes;
-
-       trace_xfs_rename(src_dp, target_dp, src_name, target_name);
-
-       new_parent = (src_dp != target_dp);
-       src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-
-       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
-                               inodes, &num_inodes);
-
-       xfs_bmap_init(&free_list, &first_block);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
-       if (error == ENOSPC) {
-               spaceres = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
-       }
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               goto std_return;
-       }
-
-       /*
-        * Attach the dquots to the inodes
-        */
-       error = xfs_qm_vop_rename_dqattach(inodes);
-       if (error) {
-               xfs_trans_cancel(tp, cancel_flags);
-               goto std_return;
-       }
-
-       /*
-        * Lock all the participating inodes. Depending upon whether
-        * the target_name exists in the target directory, and
-        * whether the target directory is the same as the source
-        * directory, we can lock from 2 to 4 inodes.
-        */
-       xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
-
-       /*
-        * Join all the inodes to the transaction. From this point on,
-        * we can rely on either trans_commit or trans_cancel to unlock
-        * them.
-        */
-       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
-       if (new_parent)
-               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
-       if (target_ip)
-               xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-
-       /*
-        * If we are using project inheritance, we only allow renames
-        * into our tree when the project IDs are the same; else the
-        * tree quota mechanism would be circumvented.
-        */
-       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                    (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
-               error = XFS_ERROR(EXDEV);
-               goto error_return;
-       }
-
-       /*
-        * Set up the target.
-        */
-       if (target_ip == NULL) {
-               /*
-                * If there's no space reservation, check the entry will
-                * fit before actually inserting it.
-                */
-               error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
-               if (error)
-                       goto error_return;
-               /*
-                * If target does not exist and the rename crosses
-                * directories, adjust the target directory link count
-                * to account for the ".." reference from the new entry.
-                */
-               error = xfs_dir_createname(tp, target_dp, target_name,
-                                               src_ip->i_ino, &first_block,
-                                               &free_list, spaceres);
-               if (error == ENOSPC)
-                       goto error_return;
-               if (error)
-                       goto abort_return;
-
-               xfs_trans_ichgtime(tp, target_dp,
-                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-               if (new_parent && src_is_directory) {
-                       error = xfs_bumplink(tp, target_dp);
-                       if (error)
-                               goto abort_return;
-               }
-       } else { /* target_ip != NULL */
-               /*
-                * If target exists and it's a directory, check that both
-                * target and source are directories and that target can be
-                * destroyed, or that neither is a directory.
-                */
-               if (S_ISDIR(target_ip->i_d.di_mode)) {
-                       /*
-                        * Make sure target dir is empty.
-                        */
-                       if (!(xfs_dir_isempty(target_ip)) ||
-                           (target_ip->i_d.di_nlink > 2)) {
-                               error = XFS_ERROR(EEXIST);
-                               goto error_return;
-                       }
-               }
-
-               /*
-                * Link the source inode under the target name.
-                * If the source inode is a directory and we are moving
-                * it across directories, its ".." entry will be
-                * inconsistent until we replace that down below.
-                *
-                * In case there is already an entry with the same
-                * name at the destination directory, remove it first.
-                */
-               error = xfs_dir_replace(tp, target_dp, target_name,
-                                       src_ip->i_ino,
-                                       &first_block, &free_list, spaceres);
-               if (error)
-                       goto abort_return;
-
-               xfs_trans_ichgtime(tp, target_dp,
-                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-               /*
-                * Decrement the link count on the target since the target
-                * dir no longer points to it.
-                */
-               error = xfs_droplink(tp, target_ip);
-               if (error)
-                       goto abort_return;
-
-               if (src_is_directory) {
-                       /*
-                        * Drop the link from the old "." entry.
-                        */
-                       error = xfs_droplink(tp, target_ip);
-                       if (error)
-                               goto abort_return;
-               }
-       } /* target_ip != NULL */
-
-       /*
-        * Remove the source.
-        */
-       if (new_parent && src_is_directory) {
-               /*
-                * Rewrite the ".." entry to point to the new
-                * directory.
-                */
-               error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
-                                       target_dp->i_ino,
-                                       &first_block, &free_list, spaceres);
-               ASSERT(error != EEXIST);
-               if (error)
-                       goto abort_return;
-       }
-
-       /*
-        * We always want to hit the ctime on the source inode.
-        *
-        * This isn't strictly required by the standards since the source
-        * inode isn't really being changed, but old unix file systems did
-        * it and some incremental backup programs won't work without it.
-        */
-       xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
-
-       /*
-        * Adjust the link count on src_dp.  This is necessary when
-        * renaming a directory, either within one parent when
-        * the target existed, or across two parent directories.
-        */
-       if (src_is_directory && (new_parent || target_ip != NULL)) {
-
-               /*
-                * Decrement link count on src_directory since the
-                * entry that's moved no longer points to it.
-                */
-               error = xfs_droplink(tp, src_dp);
-               if (error)
-                       goto abort_return;
-       }
-
-       error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
-                                       &first_block, &free_list, spaceres);
-       if (error)
-               goto abort_return;
-
-       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-       if (new_parent)
-               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * rename transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT));
-               goto std_return;
-       }
-
-       /*
-        * trans_commit will unlock src_ip, target_ip & decrement
-        * the vnode references.
-        */
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c

index 98dc670d3ee04182da47b27e7db1695b71807434..6f9e63c9fc2617ab89966447083527f1d0c94257 100644 (file)
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,25 +17,24 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_alloc.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_rtalloc.h"
  #include "xfs_fsops.h"
  #include "xfs_error.h"
  #include "xfs_inode_item.h"
  #include "xfs_trans_space.h"
-#include "xfs_utils.h"
  #include "xfs_trace.h"
  #include "xfs_buf.h"
  #include "xfs_icache.h"
@@ -101,10 +100,9 @@ xfs_growfs_rt_alloc(
                 /*
                  * Reserve space & log for one extent added to the file.
                  */
-               if ((error = xfs_trans_reserve(tp, resblks,
-                               XFS_GROWRTALLOC_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_DEFAULT_PERM_LOG_COUNT)))
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+                                         resblks, 0);
+               if (error)
                         goto error_cancel;
                 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
                 /*
@@ -147,8 +145,9 @@ xfs_growfs_rt_alloc(
                         /*
                          * Reserve log for one block zeroing.
                          */
-                       if ((error = xfs_trans_reserve(tp, 0,
-                                       XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+                                                 0, 0);
+                       if (error)
                                 goto error_cancel;
                         /*
                          * Lock the bitmap inode.
@@ -736,8 +735,8 @@ xfs_rtallocate_range(
  {
         xfs_rtblock_t   end;            /* end of the allocated extent */
         int             error;          /* error value */
-       xfs_rtblock_t   postblock;      /* first block allocated > end */
-       xfs_rtblock_t   preblock;       /* first block allocated < start */
+       xfs_rtblock_t   postblock = 0;  /* first block allocated > end */
+       xfs_rtblock_t   preblock = 0;   /* first block allocated < start */
  
         end = start + len - 1;
         /*
@@ -1958,8 +1957,9 @@ xfs_growfs_rt(
                  * Start a transaction, get the log reservation.
                  */
                 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-               if ((error = xfs_trans_reserve(tp, 0,
-                               XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+                                         0, 0);
+               if (error)
                         goto error_cancel;
                 /*
                  * Lock out other callers by grabbing the bitmap inode lock.
@@ -2148,7 +2148,7 @@ xfs_rtfree_extent(
         ASSERT(mp->m_rbmip->i_itemp != NULL);
         ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
  
-#if defined(__KERNEL__) && defined(DEBUG)
+#ifdef DEBUG
         /*
          * Check to see that this whole range is currently allocated.
          */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h

index f7f3a359c1c5a238afd4884b19f4a74363e36287..b2a1a24c0e2f3d8037cdd03f2b8deffc298d38c9 100644 (file)
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -18,58 +18,11 @@
  #ifndef __XFS_RTALLOC_H__
  #define        __XFS_RTALLOC_H__
  
+/* kernel only definitions and functions */
+
  struct xfs_mount;
  struct xfs_trans;
  
-/* Min and max rt extent sizes, specified in bytes */
-#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
-#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
-#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
-
-/*
- * Constants for bit manipulations.
- */
-#define        XFS_NBBYLOG     3               /* log2(NBBY) */
-#define        XFS_WORDLOG     2               /* log2(sizeof(xfs_rtword_t)) */
-#define        XFS_NBWORDLOG   (XFS_NBBYLOG + XFS_WORDLOG)
-#define        XFS_NBWORD      (1 << XFS_NBWORDLOG)
-#define        XFS_WORDMASK    ((1 << XFS_WORDLOG) - 1)
-
-#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
-#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
-#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
-#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
-
-/*
- * Summary and bit manipulation macros.
- */
-#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
-       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define        XFS_SUMPTR(mp,bp,so)    \
-       ((xfs_suminfo_t *)((bp)->b_addr + \
-               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
-#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
-#define        XFS_BITTOWORD(mp,bi)    \
-       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
-#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
-
-#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
-#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
-
-#if XFS_BIG_BLKNOS
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
-#else
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
-#endif
-
-
-#ifdef __KERNEL__
-
  #ifdef CONFIG_XFS_RT
  /*
   * Function prototypes for exported functions.
@@ -161,6 +114,4 @@ xfs_rtmount_init(
  # define xfs_rtunmount_inodes(m)
  #endif /* CONFIG_XFS_RT */
  
-#endif /* __KERNEL__ */
-
  #endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c

new file mode 100644 (file)

index 0000000..a5b59d9
--- /dev/null
+++ b/fs/xfs/xfs_sb.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+static const struct {
+       short offset;
+       short type;     /* 0 = integer
+                        * 1 = binary / string (no translation)
+                        */
+} xfs_sb_info[] = {
+       { offsetof(xfs_sb_t, sb_magicnum),      0 },
+       { offsetof(xfs_sb_t, sb_blocksize),     0 },
+       { offsetof(xfs_sb_t, sb_dblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rextents),      0 },
+       { offsetof(xfs_sb_t, sb_uuid),          1 },
+       { offsetof(xfs_sb_t, sb_logstart),      0 },
+       { offsetof(xfs_sb_t, sb_rootino),       0 },
+       { offsetof(xfs_sb_t, sb_rbmino),        0 },
+       { offsetof(xfs_sb_t, sb_rsumino),       0 },
+       { offsetof(xfs_sb_t, sb_rextsize),      0 },
+       { offsetof(xfs_sb_t, sb_agblocks),      0 },
+       { offsetof(xfs_sb_t, sb_agcount),       0 },
+       { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
+       { offsetof(xfs_sb_t, sb_logblocks),     0 },
+       { offsetof(xfs_sb_t, sb_versionnum),    0 },
+       { offsetof(xfs_sb_t, sb_sectsize),      0 },
+       { offsetof(xfs_sb_t, sb_inodesize),     0 },
+       { offsetof(xfs_sb_t, sb_inopblock),     0 },
+       { offsetof(xfs_sb_t, sb_fname[0]),      1 },
+       { offsetof(xfs_sb_t, sb_blocklog),      0 },
+       { offsetof(xfs_sb_t, sb_sectlog),       0 },
+       { offsetof(xfs_sb_t, sb_inodelog),      0 },
+       { offsetof(xfs_sb_t, sb_inopblog),      0 },
+       { offsetof(xfs_sb_t, sb_agblklog),      0 },
+       { offsetof(xfs_sb_t, sb_rextslog),      0 },
+       { offsetof(xfs_sb_t, sb_inprogress),    0 },
+       { offsetof(xfs_sb_t, sb_imax_pct),      0 },
+       { offsetof(xfs_sb_t, sb_icount),        0 },
+       { offsetof(xfs_sb_t, sb_ifree),         0 },
+       { offsetof(xfs_sb_t, sb_fdblocks),      0 },
+       { offsetof(xfs_sb_t, sb_frextents),     0 },
+       { offsetof(xfs_sb_t, sb_uquotino),      0 },
+       { offsetof(xfs_sb_t, sb_gquotino),      0 },
+       { offsetof(xfs_sb_t, sb_qflags),        0 },
+       { offsetof(xfs_sb_t, sb_flags),         0 },
+       { offsetof(xfs_sb_t, sb_shared_vn),     0 },
+       { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
+       { offsetof(xfs_sb_t, sb_unit),          0 },
+       { offsetof(xfs_sb_t, sb_width),         0 },
+       { offsetof(xfs_sb_t, sb_dirblklog),     0 },
+       { offsetof(xfs_sb_t, sb_logsectlog),    0 },
+       { offsetof(xfs_sb_t, sb_logsectsize),   0 },
+       { offsetof(xfs_sb_t, sb_logsunit),      0 },
+       { offsetof(xfs_sb_t, sb_features2),     0 },
+       { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+       { offsetof(xfs_sb_t, sb_features_compat),       0 },
+       { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
+       { offsetof(xfs_sb_t, sb_features_incompat),     0 },
+       { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+       { offsetof(xfs_sb_t, sb_crc),           0 },
+       { offsetof(xfs_sb_t, sb_pad),           0 },
+       { offsetof(xfs_sb_t, sb_pquotino),      0 },
+       { offsetof(xfs_sb_t, sb_lsn),           0 },
+       { sizeof(xfs_sb_t),                     0 }
+};
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno)
+{
+       struct xfs_perag        *pag;
+       int                     ref = 0;
+
+       rcu_read_lock();
+       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+       if (pag) {
+               ASSERT(atomic_read(&pag->pag_ref) >= 0);
+               ref = atomic_inc_return(&pag->pag_ref);
+       }
+       rcu_read_unlock();
+       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+       return pag;
+}
+
+/*
+ * search from @first to find the next perag with the given tag set.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          first,
+       int                     tag)
+{
+       struct xfs_perag        *pag;
+       int                     found;
+       int                     ref;
+
+       rcu_read_lock();
+       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                       (void **)&pag, first, 1, tag);
+       if (found <= 0) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       ref = atomic_inc_return(&pag->pag_ref);
+       rcu_read_unlock();
+       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+       return pag;
+}
+
+void
+xfs_perag_put(
+       struct xfs_perag        *pag)
+{
+       int     ref;
+
+       ASSERT(atomic_read(&pag->pag_ref) > 0);
+       ref = atomic_dec_return(&pag->pag_ref);
+       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+       xfs_mount_t     *mp,
+       xfs_sb_t        *sbp,
+       bool            check_inprogress,
+       bool            check_version)
+{
+
+       /*
+        * If the log device and data device have the
+        * same device number, the log is internal.
+        * Consequently, the sb_logstart should be non-zero.  If
+        * we have a zero sb_logstart in this case, we may be trying to mount
+        * a volume filesystem in a non-volume manner.
+        */
+       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+               xfs_warn(mp, "bad magic number");
+               return XFS_ERROR(EWRONGFS);
+       }
+
+
+       if (!xfs_sb_good_version(sbp)) {
+               xfs_warn(mp, "bad version");
+               return XFS_ERROR(EWRONGFS);
+       }
+
+       /*
+        * Version 5 superblock feature mask validation. Reject combinations the
+        * kernel cannot support up front before checking anything else. For
+        * write validation, we don't need to check feature masks.
+        */
+       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+               xfs_alert(mp,
+"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
+"Use of these features in this kernel is at your own risk!");
+
+               if (xfs_sb_has_compat_feature(sbp,
+                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+                               (sbp->sb_features_compat &
+                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
+               }
+
+               if (xfs_sb_has_ro_compat_feature(sbp,
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                       xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+                               (sbp->sb_features_ro_compat &
+                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+                               xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+                               return XFS_ERROR(EINVAL);
+                       }
+               }
+               if (xfs_sb_has_incompat_feature(sbp,
+                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+                               (sbp->sb_features_incompat &
+                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+                       return XFS_ERROR(EINVAL);
+               }
+       }
+
+       if (xfs_sb_version_has_pquotino(sbp)) {
+               if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+                       xfs_notice(mp,
+                          "Version 5 of Super block has XFS_OQUOTA bits.\n");
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+       } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+                       xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
+                       return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (unlikely(
+           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an external log; "
+               "specify logdev on the mount command line.");
+               return XFS_ERROR(EINVAL);
+       }
+
+       if (unlikely(
+           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an internal log; "
+               "do not specify logdev on the mount command line.");
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * More sanity checking.  Most of these were stolen directly from
+        * xfs_repair.
+        */
+       if (unlikely(
+           sbp->sb_agcount <= 0                                        ||
+           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
+           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
+           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
+           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
+           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
+           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
+           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
+           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
+           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
+           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
+           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
+           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
+           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
+           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
+           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
+           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
+           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
+           sbp->sb_dblocks == 0                                        ||
+           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
+           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
+               XFS_CORRUPTION_ERROR("SB sanity check failed",
+                               XFS_ERRLEVEL_LOW, mp, sbp);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /*
+        * Until this is fixed only page-sized or smaller data blocks work.
+        */
+       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+               xfs_warn(mp,
+               "File system with blocksize %d bytes. "
+               "Only pagesize (%ld) or less will currently work.",
+                               sbp->sb_blocksize, PAGE_SIZE);
+               return XFS_ERROR(ENOSYS);
+       }
+
+       /*
+        * Currently only very few inode sizes are supported.
+        */
+       switch (sbp->sb_inodesize) {
+       case 256:
+       case 512:
+       case 1024:
+       case 2048:
+               break;
+       default:
+               xfs_warn(mp, "inode size of %d bytes not supported",
+                               sbp->sb_inodesize);
+               return XFS_ERROR(ENOSYS);
+       }
+
+       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+               xfs_warn(mp,
+               "file system too large to be mounted on this system.");
+               return XFS_ERROR(EFBIG);
+       }
+
+       if (check_inprogress && sbp->sb_inprogress) {
+               xfs_warn(mp, "Offline file system operation in progress!");
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /*
+        * Version 1 directory format has never worked on Linux.
+        */
+       if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
+               xfs_warn(mp, "file system using version 1 directory format");
+               return XFS_ERROR(ENOSYS);
+       }
+
+       return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+       /*
+        * older mkfs doesn't initialize quota inodes to NULLFSINO. This
+        * leads to in-core values having two different values for a quota
+        * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+        * NULLFSINO.
+        *
+        * Note that this change affect only the in-core values. These
+        * values are not written back to disk unless any quota information
+        * is written to the disk. Even in that case, sb_pquotino field is
+        * not written to disk unless the superblock supports pquotino.
+        */
+       if (sbp->sb_uquotino == 0)
+               sbp->sb_uquotino = NULLFSINO;
+       if (sbp->sb_gquotino == 0)
+               sbp->sb_gquotino = NULLFSINO;
+       if (sbp->sb_pquotino == 0)
+               sbp->sb_pquotino = NULLFSINO;
+
+       /*
+        * We need to do these manipilations only if we are working
+        * with an older version of on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(sbp))
+               return;
+
+       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+       if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
+               /*
+                * In older version of superblock, on-disk superblock only
+                * has sb_gquotino, and in-core superblock has both sb_gquotino
+                * and sb_pquotino. But, only one of them is supported at any
+                * point of time. So, if PQUOTA is set in disk superblock,
+                * copy over sb_gquotino to sb_pquotino.
+                */
+               sbp->sb_pquotino = sbp->sb_gquotino;
+               sbp->sb_gquotino = NULLFSINO;
+       }
+}
+
+void
+xfs_sb_from_disk(
+       struct xfs_sb   *to,
+       xfs_dsb_t       *from)
+{
+       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+       to->sb_rextents = be64_to_cpu(from->sb_rextents);
+       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+       to->sb_logstart = be64_to_cpu(from->sb_logstart);
+       to->sb_rootino = be64_to_cpu(from->sb_rootino);
+       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+       to->sb_agcount = be32_to_cpu(from->sb_agcount);
+       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+       to->sb_blocklog = from->sb_blocklog;
+       to->sb_sectlog = from->sb_sectlog;
+       to->sb_inodelog = from->sb_inodelog;
+       to->sb_inopblog = from->sb_inopblog;
+       to->sb_agblklog = from->sb_agblklog;
+       to->sb_rextslog = from->sb_rextslog;
+       to->sb_inprogress = from->sb_inprogress;
+       to->sb_imax_pct = from->sb_imax_pct;
+       to->sb_icount = be64_to_cpu(from->sb_icount);
+       to->sb_ifree = be64_to_cpu(from->sb_ifree);
+       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+       to->sb_frextents = be64_to_cpu(from->sb_frextents);
+       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+       to->sb_qflags = be16_to_cpu(from->sb_qflags);
+       to->sb_flags = from->sb_flags;
+       to->sb_shared_vn = from->sb_shared_vn;
+       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+       to->sb_unit = be32_to_cpu(from->sb_unit);
+       to->sb_width = be32_to_cpu(from->sb_width);
+       to->sb_dirblklog = from->sb_dirblklog;
+       to->sb_logsectlog = from->sb_logsectlog;
+       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+       to->sb_features2 = be32_to_cpu(from->sb_features2);
+       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+       to->sb_features_log_incompat =
+                               be32_to_cpu(from->sb_features_log_incompat);
+       to->sb_pad = 0;
+       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+       to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       *fields)
+{
+       __uint16_t      qflags = from->sb_qflags;
+
+       /*
+        * We need to do these manipilations only if we are working
+        * with an older version of on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(from))
+               return;
+
+       if (*fields & XFS_SB_QFLAGS) {
+               /*
+                * The in-core version of sb_qflags do not have
+                * XFS_OQUOTA_* flags, whereas the on-disk version
+                * does.  So, convert incore XFS_{PG}QUOTA_* flags
+                * to on-disk XFS_OQUOTA_* flags.
+                */
+               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+                       qflags |= XFS_OQUOTA_ENFD;
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+                       qflags |= XFS_OQUOTA_CHKD;
+               to->sb_qflags = cpu_to_be16(qflags);
+               *fields &= ~XFS_SB_QFLAGS;
+       }
+
+       /*
+        * GQUOTINO and PQUOTINO cannot be used together in versions
+        * of superblock that do not have pquotino. from->sb_flags
+        * tells us which quota is active and should be copied to
+        * disk.
+        */
+       if ((*fields & XFS_SB_GQUOTINO) &&
+                               (from->sb_qflags & XFS_GQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+       else if ((*fields & XFS_SB_PQUOTINO) &&
+                               (from->sb_qflags & XFS_PQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+
+       *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+
+/*
+ * Copy in core superblock to ondisk one.
+ *
+ * The fields argument is mask of superblock fields to copy.
+ */
+void
+xfs_sb_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       fields)
+{
+       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
+       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
+       xfs_sb_field_t  f;
+       int             first;
+       int             size;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+
+       xfs_sb_quota_to_disk(to, from, &fields);
+       while (fields) {
+               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+               first = xfs_sb_info[f].offset;
+               size = xfs_sb_info[f + 1].offset - first;
+
+               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+               if (size == 1 || xfs_sb_info[f].type == 1) {
+                       memcpy(to_ptr + first, from_ptr + first, size);
+               } else {
+                       switch (size) {
+                       case 2:
+                               *(__be16 *)(to_ptr + first) =
+                                     cpu_to_be16(*(__u16 *)(from_ptr + first));
+                               break;
+                       case 4:
+                               *(__be32 *)(to_ptr + first) =
+                                     cpu_to_be32(*(__u32 *)(from_ptr + first));
+                               break;
+                       case 8:
+                               *(__be64 *)(to_ptr + first) =
+                                     cpu_to_be64(*(__u64 *)(from_ptr + first));
+                               break;
+                       default:
+                               ASSERT(0);
+                       }
+               }
+
+               fields &= ~(1LL << f);
+       }
+}
+
+static int
+xfs_sb_verify(
+       struct xfs_buf  *bp,
+       bool            check_version)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_sb   sb;
+
+       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+       /*
+        * Only check the in progress field for the primary superblock as
+        * mkfs.xfs doesn't clear it from secondary superblocks.
+        */
+       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+                                    check_version);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid.  We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ */
+static void
+xfs_sb_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+       int             error;
+
+       /*
+        * open code the version check to avoid needing to convert the entire
+        * superblock from disk order just to check the version number
+        */
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+                                               XFS_SB_VERSION_5) ||
+            dsb->sb_crc != 0)) {
+
+               if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+                                     offsetof(struct xfs_sb, sb_crc))) {
+                       error = EFSCORRUPTED;
+                       goto out_error;
+               }
+       }
+       error = xfs_sb_verify(bp, true);
+
+out_error:
+       if (error) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+                                    mp, bp->b_addr);
+               xfs_buf_ioerror(bp, error);
+       }
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+
+
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+               /* XFS filesystem, verify noisily! */
+               xfs_sb_read_verify(bp);
+               return;
+       }
+       /* quietly fail */
+       xfs_buf_ioerror(bp, EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       int                     error;
+
+       error = xfs_sb_verify(bp, false);
+       if (error) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+                                    mp, bp->b_addr);
+               xfs_buf_ioerror(bp, error);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+                        offsetof(struct xfs_sb, sb_crc));
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+       .verify_read = xfs_sb_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+       .verify_read = xfs_sb_quiet_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_sb_mount_common(
+       struct xfs_mount *mp,
+       struct xfs_sb   *sbp)
+{
+       mp->m_agfrotor = mp->m_agirotor = 0;
+       spin_lock_init(&mp->m_agirotor_lock);
+       mp->m_maxagi = mp->m_sb.sb_agcount;
+       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+       mp->m_blockmask = sbp->sb_blocksize - 1;
+       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+       mp->m_blockwmask = mp->m_blockwsize - 1;
+
+       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+                                       sbp->sb_inopblock);
+       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+       struct xfs_mount *mp,
+       xfs_agnumber_t  agcount)
+{
+       xfs_agnumber_t  index;
+       xfs_perag_t     *pag;
+       xfs_sb_t        *sbp = &mp->m_sb;
+       uint64_t        ifree = 0;
+       uint64_t        ialloc = 0;
+       uint64_t        bfree = 0;
+       uint64_t        bfreelst = 0;
+       uint64_t        btree = 0;
+       int             error;
+
+       for (index = 0; index < agcount; index++) {
+               /*
+                * read the agf, then the agi. This gets us
+                * all the information we need and populates the
+                * per-ag structures for us.
+                */
+               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+               if (error)
+                       return error;
+
+               error = xfs_ialloc_pagi_init(mp, NULL, index);
+               if (error)
+                       return error;
+               pag = xfs_perag_get(mp, index);
+               ifree += pag->pagi_freecount;
+               ialloc += pag->pagi_count;
+               bfree += pag->pagf_freeblks;
+               bfreelst += pag->pagf_flcount;
+               btree += pag->pagf_btreeblks;
+               xfs_perag_put(pag);
+       }
+       /*
+        * Overwrite incore superblock counters with just-read data
+        */
+       spin_lock(&mp->m_sb_lock);
+       sbp->sb_ifree = ifree;
+       sbp->sb_icount = ialloc;
+       sbp->sb_fdblocks = bfree + bfreelst + btree;
+       spin_unlock(&mp->m_sb_lock);
+
+       /* Fixup the per-cpu counters as well. */
+       xfs_icsb_reinit_counters(mp);
+
+       return 0;
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+       xfs_buf_t       *bp;
+       int             first;
+       int             last;
+       xfs_mount_t     *mp;
+       xfs_sb_field_t  f;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+       mp = tp->t_mountp;
+       bp = xfs_trans_getsb(tp, mp, 0);
+       first = sizeof(xfs_sb_t);
+       last = 0;
+
+       /* translate/copy */
+
+       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+
+       /* find modified range */
+       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       last = xfs_sb_info[f + 1].offset - 1;
+
+       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       first = xfs_sb_info[f].offset;
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+       xfs_trans_log_buf(tp, bp, first, last);
+}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h

index 78f9e70b80c7da8a64b92d528a9f7aa2ba49401e..6835b44f850e58e780c2712e24530dbb3e57f5f4 100644 (file)
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -26,6 +26,7 @@
  
  struct xfs_buf;
  struct xfs_mount;
+struct xfs_trans;
  
  #define        XFS_SB_MAGIC            0x58465342      /* 'XFSB' */
  #define        XFS_SB_VERSION_1        1               /* 5.3, 6.0.1, 6.1 */
@@ -83,11 +84,13 @@ struct xfs_mount;
  #define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
  #define XFS_SB_VERSION2_PROJID32BIT    0x00000080      /* 32 bit project id */
  #define XFS_SB_VERSION2_CRCBIT         0x00000100      /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE          0x00000200      /* inode type in dir */
  
  #define        XFS_SB_VERSION2_OKREALFBITS     \
         (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
          XFS_SB_VERSION2_ATTR2BIT       | \
-        XFS_SB_VERSION2_PROJID32BIT)
+        XFS_SB_VERSION2_PROJID32BIT    | \
+        XFS_SB_VERSION2_FTYPE)
  #define        XFS_SB_VERSION2_OKSASHFBITS     \
         (0)
  #define XFS_SB_VERSION2_OKREALBITS     \
@@ -354,15 +357,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
                      (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
                         return 0;
  
-#ifdef __KERNEL__
                 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
                         return 0;
-#else
-               if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
-                   sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
-                       return 0;
-#endif
-
                 return 1;
         }
         if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
@@ -554,12 +550,13 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
                 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
  }
  
-static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
  {
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+       sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+       sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
  }
  
-
  /*
   * Extended v5 superblock feature masks. These are to be used for new v5
   * superblock features only.
@@ -598,7 +595,10 @@ xfs_sb_has_ro_compat_feature(
         return (sbp->sb_features_ro_compat & feature) != 0;
  }
  
-#define XFS_SB_FEAT_INCOMPAT_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+               (XFS_SB_FEAT_INCOMPAT_FTYPE)
+
  #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
  static inline bool
  xfs_sb_has_incompat_feature(
@@ -618,16 +618,39 @@ xfs_sb_has_incompat_log_feature(
         return (sbp->sb_features_log_incompat & feature) != 0;
  }
  
-static inline bool
-xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
  {
-       return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+                (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
  }
  
  /*
   * end of superblock version macros
   */
  
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+       return (ino == sbp->sb_uquotino ||
+               ino == sbp->sb_gquotino ||
+               ino == sbp->sb_pquotino);
+}
+
  #define XFS_SB_DADDR           ((xfs_daddr_t)0) /* daddr in filesystem/ag */
  #define        XFS_SB_BLOCK(mp)        XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
  #define XFS_BUF_TO_SBP(bp)     ((xfs_dsb_t *)((bp)->b_addr))
@@ -660,4 +683,23 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
  #define XFS_B_TO_FSBT(mp,b)    (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
  #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
  
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+                                          int tag);
+extern void    xfs_perag_put(struct xfs_perag *pag);
+extern int     xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+
+extern void    xfs_sb_calc_crc(struct xfs_buf  *);
+extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void    xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void    xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+
  #endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 1d68ffcdeaa7f555ab77ee05f5c13571c6ee3b4d..979a77d4b87d1142c12b4b1cabcf41f75777041a 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,12 +17,12 @@
   */
  
  #include "xfs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_inum.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_alloc.h"
  #include "xfs_quota.h"
  #include "xfs_mount.h"
@@ -40,12 +40,12 @@
  #include "xfs_fsops.h"
  #include "xfs_attr.h"
  #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
  #include "xfs_log_priv.h"
  #include "xfs_trans_priv.h"
  #include "xfs_filestream.h"
  #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_extfree_item.h"
  #include "xfs_mru_cache.h"
  #include "xfs_inode_item.h"
@@ -421,12 +421,6 @@ xfs_parseargs(
         }
  #endif
  
-       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
-           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
-               xfs_warn(mp, "cannot mount with both project and group quota");
-               return EINVAL;
-       }
-
         if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
                 xfs_warn(mp, "sunit and swidth must be specified together");
                 return EINVAL;
@@ -556,14 +550,13 @@ xfs_showargs(
         else if (mp->m_qflags & XFS_UQUOTA_ACCT)
                 seq_puts(m, "," MNTOPT_UQUOTANOENF);
  
-       /* Either project or group quotas can be active, not both */
-
         if (mp->m_qflags & XFS_PQUOTA_ACCT) {
                 if (mp->m_qflags & XFS_PQUOTA_ENFD)
                         seq_puts(m, "," MNTOPT_PRJQUOTA);
                 else
                         seq_puts(m, "," MNTOPT_PQUOTANOENF);
-       } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+       }
+       if (mp->m_qflags & XFS_GQUOTA_ACCT) {
                 if (mp->m_qflags & XFS_GQUOTA_ENFD)
                         seq_puts(m, "," MNTOPT_GRPQUOTA);
                 else
@@ -870,17 +863,17 @@ xfs_init_mount_workqueues(
                 goto out_destroy_unwritten;
  
         mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
         if (!mp->m_reclaim_workqueue)
                 goto out_destroy_cil;
  
         mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
         if (!mp->m_log_workqueue)
                 goto out_destroy_reclaim;
  
         mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
         if (!mp->m_eofblocks_workqueue)
                 goto out_destroy_log;
  
@@ -1396,6 +1389,14 @@ xfs_finish_flags(
                 return XFS_ERROR(EROFS);
         }
  
+       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
+           !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+               xfs_warn(mp,
+                 "Super block does not support project and group quota together");
+               return XFS_ERROR(EINVAL);
+       }
+
         return 0;
  }
  
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c

index f4895b662fcb549706881a4dd65a8b048d23d03e..2f2a7c005be2d32219fd9c580bb2050f2f4e0050 100644 (file)
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -18,200 +18,29 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_bit.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
-#include "xfs_dir2.h"
  #include "xfs_mount.h"
  #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_ialloc_btree.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
  #include "xfs_ialloc.h"
  #include "xfs_alloc.h"
  #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
  #include "xfs_error.h"
  #include "xfs_quota.h"
-#include "xfs_utils.h"
  #include "xfs_trans_space.h"
-#include "xfs_log_priv.h"
  #include "xfs_trace.h"
  #include "xfs_symlink.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-
-
-/*
- * Each contiguous block has a header, so it is not just a simple pathlen
- * to FSB conversion.
- */
-int
-xfs_symlink_blocks(
-       struct xfs_mount *mp,
-       int             pathlen)
-{
-       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-
-       return (pathlen + buflen - 1) / buflen;
-}
-
-static int
-xfs_symlink_hdr_set(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return 0;
-
-       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
-       dsl->sl_offset = cpu_to_be32(offset);
-       dsl->sl_bytes = cpu_to_be32(size);
-       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
-       dsl->sl_owner = cpu_to_be64(ino);
-       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       return sizeof(struct xfs_dsymlink_hdr);
-}
-
-/*
- * Checking of the symlink header is split into two parts. the verifier does
- * CRC, location and bounds checking, the unpacking function checks the path
- * parameters and owner.
- */
-bool
-xfs_symlink_hdr_ok(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (offset != be32_to_cpu(dsl->sl_offset))
-               return false;
-       if (size != be32_to_cpu(dsl->sl_bytes))
-               return false;
-       if (ino != be64_to_cpu(dsl->sl_owner))
-               return false;
-
-       /* ok */
-       return true;
-}
-
-static bool
-xfs_symlink_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
-               return false;
-       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
-               return false;
-       if (be32_to_cpu(dsl->sl_offset) +
-                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
-               return false;
-       if (dsl->sl_owner == 0)
-               return false;
-
-       return true;
-}
-
-static void
-xfs_symlink_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-                                 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
-           !xfs_symlink_verify(bp)) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-       }
-}
-
-static void
-xfs_symlink_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_symlink_verify(bp)) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               return;
-       }
-
-       if (bip) {
-               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       }
-       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-                        offsetof(struct xfs_dsymlink_hdr, sl_crc));
-}
-
-const struct xfs_buf_ops xfs_symlink_buf_ops = {
-       .verify_read = xfs_symlink_read_verify,
-       .verify_write = xfs_symlink_write_verify,
-};
-
-void
-xfs_symlink_local_to_remote(
-       struct xfs_trans        *tp,
-       struct xfs_buf          *bp,
-       struct xfs_inode        *ip,
-       struct xfs_ifork        *ifp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       char                    *buf;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
-               bp->b_ops = NULL;
-               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-               return;
-       }
-
-       /*
-        * As this symlink fits in an inode literal area, it must also fit in
-        * the smallest buffer the filesystem supports.
-        */
-       ASSERT(BBTOB(bp->b_length) >=
-                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
-
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       buf = bp->b_addr;
-       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
-       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
-}
  
  /* ----- Kernel only functions below ----- */
  STATIC int
@@ -386,8 +215,11 @@ xfs_symlink(
         /*
          * Make sure that we have allocated dquot(s) on disk.
          */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-               XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
+       error = xfs_qm_vop_dqalloc(dp,
+                       xfs_kuid_to_uid(current_fsuid()),
+                       xfs_kgid_to_gid(current_fsgid()), prid,
+                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                       &udqp, &gdqp, &pdqp);
         if (error)
                 goto std_return;
  
@@ -402,12 +234,10 @@ xfs_symlink(
         else
                 fs_blocks = xfs_symlink_blocks(mp, pathlen);
         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-       error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
         if (error == ENOSPC && fs_blocks == 0) {
                 resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
         }
         if (error) {
                 cancel_flags = 0;
@@ -710,8 +540,8 @@ xfs_inactive_symlink_rmt(
          * Put an itruncate log reservation in the new transaction
          * for our caller.
          */
-       if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       if (error) {
                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
                 goto error0;
         }
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h

index 374394880c01e4d8db8dc36d6468e748295c6f29..99338ba666ac68c11350fb1b24906364c7a6de01 100644 (file)
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -17,50 +17,11 @@
  #ifndef __XFS_SYMLINK_H
  #define __XFS_SYMLINK_H 1
  
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_inode;
-struct xfs_buf;
-struct xfs_ifork;
-struct xfs_name;
-
-#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
-
-struct xfs_dsymlink_hdr {
-       __be32  sl_magic;
-       __be32  sl_offset;
-       __be32  sl_bytes;
-       __be32  sl_crc;
-       uuid_t  sl_uuid;
-       __be64  sl_owner;
-       __be64  sl_blkno;
-       __be64  sl_lsn;
-};
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 3 extents back from
- * bmapi when crc headers are taken into account.
- */
-#define XFS_SYMLINK_MAPS 3
-
-#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
-       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
-                       sizeof(struct xfs_dsymlink_hdr) : 0))
-
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-                                struct xfs_inode *ip, struct xfs_ifork *ifp);
-
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-
-#ifdef __KERNEL__
+/* Kernel only symlink defintions */
  
  int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
                 const char *target_path, umode_t mode, struct xfs_inode **ipp);
  int xfs_readlink(struct xfs_inode *ip, char *link);
  int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
  
-#endif /* __KERNEL__ */
  #endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c

new file mode 100644 (file)

index 0000000..01c85e3
--- /dev/null
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+       struct xfs_mount *mp,
+       int             pathlen)
+{
+       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+       return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return 0;
+
+       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+       dsl->sl_offset = cpu_to_be32(offset);
+       dsl->sl_bytes = cpu_to_be32(size);
+       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+       dsl->sl_owner = cpu_to_be64(ino);
+       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (offset != be32_to_cpu(dsl->sl_offset))
+               return false;
+       if (size != be32_to_cpu(dsl->sl_bytes))
+               return false;
+       if (ino != be64_to_cpu(dsl->sl_owner))
+               return false;
+
+       /* ok */
+       return true;
+}
+
+static bool
+xfs_symlink_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+               return false;
+       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+               return false;
+       if (be32_to_cpu(dsl->sl_offset) +
+                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+               return false;
+       if (dsl->sl_owner == 0)
+               return false;
+
+       return true;
+}
+
+static void
+xfs_symlink_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+                                 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
+           !xfs_symlink_verify(bp)) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_symlink_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_symlink_verify(bp)) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+               return;
+       }
+
+       if (bip) {
+               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       }
+       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+                        offsetof(struct xfs_dsymlink_hdr, sl_crc));
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+       .verify_read = xfs_symlink_read_verify,
+       .verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       struct xfs_inode        *ip,
+       struct xfs_ifork        *ifp)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       char                    *buf;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+               bp->b_ops = NULL;
+               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+               return;
+       }
+
+       /*
+        * As this symlink fits in an inode literal area, it must also fit in
+        * the smallest buffer the filesystem supports.
+        */
+       ASSERT(BBTOB(bp->b_length) >=
+                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       buf = bp->b_addr;
+       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c

index b6e3897c1d9f0bbeb34f3b1e610e58217ee005f4..5d7b3e40705ffe4a96c75493d09ac74d9ae265cd 100644 (file)
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,6 +18,7 @@
  #include "xfs.h"
  #include "xfs_fs.h"
  #include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c

index 35a229981354159add4b143aea86682c22966a7f..5411e01ab4527318b187846fa3e7a53ad6300eb3 100644 (file)
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,7 +18,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
@@ -48,629 +48,6 @@
  kmem_zone_t    *xfs_trans_zone;
  kmem_zone_t    *xfs_log_item_desc_zone;
  
-/*
- * A buffer has a format structure overhead in the log in addition
- * to the data, so we need to take this into account when reserving
- * space in a transaction for a buffer.  Round the space required up
- * to a multiple of 128 bytes so that we don't change the historical
- * reservation that has been used for this overhead.
- */
-STATIC uint
-xfs_buf_log_overhead(void)
-{
-       return round_up(sizeof(struct xlog_op_header) +
-                       sizeof(struct xfs_buf_log_format), 128);
-}
-
-/*
- * Calculate out transaction log reservation per item in bytes.
- *
- * The nbufs argument is used to indicate the number of items that
- * will be changed in a transaction.  size is used to tell how many
- * bytes should be reserved per item.
- */
-STATIC uint
-xfs_calc_buf_res(
-       uint            nbufs,
-       uint            size)
-{
-       return nbufs * (size + xfs_buf_log_overhead());
-}
-
-/*
- * Various log reservation values.
- *
- * These are based on the size of the file system block because that is what
- * most transactions manipulate.  Each adds in an additional 128 bytes per
- * item logged to try to account for the overhead of the transaction mechanism.
- *
- * Note:  Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
- * This is because the number in the worst case is quite high and quite
- * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
- * extents in only a single AG at a time.  This will require changes to the
- * EFI code as well, however, so that the EFI for the extents not freed is
- * logged again in each transaction.  See SGI PV #261917.
- *
- * Reservation functions here avoid a huge stack in xfs_trans_init due to
- * register overflow from temporaries in the calculations.
- */
-
-
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_write_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_itruncate_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(5, 0) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                    XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
-                                    mp->m_in_maxlevels, 0)));
-}
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *     of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_rename_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_link_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_remove_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For create, break it in to the two cases that the transaction
- * covers. We start with the modify case - allocation done by modification
- * of the state of existing inodes - and the allocation case.
- */
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- */
-STATIC uint
-xfs_calc_create_resv_modify(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               (uint)XFS_FSB_TO_B(mp, 1) +
-               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * For create we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_create_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-__xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_create_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-/*
- * For icreate we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_icreate_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_icreate_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-STATIC uint
-xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return xfs_calc_icreate_reservation(mp);
-       return __xfs_calc_create_reservation(mp);
-
-}
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-STATIC uint
-xfs_calc_mkdir_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp);
-}
-
-
-/*
- * Making a new symplink is the same as creating a new file, but
- * with the added blocks for remote symlink data which can be up to 1kB in
- * length (MAXPATHLEN).
- */
-STATIC uint
-xfs_calc_symlink_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp) +
-              xfs_calc_buf_res(1, MAXPATHLEN);
-}
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_ifree_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-               MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
-                   XFS_INODE_CLUSTER_SIZE(mp)) +
-               xfs_calc_buf_res(1, 0) +
-               xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
-                                mp->m_in_maxlevels, 0) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-STATIC uint
-xfs_calc_ichange_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               mp->m_sb.sb_inodesize +
-               mp->m_sb.sb_sectsize +
-               512;
-
-}
-
-/*
- * Growing the data section of the filesystem.
- *     superblock
- *     agi and agf
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_growdata_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *     superblock: sector size
- *     agf of the ag from which the extent is allocated: sector size
- *     bmap btree for bitmap/summary inode: max depth * blocksize
- *     bitmap/summary inode: inode size
- *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-STATIC uint
-xfs_calc_growrtalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *     one bitmap/summary block: blocksize
- */
-STATIC uint
-xfs_calc_growrtzero_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *     superblock: sector size
- *     bitmap inode: inode size
- *     summary inode: inode size
- *     one bitmap block: blocksize
- *     summary blocks: new summary size
- */
-STATIC uint
-xfs_calc_growrtfree_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
-               xfs_calc_buf_res(1, mp->m_rsumsize);
-}
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *     inode
- */
-STATIC uint
-xfs_calc_swrite_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
-}
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *     inode
- */
-STATIC uint
-xfs_calc_writeid_reservation(xfs_mount_t *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
-}
-
-/*
- * Converting the inode from non-attributed to attributed.
- *     the inode being converted: inode size
- *     agf block and superblock (for block allocation)
- *     the new block (directory sized)
- *     bmap blocks for the new directory block
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_addafork_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, mp->m_dirblksize) +
-               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrinval_reservation(
-       struct xfs_mount        *mp)
-{
-       return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                    XFS_FSB_TO_B(mp, 1))),
-                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                    XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Setting an attribute at mount time.
- *     the inode getting the attribute
- *     the superblock for allocations
- *     the agfs extents are allocated from
- *     the attribute btree * max depth
- *     the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime(see
- * below).
- */
-STATIC uint
-xfs_calc_attrsetm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Setting an attribute at runtime, transaction space unit per block.
- *     the superblock for allocations: sector size
- *     the inode bmap btree could join or split: max depth * block size
- * Since the runtime attribute transaction space is dependent on the total
- * blocks needed for the 1st bmap, here we calculate out the space unit for
- * one block so that the caller could figure out the total space according
- * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
- */
-STATIC uint
-xfs_calc_attrsetrt_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrrm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    (uint)XFS_FSB_TO_B(mp,
-                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-STATIC uint
-xfs_calc_clear_agi_bucket_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Clearing the quotaflags in the superblock.
- *     the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Adjusting quota limits.
- *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
- */
-STATIC uint
-xfs_calc_qm_setqlim_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
-}
-
-/*
- * Allocating quota on disk if needed.
- *     the write transaction log space: XFS_WRITE_LOG_RES(mp)
- *     the unit of quota allocation: one system block size
- */
-STATIC uint
-xfs_calc_qm_dqalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_WRITE_LOG_RES(mp) +
-               xfs_calc_buf_res(1,
-                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
-}
-
-/*
- * Turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- *    the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2 +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2;
-}
-
-/*
- * Syncing the incore super block changes to disk.
- *     the super block to reflect the changes: sector size
- */
-STATIC uint
-xfs_calc_sb_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
  /*
   * Initialize the precomputed transaction reservation values
   * in the mount structure.
@@ -679,36 +56,7 @@ void
  xfs_trans_init(
         struct xfs_mount        *mp)
  {
-       struct xfs_trans_reservations *resp = &mp->m_reservations;
-
-       resp->tr_write = xfs_calc_write_reservation(mp);
-       resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
-       resp->tr_rename = xfs_calc_rename_reservation(mp);
-       resp->tr_link = xfs_calc_link_reservation(mp);
-       resp->tr_remove = xfs_calc_remove_reservation(mp);
-       resp->tr_symlink = xfs_calc_symlink_reservation(mp);
-       resp->tr_create = xfs_calc_create_reservation(mp);
-       resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
-       resp->tr_ifree = xfs_calc_ifree_reservation(mp);
-       resp->tr_ichange = xfs_calc_ichange_reservation(mp);
-       resp->tr_growdata = xfs_calc_growdata_reservation(mp);
-       resp->tr_swrite = xfs_calc_swrite_reservation(mp);
-       resp->tr_writeid = xfs_calc_writeid_reservation(mp);
-       resp->tr_addafork = xfs_calc_addafork_reservation(mp);
-       resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
-       resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
-       resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
-       resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
-       resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
-       resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
-       resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
-       resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
-       resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
-       resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
-       resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
-       resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
-       resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
-       resp->tr_sb = xfs_calc_sb_reservation(mp);
+       xfs_trans_resv_calc(mp, M_RES(mp));
  }
  
  /*
@@ -744,7 +92,7 @@ _xfs_trans_alloc(
         atomic_inc(&mp->m_active_trans);
  
         tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-       tp->t_magic = XFS_TRANS_MAGIC;
+       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
         tp->t_type = type;
         tp->t_mountp = mp;
         INIT_LIST_HEAD(&tp->t_items);
@@ -789,7 +137,7 @@ xfs_trans_dup(
         /*
          * Initialize the new transaction structure.
          */
-       ntp->t_magic = XFS_TRANS_MAGIC;
+       ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
         ntp->t_type = tp->t_type;
         ntp->t_mountp = tp->t_mountp;
         INIT_LIST_HEAD(&ntp->t_items);
@@ -832,12 +180,10 @@ xfs_trans_dup(
   */
  int
  xfs_trans_reserve(
-       xfs_trans_t     *tp,
-       uint            blocks,
-       uint            logspace,
-       uint            rtextents,
-       uint            flags,
-       uint            logcount)
+       struct xfs_trans        *tp,
+       struct xfs_trans_res    *resp,
+       uint                    blocks,
+       uint                    rtextents)
  {
         int             error = 0;
         int             rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -863,13 +209,15 @@ xfs_trans_reserve(
         /*
          * Reserve the log space needed for this transaction.
          */
-       if (logspace > 0) {
+       if (resp->tr_logres > 0) {
                 bool    permanent = false;
  
-               ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
-               ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+               ASSERT(tp->t_log_res == 0 ||
+                      tp->t_log_res == resp->tr_logres);
+               ASSERT(tp->t_log_count == 0 ||
+                      tp->t_log_count == resp->tr_logcount);
  
-               if (flags & XFS_TRANS_PERM_LOG_RES) {
+               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
                         tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
                         permanent = true;
                 } else {
@@ -878,20 +226,21 @@ xfs_trans_reserve(
                 }
  
                 if (tp->t_ticket != NULL) {
-                       ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+                       ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
                         error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
                 } else {
-                       error = xfs_log_reserve(tp->t_mountp, logspace,
-                                               logcount, &tp->t_ticket,
-                                               XFS_TRANSACTION, permanent,
-                                               tp->t_type);
+                       error = xfs_log_reserve(tp->t_mountp,
+                                               resp->tr_logres,
+                                               resp->tr_logcount,
+                                               &tp->t_ticket, XFS_TRANSACTION,
+                                               permanent, tp->t_type);
                 }
  
                 if (error)
                         goto undo_blocks;
  
-               tp->t_log_res = logspace;
-               tp->t_log_count = logcount;
+               tp->t_log_res = resp->tr_logres;
+               tp->t_log_count = resp->tr_logcount;
         }
  
         /*
@@ -916,10 +265,10 @@ xfs_trans_reserve(
          * reservations which have already been performed.
          */
  undo_log:
-       if (logspace > 0) {
+       if (resp->tr_logres > 0) {
                 int             log_flags;
  
-               if (flags & XFS_TRANS_PERM_LOG_RES) {
+               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
                         log_flags = XFS_LOG_REL_PERM_RESERV;
                 } else {
                         log_flags = 0;
@@ -1367,10 +716,10 @@ xfs_trans_free_items(
                 lip->li_desc = NULL;
  
                 if (commit_lsn != NULLCOMMITLSN)
-                       IOP_COMMITTING(lip, commit_lsn);
+                       lip->li_ops->iop_committing(lip, commit_lsn);
                 if (flags & XFS_TRANS_ABORT)
                         lip->li_flags |= XFS_LI_ABORTED;
-               IOP_UNLOCK(lip);
+               lip->li_ops->iop_unlock(lip);
  
                 xfs_trans_free_item_desc(lidp);
         }
@@ -1390,8 +739,11 @@ xfs_log_item_batch_insert(
         /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
         xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
  
-       for (i = 0; i < nr_items; i++)
-               IOP_UNPIN(log_items[i], 0);
+       for (i = 0; i < nr_items; i++) {
+               struct xfs_log_item *lip = log_items[i];
+
+               lip->li_ops->iop_unpin(lip, 0);
+       }
  }
  
  /*
@@ -1401,11 +753,11 @@ xfs_log_item_batch_insert(
   *
   * If we are called with the aborted flag set, it is because a log write during
   * a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
+ * checkpoint have already gone through iop_commited and iop_unlock, which
   * means that checkpoint commit abort handling is treated exactly the same
   * as an iclog write error even though we haven't started any IO yet. Hence in
- * this case all we need to do is IOP_COMMITTED processing, followed by an
- * IOP_UNPIN(aborted) call.
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
   *
   * The AIL cursor is used to optimise the insert process. If commit_lsn is not
   * at the end of the AIL, the insert cursor avoids the need to walk
@@ -1438,7 +790,7 @@ xfs_trans_committed_bulk(
  
                 if (aborted)
                         lip->li_flags |= XFS_LI_ABORTED;
-               item_lsn = IOP_COMMITTED(lip, commit_lsn);
+               item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
  
                 /* item_lsn of -1 means the item needs no further processing */
                 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -1450,7 +802,7 @@ xfs_trans_committed_bulk(
                  */
                 if (aborted) {
                         ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
-                       IOP_UNPIN(lip, 1);
+                       lip->li_ops->iop_unpin(lip, 1);
                         continue;
                 }
  
@@ -1468,7 +820,7 @@ xfs_trans_committed_bulk(
                                 xfs_trans_ail_update(ailp, lip, item_lsn);
                         else
                                 spin_unlock(&ailp->xa_lock);
-                       IOP_UNPIN(lip, 0);
+                       lip->li_ops->iop_unpin(lip, 0);
                         continue;
                 }
  
@@ -1666,7 +1018,7 @@ xfs_trans_roll(
         struct xfs_inode        *dp)
  {
         struct xfs_trans        *trans;
-       unsigned int            logres, count;
+       struct xfs_trans_res    tres;
         int                     error;
  
         /*
@@ -1678,8 +1030,8 @@ xfs_trans_roll(
         /*
          * Copy the critical parameters from one trans to the next.
          */
-       logres = trans->t_log_res;
-       count = trans->t_log_count;
+       tres.tr_logres = trans->t_log_res;
+       tres.tr_logcount = trans->t_log_count;
         *tpp = xfs_trans_dup(trans);
  
         /*
@@ -1710,8 +1062,8 @@ xfs_trans_roll(
          * across this call, or that anything that is locked be logged in
          * the prior and the next transactions.
          */
-       error = xfs_trans_reserve(trans, 0, logres, 0,
-                                 XFS_TRANS_PERM_LOG_RES, count);
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(trans, &tres, 0, 0);
         /*
          *  Ensure that the inode is in the new transaction and locked.
          */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index 2b4946393e30f56655e55c782813778760846f79..09cf40b89e8c1d85817cc649b22a12c3ba97aa78 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -20,285 +20,9 @@
  
  struct xfs_log_item;
  
-/*
- * This is the structure written in the log at the head of
- * every transaction. It identifies the type and id of the
- * transaction, and contains the number of items logged by
- * the transaction so we know how many to expect during recovery.
- *
- * Do not change the below structure without redoing the code in
- * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
- */
-typedef struct xfs_trans_header {
-       uint            th_magic;               /* magic number */
-       uint            th_type;                /* transaction type */
-       __int32_t       th_tid;                 /* transaction id (unused) */
-       uint            th_num_items;           /* num items logged by trans */
-} xfs_trans_header_t;
-
-#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
-
-/*
- * Log item types.
- */
-#define        XFS_LI_EFI              0x1236
-#define        XFS_LI_EFD              0x1237
-#define        XFS_LI_IUNLINK          0x1238
-#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
-#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
-#define        XFS_LI_DQUOT            0x123d
-#define        XFS_LI_QUOTAOFF         0x123e
-#define        XFS_LI_ICREATE          0x123f
-
-#define XFS_LI_TYPE_DESC \
-       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
-       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
-       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
-       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
-       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
-       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
-       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }
-
-/*
- * Transaction types.  Used to distinguish types of buffers.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE     1
-#define XFS_TRANS_SETATTR_SIZE         2
-#define XFS_TRANS_INACTIVE             3
-#define XFS_TRANS_CREATE               4
-#define XFS_TRANS_CREATE_TRUNC         5
-#define XFS_TRANS_TRUNCATE_FILE                6
-#define XFS_TRANS_REMOVE               7
-#define XFS_TRANS_LINK                 8
-#define XFS_TRANS_RENAME               9
-#define XFS_TRANS_MKDIR                        10
-#define XFS_TRANS_RMDIR                        11
-#define XFS_TRANS_SYMLINK              12
-#define XFS_TRANS_SET_DMATTRS          13
-#define XFS_TRANS_GROWFS               14
-#define XFS_TRANS_STRAT_WRITE          15
-#define XFS_TRANS_DIOSTRAT             16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define        XFS_TRANS_WRITEID               18
-#define        XFS_TRANS_ADDAFORK              19
-#define        XFS_TRANS_ATTRINVAL             20
-#define        XFS_TRANS_ATRUNCATE             21
-#define        XFS_TRANS_ATTR_SET              22
-#define        XFS_TRANS_ATTR_RM               23
-#define        XFS_TRANS_ATTR_FLAG             24
-#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_QM_SBCHANGE          26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1               27
-#define XFS_TRANS_DUMMY2               28
-#define XFS_TRANS_QM_QUOTAOFF          29
-#define XFS_TRANS_QM_DQALLOC           30
-#define XFS_TRANS_QM_SETQLIM           31
-#define XFS_TRANS_QM_DQCLUSTER         32
-#define XFS_TRANS_QM_QINOCREATE                33
-#define XFS_TRANS_QM_QUOTAOFF_END      34
-#define XFS_TRANS_SB_UNIT              35
-#define XFS_TRANS_FSYNC_TS             36
-#define        XFS_TRANS_GROWFSRT_ALLOC        37
-#define        XFS_TRANS_GROWFSRT_ZERO         38
-#define        XFS_TRANS_GROWFSRT_FREE         39
-#define        XFS_TRANS_SWAPEXT               40
-#define        XFS_TRANS_SB_COUNT              41
-#define        XFS_TRANS_CHECKPOINT            42
-#define        XFS_TRANS_ICREATE               43
-#define        XFS_TRANS_TYPE_MAX              43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-       { XFS_TRANS_CREATE,             "CREATE" }, \
-       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-       { XFS_TRANS_REMOVE,             "REMOVE" }, \
-       { XFS_TRANS_LINK,               "LINK" }, \
-       { XFS_TRANS_RENAME,             "RENAME" }, \
-       { XFS_TRANS_MKDIR,              "MKDIR" }, \
-       { XFS_TRANS_RMDIR,              "RMDIR" }, \
-       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-       { XFS_TRANS_GROWFS,             "GROWFS" }, \
-       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-       { XFS_TRANS_WRITEID,            "WRITEID" }, \
-       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
-       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
-       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
-       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-
-/*
- * This structure is used to track log items associated with
- * a transaction.  It points to the log item and keeps some
- * flags to track the state of the log item.  It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
-       struct xfs_log_item     *lid_item;
-       struct list_head        lid_trans;
-       unsigned char           lid_flags;
-};
-
-#define XFS_LID_DIRTY          0x1
-
-#define        XFS_TRANS_MAGIC         0x5452414E      /* 'TRAN' */
-/*
- * Values for t_flags.
- */
-#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
-#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
-#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
-#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
-                                          count in superblock */
-
-/*
- * Values for call flags parameter.
- */
-#define        XFS_TRANS_RELEASE_LOG_RES       0x4
-#define        XFS_TRANS_ABORT                 0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define        XFS_TRANS_SB_ICOUNT             0x00000001
-#define        XFS_TRANS_SB_IFREE              0x00000002
-#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
-#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
-#define        XFS_TRANS_SB_FREXTENTS          0x00000010
-#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
-#define        XFS_TRANS_SB_DBLOCKS            0x00000040
-#define        XFS_TRANS_SB_AGCOUNT            0x00000080
-#define        XFS_TRANS_SB_IMAXPCT            0x00000100
-#define        XFS_TRANS_SB_REXTSIZE           0x00000200
-#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
-#define        XFS_TRANS_SB_RBLOCKS            0x00000800
-#define        XFS_TRANS_SB_REXTENTS           0x00001000
-#define        XFS_TRANS_SB_REXTSLOG           0x00002000
-
-
-/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1)
- */
-#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
-
-/*
- * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block)
- * bmap btree: (levels + 2) * max depth
- * v2 directory blocks can be fragmented below the dirblksize down to the fsb
- * size, so account for that in the DAENTER macros.
- */
-#define        XFS_DIROP_LOG_COUNT(mp) \
-       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
-        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-
+#include "xfs_trans_resv.h"
  
-#define        XFS_WRITE_LOG_RES(mp)   ((mp)->m_reservations.tr_write)
-#define        XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
-#define        XFS_RENAME_LOG_RES(mp)  ((mp)->m_reservations.tr_rename)
-#define        XFS_LINK_LOG_RES(mp)    ((mp)->m_reservations.tr_link)
-#define        XFS_REMOVE_LOG_RES(mp)  ((mp)->m_reservations.tr_remove)
-#define        XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-#define        XFS_CREATE_LOG_RES(mp)  ((mp)->m_reservations.tr_create)
-#define        XFS_MKDIR_LOG_RES(mp)   ((mp)->m_reservations.tr_mkdir)
-#define        XFS_IFREE_LOG_RES(mp)   ((mp)->m_reservations.tr_ifree)
-#define        XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-#define        XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
-#define        XFS_GROWRTALLOC_LOG_RES(mp)     ((mp)->m_reservations.tr_growrtalloc)
-#define        XFS_GROWRTZERO_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtzero)
-#define        XFS_GROWRTFREE_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtfree)
-#define        XFS_SWRITE_LOG_RES(mp)  ((mp)->m_reservations.tr_swrite)
-/*
- * Logging the inode timestamps on an fsync -- same as SWRITE
- * as long as SWRITE logs the entire inode core
- */
-#define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-#define        XFS_WRITEID_LOG_RES(mp)         ((mp)->m_reservations.tr_swrite)
-#define        XFS_ADDAFORK_LOG_RES(mp)        ((mp)->m_reservations.tr_addafork)
-#define        XFS_ATTRINVAL_LOG_RES(mp)       ((mp)->m_reservations.tr_attrinval)
-#define        XFS_ATTRSETM_LOG_RES(mp)        ((mp)->m_reservations.tr_attrsetm)
-#define XFS_ATTRSETRT_LOG_RES(mp)      ((mp)->m_reservations.tr_attrsetrt)
-#define        XFS_ATTRRM_LOG_RES(mp)          ((mp)->m_reservations.tr_attrrm)
-#define        XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
-#define XFS_QM_SBCHANGE_LOG_RES(mp)    ((mp)->m_reservations.tr_qm_sbchange)
-#define XFS_QM_SETQLIM_LOG_RES(mp)     ((mp)->m_reservations.tr_qm_setqlim)
-#define XFS_QM_DQALLOC_LOG_RES(mp)     ((mp)->m_reservations.tr_qm_dqalloc)
-#define XFS_QM_QUOTAOFF_LOG_RES(mp)    ((mp)->m_reservations.tr_qm_quotaoff)
-#define XFS_QM_QUOTAOFF_END_LOG_RES(mp)        ((mp)->m_reservations.tr_qm_equotaoff)
-#define XFS_SB_LOG_RES(mp)             ((mp)->m_reservations.tr_sb)
-
-/*
- * Various log count values.
- */
-#define        XFS_DEFAULT_LOG_COUNT           1
-#define        XFS_DEFAULT_PERM_LOG_COUNT      2
-#define        XFS_ITRUNCATE_LOG_COUNT         2
-#define XFS_INACTIVE_LOG_COUNT         2
-#define        XFS_CREATE_LOG_COUNT            2
-#define        XFS_MKDIR_LOG_COUNT             3
-#define        XFS_SYMLINK_LOG_COUNT           3
-#define        XFS_REMOVE_LOG_COUNT            2
-#define        XFS_LINK_LOG_COUNT              2
-#define        XFS_RENAME_LOG_COUNT            2
-#define        XFS_WRITE_LOG_COUNT             2
-#define        XFS_ADDAFORK_LOG_COUNT          2
-#define        XFS_ATTRINVAL_LOG_COUNT         1
-#define        XFS_ATTRSET_LOG_COUNT           3
-#define        XFS_ATTRRM_LOG_COUNT            3
-
-/*
- * Here we centralize the specification of XFS meta-data buffer
- * reference count values.  This determine how hard the buffer
- * cache tries to hold onto the buffer.
- */
-#define        XFS_AGF_REF             4
-#define        XFS_AGI_REF             4
-#define        XFS_AGFL_REF            3
-#define        XFS_INO_BTREE_REF       3
-#define        XFS_ALLOC_BTREE_REF     2
-#define        XFS_BMAP_BTREE_REF      2
-#define        XFS_DIR_BTREE_REF       2
-#define        XFS_INO_REF             2
-#define        XFS_ATTR_BTREE_REF      1
-#define        XFS_DQUOT_REF           1
-
-#ifdef __KERNEL__
+/* kernel only transaction subsystem defines */
  
  struct xfs_buf;
  struct xfs_buftarg;
@@ -310,6 +34,7 @@ struct xfs_log_iovec;
  struct xfs_log_item_desc;
  struct xfs_mount;
  struct xfs_trans;
+struct xfs_trans_res;
  struct xfs_dquot_acct;
  struct xfs_busy_extent;
  
@@ -342,7 +67,7 @@ typedef struct xfs_log_item {
         { XFS_LI_ABORTED,       "ABORTED" }
  
  struct xfs_item_ops {
-       uint (*iop_size)(xfs_log_item_t *);
+       void (*iop_size)(xfs_log_item_t *, int *, int *);
         void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
         void (*iop_pin)(xfs_log_item_t *);
         void (*iop_unpin)(xfs_log_item_t *, int remove);
@@ -352,17 +77,8 @@ struct xfs_item_ops {
         void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
  };
  
-#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, remove)  (*(ip)->li_ops->iop_unpin)(ip, remove)
-#define IOP_PUSH(ip, list)     (*(ip)->li_ops->iop_push)(ip, list)
-#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
-
  /*
- * Return values for the IOP_PUSH() routines.
+ * Return values for the iop_push() routines.
   */
  #define XFS_ITEM_SUCCESS       0
  #define XFS_ITEM_PINNED                1
@@ -446,7 +162,7 @@ typedef struct xfs_trans {
  xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
  xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
  xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
-int            xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
                                   uint, uint);
  void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
  
@@ -528,9 +244,4 @@ void                xfs_trans_ail_destroy(struct xfs_mount *);
  extern kmem_zone_t     *xfs_trans_zone;
  extern kmem_zone_t     *xfs_log_item_desc_zone;
  
-#endif /* __KERNEL__ */
-
-void           xfs_trans_init(struct xfs_mount *);
-int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-
  #endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c

index 0eda7254305f3c5a6524a378596215c8cf861140..21c6d7ddbc06b474e102b30cb6a13c7690d1a2a5 100644 (file)
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -61,20 +61,6 @@ xfs_ail_check(
  #endif /* DEBUG */
  
  /*
- * Return a pointer to the first item in the AIL.  If the AIL is empty, then
- * return NULL.
- */
-xfs_log_item_t *
-xfs_ail_min(
-       struct xfs_ail  *ailp)
-{
-       if (list_empty(&ailp->xa_ail))
-               return NULL;
-
-       return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
-}
-
- /*
   * Return a pointer to the last item in the AIL.  If the AIL is empty, then
   * return NULL.
   */
@@ -393,11 +379,11 @@ xfsaild_push(
                 int     lock_result;
  
                 /*
-                * Note that IOP_PUSH may unlock and reacquire the AIL lock.  We
+                * Note that iop_push may unlock and reacquire the AIL lock.  We
                  * rely on the AIL cursor implementation to be able to deal with
                  * the dropped lock.
                  */
-               lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
+               lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
                 switch (lock_result) {
                 case XFS_ITEM_SUCCESS:
                         XFS_STATS_INC(xs_push_ail_success);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c

index aa5a04b844d6d530b12edd39b3f99240aff625bc..8c75b8f672702419beede8e54363ec259616039a 100644 (file)
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -505,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t        *tp,
  
  /*
   * Mark the buffer as not needing to be unlocked when the buf item's
- * IOP_UNLOCK() routine is called.  The buffer must already be locked
+ * iop_unlock() routine is called.  The buffer must already be locked
   * and associated with the given transaction.
   */
  /* ARGSUSED */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c

index 61407a847b869a6bb0faec7bf2c3279d66aaeb83..54ee3c5dee76093b6a6136a5fde759a8be309ccd 100644 (file)
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
   */
  #include "xfs.h"
  #include "xfs_fs.h"
+#include "xfs_format.h"
  #include "xfs_log.h"
  #include "xfs_trans.h"
  #include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h

index 53b7c9b0f8f7a6fa3c96a3f73298c2862eb3d107..c52def0b441cd89f2cb207e1b7ba5a9f22cc915c 100644 (file)
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,6 +25,9 @@ struct xfs_trans;
  struct xfs_ail;
  struct xfs_log_vec;
  
+
+void   xfs_trans_init(struct xfs_mount *);
+int    xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
  void   xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
  void   xfs_trans_del_item(struct xfs_log_item *);
  void   xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -83,6 +86,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
                                 struct xfs_ail_cursor *cur,
                                 struct xfs_log_item **log_items, int nr_items,
                                 xfs_lsn_t lsn) __releases(ailp->xa_lock);
+/*
+ * Return a pointer to the first item in the AIL.  If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+       struct xfs_ail  *ailp)
+{
+       return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+                                       li_ail);
+}
+
  static inline void
  xfs_trans_ail_update(
         struct xfs_ail          *ailp,
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c

new file mode 100644 (file)

index 0000000..a65a3cc
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.c
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+       return round_up(sizeof(struct xlog_op_header) +
+                       sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+       uint            nbufs,
+       uint            size)
+{
+       return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - log op headers for object
+ * - inode log format object
+ * - the entire inode contents (core + 2 forks)
+ * - two bmap btree block headers
+ */
+STATIC uint
+xfs_calc_inode_res(
+       struct xfs_mount        *mp,
+       uint                    ninodes)
+{
+       return ninodes * (sizeof(struct xlog_op_header) +
+                         sizeof(struct xfs_inode_log_format) +
+                         mp->m_sb.sb_inodesize +
+                         2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(5, 0) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                    XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+                                    mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *     of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 4) +
+                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 2) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 2) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               (uint)XFS_FSB_TO_B(mp, 1) +
+               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_create_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_icreate_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return xfs_calc_icreate_reservation(mp);
+       return __xfs_calc_create_reservation(mp);
+
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp) +
+              xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+               MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
+                   XFS_INODE_CLUSTER_SIZE(mp)) +
+               xfs_calc_buf_res(1, 0) +
+               xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+                                mp->m_in_maxlevels, 0) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *     superblock
+ *     agi and agf
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *     superblock: sector size
+ *     agf of the ag from which the extent is allocated: sector size
+ *     bmap btree for bitmap/summary inode: max depth * blocksize
+ *     bitmap/summary inode: inode size
+ *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *     one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *     superblock: sector size
+ *     bitmap inode: inode size
+ *     summary inode: inode size
+ *     one bitmap block: blocksize
+ *     summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+               xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *     inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *     inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *     the inode being converted: inode size
+ *     agf block and superblock (for block allocation)
+ *     the new block (directory sized)
+ *     bmap blocks for the new directory block
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, mp->m_dirblksize) +
+               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+       struct xfs_mount        *mp)
+{
+       return MAX((xfs_calc_inode_res(mp, 1) +
+                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                    XFS_FSB_TO_B(mp, 1))),
+                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                    XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *     the inode getting the attribute
+ *     the superblock for allocations
+ *     the agfs extents are allocated from
+ *     the attribute btree * max depth
+ *     the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ *     the superblock for allocations: sector size
+ *     the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by:
+ *     ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    (uint)XFS_FSB_TO_B(mp,
+                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ *     the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *     the write transaction log space: M_RES(mp)->tr_write.tr_logres
+ *     the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       ASSERT(M_RES(mp)->tr_write.tr_logres);
+       return M_RES(mp)->tr_write.tr_logres +
+               xfs_calc_buf_res(1,
+                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2 +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+void
+xfs_trans_resv_calc(
+       struct xfs_mount        *mp,
+       struct xfs_trans_resv   *resp)
+{
+       /*
+        * The following transactions are logged in physical format and
+        * require a permanent reservation on space.
+        */
+       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+       resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+       resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+       resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+       resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+       resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+       resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+       resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+       resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+       resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+       resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+       resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+       resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+       resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+       resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+       resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+       resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+       resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+       resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+       resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+       resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+       resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+       resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+       resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+       resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+       resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+       resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+       resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       /*
+        * The following transactions are logged in logical format with
+        * a default log count.
+        */
+       resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+       resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+       resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+       resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_equotaoff.tr_logres =
+               xfs_calc_qm_quotaoff_end_reservation(mp);
+       resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+       resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       /* The following transaction are logged in logical format */
+       resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+       resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+       resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
+       resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+       resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+       resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+       resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+       resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+       resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h

new file mode 100644 (file)

index 0000000..de7de9a
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_TRANS_RESV_H__
+#define        __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+       uint    tr_logres;      /* log space unit in bytes per log ticket */
+       int     tr_logcount;    /* number of log operations per log ticket */
+       int     tr_logflags;    /* log flags, currently only used for indicating
+                                * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+       struct xfs_trans_res    tr_write;       /* extent alloc trans */
+       struct xfs_trans_res    tr_itruncate;   /* truncate trans */
+       struct xfs_trans_res    tr_rename;      /* rename trans */
+       struct xfs_trans_res    tr_link;        /* link trans */
+       struct xfs_trans_res    tr_remove;      /* unlink trans */
+       struct xfs_trans_res    tr_symlink;     /* symlink trans */
+       struct xfs_trans_res    tr_create;      /* create trans */
+       struct xfs_trans_res    tr_mkdir;       /* mkdir trans */
+       struct xfs_trans_res    tr_ifree;       /* inode free trans */
+       struct xfs_trans_res    tr_ichange;     /* inode update trans */
+       struct xfs_trans_res    tr_growdata;    /* fs data section grow trans */
+       struct xfs_trans_res    tr_swrite;      /* sync write inode trans */
+       struct xfs_trans_res    tr_addafork;    /* add inode attr fork trans */
+       struct xfs_trans_res    tr_writeid;     /* write setuid/setgid file */
+       struct xfs_trans_res    tr_attrinval;   /* attr fork buffer
+                                                * invalidation */
+       struct xfs_trans_res    tr_attrsetm;    /* set/create an attribute at
+                                                * mount time */
+       struct xfs_trans_res    tr_attrsetrt;   /* set/create an attribute at
+                                                * runtime */
+       struct xfs_trans_res    tr_attrrm;      /* remove an attribute */
+       struct xfs_trans_res    tr_clearagi;    /* clear agi unlinked bucket */
+       struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
+       struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
+       struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
+       struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
+       struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
+       struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
+       struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
+       struct xfs_trans_res    tr_qm_equotaoff;/* end of turn quota off */
+       struct xfs_trans_res    tr_sb;          /* modify superblock */
+       struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp)      (&(mp)->m_resv)
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
+       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define        XFS_DIROP_LOG_RES(mp)   \
+       (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+        (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define        XFS_DIROP_LOG_COUNT(mp) \
+       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define        XFS_DEFAULT_LOG_COUNT           1
+#define        XFS_DEFAULT_PERM_LOG_COUNT      2
+#define        XFS_ITRUNCATE_LOG_COUNT         2
+#define XFS_INACTIVE_LOG_COUNT         2
+#define        XFS_CREATE_LOG_COUNT            2
+#define        XFS_MKDIR_LOG_COUNT             3
+#define        XFS_SYMLINK_LOG_COUNT           3
+#define        XFS_REMOVE_LOG_COUNT            2
+#define        XFS_LINK_LOG_COUNT              2
+#define        XFS_RENAME_LOG_COUNT            2
+#define        XFS_WRITE_LOG_COUNT             2
+#define        XFS_ADDAFORK_LOG_COUNT          2
+#define        XFS_ATTRINVAL_LOG_COUNT         1
+#define        XFS_ATTRSET_LOG_COUNT           3
+#define        XFS_ATTRRM_LOG_COUNT            3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h

index 61ba1cfa974c7e3e32c493c1317236e2cb397730..82bbc34d54a3b344559379a665365bb2bab9b903 100644 (file)
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -18,42 +18,7 @@
  #ifndef __XFS_TYPES_H__
  #define        __XFS_TYPES_H__
  
-#ifdef __KERNEL__
-
-/*
- * Additional type declarations for XFS
- */
-typedef signed char            __int8_t;
-typedef unsigned char          __uint8_t;
-typedef signed short int       __int16_t;
-typedef unsigned short int     __uint16_t;
-typedef signed int             __int32_t;
-typedef unsigned int           __uint32_t;
-typedef signed long long int   __int64_t;
-typedef unsigned long long int __uint64_t;
-
-typedef __uint32_t             prid_t;         /* project ID */
-typedef __uint32_t             inst_t;         /* an instruction */
-
-typedef __s64                  xfs_off_t;      /* <file offset> type */
-typedef unsigned long long     xfs_ino_t;      /* <inode> type */
-typedef __s64                  xfs_daddr_t;    /* <disk address> type */
-typedef char *                 xfs_caddr_t;    /* <core address> type */
-typedef __u32                  xfs_dev_t;
-typedef __u32                  xfs_nlink_t;
-
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
-#endif /* __KERNEL__ */
+typedef __uint32_t     prid_t;         /* project ID */
  
  typedef __uint32_t     xfs_agblock_t;  /* blockno in alloc. group */
  typedef        __uint32_t      xfs_agino_t;    /* inode # within allocation grp */
@@ -145,6 +110,12 @@ typedef __uint64_t xfs_filblks_t;  /* number of blocks in a file */
  #define XFS_MIN_SECTORSIZE     (1 << XFS_MIN_SECTORSIZE_LOG)
  #define XFS_MAX_SECTORSIZE     (1 << XFS_MAX_SECTORSIZE_LOG)
  
+/*
+ * Inode fork identifiers.
+ */
+#define        XFS_DATA_FORK   0
+#define        XFS_ATTR_FORK   1
+
  /*
   * Min numbers of data/attr fork btree root pointers.
   */
@@ -169,6 +140,23 @@ typedef enum {
  struct xfs_name {
         const unsigned char     *name;
         int                     len;
+       int                     type;
  };
  
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __uint32_t     xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define        XFS_NBBYLOG     3               /* log2(NBBY) */
+#define        XFS_WORDLOG     2               /* log2(sizeof(xfs_rtword_t)) */
+#define        XFS_NBWORDLOG   (XFS_NBBYLOG + XFS_WORDLOG)
+#define        XFS_NBWORD      (1 << XFS_NBWORDLOG)
+#define        XFS_WORDMASK    ((1 << XFS_WORDLOG) - 1)
+
+
  #endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c

deleted file mode 100644 (file)

index 0025c78..0000000
--- a/fs/xfs/xfs_utils.c
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_itable.h"
-#include "xfs_utils.h"
-
-
-/*
- * Allocates a new inode from disk and return a pointer to the
- * incore copy. This routine will internally commit the current
- * transaction and allocate a new one if the Space Manager needed
- * to do an allocation to replenish the inode free-list.
- *
- * This routine is designed to be called from xfs_create and
- * xfs_create_dir.
- *
- */
-int
-xfs_dir_ialloc(
-       xfs_trans_t     **tpp,          /* input: current transaction;
-                                          output: may be a new transaction. */
-       xfs_inode_t     *dp,            /* directory within whose allocate
-                                          the inode. */
-       umode_t         mode,
-       xfs_nlink_t     nlink,
-       xfs_dev_t       rdev,
-       prid_t          prid,           /* project id */
-       int             okalloc,        /* ok to allocate new space */
-       xfs_inode_t     **ipp,          /* pointer to inode; it will be
-                                          locked. */
-       int             *committed)
-
-{
-       xfs_trans_t     *tp;
-       xfs_trans_t     *ntp;
-       xfs_inode_t     *ip;
-       xfs_buf_t       *ialloc_context = NULL;
-       int             code;
-       uint            log_res;
-       uint            log_count;
-       void            *dqinfo;
-       uint            tflags;
-
-       tp = *tpp;
-       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-
-       /*
-        * xfs_ialloc will return a pointer to an incore inode if
-        * the Space Manager has an available inode on the free
-        * list. Otherwise, it will do an allocation and replenish
-        * the freelist.  Since we can only do one allocation per
-        * transaction without deadlocks, we will need to commit the
-        * current transaction and start a new one.  We will then
-        * need to call xfs_ialloc again to get the inode.
-        *
-        * If xfs_ialloc did an allocation to replenish the freelist,
-        * it returns the bp containing the head of the freelist as
-        * ialloc_context. We will hold a lock on it across the
-        * transaction commit so that no other process can steal
-        * the inode(s) that we've just allocated.
-        */
-       code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
-                         &ialloc_context, &ip);
-
-       /*
-        * Return an error if we were unable to allocate a new inode.
-        * This should only happen if we run out of space on disk or
-        * encounter a disk error.
-        */
-       if (code) {
-               *ipp = NULL;
-               return code;
-       }
-       if (!ialloc_context && !ip) {
-               *ipp = NULL;
-               return XFS_ERROR(ENOSPC);
-       }
-
-       /*
-        * If the AGI buffer is non-NULL, then we were unable to get an
-        * inode in one operation.  We need to commit the current
-        * transaction and call xfs_ialloc() again.  It is guaranteed
-        * to succeed the second time.
-        */
-       if (ialloc_context) {
-               /*
-                * Normally, xfs_trans_commit releases all the locks.
-                * We call bhold to hang on to the ialloc_context across
-                * the commit.  Holding this buffer prevents any other
-                * processes from doing any allocations in this
-                * allocation group.
-                */
-               xfs_trans_bhold(tp, ialloc_context);
-               /*
-                * Save the log reservation so we can use
-                * them in the next transaction.
-                */
-               log_res = xfs_trans_get_log_res(tp);
-               log_count = xfs_trans_get_log_count(tp);
-
-               /*
-                * We want the quota changes to be associated with the next
-                * transaction, NOT this one. So, detach the dqinfo from this
-                * and attach it to the next transaction.
-                */
-               dqinfo = NULL;
-               tflags = 0;
-               if (tp->t_dqinfo) {
-                       dqinfo = (void *)tp->t_dqinfo;
-                       tp->t_dqinfo = NULL;
-                       tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
-                       tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
-               }
-
-               ntp = xfs_trans_dup(tp);
-               code = xfs_trans_commit(tp, 0);
-               tp = ntp;
-               if (committed != NULL) {
-                       *committed = 1;
-               }
-               /*
-                * If we get an error during the commit processing,
-                * release the buffer that is still held and return
-                * to the caller.
-                */
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       if (dqinfo) {
-                               tp->t_dqinfo = dqinfo;
-                               xfs_trans_free_dqinfo(tp);
-                       }
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-
-               /*
-                * transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               code = xfs_trans_reserve(tp, 0, log_res, 0,
-                                        XFS_TRANS_PERM_LOG_RES, log_count);
-               /*
-                * Re-attach the quota info that we detached from prev trx.
-                */
-               if (dqinfo) {
-                       tp->t_dqinfo = dqinfo;
-                       tp->t_flags |= tflags;
-               }
-
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-               xfs_trans_bjoin(tp, ialloc_context);
-
-               /*
-                * Call ialloc again. Since we've locked out all
-                * other allocations in this allocation group,
-                * this call should always succeed.
-                */
-               code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
-                                 okalloc, &ialloc_context, &ip);
-
-               /*
-                * If we get an error at this point, return to the caller
-                * so that the current transaction can be aborted.
-                */
-               if (code) {
-                       *tpp = tp;
-                       *ipp = NULL;
-                       return code;
-               }
-               ASSERT(!ialloc_context && ip);
-
-       } else {
-               if (committed != NULL)
-                       *committed = 0;
-       }
-
-       *ipp = ip;
-       *tpp = tp;
-
-       return 0;
-}
-
-/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
- */
-int                            /* error */
-xfs_droplink(
-       xfs_trans_t *tp,
-       xfs_inode_t *ip)
-{
-       int     error;
-
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-       ASSERT (ip->i_d.di_nlink > 0);
-       ip->i_d.di_nlink--;
-       drop_nlink(VFS_I(ip));
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       error = 0;
-       if (ip->i_d.di_nlink == 0) {
-               /*
-                * We're dropping the last link to this file.
-                * Move the on-disk inode to the AGI unlinked list.
-                * From xfs_inactive() we will pull the inode from
-                * the list and free it.
-                */
-               error = xfs_iunlink(tp, ip);
-       }
-       return error;
-}
-
-/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
- */
-void
-xfs_bump_ino_vers2(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip)
-{
-       xfs_mount_t     *mp;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(ip->i_d.di_version == 1);
-
-       ip->i_d.di_version = 2;
-       ip->i_d.di_onlink = 0;
-       memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
-       mp = tp->t_mountp;
-       if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
-               spin_lock(&mp->m_sb_lock);
-               if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
-                       xfs_sb_version_addnlink(&mp->m_sb);
-                       spin_unlock(&mp->m_sb_lock);
-                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
-               } else {
-                       spin_unlock(&mp->m_sb_lock);
-               }
-       }
-       /* Caller must log the inode */
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-int
-xfs_bumplink(
-       xfs_trans_t *tp,
-       xfs_inode_t *ip)
-{
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-       ASSERT(ip->i_d.di_nlink > 0);
-       ip->i_d.di_nlink++;
-       inc_nlink(VFS_I(ip));
-       if ((ip->i_d.di_version == 1) &&
-           (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
-               /*
-                * The inode has increased its number of links beyond
-                * what can fit in an old format inode.  It now needs
-                * to be converted to a version 2 inode with a 32 bit
-                * link count.  If this is the first inode in the file
-                * system to do this, then we need to bump the superblock
-                * version number as well.
-                */
-               xfs_bump_ino_vers2(tp, ip);
-       }
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       return 0;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h

deleted file mode 100644 (file)

index 5eeab46..0000000
--- a/fs/xfs/xfs_utils.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_UTILS_H__
-#define __XFS_UTILS_H__
-
-extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
-                               xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
-extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
-
-#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c

deleted file mode 100644 (file)

index dc730ac..0000000
--- a/fs/xfs/xfs_vnodeops.c
+++ /dev/null
@@ -1,1870 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2012 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trans_space.h"
-#include "xfs_log_priv.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-#include "xfs_symlink.h"
-
-
-/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
- */
-int
-xfs_free_eofblocks(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *ip,
-       bool            need_iolock)
-{
-       xfs_trans_t     *tp;
-       int             error;
-       xfs_fileoff_t   end_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_filblks_t   map_len;
-       int             nimaps;
-       xfs_bmbt_irec_t imap;
-
-       /*
-        * Figure out if there are any blocks beyond the end
-        * of the file.  If not, then there is nothing to do.
-        */
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
-       last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-       if (last_fsb <= end_fsb)
-               return 0;
-       map_len = last_fsb - end_fsb;
-
-       nimaps = 1;
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (!error && (nimaps != 0) &&
-           (imap.br_startblock != HOLESTARTBLOCK ||
-            ip->i_delayed_blks)) {
-               /*
-                * Attach the dquots to the inode up front.
-                */
-               error = xfs_qm_dqattach(ip, 0);
-               if (error)
-                       return error;
-
-               /*
-                * There are blocks after the end of file.
-                * Free them up now by truncating the file to
-                * its current size.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
-               if (need_iolock) {
-                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp, 0);
-                               return EAGAIN;
-                       }
-               }
-
-               error = xfs_trans_reserve(tp, 0,
-                                         XFS_ITRUNCATE_LOG_RES(mp),
-                                         0, XFS_TRANS_PERM_LOG_RES,
-                                         XFS_ITRUNCATE_LOG_COUNT);
-               if (error) {
-                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       if (need_iolock)
-                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                       return error;
-               }
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * Do not update the on-disk file size.  If we update the
-                * on-disk file size and then the system crashes before the
-                * contents of the file are flushed to disk then the files
-                * may be full of holes (ie NULL files bug).
-                */
-               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
-                                             XFS_ISIZE(ip));
-               if (error) {
-                       /*
-                        * If we get an error at this point we simply don't
-                        * bother truncating the file.
-                        */
-                       xfs_trans_cancel(tp,
-                                        (XFS_TRANS_RELEASE_LOG_RES |
-                                         XFS_TRANS_ABORT));
-               } else {
-                       error = xfs_trans_commit(tp,
-                                               XFS_TRANS_RELEASE_LOG_RES);
-                       if (!error)
-                               xfs_inode_clear_eofblocks_tag(ip);
-               }
-
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (need_iolock)
-                       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       }
-       return error;
-}
-
-int
-xfs_release(
-       xfs_inode_t     *ip)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       int             error;
-
-       if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
-               return 0;
-
-       /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return 0;
-
-       if (!XFS_FORCED_SHUTDOWN(mp)) {
-               int truncated;
-
-               /*
-                * If we are using filestreams, and we have an unlinked
-                * file that we are processing the last close on, then nothing
-                * will be able to reopen and write to this file. Purge this
-                * inode from the filestreams cache so that it doesn't delay
-                * teardown of the inode.
-                */
-               if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
-                       xfs_filestream_deassociate(ip);
-
-               /*
-                * If we previously truncated this file and removed old data
-                * in the process, we want to initiate "early" writeout on
-                * the last close.  This is an attempt to combat the notorious
-                * NULL files problem which is particularly noticeable from a
-                * truncate down, buffered (re-)write (delalloc), followed by
-                * a crash.  What we are effectively doing here is
-                * significantly reducing the time window where we'd otherwise
-                * be exposed to that problem.
-                */
-               truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
-               if (truncated) {
-                       xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
-                               error = -filemap_flush(VFS_I(ip)->i_mapping);
-                               if (error)
-                                       return error;
-                       }
-               }
-       }
-
-       if (ip->i_d.di_nlink == 0)
-               return 0;
-
-       if (xfs_can_free_eofblocks(ip, false)) {
-
-               /*
-                * If we can't get the iolock just skip truncating the blocks
-                * past EOF because we could deadlock with the mmap_sem
-                * otherwise.  We'll get another chance to drop them once the
-                * last reference to the inode is dropped, so we'll never leak
-                * blocks permanently.
-                *
-                * Further, check if the inode is being opened, written and
-                * closed frequently and we have delayed allocation blocks
-                * outstanding (e.g. streaming writes from the NFS server),
-                * truncating the blocks past EOF will cause fragmentation to
-                * occur.
-                *
-                * In this case don't do the truncation, either, but we have to
-                * be careful how we detect this case. Blocks beyond EOF show
-                * up as i_delayed_blks even when the inode is clean, so we
-                * need to truncate them away first before checking for a dirty
-                * release. Hence on the first dirty close we will still remove
-                * the speculative allocation, but after that we will leave it
-                * in place.
-                */
-               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
-                       return 0;
-
-               error = xfs_free_eofblocks(mp, ip, true);
-               if (error && error != EAGAIN)
-                       return error;
-
-               /* delalloc blocks after truncation means it really is dirty */
-               if (ip->i_delayed_blks)
-                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
-       }
-       return 0;
-}
-
-/*
- * xfs_inactive
- *
- * This is called when the vnode reference count for the vnode
- * goes to zero.  If the file has been unlinked, then it must
- * now be truncated.  Also, we clear all of the read-ahead state
- * kept for the inode here since the file is now closed.
- */
-int
-xfs_inactive(
-       xfs_inode_t     *ip)
-{
-       xfs_bmap_free_t free_list;
-       xfs_fsblock_t   first_block;
-       int             committed;
-       xfs_trans_t     *tp;
-       xfs_mount_t     *mp;
-       int             error;
-       int             truncate = 0;
-
-       /*
-        * If the inode is already free, then there can be nothing
-        * to clean up here.
-        */
-       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
-               ASSERT(ip->i_df.if_real_bytes == 0);
-               ASSERT(ip->i_df.if_broot_bytes == 0);
-               return VN_INACTIVE_CACHE;
-       }
-
-       mp = ip->i_mount;
-
-       error = 0;
-
-       /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               goto out;
-
-       if (ip->i_d.di_nlink != 0) {
-               /*
-                * force is true because we are evicting an inode from the
-                * cache. Post-eof blocks must be freed, lest we end up with
-                * broken free space accounting.
-                */
-               if (xfs_can_free_eofblocks(ip, true)) {
-                       error = xfs_free_eofblocks(mp, ip, false);
-                       if (error)
-                               return VN_INACTIVE_CACHE;
-               }
-               goto out;
-       }
-
-       if (S_ISREG(ip->i_d.di_mode) &&
-           (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
-            ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
-               truncate = 1;
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return VN_INACTIVE_CACHE;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, 0,
-                       (truncate || S_ISLNK(ip->i_d.di_mode)) ?
-                               XFS_ITRUNCATE_LOG_RES(mp) :
-                               XFS_IFREE_LOG_RES(mp),
-                       0,
-                       XFS_TRANS_PERM_LOG_RES,
-                       XFS_ITRUNCATE_LOG_COUNT);
-       if (error) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp, 0);
-               return VN_INACTIVE_CACHE;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, 0);
-
-       if (S_ISLNK(ip->i_d.di_mode)) {
-               error = xfs_inactive_symlink(ip, &tp);
-               if (error)
-                       goto out_cancel;
-       } else if (truncate) {
-               ip->i_d.di_size = 0;
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
-               if (error)
-                       goto out_cancel;
-
-               ASSERT(ip->i_d.di_nextents == 0);
-       }
-
-       /*
-        * If there are attributes associated with the file then blow them away
-        * now.  The code calls a routine that recursively deconstructs the
-        * attribute fork.  We need to just commit the current transaction
-        * because we can't use it for xfs_attr_inactive().
-        */
-       if (ip->i_d.di_anextents > 0) {
-               ASSERT(ip->i_d.di_forkoff != 0);
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               if (error)
-                       goto out_unlock;
-
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-               error = xfs_attr_inactive(ip);
-               if (error)
-                       goto out;
-
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-               error = xfs_trans_reserve(tp, 0,
-                                         XFS_IFREE_LOG_RES(mp),
-                                         0, XFS_TRANS_PERM_LOG_RES,
-                                         XFS_INACTIVE_LOG_COUNT);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       goto out;
-               }
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, 0);
-       }
-
-       if (ip->i_afp)
-               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-       ASSERT(ip->i_d.di_anextents == 0);
-
-       /*
-        * Free the inode.
-        */
-       xfs_bmap_init(&free_list, &first_block);
-       error = xfs_ifree(tp, ip, &free_list);
-       if (error) {
-               /*
-                * If we fail to free the inode, shut down.  The cancel
-                * might do that, we need to make sure.  Otherwise the
-                * inode might be lost for a long time or forever.
-                */
-               if (!XFS_FORCED_SHUTDOWN(mp)) {
-                       xfs_notice(mp, "%s: xfs_ifree returned error %d",
-                               __func__, error);
-                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-               }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       } else {
-               /*
-                * Credit the quota account(s). The inode is gone.
-                */
-               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
-               /*
-                * Just ignore errors at this point.  There is nothing we can
-                * do except to try to keep going. Make sure it's not a silent
-                * error.
-                */
-               error = xfs_bmap_finish(&tp,  &free_list, &committed);
-               if (error)
-                       xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
-                               __func__, error);
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               if (error)
-                       xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
-                               __func__, error);
-       }
-
-       /*
-        * Release the dquots held by inode, if any.
-        */
-       xfs_qm_dqdetach(ip);
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
-       return VN_INACTIVE_CACHE;
-out_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       goto out_unlock;
-}
-
-/*
- * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
- * is allowed, otherwise it has to be an exact match. If a CI match is found,
- * ci_name->name will point to a the actual name (caller must free) or
- * will be set to NULL if an exact match is found.
- */
-int
-xfs_lookup(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       xfs_inode_t             **ipp,
-       struct xfs_name         *ci_name)
-{
-       xfs_ino_t               inum;
-       int                     error;
-       uint                    lock_mode;
-
-       trace_xfs_lookup(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
-
-       lock_mode = xfs_ilock_map_shared(dp);
-       error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
-       xfs_iunlock_map_shared(dp, lock_mode);
-
-       if (error)
-               goto out;
-
-       error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
-       if (error)
-               goto out_free_name;
-
-       return 0;
-
-out_free_name:
-       if (ci_name)
-               kmem_free(ci_name->name);
-out:
-       *ipp = NULL;
-       return error;
-}
-
-int
-xfs_create(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       umode_t                 mode,
-       xfs_dev_t               rdev,
-       xfs_inode_t             **ipp)
-{
-       int                     is_dir = S_ISDIR(mode);
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_inode        *ip = NULL;
-       struct xfs_trans        *tp = NULL;
-       int                     error;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
-       int                     committed;
-       prid_t                  prid;
-       struct xfs_dquot        *udqp = NULL;
-       struct xfs_dquot        *gdqp = NULL;
-       struct xfs_dquot        *pdqp = NULL;
-       uint                    resblks;
-       uint                    log_res;
-       uint                    log_count;
-
-       trace_xfs_create(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-               prid = xfs_get_projid(dp);
-       else
-               prid = XFS_PROJID_DEFAULT;
-
-       /*
-        * Make sure that we have allocated dquot(s) on disk.
-        */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-                                       &udqp, &gdqp, &pdqp);
-       if (error)
-               return error;
-
-       if (is_dir) {
-               rdev = 0;
-               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
-               log_res = XFS_MKDIR_LOG_RES(mp);
-               log_count = XFS_MKDIR_LOG_COUNT;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-       } else {
-               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
-               log_res = XFS_CREATE_LOG_RES(mp);
-               log_count = XFS_CREATE_LOG_COUNT;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
-       }
-
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-       /*
-        * Initially assume that the file does not exist and
-        * reserve the resources for that case.  If that is not
-        * the case we'll drop the one we have and get a more
-        * appropriate transaction later.
-        */
-       error = xfs_trans_reserve(tp, resblks, log_res, 0,
-                       XFS_TRANS_PERM_LOG_RES, log_count);
-       if (error == ENOSPC) {
-               /* flush outstanding delalloc blocks and retry */
-               xfs_flush_inodes(mp);
-               error = xfs_trans_reserve(tp, resblks, log_res, 0,
-                               XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error == ENOSPC) {
-               /* No space at all so try a "no-allocation" reservation */
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, log_res, 0,
-                               XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error) {
-               cancel_flags = 0;
-               goto out_trans_cancel;
-       }
-
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-       unlock_dp_on_error = true;
-
-       xfs_bmap_init(&free_list, &first_block);
-
-       /*
-        * Reserve disk quota and the inode.
-        */
-       error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
-                                               pdqp, resblks, 1, 0);
-       if (error)
-               goto out_trans_cancel;
-
-       error = xfs_dir_canenter(tp, dp, name, resblks);
-       if (error)
-               goto out_trans_cancel;
-
-       /*
-        * A newly created regular or special file just has one directory
-        * entry pointing to them, but a directory also the "." entry
-        * pointing to itself.
-        */
-       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
-                              prid, resblks > 0, &ip, &committed);
-       if (error) {
-               if (error == ENOSPC)
-                       goto out_trans_cancel;
-               goto out_trans_abort;
-       }
-
-       /*
-        * Now we join the directory inode to the transaction.  We do not do it
-        * earlier because xfs_dir_ialloc might commit the previous transaction
-        * (and release all the locks).  An error from here on will result in
-        * the transaction cancel unlocking dp so don't do it explicitly in the
-        * error path.
-        */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-       unlock_dp_on_error = false;
-
-       error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks ?
-                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-       if (error) {
-               ASSERT(error != ENOSPC);
-               goto out_trans_abort;
-       }
-       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
-       if (is_dir) {
-               error = xfs_dir_init(tp, ip, dp);
-               if (error)
-                       goto out_bmap_cancel;
-
-               error = xfs_bumplink(tp, dp);
-               if (error)
-                       goto out_bmap_cancel;
-       }
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * create transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-               xfs_trans_set_sync(tp);
-
-       /*
-        * Attach the dquot(s) to the inodes and modify them incore.
-        * These ids of the inode couldn't have changed since the new
-        * inode has been locked ever since it was created.
-        */
-       xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error)
-               goto out_bmap_cancel;
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       if (error)
-               goto out_release_inode;
-
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       xfs_qm_dqrele(pdqp);
-
-       *ipp = ip;
-       return 0;
-
- out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
- out_release_inode:
-       /*
-        * Wait until after the current transaction is aborted to
-        * release the inode.  This prevents recursive transactions
-        * and deadlocks from xfs_inactive.
-        */
-       if (ip)
-               IRELE(ip);
-
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       xfs_qm_dqrele(pdqp);
-
-       if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return error;
-}
-
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
-/*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
- */
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
-{
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
-       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
-
-       return lock_mode;
-}
-
-/*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
- *
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
- */
-void
-xfs_lock_inodes(
-       xfs_inode_t     **ips,
-       int             inodes,
-       uint            lock_mode)
-{
-       int             attempts = 0, i, j, try_lock;
-       xfs_log_item_t  *lp;
-
-       ASSERT(ips && (inodes >= 2)); /* we need at least two */
-
-       try_lock = 0;
-       i = 0;
-
-again:
-       for (; i < inodes; i++) {
-               ASSERT(ips[i]);
-
-               if (i && (ips[i] == ips[i-1]))  /* Already locked */
-                       continue;
-
-               /*
-                * If try_lock is not set yet, make sure all locked inodes
-                * are not in the AIL.
-                * If any are, set try_lock to be used later.
-                */
-
-               if (!try_lock) {
-                       for (j = (i - 1); j >= 0 && !try_lock; j--) {
-                               lp = (xfs_log_item_t *)ips[j]->i_itemp;
-                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-                                       try_lock++;
-                               }
-                       }
-               }
-
-               /*
-                * If any of the previous locks we have locked is in the AIL,
-                * we must TRY to get the second and subsequent locks. If
-                * we can't get any, we must release all we have
-                * and try again.
-                */
-
-               if (try_lock) {
-                       /* try_lock must be 0 if i is 0. */
-                       /*
-                        * try_lock means we have an inode locked
-                        * that is in the AIL.
-                        */
-                       ASSERT(i != 0);
-                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
-                               attempts++;
-
-                               /*
-                                * Unlock all previous guys and try again.
-                                * xfs_iunlock will try to push the tail
-                                * if the inode is in the AIL.
-                                */
-
-                               for(j = i - 1; j >= 0; j--) {
-
-                                       /*
-                                        * Check to see if we've already
-                                        * unlocked this one.
-                                        * Not the first one going back,
-                                        * and the inode ptr is the same.
-                                        */
-                                       if ((j != (i - 1)) && ips[j] ==
-                                                               ips[j+1])
-                                               continue;
-
-                                       xfs_iunlock(ips[j], lock_mode);
-                               }
-
-                               if ((attempts % 5) == 0) {
-                                       delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
-                                       xfs_lock_delays++;
-#endif
-                               }
-                               i = 0;
-                               try_lock = 0;
-                               goto again;
-                       }
-               } else {
-                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
-               }
-       }
-
-#ifdef DEBUG
-       if (attempts) {
-               if (attempts < 5) xfs_small_retries++;
-               else if (attempts < 100) xfs_middle_retries++;
-               else xfs_lots_retries++;
-       } else {
-               xfs_locked_n++;
-       }
-#endif
-}
-
-/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
- */
-void
-xfs_lock_two_inodes(
-       xfs_inode_t             *ip0,
-       xfs_inode_t             *ip1,
-       uint                    lock_mode)
-{
-       xfs_inode_t             *temp;
-       int                     attempts = 0;
-       xfs_log_item_t          *lp;
-
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
-       ASSERT(ip0->i_ino != ip1->i_ino);
-
-       if (ip0->i_ino > ip1->i_ino) {
-               temp = ip0;
-               ip0 = ip1;
-               ip1 = temp;
-       }
-
- again:
-       xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
-
-       /*
-        * If the first lock we have locked is in the AIL, we must TRY to get
-        * the second lock. If we can't get it, we must release the first one
-        * and try again.
-        */
-       lp = (xfs_log_item_t *)ip0->i_itemp;
-       if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-               if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
-                       xfs_iunlock(ip0, lock_mode);
-                       if ((++attempts % 5) == 0)
-                               delay(1); /* Don't just spin the CPU */
-                       goto again;
-               }
-       } else {
-               xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
-       }
-}
-
-int
-xfs_remove(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       xfs_inode_t             *ip)
-{
-       xfs_mount_t             *mp = dp->i_mount;
-       xfs_trans_t             *tp = NULL;
-       int                     is_dir = S_ISDIR(ip->i_d.di_mode);
-       int                     error = 0;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       int                     cancel_flags;
-       int                     committed;
-       int                     link_zero;
-       uint                    resblks;
-       uint                    log_count;
-
-       trace_xfs_remove(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(dp, 0);
-       if (error)
-               goto std_return;
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               goto std_return;
-
-       if (is_dir) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-               log_count = XFS_DEFAULT_LOG_COUNT;
-       } else {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-               log_count = XFS_REMOVE_LOG_COUNT;
-       }
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-       /*
-        * We try to get the real space reservation first,
-        * allowing for directory btree deletion(s) implying
-        * possible bmap insert(s).  If we can't get the space
-        * reservation then we use 0 instead, and avoid the bmap
-        * btree insert(s) in the directory code by, if the bmap
-        * insert tries to happen, instead trimming the LAST
-        * block from the directory.
-        */
-       resblks = XFS_REMOVE_SPACE_RES(mp);
-       error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES, log_count);
-       if (error == ENOSPC) {
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
-                                         XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error) {
-               ASSERT(error != ENOSPC);
-               cancel_flags = 0;
-               goto out_trans_cancel;
-       }
-
-       xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       /*
-        * If we're removing a directory perform some additional validation.
-        */
-       if (is_dir) {
-               ASSERT(ip->i_d.di_nlink >= 2);
-               if (ip->i_d.di_nlink != 2) {
-                       error = XFS_ERROR(ENOTEMPTY);
-                       goto out_trans_cancel;
-               }
-               if (!xfs_dir_isempty(ip)) {
-                       error = XFS_ERROR(ENOTEMPTY);
-                       goto out_trans_cancel;
-               }
-       }
-
-       xfs_bmap_init(&free_list, &first_block);
-       error = xfs_dir_removename(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks);
-       if (error) {
-               ASSERT(error != ENOENT);
-               goto out_bmap_cancel;
-       }
-       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-       if (is_dir) {
-               /*
-                * Drop the link from ip's "..".
-                */
-               error = xfs_droplink(tp, dp);
-               if (error)
-                       goto out_bmap_cancel;
-
-               /*
-                * Drop the "." link from ip to self.
-                */
-               error = xfs_droplink(tp, ip);
-               if (error)
-                       goto out_bmap_cancel;
-       } else {
-               /*
-                * When removing a non-directory we need to log the parent
-                * inode here.  For a directory this is done implicitly
-                * by the xfs_droplink call for the ".." entry.
-                */
-               xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-       }
-
-       /*
-        * Drop the link from dp to ip.
-        */
-       error = xfs_droplink(tp, ip);
-       if (error)
-               goto out_bmap_cancel;
-
-       /*
-        * Determine if this is the last link while
-        * we are in the transaction.
-        */
-       link_zero = (ip->i_d.di_nlink == 0);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * remove transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-               xfs_trans_set_sync(tp);
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error)
-               goto out_bmap_cancel;
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       if (error)
-               goto std_return;
-
-       /*
-        * If we are using filestreams, kill the stream association.
-        * If the file is still open it may get a new one but that
-        * will get killed on last close in xfs_close() so we don't
-        * have to worry about that.
-        */
-       if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
-               xfs_filestream_deassociate(ip);
-
-       return 0;
-
- out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
-       cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
-
-int
-xfs_link(
-       xfs_inode_t             *tdp,
-       xfs_inode_t             *sip,
-       struct xfs_name         *target_name)
-{
-       xfs_mount_t             *mp = tdp->i_mount;
-       xfs_trans_t             *tp;
-       int                     error;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       int                     cancel_flags;
-       int                     committed;
-       int                     resblks;
-
-       trace_xfs_link(tdp, target_name);
-
-       ASSERT(!S_ISDIR(sip->i_d.di_mode));
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(sip, 0);
-       if (error)
-               goto std_return;
-
-       error = xfs_qm_dqattach(tdp, 0);
-       if (error)
-               goto std_return;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-       if (error == ENOSPC) {
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-       }
-       if (error) {
-               cancel_flags = 0;
-               goto error_return;
-       }
-
-       xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-
-       xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
-       /*
-        * If we are using project inheritance, we only allow hard link
-        * creation in our tree when the project IDs are the same; else
-        * the tree quota mechanism could be circumvented.
-        */
-       if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                    (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
-               error = XFS_ERROR(EXDEV);
-               goto error_return;
-       }
-
-       error = xfs_dir_canenter(tp, tdp, target_name, resblks);
-       if (error)
-               goto error_return;
-
-       xfs_bmap_init(&free_list, &first_block);
-
-       error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-                                       &first_block, &free_list, resblks);
-       if (error)
-               goto abort_return;
-       xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
-       error = xfs_bumplink(tp, sip);
-       if (error)
-               goto abort_return;
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * link transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
-
-       error = xfs_bmap_finish (&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               goto abort_return;
-       }
-
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
-
-int
-xfs_set_dmattrs(
-       xfs_inode_t     *ip,
-       u_int           evmask,
-       u_int16_t       state)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_trans_t     *tp;
-       int             error;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       ip->i_d.di_dmevmask = evmask;
-       ip->i_d.di_dmstate  = state;
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       error = xfs_trans_commit(tp, 0);
-
-       return error;
-}
-
-/*
- * xfs_alloc_file_space()
- *      This routine allocates disk space for the given file.
- *
- *     If alloc_type == 0, this request is for an ALLOCSP type
- *     request which will change the file size.  In this case, no
- *     DMAPI event will be generated by the call.  A TRUNCATE event
- *     will be generated later by xfs_setattr.
- *
- *     If alloc_type != 0, this request is for a RESVSP type
- *     request, and a DMAPI DM_EVENT_WRITE will be generated if the
- *     lower block boundary byte address is less than the file's
- *     length.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
-xfs_alloc_file_space(
-       xfs_inode_t             *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     alloc_type,
-       int                     attr_flags)
-{
-       xfs_mount_t             *mp = ip->i_mount;
-       xfs_off_t               count;
-       xfs_filblks_t           allocated_fsb;
-       xfs_filblks_t           allocatesize_fsb;
-       xfs_extlen_t            extsz, temp;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_fsblock_t           firstfsb;
-       int                     nimaps;
-       int                     quota_flag;
-       int                     rt;
-       xfs_trans_t             *tp;
-       xfs_bmbt_irec_t         imaps[1], *imapp;
-       xfs_bmap_free_t         free_list;
-       uint                    qblocks, resblks, resrtextents;
-       int                     committed;
-       int                     error;
-
-       trace_xfs_alloc_file_space(ip);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return error;
-
-       if (len <= 0)
-               return XFS_ERROR(EINVAL);
-
-       rt = XFS_IS_REALTIME_INODE(ip);
-       extsz = xfs_get_extsz_hint(ip);
-
-       count = len;
-       imapp = &imaps[0];
-       nimaps = 1;
-       startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
-       allocatesize_fsb = XFS_B_TO_FSB(mp, count);
-
-       /*
-        * Allocate file space until done or until there is an error
-        */
-       while (allocatesize_fsb && !error) {
-               xfs_fileoff_t   s, e;
-
-               /*
-                * Determine space reservations for data/realtime.
-                */
-               if (unlikely(extsz)) {
-                       s = startoffset_fsb;
-                       do_div(s, extsz);
-                       s *= extsz;
-                       e = startoffset_fsb + allocatesize_fsb;
-                       if ((temp = do_mod(startoffset_fsb, extsz)))
-                               e += temp;
-                       if ((temp = do_mod(e, extsz)))
-                               e += extsz - temp;
-               } else {
-                       s = 0;
-                       e = allocatesize_fsb;
-               }
-
-               /*
-                * The transaction reservation is limited to a 32-bit block
-                * count, hence we need to limit the number of blocks we are
-                * trying to reserve to avoid an overflow. We can't allocate
-                * more than @nimaps extents, and an extent is limited on disk
-                * to MAXEXTLEN (21 bits), so use that to enforce the limit.
-                */
-               resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
-               if (unlikely(rt)) {
-                       resrtextents = qblocks = resblks;
-                       resrtextents /= mp->m_sb.sb_rextsize;
-                       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-                       quota_flag = XFS_QMOPT_RES_RTBLKS;
-               } else {
-                       resrtextents = 0;
-                       resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
-                       quota_flag = XFS_QMOPT_RES_REGBLKS;
-               }
-
-               /*
-                * Allocate and setup the transaction.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, resblks,
-                                         XFS_WRITE_LOG_RES(mp), resrtextents,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
-               /*
-                * Check for running out of space
-                */
-               if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       break;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
-                                                     0, quota_flag);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-                                       allocatesize_fsb, alloc_type, &firstfsb,
-                                       0, imapp, &nimaps, &free_list);
-               if (error) {
-                       goto error0;
-               }
-
-               /*
-                * Complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, &committed);
-               if (error) {
-                       goto error0;
-               }
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error) {
-                       break;
-               }
-
-               allocated_fsb = imapp->br_blockcount;
-
-               if (nimaps == 0) {
-                       error = XFS_ERROR(ENOSPC);
-                       break;
-               }
-
-               startoffset_fsb += allocated_fsb;
-               allocatesize_fsb -= allocated_fsb;
-       }
-
-       return error;
-
-error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1:        /* Just cancel transaction */
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-       xfs_inode_t             *ip,
-       xfs_off_t               startoff,
-       xfs_off_t               endoff)
-{
-       xfs_bmbt_irec_t         imap;
-       xfs_fileoff_t           offset_fsb;
-       xfs_off_t               lastoffset;
-       xfs_off_t               offset;
-       xfs_buf_t               *bp;
-       xfs_mount_t             *mp = ip->i_mount;
-       int                     nimap;
-       int                     error = 0;
-
-       /*
-        * Avoid doing I/O beyond eof - it's not necessary
-        * since nothing can read beyond eof.  The space will
-        * be zeroed when the file is extended anyway.
-        */
-       if (startoff >= XFS_ISIZE(ip))
-               return 0;
-
-       if (endoff > XFS_ISIZE(ip))
-               endoff = XFS_ISIZE(ip);
-
-       bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-                                       mp->m_rtdev_targp : mp->m_ddev_targp,
-                                 BTOBB(mp->m_sb.sb_blocksize), 0);
-       if (!bp)
-               return XFS_ERROR(ENOMEM);
-
-       xfs_buf_unlock(bp);
-
-       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-               offset_fsb = XFS_B_TO_FSBT(mp, offset);
-               nimap = 1;
-               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-               if (error || nimap < 1)
-                       break;
-               ASSERT(imap.br_blockcount >= 1);
-               ASSERT(imap.br_startoff == offset_fsb);
-               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-               if (lastoffset > endoff)
-                       lastoffset = endoff;
-               if (imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
-                       continue;
-               XFS_BUF_UNDONE(bp);
-               XFS_BUF_UNWRITE(bp);
-               XFS_BUF_READ(bp);
-               XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-               xfsbdstrat(mp, bp);
-               error = xfs_buf_iowait(bp);
-               if (error) {
-                       xfs_buf_ioerror_alert(bp,
-                                       "xfs_zero_remaining_bytes(read)");
-                       break;
-               }
-               memset(bp->b_addr +
-                       (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-                     0, lastoffset - offset + 1);
-               XFS_BUF_UNDONE(bp);
-               XFS_BUF_UNREAD(bp);
-               XFS_BUF_WRITE(bp);
-               xfsbdstrat(mp, bp);
-               error = xfs_buf_iowait(bp);
-               if (error) {
-                       xfs_buf_ioerror_alert(bp,
-                                       "xfs_zero_remaining_bytes(write)");
-                       break;
-               }
-       }
-       xfs_buf_free(bp);
-       return error;
-}
-
-/*
- * xfs_free_file_space()
- *      This routine frees disk space for the given file.
- *
- *     This routine is only called by xfs_change_file_space
- *     for an UNRESVSP type call.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
-xfs_free_file_space(
-       xfs_inode_t             *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     attr_flags)
-{
-       int                     committed;
-       int                     done;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     error;
-       xfs_fsblock_t           firstfsb;
-       xfs_bmap_free_t         free_list;
-       xfs_bmbt_irec_t         imap;
-       xfs_off_t               ioffset;
-       xfs_extlen_t            mod=0;
-       xfs_mount_t             *mp;
-       int                     nimap;
-       uint                    resblks;
-       xfs_off_t               rounding;
-       int                     rt;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_trans_t             *tp;
-       int                     need_iolock = 1;
-
-       mp = ip->i_mount;
-
-       trace_xfs_free_file_space(ip);
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return error;
-
-       error = 0;
-       if (len <= 0)   /* if nothing being freed */
-               return error;
-       rt = XFS_IS_REALTIME_INODE(ip);
-       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
-       if (attr_flags & XFS_ATTR_NOLOCK)
-               need_iolock = 0;
-       if (need_iolock) {
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-               /* wait for the completion of any pending DIOs */
-               inode_dio_wait(VFS_I(ip));
-       }
-
-       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-       ioffset = offset & ~(rounding - 1);
-       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                             ioffset, -1);
-       if (error)
-               goto out_unlock_iolock;
-       truncate_pagecache_range(VFS_I(ip), ioffset, -1);
-
-       /*
-        * Need to zero the stuff we're not freeing, on disk.
-        * If it's a realtime file & can't use unwritten extents then we
-        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-        * will take care of it for us.
-        */
-       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-               nimap = 1;
-               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out_unlock_iolock;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       xfs_daddr_t     block;
-
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       block = imap.br_startblock;
-                       mod = do_div(block, mp->m_sb.sb_rextsize);
-                       if (mod)
-                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-               }
-               nimap = 1;
-               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out_unlock_iolock;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       mod++;
-                       if (mod && (mod != mp->m_sb.sb_rextsize))
-                               endoffset_fsb -= mod;
-               }
-       }
-       if ((done = (endoffset_fsb <= startoffset_fsb)))
-               /*
-                * One contiguous piece to clear
-                */
-               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-       else {
-               /*
-                * Some full blocks, possibly two pieces to clear
-                */
-               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-                       error = xfs_zero_remaining_bytes(ip, offset,
-                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-               if (!error &&
-                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-                       error = xfs_zero_remaining_bytes(ip,
-                               XFS_FSB_TO_B(mp, endoffset_fsb),
-                               offset + len - 1);
-       }
-
-       /*
-        * free file space until done or until there is an error
-        */
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-       while (!error && !done) {
-
-               /*
-                * allocate and setup the transaction. Allow this
-                * transaction to dip into the reserve blocks to ensure
-                * the freeing of the space succeeds at ENOSPC.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               tp->t_flags |= XFS_TRANS_RESERVE;
-               error = xfs_trans_reserve(tp,
-                                         resblks,
-                                         XFS_WRITE_LOG_RES(mp),
-                                         0,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
-
-               /*
-                * check for running out of space
-                */
-               if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       break;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota(tp, mp,
-                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * issue the bunmapi() call to free the blocks
-                */
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bunmapi(tp, ip, startoffset_fsb,
-                                 endoffset_fsb - startoffset_fsb,
-                                 0, 2, &firstfsb, &free_list, &done);
-               if (error) {
-                       goto error0;
-               }
-
-               /*
-                * complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, &committed);
-               if (error) {
-                       goto error0;
-               }
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-
- out_unlock_iolock:
-       if (need_iolock)
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return error;
-
- error0:
-       xfs_bmap_cancel(&free_list);
- error1:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
-                   XFS_ILOCK_EXCL);
-       return error;
-}
-
-
-STATIC int
-xfs_zero_file_space(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     attr_flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       uint                    granularity;
-       xfs_off_t               start_boundary;
-       xfs_off_t               end_boundary;
-       int                     error;
-
-       granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-
-       /*
-        * Round the range of extents we are going to convert inwards.  If the
-        * offset is aligned, then it doesn't get changed so we zero from the
-        * start of the block offset points to.
-        */
-       start_boundary = round_up(offset, granularity);
-       end_boundary = round_down(offset + len, granularity);
-
-       ASSERT(start_boundary >= offset);
-       ASSERT(end_boundary <= offset + len);
-
-       if (!(attr_flags & XFS_ATTR_NOLOCK))
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-       if (start_boundary < end_boundary - 1) {
-               /* punch out the page cache over the conversion range */
-               truncate_pagecache_range(VFS_I(ip), start_boundary,
-                                        end_boundary - 1);
-               /* convert the blocks */
-               error = xfs_alloc_file_space(ip, start_boundary,
-                                       end_boundary - start_boundary - 1,
-                                       XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
-                                       attr_flags);
-               if (error)
-                       goto out_unlock;
-
-               /* We've handled the interior of the range, now for the edges */
-               if (start_boundary != offset)
-                       error = xfs_iozero(ip, offset, start_boundary - offset);
-               if (error)
-                       goto out_unlock;
-
-               if (end_boundary != offset + len)
-                       error = xfs_iozero(ip, end_boundary,
-                                          offset + len - end_boundary);
-
-       } else {
-               /*
-                * It's either a sub-granularity range or the range spanned lies
-                * partially across two adjacent blocks.
-                */
-               error = xfs_iozero(ip, offset, len);
-       }
-
-out_unlock:
-       if (!(attr_flags & XFS_ATTR_NOLOCK))
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return error;
-
-}
-
-/*
- * xfs_change_file_space()
- *      This routine allocates or frees disk space for the given file.
- *      The user specified parameters are checked for alignment and size
- *      limitations.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-int
-xfs_change_file_space(
-       xfs_inode_t     *ip,
-       int             cmd,
-       xfs_flock64_t   *bf,
-       xfs_off_t       offset,
-       int             attr_flags)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       int             clrprealloc;
-       int             error;
-       xfs_fsize_t     fsize;
-       int             setprealloc;
-       xfs_off_t       startoffset;
-       xfs_trans_t     *tp;
-       struct iattr    iattr;
-
-       if (!S_ISREG(ip->i_d.di_mode))
-               return XFS_ERROR(EINVAL);
-
-       switch (bf->l_whence) {
-       case 0: /*SEEK_SET*/
-               break;
-       case 1: /*SEEK_CUR*/
-               bf->l_start += offset;
-               break;
-       case 2: /*SEEK_END*/
-               bf->l_start += XFS_ISIZE(ip);
-               break;
-       default:
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * length of <= 0 for resv/unresv/zero is invalid.  length for
-        * alloc/free is ignored completely and we have no idea what userspace
-        * might have set it to, so set it to zero to allow range
-        * checks to pass.
-        */
-       switch (cmd) {
-       case XFS_IOC_ZERO_RANGE:
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_RESVSP64:
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_UNRESVSP64:
-               if (bf->l_len <= 0)
-                       return XFS_ERROR(EINVAL);
-               break;
-       default:
-               bf->l_len = 0;
-               break;
-       }
-
-       if (bf->l_start < 0 ||
-           bf->l_start > mp->m_super->s_maxbytes ||
-           bf->l_start + bf->l_len < 0 ||
-           bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
-               return XFS_ERROR(EINVAL);
-
-       bf->l_whence = 0;
-
-       startoffset = bf->l_start;
-       fsize = XFS_ISIZE(ip);
-
-       setprealloc = clrprealloc = 0;
-       switch (cmd) {
-       case XFS_IOC_ZERO_RANGE:
-               error = xfs_zero_file_space(ip, startoffset, bf->l_len,
-                                               attr_flags);
-               if (error)
-                       return error;
-               setprealloc = 1;
-               break;
-
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_RESVSP64:
-               error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-                                               XFS_BMAPI_PREALLOC, attr_flags);
-               if (error)
-                       return error;
-               setprealloc = 1;
-               break;
-
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_UNRESVSP64:
-               if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
-                                                               attr_flags)))
-                       return error;
-               break;
-
-       case XFS_IOC_ALLOCSP:
-       case XFS_IOC_ALLOCSP64:
-       case XFS_IOC_FREESP:
-       case XFS_IOC_FREESP64:
-               /*
-                * These operations actually do IO when extending the file, but
-                * the allocation is done seperately to the zeroing that is
-                * done. This set of operations need to be serialised against
-                * other IO operations, such as truncate and buffered IO. We
-                * need to take the IOLOCK here to serialise the allocation and
-                * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
-                * truncate, direct IO) from racing against the transient
-                * allocated but not written state we can have here.
-                */
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-               if (startoffset > fsize) {
-                       error = xfs_alloc_file_space(ip, fsize,
-                                       startoffset - fsize, 0,
-                                       attr_flags | XFS_ATTR_NOLOCK);
-                       if (error) {
-                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                               break;
-                       }
-               }
-
-               iattr.ia_valid = ATTR_SIZE;
-               iattr.ia_size = startoffset;
-
-               error = xfs_setattr_size(ip, &iattr,
-                                        attr_flags | XFS_ATTR_NOLOCK);
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
-               if (error)
-                       return error;
-
-               clrprealloc = 1;
-               break;
-
-       default:
-               ASSERT(0);
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * update the inode timestamp, mode, and prealloc flag bits
-        */
-       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-
-       if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
-                                     0, 0, 0))) {
-               /* ASSERT(0); */
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       if ((attr_flags & XFS_ATTR_DMI) == 0) {
-               ip->i_d.di_mode &= ~S_ISUID;
-
-               /*
-                * Note that we don't have to worry about mandatory
-                * file locking being disabled here because we only
-                * clear the S_ISGID bit if the Group execute bit is
-                * on, but if it was on then mandatory locking wouldn't
-                * have been enabled.
-                */
-               if (ip->i_d.di_mode & S_IXGRP)
-                       ip->i_d.di_mode &= ~S_ISGID;
-
-               xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       }
-       if (setprealloc)
-               ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-       else if (clrprealloc)
-               ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       if (attr_flags & XFS_ATTR_SYNC)
-               xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
-}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h

deleted file mode 100644 (file)

index 38c67c3..0000000
--- a/fs/xfs/xfs_vnodeops.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _XFS_VNODEOPS_H
-#define _XFS_VNODEOPS_H 1
-
-struct attrlist_cursor_kern;
-struct file;
-struct iattr;
-struct inode;
-struct iovec;
-struct kiocb;
-struct pipe_inode_info;
-struct uio;
-struct xfs_inode;
-
-
-int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
-int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
-#define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
-#define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
-#define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL         0x08    /* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC          0x10    /* synchronous operation required */
-
-int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-               struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
-               xfs_dev_t rdev, struct xfs_inode **ipp);
-int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
-               struct xfs_inode *ip);
-int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-               struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-               const char *target_path, umode_t mode, struct xfs_inode **ipp);
-int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-               xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
-               struct xfs_inode *src_ip, struct xfs_inode *target_dp,
-               struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-               unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
-               unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
-               int flags, struct attrlist_cursor_kern *cursor);
-
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-
-#endif /* _XFS_VNODEOPS_H */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c

index 87d3e03878c8da30b762d19263ec3e1cbe02aa24..e01f35ea76ba436310f11d82b9fdf78322d8f6fe 100644 (file)
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,13 @@
   */
  
  #include "xfs.h"
+#include "xfs_log_format.h"
  #include "xfs_da_btree.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_inode.h"
  #include "xfs_attr.h"
  #include "xfs_attr_leaf.h"
  #include "xfs_acl.h"
-#include "xfs_vnodeops.h"
  
  #include <linux/posix_acl_xattr.h>
  #include <linux/xattr.h>
diff --git a/include/linux/quota.h b/include/linux/quota.h

index d13371134c59fbec0be79992a692257d8021a959..cc7494a3542983bbd4e73b95920eb981bb196716 100644 (file)
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -328,6 +328,7 @@ struct quotactl_ops {
         int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *);
         int (*get_xstate)(struct super_block *, struct fs_quota_stat *);
         int (*set_xstate)(struct super_block *, unsigned int, int);
+       int (*get_xstatev)(struct super_block *, struct fs_quota_statv *);
  };
  
  struct quota_format_type {
diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h

index 86552807aed949529fc34a3007658ede564f4e52..dcd75cc261962f65c909a6efa0defe3f1dcdc281 100644 (file)
--- a/include/uapi/linux/dqblk_xfs.h
+++ b/include/uapi/linux/dqblk_xfs.h
@@ -38,6 +38,7 @@
  #define Q_XGETQSTAT    XQM_CMD(5)      /* get quota subsystem status */
  #define Q_XQUOTARM     XQM_CMD(6)      /* free disk space used by dquots */
  #define Q_XQUOTASYNC   XQM_CMD(7)      /* delalloc flush, updates dquots */
+#define Q_XGETQSTATV   XQM_CMD(8)      /* newer version of get quota */
  
  /*
   * fs_disk_quota structure:
@@ -163,4 +164,50 @@ typedef struct fs_quota_stat {
         __u16           qs_iwarnlimit;  /* limit for num warnings */
  } fs_quota_stat_t;
  
+/*
+ * fs_quota_statv is used by Q_XGETQSTATV for a given file system. It provides
+ * a centralized way to get meta information about the quota subsystem. eg.
+ * space taken up for user, group, and project quotas, number of dquots
+ * currently incore.
+ *
+ * This version has proper versioning support with appropriate padding for
+ * future expansions, and ability to expand for future without creating any
+ * backward compatibility issues.
+ *
+ * Q_XGETQSTATV uses the passed in value of the requested version via
+ * fs_quota_statv.qs_version to determine the return data layout of
+ * fs_quota_statv.  The kernel will fill the data fields relevant to that
+ * version.
+ *
+ * If kernel does not support user space caller specified version, EINVAL will
+ * be returned. User space caller can then reduce the version number and retry
+ * the same command.
+ */
+#define FS_QSTATV_VERSION1     1       /* fs_quota_statv.qs_version */
+/*
+ * Some basic information about 'quota files' for Q_XGETQSTATV command
+ */
+struct fs_qfilestatv {
+       __u64           qfs_ino;        /* inode number */
+       __u64           qfs_nblks;      /* number of BBs 512-byte-blks */
+       __u32           qfs_nextents;   /* number of extents */
+       __u32           qfs_pad;        /* pad for 8-byte alignment */
+};
+
+struct fs_quota_statv {
+       __s8                    qs_version;     /* version for future changes */
+       __u8                    qs_pad1;        /* pad for 16bit alignment */
+       __u16                   qs_flags;       /* FS_QUOTA_.* flags */
+       __u32                   qs_incoredqs;   /* number of dquots incore */
+       struct fs_qfilestatv    qs_uquota;      /* user quota information */
+       struct fs_qfilestatv    qs_gquota;      /* group quota information */
+       struct fs_qfilestatv    qs_pquota;      /* project quota information */
+       __s32                   qs_btimelimit;  /* limit for blks timer */
+       __s32                   qs_itimelimit;  /* limit for inodes timer */
+       __s32                   qs_rtbtimelimit;/* limit for rt blks timer */
+       __u16                   qs_bwarnlimit;  /* limit for num warnings */
+       __u16                   qs_iwarnlimit;  /* limit for num warnings */
+       __u64                   qs_pad2[8];     /* for future proofing */
+};
+
  #endif /* _LINUX_DQBLK_XFS_H */
diff --git a/init/Kconfig b/init/Kconfig

index 0a2c4bcf179e21d321b25206718cb58aad722b54..bfa9e13c9a939d34045bb8a360aef1743093b800 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1123,7 +1123,6 @@ config IPC_NS
  
  config USER_NS
         bool "User namespace"
-       depends on UIDGID_CONVERTED
         select UIDGID_STRICT_TYPE_CHECKS
  
         default n
@@ -1157,20 +1156,8 @@ config NET_NS
  
  endif # NAMESPACES
  
-config UIDGID_CONVERTED
-       # True if all of the selected software conmponents are known
-       # to have uid_t and gid_t converted to kuid_t and kgid_t
-       # where appropriate and are otherwise safe to use with
-       # the user namespace.
-       bool
-       default y
-
-       # Filesystems
-       depends on XFS_FS = n
-
  config UIDGID_STRICT_TYPE_CHECKS
         bool "Require conversions between uid/gids and their internal representation"
-       depends on UIDGID_CONVERTED
         default n
         help
          While the nececessary conversions are being added to all subsystems this option allows
diff --git a/kernel/capability.c b/kernel/capability.c

index 6fc1c8af44df745bad512e02be04645984d1b5a1..4e66bf9275b03edf3c62e0e350afc5248f2b00b8 100644 (file)
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -452,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
  
         return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
  }
+EXPORT_SYMBOL(inode_capable);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
arch/powerpc/platforms/cell/spufs/inode.c		patch \| blob \| history
fs/quota/quota.c		patch \| blob \| history
fs/xfs/Makefile		patch \| blob \| history
fs/xfs/xfs_acl.c		patch \| blob \| history
fs/xfs/xfs_ag.h		patch \| blob \| history
fs/xfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/xfs_aops.c		patch \| blob \| history
fs/xfs/xfs_attr.c		patch \| blob \| history
fs/xfs/xfs_attr.h		patch \| blob \| history
fs/xfs/xfs_attr_inactive.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_attr_leaf.c		patch \| blob \| history
fs/xfs/xfs_attr_leaf.h		patch \| blob \| history
fs/xfs/xfs_attr_list.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_attr_remote.c		patch \| blob \| history
fs/xfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/xfs_bmap.h		patch \| blob \| history
fs/xfs/xfs_bmap_btree.c		patch \| blob \| history
fs/xfs/xfs_bmap_util.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_bmap_util.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_btree.c		patch \| blob \| history
fs/xfs/xfs_btree.h		patch \| blob \| history
fs/xfs/xfs_buf.c		patch \| blob \| history
fs/xfs/xfs_buf_item.c		patch \| blob \| history
fs/xfs/xfs_buf_item.h		patch \| blob \| history
fs/xfs/xfs_da_btree.c		patch \| blob \| history
fs/xfs/xfs_da_btree.h		patch \| blob \| history
fs/xfs/xfs_dfrag.c	[deleted file]	patch \| blob \| history
fs/xfs/xfs_dfrag.h	[deleted file]	patch \| blob \| history
fs/xfs/xfs_dir2.c		patch \| blob \| history
fs/xfs/xfs_dir2.h		patch \| blob \| history
fs/xfs/xfs_dir2_block.c		patch \| blob \| history
fs/xfs/xfs_dir2_data.c		patch \| blob \| history
fs/xfs/xfs_dir2_format.h		patch \| blob \| history
fs/xfs/xfs_dir2_leaf.c		patch \| blob \| history
fs/xfs/xfs_dir2_node.c		patch \| blob \| history
fs/xfs/xfs_dir2_priv.h		patch \| blob \| history
fs/xfs/xfs_dir2_readdir.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_dir2_sf.c		patch \| blob \| history
fs/xfs/xfs_discard.c		patch \| blob \| history
fs/xfs/xfs_dquot.c		patch \| blob \| history
fs/xfs/xfs_dquot_item.c		patch \| blob \| history
fs/xfs/xfs_error.c		patch \| blob \| history
fs/xfs/xfs_export.c		patch \| blob \| history
fs/xfs/xfs_extent_busy.c		patch \| blob \| history
fs/xfs/xfs_extfree_item.c		patch \| blob \| history
fs/xfs/xfs_extfree_item.h		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_filestream.c		patch \| blob \| history
fs/xfs/xfs_filestream.h		patch \| blob \| history
fs/xfs/xfs_format.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_fs.h		patch \| blob \| history
fs/xfs/xfs_fsops.c		patch \| blob \| history
fs/xfs/xfs_ialloc.c		patch \| blob \| history
fs/xfs/xfs_icache.c		patch \| blob \| history
fs/xfs/xfs_icache.h		patch \| blob \| history
fs/xfs/xfs_icreate_item.c		patch \| blob \| history
fs/xfs/xfs_icreate_item.h		patch \| blob \| history
fs/xfs/xfs_inode.c		patch \| blob \| history
fs/xfs/xfs_inode.h		patch \| blob \| history
fs/xfs/xfs_inode_buf.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_inode_buf.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_inode_fork.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_inode_fork.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_inode_item.c		patch \| blob \| history
fs/xfs/xfs_inode_item.h		patch \| blob \| history
fs/xfs/xfs_ioctl.c		patch \| blob \| history
fs/xfs/xfs_ioctl.h		patch \| blob \| history
fs/xfs/xfs_ioctl32.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_iops.c		patch \| blob \| history
fs/xfs/xfs_iops.h		patch \| blob \| history
fs/xfs/xfs_linux.h		patch \| blob \| history
fs/xfs/xfs_log.c		patch \| blob \| history
fs/xfs/xfs_log.h		patch \| blob \| history
fs/xfs/xfs_log_cil.c		patch \| blob \| history
fs/xfs/xfs_log_format.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_log_priv.h		patch \| blob \| history
fs/xfs/xfs_log_recover.c		patch \| blob \| history
fs/xfs/xfs_log_rlimit.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_mount.c		patch \| blob \| history
fs/xfs/xfs_mount.h		patch \| blob \| history
fs/xfs/xfs_qm.c		patch \| blob \| history
fs/xfs/xfs_qm.h		patch \| blob \| history
fs/xfs/xfs_qm_bhv.c		patch \| blob \| history
fs/xfs/xfs_qm_syscalls.c		patch \| blob \| history
fs/xfs/xfs_quota.h		patch \| blob \| history
fs/xfs/xfs_quota_defs.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_quotaops.c		patch \| blob \| history
fs/xfs/xfs_rename.c	[deleted file]	patch \| blob \| history
fs/xfs/xfs_rtalloc.c		patch \| blob \| history
fs/xfs/xfs_rtalloc.h		patch \| blob \| history
fs/xfs/xfs_sb.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_sb.h		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_symlink.c		patch \| blob \| history
fs/xfs/xfs_symlink.h		patch \| blob \| history
fs/xfs/xfs_symlink_remote.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_trace.c		patch \| blob \| history
fs/xfs/xfs_trans.c		patch \| blob \| history
fs/xfs/xfs_trans.h		patch \| blob \| history
fs/xfs/xfs_trans_ail.c		patch \| blob \| history
fs/xfs/xfs_trans_buf.c		patch \| blob \| history
fs/xfs/xfs_trans_dquot.c		patch \| blob \| history
fs/xfs/xfs_trans_priv.h		patch \| blob \| history
fs/xfs/xfs_trans_resv.c	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_trans_resv.h	[new file with mode: 0644]	patch \| blob
fs/xfs/xfs_types.h		patch \| blob \| history
fs/xfs/xfs_utils.c	[deleted file]	patch \| blob \| history
fs/xfs/xfs_utils.h	[deleted file]	patch \| blob \| history
fs/xfs/xfs_vnodeops.c	[deleted file]	patch \| blob \| history
fs/xfs/xfs_vnodeops.h	[deleted file]	patch \| blob \| history
fs/xfs/xfs_xattr.c		patch \| blob \| history
include/linux/quota.h		patch \| blob \| history
include/uapi/linux/dqblk_xfs.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/capability.c		patch \| blob \| history