Pileus Git - ~andy/linux/commitdiff
Merge tag 'xfs-for-linus-v3.12-rc1' of git://oss.sgi.com/xfs/xfs
author: Linus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Mon, 9 Sep 2013 18:19:09 +0000 (11:19 -0700)
Pull xfs updates from Ben Myers:
 "For 3.12-rc1 there are a number of bugfixes in addition to work to
  ease usage of shared code between libxfs and the kernel, the rest of
  the work to enable project and group quotas to be used simultaneously,
  performance optimisations in the log and the CIL, directory entry file
  type support, fixes for log space reservations, some spelling/grammar
  cleanups, and the addition of user namespace support.

   - introduce readahead to log recovery
   - add directory entry file type support
   - fix a number of spelling errors in comments
   - introduce new Q_XGETQSTATV quotactl for project quotas
   - add USER_NS support
   - log space reservation rework
   - CIL optimisations
   - kernel/userspace libxfs rework"

* tag 'xfs-for-linus-v3.12-rc1' of git://oss.sgi.com/xfs/xfs: (112 commits)
  xfs: XFS_MOUNT_QUOTA_ALL needed by userspace
  xfs: dtype changed xfs_dir2_sfe_put_ino to xfs_dir3_sfe_put_ino
  Fix wrong flag ASSERT in xfs_attr_shortform_getvalue
  xfs: finish removing IOP_* macros.
  xfs: inode log reservations are too small
  xfs: check correct status variable for xfs_inobt_get_rec() call
  xfs: inode buffers may not be valid during recovery readahead
  xfs: check LSN ordering for v5 superblocks during recovery
  xfs: btree block LSN escaping to disk uninitialised
  XFS: Assertion failed: first <= last && last < BBTOB(bp->b_length), file: fs/xfs/xfs_trans_buf.c, line: 568
  xfs: fix bad dquot buffer size in log recovery readahead
  xfs: don't account buffer cancellation during log recovery readahead
  xfs: check for underflow in xfs_iformat_fork()
  xfs: xfs_dir3_sfe_put_ino can be static
  xfs: introduce object readahead to log recovery
  xfs: Simplify xfs_ail_min() with list_first_entry_or_null()
  xfs: Register hotcpu notifier after initialization
  xfs: add xfs sb v4 support for dirent filetype field
  xfs: Add write support for dirent filetype field
  xfs: Add read-only support for dirent filetype field
  ...

116 files changed:
arch/powerpc/platforms/cell/spufs/inode.c
fs/quota/quota.c
fs/xfs/Makefile
fs/xfs/xfs_acl.c
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr.c
fs/xfs/xfs_attr.h
fs/xfs/xfs_attr_inactive.c [new file with mode: 0644]
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_attr_leaf.h
fs/xfs/xfs_attr_list.c [new file with mode: 0644]
fs/xfs/xfs_attr_remote.c
fs/xfs/xfs_bmap.c
fs/xfs/xfs_bmap.h
fs/xfs/xfs_bmap_btree.c
fs/xfs/xfs_bmap_util.c [new file with mode: 0644]
fs/xfs/xfs_bmap_util.h [new file with mode: 0644]
fs/xfs/xfs_btree.c
fs/xfs/xfs_btree.h
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_buf_item.h
fs/xfs/xfs_da_btree.c
fs/xfs/xfs_da_btree.h
fs/xfs/xfs_dfrag.c [deleted file]
fs/xfs/xfs_dfrag.h [deleted file]
fs/xfs/xfs_dir2.c
fs/xfs/xfs_dir2.h
fs/xfs/xfs_dir2_block.c
fs/xfs/xfs_dir2_data.c
fs/xfs/xfs_dir2_format.h
fs/xfs/xfs_dir2_leaf.c
fs/xfs/xfs_dir2_node.c
fs/xfs/xfs_dir2_priv.h
fs/xfs/xfs_dir2_readdir.c [new file with mode: 0644]
fs/xfs/xfs_dir2_sf.c
fs/xfs/xfs_discard.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot_item.c
fs/xfs/xfs_error.c
fs/xfs/xfs_export.c
fs/xfs/xfs_extent_busy.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_extfree_item.h
fs/xfs/xfs_file.c
fs/xfs/xfs_filestream.c
fs/xfs/xfs_filestream.h
fs/xfs/xfs_format.h [new file with mode: 0644]
fs/xfs/xfs_fs.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_icreate_item.c
fs/xfs/xfs_icreate_item.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_buf.c [new file with mode: 0644]
fs/xfs/xfs_inode_buf.h [new file with mode: 0644]
fs/xfs/xfs_inode_fork.c [new file with mode: 0644]
fs/xfs/xfs_inode_fork.h [new file with mode: 0644]
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_ioctl.h
fs/xfs/xfs_ioctl32.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_iops.h
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_format.h [new file with mode: 0644]
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_log_rlimit.c [new file with mode: 0644]
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm.h
fs/xfs/xfs_qm_bhv.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_quota_defs.h [new file with mode: 0644]
fs/xfs/xfs_quotaops.c
fs/xfs/xfs_rename.c [deleted file]
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_rtalloc.h
fs/xfs/xfs_sb.c [new file with mode: 0644]
fs/xfs/xfs_sb.h
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_symlink.h
fs/xfs/xfs_symlink_remote.c [new file with mode: 0644]
fs/xfs/xfs_trace.c
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_dquot.c
fs/xfs/xfs_trans_priv.h
fs/xfs/xfs_trans_resv.c [new file with mode: 0644]
fs/xfs/xfs_trans_resv.h [new file with mode: 0644]
fs/xfs/xfs_types.h
fs/xfs/xfs_utils.c [deleted file]
fs/xfs/xfs_utils.h [deleted file]
fs/xfs/xfs_vnodeops.c [deleted file]
fs/xfs/xfs_vnodeops.h [deleted file]
fs/xfs/xfs_xattr.c
include/linux/quota.h
include/uapi/linux/dqblk_xfs.h
init/Kconfig
kernel/capability.c

index f3900427ffab5173ee05041a7375b9633312e835..87ba7cf99cd754590ffaf2f038926ae9098f4ca9 100644 (file)
@@ -620,12 +620,16 @@ spufs_parse_options(struct super_block *sb, char *options, struct inode *root)
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                return 0;
-                       root->i_uid = option;
+                       root->i_uid = make_kuid(current_user_ns(), option);
+                       if (!uid_valid(root->i_uid))
+                               return 0;
                        break;
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return 0;
-                       root->i_gid = option;
+                       root->i_gid = make_kgid(current_user_ns(), option);
+                       if (!gid_valid(root->i_gid))
+                               return 0;
                        break;
                case Opt_mode:
                        if (match_octal(&args[0], &option))
index c7314f1771f5824be95fffb4cd27695494c767ff..dea86e8967ee2c354d489384a83d1492b1dbcba3 100644 (file)
@@ -27,6 +27,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
        case Q_SYNC:
        case Q_GETINFO:
        case Q_XGETQSTAT:
+       case Q_XGETQSTATV:
        case Q_XQUOTASYNC:
                break;
        /* allow to query information for dquots we "own" */
@@ -217,6 +218,31 @@ static int quota_getxstate(struct super_block *sb, void __user *addr)
        return ret;
 }
 
+static int quota_getxstatev(struct super_block *sb, void __user *addr)
+{
+       struct fs_quota_statv fqs;
+       int ret;
+
+       if (!sb->s_qcop->get_xstatev)
+               return -ENOSYS;
+
+       memset(&fqs, 0, sizeof(fqs));
+       if (copy_from_user(&fqs, addr, 1)) /* Just read qs_version */
+               return -EFAULT;
+
+       /* If this kernel doesn't support user specified version, fail */
+       switch (fqs.qs_version) {
+       case FS_QSTATV_VERSION1:
+               break;
+       default:
+               return -EINVAL;
+       }
+       ret = sb->s_qcop->get_xstatev(sb, &fqs);
+       if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+               return -EFAULT;
+       return ret;
+}
+
 static int quota_setxquota(struct super_block *sb, int type, qid_t id,
                           void __user *addr)
 {
@@ -293,6 +319,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
                return quota_setxstate(sb, cmd, addr);
        case Q_XGETQSTAT:
                return quota_getxstate(sb, addr);
+       case Q_XGETQSTATV:
+               return quota_getxstatev(sb, addr);
        case Q_XSETQLIM:
                return quota_setxquota(sb, type, id, addr);
        case Q_XGETQUOTA:
@@ -317,6 +345,7 @@ static int quotactl_cmd_write(int cmd)
        case Q_GETINFO:
        case Q_SYNC:
        case Q_XGETQSTAT:
+       case Q_XGETQSTATV:
        case Q_XGETQUOTA:
        case Q_XQUOTASYNC:
                return 0;
index 4a4508023a3c15724a2edfd87204550f718c805b..0719e4db93f274de9af844f3ef66ce387b02a716 100644 (file)
@@ -27,9 +27,12 @@ xfs-y                                += xfs_trace.o
 
 # highlevel code
 xfs-y                          += xfs_aops.o \
+                                  xfs_attr_inactive.o \
+                                  xfs_attr_list.o \
                                   xfs_bit.o \
+                                  xfs_bmap_util.o \
                                   xfs_buf.o \
-                                  xfs_dfrag.o \
+                                  xfs_dir2_readdir.o \
                                   xfs_discard.o \
                                   xfs_error.o \
                                   xfs_export.o \
@@ -44,11 +47,11 @@ xfs-y                               += xfs_aops.o \
                                   xfs_iops.o \
                                   xfs_itable.o \
                                   xfs_message.o \
+                                  xfs_mount.o \
                                   xfs_mru_cache.o \
-                                  xfs_rename.o \
                                   xfs_super.o \
-                                  xfs_utils.o \
-                                  xfs_vnodeops.o \
+                                  xfs_symlink.o \
+                                  xfs_trans.o \
                                   xfs_xattr.o \
                                   kmem.o \
                                   uuid.o
@@ -73,10 +76,13 @@ xfs-y                               += xfs_alloc.o \
                                   xfs_ialloc_btree.o \
                                   xfs_icreate_item.o \
                                   xfs_inode.o \
+                                  xfs_inode_fork.o \
+                                  xfs_inode_buf.o \
                                   xfs_log_recover.o \
-                                  xfs_mount.o \
-                                  xfs_symlink.o \
-                                  xfs_trans.o
+                                  xfs_log_rlimit.o \
+                                  xfs_sb.o \
+                                  xfs_symlink_remote.o \
+                                  xfs_trans_resv.o
 
 # low-level transaction/log code
 xfs-y                          += xfs_log.o \
index 306d883d89bc7d6420ca4b5b8c5f848e573249bf..69518960b2ba17e888d71f75d30e4df57d0cca73 100644 (file)
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_vnodeops.h"
+#include "xfs_ag.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_trace.h"
@@ -68,14 +70,15 @@ xfs_acl_from_disk(
 
                switch (acl_e->e_tag) {
                case ACL_USER:
+                       acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+                       break;
                case ACL_GROUP:
-                       acl_e->e_id = be32_to_cpu(ace->ae_id);
+                       acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
                        break;
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
-                       acl_e->e_id = ACL_UNDEFINED_ID;
                        break;
                default:
                        goto fail;
@@ -101,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
                acl_e = &acl->a_entries[i];
 
                ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-               ace->ae_id = cpu_to_be32(acl_e->e_id);
+               switch (acl_e->e_tag) {
+               case ACL_USER:
+                       ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+                       break;
+               case ACL_GROUP:
+                       ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+                       break;
+               default:
+                       ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+                       break;
+               }
+
                ace->ae_perm = cpu_to_be16(acl_e->e_perm);
        }
 }
@@ -360,7 +374,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
                return -EINVAL;
        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
                return value ? -EACCES : 0;
-       if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+       if (!inode_owner_or_capable(inode))
                return -EPERM;
 
        if (!value)
index 317aa86d96ea04925b35e995cc70e1eb024beb28..1cb740afd674e8fd3f5ce0a3f47bfc00f8b36471 100644 (file)
@@ -226,59 +226,6 @@ typedef struct xfs_agfl {
        __be32          agfl_bno[];     /* actually XFS_AGFL_SIZE(mp) */
 } xfs_agfl_t;
 
-/*
- * Per-ag incore structure, copies of information in agf and agi,
- * to improve the performance of allocation group selection.
- */
-#define XFS_PAGB_NUM_SLOTS     128
-
-typedef struct xfs_perag {
-       struct xfs_mount *pag_mount;    /* owner filesystem */
-       xfs_agnumber_t  pag_agno;       /* AG this structure belongs to */
-       atomic_t        pag_ref;        /* perag reference count */
-       char            pagf_init;      /* this agf's entry is initialized */
-       char            pagi_init;      /* this agi's entry is initialized */
-       char            pagf_metadata;  /* the agf is preferred to be metadata */
-       char            pagi_inodeok;   /* The agi is ok for inodes */
-       __uint8_t       pagf_levels[XFS_BTNUM_AGF];
-                                       /* # of levels in bno & cnt btree */
-       __uint32_t      pagf_flcount;   /* count of blocks in freelist */
-       xfs_extlen_t    pagf_freeblks;  /* total free blocks */
-       xfs_extlen_t    pagf_longest;   /* longest free space */
-       __uint32_t      pagf_btreeblks; /* # of blocks held in AGF btrees */
-       xfs_agino_t     pagi_freecount; /* number of free inodes */
-       xfs_agino_t     pagi_count;     /* number of allocated inodes */
-
-       /*
-        * Inode allocation search lookup optimisation.
-        * If the pagino matches, the search for new inodes
-        * doesn't need to search the near ones again straight away
-        */
-       xfs_agino_t     pagl_pagino;
-       xfs_agino_t     pagl_leftrec;
-       xfs_agino_t     pagl_rightrec;
-#ifdef __KERNEL__
-       spinlock_t      pagb_lock;      /* lock for pagb_tree */
-       struct rb_root  pagb_tree;      /* ordered tree of busy extents */
-
-       atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
-
-       spinlock_t      pag_ici_lock;   /* incore inode cache lock */
-       struct radix_tree_root pag_ici_root;    /* incore inode cache root */
-       int             pag_ici_reclaimable;    /* reclaimable inodes */
-       struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
-       unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
-
-       /* buffer cache index */
-       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
-       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
-
-       /* for rcu-safe freeing */
-       struct rcu_head rcu_head;
-#endif
-       int             pagb_count;     /* pagb slots in use */
-} xfs_perag_t;
-
 /*
  * tags for inode radix tree
  */
index 71596e57283ae6b44702f8d6866405de6b911728..5a1393f5e020739a648612002f7ae9a3f704d032 100644 (file)
@@ -878,7 +878,7 @@ xfs_alloc_ag_vextent_near(
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
        xfs_extlen_t    rlen;           /* length of returned extent */
        int             forced = 0;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
        /*
         * Randomly don't execute the first algorithm.
         */
@@ -938,8 +938,8 @@ restart:
                xfs_extlen_t    blen=0;
                xfs_agblock_t   bnew=0;
 
-#if defined(DEBUG) && defined(__KERNEL__)
-               if (!dofirst)
+#ifdef DEBUG
+               if (dofirst)
                        break;
 #endif
                /*
index e11d654af786b580086f188f72881fe7416ca928..977da0ec66047eedacb3298ebec57de3a2968f0c 100644 (file)
@@ -28,9 +28,9 @@
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
@@ -108,7 +108,7 @@ xfs_setfilesize_trans_alloc(
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
 
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
@@ -440,7 +440,7 @@ xfs_start_page_writeback(
                end_page_writeback(page);
 }
 
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 {
        return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 }
@@ -514,7 +514,7 @@ xfs_submit_ioend(
                                goto retry;
                        }
 
-                       if (bio_add_buffer(bio, bh) != bh->b_size) {
+                       if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
                                xfs_submit_ioend_bio(wbc, ioend, bio);
                                goto retry;
                        }
@@ -1498,13 +1498,26 @@ xfs_vm_write_failed(
        loff_t                  pos,
        unsigned                len)
 {
-       loff_t                  block_offset = pos & PAGE_MASK;
+       loff_t                  block_offset;
        loff_t                  block_start;
        loff_t                  block_end;
        loff_t                  from = pos & (PAGE_CACHE_SIZE - 1);
        loff_t                  to = from + len;
        struct buffer_head      *bh, *head;
 
+       /*
+        * The request pos offset might be 32 or 64 bit, this is all fine
+        * on 64-bit platform.  However, for 64-bit pos request on 32-bit
+        * platform, the high 32-bit will be masked off if we evaluate the
+        * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+        * 0xfffff000 as an unsigned long, hence the result is incorrect
+        * which could cause the following ASSERT failed in most cases.
+        * In order to avoid this, we can evaluate the block_offset of the
+        * start of the page by using shifts rather than masks the mismatch
+        * problem.
+        */
+       block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
        ASSERT(block_offset + from == pos);
 
        head = page_buffers(page);
index 20fe3fe9d3417aabcd566ddad7719d3653ad4443..ddcf2267ffa6fdf1bcf33cf7439c6b379472c2b4 100644 (file)
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_alloc.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
 /*
@@ -62,7 +63,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
 
 /*
  * Internal routines when attribute list is more than one block.
@@ -70,7 +70,6 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
 STATIC int xfs_attr_node_get(xfs_da_args_t *args);
 STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
 STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
-STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
 STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
 STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
 
@@ -90,7 +89,7 @@ xfs_attr_name_to_xname(
        return 0;
 }
 
-STATIC int
+int
 xfs_inode_hasattr(
        struct xfs_inode        *ip)
 {
@@ -227,13 +226,14 @@ xfs_attr_set_int(
        int             valuelen,
        int             flags)
 {
-       xfs_da_args_t   args;
-       xfs_fsblock_t   firstblock;
-       xfs_bmap_free_t flist;
-       int             error, err2, committed;
-       xfs_mount_t     *mp = dp->i_mount;
-       int             rsvd = (flags & ATTR_ROOT) != 0;
-       int             local;
+       xfs_da_args_t           args;
+       xfs_fsblock_t           firstblock;
+       xfs_bmap_free_t         flist;
+       int                     error, err2, committed;
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_trans_res    tres;
+       int                     rsvd = (flags & ATTR_ROOT) != 0;
+       int                     local;
 
        /*
         * Attach the dquots to the inode.
@@ -293,11 +293,11 @@ xfs_attr_set_int(
        if (rsvd)
                args.trans->t_flags |= XFS_TRANS_RESERVE;
 
-       error = xfs_trans_reserve(args.trans, args.total,
-                                 XFS_ATTRSETM_LOG_RES(mp) +
-                                 XFS_ATTRSETRT_LOG_RES(mp) * args.total,
-                                 0, XFS_TRANS_PERM_LOG_RES,
-                                 XFS_ATTRSET_LOG_COUNT);
+       tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+                        M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
+       tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
        if (error) {
                xfs_trans_cancel(args.trans, 0);
                return(error);
@@ -517,11 +517,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
        if (flags & ATTR_ROOT)
                args.trans->t_flags |= XFS_TRANS_RESERVE;
 
-       if ((error = xfs_trans_reserve(args.trans,
-                                     XFS_ATTRRM_SPACE_RES(mp),
-                                     XFS_ATTRRM_LOG_RES(mp),
-                                     0, XFS_TRANS_PERM_LOG_RES,
-                                     XFS_ATTRRM_LOG_COUNT))) {
+       error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
+                                 XFS_ATTRRM_SPACE_RES(mp), 0);
+       if (error) {
                xfs_trans_cancel(args.trans, 0);
                return(error);
        }
@@ -611,228 +609,6 @@ xfs_attr_remove(
        return xfs_attr_remove_int(dp, &xname, flags);
 }
 
-int
-xfs_attr_list_int(xfs_attr_list_context_t *context)
-{
-       int error;
-       xfs_inode_t *dp = context->dp;
-
-       XFS_STATS_INC(xs_attr_list);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return EIO;
-
-       xfs_ilock(dp, XFS_ILOCK_SHARED);
-
-       /*
-        * Decide on what work routines to call based on the inode size.
-        */
-       if (!xfs_inode_hasattr(dp)) {
-               error = 0;
-       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               error = xfs_attr_shortform_list(context);
-       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               error = xfs_attr_leaf_list(context);
-       } else {
-               error = xfs_attr_node_list(context);
-       }
-
-       xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
-       return error;
-}
-
-#define        ATTR_ENTBASESIZE                /* minimum bytes used by an attr */ \
-       (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define        ATTR_ENTSIZE(namelen)           /* actual bytes used by an attr */ \
-       ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
-        & ~(sizeof(u_int32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_attr_put_listent(
-       xfs_attr_list_context_t *context,
-       int             flags,
-       unsigned char   *name,
-       int             namelen,
-       int             valuelen,
-       unsigned char   *value)
-{
-       struct attrlist *alist = (struct attrlist *)context->alist;
-       attrlist_ent_t *aep;
-       int arraytop;
-
-       ASSERT(!(context->flags & ATTR_KERNOVAL));
-       ASSERT(context->count >= 0);
-       ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
-       ASSERT(context->firstu >= sizeof(*alist));
-       ASSERT(context->firstu <= context->bufsize);
-
-       /*
-        * Only list entries in the right namespace.
-        */
-       if (((context->flags & ATTR_SECURE) == 0) !=
-           ((flags & XFS_ATTR_SECURE) == 0))
-               return 0;
-       if (((context->flags & ATTR_ROOT) == 0) !=
-           ((flags & XFS_ATTR_ROOT) == 0))
-               return 0;
-
-       arraytop = sizeof(*alist) +
-                       context->count * sizeof(alist->al_offset[0]);
-       context->firstu -= ATTR_ENTSIZE(namelen);
-       if (context->firstu < arraytop) {
-               trace_xfs_attr_list_full(context);
-               alist->al_more = 1;
-               context->seen_enough = 1;
-               return 1;
-       }
-
-       aep = (attrlist_ent_t *)&context->alist[context->firstu];
-       aep->a_valuelen = valuelen;
-       memcpy(aep->a_name, name, namelen);
-       aep->a_name[namelen] = 0;
-       alist->al_offset[context->count++] = context->firstu;
-       alist->al_count = context->count;
-       trace_xfs_attr_list_add(context);
-       return 0;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths.  Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
-       xfs_inode_t     *dp,
-       char            *buffer,
-       int             bufsize,
-       int             flags,
-       attrlist_cursor_kern_t *cursor)
-{
-       xfs_attr_list_context_t context;
-       struct attrlist *alist;
-       int error;
-
-       /*
-        * Validate the cursor.
-        */
-       if (cursor->pad1 || cursor->pad2)
-               return(XFS_ERROR(EINVAL));
-       if ((cursor->initted == 0) &&
-           (cursor->hashval || cursor->blkno || cursor->offset))
-               return XFS_ERROR(EINVAL);
-
-       /*
-        * Check for a properly aligned buffer.
-        */
-       if (((long)buffer) & (sizeof(int)-1))
-               return XFS_ERROR(EFAULT);
-       if (flags & ATTR_KERNOVAL)
-               bufsize = 0;
-
-       /*
-        * Initialize the output buffer.
-        */
-       memset(&context, 0, sizeof(context));
-       context.dp = dp;
-       context.cursor = cursor;
-       context.resynch = 1;
-       context.flags = flags;
-       context.alist = buffer;
-       context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
-       context.firstu = context.bufsize;
-       context.put_listent = xfs_attr_put_listent;
-
-       alist = (struct attrlist *)context.alist;
-       alist->al_count = 0;
-       alist->al_more = 0;
-       alist->al_offset[0] = context.bufsize;
-
-       error = xfs_attr_list_int(&context);
-       ASSERT(error >= 0);
-       return error;
-}
-
-int                                                            /* error */
-xfs_attr_inactive(xfs_inode_t *dp)
-{
-       xfs_trans_t *trans;
-       xfs_mount_t *mp;
-       int error;
-
-       mp = dp->i_mount;
-       ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
-
-       xfs_ilock(dp, XFS_ILOCK_SHARED);
-       if (!xfs_inode_hasattr(dp) ||
-           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               xfs_iunlock(dp, XFS_ILOCK_SHARED);
-               return 0;
-       }
-       xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
-       /*
-        * Start our first transaction of the day.
-        *
-        * All future transactions during this code must be "chained" off
-        * this one via the trans_dup() call.  All transactions will contain
-        * the inode, and the inode will always be marked with trans_ihold().
-        * Since the inode will be locked in all transactions, we must log
-        * the inode in every transaction to let it float upward through
-        * the log.
-        */
-       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
-       if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
-                                     XFS_TRANS_PERM_LOG_RES,
-                                     XFS_ATTRINVAL_LOG_COUNT))) {
-               xfs_trans_cancel(trans, 0);
-               return(error);
-       }
-       xfs_ilock(dp, XFS_ILOCK_EXCL);
-
-       /*
-        * No need to make quota reservations here. We expect to release some
-        * blocks, not allocate, in the common case.
-        */
-       xfs_trans_ijoin(trans, dp, 0);
-
-       /*
-        * Decide on what work routines to call based on the inode size.
-        */
-       if (!xfs_inode_hasattr(dp) ||
-           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               error = 0;
-               goto out;
-       }
-       error = xfs_attr3_root_inactive(&trans, dp);
-       if (error)
-               goto out;
-
-       error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
-       if (error)
-               goto out;
-
-       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-       return(error);
-
-out:
-       xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return(error);
-}
-
-
 
 /*========================================================================
  * External routines when attribute list is inside the inode
@@ -1166,28 +942,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
        return error;
 }
 
-/*
- * Copy out attribute entries for attr_list(), for leaf attribute lists.
- */
-STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
-{
-       int error;
-       struct xfs_buf *bp;
-
-       trace_xfs_attr_leaf_list(context);
-
-       context->cursor->blkno = 0;
-       error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
-       if (error)
-               return XFS_ERROR(error);
-
-       error = xfs_attr3_leaf_list_int(bp, context);
-       xfs_trans_brelse(NULL, bp);
-       return XFS_ERROR(error);
-}
-
-
 /*========================================================================
  * External routines when attribute list size > XFS_LBSIZE(mp).
  *========================================================================*/
@@ -1260,6 +1014,7 @@ restart:
                         * have been a b-tree.
                         */
                        xfs_da_state_free(state);
+                       state = NULL;
                        xfs_bmap_init(args->flist, args->firstblock);
                        error = xfs_attr3_leaf_to_node(args);
                        if (!error) {
@@ -1780,143 +1535,3 @@ xfs_attr_node_get(xfs_da_args_t *args)
        xfs_da_state_free(state);
        return(retval);
 }
-
-STATIC int                                                     /* error */
-xfs_attr_node_list(xfs_attr_list_context_t *context)
-{
-       attrlist_cursor_kern_t *cursor;
-       xfs_attr_leafblock_t *leaf;
-       xfs_da_intnode_t *node;
-       struct xfs_attr3_icleaf_hdr leafhdr;
-       struct xfs_da3_icnode_hdr nodehdr;
-       struct xfs_da_node_entry *btree;
-       int error, i;
-       struct xfs_buf *bp;
-
-       trace_xfs_attr_node_list(context);
-
-       cursor = context->cursor;
-       cursor->initted = 1;
-
-       /*
-        * Do all sorts of validation on the passed-in cursor structure.
-        * If anything is amiss, ignore the cursor and look up the hashval
-        * starting from the btree root.
-        */
-       bp = NULL;
-       if (cursor->blkno > 0) {
-               error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
-                                             &bp, XFS_ATTR_FORK);
-               if ((error != 0) && (error != EFSCORRUPTED))
-                       return(error);
-               if (bp) {
-                       struct xfs_attr_leaf_entry *entries;
-
-                       node = bp->b_addr;
-                       switch (be16_to_cpu(node->hdr.info.magic)) {
-                       case XFS_DA_NODE_MAGIC:
-                       case XFS_DA3_NODE_MAGIC:
-                               trace_xfs_attr_list_wrong_blk(context);
-                               xfs_trans_brelse(NULL, bp);
-                               bp = NULL;
-                               break;
-                       case XFS_ATTR_LEAF_MAGIC:
-                       case XFS_ATTR3_LEAF_MAGIC:
-                               leaf = bp->b_addr;
-                               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-                               entries = xfs_attr3_leaf_entryp(leaf);
-                               if (cursor->hashval > be32_to_cpu(
-                                               entries[leafhdr.count - 1].hashval)) {
-                                       trace_xfs_attr_list_wrong_blk(context);
-                                       xfs_trans_brelse(NULL, bp);
-                                       bp = NULL;
-                               } else if (cursor->hashval <= be32_to_cpu(
-                                               entries[0].hashval)) {
-                                       trace_xfs_attr_list_wrong_blk(context);
-                                       xfs_trans_brelse(NULL, bp);
-                                       bp = NULL;
-                               }
-                               break;
-                       default:
-                               trace_xfs_attr_list_wrong_blk(context);
-                               xfs_trans_brelse(NULL, bp);
-                               bp = NULL;
-                       }
-               }
-       }
-
-       /*
-        * We did not find what we expected given the cursor's contents,
-        * so we start from the top and work down based on the hash value.
-        * Note that start of node block is same as start of leaf block.
-        */
-       if (bp == NULL) {
-               cursor->blkno = 0;
-               for (;;) {
-                       __uint16_t magic;
-
-                       error = xfs_da3_node_read(NULL, context->dp,
-                                                     cursor->blkno, -1, &bp,
-                                                     XFS_ATTR_FORK);
-                       if (error)
-                               return(error);
-                       node = bp->b_addr;
-                       magic = be16_to_cpu(node->hdr.info.magic);
-                       if (magic == XFS_ATTR_LEAF_MAGIC ||
-                           magic == XFS_ATTR3_LEAF_MAGIC)
-                               break;
-                       if (magic != XFS_DA_NODE_MAGIC &&
-                           magic != XFS_DA3_NODE_MAGIC) {
-                               XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    context->dp->i_mount,
-                                                    node);
-                               xfs_trans_brelse(NULL, bp);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
-
-                       xfs_da3_node_hdr_from_disk(&nodehdr, node);
-                       btree = xfs_da3_node_tree_p(node);
-                       for (i = 0; i < nodehdr.count; btree++, i++) {
-                               if (cursor->hashval
-                                               <= be32_to_cpu(btree->hashval)) {
-                                       cursor->blkno = be32_to_cpu(btree->before);
-                                       trace_xfs_attr_list_node_descend(context,
-                                                                        btree);
-                                       break;
-                               }
-                       }
-                       if (i == nodehdr.count) {
-                               xfs_trans_brelse(NULL, bp);
-                               return 0;
-                       }
-                       xfs_trans_brelse(NULL, bp);
-               }
-       }
-       ASSERT(bp != NULL);
-
-       /*
-        * Roll upward through the blocks, processing each leaf block in
-        * order.  As long as there is space in the result buffer, keep
-        * adding the information.
-        */
-       for (;;) {
-               leaf = bp->b_addr;
-               error = xfs_attr3_leaf_list_int(bp, context);
-               if (error) {
-                       xfs_trans_brelse(NULL, bp);
-                       return error;
-               }
-               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
-               if (context->seen_enough || leafhdr.forw == 0)
-                       break;
-               cursor->blkno = leafhdr.forw;
-               xfs_trans_brelse(NULL, bp);
-               error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
-                                          &bp);
-               if (error)
-                       return error;
-       }
-       xfs_trans_brelse(NULL, bp);
-       return 0;
-}
index de8dd58da46c28ec078dbd5e6d829ae725e973ab..dd4824589470eb106a2b5a764da6039d56121726 100644 (file)
@@ -141,5 +141,14 @@ typedef struct xfs_attr_list_context {
  */
 int xfs_attr_inactive(struct xfs_inode *dp);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+                unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+                unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+                 int flags, struct attrlist_cursor_kern *cursor);
+
 
 #endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644 (file)
index 0000000..bb24b07
--- /dev/null
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_remote.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ *
+ * The attribute-fork range starting at "blkno" and covering "blkcnt" blocks
+ * is walked one mapping at a time.  Every mapping that is not a hole has its
+ * buffer invalidated in the current transaction, which is then rolled.
+ * xfs_trans_roll() is handed the caller's transaction pointer, so presumably
+ * *trans refers to a new transaction afterwards (confirm against
+ * xfs_trans_roll()).
+ *
+ * Returns 0 on success, ENOMEM if a buffer could not be obtained, or the
+ * error reported by xfs_bmapi_read() or xfs_trans_roll().
+ */
+STATIC int
+xfs_attr3_leaf_freextent(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             blkno,
+       int                     blkcnt)
+{
+       struct xfs_bmbt_irec    map;
+       struct xfs_buf          *bp;
+       xfs_dablk_t             tblkno;
+       xfs_daddr_t             dblkno;
+       int                     tblkcnt;
+       int                     dblkcnt;
+       int                     nmap;
+       int                     error;
+
+       /*
+        * Roll through the "value", invalidating the attribute value's
+        * blocks.
+        */
+       tblkno = blkno;
+       tblkcnt = blkcnt;
+       while (tblkcnt > 0) {
+               /*
+                * Try to remember where we decided to put the value.
+                * Only one mapping is requested per iteration; the loop
+                * advances by whatever length that mapping covers.
+                */
+               nmap = 1;
+               error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+                                      &map, &nmap, XFS_BMAPI_ATTRFORK);
+               if (error) {
+                       return(error);
+               }
+               ASSERT(nmap == 1);
+               ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+               /*
+                * If it's a hole, these are already unmapped
+                * so there's nothing to invalidate.
+                */
+               if (map.br_startblock != HOLESTARTBLOCK) {
+
+                       /* Convert the mapping to a disk address and length. */
+                       dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+                                                 map.br_startblock);
+                       dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+                                               map.br_blockcount);
+                       bp = xfs_trans_get_buf(*trans,
+                                       dp->i_mount->m_ddev_targp,
+                                       dblkno, dblkcnt, 0);
+                       if (!bp)
+                               return ENOMEM;
+                       xfs_trans_binval(*trans, bp);
+                       /*
+                        * Roll to next transaction.
+                        */
+                       error = xfs_trans_roll(trans, dp);
+                       if (error)
+                               return (error);
+               }
+
+               /* Step past this mapping, whether it was a hole or not. */
+               tblkno += map.br_blockcount;
+               tblkcnt -= map.br_blockcount;
+       }
+
+       return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ *
+ * The leaf is scanned twice: once to count the entries that reference a
+ * remote value block, and once to copy each value's starting block and
+ * block count into a temporary list.  The leaf buffer "bp" is released on
+ * every return path of this function, before any extent is invalidated.
+ *
+ * Returns 0 on success, otherwise the first error reported by
+ * xfs_attr3_leaf_freextent().
+ */
+STATIC int
+xfs_attr3_leaf_inactive(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp,
+       struct xfs_buf          *bp)
+{
+       struct xfs_attr_leafblock *leaf;
+       struct xfs_attr3_icleaf_hdr ichdr;
+       struct xfs_attr_leaf_entry *entry;
+       struct xfs_attr_leaf_name_remote *name_rmt;
+       struct xfs_attr_inactive_list *list;
+       struct xfs_attr_inactive_list *lp;
+       int                     error;
+       int                     count;
+       int                     size;
+       int                     tmp;
+       int                     i;
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+       /*
+        * Count the number of "remote" value extents.
+        * An entry counts only if it has a name index, is not flagged
+        * XFS_ATTR_LOCAL, and its remote name records a value block.
+        */
+       count = 0;
+       entry = xfs_attr3_leaf_entryp(leaf);
+       for (i = 0; i < ichdr.count; entry++, i++) {
+               if (be16_to_cpu(entry->nameidx) &&
+                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       if (name_rmt->valueblk)
+                               count++;
+               }
+       }
+
+       /*
+        * If there are no "remote" values, we're done.
+        */
+       if (count == 0) {
+               xfs_trans_brelse(*trans, bp);
+               return 0;
+       }
+
+       /*
+        * Allocate storage for a list of all the "remote" value extents.
+        */
+       size = count * sizeof(xfs_attr_inactive_list_t);
+       list = kmem_alloc(size, KM_SLEEP);
+
+       /*
+        * Identify each of the "remote" value extents.
+        * The filter must match the counting pass above so that exactly
+        * "count" list slots are filled.
+        */
+       lp = list;
+       entry = xfs_attr3_leaf_entryp(leaf);
+       for (i = 0; i < ichdr.count; entry++, i++) {
+               if (be16_to_cpu(entry->nameidx) &&
+                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+                       if (name_rmt->valueblk) {
+                               lp->valueblk = be32_to_cpu(name_rmt->valueblk);
+                               /*
+                                * Despite the field name, this holds the
+                                * result of xfs_attr3_rmt_blocks(); it is
+                                * passed below as the block count.
+                                */
+                               lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
+                                                   be32_to_cpu(name_rmt->valuelen));
+                               lp++;
+                       }
+               }
+       }
+       xfs_trans_brelse(*trans, bp);   /* unlock for trans. in freextent() */
+
+       /*
+        * Invalidate each of the "remote" value extents.
+        * The loop keeps going after a failure; only the first error is
+        * reported to the caller.
+        */
+       error = 0;
+       for (lp = list, i = 0; i < count; i++, lp++) {
+               tmp = xfs_attr3_leaf_freextent(trans, dp,
+                               lp->valueblk, lp->valuelen);
+
+               if (error == 0)
+                       error = tmp;    /* save only the 1st errno */
+       }
+
+       kmem_free(list);
+       return error;
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ *
+ * "bp" is the buffer of the node to process; it is released before any
+ * child is visited.  "level" is the recursion depth and is bounded by
+ * XFS_DA_NODE_MAXDEPTH.  Each child is invalidated (recursively for nodes,
+ * via xfs_attr3_leaf_inactive() for leaves), then the child block itself is
+ * invalidated and the transaction is rolled.
+ *
+ * Returns 0 on success, EIO if the tree is too deep or a child has an
+ * unrecognised magic number, or the error from a failed read, invalidation
+ * or transaction roll.
+ */
+STATIC int
+xfs_attr3_node_inactive(
+       struct xfs_trans **trans,
+       struct xfs_inode *dp,
+       struct xfs_buf  *bp,
+       int             level)
+{
+       xfs_da_blkinfo_t *info;
+       xfs_da_intnode_t *node;
+       xfs_dablk_t child_fsb;
+       xfs_daddr_t parent_blkno, child_blkno;
+       int error, i;
+       struct xfs_buf *child_bp;
+       struct xfs_da_node_entry *btree;
+       struct xfs_da3_icnode_hdr ichdr;
+
+       /*
+        * Since this code is recursive (gasp!) we must protect ourselves.
+        */
+       if (level > XFS_DA_NODE_MAXDEPTH) {
+               xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
+               return XFS_ERROR(EIO);
+       }
+
+       node = bp->b_addr;
+       xfs_da3_node_hdr_from_disk(&ichdr, node);
+       parent_blkno = bp->b_bn;
+       if (!ichdr.count) {
+               xfs_trans_brelse(*trans, bp);
+               return 0;
+       }
+       btree = xfs_da3_node_tree_p(node);
+       child_fsb = be32_to_cpu(btree[0].before);
+       xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
+
+       /*
+        * If this is the node level just above the leaves, simply loop
+        * over the leaves removing all of them.  If this is higher up
+        * in the tree, recurse downward.
+        */
+       for (i = 0; i < ichdr.count; i++) {
+               /*
+                * Read the subsidiary block to see what we have to work with.
+                * Don't do this in a transaction.  This is a depth-first
+                * traversal of the tree so we may deal with many blocks
+                * before we come back to this one.
+                */
+               error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
+                                               XFS_ATTR_FORK);
+               if (error)
+                       return(error);
+               if (child_bp) {
+                                               /* save for re-read later */
+                       child_blkno = XFS_BUF_ADDR(child_bp);
+
+                       /*
+                        * Invalidate the subtree, however we have to.
+                        * Every case below disposes of child_bp itself.
+                        */
+                       info = child_bp->b_addr;
+                       switch (info->magic) {
+                       case cpu_to_be16(XFS_DA_NODE_MAGIC):
+                       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+                               error = xfs_attr3_node_inactive(trans, dp,
+                                                       child_bp, level + 1);
+                               break;
+                       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+                       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+                               error = xfs_attr3_leaf_inactive(trans, dp,
+                                                       child_bp);
+                               break;
+                       default:
+                               error = XFS_ERROR(EIO);
+                               xfs_trans_brelse(*trans, child_bp);
+                               break;
+                       }
+                       if (error)
+                               return error;
+
+                       /*
+                        * Remove the subsidiary block from the cache
+                        * and from the log.
+                        */
+                       error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+                               &child_bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+                       xfs_trans_binval(*trans, child_bp);
+               }
+
+               /*
+                * If we're not done, re-read the parent to get the next
+                * child block number.
+                */
+               if (i + 1 < ichdr.count) {
+                       error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
+                                                &bp, XFS_ATTR_FORK);
+                       if (error)
+                               return error;
+                       /*
+                        * The parent buffer was released before the loop, so
+                        * the old "btree" pointer may reference stale memory.
+                        * Recompute it from the buffer we just read.
+                        */
+                       node = bp->b_addr;
+                       btree = xfs_da3_node_tree_p(node);
+                       child_fsb = be32_to_cpu(btree[i + 1].before);
+                       xfs_trans_brelse(*trans, bp);
+               }
+               /*
+                * Atomically commit the whole invalidate stuff.
+                */
+               error = xfs_trans_roll(trans, dp);
+               if (error)
+                       return  error;
+       }
+
+       return 0;
+}
+
+/*
+ * Indiscriminately delete the entire attribute fork
+ *
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ *
+ * Reads attribute-fork block 0, dispatches on its magic number to the node
+ * or leaf invalidation routine, then invalidates block 0 itself and rolls
+ * the transaction.
+ *
+ * Returns 0 on success, EIO if block 0 has an unrecognised magic number, or
+ * the error from a failed read, invalidation or transaction roll.
+ */
+int
+xfs_attr3_root_inactive(
+       struct xfs_trans        **trans,
+       struct xfs_inode        *dp)
+{
+       struct xfs_da_blkinfo   *info;
+       struct xfs_buf          *bp;
+       xfs_daddr_t             blkno;
+       int                     error;
+
+       /*
+        * Read block 0 to see what we have to work with.
+        * We only get here if we have extents, since we remove
+        * the extents in reverse order the extent containing
+        * block 0 must still be there.
+        */
+       error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+       if (error)
+               return error;
+       /* Remember the disk address; bp is given up in the switch below. */
+       blkno = bp->b_bn;
+
+       /*
+        * Invalidate the tree, even if the "tree" is only a single leaf block.
+        * This is a depth-first traversal!
+        */
+       info = bp->b_addr;
+       switch (info->magic) {
+       case cpu_to_be16(XFS_DA_NODE_MAGIC):
+       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+               error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+               break;
+       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+               error = xfs_attr3_leaf_inactive(trans, dp, bp);
+               break;
+       default:
+               error = XFS_ERROR(EIO);
+               xfs_trans_brelse(*trans, bp);
+               break;
+       }
+       if (error)
+               return error;
+
+       /*
+        * Invalidate the incore copy of the root block.
+        */
+       error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+       if (error)
+               return error;
+       xfs_trans_binval(*trans, bp);   /* remove from cache */
+       /*
+        * Commit the invalidate and start the next transaction.
+        */
+       error = xfs_trans_roll(trans, dp);
+
+       return error;
+}
+
+/*
+ * Tear down the attribute fork of inode "dp".
+ *
+ * Nothing is done when the inode has no attributes or the attribute fork is
+ * in local format.  Otherwise a transaction is reserved, the attribute tree
+ * is invalidated via xfs_attr3_root_inactive(), the attribute fork extents
+ * are truncated, and the transaction is committed.
+ *
+ * Returns 0 on success or the error from the failing step; on failure after
+ * the inode has been locked, the transaction is cancelled with
+ * XFS_TRANS_ABORT.
+ */
+int
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+       xfs_trans_t *trans;
+       xfs_mount_t *mp;
+       int error;
+
+       mp = dp->i_mount;
+       ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+       /*
+        * Cheap check under the shared lock so that no transaction is
+        * allocated when there is nothing to remove.
+        */
+       xfs_ilock(dp, XFS_ILOCK_SHARED);
+       if (!xfs_inode_hasattr(dp) ||
+           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               xfs_iunlock(dp, XFS_ILOCK_SHARED);
+               return 0;
+       }
+       xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+       /*
+        * Start our first transaction of the day.
+        *
+        * All future transactions during this code must be "chained" off
+        * this one via the trans_dup() call.  All transactions will contain
+        * the inode, and the inode will always be marked with trans_ihold().
+        * Since the inode will be locked in all transactions, we must log
+        * the inode in every transaction to let it float upward through
+        * the log.
+        */
+       trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+       error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
+       if (error) {
+               xfs_trans_cancel(trans, 0);
+               return(error);
+       }
+       xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+       /*
+        * No need to make quota reservations here. We expect to release some
+        * blocks, not allocate, in the common case.
+        */
+       xfs_trans_ijoin(trans, dp, 0);
+
+       /*
+        * The shared lock was dropped above, so sample the attribute fork
+        * state again now that the inode is locked exclusively.  If there
+        * is no longer anything to do, the "out" path cancels the
+        * transaction and returns 0.
+        */
+       if (!xfs_inode_hasattr(dp) ||
+           dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               error = 0;
+               goto out;
+       }
+       error = xfs_attr3_root_inactive(&trans, dp);
+       if (error)
+               goto out;
+
+       error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+       if (error)
+               goto out;
+
+       error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+       return(error);
+
+out:
+       /* Cancel whatever transaction is current, then drop the inode lock. */
+       xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return(error);
+}
index b800fbcafc7f639f05a83fc97fc5e964f77422b0..86db20a9cc02b5df2dd7ecb3c44eca39d7498dd7 100644 (file)
@@ -22,6 +22,7 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -77,16 +78,6 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
                        int *number_entries_in_blk1,
                        int *number_usedbytes_in_blk1);
 
-/*
- * Routines used for shrinking the Btree.
- */
-STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
-                                 struct xfs_buf *bp, int level);
-STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
-                                 struct xfs_buf *bp);
-STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
-                                  xfs_dablk_t blkno, int blkcnt);
-
 /*
  * Utility routines.
  */
@@ -635,7 +626,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
        xfs_attr_sf_entry_t *sfe;
        int i;
 
-       ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+       ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
        sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
        sfe = &sf->list[0];
        for (i = 0; i < sf->hdr.count;
@@ -751,182 +742,6 @@ out:
        return(error);
 }
 
-STATIC int
-xfs_attr_shortform_compare(const void *a, const void *b)
-{
-       xfs_attr_sf_sort_t *sa, *sb;
-
-       sa = (xfs_attr_sf_sort_t *)a;
-       sb = (xfs_attr_sf_sort_t *)b;
-       if (sa->hash < sb->hash) {
-               return(-1);
-       } else if (sa->hash > sb->hash) {
-               return(1);
-       } else {
-               return(sa->entno - sb->entno);
-       }
-}
-
-
-#define XFS_ISRESET_CURSOR(cursor) \
-       (!((cursor)->initted) && !((cursor)->hashval) && \
-        !((cursor)->blkno) && !((cursor)->offset))
-/*
- * Copy out entries of shortform attribute lists for attr_list().
- * Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
- * we have to calculate each entries' hashvalue and sort them before
- * we can begin returning them to the user.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
-{
-       attrlist_cursor_kern_t *cursor;
-       xfs_attr_sf_sort_t *sbuf, *sbp;
-       xfs_attr_shortform_t *sf;
-       xfs_attr_sf_entry_t *sfe;
-       xfs_inode_t *dp;
-       int sbsize, nsbuf, count, i;
-       int error;
-
-       ASSERT(context != NULL);
-       dp = context->dp;
-       ASSERT(dp != NULL);
-       ASSERT(dp->i_afp != NULL);
-       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
-       ASSERT(sf != NULL);
-       if (!sf->hdr.count)
-               return(0);
-       cursor = context->cursor;
-       ASSERT(cursor != NULL);
-
-       trace_xfs_attr_list_sf(context);
-
-       /*
-        * If the buffer is large enough and the cursor is at the start,
-        * do not bother with sorting since we will return everything in
-        * one buffer and another call using the cursor won't need to be
-        * made.
-        * Note the generous fudge factor of 16 overhead bytes per entry.
-        * If bufsize is zero then put_listent must be a search function
-        * and can just scan through what we have.
-        */
-       if (context->bufsize == 0 ||
-           (XFS_ISRESET_CURSOR(cursor) &&
-             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
-               for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-                       error = context->put_listent(context,
-                                          sfe->flags,
-                                          sfe->nameval,
-                                          (int)sfe->namelen,
-                                          (int)sfe->valuelen,
-                                          &sfe->nameval[sfe->namelen]);
-
-                       /*
-                        * Either search callback finished early or
-                        * didn't fit it all in the buffer after all.
-                        */
-                       if (context->seen_enough)
-                               break;
-
-                       if (error)
-                               return error;
-                       sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-               }
-               trace_xfs_attr_list_sf_all(context);
-               return(0);
-       }
-
-       /* do no more for a search callback */
-       if (context->bufsize == 0)
-               return 0;
-
-       /*
-        * It didn't all fit, so we have to sort everything on hashval.
-        */
-       sbsize = sf->hdr.count * sizeof(*sbuf);
-       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
-
-       /*
-        * Scan the attribute list for the rest of the entries, storing
-        * the relevant info from only those that match into a buffer.
-        */
-       nsbuf = 0;
-       for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-               if (unlikely(
-                   ((char *)sfe < (char *)sf) ||
-                   ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
-                       XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
-                                            XFS_ERRLEVEL_LOW,
-                                            context->dp->i_mount, sfe);
-                       kmem_free(sbuf);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-
-               sbp->entno = i;
-               sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
-               sbp->name = sfe->nameval;
-               sbp->namelen = sfe->namelen;
-               /* These are bytes, and both on-disk, don't endian-flip */
-               sbp->valuelen = sfe->valuelen;
-               sbp->flags = sfe->flags;
-               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-               sbp++;
-               nsbuf++;
-       }
-
-       /*
-        * Sort the entries on hash then entno.
-        */
-       xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
-
-       /*
-        * Re-find our place IN THE SORTED LIST.
-        */
-       count = 0;
-       cursor->initted = 1;
-       cursor->blkno = 0;
-       for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
-               if (sbp->hash == cursor->hashval) {
-                       if (cursor->offset == count) {
-                               break;
-                       }
-                       count++;
-               } else if (sbp->hash > cursor->hashval) {
-                       break;
-               }
-       }
-       if (i == nsbuf) {
-               kmem_free(sbuf);
-               return(0);
-       }
-
-       /*
-        * Loop putting entries into the user buffer.
-        */
-       for ( ; i < nsbuf; i++, sbp++) {
-               if (cursor->hashval != sbp->hash) {
-                       cursor->hashval = sbp->hash;
-                       cursor->offset = 0;
-               }
-               error = context->put_listent(context,
-                                       sbp->flags,
-                                       sbp->name,
-                                       sbp->namelen,
-                                       sbp->valuelen,
-                                       &sbp->name[sbp->namelen]);
-               if (error)
-                       return error;
-               if (context->seen_enough)
-                       break;
-               cursor->offset++;
-       }
-
-       kmem_free(sbuf);
-       return(0);
-}
-
 /*
  * Check a leaf attribute block to see if all the entries would fit into
  * a shortform attribute list.
@@ -1121,7 +936,6 @@ out:
        return error;
 }
 
-
 /*========================================================================
  * Routines used for growing the Btree.
  *========================================================================*/
@@ -1482,7 +1296,6 @@ xfs_attr3_leaf_compact(
        ichdr_dst->freemap[0].size = ichdr_dst->firstused -
                                                ichdr_dst->freemap[0].base;
 
-
        /* write the header back to initialise the underlying buffer */
        xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
 
@@ -2643,130 +2456,6 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
        return size;
 }
 
-/*
- * Copy out attribute list entries for attr_list(), for leaf attribute lists.
- */
-int
-xfs_attr3_leaf_list_int(
-       struct xfs_buf                  *bp,
-       struct xfs_attr_list_context    *context)
-{
-       struct attrlist_cursor_kern     *cursor;
-       struct xfs_attr_leafblock       *leaf;
-       struct xfs_attr3_icleaf_hdr     ichdr;
-       struct xfs_attr_leaf_entry      *entries;
-       struct xfs_attr_leaf_entry      *entry;
-       int                             retval;
-       int                             i;
-
-       trace_xfs_attr_list_leaf(context);
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-       entries = xfs_attr3_leaf_entryp(leaf);
-
-       cursor = context->cursor;
-       cursor->initted = 1;
-
-       /*
-        * Re-find our place in the leaf block if this is a new syscall.
-        */
-       if (context->resynch) {
-               entry = &entries[0];
-               for (i = 0; i < ichdr.count; entry++, i++) {
-                       if (be32_to_cpu(entry->hashval) == cursor->hashval) {
-                               if (cursor->offset == context->dupcnt) {
-                                       context->dupcnt = 0;
-                                       break;
-                               }
-                               context->dupcnt++;
-                       } else if (be32_to_cpu(entry->hashval) >
-                                       cursor->hashval) {
-                               context->dupcnt = 0;
-                               break;
-                       }
-               }
-               if (i == ichdr.count) {
-                       trace_xfs_attr_list_notfound(context);
-                       return 0;
-               }
-       } else {
-               entry = &entries[0];
-               i = 0;
-       }
-       context->resynch = 0;
-
-       /*
-        * We have found our place, start copying out the new attributes.
-        */
-       retval = 0;
-       for (; i < ichdr.count; entry++, i++) {
-               if (be32_to_cpu(entry->hashval) != cursor->hashval) {
-                       cursor->hashval = be32_to_cpu(entry->hashval);
-                       cursor->offset = 0;
-               }
-
-               if (entry->flags & XFS_ATTR_INCOMPLETE)
-                       continue;               /* skip incomplete entries */
-
-               if (entry->flags & XFS_ATTR_LOCAL) {
-                       xfs_attr_leaf_name_local_t *name_loc =
-                               xfs_attr3_leaf_name_local(leaf, i);
-
-                       retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_loc->nameval,
-                                               (int)name_loc->namelen,
-                                               be16_to_cpu(name_loc->valuelen),
-                                               &name_loc->nameval[name_loc->namelen]);
-                       if (retval)
-                               return retval;
-               } else {
-                       xfs_attr_leaf_name_remote_t *name_rmt =
-                               xfs_attr3_leaf_name_remote(leaf, i);
-
-                       int valuelen = be32_to_cpu(name_rmt->valuelen);
-
-                       if (context->put_value) {
-                               xfs_da_args_t args;
-
-                               memset((char *)&args, 0, sizeof(args));
-                               args.dp = context->dp;
-                               args.whichfork = XFS_ATTR_FORK;
-                               args.valuelen = valuelen;
-                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
-                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
-                                                       args.dp->i_mount, valuelen);
-                               retval = xfs_attr_rmtval_get(&args);
-                               if (retval)
-                                       return retval;
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               args.value);
-                               kmem_free(args.value);
-                       } else {
-                               retval = context->put_listent(context,
-                                               entry->flags,
-                                               name_rmt->name,
-                                               (int)name_rmt->namelen,
-                                               valuelen,
-                                               NULL);
-                       }
-                       if (retval)
-                               return retval;
-               }
-               if (context->seen_enough)
-                       break;
-               cursor->offset++;
-       }
-       trace_xfs_attr_list_leaf_end(context);
-       return retval;
-}
-
 
 /*========================================================================
  * Manage the INCOMPLETE flag in a leaf entry
@@ -3011,345 +2700,3 @@ xfs_attr3_leaf_flipflags(
 
        return error;
 }
-
-/*========================================================================
- * Indiscriminately delete the entire attribute fork
- *========================================================================*/
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-int
-xfs_attr3_root_inactive(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp)
-{
-       struct xfs_da_blkinfo   *info;
-       struct xfs_buf          *bp;
-       xfs_daddr_t             blkno;
-       int                     error;
-
-       /*
-        * Read block 0 to see what we have to work with.
-        * We only get here if we have extents, since we remove
-        * the extents in reverse order the extent containing
-        * block 0 must still be there.
-        */
-       error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
-       if (error)
-               return error;
-       blkno = bp->b_bn;
-
-       /*
-        * Invalidate the tree, even if the "tree" is only a single leaf block.
-        * This is a depth-first traversal!
-        */
-       info = bp->b_addr;
-       switch (info->magic) {
-       case cpu_to_be16(XFS_DA_NODE_MAGIC):
-       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
-               error = xfs_attr3_node_inactive(trans, dp, bp, 1);
-               break;
-       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
-       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
-               error = xfs_attr3_leaf_inactive(trans, dp, bp);
-               break;
-       default:
-               error = XFS_ERROR(EIO);
-               xfs_trans_brelse(*trans, bp);
-               break;
-       }
-       if (error)
-               return error;
-
-       /*
-        * Invalidate the incore copy of the root block.
-        */
-       error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
-       if (error)
-               return error;
-       xfs_trans_binval(*trans, bp);   /* remove from cache */
-       /*
-        * Commit the invalidate and start the next transaction.
-        */
-       error = xfs_trans_roll(trans, dp);
-
-       return error;
-}
-
-/*
- * Recurse (gasp!) through the attribute nodes until we find leaves.
- * We're doing a depth-first traversal in order to invalidate everything.
- */
-STATIC int
-xfs_attr3_node_inactive(
-       struct xfs_trans **trans,
-       struct xfs_inode *dp,
-       struct xfs_buf  *bp,
-       int             level)
-{
-       xfs_da_blkinfo_t *info;
-       xfs_da_intnode_t *node;
-       xfs_dablk_t child_fsb;
-       xfs_daddr_t parent_blkno, child_blkno;
-       int error, i;
-       struct xfs_buf *child_bp;
-       struct xfs_da_node_entry *btree;
-       struct xfs_da3_icnode_hdr ichdr;
-
-       /*
-        * Since this code is recursive (gasp!) we must protect ourselves.
-        */
-       if (level > XFS_DA_NODE_MAXDEPTH) {
-               xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
-               return XFS_ERROR(EIO);
-       }
-
-       node = bp->b_addr;
-       xfs_da3_node_hdr_from_disk(&ichdr, node);
-       parent_blkno = bp->b_bn;
-       if (!ichdr.count) {
-               xfs_trans_brelse(*trans, bp);
-               return 0;
-       }
-       btree = xfs_da3_node_tree_p(node);
-       child_fsb = be32_to_cpu(btree[0].before);
-       xfs_trans_brelse(*trans, bp);   /* no locks for later trans */
-
-       /*
-        * If this is the node level just above the leaves, simply loop
-        * over the leaves removing all of them.  If this is higher up
-        * in the tree, recurse downward.
-        */
-       for (i = 0; i < ichdr.count; i++) {
-               /*
-                * Read the subsidiary block to see what we have to work with.
-                * Don't do this in a transaction.  This is a depth-first
-                * traversal of the tree so we may deal with many blocks
-                * before we come back to this one.
-                */
-               error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
-                                               XFS_ATTR_FORK);
-               if (error)
-                       return(error);
-               if (child_bp) {
-                                               /* save for re-read later */
-                       child_blkno = XFS_BUF_ADDR(child_bp);
-
-                       /*
-                        * Invalidate the subtree, however we have to.
-                        */
-                       info = child_bp->b_addr;
-                       switch (info->magic) {
-                       case cpu_to_be16(XFS_DA_NODE_MAGIC):
-                       case cpu_to_be16(XFS_DA3_NODE_MAGIC):
-                               error = xfs_attr3_node_inactive(trans, dp,
-                                                       child_bp, level + 1);
-                               break;
-                       case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
-                       case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
-                               error = xfs_attr3_leaf_inactive(trans, dp,
-                                                       child_bp);
-                               break;
-                       default:
-                               error = XFS_ERROR(EIO);
-                               xfs_trans_brelse(*trans, child_bp);
-                               break;
-                       }
-                       if (error)
-                               return error;
-
-                       /*
-                        * Remove the subsidiary block from the cache
-                        * and from the log.
-                        */
-                       error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
-                               &child_bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-                       xfs_trans_binval(*trans, child_bp);
-               }
-
-               /*
-                * If we're not done, re-read the parent to get the next
-                * child block number.
-                */
-               if (i + 1 < ichdr.count) {
-                       error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
-                                                &bp, XFS_ATTR_FORK);
-                       if (error)
-                               return error;
-                       child_fsb = be32_to_cpu(btree[i + 1].before);
-                       xfs_trans_brelse(*trans, bp);
-               }
-               /*
-                * Atomically commit the whole invalidate stuff.
-                */
-               error = xfs_trans_roll(trans, dp);
-               if (error)
-                       return  error;
-       }
-
-       return 0;
-}
-
-/*
- * Invalidate all of the "remote" value regions pointed to by a particular
- * leaf block.
- * Note that we must release the lock on the buffer so that we are not
- * caught holding something that the logging code wants to flush to disk.
- */
-STATIC int
-xfs_attr3_leaf_inactive(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp,
-       struct xfs_buf          *bp)
-{
-       struct xfs_attr_leafblock *leaf;
-       struct xfs_attr3_icleaf_hdr ichdr;
-       struct xfs_attr_leaf_entry *entry;
-       struct xfs_attr_leaf_name_remote *name_rmt;
-       struct xfs_attr_inactive_list *list;
-       struct xfs_attr_inactive_list *lp;
-       int                     error;
-       int                     count;
-       int                     size;
-       int                     tmp;
-       int                     i;
-
-       leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
-
-       /*
-        * Count the number of "remote" value extents.
-        */
-       count = 0;
-       entry = xfs_attr3_leaf_entryp(leaf);
-       for (i = 0; i < ichdr.count; entry++, i++) {
-               if (be16_to_cpu(entry->nameidx) &&
-                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
-                       if (name_rmt->valueblk)
-                               count++;
-               }
-       }
-
-       /*
-        * If there are no "remote" values, we're done.
-        */
-       if (count == 0) {
-               xfs_trans_brelse(*trans, bp);
-               return 0;
-       }
-
-       /*
-        * Allocate storage for a list of all the "remote" value extents.
-        */
-       size = count * sizeof(xfs_attr_inactive_list_t);
-       list = kmem_alloc(size, KM_SLEEP);
-
-       /*
-        * Identify each of the "remote" value extents.
-        */
-       lp = list;
-       entry = xfs_attr3_leaf_entryp(leaf);
-       for (i = 0; i < ichdr.count; entry++, i++) {
-               if (be16_to_cpu(entry->nameidx) &&
-                   ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-                       name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
-                       if (name_rmt->valueblk) {
-                               lp->valueblk = be32_to_cpu(name_rmt->valueblk);
-                               lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
-                                                   be32_to_cpu(name_rmt->valuelen));
-                               lp++;
-                       }
-               }
-       }
-       xfs_trans_brelse(*trans, bp);   /* unlock for trans. in freextent() */
-
-       /*
-        * Invalidate each of the "remote" value extents.
-        */
-       error = 0;
-       for (lp = list, i = 0; i < count; i++, lp++) {
-               tmp = xfs_attr3_leaf_freextent(trans, dp,
-                               lp->valueblk, lp->valuelen);
-
-               if (error == 0)
-                       error = tmp;    /* save only the 1st errno */
-       }
-
-       kmem_free(list);
-       return error;
-}
-
-/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
- */
-STATIC int
-xfs_attr3_leaf_freextent(
-       struct xfs_trans        **trans,
-       struct xfs_inode        *dp,
-       xfs_dablk_t             blkno,
-       int                     blkcnt)
-{
-       struct xfs_bmbt_irec    map;
-       struct xfs_buf          *bp;
-       xfs_dablk_t             tblkno;
-       xfs_daddr_t             dblkno;
-       int                     tblkcnt;
-       int                     dblkcnt;
-       int                     nmap;
-       int                     error;
-
-       /*
-        * Roll through the "value", invalidating the attribute value's
-        * blocks.
-        */
-       tblkno = blkno;
-       tblkcnt = blkcnt;
-       while (tblkcnt > 0) {
-               /*
-                * Try to remember where we decided to put the value.
-                */
-               nmap = 1;
-               error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
-                                      &map, &nmap, XFS_BMAPI_ATTRFORK);
-               if (error) {
-                       return(error);
-               }
-               ASSERT(nmap == 1);
-               ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-
-               /*
-                * If it's a hole, these are already unmapped
-                * so there's nothing to invalidate.
-                */
-               if (map.br_startblock != HOLESTARTBLOCK) {
-
-                       dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
-                                                 map.br_startblock);
-                       dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
-                                               map.br_blockcount);
-                       bp = xfs_trans_get_buf(*trans,
-                                       dp->i_mount->m_ddev_targp,
-                                       dblkno, dblkcnt, 0);
-                       if (!bp)
-                               return ENOMEM;
-                       xfs_trans_binval(*trans, bp);
-                       /*
-                        * Roll to next transaction.
-                        */
-                       error = xfs_trans_roll(trans, dp);
-                       if (error)
-                               return (error);
-               }
-
-               tblkno += map.br_blockcount;
-               tblkcnt -= map.br_blockcount;
-       }
-
-       return(0);
-}
index 444a7704596c409f43f0ec495c9e6cc9838460c4..c1022138c7e6f3261819a0c52618b6bb1a9cf34f 100644 (file)
@@ -333,6 +333,8 @@ int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
                        struct xfs_buf **bpp);
 void   xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
                                     struct xfs_attr_leafblock *from);
+void   xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+                                  struct xfs_attr3_icleaf_hdr *from);
 
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644 (file)
index 0000000..cbc80d4
--- /dev/null
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
+/*
+ * Sort comparator for xfs_attr_sf_sort_t records: order by hash first,
+ * and by entno when the hashes are equal.
+ */
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+       const xfs_attr_sf_sort_t *lhs = a;
+       const xfs_attr_sf_sort_t *rhs = b;
+
+       if (lhs->hash != rhs->hash)
+               return lhs->hash < rhs->hash ? -1 : 1;
+
+       /* equal hashes: fall back to the entry number */
+       return lhs->entno - rhs->entno;
+}
+
+/*
+ * True when none of the cursor's position fields (initted, hashval,
+ * blkno, offset) has been set.
+ */
+#define XFS_ISRESET_CURSOR(cursor) \
+       (!((cursor)->initted) && !((cursor)->hashval) && \
+        !((cursor)->blkno) && !((cursor)->offset))
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform attribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then
+ * we have to calculate each entry's hash value and sort them before
+ * we can begin returning them to the user.
+ *
+ * Each entry is handed to context->put_listent().  Returns 0 when the
+ * listing completed or there was nothing (more) to list, a non-zero
+ * put_listent() return value propagated to the caller, or
+ * XFS_ERROR(EFSCORRUPTED) if an entry starts outside the in-core
+ * attribute fork data.
+ */
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+       attrlist_cursor_kern_t *cursor;
+       xfs_attr_sf_sort_t *sbuf, *sbp;
+       xfs_attr_shortform_t *sf;
+       xfs_attr_sf_entry_t *sfe;
+       xfs_inode_t *dp;
+       int sbsize, nsbuf, count, i;
+       int error;
+
+       ASSERT(context != NULL);
+       dp = context->dp;
+       ASSERT(dp != NULL);
+       ASSERT(dp->i_afp != NULL);
+       sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+       ASSERT(sf != NULL);
+       if (!sf->hdr.count)
+               return(0);
+       cursor = context->cursor;
+       ASSERT(cursor != NULL);
+
+       trace_xfs_attr_list_sf(context);
+
+       /*
+        * If the buffer is large enough and the cursor is at the start,
+        * do not bother with sorting since we will return everything in
+        * one buffer and another call using the cursor won't need to be
+        * made.
+        * Note the generous fudge factor of 16 overhead bytes per entry.
+        * If bufsize is zero then put_listent must be a search function
+        * and can just scan through what we have.
+        */
+       if (context->bufsize == 0 ||
+           (XFS_ISRESET_CURSOR(cursor) &&
+             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
+               for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+                       error = context->put_listent(context,
+                                          sfe->flags,
+                                          sfe->nameval,
+                                          (int)sfe->namelen,
+                                          (int)sfe->valuelen,
+                                          &sfe->nameval[sfe->namelen]);
+
+                       /*
+                        * Either search callback finished early or
+                        * didn't fit it all in the buffer after all.
+                        */
+                       if (context->seen_enough)
+                               break;
+
+                       if (error)
+                               return error;
+                       sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+               }
+               trace_xfs_attr_list_sf_all(context);
+               return(0);
+       }
+
+       /*
+        * Do no more for a search callback.  (The bufsize == 0 case has
+        * already returned from the branch above; this is a defensive
+        * guard only.)
+        */
+       if (context->bufsize == 0)
+               return 0;
+
+       /*
+        * It didn't all fit, so we have to sort everything on hashval.
+        */
+       sbsize = sf->hdr.count * sizeof(*sbuf);
+       sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+
+       /*
+        * Scan the attribute list for the rest of the entries, storing
+        * the relevant info from only those that match into a buffer.
+        */
+       nsbuf = 0;
+       for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+               if (unlikely(
+                   ((char *)sfe < (char *)sf) ||
+                   ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
+                       XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
+                                            XFS_ERRLEVEL_LOW,
+                                            context->dp->i_mount, sfe);
+                       kmem_free(sbuf);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
+               sbp->entno = i;
+               sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+               sbp->name = sfe->nameval;
+               sbp->namelen = sfe->namelen;
+               /* These are bytes, and both on-disk, don't endian-flip */
+               sbp->valuelen = sfe->valuelen;
+               sbp->flags = sfe->flags;
+               sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+               sbp++;
+               nsbuf++;
+       }
+
+       /*
+        * Sort the entries on hash then entno.
+        */
+       xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+       /*
+        * Re-find our place IN THE SORTED LIST.
+        */
+       count = 0;
+       cursor->initted = 1;
+       cursor->blkno = 0;
+       for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+               if (sbp->hash == cursor->hashval) {
+                       if (cursor->offset == count) {
+                               break;
+                       }
+                       count++;
+               } else if (sbp->hash > cursor->hashval) {
+                       break;
+               }
+       }
+       if (i == nsbuf) {
+               kmem_free(sbuf);
+               return(0);
+       }
+
+       /*
+        * Loop putting entries into the user buffer.
+        */
+       for ( ; i < nsbuf; i++, sbp++) {
+               if (cursor->hashval != sbp->hash) {
+                       cursor->hashval = sbp->hash;
+                       cursor->offset = 0;
+               }
+               error = context->put_listent(context,
+                                       sbp->flags,
+                                       sbp->name,
+                                       sbp->namelen,
+                                       sbp->valuelen,
+                                       &sbp->name[sbp->namelen]);
+               if (error) {
+                       /* the sort buffer is ours: free it before bailing */
+                       kmem_free(sbuf);
+                       return error;
+               }
+               if (context->seen_enough)
+                       break;
+               cursor->offset++;
+       }
+
+       kmem_free(sbuf);
+       return(0);
+}
+
+/*
+ * List attributes from an attribute fork made up of da btree node and
+ * leaf blocks.
+ *
+ * First try to resume at the block recorded in the cursor; if that block
+ * cannot be used, descend from block 0 using the cursor's hashval.  Then
+ * feed each leaf block to xfs_attr3_leaf_list_int(), following the forw
+ * links until the context has seen enough or the chain ends.
+ *
+ * Returns 0 on success, XFS_ERROR(EFSCORRUPTED) if an unexpected block
+ * magic is met during the descent, or the error returned by a block read
+ * or by xfs_attr3_leaf_list_int().
+ */
+STATIC int
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+       attrlist_cursor_kern_t *cursor;
+       xfs_attr_leafblock_t *leaf;
+       xfs_da_intnode_t *node;
+       struct xfs_attr3_icleaf_hdr leafhdr;
+       struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_da_node_entry *btree;
+       int error, i;
+       struct xfs_buf *bp;
+
+       trace_xfs_attr_node_list(context);
+
+       cursor = context->cursor;
+       cursor->initted = 1;
+
+       /*
+        * Do all sorts of validation on the passed-in cursor structure.
+        * If anything is amiss, ignore the cursor and look up the hashval
+        * starting from the btree root.
+        */
+       bp = NULL;
+       if (cursor->blkno > 0) {
+               error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
+                                             &bp, XFS_ATTR_FORK);
+               /*
+                * EFSCORRUPTED is not fatal here; if no usable buffer
+                * results, the lookup below restarts from block 0.
+                */
+               if ((error != 0) && (error != EFSCORRUPTED))
+                       return(error);
+               if (bp) {
+                       struct xfs_attr_leaf_entry *entries;
+
+                       node = bp->b_addr;
+                       switch (be16_to_cpu(node->hdr.info.magic)) {
+                       case XFS_DA_NODE_MAGIC:
+                       case XFS_DA3_NODE_MAGIC:
+                               /* cursor block is a node, not a leaf */
+                               trace_xfs_attr_list_wrong_blk(context);
+                               xfs_trans_brelse(NULL, bp);
+                               bp = NULL;
+                               break;
+                       case XFS_ATTR_LEAF_MAGIC:
+                       case XFS_ATTR3_LEAF_MAGIC:
+                               /*
+                                * Keep this leaf only if the cursor's hashval
+                                * is greater than the first entry's hashval
+                                * and no greater than the last entry's.
+                                */
+                               leaf = bp->b_addr;
+                               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+                               entries = xfs_attr3_leaf_entryp(leaf);
+                               if (cursor->hashval > be32_to_cpu(
+                                               entries[leafhdr.count - 1].hashval)) {
+                                       trace_xfs_attr_list_wrong_blk(context);
+                                       xfs_trans_brelse(NULL, bp);
+                                       bp = NULL;
+                               } else if (cursor->hashval <= be32_to_cpu(
+                                               entries[0].hashval)) {
+                                       trace_xfs_attr_list_wrong_blk(context);
+                                       xfs_trans_brelse(NULL, bp);
+                                       bp = NULL;
+                               }
+                               break;
+                       default:
+                               /* neither a node nor an attr leaf block */
+                               trace_xfs_attr_list_wrong_blk(context);
+                               xfs_trans_brelse(NULL, bp);
+                               bp = NULL;
+                       }
+               }
+       }
+
+       /*
+        * We did not find what we expected given the cursor's contents,
+        * so we start from the top and work down based on the hash value.
+        * Note that start of node block is same as start of leaf block.
+        */
+       if (bp == NULL) {
+               cursor->blkno = 0;
+               for (;;) {
+                       __uint16_t magic;
+
+                       error = xfs_da3_node_read(NULL, context->dp,
+                                                     cursor->blkno, -1, &bp,
+                                                     XFS_ATTR_FORK);
+                       if (error)
+                               return(error);
+                       node = bp->b_addr;
+                       magic = be16_to_cpu(node->hdr.info.magic);
+                       /* reached a leaf: leave the loop still holding bp */
+                       if (magic == XFS_ATTR_LEAF_MAGIC ||
+                           magic == XFS_ATTR3_LEAF_MAGIC)
+                               break;
+                       if (magic != XFS_DA_NODE_MAGIC &&
+                           magic != XFS_DA3_NODE_MAGIC) {
+                               XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    context->dp->i_mount,
+                                                    node);
+                               xfs_trans_brelse(NULL, bp);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       /*
+                        * Pick the first node entry whose hashval is at
+                        * least the cursor's hashval and descend into it.
+                        */
+                       xfs_da3_node_hdr_from_disk(&nodehdr, node);
+                       btree = xfs_da3_node_tree_p(node);
+                       for (i = 0; i < nodehdr.count; btree++, i++) {
+                               if (cursor->hashval
+                                               <= be32_to_cpu(btree->hashval)) {
+                                       cursor->blkno = be32_to_cpu(btree->before);
+                                       trace_xfs_attr_list_node_descend(context,
+                                                                        btree);
+                                       break;
+                               }
+                       }
+                       /*
+                        * The cursor's hashval is larger than every entry
+                        * in this node: there is nothing left to list.
+                        */
+                       if (i == nodehdr.count) {
+                               xfs_trans_brelse(NULL, bp);
+                               return 0;
+                       }
+                       xfs_trans_brelse(NULL, bp);
+               }
+       }
+       ASSERT(bp != NULL);
+
+       /*
+        * Roll upward through the blocks, processing each leaf block in
+        * order.  As long as there is space in the result buffer, keep
+        * adding the information.
+        */
+       for (;;) {
+               leaf = bp->b_addr;
+               error = xfs_attr3_leaf_list_int(bp, context);
+               if (error) {
+                       xfs_trans_brelse(NULL, bp);
+                       return error;
+               }
+               /* decode the leaf header to find the forward sibling */
+               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+               if (context->seen_enough || leafhdr.forw == 0)
+                       break;
+               /* record the next block in the cursor before releasing bp */
+               cursor->blkno = leafhdr.forw;
+               xfs_trans_brelse(NULL, bp);
+               error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
+                                          &bp);
+               if (error)
+                       return error;
+       }
+       xfs_trans_brelse(NULL, bp);
+       return 0;
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ *
+ * Walks the entries of the attr leaf block held in @bp and hands every
+ * complete (non-INCOMPLETE) entry to context->put_listent().  When
+ * context->resynch is set, the walk first re-finds the position recorded in
+ * context->cursor (hashval, plus offset among entries sharing that hashval).
+ * The cursor is marked initted and is advanced as entries are visited.
+ *
+ * Returns 0 when the block was processed, or when the cursor position could
+ * not be found in it; otherwise returns the non-zero value handed back by
+ * xfs_attr_rmtval_get() or by put_listent().
+ */
+int
+xfs_attr3_leaf_list_int(
+       struct xfs_buf                  *bp,
+       struct xfs_attr_list_context    *context)
+{
+       struct attrlist_cursor_kern     *cursor;
+       struct xfs_attr_leafblock       *leaf;
+       struct xfs_attr3_icleaf_hdr     ichdr;
+       struct xfs_attr_leaf_entry      *entries;
+       struct xfs_attr_leaf_entry      *entry;
+       int                             retval;
+       int                             i;
+
+       trace_xfs_attr_list_leaf(context);
+
+       leaf = bp->b_addr;
+       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       entries = xfs_attr3_leaf_entryp(leaf);
+
+       cursor = context->cursor;
+       cursor->initted = 1;
+
+       /*
+        * Re-find our place in the leaf block if this is a new syscall.
+        */
+       if (context->resynch) {
+               entry = &entries[0];
+               for (i = 0; i < ichdr.count; entry++, i++) {
+                       if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+                               /* dupcnt counts entries sharing this hashval */
+                               if (cursor->offset == context->dupcnt) {
+                                       context->dupcnt = 0;
+                                       break;
+                               }
+                               context->dupcnt++;
+                       } else if (be32_to_cpu(entry->hashval) >
+                                       cursor->hashval) {
+                               context->dupcnt = 0;
+                               break;
+                       }
+               }
+               if (i == ichdr.count) {
+                       trace_xfs_attr_list_notfound(context);
+                       return 0;
+               }
+       } else {
+               entry = &entries[0];
+               i = 0;
+       }
+       context->resynch = 0;
+
+       /*
+        * We have found our place, start copying out the new attributes.
+        */
+       retval = 0;
+       for (; i < ichdr.count; entry++, i++) {
+               if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+                       cursor->hashval = be32_to_cpu(entry->hashval);
+                       cursor->offset = 0;
+               }
+
+               if (entry->flags & XFS_ATTR_INCOMPLETE)
+                       continue;               /* skip incomplete entries */
+
+               if (entry->flags & XFS_ATTR_LOCAL) {
+                       xfs_attr_leaf_name_local_t *name_loc =
+                               xfs_attr3_leaf_name_local(leaf, i);
+
+                       /* the value is stored directly after the name */
+                       retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_loc->nameval,
+                                               (int)name_loc->namelen,
+                                               be16_to_cpu(name_loc->valuelen),
+                                               &name_loc->nameval[name_loc->namelen]);
+                       if (retval)
+                               return retval;
+               } else {
+                       xfs_attr_leaf_name_remote_t *name_rmt =
+                               xfs_attr3_leaf_name_remote(leaf, i);
+
+                       int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+                       if (context->put_value) {
+                               xfs_da_args_t args;
+
+                               memset((char *)&args, 0, sizeof(args));
+                               args.dp = context->dp;
+                               args.whichfork = XFS_ATTR_FORK;
+                               args.valuelen = valuelen;
+                               args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+                               args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+                               args.rmtblkcnt = xfs_attr3_rmt_blocks(
+                                                       args.dp->i_mount, valuelen);
+                               retval = xfs_attr_rmtval_get(&args);
+                               if (retval) {
+                                       /*
+                                        * Reading the remote value failed:
+                                        * free the buffer allocated above
+                                        * before bailing out.
+                                        */
+                                       kmem_free(args.value);
+                                       return retval;
+                               }
+                               retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_rmt->name,
+                                               (int)name_rmt->namelen,
+                                               valuelen,
+                                               args.value);
+                               kmem_free(args.value);
+                       } else {
+                               retval = context->put_listent(context,
+                                               entry->flags,
+                                               name_rmt->name,
+                                               (int)name_rmt->namelen,
+                                               valuelen,
+                                               NULL);
+                       }
+                       if (retval)
+                               return retval;
+               }
+               if (context->seen_enough)
+                       break;
+               cursor->offset++;
+       }
+       trace_xfs_attr_list_leaf_end(context);
+       return retval;
+}
+
+/*
+ * List the attributes held in a single attr leaf block: read attr block 0
+ * of the inode, copy out its entries and release the buffer again.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+       struct xfs_buf  *leaf_bp;
+       int             err;
+
+       trace_xfs_attr_leaf_list(context);
+
+       context->cursor->blkno = 0;
+       err = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &leaf_bp);
+       if (!err) {
+               /* only a successfully read buffer needs to be released */
+               err = xfs_attr3_leaf_list_int(leaf_bp, context);
+               xfs_trans_brelse(NULL, leaf_bp);
+       }
+       return XFS_ERROR(err);
+}
+
+/*
+ * List the extended attributes of context->dp, choosing the work routine
+ * from the layout of the attribute fork.  The inode's ILOCK is held shared
+ * around the listing.  Returns EIO if the filesystem was forcibly shut down.
+ */
+int
+xfs_attr_list_int(
+       xfs_attr_list_context_t *context)
+{
+       xfs_inode_t     *dp = context->dp;
+       int             error = 0;
+
+       XFS_STATS_INC(xs_attr_list);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return EIO;
+
+       xfs_ilock(dp, XFS_ILOCK_SHARED);
+
+       /*
+        * Nothing to do for an inode without attributes; otherwise pick
+        * the shortform, single leaf block or node routine.
+        */
+       if (xfs_inode_hasattr(dp)) {
+               if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+                       error = xfs_attr_shortform_list(context);
+               else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+                       error = xfs_attr_leaf_list(context);
+               else
+                       error = xfs_attr_node_list(context);
+       }
+
+       xfs_iunlock(dp, XFS_ILOCK_SHARED);
+       return error;
+}
+
+/*
+ * Sizing helpers for the entries packed into the attr_list() output buffer.
+ * ATTR_ENTBASESIZE is the byte offset of a_name within struct attrlist_ent.
+ * ATTR_ENTSIZE adds the name plus one byte for its terminating NUL and
+ * rounds the total up to a multiple of sizeof(u_int32_t).
+ */
+#define        ATTR_ENTBASESIZE                /* minimum bytes used by an attr */ \
+       (((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define        ATTR_ENTSIZE(namelen)           /* actual bytes used by an attr */ \
+       ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+        & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ *
+ * Entries are placed downward from context->firstu while the al_offset[]
+ * array grows upward from the attrlist header.  Returns 0 when the entry
+ * was added or belongs to a namespace that was not asked for, and 1 (with
+ * al_more and seen_enough set) when the entry no longer fits.
+ */
+STATIC int
+xfs_attr_put_listent(
+       xfs_attr_list_context_t *context,
+       int             flags,
+       unsigned char   *name,
+       int             namelen,
+       int             valuelen,
+       unsigned char   *value)
+{
+       struct attrlist *alist = (struct attrlist *)context->alist;
+       attrlist_ent_t  *ent;
+       int             offsets_end;
+
+       ASSERT(!(context->flags & ATTR_KERNOVAL));
+       ASSERT(context->count >= 0);
+       ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+       ASSERT(context->firstu >= sizeof(*alist));
+       ASSERT(context->firstu <= context->bufsize);
+
+       /*
+        * Skip entries whose secure/root namespace bits do not match
+        * the namespace that was requested.
+        */
+       if (!(context->flags & ATTR_SECURE) != !(flags & XFS_ATTR_SECURE))
+               return 0;
+       if (!(context->flags & ATTR_ROOT) != !(flags & XFS_ATTR_ROOT))
+               return 0;
+
+       /*
+        * Claim room for the entry, then check that it does not run into
+        * the offset array once that has grown by one slot.
+        */
+       context->firstu -= ATTR_ENTSIZE(namelen);
+       offsets_end = sizeof(*alist) +
+                       context->count * sizeof(alist->al_offset[0]);
+       if (context->firstu < offsets_end) {
+               trace_xfs_attr_list_full(context);
+               alist->al_more = 1;
+               context->seen_enough = 1;
+               return 1;
+       }
+
+       /* Fill in the entry and record where it lives. */
+       ent = (attrlist_ent_t *)&context->alist[context->firstu];
+       ent->a_valuelen = valuelen;
+       memcpy(ent->a_name, name, namelen);
+       ent->a_name[namelen] = 0;
+       alist->al_offset[context->count] = context->firstu;
+       context->count++;
+       alist->al_count = context->count;
+       trace_xfs_attr_list_add(context);
+       return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.  Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ *
+ * The cursor and the buffer alignment are validated first; the buffer
+ * header is then reset to an empty list before the attributes are walked.
+ */
+int
+xfs_attr_list(
+       xfs_inode_t     *dp,
+       char            *buffer,
+       int             bufsize,
+       int             flags,
+       attrlist_cursor_kern_t *cursor)
+{
+       xfs_attr_list_context_t ctx;
+       struct attrlist         *hdr;
+       int                     err;
+
+       /*
+        * Reject cursors with non-zero padding, or with position state
+        * set although the cursor was never initialised.
+        */
+       if (cursor->pad1 || cursor->pad2)
+               return XFS_ERROR(EINVAL);
+       if (!cursor->initted &&
+           (cursor->hashval || cursor->blkno || cursor->offset))
+               return XFS_ERROR(EINVAL);
+
+       /*
+        * The buffer has to be aligned to sizeof(int).
+        */
+       if (((long)buffer) & (sizeof(int)-1))
+               return XFS_ERROR(EFAULT);
+       if (flags & ATTR_KERNOVAL)
+               bufsize = 0;
+
+       /*
+        * Set up the listing context; entries are placed starting from
+        * the (aligned) end of the buffer.
+        */
+       memset(&ctx, 0, sizeof(ctx));
+       ctx.dp = dp;
+       ctx.cursor = cursor;
+       ctx.flags = flags;
+       ctx.resynch = 1;
+       ctx.put_listent = xfs_attr_put_listent;
+       ctx.alist = buffer;
+       ctx.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
+       ctx.firstu = ctx.bufsize;
+
+       /*
+        * Start the output buffer off as an empty list.
+        */
+       hdr = (struct attrlist *)ctx.alist;
+       hdr->al_count = 0;
+       hdr->al_more = 0;
+       hdr->al_offset[0] = ctx.bufsize;
+
+       err = xfs_attr_list_int(&ctx);
+       ASSERT(err >= 0);
+       return err;
+}
index ef6b0c124528f6bff8d59c0fee5fa31a1d5dcc8b..712a502de619b097df202f744ba481bd3471847d 100644 (file)
@@ -22,6 +22,7 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -33,6 +34,7 @@
 #include "xfs_alloc.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
@@ -237,7 +239,7 @@ xfs_attr_rmtval_copyout(
        xfs_ino_t       ino,
        int             *offset,
        int             *valuelen,
-       char            **dst)
+       __uint8_t       **dst)
 {
        char            *src = bp->b_addr;
        xfs_daddr_t     bno = bp->b_bn;
@@ -249,7 +251,7 @@ xfs_attr_rmtval_copyout(
                int hdr_size = 0;
                int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
 
-               byte_cnt = min_t(int, *valuelen, byte_cnt);
+               byte_cnt = min(*valuelen, byte_cnt);
 
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
                        if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
@@ -284,7 +286,7 @@ xfs_attr_rmtval_copyin(
        xfs_ino_t       ino,
        int             *offset,
        int             *valuelen,
-       char            **src)
+       __uint8_t       **src)
 {
        char            *dst = bp->b_addr;
        xfs_daddr_t     bno = bp->b_bn;
@@ -337,7 +339,7 @@ xfs_attr_rmtval_get(
        struct xfs_mount        *mp = args->dp->i_mount;
        struct xfs_buf          *bp;
        xfs_dablk_t             lblkno = args->rmtblkno;
-       char                    *dst = args->value;
+       __uint8_t               *dst = args->value;
        int                     valuelen = args->valuelen;
        int                     nmap;
        int                     error;
@@ -401,7 +403,7 @@ xfs_attr_rmtval_set(
        struct xfs_bmbt_irec    map;
        xfs_dablk_t             lblkno;
        xfs_fileoff_t           lfileoff = 0;
-       char                    *src = args->value;
+       __uint8_t               *src = args->value;
        int                     blkcnt;
        int                     valuelen;
        int                     nmap;
@@ -543,11 +545,6 @@ xfs_attr_rmtval_remove(
 
        /*
         * Roll through the "value", invalidating the attribute value's blocks.
-        * Note that args->rmtblkcnt is the minimum number of data blocks we'll
-        * see for a CRC enabled remote attribute. Each extent will have a
-        * header, and so we may have more blocks than we realise here.  If we
-        * fail to map the blocks correctly, we'll have problems with the buffer
-        * lookups.
         */
        lblkno = args->rmtblkno;
        blkcnt = args->rmtblkcnt;
@@ -628,4 +625,3 @@ xfs_attr_rmtval_remove(
        }
        return(0);
 }
-
index 05c698ccb238f4fa9eef9a1844683fe24f62bd13..92b830901d60bcf2b662315bb1625b2944964ddf 100644 (file)
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
@@ -39,6 +40,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_attr_leaf.h"
@@ -46,7 +48,6 @@
 #include "xfs_trans_space.h"
 #include "xfs_buf_item.h"
 #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
 
@@ -108,19 +109,6 @@ xfs_bmap_compute_maxlevels(
        mp->m_bm_maxlevels[whichfork] = level;
 }
 
-/*
- * Convert the given file system block to a disk block.  We have to treat it
- * differently based on whether the file is a real time file or not, because the
- * bmap code does.
- */
-xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
-       return (XFS_IS_REALTIME_INODE(ip) ? \
-                (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
-                XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
-
 STATIC int                             /* error */
 xfs_bmbt_lookup_eq(
        struct xfs_btree_cur    *cur,
@@ -262,173 +250,6 @@ xfs_bmap_forkoff_reset(
        }
 }
 
-/*
- * Extent tree block counting routines.
- */
-
-/*
- * Count leaf blocks given a range of extent records.
- */
-STATIC void
-xfs_bmap_count_leaves(
-       xfs_ifork_t             *ifp,
-       xfs_extnum_t            idx,
-       int                     numrecs,
-       int                     *count)
-{
-       int             b;
-
-       for (b = 0; b < numrecs; b++) {
-               xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
-               *count += xfs_bmbt_get_blockcount(frp);
-       }
-}
-
-/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
-       struct xfs_mount        *mp,
-       struct xfs_btree_block  *block,
-       int                     numrecs,
-       int                     *count)
-{
-       int             b;
-       xfs_bmbt_rec_t  *frp;
-
-       for (b = 1; b <= numrecs; b++) {
-               frp = XFS_BMBT_REC_ADDR(mp, block, b);
-               *count += xfs_bmbt_disk_get_blockcount(frp);
-       }
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-STATIC int                                     /* error */
-xfs_bmap_count_tree(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fsblock_t   blockno,        /* file system block number */
-       int             levelin,        /* level in btree */
-       int             *count)         /* Count of blocks */
-{
-       int                     error;
-       xfs_buf_t               *bp, *nbp;
-       int                     level = levelin;
-       __be64                  *pp;
-       xfs_fsblock_t           bno = blockno;
-       xfs_fsblock_t           nextbno;
-       struct xfs_btree_block  *block, *nextblock;
-       int                     numrecs;
-
-       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-       if (error)
-               return error;
-       *count += 1;
-       block = XFS_BUF_TO_BLOCK(bp);
-
-       if (--level) {
-               /* Not at node above leaves, count this level of nodes */
-               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-               while (nextbno != NULLFSBLOCK) {
-                       error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               return error;
-                       *count += 1;
-                       nextblock = XFS_BUF_TO_BLOCK(nbp);
-                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
-                       xfs_trans_brelse(tp, nbp);
-               }
-
-               /* Dive to the next level */
-               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-               bno = be64_to_cpu(*pp);
-               if (unlikely((error =
-                    xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
-                       xfs_trans_brelse(tp, bp);
-                       XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
-                                        XFS_ERRLEVEL_LOW, mp);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               xfs_trans_brelse(tp, bp);
-       } else {
-               /* count all level 1 nodes and their leaves */
-               for (;;) {
-                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-                       numrecs = be16_to_cpu(block->bb_numrecs);
-                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
-                       xfs_trans_brelse(tp, bp);
-                       if (nextbno == NULLFSBLOCK)
-                               break;
-                       bno = nextbno;
-                       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                                               XFS_BMAP_BTREE_REF,
-                                               &xfs_bmbt_buf_ops);
-                       if (error)
-                               return error;
-                       *count += 1;
-                       block = XFS_BUF_TO_BLOCK(bp);
-               }
-       }
-       return 0;
-}
-
-/*
- * Count fsblocks of the given fork.
- */
-int                                            /* error */
-xfs_bmap_count_blocks(
-       xfs_trans_t             *tp,            /* transaction pointer */
-       xfs_inode_t             *ip,            /* incore inode */
-       int                     whichfork,      /* data or attr fork */
-       int                     *count)         /* out: count of blocks */
-{
-       struct xfs_btree_block  *block; /* current btree block */
-       xfs_fsblock_t           bno;    /* block # of "block" */
-       xfs_ifork_t             *ifp;   /* fork structure */
-       int                     level;  /* btree level, for checking */
-       xfs_mount_t             *mp;    /* file system mount structure */
-       __be64                  *pp;    /* pointer to block address */
-
-       bno = NULLFSBLOCK;
-       mp = ip->i_mount;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-               xfs_bmap_count_leaves(ifp, 0,
-                       ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-                       count);
-               return 0;
-       }
-
-       /*
-        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-        */
-       block = ifp->if_broot;
-       level = be16_to_cpu(block->bb_level);
-       ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-       bno = be64_to_cpu(*pp);
-       ASSERT(bno != NULLDFSBNO);
-       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-       if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
-               XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
-                                mp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       return 0;
-}
-
 /*
  * Debug/sanity checking code
  */
@@ -724,8 +545,8 @@ xfs_bmap_trace_exlist(
 
 /*
  * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters.  Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
+ * given the caller's original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extend beyond
  * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
  */
 STATIC void
@@ -823,7 +644,7 @@ xfs_bmap_add_free(
  * Remove the entry "free" from the free item list.  Prev points to the
  * previous entry, unless "free" is the head of the list.
  */
-STATIC void
+void
 xfs_bmap_del_free(
        xfs_bmap_free_t         *flist, /* free item list header */
        xfs_bmap_free_item_t    *prev,  /* previous item on list, if any */
@@ -837,92 +658,6 @@ xfs_bmap_del_free(
        kmem_zone_free(xfs_bmap_free_item_zone, free);
 }
 
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.  We never free any extents in
- * the first transaction.
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-int                                            /* error */
-xfs_bmap_finish(
-       xfs_trans_t             **tp,           /* transaction pointer addr */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *committed)     /* xact committed or not */
-{
-       xfs_efd_log_item_t      *efd;           /* extent free data */
-       xfs_efi_log_item_t      *efi;           /* extent free intention */
-       int                     error;          /* error return value */
-       xfs_bmap_free_item_t    *free;          /* free extent item */
-       unsigned int            logres;         /* new log reservation */
-       unsigned int            logcount;       /* new log count */
-       xfs_mount_t             *mp;            /* filesystem mount structure */
-       xfs_bmap_free_item_t    *next;          /* next item on free list */
-       xfs_trans_t             *ntp;           /* new transaction pointer */
-
-       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-       if (flist->xbf_count == 0) {
-               *committed = 0;
-               return 0;
-       }
-       ntp = *tp;
-       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
-       for (free = flist->xbf_first; free; free = free->xbfi_next)
-               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
-                       free->xbfi_blockcount);
-       logres = ntp->t_log_res;
-       logcount = ntp->t_log_count;
-       ntp = xfs_trans_dup(*tp);
-       error = xfs_trans_commit(*tp, 0);
-       *tp = ntp;
-       *committed = 1;
-       /*
-        * We have a new transaction, so we should return committed=1,
-        * even though we're returning an error.
-        */
-       if (error)
-               return error;
-
-       /*
-        * transaction commit worked ok so we can drop the extra ticket
-        * reference that we gained in xfs_trans_dup()
-        */
-       xfs_log_ticket_put(ntp->t_ticket);
-
-       if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
-                       logcount)))
-               return error;
-       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
-       for (free = flist->xbf_first; free != NULL; free = next) {
-               next = free->xbfi_next;
-               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
-                               free->xbfi_blockcount))) {
-                       /*
-                        * The bmap free list will be cleaned up at a
-                        * higher level.  The EFI will be canceled when
-                        * this transaction is aborted.
-                        * Need to force shutdown here to make sure it
-                        * happens, since this transaction may not be
-                        * dirty yet.
-                        */
-                       mp = ntp->t_mountp;
-                       if (!XFS_FORCED_SHUTDOWN(mp))
-                               xfs_force_shutdown(mp,
-                                                  (error == EFSCORRUPTED) ?
-                                                  SHUTDOWN_CORRUPT_INCORE :
-                                                  SHUTDOWN_META_IO_ERROR);
-                       return error;
-               }
-               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
-                       free->xbfi_blockcount);
-               xfs_bmap_del_free(flist, NULL, free);
-       }
-       return 0;
-}
-
 /*
  * Free up any items left in the list.
  */
@@ -1413,8 +1148,8 @@ xfs_bmap_add_attrfork(
        blks = XFS_ADDAFORK_SPACE_RES(mp);
        if (rsvd)
                tp->t_flags |= XFS_TRANS_RESERVE;
-       if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
+       if (error)
                goto error0;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
@@ -1815,7 +1550,7 @@ xfs_bmap_first_unused(
 }
 
 /*
- * Returns the file-relative block number of the last block + 1 before
+ * Returns the file-relative block number of the last block - 1 before
  * last_block (input value) in the file.
  * This is not based on i_size, it is based on the extent records.
  * Returns 0 for local files, as they do not have extent records.
@@ -1863,7 +1598,7 @@ xfs_bmap_last_before(
        return 0;
 }
 
-STATIC int
+int
 xfs_bmap_last_extent(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
@@ -1926,29 +1661,6 @@ xfs_bmap_isaeof(
        return 0;
 }
 
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.  All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           endoff,
-       int                     whichfork,
-       int                     *eof)
-{
-       struct xfs_bmbt_irec    rec;
-       int                     error;
-
-       error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
-       if (error || *eof)
-               return error;
-
-       *eof = endoff >= rec.br_startoff + rec.br_blockcount;
-       return 0;
-}
-
 /*
  * Returns the file-relative block number of the first block past eof in
  * the file.  This is not based on i_size, it is based on the extent records.
@@ -3488,7 +3200,7 @@ done:
 /*
  * Adjust the size of the new extent based on di_extsize and rt extsize.
  */
-STATIC int
+int
 xfs_bmap_extsize_align(
        xfs_mount_t     *mp,
        xfs_bmbt_irec_t *gotp,          /* next extent pointer */
@@ -3650,9 +3362,9 @@ xfs_bmap_extsize_align(
 
 #define XFS_ALLOC_GAP_UNITS    4
 
-STATIC void
+void
 xfs_bmap_adjacent(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
 {
        xfs_fsblock_t   adjust;         /* adjustment to block numbers */
        xfs_agnumber_t  fb_agno;        /* ag number of ap->firstblock */
@@ -3798,109 +3510,6 @@ xfs_bmap_adjacent(
 #undef ISVALID
 }
 
-STATIC int
-xfs_bmap_rtalloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
-{
-       xfs_alloctype_t atype = 0;      /* type for allocation routines */
-       int             error;          /* error return value */
-       xfs_mount_t     *mp;            /* mount point structure */
-       xfs_extlen_t    prod = 0;       /* product factor for allocators */
-       xfs_extlen_t    ralen = 0;      /* realtime allocation length */
-       xfs_extlen_t    align;          /* minimum allocation alignment */
-       xfs_rtblock_t   rtb;
-
-       mp = ap->ip->i_mount;
-       align = xfs_get_extsz_hint(ap->ip);
-       prod = align / mp->m_sb.sb_rextsize;
-       error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
-                                       align, 1, ap->eof, 0,
-                                       ap->conv, &ap->offset, &ap->length);
-       if (error)
-               return error;
-       ASSERT(ap->length);
-       ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
-
-       /*
-        * If the offset & length are not perfectly aligned
-        * then kill prod, it will just get us in trouble.
-        */
-       if (do_mod(ap->offset, align) || ap->length % align)
-               prod = 1;
-       /*
-        * Set ralen to be the actual requested length in rtextents.
-        */
-       ralen = ap->length / mp->m_sb.sb_rextsize;
-       /*
-        * If the old value was close enough to MAXEXTLEN that
-        * we rounded up to it, cut it back so it's valid again.
-        * Note that if it's a really large request (bigger than
-        * MAXEXTLEN), we don't hear about that number, and can't
-        * adjust the starting point to match it.
-        */
-       if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
-               ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
-
-       /*
-        * Lock out other modifications to the RT bitmap inode.
-        */
-       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
-
-       /*
-        * If it's an allocation to an empty file at offset 0,
-        * pick an extent that will space things out in the rt area.
-        */
-       if (ap->eof && ap->offset == 0) {
-               xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
-
-               error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
-               if (error)
-                       return error;
-               ap->blkno = rtx * mp->m_sb.sb_rextsize;
-       } else {
-               ap->blkno = 0;
-       }
-
-       xfs_bmap_adjacent(ap);
-
-       /*
-        * Realtime allocation, done through xfs_rtallocate_extent.
-        */
-       atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
-       do_div(ap->blkno, mp->m_sb.sb_rextsize);
-       rtb = ap->blkno;
-       ap->length = ralen;
-       if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
-                               &ralen, atype, ap->wasdel, prod, &rtb)))
-               return error;
-       if (rtb == NULLFSBLOCK && prod > 1 &&
-           (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
-                                          ap->length, &ralen, atype,
-                                          ap->wasdel, 1, &rtb)))
-               return error;
-       ap->blkno = rtb;
-       if (ap->blkno != NULLFSBLOCK) {
-               ap->blkno *= mp->m_sb.sb_rextsize;
-               ralen *= mp->m_sb.sb_rextsize;
-               ap->length = ralen;
-               ap->ip->i_d.di_nblocks += ralen;
-               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-               if (ap->wasdel)
-                       ap->ip->i_delayed_blks -= ralen;
-               /*
-                * Adjust the disk quota also. This was reserved
-                * earlier.
-                */
-               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
-                       ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
-                                       XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
-       } else {
-               ap->length = 0;
-       }
-       return 0;
-}
-
 STATIC int
 xfs_bmap_btalloc_nullfb(
        struct xfs_bmalloca     *ap,
@@ -4018,7 +3627,7 @@ xfs_bmap_btalloc_nullfb(
 
 STATIC int
 xfs_bmap_btalloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
 {
        xfs_mount_t     *mp;            /* mount point structure */
        xfs_alloctype_t atype = 0;      /* type for allocation routines */
@@ -4250,7 +3859,7 @@ xfs_bmap_btalloc(
  */
 STATIC int
 xfs_bmap_alloc(
-       xfs_bmalloca_t  *ap)            /* bmap alloc argument struct */
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
 {
        if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
                return xfs_bmap_rtalloc(ap);
@@ -4638,7 +4247,7 @@ xfs_bmapi_delay(
 }
 
 
-STATIC int
+int
 __xfs_bmapi_allocate(
        struct xfs_bmalloca     *bma)
 {
@@ -4648,12 +4257,9 @@ __xfs_bmapi_allocate(
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
        int                     tmp_logflags = 0;
        int                     error;
-       int                     rt;
 
        ASSERT(bma->length > 0);
 
-       rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
-
        /*
         * For the wasdelay case, we could also just allocate the stuff asked
         * for in this bmap call but that wouldn't be as good.
@@ -4756,45 +4362,6 @@ __xfs_bmapi_allocate(
        return 0;
 }
 
-static void
-xfs_bmapi_allocate_worker(
-       struct work_struct      *work)
-{
-       struct xfs_bmalloca     *args = container_of(work,
-                                               struct xfs_bmalloca, work);
-       unsigned long           pflags;
-
-       /* we are in a transaction context here */
-       current_set_flags_nested(&pflags, PF_FSTRANS);
-
-       args->result = __xfs_bmapi_allocate(args);
-       complete(args->done);
-
-       current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Some allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Otherwise just
- * call directly to avoid the context switch overhead here.
- */
-int
-xfs_bmapi_allocate(
-       struct xfs_bmalloca     *args)
-{
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       if (!args->stack_switch)
-               return __xfs_bmapi_allocate(args);
-
-
-       args->done = &done;
-       INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
-       queue_work(xfs_alloc_wq, &args->work);
-       wait_for_completion(&done);
-       return args->result;
-}
-
 STATIC int
 xfs_bmapi_convert_unwritten(
        struct xfs_bmalloca     *bma,
@@ -5789,359 +5356,3 @@ error0:
        }
        return error;
 }
-
-/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
-       xfs_inode_t             *ip,            /* xfs incore inode pointer */
-       struct getbmapx         *out,           /* output structure */
-       int                     prealloced,     /* this is a file with
-                                                * preallocated data space */
-       __int64_t               end,            /* last block requested */
-       xfs_fsblock_t           startblock)
-{
-       __int64_t               fixlen;
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_ifork_t             *ifp;           /* inode fork pointer */
-       xfs_extnum_t            lastx;          /* last extent pointer */
-       xfs_fileoff_t           fileblock;
-
-       if (startblock == HOLESTARTBLOCK) {
-               mp = ip->i_mount;
-               out->bmv_block = -1;
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
-               fixlen -= out->bmv_offset;
-               if (prealloced && out->bmv_offset + out->bmv_length == end) {
-                       /* Came to hole at EOF. Trim it. */
-                       if (fixlen <= 0)
-                               return 0;
-                       out->bmv_length = fixlen;
-               }
-       } else {
-               if (startblock == DELAYSTARTBLOCK)
-                       out->bmv_block = -2;
-               else
-                       out->bmv_block = xfs_fsb_to_db(ip, startblock);
-               fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
-               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-               if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
-                  (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
-                       out->bmv_oflags |= BMV_OF_LAST;
-       }
-
-       return 1;
-}
-
-/*
- * Get inode's extents as described in bmv, and format for output.
- * Calls formatter to fill the user's buffer until all extents
- * are mapped, until the passed-in bmv->bmv_count slots have
- * been filled, or until the formatter short-circuits the loop,
- * if it is tracking filled-in extents on its own.
- */
-int                                            /* error code */
-xfs_getbmap(
-       xfs_inode_t             *ip,
-       struct getbmapx         *bmv,           /* user bmap structure */
-       xfs_bmap_format_t       formatter,      /* format to user */
-       void                    *arg)           /* formatter arg */
-{
-       __int64_t               bmvend;         /* last block requested */
-       int                     error = 0;      /* return value */
-       __int64_t               fixlen;         /* length for -1 case */
-       int                     i;              /* extent number */
-       int                     lock;           /* lock state */
-       xfs_bmbt_irec_t         *map;           /* buffer for user's data */
-       xfs_mount_t             *mp;            /* file system mount point */
-       int                     nex;            /* # of user extents can do */
-       int                     nexleft;        /* # of user extents left */
-       int                     subnex;         /* # of bmapi's can do */
-       int                     nmap;           /* number of map entries */
-       struct getbmapx         *out;           /* output structure */
-       int                     whichfork;      /* data or attr fork */
-       int                     prealloced;     /* this is a file with
-                                                * preallocated data space */
-       int                     iflags;         /* interface flags */
-       int                     bmapi_flags;    /* flags for xfs_bmapi */
-       int                     cur_ext = 0;
-
-       mp = ip->i_mount;
-       iflags = bmv->bmv_iflags;
-       whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-
-       if (whichfork == XFS_ATTR_FORK) {
-               if (XFS_IFORK_Q(ip)) {
-                       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
-                           ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
-                           ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
-                               return XFS_ERROR(EINVAL);
-               } else if (unlikely(
-                          ip->i_d.di_aformat != 0 &&
-                          ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
-                       XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
-                                        ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-
-               prealloced = 0;
-               fixlen = 1LL << 32;
-       } else {
-               if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-                   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
-                   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
-                       return XFS_ERROR(EINVAL);
-
-               if (xfs_get_extsz_hint(ip) ||
-                   ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
-                       prealloced = 1;
-                       fixlen = mp->m_super->s_maxbytes;
-               } else {
-                       prealloced = 0;
-                       fixlen = XFS_ISIZE(ip);
-               }
-       }
-
-       if (bmv->bmv_length == -1) {
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-               bmv->bmv_length =
-                       max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
-       } else if (bmv->bmv_length == 0) {
-               bmv->bmv_entries = 0;
-               return 0;
-       } else if (bmv->bmv_length < 0) {
-               return XFS_ERROR(EINVAL);
-       }
-
-       nex = bmv->bmv_count - 1;
-       if (nex <= 0)
-               return XFS_ERROR(EINVAL);
-       bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
-       if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
-               return XFS_ERROR(ENOMEM);
-       out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-       if (!out) {
-               out = kmem_zalloc_large(bmv->bmv_count *
-                                       sizeof(struct getbmapx));
-               if (!out)
-                       return XFS_ERROR(ENOMEM);
-       }
-
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
-                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
-                       if (error)
-                               goto out_unlock_iolock;
-               }
-               /*
-                * even after flushing the inode, there can still be delalloc
-                * blocks on the inode beyond EOF due to speculative
-                * preallocation. These are not removed until the release
-                * function is called or the inode is inactivated. Hence we
-                * cannot assert here that ip->i_delayed_blks == 0.
-                */
-       }
-
-       lock = xfs_ilock_map_shared(ip);
-
-       /*
-        * Don't let nex be bigger than the number of extents
-        * we can have assuming alternating holes and real extents.
-        */
-       if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
-               nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
-
-       bmapi_flags = xfs_bmapi_aflag(whichfork);
-       if (!(iflags & BMV_IF_PREALLOC))
-               bmapi_flags |= XFS_BMAPI_IGSTATE;
-
-       /*
-        * Allocate enough space to handle "subnex" maps at a time.
-        */
-       error = ENOMEM;
-       subnex = 16;
-       map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
-       if (!map)
-               goto out_unlock_ilock;
-
-       bmv->bmv_entries = 0;
-
-       if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
-           (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
-               error = 0;
-               goto out_free_map;
-       }
-
-       nexleft = nex;
-
-       do {
-               nmap = (nexleft > subnex) ? subnex : nexleft;
-               error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
-                                      XFS_BB_TO_FSB(mp, bmv->bmv_length),
-                                      map, &nmap, bmapi_flags);
-               if (error)
-                       goto out_free_map;
-               ASSERT(nmap <= subnex);
-
-               for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-                       out[cur_ext].bmv_oflags = 0;
-                       if (map[i].br_state == XFS_EXT_UNWRITTEN)
-                               out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
-                       else if (map[i].br_startblock == DELAYSTARTBLOCK)
-                               out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
-                       out[cur_ext].bmv_offset =
-                               XFS_FSB_TO_BB(mp, map[i].br_startoff);
-                       out[cur_ext].bmv_length =
-                               XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-                       out[cur_ext].bmv_unused1 = 0;
-                       out[cur_ext].bmv_unused2 = 0;
-
-                       /*
-                        * delayed allocation extents that start beyond EOF can
-                        * occur due to speculative EOF allocation when the
-                        * delalloc extent is larger than the largest freespace
-                        * extent at conversion time. These extents cannot be
-                        * converted by data writeback, so can exist here even
-                        * if we are not supposed to be finding delalloc
-                        * extents.
-                        */
-                       if (map[i].br_startblock == DELAYSTARTBLOCK &&
-                           map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
-                               ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
-                        if (map[i].br_startblock == HOLESTARTBLOCK &&
-                           whichfork == XFS_ATTR_FORK) {
-                               /* came to the end of attribute fork */
-                               out[cur_ext].bmv_oflags |= BMV_OF_LAST;
-                               goto out_free_map;
-                       }
-
-                       if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
-                                       prealloced, bmvend,
-                                       map[i].br_startblock))
-                               goto out_free_map;
-
-                       bmv->bmv_offset =
-                               out[cur_ext].bmv_offset +
-                               out[cur_ext].bmv_length;
-                       bmv->bmv_length =
-                               max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
-
-                       /*
-                        * In case we don't want to return the hole,
-                        * don't increase cur_ext so that we can reuse
-                        * it in the next loop.
-                        */
-                       if ((iflags & BMV_IF_NO_HOLES) &&
-                           map[i].br_startblock == HOLESTARTBLOCK) {
-                               memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
-                               continue;
-                       }
-
-                       nexleft--;
-                       bmv->bmv_entries++;
-                       cur_ext++;
-               }
-       } while (nmap && nexleft && bmv->bmv_length);
-
- out_free_map:
-       kmem_free(map);
- out_unlock_ilock:
-       xfs_iunlock_map_shared(ip, lock);
- out_unlock_iolock:
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-       for (i = 0; i < cur_ext; i++) {
-               int full = 0;   /* user array is full */
-
-               /* format results & advance arg */
-               error = formatter(&arg, &out[i], &full);
-               if (error || full)
-                       break;
-       }
-
-       if (is_vmalloc_addr(out))
-               kmem_free_large(out);
-       else
-               kmem_free(out);
-       return error;
-}
-
-/*
- * dead simple method of punching delalyed allocation blocks from a range in
- * the inode. Walks a block at a time so will be slow, but is only executed in
- * rare error cases so the overhead is not critical. This will alays punch out
- * both the start and end blocks, even if the ranges only partially overlap
- * them, so it is up to the caller to ensure that partial blocks are not
- * passed in.
- */
-int
-xfs_bmap_punch_delalloc_range(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           start_fsb,
-       xfs_fileoff_t           length)
-{
-       xfs_fileoff_t           remaining = length;
-       int                     error = 0;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       do {
-               int             done;
-               xfs_bmbt_irec_t imap;
-               int             nimaps = 1;
-               xfs_fsblock_t   firstblock;
-               xfs_bmap_free_t flist;
-
-               /*
-                * Map the range first and check that it is a delalloc extent
-                * before trying to unmap the range. Otherwise we will be
-                * trying to remove a real extent (which requires a
-                * transaction) or a hole, which is probably a bad idea...
-                */
-               error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
-                                      XFS_BMAPI_ENTIRE);
-
-               if (error) {
-                       /* something screwed, just bail */
-                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                               xfs_alert(ip->i_mount,
-                       "Failed delalloc mapping lookup ino %lld fsb %lld.",
-                                               ip->i_ino, start_fsb);
-                       }
-                       break;
-               }
-               if (!nimaps) {
-                       /* nothing there */
-                       goto next_block;
-               }
-               if (imap.br_startblock != DELAYSTARTBLOCK) {
-                       /* been converted, ignore */
-                       goto next_block;
-               }
-               WARN_ON(imap.br_blockcount == 0);
-
-               /*
-                * Note: while we initialise the firstblock/flist pair, they
-                * should never be used because blocks should never be
-                * allocated or freed for a delalloc extent and hence we need
-                * don't cancel or finish them after the xfs_bunmapi() call.
-                */
-               xfs_bmap_init(&flist, &firstblock);
-               error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
-                                       &flist, &done);
-               if (error)
-                       break;
-
-               ASSERT(!flist.xbf_count && !flist.xbf_first);
-next_block:
-               start_fsb++;
-               remaining--;
-       } while(remaining > 0);
-
-       return error;
-}
index 1cf1292d29b70cdbee6168c9a530d3a891547d7f..33b41f35122574e0b1cf7ad7a2a9ae23ecfadddb 100644 (file)
@@ -107,41 +107,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
                (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
 }
 
-/*
- * Argument structure for xfs_bmap_alloc.
- */
-typedef struct xfs_bmalloca {
-       xfs_fsblock_t           *firstblock; /* i/o first block allocated */
-       struct xfs_bmap_free    *flist; /* bmap freelist */
-       struct xfs_trans        *tp;    /* transaction pointer */
-       struct xfs_inode        *ip;    /* incore inode pointer */
-       struct xfs_bmbt_irec    prev;   /* extent before the new one */
-       struct xfs_bmbt_irec    got;    /* extent after, or delayed */
-
-       xfs_fileoff_t           offset; /* offset in file filling in */
-       xfs_extlen_t            length; /* i/o length asked/allocated */
-       xfs_fsblock_t           blkno;  /* starting block of new extent */
-
-       struct xfs_btree_cur    *cur;   /* btree cursor */
-       xfs_extnum_t            idx;    /* current extent index */
-       int                     nallocs;/* number of extents alloc'd */
-       int                     logflags;/* flags for transaction logging */
-
-       xfs_extlen_t            total;  /* total blocks needed for xaction */
-       xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
-       xfs_extlen_t            minleft; /* amount must be left after alloc */
-       char                    eof;    /* set if allocating past last extent */
-       char                    wasdel; /* replacing a delayed allocation */
-       char                    userdata;/* set if is user data */
-       char                    aeof;   /* allocated space at eof */
-       char                    conv;   /* overwriting unwritten extents */
-       char                    stack_switch;
-       int                     flags;
-       struct completion       *done;
-       struct work_struct      work;
-       int                     result;
-} xfs_bmalloca_t;
-
 /*
  * Flags for xfs_bmap_add_extent*.
  */
@@ -162,7 +127,7 @@ typedef struct xfs_bmalloca {
        { BMAP_RIGHT_FILLING,   "RF" }, \
        { BMAP_ATTRFORK,        "ATTR" }
 
-#if defined(__KERNEL) && defined(DEBUG)
+#ifdef DEBUG
 void   xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
                int whichfork, unsigned long caller_ip);
 #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
@@ -205,23 +170,4 @@ int        xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
                xfs_extnum_t num);
 uint   xfs_default_attroffset(struct xfs_inode *ip);
 
-#ifdef __KERNEL__
-/* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
-
-int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-               int *committed);
-int    xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
-               xfs_bmap_format_t formatter, void *arg);
-int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
-               int whichfork, int *eof);
-int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
-               int whichfork, int *count);
-int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
-               xfs_fileoff_t start_fsb, xfs_fileoff_t length);
-
-xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
-
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_BMAP_H__ */
index 0c61a22be6fd630668a16d0f92b3db625fa03173..cf3bc76710c3de6e021b37ccc275894458f8c931 100644 (file)
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
@@ -722,7 +722,7 @@ xfs_bmbt_key_diff(
                                      cur->bc_rec.b.br_startoff;
 }
 
-static int
+static bool
 xfs_bmbt_verify(
        struct xfs_buf          *bp)
 {
@@ -775,7 +775,6 @@ xfs_bmbt_verify(
                return false;
 
        return true;
-
 }
 
 static void
@@ -789,7 +788,6 @@ xfs_bmbt_read_verify(
                                     bp->b_target->bt_mount, bp->b_addr);
                xfs_buf_ioerror(bp, EFSCORRUPTED);
        }
-
 }
 
 static void
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644 (file)
index 0000000..541d59f
--- /dev/null
@@ -0,0 +1,2026 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Translate a filesystem block number into a disk address for an inode.
+ * Realtime and non-realtime files need different conversions because the
+ * bmap code treats the two kinds of file differently.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (XFS_IS_REALTIME_INODE(ip))
+               return (xfs_daddr_t)XFS_FSB_TO_BB(mp, fsb);
+
+       return XFS_FSB_TO_DADDR(mp, fsb);
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.
+ *
+ * If the free list is empty nothing is done: *committed is set to 0 and
+ * 0 is returned.  Otherwise an EFI covering every extent on the list is
+ * logged in the current transaction, the transaction is duplicated and
+ * the original is committed; *tp is replaced with the duplicate and
+ * *committed is set to 1.  The extents are then freed in the new
+ * transaction, each one being logged in an EFD and removed from the list.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int                                            /* error */
+xfs_bmap_finish(
+       xfs_trans_t             **tp,           /* transaction pointer addr */
+       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       int                     *committed)     /* xact committed or not */
+{
+       xfs_efd_log_item_t      *efd;           /* extent free data */
+       xfs_efi_log_item_t      *efi;           /* extent free intention */
+       int                     error;          /* error return value */
+       xfs_bmap_free_item_t    *free;          /* free extent item */
+       struct xfs_trans_res    tres;           /* new log reservation */
+       xfs_mount_t             *mp;            /* filesystem mount structure */
+       xfs_bmap_free_item_t    *next;          /* next item on free list */
+       xfs_trans_t             *ntp;           /* new transaction pointer */
+
+       ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+       if (flist->xbf_count == 0) {
+               *committed = 0;
+               return 0;
+       }
+       ntp = *tp;
+       efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+       for (free = flist->xbf_first; free; free = free->xbfi_next)
+               xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+                       free->xbfi_blockcount);
+
+       tres.tr_logres = ntp->t_log_res;
+       tres.tr_logcount = ntp->t_log_count;
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       ntp = xfs_trans_dup(*tp);
+       error = xfs_trans_commit(*tp, 0);
+       *tp = ntp;
+       *committed = 1;
+       /*
+        * We have a new transaction, so we should return committed=1,
+        * even though we're returning an error.
+        */
+       if (error)
+               return error;
+
+       /*
+        * transaction commit worked ok so we can drop the extra ticket
+        * reference that we gained in xfs_trans_dup()
+        */
+       xfs_log_ticket_put(ntp->t_ticket);
+
+       error = xfs_trans_reserve(ntp, &tres, 0, 0);
+       if (error)
+               return error;
+       efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+       for (free = flist->xbf_first; free != NULL; free = next) {
+               next = free->xbfi_next;
+               if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+                               free->xbfi_blockcount))) {
+                       /*
+                        * The bmap free list will be cleaned up at a
+                        * higher level.  The EFI will be canceled when
+                        * this transaction is aborted.
+                        * Need to force shutdown here to make sure it
+                        * happens, since this transaction may not be
+                        * dirty yet.
+                        */
+                       mp = ntp->t_mountp;
+                       if (!XFS_FORCED_SHUTDOWN(mp))
+                               xfs_force_shutdown(mp,
+                                                  (error == EFSCORRUPTED) ?
+                                                  SHUTDOWN_CORRUPT_INCORE :
+                                                  SHUTDOWN_META_IO_ERROR);
+                       return error;
+               }
+               xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+                       free->xbfi_blockcount);
+               xfs_bmap_del_free(flist, NULL, free);
+       }
+       return 0;
+}
+
+int                                            /* error */
+xfs_bmap_rtalloc(
+       struct xfs_bmalloca     *ap)    /* bmap alloc argument struct */
+{
+       xfs_alloctype_t atype = 0;      /* type for allocation routines */
+       int             error;          /* error return value */
+       xfs_mount_t     *mp;            /* mount point structure */
+       xfs_extlen_t    prod = 0;       /* product factor for allocators */
+       xfs_extlen_t    ralen = 0;      /* realtime allocation length */
+       xfs_extlen_t    align;          /* minimum allocation alignment */
+       xfs_rtblock_t   rtb;            /* rt extent: start hint in, result out */
+
+       mp = ap->ip->i_mount;
+       align = xfs_get_extsz_hint(ap->ip);
+       prod = align / mp->m_sb.sb_rextsize;
+       error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+                                       align, 1, ap->eof, 0,
+                                       ap->conv, &ap->offset, &ap->length);
+       if (error)
+               return error;
+       ASSERT(ap->length);
+       ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+       /*
+        * If the offset & length are not perfectly aligned
+        * then kill prod, it will just get us in trouble.
+        */
+       if (do_mod(ap->offset, align) || ap->length % align)
+               prod = 1;
+       /*
+        * Set ralen to be the actual requested length in rtextents.
+        */
+       ralen = ap->length / mp->m_sb.sb_rextsize;
+       /*
+        * If the old value was close enough to MAXEXTLEN that
+        * we rounded up to it, cut it back so it's valid again.
+        * Note that if it's a really large request (bigger than
+        * MAXEXTLEN), we don't hear about that number, and can't
+        * adjust the starting point to match it.
+        */
+       if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+               ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+       /*
+        * Lock out other modifications to the RT bitmap inode.
+        * The inode is joined to the transaction with XFS_ILOCK_EXCL and
+        * is not unlocked on the error returns below; presumably the
+        * transaction releases it -- confirm against xfs_trans_ijoin().
+        */
+       xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+       /*
+        * If it's an allocation to an empty file at offset 0,
+        * pick an extent that will space things out in the rt area.
+        */
+       if (ap->eof && ap->offset == 0) {
+               xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+               error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+               if (error)
+                       return error;
+               ap->blkno = rtx * mp->m_sb.sb_rextsize;
+       } else {
+               ap->blkno = 0;
+       }
+
+       xfs_bmap_adjacent(ap);
+
+       /*
+        * Realtime allocation, done through xfs_rtallocate_extent.
+        * ap->blkno is converted from blocks to rt extents for the call.
+        * If the first attempt finds nothing (rtb == NULLFSBLOCK) and
+        * prod > 1, the allocation is retried once with a product
+        * factor of 1.
+        */
+       atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+       do_div(ap->blkno, mp->m_sb.sb_rextsize);
+       rtb = ap->blkno;
+       ap->length = ralen;
+       if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+                               &ralen, atype, ap->wasdel, prod, &rtb)))
+               return error;
+       if (rtb == NULLFSBLOCK && prod > 1 &&
+           (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+                                          ap->length, &ralen, atype,
+                                          ap->wasdel, 1, &rtb)))
+               return error;
+       ap->blkno = rtb;
+       if (ap->blkno != NULLFSBLOCK) {
+               ap->blkno *= mp->m_sb.sb_rextsize;
+               ralen *= mp->m_sb.sb_rextsize;
+               ap->length = ralen;
+               ap->ip->i_d.di_nblocks += ralen;
+               xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+               if (ap->wasdel)
+                       ap->ip->i_delayed_blks -= ralen;
+               /*
+                * Adjust the disk quota also. This was reserved
+                * earlier.
+                */
+               xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+                       ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+                                       XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+       } else {
+               ap->length = 0;         /* nothing allocated; still returns 0 */
+       }
+       return 0;
+}
+
+/*
+ * Stack switching interfaces for allocation.
+ *
+ * Work item handler: runs the allocation described by the xfs_bmalloca that
+ * embeds @work, stores the outcome in ->result and then signals ->done.
+ */
+static void
+xfs_bmapi_allocate_worker(
+       struct work_struct      *work)
+{
+       struct xfs_bmalloca     *bma;
+       unsigned long           saved_flags;
+
+       bma = container_of(work, struct xfs_bmalloca, work);
+
+       /* the allocation runs in transaction context: flag it for the duration */
+       current_set_flags_nested(&saved_flags, PF_FSTRANS);
+
+       bma->result = __xfs_bmapi_allocate(bma);
+       complete(bma->done);
+
+       current_restore_flags_nested(&saved_flags, PF_FSTRANS);
+}
+
+/*
+ * Allocation requests frequently arrive with very little stack left.  When
+ * the request asks for a stack switch, queue it to the allocation workqueue
+ * so it runs with lots of stack, and wait for the result.  Otherwise call
+ * the allocator directly and avoid the context switch overhead.
+ */
+int
+xfs_bmapi_allocate(
+       struct xfs_bmalloca     *args)
+{
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       if (args->stack_switch) {
+               args->done = &done;
+               INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+               queue_work(xfs_alloc_wq, &args->work);
+               wait_for_completion(&done);
+               return args->result;
+       }
+
+       return __xfs_bmapi_allocate(args);
+}
+
+/*
+ * Determine whether endoff lies beyond the last extent of the fork; if it
+ * does, the caller will grow the allocation to a stripe unit boundary.  For
+ * an empty fork every offset counts as past the end of file, so *eof is 1
+ * in that case.
+ */
+int
+xfs_bmap_eof(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           endoff,
+       int                     whichfork,
+       int                     *eof)
+{
+       struct xfs_bmbt_irec    last;
+       int                     error;
+
+       error = xfs_bmap_last_extent(NULL, ip, whichfork, &last, eof);
+       if (error)
+               return error;
+
+       /* *eof was already set by xfs_bmap_last_extent(); nothing to compare */
+       if (*eof)
+               return 0;
+
+       *eof = (last.br_startoff + last.br_blockcount <= endoff);
+       return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Walk numrecs in-core extent records of the fork, starting at record
+ * index idx, accumulating each record's block count into *count.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+       xfs_ifork_t             *ifp,
+       xfs_extnum_t            idx,
+       int                     numrecs,
+       int                     *count)
+{
+       int             i = 0;
+
+       while (i < numrecs) {
+               xfs_bmbt_rec_host_t     *rec;
+
+               rec = xfs_iext_get_ext(ifp, idx + i);
+               *count += xfs_bmbt_get_blockcount(rec);
+               i++;
+       }
+}
+
+/*
+ * Accumulate into *count the block counts of the first numrecs extent
+ * records held in a btree-format block.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
+       int                     numrecs,
+       int                     *count)
+{
+       xfs_bmbt_rec_t  *rec;
+       int             recno = 1;
+
+       /* record numbers handed to XFS_BMBT_REC_ADDR run from 1 to numrecs */
+       while (recno <= numrecs) {
+               rec = XFS_BMBT_REC_ADDR(mp, block, recno);
+               *count += xfs_bmbt_disk_get_blockcount(rec);
+               recno++;
+       }
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ *
+ * Every btree block read is added to *count.  At the level above the
+ * leaves the block counts of all extent records in each block are added
+ * as well.  Returns 0 on success or a (positive) error code; a failure
+ * while descending to a lower level is reported as EFSCORRUPTED.
+ */
+STATIC int                                     /* error */
+xfs_bmap_count_tree(
+       xfs_mount_t     *mp,            /* file system mount point */
+       xfs_trans_t     *tp,            /* transaction pointer */
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fsblock_t   blockno,        /* file system block number */
+       int             levelin,        /* level in btree */
+       int             *count)         /* Count of blocks */
+{
+       int                     error;
+       xfs_buf_t               *bp, *nbp;
+       int                     level = levelin;
+       __be64                  *pp;
+       xfs_fsblock_t           bno = blockno;
+       xfs_fsblock_t           nextbno;
+       struct xfs_btree_block  *block, *nextblock;
+       int                     numrecs;
+
+       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+       if (error)
+               return error;
+       *count += 1;
+       block = XFS_BUF_TO_BLOCK(bp);
+
+       if (--level) {
+               /* Not at node above leaves, count this level of nodes */
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+               while (nextbno != NULLFSBLOCK) {
+                       error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error) {
+                               /* still holding the leftmost block: drop it */
+                               xfs_trans_brelse(tp, bp);
+                               return error;
+                       }
+                       *count += 1;
+                       nextblock = XFS_BUF_TO_BLOCK(nbp);
+                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+                       xfs_trans_brelse(tp, nbp);
+               }
+
+               /* Dive to the next level */
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+               bno = be64_to_cpu(*pp);
+               error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count);
+               xfs_trans_brelse(tp, bp);
+               /*
+                * Errors here are positive values, so test for non-zero;
+                * a "< 0" test would never see a failure from the lower
+                * levels.
+                */
+               if (unlikely(error)) {
+                       XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+                                        XFS_ERRLEVEL_LOW, mp);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+       } else {
+               /* count all level 1 nodes and their leaves */
+               for (;;) {
+                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+                       numrecs = be16_to_cpu(block->bb_numrecs);
+                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+                       xfs_trans_brelse(tp, bp);
+                       if (nextbno == NULLFSBLOCK)
+                               break;
+                       bno = nextbno;
+                       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               return error;
+                       *count += 1;
+                       block = XFS_BUF_TO_BLOCK(bp);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ *
+ * For an extents-format fork the block counts of the in-core extent
+ * records are added to *count.  Otherwise the fork's btree is walked
+ * from the first pointer in the incore root.  *count is accumulated
+ * into, not reset.  Returns 0, or EFSCORRUPTED if the tree walk fails.
+ */
+int                                            /* error */
+xfs_bmap_count_blocks(
+       xfs_trans_t             *tp,            /* transaction pointer */
+       xfs_inode_t             *ip,            /* incore inode */
+       int                     whichfork,      /* data or attr fork */
+       int                     *count)         /* out: count of blocks */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       xfs_fsblock_t           bno;    /* block # of "block" */
+       xfs_ifork_t             *ifp;   /* fork structure */
+       int                     level;  /* btree level, for checking */
+       xfs_mount_t             *mp;    /* file system mount structure */
+       __be64                  *pp;    /* pointer to block address */
+
+       bno = NULLFSBLOCK;
+       mp = ip->i_mount;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) {
+               xfs_bmap_count_leaves(ifp, 0,
+                       ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+                       count);
+               return 0;
+       }
+
+       /*
+        * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+        */
+       block = ifp->if_broot;
+       level = be16_to_cpu(block->bb_level);
+       ASSERT(level > 0);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+       bno = be64_to_cpu(*pp);
+       ASSERT(bno != NULLDFSBNO);
+       ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+       ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+       /*
+        * xfs_bmap_count_tree() returns positive error codes, so any
+        * non-zero value is a failure; a "< 0" test would never fire.
+        */
+       if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count))) {
+               XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+                                mp);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       return 0;
+}
+
+/*
+ * Fill in the block number of one getbmapx output record and, for a hole
+ * that ends exactly at the requested end of a preallocated file, trim the
+ * record's length.
+ *
+ * Returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+       xfs_inode_t             *ip,            /* xfs incore inode pointer */
+       struct getbmapx         *out,           /* output structure */
+       int                     prealloced,     /* this is a file with
+                                                * preallocated data space */
+       __int64_t               end,            /* last block requested */
+       xfs_fsblock_t           startblock)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       xfs_ifork_t             *ifp;           /* inode fork pointer */
+       xfs_extnum_t            extidx;         /* index of extent found */
+       xfs_fileoff_t           fileblock;
+       __int64_t               fixlen;
+
+       if (startblock == HOLESTARTBLOCK) {
+               out->bmv_block = -1;
+
+               /* only a hole that reaches "end" on a prealloc file is trimmed */
+               if (!prealloced || out->bmv_offset + out->bmv_length != end)
+                       return 1;
+
+               /* Came to hole at EOF. Trim it. */
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+               fixlen -= out->bmv_offset;
+               if (fixlen <= 0)
+                       return 0;
+               out->bmv_length = fixlen;
+               return 1;
+       }
+
+       out->bmv_block = (startblock == DELAYSTARTBLOCK) ?
+                               -2 : xfs_fsb_to_db(ip, startblock);
+
+       /* flag the record if it maps into the last data fork extent */
+       fileblock = XFS_BB_TO_FSB(mp, out->bmv_offset);
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       if (xfs_iext_bno_to_ext(ifp, fileblock, &extidx) &&
+          (extidx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+               out->bmv_oflags |= BMV_OF_LAST;
+
+       return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ *
+ * The records are first gathered into a buffer of bmv->bmv_count
+ * entries while the inode locks are held; the formatter is only
+ * invoked on the gathered records after the locks have been dropped.
+ * bmv->bmv_offset, bmv->bmv_length and bmv->bmv_entries are updated
+ * as records are produced.
+ *
+ * Returns 0 or an error code; the formatter's return value becomes
+ * the return value once at least one record has been gathered.
+ * NOTE(review): that also means an error from a later xfs_bmapi_read()
+ * call is overwritten by the formatter's result -- confirm intended.
+ */
+int                                            /* error code */
+xfs_getbmap(
+       xfs_inode_t             *ip,
+       struct getbmapx         *bmv,           /* user bmap structure */
+       xfs_bmap_format_t       formatter,      /* format to user */
+       void                    *arg)           /* formatter arg */
+{
+       __int64_t               bmvend;         /* last block requested */
+       int                     error = 0;      /* return value */
+       __int64_t               fixlen;         /* length for -1 case */
+       int                     i;              /* extent number */
+       int                     lock;           /* lock state */
+       xfs_bmbt_irec_t         *map;           /* buffer for user's data */
+       xfs_mount_t             *mp;            /* file system mount point */
+       int                     nex;            /* # of user extents can do */
+       int                     nexleft;        /* # of user extents left */
+       int                     subnex;         /* # of bmapi's can do */
+       int                     nmap;           /* number of map entries */
+       struct getbmapx         *out;           /* output structure */
+       int                     whichfork;      /* data or attr fork */
+       int                     prealloced;     /* this is a file with
+                                                * preallocated data space */
+       int                     iflags;         /* interface flags */
+       int                     bmapi_flags;    /* flags for xfs_bmapi */
+       int                     cur_ext = 0;    /* next free slot in out[] */
+
+       mp = ip->i_mount;
+       iflags = bmv->bmv_iflags;
+       whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+       if (whichfork == XFS_ATTR_FORK) {
+               if (XFS_IFORK_Q(ip)) {
+                       if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+                           ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+                           ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+                               return XFS_ERROR(EINVAL);
+               } else if (unlikely(
+                          ip->i_d.di_aformat != 0 &&
+                          ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+                       XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+
+               prealloced = 0;
+               fixlen = 1LL << 32;
+       } else {
+               if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+                   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+                   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+                       return XFS_ERROR(EINVAL);
+
+               if (xfs_get_extsz_hint(ip) ||
+                   ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+                       prealloced = 1;
+                       fixlen = mp->m_super->s_maxbytes;
+               } else {
+                       prealloced = 0;
+                       fixlen = XFS_ISIZE(ip);
+               }
+       }
+
+       if (bmv->bmv_length == -1) {
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+               bmv->bmv_length =
+                       max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+       } else if (bmv->bmv_length == 0) {
+               bmv->bmv_entries = 0;
+               return 0;
+       } else if (bmv->bmv_length < 0) {
+               return XFS_ERROR(EINVAL);
+       }
+
+       nex = bmv->bmv_count - 1;
+       if (nex <= 0)
+               return XFS_ERROR(EINVAL);
+       bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+       if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+               return XFS_ERROR(ENOMEM);
+       out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
+       if (!out) {
+               out = kmem_zalloc_large(bmv->bmv_count *
+                                       sizeof(struct getbmapx));
+               if (!out)
+                       return XFS_ERROR(ENOMEM);
+       }
+
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
+               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
+                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+                       if (error)
+                               goto out_unlock_iolock;
+               }
+               /*
+                * even after flushing the inode, there can still be delalloc
+                * blocks on the inode beyond EOF due to speculative
+                * preallocation. These are not removed until the release
+                * function is called or the inode is inactivated. Hence we
+                * cannot assert here that ip->i_delayed_blks == 0.
+                */
+       }
+
+       lock = xfs_ilock_map_shared(ip);
+
+       /*
+        * Don't let nex be bigger than the number of extents
+        * we can have assuming alternating holes and real extents.
+        */
+       if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+               nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+       bmapi_flags = xfs_bmapi_aflag(whichfork);
+       if (!(iflags & BMV_IF_PREALLOC))
+               bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+       /*
+        * Allocate enough space to handle "subnex" maps at a time.
+        * error is preset to ENOMEM for the allocation failure path.
+        */
+       error = ENOMEM;
+       subnex = 16;
+       map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+       if (!map)
+               goto out_unlock_ilock;
+
+       bmv->bmv_entries = 0;
+
+       if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+           (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+               error = 0;
+               goto out_free_map;
+       }
+
+       nexleft = nex;
+
+       do {
+               nmap = (nexleft > subnex) ? subnex : nexleft;
+               error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+                                      XFS_BB_TO_FSB(mp, bmv->bmv_length),
+                                      map, &nmap, bmapi_flags);
+               if (error)
+                       goto out_free_map;
+               ASSERT(nmap <= subnex);
+
+               for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+                       out[cur_ext].bmv_oflags = 0;
+                       if (map[i].br_state == XFS_EXT_UNWRITTEN)
+                               out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+                       else if (map[i].br_startblock == DELAYSTARTBLOCK)
+                               out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+                       out[cur_ext].bmv_offset =
+                               XFS_FSB_TO_BB(mp, map[i].br_startoff);
+                       out[cur_ext].bmv_length =
+                               XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+                       out[cur_ext].bmv_unused1 = 0;
+                       out[cur_ext].bmv_unused2 = 0;
+
+                       /*
+                        * delayed allocation extents that start beyond EOF can
+                        * occur due to speculative EOF allocation when the
+                        * delalloc extent is larger than the largest freespace
+                        * extent at conversion time. These extents cannot be
+                        * converted by data writeback, so can exist here even
+                        * if we are not supposed to be finding delalloc
+                        * extents.
+                        */
+                       if (map[i].br_startblock == DELAYSTARTBLOCK &&
+                           map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+                               ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+                        if (map[i].br_startblock == HOLESTARTBLOCK &&
+                           whichfork == XFS_ATTR_FORK) {
+                               /* came to the end of attribute fork */
+                               out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+                               goto out_free_map;
+                       }
+
+                       if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+                                       prealloced, bmvend,
+                                       map[i].br_startblock))
+                               goto out_free_map;
+
+                       bmv->bmv_offset =
+                               out[cur_ext].bmv_offset +
+                               out[cur_ext].bmv_length;
+                       bmv->bmv_length =
+                               max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+                       /*
+                        * In case we don't want to return the hole,
+                        * don't increase cur_ext so that we can reuse
+                        * it in the next loop.
+                        */
+                       if ((iflags & BMV_IF_NO_HOLES) &&
+                           map[i].br_startblock == HOLESTARTBLOCK) {
+                               memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+                               continue;
+                       }
+
+                       nexleft--;
+                       bmv->bmv_entries++;
+                       cur_ext++;
+               }
+       } while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+       kmem_free(map);
+ out_unlock_ilock:
+       xfs_iunlock_map_shared(ip, lock);
+ out_unlock_iolock:
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       for (i = 0; i < cur_ext; i++) {
+               int full = 0;   /* user array is full */
+
+               /* format results & advance arg */
+               error = formatter(&arg, &out[i], &full);
+               if (error || full)
+                       break;
+       }
+
+       if (is_vmalloc_addr(out))
+               kmem_free_large(out);
+       else
+               kmem_free(out);
+       return error;
+}
+
+/*
+ * dead simple method of punching delayed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ *
+ * The caller must hold the inode's ILOCK exclusively.  A length of zero
+ * punches nothing.  Returns 0, or the error from the first mapping lookup
+ * or unmap that fails; blocks that are holes or already converted to real
+ * extents are skipped.
+ */
+int
+xfs_bmap_punch_delalloc_range(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           start_fsb,
+       xfs_fileoff_t           length)
+{
+       xfs_fileoff_t           remaining = length;
+       int                     error = 0;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       /*
+        * Test before the first pass: the counter is an (unsigned)
+        * xfs_fileoff_t, so decrementing it from zero in a do/while loop
+        * would wrap around and keep walking blocks.
+        */
+       while (remaining > 0) {
+               int             done;
+               xfs_bmbt_irec_t imap;
+               int             nimaps = 1;
+               xfs_fsblock_t   firstblock;
+               xfs_bmap_free_t flist;
+
+               /*
+                * Map the range first and check that it is a delalloc extent
+                * before trying to unmap the range. Otherwise we will be
+                * trying to remove a real extent (which requires a
+                * transaction) or a hole, which is probably a bad idea...
+                */
+               error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
+                                      XFS_BMAPI_ENTIRE);
+
+               if (error) {
+                       /* something screwed, just bail */
+                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                               xfs_alert(ip->i_mount,
+                       "Failed delalloc mapping lookup ino %lld fsb %lld.",
+                                               ip->i_ino, start_fsb);
+                       }
+                       break;
+               }
+               if (!nimaps) {
+                       /* nothing there */
+                       goto next_block;
+               }
+               if (imap.br_startblock != DELAYSTARTBLOCK) {
+                       /* been converted, ignore */
+                       goto next_block;
+               }
+               WARN_ON(imap.br_blockcount == 0);
+
+               /*
+                * Note: while we initialise the firstblock/flist pair, they
+                * should never be used because blocks should never be
+                * allocated or freed for a delalloc extent and hence we
+                * don't need to cancel or finish them after the
+                * xfs_bunmapi() call.
+                */
+               xfs_bmap_init(&flist, &firstblock);
+               error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
+                                       &flist, &done);
+               if (error)
+                       break;
+
+               ASSERT(!flist.xbf_count && !flist.xbf_first);
+next_block:
+               start_fsb++;
+               remaining--;
+       }
+
+       return error;
+}
+
+/*
+ * Decide whether an inode is a candidate for having blocks beyond EOF
+ * checked for and freed.  @force extends the check to regular files that
+ * are flagged preallocated or append-only, but only when such a file also
+ * carries delalloc blocks.
+ *
+ * Returns true if post-EOF blocks may be freed, false otherwise.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+       /* only regular files ever have prealloc/delalloc blocks */
+       if (!S_ISREG(ip->i_d.di_mode))
+               return false;
+
+       /*
+        * An empty file with nothing in the page cache and no delalloc
+        * blocks has no speculative prealloc/delalloc blocks to remove.
+        */
+       if (!VFS_I(ip)->i_size &&
+           !VN_CACHED(VFS_I(ip)) &&
+           !ip->i_delayed_blks)
+               return false;
+
+       /* the extent list is not in memory: don't read it in just for this */
+       if ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
+               return false;
+
+       /*
+        * Real preallocated or append-only files are left alone unless we
+        * are being forced AND the file has delalloc blocks to remove.
+        */
+       if ((ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) != 0 &&
+           !(force && ip->i_delayed_blks != 0))
+               return false;
+
+       return true;
+}
+
+/*
+ * Free any blocks that lie beyond EOF by truncating the data fork extents
+ * back to the current in-core file size.
+ *
+ * This is called by xfs_inactive to free any blocks beyond eof
+ * when the link count isn't zero and by xfs_dm_punch_hole() when
+ * punching a hole to EOF.
+ *
+ * @mp:          mount the inode belongs to
+ * @ip:          inode to trim
+ * @need_iolock: if true, the IOLOCK is taken here with a non-blocking
+ *               attempt and dropped again before returning; if false this
+ *               function neither takes nor drops the IOLOCK
+ *
+ * Returns 0 on success or when there is nothing to free, EAGAIN if
+ * @need_iolock was set and the IOLOCK could not be taken without blocking,
+ * or the error from the mapping lookup, dquot attach, transaction
+ * reservation, truncate or commit.
+ */
+int
+xfs_free_eofblocks(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *ip,
+       bool            need_iolock)
+{
+       xfs_trans_t     *tp;
+       int             error;
+       xfs_fileoff_t   end_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_filblks_t   map_len;
+       int             nimaps;
+       xfs_bmbt_irec_t imap;
+
+       /*
+        * Figure out if there are any blocks beyond the end
+        * of the file.  If not, then there is nothing to do.
+        */
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+       last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
+       if (last_fsb <= end_fsb)
+               return 0;
+       map_len = last_fsb - end_fsb;
+
+       /* Look up the first mapping in [end_fsb, last_fsb) under ILOCK_SHARED. */
+       nimaps = 1;
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       /*
+        * Only go on to truncate if the lookup succeeded and found either
+        * something other than a hole past EOF or the inode still has
+        * delalloc blocks.  A lookup error falls through and is returned.
+        */
+       if (!error && (nimaps != 0) &&
+           (imap.br_startblock != HOLESTARTBLOCK ||
+            ip->i_delayed_blks)) {
+               /*
+                * Attach the dquots to the inode up front.
+                */
+               error = xfs_qm_dqattach(ip, 0);
+               if (error)
+                       return error;
+
+               /*
+                * There are blocks after the end of file.
+                * Free them up now by truncating the file to
+                * its current size.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+               if (need_iolock) {
+                       /*
+                        * Non-blocking attempt only; on failure the freshly
+                        * allocated (not yet reserved) transaction is
+                        * cancelled and the caller is told to try again.
+                        */
+                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+                               xfs_trans_cancel(tp, 0);
+                               return EAGAIN;
+                       }
+               }
+
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               if (error) {
+                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       if (need_iolock)
+                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                       return error;
+               }
+
+               /*
+                * The inode is joined with lock flags 0, and the ILOCK is
+                * dropped explicitly below once the transaction has been
+                * committed or cancelled.
+                */
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+
+               /*
+                * Do not update the on-disk file size.  If we update the
+                * on-disk file size and then the system crashes before the
+                * contents of the file are flushed to disk then the files
+                * may be full of holes (ie NULL files bug).
+                */
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+                                             XFS_ISIZE(ip));
+               if (error) {
+                       /*
+                        * If we get an error at this point we simply don't
+                        * bother truncating the file.
+                        */
+                       xfs_trans_cancel(tp,
+                                        (XFS_TRANS_RELEASE_LOG_RES |
+                                         XFS_TRANS_ABORT));
+               } else {
+                       error = xfs_trans_commit(tp,
+                                               XFS_TRANS_RELEASE_LOG_RES);
+                       /* only clear the eofblocks tag once the commit worked */
+                       if (!error)
+                               xfs_inode_clear_eofblocks_tag(ip);
+               }
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (need_iolock)
+                       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       }
+       return error;
+}
+
+/*
+ * xfs_alloc_file_space()
+ *      This routine allocates disk space for the given file.
+ *
+ *     If alloc_type == 0, this request is for an ALLOCSP type
+ *     request which will change the file size.  In this case, no
+ *     DMAPI event will be generated by the call.  A TRUNCATE event
+ *     will be generated later by xfs_setattr.
+ *
+ *     If alloc_type != 0, this request is for a RESVSP type
+ *     request, and a DMAPI DM_EVENT_WRITE will be generated if the
+ *     lower block boundary byte address is less than the file's
+ *     length.
+ *
+ *     The byte range [offset, offset + len) is converted to filesystem
+ *     blocks (start rounded down, length rounded up) and allocated in a
+ *     loop, one transaction and at most one returned mapping per
+ *     iteration, until the whole range is covered or an error occurs.
+ *     alloc_type is passed through as the flags argument of
+ *     xfs_bmapi_write().  attr_flags is not referenced by this function.
+ *
+ * RETURNS:
+ *       0 on success
+ *      errno on error (EIO if the filesystem is shut down, EINVAL if
+ *      len <= 0, ENOSPC if an iteration returns no mapping)
+ *
+ */
+STATIC int
+xfs_alloc_file_space(
+       xfs_inode_t             *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     alloc_type,
+       int                     attr_flags)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       xfs_off_t               count;
+       xfs_filblks_t           allocated_fsb;
+       xfs_filblks_t           allocatesize_fsb;
+       xfs_extlen_t            extsz, temp;
+       xfs_fileoff_t           startoffset_fsb;
+       xfs_fsblock_t           firstfsb;
+       int                     nimaps;
+       int                     quota_flag;
+       int                     rt;
+       xfs_trans_t             *tp;
+       xfs_bmbt_irec_t         imaps[1], *imapp;
+       xfs_bmap_free_t         free_list;
+       uint                    qblocks, resblks, resrtextents;
+       int                     committed;
+       int                     error;
+
+       trace_xfs_alloc_file_space(ip);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return error;
+
+       if (len <= 0)
+               return XFS_ERROR(EINVAL);
+
+       rt = XFS_IS_REALTIME_INODE(ip);
+       extsz = xfs_get_extsz_hint(ip);
+
+       /*
+        * Convert the byte range to blocks: the start offset is truncated
+        * to a block boundary and the length is rounded up.
+        */
+       count = len;
+       imapp = &imaps[0];
+       nimaps = 1;
+       startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+       allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+       /*
+        * Allocate file space until done or until there is an error
+        */
+       while (allocatesize_fsb && !error) {
+               xfs_fileoff_t   s, e;
+
+               /*
+                * Determine space reservations for data/realtime.
+                *
+                * With an extent size hint the range [s, e) is widened to
+                * hint-aligned boundaries so that the reservation covers
+                * the alignment padding; without one, e - s is simply the
+                * number of blocks still to allocate.
+                */
+               if (unlikely(extsz)) {
+                       s = startoffset_fsb;
+                       do_div(s, extsz);
+                       s *= extsz;
+                       e = startoffset_fsb + allocatesize_fsb;
+                       if ((temp = do_mod(startoffset_fsb, extsz)))
+                               e += temp;
+                       if ((temp = do_mod(e, extsz)))
+                               e += extsz - temp;
+               } else {
+                       s = 0;
+                       e = allocatesize_fsb;
+               }
+
+               /*
+                * The transaction reservation is limited to a 32-bit block
+                * count, hence we need to limit the number of blocks we are
+                * trying to reserve to avoid an overflow. We can't allocate
+                * more than @nimaps extents, and an extent is limited on disk
+                * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+                */
+               resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+               if (unlikely(rt)) {
+                       /*
+                        * Realtime: the data blocks are reserved as realtime
+                        * extents and charged to quota as realtime blocks;
+                        * the regular block reservation is computed for zero
+                        * data blocks.
+                        */
+                       resrtextents = qblocks = resblks;
+                       resrtextents /= mp->m_sb.sb_rextsize;
+                       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+                       quota_flag = XFS_QMOPT_RES_RTBLKS;
+               } else {
+                       resrtextents = 0;
+                       resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+                       quota_flag = XFS_QMOPT_RES_REGBLKS;
+               }
+
+               /*
+                * Allocate and setup the transaction.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                         resblks, resrtextents);
+               /*
+                * Check for running out of space
+                */
+               if (error) {
+                       /*
+                        * Free the transaction structure.
+                        */
+                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       break;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
+                                                     0, quota_flag);
+               if (error)
+                       goto error1;
+
+               /* joined with lock flags 0: the ILOCK is dropped by hand below */
+               xfs_trans_ijoin(tp, ip, 0);
+
+               xfs_bmap_init(&free_list, &firstfsb);
+               error = xfs_bmapi_write(tp, ip, startoffset_fsb,
+                                       allocatesize_fsb, alloc_type, &firstfsb,
+                                       0, imapp, &nimaps, &free_list);
+               if (error) {
+                       goto error0;
+               }
+
+               /*
+                * Complete the transaction
+                */
+               error = xfs_bmap_finish(&tp, &free_list, &committed);
+               if (error) {
+                       goto error0;
+               }
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               if (error) {
+                       break;
+               }
+
+               allocated_fsb = imapp->br_blockcount;
+
+               /* no mapping came back: report that as out of space */
+               if (nimaps == 0) {
+                       error = XFS_ERROR(ENOSPC);
+                       break;
+               }
+
+               /* advance past whatever this iteration managed to allocate */
+               startoffset_fsb += allocated_fsb;
+               allocatesize_fsb -= allocated_fsb;
+       }
+
+       return error;
+
+error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
+
+error1:        /* Just cancel transaction */
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
+ *
+ * The range is processed one filesystem block at a time through a single
+ * uncached buffer: each block that has a real, written mapping is read,
+ * the affected bytes are cleared in memory, and the block is written back.
+ * Holes and unwritten extents are skipped.
+ *
+ * Returns 0 on success (including when startoff is at or beyond the file
+ * size, or when a block has no mapping), ENOMEM if the buffer cannot be
+ * allocated, or the error from the mapping lookup or buffer I/O.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+       xfs_inode_t             *ip,
+       xfs_off_t               startoff,
+       xfs_off_t               endoff)
+{
+       xfs_bmbt_irec_t         imap;
+       xfs_fileoff_t           offset_fsb;
+       xfs_off_t               lastoffset;
+       xfs_off_t               offset;
+       xfs_buf_t               *bp;
+       xfs_mount_t             *mp = ip->i_mount;
+       int                     nimap;
+       int                     error = 0;
+
+       /*
+        * Avoid doing I/O beyond eof - it's not necessary
+        * since nothing can read beyond eof.  The space will
+        * be zeroed when the file is extended anyway.
+        */
+       if (startoff >= XFS_ISIZE(ip))
+               return 0;
+
+       if (endoff > XFS_ISIZE(ip))
+               endoff = XFS_ISIZE(ip);
+
+       /*
+        * One block-sized uncached buffer on the device that matches the
+        * inode (realtime or data), reused for every block in the range.
+        */
+       bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
+                                       mp->m_rtdev_targp : mp->m_ddev_targp,
+                                 BTOBB(mp->m_sb.sb_blocksize), 0);
+       if (!bp)
+               return XFS_ERROR(ENOMEM);
+
+       xfs_buf_unlock(bp);
+
+       /*
+        * Each pass handles the bytes [offset, lastoffset], where lastoffset
+        * is the last byte of the current block clamped to endoff.
+        */
+       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+               offset_fsb = XFS_B_TO_FSBT(mp, offset);
+               nimap = 1;
+               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+               if (error || nimap < 1)
+                       break;
+               ASSERT(imap.br_blockcount >= 1);
+               ASSERT(imap.br_startoff == offset_fsb);
+               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+               if (lastoffset > endoff)
+                       lastoffset = endoff;
+               /* nothing on disk to zero for a hole */
+               if (imap.br_startblock == HOLESTARTBLOCK)
+                       continue;
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+               /* unwritten extents are skipped as well */
+               if (imap.br_state == XFS_EXT_UNWRITTEN)
+                       continue;
+               /* set the buffer up as a read of this block and wait for it */
+               XFS_BUF_UNDONE(bp);
+               XFS_BUF_UNWRITE(bp);
+               XFS_BUF_READ(bp);
+               XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
+               xfsbdstrat(mp, bp);
+               error = xfs_buf_iowait(bp);
+               if (error) {
+                       xfs_buf_ioerror_alert(bp,
+                                       "xfs_zero_remaining_bytes(read)");
+                       break;
+               }
+               /* clear only the requested bytes within the block */
+               memset(bp->b_addr +
+                       (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+                     0, lastoffset - offset + 1);
+               /* flip the buffer to a write and push the block back out */
+               XFS_BUF_UNDONE(bp);
+               XFS_BUF_UNREAD(bp);
+               XFS_BUF_WRITE(bp);
+               xfsbdstrat(mp, bp);
+               error = xfs_buf_iowait(bp);
+               if (error) {
+                       xfs_buf_ioerror_alert(bp,
+                                       "xfs_zero_remaining_bytes(write)");
+                       break;
+               }
+       }
+       xfs_buf_free(bp);
+       return error;
+}
+
+/*
+ * xfs_free_file_space()
+ *      This routine frees disk space for the given file.
+ *
+ *     This routine is only called by xfs_change_file_space
+ *     for an UNRESVSP type call.
+ *
+ *     Whole filesystem blocks inside [offset, offset + len) are unmapped
+ *     (start rounded up, end rounded down); the partial blocks left at
+ *     the edges are zeroed on disk with xfs_zero_remaining_bytes().
+ *     Unless XFS_ATTR_NOLOCK is set in attr_flags, the IOLOCK is taken
+ *     exclusively here and dropped before returning.
+ *
+ * RETURNS:
+ *       0 on success (also when len <= 0, in which case nothing is done)
+ *      errno on error
+ *
+ */
+STATIC int
+xfs_free_file_space(
+       xfs_inode_t             *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     attr_flags)
+{
+       int                     committed;
+       int                     done;
+       xfs_fileoff_t           endoffset_fsb;
+       int                     error;
+       xfs_fsblock_t           firstfsb;
+       xfs_bmap_free_t         free_list;
+       xfs_bmbt_irec_t         imap;
+       xfs_off_t               ioffset;
+       xfs_extlen_t            mod=0;
+       xfs_mount_t             *mp;
+       int                     nimap;
+       uint                    resblks;
+       xfs_off_t               rounding;
+       int                     rt;
+       xfs_fileoff_t           startoffset_fsb;
+       xfs_trans_t             *tp;
+       int                     need_iolock = 1;
+
+       mp = ip->i_mount;
+
+       trace_xfs_free_file_space(ip);
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return error;
+
+       error = 0;
+       if (len <= 0)   /* if nothing being freed */
+               return error;
+       rt = XFS_IS_REALTIME_INODE(ip);
+       /* round inwards: only blocks wholly inside the range get unmapped */
+       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+
+       if (attr_flags & XFS_ATTR_NOLOCK)
+               need_iolock = 0;
+       if (need_iolock) {
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+               /* wait for the completion of any pending DIOs */
+               inode_dio_wait(VFS_I(ip));
+       }
+
+       /*
+        * Write back and then drop the page cache from the start of the
+        * range, rounded down to the larger of the block size and the page
+        * cache page size, through to the end of the file (-1).  The return
+        * value of filemap_write_and_wait_range() is negated before being
+        * stored in error.
+        */
+       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+       ioffset = offset & ~(rounding - 1);
+       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+                                             ioffset, -1);
+       if (error)
+               goto out_unlock_iolock;
+       truncate_pagecache_range(VFS_I(ip), ioffset, -1);
+
+       /*
+        * Need to zero the stuff we're not freeing, on disk.
+        * If it's a realtime file & can't use unwritten extents then we
+        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
+        * will take care of it for us.
+        */
+       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+               /*
+                * If the first block to free is mapped and does not start
+                * on a realtime extent boundary, move the start forward to
+                * the next boundary.
+                */
+               nimap = 1;
+               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
+                                       &imap, &nimap, 0);
+               if (error)
+                       goto out_unlock_iolock;
+               ASSERT(nimap == 0 || nimap == 1);
+               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+                       xfs_daddr_t     block;
+
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       block = imap.br_startblock;
+                       mod = do_div(block, mp->m_sb.sb_rextsize);
+                       if (mod)
+                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+               }
+               /*
+                * Same for the last block to free: pull the end back when
+                * it is mapped.  NOTE(review): mod is not recomputed from
+                * this second mapping; it still holds the value left by the
+                * start-block lookup above (or its initial 0) before being
+                * incremented - confirm this is intended.
+                */
+               nimap = 1;
+               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
+                                       &imap, &nimap, 0);
+               if (error)
+                       goto out_unlock_iolock;
+               ASSERT(nimap == 0 || nimap == 1);
+               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+                       mod++;
+                       if (mod && (mod != mp->m_sb.sb_rextsize))
+                               endoffset_fsb -= mod;
+               }
+       }
+       /*
+        * done is set when no whole block is left to unmap, in which case
+        * the unmap loop below is skipped entirely.
+        */
+       if ((done = (endoffset_fsb <= startoffset_fsb)))
+               /*
+                * One contiguous piece to clear
+                */
+               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+       else {
+               /*
+                * Some full blocks, possibly two pieces to clear
+                */
+               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+                       error = xfs_zero_remaining_bytes(ip, offset,
+                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+               if (!error &&
+                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+                       error = xfs_zero_remaining_bytes(ip,
+                               XFS_FSB_TO_B(mp, endoffset_fsb),
+                               offset + len - 1);
+       }
+
+       /*
+        * free file space until done or until there is an error
+        */
+       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+       while (!error && !done) {
+
+               /*
+                * allocate and setup the transaction. Allow this
+                * transaction to dip into the reserve blocks to ensure
+                * the freeing of the space succeeds at ENOSPC.
+                */
+               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+               tp->t_flags |= XFS_TRANS_RESERVE;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
+
+               /*
+                * check for running out of space
+                */
+               if (error) {
+                       /*
+                        * Free the transaction structure.
+                        */
+                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+                       xfs_trans_cancel(tp, 0);
+                       break;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_trans_reserve_quota(tp, mp,
+                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
+                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
+               if (error)
+                       goto error1;
+
+               /* joined with lock flags 0: the ILOCK is dropped by hand below */
+               xfs_trans_ijoin(tp, ip, 0);
+
+               /*
+                * issue the bunmapi() call to free the blocks
+                *
+                * xfs_bunmapi() reports through done whether the whole
+                * range has been unmapped; if not, the loop goes around
+                * again with a fresh transaction.
+                */
+               xfs_bmap_init(&free_list, &firstfsb);
+               error = xfs_bunmapi(tp, ip, startoffset_fsb,
+                                 endoffset_fsb - startoffset_fsb,
+                                 0, 2, &firstfsb, &free_list, &done);
+               if (error) {
+                       goto error0;
+               }
+
+               /*
+                * complete the transaction
+                */
+               error = xfs_bmap_finish(&tp, &free_list, &committed);
+               if (error) {
+                       goto error0;
+               }
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+
+ out_unlock_iolock:
+       if (need_iolock)
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+
+       /*
+        * Failure inside the transaction: cancel the pending frees (error0
+        * only), abort the transaction, then drop the ILOCK together with
+        * the IOLOCK if it was taken here.
+        */
+ error0:
+       xfs_bmap_cancel(&free_list);
+ error1:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
+                   XFS_ILOCK_EXCL);
+       return error;
+}
+
+
+/*
+ * Zero the byte range [offset, offset + len) of a file.
+ *
+ * The interior of the range, rounded inwards to the larger of the
+ * filesystem block size and the page cache page size, has its page cache
+ * punched out and is handed to xfs_alloc_file_space() with
+ * XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT.  The unaligned head and tail, or
+ * the whole range when it is too small to have an interior, are zeroed
+ * with xfs_iozero().
+ *
+ * Unless XFS_ATTR_NOLOCK is set in attr_flags, the IOLOCK is taken
+ * exclusively for the duration of the call.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+STATIC int
+xfs_zero_file_space(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     attr_flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       const bool              take_iolock = !(attr_flags & XFS_ATTR_NOLOCK);
+       const xfs_off_t         range_end = offset + len;
+       uint                    granularity;
+       xfs_off_t               start_boundary;
+       xfs_off_t               end_boundary;
+       int                     error;
+
+       granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+       /*
+        * Round the range to convert inwards.  An already aligned offset is
+        * left untouched, so conversion starts at the block it points to.
+        */
+       start_boundary = round_up(offset, granularity);
+       end_boundary = round_down(range_end, granularity);
+
+       ASSERT(start_boundary >= offset);
+       ASSERT(end_boundary <= range_end);
+
+       if (take_iolock)
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       if (start_boundary >= end_boundary - 1) {
+               /*
+                * Either a sub-granularity range, or one that only partially
+                * straddles two adjacent blocks: zero all of it directly.
+                */
+               error = xfs_iozero(ip, offset, len);
+               goto out_unlock;
+       }
+
+       /* punch out the page cache over the conversion range */
+       truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1);
+
+       /* convert the blocks */
+       error = xfs_alloc_file_space(ip, start_boundary,
+                                    end_boundary - start_boundary - 1,
+                                    XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+                                    attr_flags);
+       if (error)
+               goto out_unlock;
+
+       /* the interior is done; zero the unaligned head, if there is one */
+       if (start_boundary != offset) {
+               error = xfs_iozero(ip, offset, start_boundary - offset);
+               if (error)
+                       goto out_unlock;
+       }
+
+       /* ... and the unaligned tail, if there is one */
+       if (end_boundary != range_end)
+               error = xfs_iozero(ip, end_boundary, range_end - end_boundary);
+
+out_unlock:
+       if (take_iolock)
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+
+}
+
+/*
+ * xfs_change_file_space()
+ *      This routine allocates or frees disk space for the given file.
+ *      The user specified parameters are checked for alignment and size
+ *      limitations.
+ *
+ *      ip          inode to operate on; must be a regular file
+ *      cmd         one of the XFS_IOC_{ZERO_RANGE,RESVSP*,UNRESVSP*,
+ *                  ALLOCSP*,FREESP*} commands
+ *      bf          range description.  It is modified in place: l_start is
+ *                  made absolute according to l_whence, l_whence is reset
+ *                  to 0, and l_len is forced to 0 for the ALLOCSP/FREESP
+ *                  commands, which ignore it.
+ *      offset      current file offset, used when l_whence is SEEK_CUR
+ *      attr_flags  XFS_ATTR_* flags.  XFS_ATTR_DMI suppresses the
+ *                  setuid/setgid clearing and timestamp update,
+ *                  XFS_ATTR_SYNC makes the final transaction synchronous;
+ *                  the flags are also passed down to the space helpers.
+ *
+ *      After the space operation succeeds, a second transaction updates the
+ *      inode timestamps, mode bits and the preallocation flag.
+ *
+ * RETURNS:
+ *       0 on success
+ *      errno on error
+ *
+ */
+int
+xfs_change_file_space(
+       xfs_inode_t     *ip,
+       int             cmd,
+       xfs_flock64_t   *bf,
+       xfs_off_t       offset,
+       int             attr_flags)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       int             clrprealloc;
+       int             error;
+       xfs_fsize_t     fsize;
+       int             setprealloc;
+       xfs_off_t       startoffset;
+       xfs_trans_t     *tp;
+       struct iattr    iattr;
+
+       if (!S_ISREG(ip->i_d.di_mode))
+               return XFS_ERROR(EINVAL);
+
+       /* turn l_start into an absolute file offset */
+       switch (bf->l_whence) {
+       case 0: /*SEEK_SET*/
+               break;
+       case 1: /*SEEK_CUR*/
+               bf->l_start += offset;
+               break;
+       case 2: /*SEEK_END*/
+               bf->l_start += XFS_ISIZE(ip);
+               break;
+       default:
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * length of <= 0 for resv/unresv/zero is invalid.  length for
+        * alloc/free is ignored completely and we have no idea what userspace
+        * might have set it to, so set it to zero to allow range
+        * checks to pass.
+        */
+       switch (cmd) {
+       case XFS_IOC_ZERO_RANGE:
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_RESVSP64:
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_UNRESVSP64:
+               if (bf->l_len <= 0)
+                       return XFS_ERROR(EINVAL);
+               break;
+       default:
+               bf->l_len = 0;
+               break;
+       }
+
+       /* reject ranges that are negative or reach the maximum file size */
+       if (bf->l_start < 0 ||
+           bf->l_start > mp->m_super->s_maxbytes ||
+           bf->l_start + bf->l_len < 0 ||
+           bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
+               return XFS_ERROR(EINVAL);
+
+       bf->l_whence = 0;
+
+       startoffset = bf->l_start;
+       fsize = XFS_ISIZE(ip);
+
+       setprealloc = clrprealloc = 0;
+       switch (cmd) {
+       case XFS_IOC_ZERO_RANGE:
+               error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+                                               attr_flags);
+               if (error)
+                       return error;
+               setprealloc = 1;
+               break;
+
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_RESVSP64:
+               error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+                                               XFS_BMAPI_PREALLOC, attr_flags);
+               if (error)
+                       return error;
+               setprealloc = 1;
+               break;
+
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_UNRESVSP64:
+               if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+                                                               attr_flags)))
+                       return error;
+               break;
+
+       case XFS_IOC_ALLOCSP:
+       case XFS_IOC_ALLOCSP64:
+       case XFS_IOC_FREESP:
+       case XFS_IOC_FREESP64:
+               /*
+                * These operations actually do IO when extending the file, but
+                * the allocation is done separately to the zeroing that is
+                * done. This set of operations need to be serialised against
+                * other IO operations, such as truncate and buffered IO. We
+                * need to take the IOLOCK here to serialise the allocation and
+                * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
+                * truncate, direct IO) from racing against the transient
+                * allocated but not written state we can have here.
+                */
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+               if (startoffset > fsize) {
+                       error = xfs_alloc_file_space(ip, fsize,
+                                       startoffset - fsize, 0,
+                                       attr_flags | XFS_ATTR_NOLOCK);
+                       if (error) {
+                               /*
+                                * Return the failure straight away.  Merely
+                                * breaking out of the switch would fall into
+                                * the inode update transaction below, which
+                                * overwrites error and so would hide the
+                                * allocation failure from the caller.
+                                */
+                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+                               return error;
+                       }
+               }
+
+               /* set the new file size while still holding the IOLOCK */
+               iattr.ia_valid = ATTR_SIZE;
+               iattr.ia_size = startoffset;
+
+               error = xfs_setattr_size(ip, &iattr,
+                                        attr_flags | XFS_ATTR_NOLOCK);
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+               if (error)
+                       return error;
+
+               clrprealloc = 1;
+               break;
+
+       default:
+               ASSERT(0);
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * update the inode timestamp, mode, and prealloc flag bits
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       /*
+        * The inode is joined with XFS_ILOCK_EXCL as its lock flags and this
+        * function never drops the ILOCK itself afterwards.
+        */
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       if ((attr_flags & XFS_ATTR_DMI) == 0) {
+               ip->i_d.di_mode &= ~S_ISUID;
+
+               /*
+                * Note that we don't have to worry about mandatory
+                * file locking being disabled here because we only
+                * clear the S_ISGID bit if the Group execute bit is
+                * on, but if it was on then mandatory locking wouldn't
+                * have been enabled.
+                */
+               if (ip->i_d.di_mode & S_IXGRP)
+                       ip->i_d.di_mode &= ~S_ISGID;
+
+               xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       }
+       if (setprealloc)
+               ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+       else if (clrprealloc)
+               ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       if (attr_flags & XFS_ATTR_SYNC)
+               xfs_trans_set_sync(tp);
+       return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * We need to check that the format of the data fork in the temporary inode is
+ * valid for the target inode before doing the swap. This is not a problem with
+ * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
+ * data fork depending on the space the attribute fork is taking so we can get
+ * invalid formats on the target inode.
+ *
+ * E.g. target has space for 7 extents in extent format, temp inode only has
+ * space for 6.  If we defragment down to 7 extents, then the tmp format is a
+ * btree, but when swapped it needs to be in extent format. Hence we can't just
+ * blindly swap data forks on attr2 filesystems.
+ *
+ * Note that we check the swap in both directions so that we don't end up with
+ * a corrupt temporary inode, either.
+ *
+ * Note that fixing the way xfs_fsr sets up the attribute fork in the source
+ * inode will prevent this situation from occurring, so all we do here is
+ * reject and log the attempt. Basically we are putting the responsibility on
+ * userspace to get this right.
+ *
+ * Returns 0 if every check passes, or EINVAL as soon as one of them fails;
+ * the notice for a rejected swap is logged by xfs_swap_extents().
+ */
+static int
+xfs_swap_extents_check_format(
+       xfs_inode_t     *ip,    /* target inode */
+       xfs_inode_t     *tip)   /* tmp inode */
+{
+
+       /* Should never get a local format */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+           tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+               return EINVAL;
+
+       /*
+        * if the target inode has fewer extents than the temporary inode then
+        * why did userspace call us?
+        */
+       if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+               return EINVAL;
+
+       /*
+        * if the target inode is in extent form and the temp inode is in btree
+        * form then we will end up with the target inode in the wrong format
+        * as we already know there are fewer extents in the temp inode.
+        */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               return EINVAL;
+
+       /* Check temp in extent form to max in target */
+       if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+               return EINVAL;
+
+       /* Check target in extent form to max in temp */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+               return EINVAL;
+
+       /*
+        * If we are in a btree format, check that the temp root block will fit
+        * in the target and that it has enough extents to be in btree format
+        * in the target.
+        *
+        * Note that we have to be careful to allow btree->extent conversions
+        * (a common defrag case) which will occur when the temp inode is in
+        * extent format...
+        *
+        * The root size check is skipped when XFS_IFORK_BOFF() of the target
+        * evaluates to zero.
+        */
+       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(ip) &&
+                   XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
+
+       /* Reciprocal target->temp btree format checks */
+       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(tip) &&
+                   XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
+
+       return 0;
+}
+
+int                            /* error, 0 on success */
+xfs_swap_extents(
+       xfs_inode_t     *ip,    /* target inode */
+       xfs_inode_t     *tip,   /* tmp inode */
+       xfs_swapext_t   *sxp)   /* caller's swap request */
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_trans_t     *tp;
+       xfs_bstat_t     *sbp = &sxp->sx_stat;   /* caller's stat snapshot */
+       xfs_ifork_t     *tempifp, *ifp, *tifp;  /* tempifp: swap scratch */
+       int             src_log_flags, target_log_flags;
+       int             error = 0;
+       int             aforkblks = 0;          /* attr fork blocks, ip */
+       int             taforkblks = 0;         /* attr fork blocks, tip */
+       __uint64_t      tmp;                    /* scratch for field swaps */
+
+       /*
+        * We have no way of updating owner information in the BMBT blocks for
+        * each inode on CRC enabled filesystems, so to avoid corrupting this
+        * metadata we simply don't allow extent swaps to occur.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb))
+               return XFS_ERROR(EINVAL);
+
+       tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+       if (!tempifp) {
+               error = XFS_ERROR(ENOMEM);
+               goto out;
+       }
+
+       /*
+        * we have to do two separate lock calls here to keep lockdep
+        * happy. If we try to get all the locks in one call, lockdep will
+        * report false positives when we drop the ILOCK and regain them
+        * below.
+        */
+       xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+       /* Verify that both files have the same file type (S_IFMT bits) */
+       if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       /* Verify both files are either real-time or non-realtime */
+       if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+       if (error)
+               goto out_unlock;
+       truncate_pagecache_range(VFS_I(tip), 0, -1);
+
+       /* Verify O_DIRECT for ftmp: VN_CACHED() must be 0 after the flush */
+       if (VN_CACHED(VFS_I(tip)) != 0) {
+               error = XFS_ERROR(EINVAL);
+               goto out_unlock;
+       }
+
+       /* Verify all data are being swapped */
+       if (sxp->sx_offset != 0 ||
+           sxp->sx_length != ip->i_d.di_size ||
+           sxp->sx_length != tip->i_d.di_size) {
+               error = XFS_ERROR(EFAULT);
+               goto out_unlock;
+       }
+
+       trace_xfs_swap_extent_before(ip, 0);
+       trace_xfs_swap_extent_before(tip, 1);
+
+       /* check inode formats now that data is flushed */
+       error = xfs_swap_extents_check_format(ip, tip);
+       if (error) {
+               xfs_notice(mp,
+                   "%s: inode 0x%llx format is incompatible for exchanging.",
+                               __func__, ip->i_ino);
+               goto out_unlock;
+       }
+
+       /*
+        * Compare the current change & modify times with that
+        * passed in.  If they differ, we abort this swap.
+        * This is the mechanism used to assure the calling
+        * process that the file was not changed out from
+        * under it.
+        */
+       if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+           (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+           (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+           (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+               error = XFS_ERROR(EBUSY);
+               goto out_unlock;
+       }
+
+       /* We need to fail if the file is memory mapped.  Once we have tossed
+        * all existing pages, the page fault will have no option
+        * but to go to the filesystem for pages. By making the page fault call
+        * vop_read (or write in the case of autogrow) they block on the iolock
+        * until we have switched the extents.
+        */
+       if (VN_MAPPED(VFS_I(ip))) {
+               error = XFS_ERROR(EBUSY);
+               goto out_unlock;
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+       /*
+        * There is a race condition here since we gave up the
+        * ilock.  However, the data fork will not change since
+        * we have the iolock (locked for truncation too) so we
+        * are safe.  We don't really care if non-io related
+        * fields change.
+        */
+       truncate_pagecache_range(VFS_I(ip), 0, -1);
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       if (error) {
+               xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
+               xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+               xfs_trans_cancel(tp, 0);
+               goto out;       /* ILOCKs were already dropped above */
+       }
+       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+
+       /*
+        * Count the number of extended attribute blocks. Only the data forks
+        * are swapped, so these counts are used below to keep each inode's
+        * attr fork blocks in its own di_nblocks.
+        */
+       if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+            (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+               error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+               if (error)
+                       goto out_trans_cancel;
+       }
+       if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+            (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+               error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+                       &taforkblks);
+               if (error)
+                       goto out_trans_cancel;
+       }
+
+       /*
+        * Swap the data forks of the inodes
+        */
+       ifp = &ip->i_df;
+       tifp = &tip->i_df;
+       *tempifp = *ifp;        /* struct copy */
+       *ifp = *tifp;           /* struct copy */
+       *tifp = *tempifp;       /* struct copy */
+
+       /*
+        * Fix the on-disk inode values: exchange di_nblocks (adjusted by the
+        * attr fork block counts gathered above), di_nextents and di_format.
+        */
+       tmp = (__uint64_t)ip->i_d.di_nblocks;
+       ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+       tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+       tmp = (__uint64_t) ip->i_d.di_nextents;
+       ip->i_d.di_nextents = tip->i_d.di_nextents;
+       tip->i_d.di_nextents = tmp;
+
+       tmp = (__uint64_t) ip->i_d.di_format;
+       ip->i_d.di_format = tip->i_d.di_format;
+       tip->i_d.di_format = tmp;
+
+       /*
+        * The extents in the source inode could still contain speculative
+        * preallocation beyond EOF (e.g. the file is open but not modified
+        * while defrag is in progress). In that case, we need to copy over the
+        * number of delalloc blocks the data fork in the source inode is
+        * tracking beyond EOF so that when the fork is truncated away when the
+        * temporary inode is unlinked we don't underrun the i_delayed_blks
+        * counter on that inode.
+        */
+       ASSERT(tip->i_delayed_blks == 0);
+       tip->i_delayed_blks = ip->i_delayed_blks;
+       ip->i_delayed_blks = 0;
+
+       src_log_flags = XFS_ILOG_CORE;
+       switch (ip->i_d.di_format) {
+       case XFS_DINODE_FMT_EXTENTS:
+               /* If the extents fit in the inode, fix the
+                * pointer.  Otherwise it's already NULL or
+                * pointing to the extent.
+                */
+               if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+                       ifp->if_u1.if_extents =
+                               ifp->if_u2.if_inline_ext;
+               }
+               src_log_flags |= XFS_ILOG_DEXT;
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               src_log_flags |= XFS_ILOG_DBROOT;
+               break;
+       }
+
+       target_log_flags = XFS_ILOG_CORE;
+       switch (tip->i_d.di_format) {
+       case XFS_DINODE_FMT_EXTENTS:
+               /* If the extents fit in the inode, fix the
+                * pointer.  Otherwise it's already NULL or
+                * pointing to the extent.
+                */
+               if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+                       tifp->if_u1.if_extents =
+                               tifp->if_u2.if_inline_ext;
+               }
+               target_log_flags |= XFS_ILOG_DEXT;
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               target_log_flags |= XFS_ILOG_DBROOT;
+               break;
+       }
+
+
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+       xfs_trans_log_inode(tp, ip,  src_log_flags);
+       xfs_trans_log_inode(tp, tip, target_log_flags);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        *
+        * NOTE(review): nothing below unlocks the inodes on the commit path;
+        * presumably the lock flags given to xfs_trans_ijoin() above make the
+        * commit release them - confirm.
+        */
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(tp);
+
+       error = xfs_trans_commit(tp, 0);
+
+       trace_xfs_swap_extent_after(ip, 0);
+       trace_xfs_swap_extent_after(tip, 1);
+out:
+       kmem_free(tempifp);
+       return error;
+
+out_unlock:
+       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       goto out;
+
+out_trans_cancel:
+       xfs_trans_cancel(tp, 0);
+       goto out_unlock;
+}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644 (file)
index 0000000..0612609
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_UTIL_H__
+#define        __XFS_BMAP_UTIL_H__
+
+/* Kernel only BMAP related definitions and functions */
+
+struct xfs_bmbt_irec;
+struct xfs_bmap_free_item;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+       xfs_fsblock_t           *firstblock; /* i/o first block allocated */
+       struct xfs_bmap_free    *flist; /* bmap freelist */
+       struct xfs_trans        *tp;    /* transaction pointer */
+       struct xfs_inode        *ip;    /* incore inode pointer */
+       struct xfs_bmbt_irec    prev;   /* extent before the new one */
+       struct xfs_bmbt_irec    got;    /* extent after, or delayed */
+
+       xfs_fileoff_t           offset; /* offset in file filling in */
+       xfs_extlen_t            length; /* i/o length asked/allocated */
+       xfs_fsblock_t           blkno;  /* starting block of new extent */
+
+       struct xfs_btree_cur    *cur;   /* btree cursor */
+       xfs_extnum_t            idx;    /* current extent index */
+       int                     nallocs;/* number of extents alloc'd */
+       int                     logflags;/* flags for transaction logging */
+
+       xfs_extlen_t            total;  /* total blocks needed for xaction */
+       xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
+       xfs_extlen_t            minleft; /* amount must be left after alloc */
+       char                    eof;    /* set if allocating past last extent */
+       char                    wasdel; /* replacing a delayed allocation */
+       char                    userdata;/* set if is user data */
+       char                    aeof;   /* allocated space at eof */
+       char                    conv;   /* overwriting unwritten extents */
+       char                    stack_switch; /* presumably: run via worker - confirm */
+       int                     flags;  /* NOTE(review): flag values not shown here */
+       struct completion       *done;  /* presumably signalled by worker - confirm */
+       struct work_struct      work;   /* work item; see stack_switch - confirm use */
+       int                     result; /* presumably the worker's result - confirm */
+};
+
+int    xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+                       int *committed);
+int    xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
+int    xfs_bmapi_allocate(struct xfs_bmalloca *args);
+int    __xfs_bmapi_allocate(struct xfs_bmalloca *args);
+int    xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
+                    int whichfork, int *eof);
+int    xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+                             int whichfork, int *count);
+int    xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+               xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+int    xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
+               xfs_bmap_format_t formatter, void *arg);
+
+/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
+void   xfs_bmap_del_free(struct xfs_bmap_free *flist,
+                         struct xfs_bmap_free_item *prev,
+                         struct xfs_bmap_free_item *free);
+int    xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
+                              struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
+                              int rt, int eof, int delay, int convert,
+                              xfs_fileoff_t *offp, xfs_extlen_t *lenp);
+void   xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+int    xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+                            int whichfork, struct xfs_bmbt_irec *rec,
+                            int *is_empty);
+
+/* preallocation and hole punch interface */
+int    xfs_change_file_space(struct xfs_inode *ip, int cmd,
+                             xfs_flock64_t *bf, xfs_off_t offset,
+                             int attr_flags);
+
+/* EOF block manipulation functions */
+bool   xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+int    xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+                          bool need_iolock);
+
+int    xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+                        struct xfs_swapext *sx);
+
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+
+#endif /* __XFS_BMAP_UTIL_H__ */
index 0903960410a255c171a7b663ffb3ba4af9137074..7a2b4da3c0db9a0f77f19a30cea966848ed60cef 100644 (file)
@@ -510,7 +510,7 @@ xfs_btree_ptr_addr(
 }
 
 /*
- * Get the root block which is stored in the inode.
+ * Get the root block which is stored in the inode.
  *
  * For now this btree implementation assumes the btree root is always
  * stored in the if_broot field of an inode fork.
@@ -978,6 +978,7 @@ xfs_btree_init_block_int(
                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
                        uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
                        buf->bb_u.l.bb_pad = 0;
+                       buf->bb_u.l.bb_lsn = 0;
                }
        } else {
                /* owner is a 32 bit value on short blocks */
@@ -989,6 +990,7 @@ xfs_btree_init_block_int(
                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
                        uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+                       buf->bb_u.s.bb_lsn = 0;
                }
        }
 }
@@ -1684,7 +1686,7 @@ xfs_lookup_get_search_key(
 
 /*
  * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * stat is set to 0 if can't find any such record, 1 for success.
  */
 int                                    /* error */
 xfs_btree_lookup(
@@ -2756,7 +2758,6 @@ xfs_btree_make_block_unfull(
 
                if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
                        /* A root block that can be made bigger. */
-
                        xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
                } else {
                        /* A root block that needs replacing */
index 55e3c7cc3c3d3f22178fb1feb8aab44679821172..c8473c7ef45e4c764fd61eb1bf6419cb1d98f4ea 100644 (file)
@@ -88,13 +88,11 @@ struct xfs_btree_block {
 #define XFS_BTREE_SBLOCK_CRC_LEN       (XFS_BTREE_SBLOCK_LEN + 40)
 #define XFS_BTREE_LBLOCK_CRC_LEN       (XFS_BTREE_LBLOCK_LEN + 48)
 
-
 #define XFS_BTREE_SBLOCK_CRC_OFF \
        offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
 #define XFS_BTREE_LBLOCK_CRC_OFF \
        offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
 
-
 /*
  * Generic key, ptr and record wrapper structures.
  *
index 1b2472a46e46b96e31e0615f670120218ed7cf24..c06823fe10d3559c143c3608b04815efd6252631 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/freezer.h>
 
 #include "xfs_sb.h"
+#include "xfs_trans_resv.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -303,7 +304,7 @@ _xfs_buf_free_pages(
  *     Releases the specified buffer.
  *
  *     The modification state of any associated pages is left unchanged.
- *     The buffer most not be on any hash - use xfs_buf_rele instead for
+ *     The buffer must not be on any hash - use xfs_buf_rele instead for
  *     hashed and refcounted buffers
  */
 void
@@ -1621,7 +1622,7 @@ xfs_setsize_buftarg_flags(
 /*
  *     When allocating the initial buffer target we have not yet
  *     read in the superblock, so don't know what sized sectors
- *     are being used is at this early stage.  Play safe.
+ *     are being used at this early stage.  Play safe.
  */
 STATIC int
 xfs_setsize_buftarg_early(
index bfc4e0c26fd3404fb36f007da344be79543aea4c..3a944b198e35a0fbfc758a84fd39a2e0674d360f 100644 (file)
@@ -39,6 +39,14 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
 
 STATIC void    xfs_buf_do_callbacks(struct xfs_buf *bp);
 
+static inline int              /* bytes, up to the last used map word */
+xfs_buf_log_format_size(
+       struct xfs_buf_log_format *blfp)
+{
+       return offsetof(struct xfs_buf_log_format, blf_data_map) +
+                       (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+}
+
 /*
  * This returns the number of log iovecs needed to log the
  * given buf log item.
@@ -49,25 +57,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
  *
  * If the XFS_BLI_STALE flag has been set, then log nothing.
  */
-STATIC uint
+STATIC void
 xfs_buf_item_size_segment(
        struct xfs_buf_log_item *bip,
-       struct xfs_buf_log_format *blfp)
+       struct xfs_buf_log_format *blfp,
+       int                     *nvecs,
+       int                     *nbytes)
 {
        struct xfs_buf          *bp = bip->bli_buf;
-       uint                    nvecs;
        int                     next_bit;
        int                     last_bit;
 
        last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
        if (last_bit == -1)
-               return 0;
+               return;
 
        /*
         * initial count for a dirty buffer is 2 vectors - the format structure
         * and the first dirty region.
         */
-       nvecs = 2;
+       *nvecs += 2;
+       *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
 
        while (last_bit != -1) {
                /*
@@ -87,18 +97,17 @@ xfs_buf_item_size_segment(
                        break;
                } else if (next_bit != last_bit + 1) {
                        last_bit = next_bit;
-                       nvecs++;
+                       (*nvecs)++;
                } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
                           (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
                            XFS_BLF_CHUNK)) {
                        last_bit = next_bit;
-                       nvecs++;
+                       (*nvecs)++;
                } else {
                        last_bit++;
                }
+               *nbytes += XFS_BLF_CHUNK;
        }
-
-       return nvecs;
 }
 
 /*
@@ -118,12 +127,13 @@ xfs_buf_item_size_segment(
  * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
  * format structures.
  */
-STATIC uint
+STATIC void
 xfs_buf_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
 {
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
-       uint                    nvecs;
        int                     i;
 
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -135,7 +145,11 @@ xfs_buf_item_size(
                 */
                trace_xfs_buf_item_size_stale(bip);
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
-               return bip->bli_format_count;
+               *nvecs += bip->bli_format_count;
+               for (i = 0; i < bip->bli_format_count; i++) {
+                       *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
+               }
+               return;
        }
 
        ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
@@ -147,7 +161,8 @@ xfs_buf_item_size(
                 * commit, so no vectors are used at all.
                 */
                trace_xfs_buf_item_size_ordered(bip);
-               return XFS_LOG_VEC_ORDERED;
+               *nvecs = XFS_LOG_VEC_ORDERED;
+               return;
        }
 
        /*
@@ -159,13 +174,11 @@ xfs_buf_item_size(
         * count for the extra buf log format structure that will need to be
         * written.
         */
-       nvecs = 0;
        for (i = 0; i < bip->bli_format_count; i++) {
-               nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]);
+               xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
+                                         nvecs, nbytes);
        }
-
        trace_xfs_buf_item_size(bip);
-       return nvecs;
 }
 
 static struct xfs_log_iovec *
@@ -192,8 +205,7 @@ xfs_buf_item_format_segment(
         * the actual size of the dirty bitmap rather than the size of the in
         * memory structure.
         */
-       base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
-                       (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
+       base_size = xfs_buf_log_format_size(blfp);
 
        nvecs = 0;
        first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
@@ -601,11 +613,9 @@ xfs_buf_item_unlock(
                        }
                }
        }
-       if (clean)
-               xfs_buf_item_relse(bp);
-       else if (aborted) {
+       if (clean || aborted) {
                if (atomic_dec_and_test(&bip->bli_refcount)) {
-                       ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                       ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp));
                        xfs_buf_item_relse(bp);
                }
        } else
index 0f1c247dc680031fe06554a4bd41f6f962290a53..db6371087fe8ea9b786f82189b387c55979a2460 100644 (file)
 #ifndef        __XFS_BUF_ITEM_H__
 #define        __XFS_BUF_ITEM_H__
 
-extern kmem_zone_t     *xfs_buf_item_zone;
-
-/*
- * This flag indicates that the buffer contains on disk inodes
- * and requires special recovery handling.
- */
-#define        XFS_BLF_INODE_BUF       (1<<0)
-/*
- * This flag indicates that the buffer should not be replayed
- * during recovery because its blocks are being freed.
- */
-#define        XFS_BLF_CANCEL          (1<<1)
-
-/*
- * This flag indicates that the buffer contains on disk
- * user or group dquots and may require special recovery handling.
- */
-#define        XFS_BLF_UDQUOT_BUF      (1<<2)
-#define XFS_BLF_PDQUOT_BUF     (1<<3)
-#define        XFS_BLF_GDQUOT_BUF      (1<<4)
-
-#define        XFS_BLF_CHUNK           128
-#define        XFS_BLF_SHIFT           7
-#define        BIT_TO_WORD_SHIFT       5
-#define        NBWORD                  (NBBY * sizeof(unsigned int))
-
-/*
- * This is the structure used to lay out a buf log item in the
- * log.  The data map describes which 128 byte chunks of the buffer
- * have been logged.
- */
-#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+/* kernel only definitions */
 
-typedef struct xfs_buf_log_format {
-       unsigned short  blf_type;       /* buf log item type indicator */
-       unsigned short  blf_size;       /* size of this item */
-       ushort          blf_flags;      /* misc state */
-       ushort          blf_len;        /* number of blocks in this buf */
-       __int64_t       blf_blkno;      /* starting blkno of this buf */
-       unsigned int    blf_map_size;   /* used size of data bitmap in words */
-       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
-
-/*
- * All buffers now need to tell recovery where the magic number
- * is so that it can verify and calculate the CRCs on the buffer correctly
- * once the changes have been replayed into the buffer.
- *
- * The type value is held in the upper 5 bits of the blf_flags field, which is
- * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
- */
-#define XFS_BLFT_BITS  5
-#define XFS_BLFT_SHIFT 11
-#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
-
-enum xfs_blft {
-       XFS_BLFT_UNKNOWN_BUF = 0,
-       XFS_BLFT_UDQUOT_BUF,
-       XFS_BLFT_PDQUOT_BUF,
-       XFS_BLFT_GDQUOT_BUF,
-       XFS_BLFT_BTREE_BUF,
-       XFS_BLFT_AGF_BUF,
-       XFS_BLFT_AGFL_BUF,
-       XFS_BLFT_AGI_BUF,
-       XFS_BLFT_DINO_BUF,
-       XFS_BLFT_SYMLINK_BUF,
-       XFS_BLFT_DIR_BLOCK_BUF,
-       XFS_BLFT_DIR_DATA_BUF,
-       XFS_BLFT_DIR_FREE_BUF,
-       XFS_BLFT_DIR_LEAF1_BUF,
-       XFS_BLFT_DIR_LEAFN_BUF,
-       XFS_BLFT_DA_NODE_BUF,
-       XFS_BLFT_ATTR_LEAF_BUF,
-       XFS_BLFT_ATTR_RMT_BUF,
-       XFS_BLFT_SB_BUF,
-       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
-};
-
-static inline void
-xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
-{
-       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
-       blf->blf_flags &= ~XFS_BLFT_MASK;
-       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
-}
-
-static inline __uint16_t
-xfs_blft_from_flags(struct xfs_buf_log_format *blf)
-{
-       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
-}
-
-/*
- * buf log item flags
- */
+/* buf log item flags */
 #define        XFS_BLI_HOLD            0x01
 #define        XFS_BLI_DIRTY           0x02
 #define        XFS_BLI_STALE           0x04
@@ -133,8 +41,6 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
        { XFS_BLI_ORDERED,      "ORDERED" }
 
 
-#ifdef __KERNEL__
-
 struct xfs_buf;
 struct xfs_mount;
 struct xfs_buf_log_item;
@@ -169,6 +75,6 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
                               enum xfs_blft);
 void   xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
 
-#endif /* __KERNEL__ */
+extern kmem_zone_t     *xfs_buf_item_zone;
 
 #endif /* __XFS_BUF_ITEM_H__ */
index 0b8b2a13cd24debe493c8982679a2c565ebae5a1..d4e59a4ff59ff1600cf8c9a83ce4b36f47ddfcd0 100644 (file)
@@ -27,8 +27,8 @@
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
@@ -399,7 +399,7 @@ xfs_da3_split(
        struct xfs_da_intnode   *node;
        struct xfs_buf          *bp;
        int                     max;
-       int                     action;
+       int                     action = 0;
        int                     error;
        int                     i;
 
@@ -2454,9 +2454,9 @@ static int
 xfs_buf_map_from_irec(
        struct xfs_mount        *mp,
        struct xfs_buf_map      **mapp,
-       unsigned int            *nmaps,
+       int                     *nmaps,
        struct xfs_bmbt_irec    *irecs,
-       unsigned int            nirecs)
+       int                     nirecs)
 {
        struct xfs_buf_map      *map;
        int                     i;
index 6fb3371c63cf3db535ea84cd6d62c31445b8c4f2..b1f267995dea32e97dc041c7dd14af77e1630dc8 100644 (file)
@@ -133,12 +133,19 @@ extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
                                     struct xfs_da3_icnode_hdr *from);
 
 static inline int
-xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+__xfs_da3_node_hdr_size(bool v3)
 {
-       if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC))
+       if (v3)
                return sizeof(struct xfs_da3_node_hdr);
        return sizeof(struct xfs_da_node_hdr);
 }
+static inline int
+xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+{
+       bool    v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
+
+       return __xfs_da3_node_hdr_size(v3);
+}
 
 static inline struct xfs_da_node_entry *
 xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
@@ -176,6 +183,7 @@ enum xfs_dacmp {
 typedef struct xfs_da_args {
        const __uint8_t *name;          /* string (maybe not NULL terminated) */
        int             namelen;        /* length of string (maybe no NULL) */
+       __uint8_t       filetype;       /* filetype of inode for directories */
        __uint8_t       *value;         /* set of bytes (maybe contain NULLs) */
        int             valuelen;       /* length of value */
        int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644 (file)
index e36445c..0000000
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_itable.h"
-#include "xfs_dfrag.h"
-#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-static int xfs_swap_extents(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip,   /* tmp inode */
-       xfs_swapext_t   *sxp);
-
-/*
- * ioctl interface for swapext
- */
-int
-xfs_swapext(
-       xfs_swapext_t   *sxp)
-{
-       xfs_inode_t     *ip, *tip;
-       struct fd       f, tmp;
-       int             error = 0;
-
-       /* Pull information for the target fd */
-       f = fdget((int)sxp->sx_fdtarget);
-       if (!f.file) {
-               error = XFS_ERROR(EINVAL);
-               goto out;
-       }
-
-       if (!(f.file->f_mode & FMODE_WRITE) ||
-           !(f.file->f_mode & FMODE_READ) ||
-           (f.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
-               goto out_put_file;
-       }
-
-       tmp = fdget((int)sxp->sx_fdtmp);
-       if (!tmp.file) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_file;
-       }
-
-       if (!(tmp.file->f_mode & FMODE_WRITE) ||
-           !(tmp.file->f_mode & FMODE_READ) ||
-           (tmp.file->f_flags & O_APPEND)) {
-               error = XFS_ERROR(EBADF);
-               goto out_put_tmp_file;
-       }
-
-       if (IS_SWAPFILE(file_inode(f.file)) ||
-           IS_SWAPFILE(file_inode(tmp.file))) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       ip = XFS_I(file_inode(f.file));
-       tip = XFS_I(file_inode(tmp.file));
-
-       if (ip->i_mount != tip->i_mount) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       if (ip->i_ino == tip->i_ino) {
-               error = XFS_ERROR(EINVAL);
-               goto out_put_tmp_file;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               error = XFS_ERROR(EIO);
-               goto out_put_tmp_file;
-       }
-
-       error = xfs_swap_extents(ip, tip, sxp);
-
- out_put_tmp_file:
-       fdput(tmp);
- out_put_file:
-       fdput(f);
- out:
-       return error;
-}
-
-/*
- * We need to check that the format of the data fork in the temporary inode is
- * valid for the target inode before doing the swap. This is not a problem with
- * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
- * data fork depending on the space the attribute fork is taking so we can get
- * invalid formats on the target inode.
- *
- * E.g. target has space for 7 extents in extent format, temp inode only has
- * space for 6.  If we defragment down to 7 extents, then the tmp format is a
- * btree, but when swapped it needs to be in extent format. Hence we can't just
- * blindly swap data forks on attr2 filesystems.
- *
- * Note that we check the swap in both directions so that we don't end up with
- * a corrupt temporary inode, either.
- *
- * Note that fixing the way xfs_fsr sets up the attribute fork in the source
- * inode will prevent this situation from occurring, so all we do here is
- * reject and log the attempt. basically we are putting the responsibility on
- * userspace to get this right.
- */
-static int
-xfs_swap_extents_check_format(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip)   /* tmp inode */
-{
-
-       /* Should never get a local format */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
-           tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-               return EINVAL;
-
-       /*
-        * if the target inode has less extents that then temporary inode then
-        * why did userspace call us?
-        */
-       if (ip->i_d.di_nextents < tip->i_d.di_nextents)
-               return EINVAL;
-
-       /*
-        * if the target inode is in extent form and the temp inode is in btree
-        * form then we will end up with the target inode in the wrong format
-        * as we already know there are less extents in the temp inode.
-        */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
-               return EINVAL;
-
-       /* Check temp in extent form to max in target */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
-                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-               return EINVAL;
-
-       /* Check target in extent form to max in temp */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
-                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-               return EINVAL;
-
-       /*
-        * If we are in a btree format, check that the temp root block will fit
-        * in the target and that it has enough extents to be in btree format
-        * in the target.
-        *
-        * Note that we have to be careful to allow btree->extent conversions
-        * (a common defrag case) which will occur when the temp inode is in
-        * extent format...
-        */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-               if (XFS_IFORK_BOFF(ip) &&
-                   XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
-                       return EINVAL;
-               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
-                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
-                       return EINVAL;
-       }
-
-       /* Reciprocal target->temp btree format checks */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-               if (XFS_IFORK_BOFF(tip) &&
-                   XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
-                       return EINVAL;
-               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
-                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
-                       return EINVAL;
-       }
-
-       return 0;
-}
-
-static int
-xfs_swap_extents(
-       xfs_inode_t     *ip,    /* target inode */
-       xfs_inode_t     *tip,   /* tmp inode */
-       xfs_swapext_t   *sxp)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_trans_t     *tp;
-       xfs_bstat_t     *sbp = &sxp->sx_stat;
-       xfs_ifork_t     *tempifp, *ifp, *tifp;
-       int             src_log_flags, target_log_flags;
-       int             error = 0;
-       int             aforkblks = 0;
-       int             taforkblks = 0;
-       __uint64_t      tmp;
-
-       /*
-        * We have no way of updating owner information in the BMBT blocks for
-        * each inode on CRC enabled filesystems, so to avoid corrupting the
-        * this metadata we simply don't allow extent swaps to occur.
-        */
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return XFS_ERROR(EINVAL);
-
-       tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-       if (!tempifp) {
-               error = XFS_ERROR(ENOMEM);
-               goto out;
-       }
-
-       /*
-        * we have to do two separate lock calls here to keep lockdep
-        * happy. If we try to get all the locks in one call, lock will
-        * report false positives when we drop the ILOCK and regain them
-        * below.
-        */
-       xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
-       /* Verify that both files have the same format */
-       if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       /* Verify both files are either real-time or non-realtime */
-       if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
-       if (error)
-               goto out_unlock;
-       truncate_pagecache_range(VFS_I(tip), 0, -1);
-
-       /* Verify O_DIRECT for ftmp */
-       if (VN_CACHED(VFS_I(tip)) != 0) {
-               error = XFS_ERROR(EINVAL);
-               goto out_unlock;
-       }
-
-       /* Verify all data are being swapped */
-       if (sxp->sx_offset != 0 ||
-           sxp->sx_length != ip->i_d.di_size ||
-           sxp->sx_length != tip->i_d.di_size) {
-               error = XFS_ERROR(EFAULT);
-               goto out_unlock;
-       }
-
-       trace_xfs_swap_extent_before(ip, 0);
-       trace_xfs_swap_extent_before(tip, 1);
-
-       /* check inode formats now that data is flushed */
-       error = xfs_swap_extents_check_format(ip, tip);
-       if (error) {
-               xfs_notice(mp,
-                   "%s: inode 0x%llx format is incompatible for exchanging.",
-                               __func__, ip->i_ino);
-               goto out_unlock;
-       }
-
-       /*
-        * Compare the current change & modify times with that
-        * passed in.  If they differ, we abort this swap.
-        * This is the mechanism used to ensure the calling
-        * process that the file was not changed out from
-        * under it.
-        */
-       if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
-           (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
-           (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-           (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       /* We need to fail if the file is memory mapped.  Once we have tossed
-        * all existing pages, the page fault will have no option
-        * but to go to the filesystem for pages. By making the page fault call
-        * vop_read (or write in the case of autogrow) they block on the iolock
-        * until we have switched the extents.
-        */
-       if (VN_MAPPED(VFS_I(ip))) {
-               error = XFS_ERROR(EBUSY);
-               goto out_unlock;
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL);
-
-       /*
-        * There is a race condition here since we gave up the
-        * ilock.  However, the data fork will not change since
-        * we have the iolock (locked for truncation too) so we
-        * are safe.  We don't really care if non-io related
-        * fields change.
-        */
-       truncate_pagecache_range(VFS_I(ip), 0, -1);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
-       if ((error = xfs_trans_reserve(tp, 0,
-                                    XFS_ICHANGE_LOG_RES(mp), 0,
-                                    0, 0))) {
-               xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
-               xfs_iunlock(tip, XFS_IOLOCK_EXCL);
-               xfs_trans_cancel(tp, 0);
-               goto out;
-       }
-       xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-
-       /*
-        * Count the number of extended attribute blocks
-        */
-       if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
-            (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-               error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
-               if (error)
-                       goto out_trans_cancel;
-       }
-       if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
-            (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-               error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
-                       &taforkblks);
-               if (error)
-                       goto out_trans_cancel;
-       }
-
-       /*
-        * Swap the data forks of the inodes
-        */
-       ifp = &ip->i_df;
-       tifp = &tip->i_df;
-       *tempifp = *ifp;        /* struct copy */
-       *ifp = *tifp;           /* struct copy */
-       *tifp = *tempifp;       /* struct copy */
-
-       /*
-        * Fix the on-disk inode values
-        */
-       tmp = (__uint64_t)ip->i_d.di_nblocks;
-       ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
-       tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
-
-       tmp = (__uint64_t) ip->i_d.di_nextents;
-       ip->i_d.di_nextents = tip->i_d.di_nextents;
-       tip->i_d.di_nextents = tmp;
-
-       tmp = (__uint64_t) ip->i_d.di_format;
-       ip->i_d.di_format = tip->i_d.di_format;
-       tip->i_d.di_format = tmp;
-
-       /*
-        * The extents in the source inode could still contain speculative
-        * preallocation beyond EOF (e.g. the file is open but not modified
-        * while defrag is in progress). In that case, we need to copy over the
-        * number of delalloc blocks the data fork in the source inode is
-        * tracking beyond EOF so that when the fork is truncated away when the
-        * temporary inode is unlinked we don't underrun the i_delayed_blks
-        * counter on that inode.
-        */
-       ASSERT(tip->i_delayed_blks == 0);
-       tip->i_delayed_blks = ip->i_delayed_blks;
-       ip->i_delayed_blks = 0;
-
-       src_log_flags = XFS_ILOG_CORE;
-       switch (ip->i_d.di_format) {
-       case XFS_DINODE_FMT_EXTENTS:
-               /* If the extents fit in the inode, fix the
-                * pointer.  Otherwise it's already NULL or
-                * pointing to the extent.
-                */
-               if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-                       ifp->if_u1.if_extents =
-                               ifp->if_u2.if_inline_ext;
-               }
-               src_log_flags |= XFS_ILOG_DEXT;
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               src_log_flags |= XFS_ILOG_DBROOT;
-               break;
-       }
-
-       target_log_flags = XFS_ILOG_CORE;
-       switch (tip->i_d.di_format) {
-       case XFS_DINODE_FMT_EXTENTS:
-               /* If the extents fit in the inode, fix the
-                * pointer.  Otherwise it's already NULL or
-                * pointing to the extent.
-                */
-               if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
-                       tifp->if_u1.if_extents =
-                               tifp->if_u2.if_inline_ext;
-               }
-               target_log_flags |= XFS_ILOG_DEXT;
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               target_log_flags |= XFS_ILOG_DBROOT;
-               break;
-       }
-
-
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-       xfs_trans_log_inode(tp, ip,  src_log_flags);
-       xfs_trans_log_inode(tp, tip, target_log_flags);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * transaction goes to disk before returning to the user.
-        */
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(tp);
-
-       error = xfs_trans_commit(tp, 0);
-
-       trace_xfs_swap_extent_after(ip, 0);
-       trace_xfs_swap_extent_after(tip, 1);
-out:
-       kmem_free(tempifp);
-       return error;
-
-out_unlock:
-       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       goto out;
-
-out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
-       goto out_unlock;
-}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644 (file)
index 20bdd93..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DFRAG_H__
-#define        __XFS_DFRAG_H__
-
-/*
- * Structure passed to xfs_swapext
- */
-
-typedef struct xfs_swapext
-{
-       __int64_t       sx_version;     /* version */
-       __int64_t       sx_fdtarget;    /* fd of target file */
-       __int64_t       sx_fdtmp;       /* fd of tmp file */
-       xfs_off_t       sx_offset;      /* offset into file */
-       xfs_off_t       sx_length;      /* leng from offset */
-       char            sx_pad[16];     /* pad space, unused */
-       xfs_bstat_t     sx_stat;        /* stat of target b4 copy */
-} xfs_swapext_t;
-
-/*
- * Version flag
- */
-#define XFS_SX_VERSION         0
-
-#ifdef __KERNEL__
-/*
- * Prototypes for visible xfs_dfrag.c routines.
- */
-
-/*
- * Syscall interface for xfs_swapext
- */
-int    xfs_swapext(struct xfs_swapext *sx);
-
-#endif /* __KERNEL__ */
-
-#endif /* __XFS_DFRAG_H__ */
index 8f023dee404da0da9c2092ba15d5ad904588d5e4..edf203ab50afa734a74faaace8e626721e7fc1bb 100644 (file)
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
-#include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+
 
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -90,6 +90,9 @@ void
 xfs_dir_mount(
        xfs_mount_t     *mp)
 {
+       int     nodehdr_size;
+
+
        ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
        ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
               XFS_MAX_BLOCKSIZE);
@@ -98,12 +101,13 @@ xfs_dir_mount(
        mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
        mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
        mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
-       mp->m_attr_node_ents =
-               (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
-               (uint)sizeof(xfs_da_node_entry_t);
-       mp->m_dir_node_ents =
-               (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
-               (uint)sizeof(xfs_da_node_entry_t);
+
+       nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+       mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+       mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
+                               (uint)sizeof(xfs_da_node_entry_t);
+
        mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
        if (xfs_sb_version_hasasciici(&mp->m_sb))
                mp->m_dirnameops = &xfs_ascii_ci_nameops;
@@ -209,6 +213,7 @@ xfs_dir_createname(
        memset(&args, 0, sizeof(xfs_da_args_t));
        args.name = name->name;
        args.namelen = name->len;
+       args.filetype = name->type;
        args.hashval = dp->i_mount->m_dirnameops->hashname(name);
        args.inumber = inum;
        args.dp = dp;
@@ -283,6 +288,7 @@ xfs_dir_lookup(
        memset(&args, 0, sizeof(xfs_da_args_t));
        args.name = name->name;
        args.namelen = name->len;
+       args.filetype = name->type;
        args.hashval = dp->i_mount->m_dirnameops->hashname(name);
        args.dp = dp;
        args.whichfork = XFS_DATA_FORK;
@@ -338,6 +344,7 @@ xfs_dir_removename(
        memset(&args, 0, sizeof(xfs_da_args_t));
        args.name = name->name;
        args.namelen = name->len;
+       args.filetype = name->type;
        args.hashval = dp->i_mount->m_dirnameops->hashname(name);
        args.inumber = ino;
        args.dp = dp;
@@ -362,37 +369,6 @@ xfs_dir_removename(
        return rval;
 }
 
-/*
- * Read a directory.
- */
-int
-xfs_readdir(
-       xfs_inode_t     *dp,
-       struct dir_context *ctx,
-       size_t          bufsize)
-{
-       int             rval;           /* return value */
-       int             v;              /* type-checking value */
-
-       trace_xfs_readdir(dp);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
-
-       ASSERT(S_ISDIR(dp->i_d.di_mode));
-       XFS_STATS_INC(xs_dir_getdents);
-
-       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-               rval = xfs_dir2_sf_getdents(dp, ctx);
-       else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
-               ;
-       else if (v)
-               rval = xfs_dir2_block_getdents(dp, ctx);
-       else
-               rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
-       return rval;
-}
-
 /*
  * Replace the inode number of a directory entry.
  */
@@ -418,6 +394,7 @@ xfs_dir_replace(
        memset(&args, 0, sizeof(xfs_da_args_t));
        args.name = name->name;
        args.namelen = name->len;
+       args.filetype = name->type;
        args.hashval = dp->i_mount->m_dirnameops->hashname(name);
        args.inumber = inum;
        args.dp = dp;
@@ -465,6 +442,7 @@ xfs_dir_canenter(
        memset(&args, 0, sizeof(xfs_da_args_t));
        args.name = name->name;
        args.namelen = name->len;
+       args.filetype = name->type;
        args.hashval = dp->i_mount->m_dirnameops->hashname(name);
        args.dp = dp;
        args.whichfork = XFS_DATA_FORK;
index e937d9991c1850c5427ecae7419011c8395d09c3..9910401327d45c788586b2e0953309b81c8c7c6d 100644 (file)
@@ -23,6 +23,11 @@ struct xfs_da_args;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
 
 extern struct xfs_name xfs_name_dotdot;
 
@@ -57,4 +62,45 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
  */
 extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
 
+/*
+ * Interface routines used by userspace utilities
+ */
+extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
+extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
+               xfs_ino_t ino);
+extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
+               struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
+extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
+               struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
+               xfs_ino_t ino);
+
+extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+                               struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+               struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
+               struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
+               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+               int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
+               struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
+               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+               struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
 #endif /* __XFS_DIR2_H__ */
index 5e7fbd72cf5255c53a8494a991986c339ea5293a..0957aa98b6c0d6bb1fdc3fa23161d761aeb58567 100644 (file)
@@ -31,8 +31,8 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
-#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -126,7 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
        .verify_write = xfs_dir3_block_write_verify,
 };
 
-static int
+int
 xfs_dir3_block_read(
        struct xfs_trans        *tp,
        struct xfs_inode        *dp,
@@ -369,7 +369,7 @@ xfs_dir2_block_addname(
        if (error)
                return error;
 
-       len = xfs_dir2_data_entsize(args->namelen);
+       len = xfs_dir3_data_entsize(mp, args->namelen);
 
        /*
         * Set up pointers to parts of the block.
@@ -549,7 +549,8 @@ xfs_dir2_block_addname(
        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, args->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
        /*
         * Clean up the bestfree array and log the header, tail, and entry.
@@ -564,101 +565,6 @@ xfs_dir2_block_addname(
        return 0;
 }
 
-/*
- * Readdir for block directories.
- */
-int                                            /* error */
-xfs_dir2_block_getdents(
-       xfs_inode_t             *dp,            /* incore inode */
-       struct dir_context      *ctx)
-{
-       xfs_dir2_data_hdr_t     *hdr;           /* block header */
-       struct xfs_buf          *bp;            /* buffer for block */
-       xfs_dir2_block_tail_t   *btp;           /* block tail */
-       xfs_dir2_data_entry_t   *dep;           /* block data entry */
-       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
-       char                    *endptr;        /* end of the data entries */
-       int                     error;          /* error return value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       char                    *ptr;           /* current data entry */
-       int                     wantoff;        /* starting block offset */
-       xfs_off_t               cook;
-
-       mp = dp->i_mount;
-       /*
-        * If the block number in the offset is out of range, we're done.
-        */
-       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
-               return 0;
-
-       error = xfs_dir3_block_read(NULL, dp, &bp);
-       if (error)
-               return error;
-
-       /*
-        * Extract the byte offset we start at from the seek pointer.
-        * We'll skip entries before this.
-        */
-       wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
-       hdr = bp->b_addr;
-       xfs_dir3_data_check(dp, bp);
-       /*
-        * Set up values for the loop.
-        */
-       btp = xfs_dir2_block_tail_p(mp, hdr);
-       ptr = (char *)xfs_dir3_data_entry_p(hdr);
-       endptr = (char *)xfs_dir2_block_leaf_p(btp);
-
-       /*
-        * Loop over the data portion of the block.
-        * Each object is a real entry (dep) or an unused one (dup).
-        */
-       while (ptr < endptr) {
-               dup = (xfs_dir2_data_unused_t *)ptr;
-               /*
-                * Unused, skip it.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       ptr += be16_to_cpu(dup->length);
-                       continue;
-               }
-
-               dep = (xfs_dir2_data_entry_t *)ptr;
-
-               /*
-                * Bump pointer for the next iteration.
-                */
-               ptr += xfs_dir2_data_entsize(dep->namelen);
-               /*
-                * The entry is before the desired starting point, skip it.
-                */
-               if ((char *)dep - (char *)hdr < wantoff)
-                       continue;
-
-               cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                           (char *)dep - (char *)hdr);
-
-               ctx->pos = cook & 0x7fffffff;
-               /*
-                * If it didn't fit, set the final offset to here & return.
-                */
-               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
-                           be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
-                       xfs_trans_brelse(NULL, bp);
-                       return 0;
-               }
-       }
-
-       /*
-        * Reached the end of the block.
-        * Set the offset to a non-existent block 1 and return.
-        */
-       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
-                       0x7fffffff;
-       xfs_trans_brelse(NULL, bp);
-       return 0;
-}
-
 /*
  * Log leaf entries from the block.
  */
@@ -736,6 +642,7 @@ xfs_dir2_block_lookup(
         * Fill in inode number, CI name if appropriate, release the block.
         */
        args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
        error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
        xfs_trans_brelse(args->trans, bp);
        return XFS_ERROR(error);
@@ -894,7 +801,7 @@ xfs_dir2_block_removename(
        needlog = needscan = 0;
        xfs_dir2_data_make_free(tp, bp,
                (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
        /*
         * Fix up the block tail.
         */
@@ -968,6 +875,7 @@ xfs_dir2_block_replace(
         * Change the inode number to the new value.
         */
        dep->inumber = cpu_to_be64(args->inumber);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
        xfs_dir2_data_log_entry(args->trans, bp, dep);
        xfs_dir3_data_check(dp, bp);
        return 0;
@@ -1254,7 +1162,8 @@ xfs_dir2_sf_to_block(
        dep->inumber = cpu_to_be64(dp->i_ino);
        dep->namelen = 1;
        dep->name[0] = '.';
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
        xfs_dir2_data_log_entry(tp, bp, dep);
        blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
@@ -1267,7 +1176,8 @@ xfs_dir2_sf_to_block(
        dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
        dep->namelen = 2;
        dep->name[0] = dep->name[1] = '.';
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
        xfs_dir2_data_log_entry(tp, bp, dep);
        blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
@@ -1312,10 +1222,12 @@ xfs_dir2_sf_to_block(
                 * Copy a real entry.
                 */
                dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
-               dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep));
+               dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
                dep->namelen = sfep->namelen;
+               xfs_dir3_dirent_put_ftype(mp, dep,
+                                       xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
                memcpy(dep->name, sfep->name, dep->namelen);
-               tagp = xfs_dir2_data_entry_tag_p(dep);
+               tagp = xfs_dir3_data_entry_tag_p(mp, dep);
                *tagp = cpu_to_be16((char *)dep - (char *)hdr);
                xfs_dir2_data_log_entry(tp, bp, dep);
                name.name = sfep->name;
@@ -1328,7 +1240,7 @@ xfs_dir2_sf_to_block(
                if (++i == sfp->count)
                        sfep = NULL;
                else
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
        }
        /* Done with the temporary buffer */
        kmem_free(sfp);
index c2930238005c6605c1e69e4dc455b9ed7267f3da..47e1326c169a08c71d8d21ac51e8d113444d3295 100644 (file)
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_cksum.h"
 
-STATIC xfs_dir2_data_free_t *
-xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
-
 /*
  * Check the consistency of the data block.
  * The input can also be a block-format directory.
@@ -149,8 +147,10 @@ __xfs_dir3_data_check(
                XFS_WANT_CORRUPTED_RETURN(
                        !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
                XFS_WANT_CORRUPTED_RETURN(
-                       be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+                       be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
                                               (char *)dep - (char *)hdr);
+               XFS_WANT_CORRUPTED_RETURN(
+                       xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
                count++;
                lastfree = 0;
                if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -168,7 +168,7 @@ __xfs_dir3_data_check(
                        }
                        XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
                }
-               p += xfs_dir2_data_entsize(dep->namelen);
+               p += xfs_dir3_data_entsize(mp, dep->namelen);
        }
        /*
         * Need to have seen all the entries and all the bestfree slots.
@@ -325,7 +325,7 @@ xfs_dir3_data_readahead(
  * Given a data block and an unused entry from that block,
  * return the bestfree entry if any that corresponds to it.
  */
-STATIC xfs_dir2_data_free_t *
+xfs_dir2_data_free_t *
 xfs_dir2_data_freefind(
        xfs_dir2_data_hdr_t     *hdr,           /* data block */
        xfs_dir2_data_unused_t  *dup)           /* data unused entry */
@@ -333,7 +333,7 @@ xfs_dir2_data_freefind(
        xfs_dir2_data_free_t    *dfp;           /* bestfree entry */
        xfs_dir2_data_aoff_t    off;            /* offset value needed */
        struct xfs_dir2_data_free *bf;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
        int                     matched;        /* matched the value */
        int                     seenzero;       /* saw a 0 bestfree entry */
 #endif
@@ -341,7 +341,7 @@ xfs_dir2_data_freefind(
        off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
        bf = xfs_dir3_data_bestfree_p(hdr);
 
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
        /*
         * Validate some consistency in the bestfree table.
         * Check order, non-overlapping entries, and if we find the
@@ -538,8 +538,8 @@ xfs_dir2_data_freescan(
                else {
                        dep = (xfs_dir2_data_entry_t *)p;
                        ASSERT((char *)dep - (char *)hdr ==
-                              be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)));
-                       p += xfs_dir2_data_entsize(dep->namelen);
+                              be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
+                       p += xfs_dir3_data_entsize(mp, dep->namelen);
                }
        }
 }
@@ -629,7 +629,8 @@ xfs_dir2_data_log_entry(
        struct xfs_buf          *bp,
        xfs_dir2_data_entry_t   *dep)           /* data entry pointer */
 {
-       xfs_dir2_data_hdr_t     *hdr = bp->b_addr;
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+       struct xfs_mount        *mp = tp->t_mountp;
 
        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
               hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -637,7 +638,7 @@ xfs_dir2_data_log_entry(
               hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
        xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
-               (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
+               (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
                       (char *)hdr - 1));
 }
 
index 7826782b8d789461eef5a5443ae1d29944887815..a0961a61ac1a9a419fd537cd47dbb7337fbbc3a0 100644 (file)
 #define        XFS_DIR3_DATA_MAGIC     0x58444433      /* XDD3: multiblock dirs */
 #define        XFS_DIR3_FREE_MAGIC     0x58444633      /* XDF3: free index blocks */
 
+/*
+ * Dirents in version 3 directories have a file type field. Additions to this
+ * list are an on-disk format change, requiring feature bits. Valid values
+ * are as follows:
+ */
+#define XFS_DIR3_FT_UNKNOWN            0
+#define XFS_DIR3_FT_REG_FILE           1
+#define XFS_DIR3_FT_DIR                        2
+#define XFS_DIR3_FT_CHRDEV             3
+#define XFS_DIR3_FT_BLKDEV             4
+#define XFS_DIR3_FT_FIFO               5
+#define XFS_DIR3_FT_SOCK               6
+#define XFS_DIR3_FT_SYMLINK            7
+#define XFS_DIR3_FT_WHT                        8
+
+#define XFS_DIR3_FT_MAX                        9
+
 /*
  * Byte offset in data block and shortform entry.
  */
@@ -138,6 +155,9 @@ typedef struct xfs_dir2_sf_entry {
        xfs_dir2_sf_off_t       offset;         /* saved offset */
        __u8                    name[];         /* name, variable size */
        /*
+        * A single byte containing the file type field follows the name for
+        * version 3 directory entries.
+        *
         * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
         * variable offset after the name.
         */
@@ -162,16 +182,6 @@ xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
        put_unaligned_be16(off, &sfep->offset.i);
 }
 
-static inline int
-xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len)
-{
-       return sizeof(struct xfs_dir2_sf_entry) +       /* namelen + offset */
-               len +                                   /* name */
-               (hdr->i8count ?                         /* ino */
-                sizeof(xfs_dir2_ino8_t) :
-                sizeof(xfs_dir2_ino4_t));
-}
-
 static inline struct xfs_dir2_sf_entry *
 xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
 {
@@ -179,14 +189,78 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
                ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
 }
 
+static inline int
+xfs_dir3_sf_entsize(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       int                     len)
+{
+       int count = sizeof(struct xfs_dir2_sf_entry);   /* namelen + offset */
+
+       count += len;                                   /* name */
+       count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+                               sizeof(xfs_dir2_ino4_t); /* ino # */
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               count += sizeof(__uint8_t);             /* file type */
+       return count;
+}
+
 static inline struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr,
-               struct xfs_dir2_sf_entry *sfep)
+xfs_dir3_sf_nextentry(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
 {
        return (struct xfs_dir2_sf_entry *)
-               ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+               ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
 }
 
+/*
+ * In dir3 shortform directories, the file type field is stored directly after
+ * the variable-length name. Because it's only a single byte, endian
+ * conversion is not necessary.
+ */
+static inline __uint8_t *
+xfs_dir3_sfe_ftypep(
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       return (__uint8_t *)&sfep->name[sfep->namelen];
+}
+
+static inline __uint8_t
+xfs_dir3_sfe_get_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep)
+{
+       __uint8_t       *ftp;
+
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return XFS_DIR3_FT_UNKNOWN;
+
+       ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+       if (*ftp >= XFS_DIR3_FT_MAX)
+               return XFS_DIR3_FT_UNKNOWN;
+       return *ftp;
+}
+
+static inline void
+xfs_dir3_sfe_put_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_sf_hdr  *hdr,
+       struct xfs_dir2_sf_entry *sfep,
+       __uint8_t               ftype)
+{
+       __uint8_t       *ftp;
+
+       ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return;
+       ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
+       *ftp = ftype;
+}
 
 /*
  * Data block structures.
@@ -286,12 +360,18 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
  * Active entry in a data block.
  *
  * Aligned to 8 bytes.  After the variable length name field there is a
- * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p.
+ * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
+ *
+ * For dir3 structures, there is a file type field between the name and the
+ * tag. This can only be manipulated by helper functions. It is packed hard
+ * against the end of the name so any padding for rounding is between the file
+ * type and the tag.
  */
 typedef struct xfs_dir2_data_entry {
        __be64                  inumber;        /* inode number */
        __u8                    namelen;        /* name length */
        __u8                    name[];         /* name bytes, no null */
+     /* __u8                   filetype; */    /* type of inode we point to */
      /*        __be16                  tag; */         /* starting offset of us */
 } xfs_dir2_data_entry_t;
 
@@ -311,20 +391,67 @@ typedef struct xfs_dir2_data_unused {
 /*
  * Size of a data entry.
  */
-static inline int xfs_dir2_data_entsize(int n)
+static inline int
+__xfs_dir3_data_entsize(
+       bool    ftype,
+       int     n)
 {
-       return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n +
-                (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN);
+       int     size = offsetof(struct xfs_dir2_data_entry, name[0]);
+
+       size += n;
+       size += sizeof(xfs_dir2_data_off_t);
+       if (ftype)
+               size += sizeof(__uint8_t);
+       return roundup(size, XFS_DIR2_DATA_ALIGN);
+}
+static inline int
+xfs_dir3_data_entsize(
+       struct xfs_mount        *mp,
+       int                     n)
+{
+       bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
+       return __xfs_dir3_data_entsize(ftype, n);
+}
+
+static inline __uint8_t
+xfs_dir3_dirent_get_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep)
+{
+       if (xfs_sb_version_hasftype(&mp->m_sb)) {
+               __uint8_t       type = dep->name[dep->namelen];
+
+               ASSERT(type < XFS_DIR3_FT_MAX);
+               if (type < XFS_DIR3_FT_MAX)
+                       return type;
+
+       }
+       return XFS_DIR3_FT_UNKNOWN;
+}
+
+static inline void
+xfs_dir3_dirent_put_ftype(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep,
+       __uint8_t               type)
+{
+       ASSERT(type < XFS_DIR3_FT_MAX);
+       ASSERT(dep->namelen != 0);
+
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               dep->name[dep->namelen] = type;
 }
 
 /*
  * Pointer to an entry's tag word.
  */
 static inline __be16 *
-xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep)
+xfs_dir3_data_entry_tag_p(
+       struct xfs_mount        *mp,
+       struct xfs_dir2_data_entry *dep)
 {
        return (__be16 *)((char *)dep +
-               xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+               xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
 }
 
 /*
@@ -375,13 +502,17 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
  * data block header because the sfe embeds the block offset of the entry into
  * it so that it doesn't change when format conversion occurs. Bad Things Happen
  * if we don't follow this rule.
+ *
+ * XXX: there is scope for significant optimisation of the logic here. Right
+ * now we are checking for "dir3 format" over and over again. Ideally we should
+ * only do it once for each operation.
  */
 #define        XFS_DIR3_DATA_DOT_OFFSET(mp)    \
        xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
 #define        XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
-       (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1))
+       (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
 #define        XFS_DIR3_DATA_FIRST_OFFSET(mp)          \
-       (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2))
+       (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
 
 static inline xfs_dir2_data_aoff_t
 xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
@@ -392,13 +523,19 @@ xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
 static inline xfs_dir2_data_aoff_t
 xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
 {
-       return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1);
+       bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+       return xfs_dir3_data_dot_offset(hdr) +
+               __xfs_dir3_data_entsize(dir3, 1);
 }
 
 static inline xfs_dir2_data_aoff_t
 xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
 {
-       return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2);
+       bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+                   hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+       return xfs_dir3_data_dotdot_offset(hdr) +
+               __xfs_dir3_data_entsize(dir3, 2);
 }
 
 /*
@@ -519,6 +656,9 @@ struct xfs_dir3_leaf {
 
 #define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
 
+extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
+                                       struct xfs_dir2_leaf *from);
+
 static inline int
 xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
 {
index 2aed25cae04d9f265df00aba8e5d63a624350a0f..08984eeee159c5d790bdce648caf4fe8cb4ea676 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -695,7 +696,7 @@ xfs_dir2_leaf_addname(
        ents = xfs_dir3_leaf_ents_p(leaf);
        xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
        bestsp = xfs_dir2_leaf_bests_p(ltp);
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
 
        /*
         * See if there are any entries with the same hash value
@@ -896,7 +897,8 @@ xfs_dir2_leaf_addname(
        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, dep->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
        /*
         * Need to scan fix up the bestfree table.
@@ -1083,396 +1085,6 @@ xfs_dir3_leaf_compact_x1(
        *highstalep = highstale;
 }
 
-struct xfs_dir2_leaf_map_info {
-       xfs_extlen_t    map_blocks;     /* number of fsbs in map */
-       xfs_dablk_t     map_off;        /* last mapped file offset */
-       int             map_size;       /* total entries in *map */
-       int             map_valid;      /* valid entries in *map */
-       int             nmap;           /* mappings to ask xfs_bmapi */
-       xfs_dir2_db_t   curdb;          /* db for current block */
-       int             ra_current;     /* number of read-ahead blks */
-       int             ra_index;       /* *map index for read-ahead */
-       int             ra_offset;      /* map entry offset for ra */
-       int             ra_want;        /* readahead count wanted */
-       struct xfs_bmbt_irec map[];     /* map vector for blocks */
-};
-
-STATIC int
-xfs_dir2_leaf_readbuf(
-       struct xfs_inode        *dp,
-       size_t                  bufsize,
-       struct xfs_dir2_leaf_map_info *mip,
-       xfs_dir2_off_t          *curoff,
-       struct xfs_buf          **bpp)
-{
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_buf          *bp = *bpp;
-       struct xfs_bmbt_irec    *map = mip->map;
-       struct blk_plug         plug;
-       int                     error = 0;
-       int                     length;
-       int                     i;
-       int                     j;
-
-       /*
-        * If we have a buffer, we need to release it and
-        * take it out of the mapping.
-        */
-
-       if (bp) {
-               xfs_trans_brelse(NULL, bp);
-               bp = NULL;
-               mip->map_blocks -= mp->m_dirblkfsbs;
-               /*
-                * Loop to get rid of the extents for the
-                * directory block.
-                */
-               for (i = mp->m_dirblkfsbs; i > 0; ) {
-                       j = min_t(int, map->br_blockcount, i);
-                       map->br_blockcount -= j;
-                       map->br_startblock += j;
-                       map->br_startoff += j;
-                       /*
-                        * If mapping is done, pitch it from
-                        * the table.
-                        */
-                       if (!map->br_blockcount && --mip->map_valid)
-                               memmove(&map[0], &map[1],
-                                       sizeof(map[0]) * mip->map_valid);
-                       i -= j;
-               }
-       }
-
-       /*
-        * Recalculate the readahead blocks wanted.
-        */
-       mip->ra_want = howmany(bufsize + mp->m_dirblksize,
-                              mp->m_sb.sb_blocksize) - 1;
-       ASSERT(mip->ra_want >= 0);
-
-       /*
-        * If we don't have as many as we want, and we haven't
-        * run out of data blocks, get some more mappings.
-        */
-       if (1 + mip->ra_want > mip->map_blocks &&
-           mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
-               /*
-                * Get more bmaps, fill in after the ones
-                * we already have in the table.
-                */
-               mip->nmap = mip->map_size - mip->map_valid;
-               error = xfs_bmapi_read(dp, mip->map_off,
-                               xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
-                                                               mip->map_off,
-                               &map[mip->map_valid], &mip->nmap, 0);
-
-               /*
-                * Don't know if we should ignore this or try to return an
-                * error.  The trouble with returning errors is that readdir
-                * will just stop without actually passing the error through.
-                */
-               if (error)
-                       goto out;       /* XXX */
-
-               /*
-                * If we got all the mappings we asked for, set the final map
-                * offset based on the last bmap value received.  Otherwise,
-                * we've reached the end.
-                */
-               if (mip->nmap == mip->map_size - mip->map_valid) {
-                       i = mip->map_valid + mip->nmap - 1;
-                       mip->map_off = map[i].br_startoff + map[i].br_blockcount;
-               } else
-                       mip->map_off = xfs_dir2_byte_to_da(mp,
-                                                       XFS_DIR2_LEAF_OFFSET);
-
-               /*
-                * Look for holes in the mapping, and eliminate them.  Count up
-                * the valid blocks.
-                */
-               for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
-                       if (map[i].br_startblock == HOLESTARTBLOCK) {
-                               mip->nmap--;
-                               length = mip->map_valid + mip->nmap - i;
-                               if (length)
-                                       memmove(&map[i], &map[i + 1],
-                                               sizeof(map[i]) * length);
-                       } else {
-                               mip->map_blocks += map[i].br_blockcount;
-                               i++;
-                       }
-               }
-               mip->map_valid += mip->nmap;
-       }
-
-       /*
-        * No valid mappings, so no more data blocks.
-        */
-       if (!mip->map_valid) {
-               *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
-               goto out;
-       }
-
-       /*
-        * Read the directory block starting at the first mapping.
-        */
-       mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
-       error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
-                       map->br_blockcount >= mp->m_dirblkfsbs ?
-                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
-
-       /*
-        * Should just skip over the data block instead of giving up.
-        */
-       if (error)
-               goto out;       /* XXX */
-
-       /*
-        * Adjust the current amount of read-ahead: we just read a block that
-        * was previously ra.
-        */
-       if (mip->ra_current)
-               mip->ra_current -= mp->m_dirblkfsbs;
-
-       /*
-        * Do we need more readahead?
-        */
-       blk_start_plug(&plug);
-       for (mip->ra_index = mip->ra_offset = i = 0;
-            mip->ra_want > mip->ra_current && i < mip->map_blocks;
-            i += mp->m_dirblkfsbs) {
-               ASSERT(mip->ra_index < mip->map_valid);
-               /*
-                * Read-ahead a contiguous directory block.
-                */
-               if (i > mip->ra_current &&
-                   map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
-                       xfs_dir3_data_readahead(NULL, dp,
-                               map[mip->ra_index].br_startoff + mip->ra_offset,
-                               XFS_FSB_TO_DADDR(mp,
-                                       map[mip->ra_index].br_startblock +
-                                                       mip->ra_offset));
-                       mip->ra_current = i;
-               }
-
-               /*
-                * Read-ahead a non-contiguous directory block.  This doesn't
-                * use our mapping, but this is a very rare case.
-                */
-               else if (i > mip->ra_current) {
-                       xfs_dir3_data_readahead(NULL, dp,
-                                       map[mip->ra_index].br_startoff +
-                                                       mip->ra_offset, -1);
-                       mip->ra_current = i;
-               }
-
-               /*
-                * Advance offset through the mapping table.
-                */
-               for (j = 0; j < mp->m_dirblkfsbs; j++) {
-                       /*
-                        * The rest of this extent but not more than a dir
-                        * block.
-                        */
-                       length = min_t(int, mp->m_dirblkfsbs,
-                                       map[mip->ra_index].br_blockcount -
-                                                       mip->ra_offset);
-                       j += length;
-                       mip->ra_offset += length;
-
-                       /*
-                        * Advance to the next mapping if this one is used up.
-                        */
-                       if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
-                               mip->ra_offset = 0;
-                               mip->ra_index++;
-                       }
-               }
-       }
-       blk_finish_plug(&plug);
-
-out:
-       *bpp = bp;
-       return error;
-}
-
-/*
- * Getdents (readdir) for leaf and node directories.
- * This reads the data blocks only, so is the same for both forms.
- */
-int                                            /* error */
-xfs_dir2_leaf_getdents(
-       xfs_inode_t             *dp,            /* incore directory inode */
-       struct dir_context      *ctx,
-       size_t                  bufsize)
-{
-       struct xfs_buf          *bp = NULL;     /* data block buffer */
-       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
-       xfs_dir2_data_entry_t   *dep;           /* data entry */
-       xfs_dir2_data_unused_t  *dup;           /* unused entry */
-       int                     error = 0;      /* error return value */
-       int                     length;         /* temporary length value */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       int                     byteoff;        /* offset in current block */
-       xfs_dir2_off_t          curoff;         /* current overall offset */
-       xfs_dir2_off_t          newoff;         /* new curoff after new blk */
-       char                    *ptr = NULL;    /* pointer to current data */
-       struct xfs_dir2_leaf_map_info *map_info;
-
-       /*
-        * If the offset is at or past the largest allowed value,
-        * give up right away.
-        */
-       if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
-               return 0;
-
-       mp = dp->i_mount;
-
-       /*
-        * Set up to bmap a number of blocks based on the caller's
-        * buffer size, the directory block size, and the filesystem
-        * block size.
-        */
-       length = howmany(bufsize + mp->m_dirblksize,
-                                    mp->m_sb.sb_blocksize);
-       map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
-                               (length * sizeof(struct xfs_bmbt_irec)),
-                              KM_SLEEP | KM_NOFS);
-       map_info->map_size = length;
-
-       /*
-        * Inside the loop we keep the main offset value as a byte offset
-        * in the directory file.
-        */
-       curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
-
-       /*
-        * Force this conversion through db so we truncate the offset
-        * down to get the start of the data block.
-        */
-       map_info->map_off = xfs_dir2_db_to_da(mp,
-                                             xfs_dir2_byte_to_db(mp, curoff));
-
-       /*
-        * Loop over directory entries until we reach the end offset.
-        * Get more blocks and readahead as necessary.
-        */
-       while (curoff < XFS_DIR2_LEAF_OFFSET) {
-               /*
-                * If we have no buffer, or we're off the end of the
-                * current buffer, need to get another one.
-                */
-               if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
-
-                       error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
-                                                     &curoff, &bp);
-                       if (error || !map_info->map_valid)
-                               break;
-
-                       /*
-                        * Having done a read, we need to set a new offset.
-                        */
-                       newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
-                       /*
-                        * Start of the current block.
-                        */
-                       if (curoff < newoff)
-                               curoff = newoff;
-                       /*
-                        * Make sure we're in the right block.
-                        */
-                       else if (curoff > newoff)
-                               ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
-                                      map_info->curdb);
-                       hdr = bp->b_addr;
-                       xfs_dir3_data_check(dp, bp);
-                       /*
-                        * Find our position in the block.
-                        */
-                       ptr = (char *)xfs_dir3_data_entry_p(hdr);
-                       byteoff = xfs_dir2_byte_to_off(mp, curoff);
-                       /*
-                        * Skip past the header.
-                        */
-                       if (byteoff == 0)
-                               curoff += xfs_dir3_data_entry_offset(hdr);
-                       /*
-                        * Skip past entries until we reach our offset.
-                        */
-                       else {
-                               while ((char *)ptr - (char *)hdr < byteoff) {
-                                       dup = (xfs_dir2_data_unused_t *)ptr;
-
-                                       if (be16_to_cpu(dup->freetag)
-                                                 == XFS_DIR2_DATA_FREE_TAG) {
-
-                                               length = be16_to_cpu(dup->length);
-                                               ptr += length;
-                                               continue;
-                                       }
-                                       dep = (xfs_dir2_data_entry_t *)ptr;
-                                       length =
-                                          xfs_dir2_data_entsize(dep->namelen);
-                                       ptr += length;
-                               }
-                               /*
-                                * Now set our real offset.
-                                */
-                               curoff =
-                                       xfs_dir2_db_off_to_byte(mp,
-                                           xfs_dir2_byte_to_db(mp, curoff),
-                                           (char *)ptr - (char *)hdr);
-                               if (ptr >= (char *)hdr + mp->m_dirblksize) {
-                                       continue;
-                               }
-                       }
-               }
-               /*
-                * We have a pointer to an entry.
-                * Is it a live one?
-                */
-               dup = (xfs_dir2_data_unused_t *)ptr;
-               /*
-                * No, it's unused, skip over it.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       length = be16_to_cpu(dup->length);
-                       ptr += length;
-                       curoff += length;
-                       continue;
-               }
-
-               dep = (xfs_dir2_data_entry_t *)ptr;
-               length = xfs_dir2_data_entsize(dep->namelen);
-
-               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
-                           be64_to_cpu(dep->inumber), DT_UNKNOWN))
-                       break;
-
-               /*
-                * Advance to next entry in the block.
-                */
-               ptr += length;
-               curoff += length;
-               /* bufsize may have just been a guess; don't go negative */
-               bufsize = bufsize > length ? bufsize - length : 0;
-       }
-
-       /*
-        * All done.  Set output offset value to current offset.
-        */
-       if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-               ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
-       else
-               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
-       kmem_free(map_info);
-       if (bp)
-               xfs_trans_brelse(NULL, bp);
-       return error;
-}
-
-
 /*
  * Log the bests entries indicated from a leaf1 block.
  */
@@ -1614,6 +1226,7 @@ xfs_dir2_leaf_lookup(
         * Return the found inode number & CI name if appropriate
         */
        args->inumber = be64_to_cpu(dep->inumber);
+       args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
        error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
        xfs_trans_brelse(tp, dbp);
        xfs_trans_brelse(tp, lbp);
@@ -1816,7 +1429,7 @@ xfs_dir2_leaf_removename(
         */
        xfs_dir2_data_make_free(tp, dbp,
                (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
        /*
         * We just mark the leaf entry stale by putting a null in it.
         */
@@ -1944,6 +1557,7 @@ xfs_dir2_leaf_replace(
         * Put the new inode number in, log it.
         */
        dep->inumber = cpu_to_be64(args->inumber);
+       xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
        tp = args->trans;
        xfs_dir2_data_log_entry(tp, dbp, dep);
        xfs_dir3_leaf_check(dp->i_mount, lbp);
@@ -1975,10 +1589,6 @@ xfs_dir2_leaf_search_hash(
        ents = xfs_dir3_leaf_ents_p(leaf);
        xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 
-#ifndef __KERNEL__
-       if (!leafhdr.count)
-               return 0;
-#endif
        /*
         * Note, the table cannot be empty, so we have to go through the loop.
         * Binary search the leaf entries looking for our hash value.
index 2226a00acd156118a2998ce37c2c95ae628503d9..4c3dba7ffb7439d250b0af44e4de237a9359a795 100644 (file)
@@ -30,6 +30,7 @@
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
@@ -312,11 +313,13 @@ xfs_dir2_free_log_header(
        struct xfs_trans        *tp,
        struct xfs_buf          *bp)
 {
+#ifdef DEBUG
        xfs_dir2_free_t         *free;          /* freespace structure */
 
        free = bp->b_addr;
        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
               free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+#endif
        xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
 }
 
@@ -602,7 +605,7 @@ xfs_dir2_leafn_lookup_for_addname(
                ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
                       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
        }
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
        /*
         * Loop over leaf entries with the right hash value.
         */
@@ -813,6 +816,7 @@ xfs_dir2_leafn_lookup_for_entry(
                                xfs_trans_brelse(tp, state->extrablk.bp);
                        args->cmpresult = cmp;
                        args->inumber = be64_to_cpu(dep->inumber);
+                       args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
                        *indexp = index;
                        state->extravalid = 1;
                        state->extrablk.bp = curbp;
@@ -1256,7 +1260,7 @@ xfs_dir2_leafn_remove(
        longest = be16_to_cpu(bf[0].length);
        needlog = needscan = 0;
        xfs_dir2_data_make_free(tp, dbp, off,
-               xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
+               xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
        /*
         * Rescan the data block freespaces for bestfree.
         * Log the data block header if needed.
@@ -1708,7 +1712,7 @@ xfs_dir2_node_addname_int(
        dp = args->dp;
        mp = dp->i_mount;
        tp = args->trans;
-       length = xfs_dir2_data_entsize(args->namelen);
+       length = xfs_dir3_data_entsize(mp, args->namelen);
        /*
         * If we came in with a freespace block that means that lookup
         * found an entry with our hash value.  This is the freespace
@@ -2004,7 +2008,8 @@ xfs_dir2_node_addname_int(
        dep->inumber = cpu_to_be64(args->inumber);
        dep->namelen = args->namelen;
        memcpy(dep->name, args->name, dep->namelen);
-       tagp = xfs_dir2_data_entry_tag_p(dep);
+       xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
+       tagp = xfs_dir3_data_entry_tag_p(mp, dep);
        *tagp = cpu_to_be16((char *)dep - (char *)hdr);
        xfs_dir2_data_log_entry(tp, dbp, dep);
        /*
@@ -2224,6 +2229,7 @@ xfs_dir2_node_replace(
                 * Fill in the new inode number and log the entry.
                 */
                dep->inumber = cpu_to_be64(inum);
+               xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
                xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
                rval = 0;
        }
index 0511cda4a712a480682b946829c5587cf37c4070..1bad84c408295ae4db2de71d96c74b18fff5b2b2 100644 (file)
 #ifndef __XFS_DIR2_PRIV_H__
 #define __XFS_DIR2_PRIV_H__
 
+struct dir_context;
+
 /* xfs_dir2.c */
 extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
                                xfs_dir2_db_t *dbp);
-extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
-                               struct xfs_buf *bp);
 extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
                                const unsigned char *name, int len);
 
-/* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+#define S_SHIFT 12
+extern const unsigned char xfs_mode_to_ftype[];
+
+extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
+                                       __uint8_t filetype);
 
+
+/* xfs_dir2_block.c */
+extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                              struct xfs_buf **bpp);
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
-               struct dir_context *ctx);
 extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_block_removename(struct xfs_da_args *args);
 extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -48,9 +51,6 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 #define        xfs_dir3_data_check(dp,bp)
 #endif
 
-extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
-
 extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
 extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
                xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -60,27 +60,10 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
                struct xfs_dir2_data_unused *dup, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
-               struct xfs_dir2_data_hdr *hdr, int *loghead);
 extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
                struct xfs_buf **bpp);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
-               struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
-               xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
-               int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
-               struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
-               xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
-extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
-
 extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
                xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
@@ -91,8 +74,6 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
 extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
                struct xfs_dir2_leaf_entry *ents, int *indexp,
                int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
-               size_t bufsize);
 extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
                struct xfs_buf **bpp, __uint16_t magic);
 extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -144,18 +125,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
                xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
-               struct xfs_dir2_sf_entry *sfep);
 extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
                struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
 extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
                int size, xfs_dir2_sf_hdr_t *sfhp);
 extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
 extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
 
+/* xfs_dir2_readdir.c */
+extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
+                      size_t bufsize);
+
 #endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644 (file)
index 0000000..8993ec1
--- /dev/null
@@ -0,0 +1,695 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+
+/*
+ * Directory file type support functions
+ */
+
+/*
+ * Lookup table indexed by the file type value passed to
+ * xfs_dir3_get_dtype(), yielding the DT_* code to report.  The table is
+ * only ever read, so it is declared const.
+ */
+static const unsigned char xfs_dir3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
+       DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
+};
+
+/*
+ * Translate a directory entry file type into a DT_* value.
+ *
+ * Returns DT_UNKNOWN when xfs_sb_version_hasftype() reports false for the
+ * mount's superblock, or when @filetype is not below XFS_DIR3_FT_MAX;
+ * otherwise returns the table entry for @filetype.
+ */
+unsigned char
+xfs_dir3_get_dtype(
+       struct xfs_mount        *mp,
+       __uint8_t               filetype)
+{
+       if (!xfs_sb_version_hasftype(&mp->m_sb))
+               return DT_UNKNOWN;
+
+       /* Reject values that would index past the end of the table. */
+       if (filetype >= XFS_DIR3_FT_MAX)
+               return DT_UNKNOWN;
+
+       return xfs_dir3_filetype_table[filetype];
+}
+/*
+ * Lookup table from file mode type bits to XFS_DIR3_FT_* values.  It is
+ * indexed by an S_IF* constant shifted right by S_SHIFT, as the designated
+ * initialisers below show.
+ *
+ * @mode, if set, indicates that the type field needs to be set up.
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h
+ * for file type specification. This will be propagated into the directory
+ * structure if appropriate for the given operation and filesystem config.
+ *
+ * NOTE(review): "@mode" presumably refers to the mode supplied by the
+ * callers that index this table; no such parameter is visible here - confirm.
+ */
+const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
+       [0]                     = XFS_DIR3_FT_UNKNOWN,
+       [S_IFREG >> S_SHIFT]    = XFS_DIR3_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    = XFS_DIR3_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    = XFS_DIR3_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    = XFS_DIR3_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    = XFS_DIR3_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   = XFS_DIR3_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    = XFS_DIR3_FT_SYMLINK,
+};
+
+/*
+ * Readdir for shortform directories, whose entries are read straight from
+ * dp->i_df.if_u1.if_data.
+ *
+ * Emits ".", ".." and then every shortform entry whose offset cookie is not
+ * below ctx->pos, updating ctx->pos as it goes.  Returns XFS_ERROR(EIO) if
+ * the inode size is smaller than the shortform header up to its parent
+ * field, otherwise 0.
+ */
+STATIC int
+xfs_dir2_sf_getdents(
+       xfs_inode_t             *dp,            /* incore directory inode */
+       struct dir_context      *ctx)
+{
+       int                     i;              /* shortform entry number */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       xfs_dir2_dataptr_t      off;            /* current entry's offset */
+       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
+       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       xfs_dir2_dataptr_t      dot_offset;
+       xfs_dir2_dataptr_t      dotdot_offset;
+       xfs_ino_t               ino;
+
+       mp = dp->i_mount;
+
+       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+       /*
+        * Give up if the directory is way too short.
+        */
+       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               return XFS_ERROR(EIO);
+       }
+
+       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+       ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+
+       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+
+       /*
+        * If the block number in the offset is out of range, we're done.
+        */
+       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+               return 0;
+
+       /*
+        * Precalculate offsets for . and .. as we will always need them.
+        *
+        * XXX(hch): the second argument is sometimes 0 and sometimes
+        * mp->m_dirdatablk.
+        */
+       dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                            XFS_DIR3_DATA_DOT_OFFSET(mp));
+       dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                               XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
+
+       /*
+        * Put . entry unless we're starting past it.
+        *
+        * ctx->pos is set to the entry's cookie before dir_emit() is called,
+        * so if dir_emit() declines the entry a later call starts again at
+        * this same entry.
+        */
+       if (ctx->pos <= dot_offset) {
+               ctx->pos = dot_offset & 0x7fffffff;
+               if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
+                       return 0;
+       }
+
+       /*
+        * Put .. entry unless we're starting past it.
+        */
+       if (ctx->pos <= dotdot_offset) {
+               ino = xfs_dir2_sf_get_parent_ino(sfp);
+               ctx->pos = dotdot_offset & 0x7fffffff;
+               if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
+                       return 0;
+       }
+
+       /*
+        * Loop while there are more entries and put'ing works.
+        */
+       sfep = xfs_dir2_sf_firstentry(sfp);
+       for (i = 0; i < sfp->count; i++) {
+               __uint8_t filetype;
+
+               off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                               xfs_dir2_sf_get_offset(sfep));
+
+               /* Entry lies before the requested position: step over it. */
+               if (ctx->pos > off) {
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+                       continue;
+               }
+
+               ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
+               filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
+               ctx->pos = off & 0x7fffffff;
+               if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
+                           xfs_dir3_get_dtype(mp, filetype)))
+                       return 0;
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+       }
+
+       /*
+        * All entries were emitted.  Leave ctx->pos at offset 0 of the block
+        * following m_dirdatablk; presumably this makes the out-of-range
+        * check near the top of this function end the next call immediately.
+        */
+       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                       0x7fffffff;
+       return 0;
+}
+
+/*
+ * Readdir for block directories.
+ *
+ * Reads the directory block with xfs_dir3_block_read() and walks its data
+ * area, passing every live entry at or beyond the offset encoded in
+ * ctx->pos to dir_emit().  Returns the error from the block read, or 0.
+ * The buffer is released on every path that follows a successful read.
+ */
+STATIC int
+xfs_dir2_block_getdents(
+       xfs_inode_t             *dp,            /* incore inode */
+       struct dir_context      *ctx)
+{
+       xfs_dir2_data_hdr_t     *hdr;           /* block header */
+       struct xfs_buf          *bp;            /* buffer for block */
+       xfs_dir2_block_tail_t   *btp;           /* block tail */
+       xfs_dir2_data_entry_t   *dep;           /* block data entry */
+       xfs_dir2_data_unused_t  *dup;           /* block unused entry */
+       char                    *endptr;        /* end of the data entries */
+       int                     error;          /* error return value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       char                    *ptr;           /* current data entry */
+       int                     wantoff;        /* starting block offset */
+       xfs_off_t               cook;
+
+       mp = dp->i_mount;
+       /*
+        * If the block number in the offset is out of range, we're done.
+        */
+       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+               return 0;
+
+       error = xfs_dir3_block_read(NULL, dp, &bp);
+       if (error)
+               return error;
+
+       /*
+        * Extract the byte offset we start at from the seek pointer.
+        * We'll skip entries before this.
+        */
+       wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
+       hdr = bp->b_addr;
+       xfs_dir3_data_check(dp, bp);
+       /*
+        * Set up values for the loop.
+        */
+       btp = xfs_dir2_block_tail_p(mp, hdr);
+       ptr = (char *)xfs_dir3_data_entry_p(hdr);
+       endptr = (char *)xfs_dir2_block_leaf_p(btp);
+
+       /*
+        * Loop over the data portion of the block.
+        * Each object is a real entry (dep) or an unused one (dup).
+        */
+       while (ptr < endptr) {
+               __uint8_t filetype;
+
+               dup = (xfs_dir2_data_unused_t *)ptr;
+               /*
+                * Unused, skip it.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       ptr += be16_to_cpu(dup->length);
+                       continue;
+               }
+
+               dep = (xfs_dir2_data_entry_t *)ptr;
+
+               /*
+                * Bump pointer for the next iteration.
+                */
+               ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+               /*
+                * The entry is before the desired starting point, skip it.
+                */
+               if ((char *)dep - (char *)hdr < wantoff)
+                       continue;
+
+               cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+                                           (char *)dep - (char *)hdr);
+
+               /*
+                * Record this entry's cookie before emitting it, so that
+                * ctx->pos still names this entry if dir_emit() declines it.
+                */
+               ctx->pos = cook & 0x7fffffff;
+               filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+               /*
+                * If it didn't fit, set the final offset to here & return.
+                */
+               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+                           be64_to_cpu(dep->inumber),
+                           xfs_dir3_get_dtype(mp, filetype))) {
+                       xfs_trans_brelse(NULL, bp);
+                       return 0;
+               }
+       }
+
+       /*
+        * Reached the end of the block.
+        * Set the offset to the start of the block after m_dirdatablk, which
+        * presumably trips the out-of-range check at the top of this function
+        * on the next call, and return.
+        */
+       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+                       0x7fffffff;
+       xfs_trans_brelse(NULL, bp);
+       return 0;
+}
+
+/*
+ * Extent-map and readahead bookkeeping handed to xfs_dir2_leaf_readbuf().
+ * map[] is a flexible array member; its capacity is recorded in map_size
+ * and the number of entries currently in use in map_valid.
+ */
+struct xfs_dir2_leaf_map_info {
+       xfs_extlen_t    map_blocks;     /* number of fsbs in map */
+       xfs_dablk_t     map_off;        /* last mapped file offset */
+       int             map_size;       /* total entries in *map */
+       int             map_valid;      /* valid entries in *map */
+       int             nmap;           /* mappings to ask xfs_bmapi */
+       xfs_dir2_db_t   curdb;          /* db for current block */
+       int             ra_current;     /* number of read-ahead blks */
+       int             ra_index;       /* *map index for read-ahead */
+       int             ra_offset;      /* map entry offset for ra */
+       int             ra_want;        /* readahead count wanted */
+       struct xfs_bmbt_irec map[];     /* map vector for blocks */
+};
+
+/*
+ * Release the current data block buffer (if any), top up the extent map in
+ * @mip, read the directory data block at the first mapping and issue
+ * readahead for the blocks that follow it.
+ *
+ * @bufsize is used only to size the readahead window.  On return *bpp is
+ * set to the local buffer pointer: the block that was read, or NULL if the
+ * function bailed out before the read.  When no valid mappings remain,
+ * *curoff is set to the byte offset corresponding to mip->map_off.
+ *
+ * Returns 0, or the error from xfs_bmapi_read() / xfs_dir3_data_read().
+ */
+STATIC int
+xfs_dir2_leaf_readbuf(
+       struct xfs_inode        *dp,
+       size_t                  bufsize,
+       struct xfs_dir2_leaf_map_info *mip,
+       xfs_dir2_off_t          *curoff,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_buf          *bp = *bpp;
+       struct xfs_bmbt_irec    *map = mip->map;
+       struct blk_plug         plug;
+       int                     error = 0;
+       int                     length;
+       int                     i;
+       int                     j;
+
+       /*
+        * If we have a buffer, we need to release it and
+        * take it out of the mapping.
+        */
+
+       if (bp) {
+               xfs_trans_brelse(NULL, bp);
+               bp = NULL;
+               mip->map_blocks -= mp->m_dirblkfsbs;
+               /*
+                * Loop to get rid of the extents for the
+                * directory block.
+                */
+               for (i = mp->m_dirblkfsbs; i > 0; ) {
+                       j = min_t(int, map->br_blockcount, i);
+                       map->br_blockcount -= j;
+                       map->br_startblock += j;
+                       map->br_startoff += j;
+                       /*
+                        * If mapping is done, pitch it from
+                        * the table.
+                        */
+                       if (!map->br_blockcount && --mip->map_valid)
+                               memmove(&map[0], &map[1],
+                                       sizeof(map[0]) * mip->map_valid);
+                       i -= j;
+               }
+       }
+
+       /*
+        * Recalculate the readahead blocks wanted.
+        */
+       mip->ra_want = howmany(bufsize + mp->m_dirblksize,
+                              mp->m_sb.sb_blocksize) - 1;
+       ASSERT(mip->ra_want >= 0);
+
+       /*
+        * If we don't have as many as we want, and we haven't
+        * run out of data blocks, get some more mappings.
+        */
+       if (1 + mip->ra_want > mip->map_blocks &&
+           mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+               /*
+                * Get more bmaps, fill in after the ones
+                * we already have in the table.
+                */
+               mip->nmap = mip->map_size - mip->map_valid;
+               error = xfs_bmapi_read(dp, mip->map_off,
+                               xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
+                                                               mip->map_off,
+                               &map[mip->map_valid], &mip->nmap, 0);
+
+               /*
+                * Don't know if we should ignore this or try to return an
+                * error.  The trouble with returning errors is that readdir
+                * will just stop without actually passing the error through.
+                */
+               if (error)
+                       goto out;       /* XXX */
+
+               /*
+                * If we got all the mappings we asked for, set the final map
+                * offset based on the last bmap value received.  Otherwise,
+                * we've reached the end.
+                */
+               if (mip->nmap == mip->map_size - mip->map_valid) {
+                       i = mip->map_valid + mip->nmap - 1;
+                       mip->map_off = map[i].br_startoff + map[i].br_blockcount;
+               } else
+                       mip->map_off = xfs_dir2_byte_to_da(mp,
+                                                       XFS_DIR2_LEAF_OFFSET);
+
+               /*
+                * Look for holes in the mapping, and eliminate them.  Count up
+                * the valid blocks.
+                */
+               for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
+                       if (map[i].br_startblock == HOLESTARTBLOCK) {
+                               mip->nmap--;
+                               length = mip->map_valid + mip->nmap - i;
+                               if (length)
+                                       memmove(&map[i], &map[i + 1],
+                                               sizeof(map[i]) * length);
+                       } else {
+                               mip->map_blocks += map[i].br_blockcount;
+                               i++;
+                       }
+               }
+               mip->map_valid += mip->nmap;
+       }
+
+       /*
+        * No valid mappings, so no more data blocks.
+        */
+       if (!mip->map_valid) {
+               *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
+               goto out;
+       }
+
+       /*
+        * Read the directory block starting at the first mapping.
+        */
+       mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+       error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
+                       map->br_blockcount >= mp->m_dirblkfsbs ?
+                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
+
+       /*
+        * Should just skip over the data block instead of giving up.
+        */
+       if (error)
+               goto out;       /* XXX */
+
+       /*
+        * Adjust the current amount of read-ahead: we just read a block that
+        * was previously ra.
+        */
+       if (mip->ra_current)
+               mip->ra_current -= mp->m_dirblkfsbs;
+
+       /*
+        * Do we need more readahead?
+        */
+       blk_start_plug(&plug);
+       for (mip->ra_index = mip->ra_offset = i = 0;
+            mip->ra_want > mip->ra_current && i < mip->map_blocks;
+            i += mp->m_dirblkfsbs) {
+               ASSERT(mip->ra_index < mip->map_valid);
+               /*
+                * Read-ahead a contiguous directory block.
+                */
+               if (i > mip->ra_current &&
+                   map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
+                       xfs_dir3_data_readahead(NULL, dp,
+                               map[mip->ra_index].br_startoff + mip->ra_offset,
+                               XFS_FSB_TO_DADDR(mp,
+                                       map[mip->ra_index].br_startblock +
+                                                       mip->ra_offset));
+                       mip->ra_current = i;
+               }
+
+               /*
+                * Read-ahead a non-contiguous directory block.  This doesn't
+                * use our mapping, but this is a very rare case.
+                */
+               else if (i > mip->ra_current) {
+                       xfs_dir3_data_readahead(NULL, dp,
+                                       map[mip->ra_index].br_startoff +
+                                                       mip->ra_offset, -1);
+                       mip->ra_current = i;
+               }
+
+               /*
+                * Advance offset through the mapping table by exactly one
+                * directory block (m_dirblkfsbs filesystem blocks).  j counts
+                * the blocks consumed so far and is advanced only by the loop
+                * increment; stepping it a second time would leave ra_offset
+                * short of the next directory block boundary whenever a block
+                * spans more than one extent.  Stop at the end of the valid
+                * mappings so that map[] is never read past map_valid.
+                */
+               for (j = 0;
+                    j < mp->m_dirblkfsbs && mip->ra_index < mip->map_valid;
+                    j += length) {
+                       /*
+                        * The rest of this extent but not more than what is
+                        * left of the dir block.
+                        */
+                       length = min_t(int, mp->m_dirblkfsbs - j,
+                                       map[mip->ra_index].br_blockcount -
+                                                       mip->ra_offset);
+                       mip->ra_offset += length;
+
+                       /*
+                        * Advance to the next mapping if this one is used up.
+                        */
+                       if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
+                               mip->ra_offset = 0;
+                               mip->ra_index++;
+                       }
+               }
+       }
+       blk_finish_plug(&plug);
+
+out:
+       *bpp = bp;
+       return error;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+STATIC int
+xfs_dir2_leaf_getdents(
+       xfs_inode_t             *dp,            /* incore directory inode */
+       struct dir_context      *ctx,
+       size_t                  bufsize)
+{
+       struct xfs_buf          *bp = NULL;     /* data block buffer */
+       xfs_dir2_data_hdr_t     *hdr;           /* data block header */
+       xfs_dir2_data_entry_t   *dep;           /* data entry */
+       xfs_dir2_data_unused_t  *dup;           /* unused entry */
+       int                     error = 0;      /* error return value */
+       int                     length;         /* temporary length value */
+       xfs_mount_t             *mp;            /* filesystem mount point */
+       int                     byteoff;        /* offset in current block */
+       xfs_dir2_off_t          curoff;         /* current overall offset */
+       xfs_dir2_off_t          newoff;         /* new curoff after new blk */
+       char                    *ptr = NULL;    /* pointer to current data */
+       struct xfs_dir2_leaf_map_info *map_info;
+
+       /*
+        * If the offset is at or past the largest allowed value,
+        * give up right away.
+        */
+       if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
+               return 0;
+
+       mp = dp->i_mount;
+
+       /*
+        * Set up to bmap a number of blocks based on the caller's
+        * buffer size, the directory block size, and the filesystem
+        * block size.
+        */
+       length = howmany(bufsize + mp->m_dirblksize,
+                                    mp->m_sb.sb_blocksize);
+       map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
+                               (length * sizeof(struct xfs_bmbt_irec)),
+                              KM_SLEEP | KM_NOFS);
+       map_info->map_size = length;
+
+       /*
+        * Inside the loop we keep the main offset value as a byte offset
+        * in the directory file.
+        */
+       curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
+
+       /*
+        * Force this conversion through db so we truncate the offset
+        * down to get the start of the data block.
+        */
+       map_info->map_off = xfs_dir2_db_to_da(mp,
+                                             xfs_dir2_byte_to_db(mp, curoff));
+
+       /*
+        * Loop over directory entries until we reach the end offset.
+        * Get more blocks and readahead as necessary.
+        */
+       while (curoff < XFS_DIR2_LEAF_OFFSET) {
+               __uint8_t filetype;
+
+               /*
+                * If we have no buffer, or we're off the end of the
+                * current buffer, need to get another one.
+                */
+               if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
+
+                       error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
+                                                     &curoff, &bp);
+                       if (error || !map_info->map_valid)
+                               break;
+
+                       /*
+                        * Having done a read, we need to set a new offset.
+                        */
+                       newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
+                       /*
+                        * Start of the current block.
+                        */
+                       if (curoff < newoff)
+                               curoff = newoff;
+                       /*
+                        * Make sure we're in the right block.
+                        */
+                       else if (curoff > newoff)
+                               ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
+                                      map_info->curdb);
+                       hdr = bp->b_addr;
+                       xfs_dir3_data_check(dp, bp);
+                       /*
+                        * Find our position in the block.
+                        */
+                       ptr = (char *)xfs_dir3_data_entry_p(hdr);
+                       byteoff = xfs_dir2_byte_to_off(mp, curoff);
+                       /*
+                        * Skip past the header.
+                        */
+                       if (byteoff == 0)
+                               curoff += xfs_dir3_data_entry_offset(hdr);
+                       /*
+                        * Skip past entries until we reach our offset.
+                        */
+                       else {
+                               while ((char *)ptr - (char *)hdr < byteoff) {
+                                       dup = (xfs_dir2_data_unused_t *)ptr;
+
+                                       if (be16_to_cpu(dup->freetag)
+                                                 == XFS_DIR2_DATA_FREE_TAG) {
+
+                                               length = be16_to_cpu(dup->length);
+                                               ptr += length;
+                                               continue;
+                                       }
+                                       dep = (xfs_dir2_data_entry_t *)ptr;
+                                       length =
+                                          xfs_dir3_data_entsize(mp, dep->namelen);
+                                       ptr += length;
+                               }
+                               /*
+                                * Now set our real offset.
+                                */
+                               curoff =
+                                       xfs_dir2_db_off_to_byte(mp,
+                                           xfs_dir2_byte_to_db(mp, curoff),
+                                           (char *)ptr - (char *)hdr);
+                               if (ptr >= (char *)hdr + mp->m_dirblksize) {
+                                       continue;
+                               }
+                       }
+               }
+               /*
+                * We have a pointer to an entry.
+                * Is it a live one?
+                */
+               dup = (xfs_dir2_data_unused_t *)ptr;
+               /*
+                * No, it's unused, skip over it.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       length = be16_to_cpu(dup->length);
+                       ptr += length;
+                       curoff += length;
+                       continue;
+               }
+
+               dep = (xfs_dir2_data_entry_t *)ptr;
+               length = xfs_dir3_data_entsize(mp, dep->namelen);
+               filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+
+               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+               if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+                           be64_to_cpu(dep->inumber),
+                           xfs_dir3_get_dtype(mp, filetype)))
+                       break;
+
+               /*
+                * Advance to next entry in the block.
+                */
+               ptr += length;
+               curoff += length;
+               /* bufsize may have just been a guess; don't go negative */
+               bufsize = bufsize > length ? bufsize - length : 0;
+       }
+
+       /*
+        * All done.  Set output offset value to current offset.
+        */
+       if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
+               ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+       else
+               ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+       kmem_free(map_info);
+       if (bp)
+               xfs_trans_brelse(NULL, bp);
+       return error;
+}
+
+/*
+ * Read a directory, dispatching to the shortform, block or leaf getdents.
+ */
+int
+xfs_readdir(
+       xfs_inode_t     *dp,
+       struct dir_context *ctx,
+       size_t          bufsize)        /* only passed to the leaf reader */
+{
+       int             rval;           /* return value */
+       int             v;              /* nonzero: use block getdents */
+
+       trace_xfs_readdir(dp);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return XFS_ERROR(EIO);
+
+       ASSERT(S_ISDIR(dp->i_d.di_mode));
+       XFS_STATS_INC(xs_dir_getdents);
+
+       if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+               rval = xfs_dir2_sf_getdents(dp, ctx);
+       else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
+               ;       /* isblock failed: fall through and return its error */
+       else if (v)
+               rval = xfs_dir2_block_getdents(dp, ctx);
+       else
+               rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+       return rval;
+}
index 97676a347da166e5d843db8ab2052666390e018d..bb6e2848f473d024d5ff1dd70a1232443901ea9f 100644 (file)
@@ -29,8 +29,8 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_error.h"
-#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_trace.h"
 
@@ -95,7 +95,7 @@ xfs_dir2_sf_get_parent_ino(
        return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
 }
 
-static void
+void
 xfs_dir2_sf_put_parent_ino(
        struct xfs_dir2_sf_hdr  *hdr,
        xfs_ino_t               ino)
@@ -105,31 +105,38 @@ xfs_dir2_sf_put_parent_ino(
 
 /*
  * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name.  The inode numbers may only be accessed
- * through the helpers below.
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
  */
 static xfs_dir2_inou_t *
-xfs_dir2_sfe_inop(
+xfs_dir3_sfe_inop(
+       struct xfs_mount        *mp,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen];
+       __uint8_t       *ptr = &sfep->name[sfep->namelen]; /* first byte after the name */
+       if (xfs_sb_version_hasftype(&mp->m_sb))
+               ptr++;  /* step over the filetype byte stored after the name */
+       return (xfs_dir2_inou_t *)ptr;
 }
 
 xfs_ino_t
-xfs_dir2_sfe_get_ino(
+xfs_dir3_sfe_get_ino(
+       struct xfs_mount        *mp,    /* handed to xfs_dir3_sfe_inop() for its ftype check */
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep)
 {
-       return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep));
+       return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
 }
 
-static void
-xfs_dir2_sfe_put_ino(
+void
+xfs_dir3_sfe_put_ino(
+       struct xfs_mount        *mp,    /* handed to xfs_dir3_sfe_inop() for its ftype check */
        struct xfs_dir2_sf_hdr  *hdr,
        struct xfs_dir2_sf_entry *sfep,
        xfs_ino_t               ino)
 {
-       xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino);
+       xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
 }
 
 /*
@@ -157,9 +164,16 @@ xfs_dir2_block_sfsize(
        int                     namelen;        /* total name bytes */
        xfs_ino_t               parent = 0;     /* parent inode number */
        int                     size=0;         /* total computed size */
+       int                     has_ftype;
 
        mp = dp->i_mount;
 
+       /*
+        * if there is a filetype field, add the extra byte to the namelen
+        * for each entry that we see.
+        */
+       has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
+
        count = i8count = namelen = 0;
        btp = xfs_dir2_block_tail_p(mp, hdr);
        blp = xfs_dir2_block_leaf_p(btp);
@@ -188,9 +202,10 @@ xfs_dir2_block_sfsize(
                if (!isdot)
                        i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
 #endif
+               /* take into account the file type field */
                if (!isdot && !isdotdot) {
                        count++;
-                       namelen += dep->namelen;
+                       namelen += dep->namelen + has_ftype;
                } else if (isdotdot)
                        parent = be64_to_cpu(dep->inumber);
                /*
@@ -316,12 +331,14 @@ xfs_dir2_block_to_sf(
                                (xfs_dir2_data_aoff_t)
                                ((char *)dep - (char *)hdr));
                        memcpy(sfep->name, dep->name, dep->namelen);
-                       xfs_dir2_sfe_put_ino(sfp, sfep,
+                       xfs_dir3_sfe_put_ino(mp, sfp, sfep,
                                             be64_to_cpu(dep->inumber));
+                       xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                                       xfs_dir3_dirent_get_ftype(mp, dep));
 
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                       sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
                }
-               ptr += xfs_dir2_data_entsize(dep->namelen);
+               ptr += xfs_dir3_data_entsize(mp, dep->namelen);
        }
        ASSERT((char *)sfep - (char *)sfp == size);
        xfs_dir2_sf_check(args);
@@ -372,7 +389,7 @@ xfs_dir2_sf_addname(
        /*
         * Compute entry (and change in) size.
         */
-       add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+       add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
        incr_isize = add_entsize;
        objchange = 0;
 #if XFS_BIG_INUMS
@@ -466,8 +483,9 @@ xfs_dir2_sf_addname_easy(
        /*
         * Grow the in-inode space.
         */
-       xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen),
-               XFS_DATA_FORK);
+       xfs_idata_realloc(dp,
+                         xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
+                         XFS_DATA_FORK);
        /*
         * Need to set up again due to realloc of the inode data.
         */
@@ -479,7 +497,9 @@ xfs_dir2_sf_addname_easy(
        sfep->namelen = args->namelen;
        xfs_dir2_sf_put_offset(sfep, offset);
        memcpy(sfep->name, args->name, sfep->namelen);
-       xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
+
        /*
         * Update the header and inode.
         */
@@ -519,11 +539,13 @@ xfs_dir2_sf_addname_hard(
        xfs_dir2_sf_hdr_t       *oldsfp;        /* original shortform dir */
        xfs_dir2_sf_entry_t     *sfep;          /* entry in new dir */
        xfs_dir2_sf_hdr_t       *sfp;           /* new shortform dir */
+       struct xfs_mount        *mp;
 
        /*
         * Copy the old directory to the stack buffer.
         */
        dp = args->dp;
+       mp = dp->i_mount;
 
        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
        old_isize = (int)dp->i_d.di_size;
@@ -535,13 +557,13 @@ xfs_dir2_sf_addname_hard(
         * to insert the new entry.
         * If it's going to end up at the end then oldsfep will point there.
         */
-       for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount),
+       for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
              oldsfep = xfs_dir2_sf_firstentry(oldsfp),
-             add_datasize = xfs_dir2_data_entsize(args->namelen),
+             add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
              eof = (char *)oldsfep == &buf[old_isize];
             !eof;
-            offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen),
-             oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep),
+            offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
+             oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
              eof = (char *)oldsfep == &buf[old_isize]) {
                new_offset = xfs_dir2_sf_get_offset(oldsfep);
                if (offset + add_datasize <= new_offset)
@@ -570,7 +592,8 @@ xfs_dir2_sf_addname_hard(
        sfep->namelen = args->namelen;
        xfs_dir2_sf_put_offset(sfep, offset);
        memcpy(sfep->name, args->name, sfep->namelen);
-       xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
+       xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
        sfp->count++;
 #if XFS_BIG_INUMS
        if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -580,7 +603,7 @@ xfs_dir2_sf_addname_hard(
         * If there's more left to copy, do that.
         */
        if (!eof) {
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
                memcpy(sfep, oldsfep, old_isize - nbytes);
        }
        kmem_free(buf);
@@ -616,7 +639,7 @@ xfs_dir2_sf_addname_pick(
        mp = dp->i_mount;
 
        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       size = xfs_dir2_data_entsize(args->namelen);
+       size = xfs_dir3_data_entsize(mp, args->namelen);
        offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
        sfep = xfs_dir2_sf_firstentry(sfp);
        holefit = 0;
@@ -629,8 +652,8 @@ xfs_dir2_sf_addname_pick(
                if (!holefit)
                        holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
                offset = xfs_dir2_sf_get_offset(sfep) +
-                        xfs_dir2_data_entsize(sfep->namelen);
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
+                        xfs_dir3_data_entsize(mp, sfep->namelen);
+               sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
        }
        /*
         * Calculate data bytes used excluding the new entry, if this
@@ -684,31 +707,34 @@ xfs_dir2_sf_check(
        int                     offset;         /* data offset */
        xfs_dir2_sf_entry_t     *sfep;          /* shortform dir entry */
        xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
+       struct xfs_mount        *mp;
 
        dp = args->dp;
+       mp = dp->i_mount;
 
        sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-       offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount);
+       offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
        ino = xfs_dir2_sf_get_parent_ino(sfp);
        i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
             i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
                ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
-               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+               ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
                i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
                offset =
                        xfs_dir2_sf_get_offset(sfep) +
-                       xfs_dir2_data_entsize(sfep->namelen);
+                       xfs_dir3_data_entsize(mp, sfep->namelen);
+               ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
+                                                       XFS_DIR3_FT_MAX);
        }
        ASSERT(i8count == sfp->i8count);
        ASSERT(XFS_BIG_INUMS || i8count == 0);
        ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
        ASSERT(offset +
               (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-              (uint)sizeof(xfs_dir2_block_tail_t) <=
-              dp->i_mount->m_dirblksize);
+              (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
 }
 #endif /* DEBUG */
 
@@ -765,100 +791,6 @@ xfs_dir2_sf_create(
        return 0;
 }
 
-int                                            /* error */
-xfs_dir2_sf_getdents(
-       xfs_inode_t             *dp,            /* incore directory inode */
-       struct dir_context      *ctx)
-{
-       int                     i;              /* shortform entry number */
-       xfs_mount_t             *mp;            /* filesystem mount point */
-       xfs_dir2_dataptr_t      off;            /* current entry's offset */
-       xfs_dir2_sf_entry_t     *sfep;          /* shortform directory entry */
-       xfs_dir2_sf_hdr_t       *sfp;           /* shortform structure */
-       xfs_dir2_dataptr_t      dot_offset;
-       xfs_dir2_dataptr_t      dotdot_offset;
-       xfs_ino_t               ino;
-
-       mp = dp->i_mount;
-
-       ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-       /*
-        * Give up if the directory is way too short.
-        */
-       if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               return XFS_ERROR(EIO);
-       }
-
-       ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-       ASSERT(dp->i_df.if_u1.if_data != NULL);
-
-       sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-
-       ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
-
-       /*
-        * If the block number in the offset is out of range, we're done.
-        */
-       if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
-               return 0;
-
-       /*
-        * Precalculate offsets for . and .. as we will always need them.
-        *
-        * XXX(hch): the second argument is sometimes 0 and sometimes
-        * mp->m_dirdatablk.
-        */
-       dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                            XFS_DIR3_DATA_DOT_OFFSET(mp));
-       dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                                               XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
-
-       /*
-        * Put . entry unless we're starting past it.
-        */
-       if (ctx->pos <= dot_offset) {
-               ctx->pos = dot_offset & 0x7fffffff;
-               if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
-                       return 0;
-       }
-
-       /*
-        * Put .. entry unless we're starting past it.
-        */
-       if (ctx->pos <= dotdot_offset) {
-               ino = xfs_dir2_sf_get_parent_ino(sfp);
-               ctx->pos = dotdot_offset & 0x7fffffff;
-               if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
-                       return 0;
-       }
-
-       /*
-        * Loop while there are more entries and put'ing works.
-        */
-       sfep = xfs_dir2_sf_firstentry(sfp);
-       for (i = 0; i < sfp->count; i++) {
-               off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-                               xfs_dir2_sf_get_offset(sfep));
-
-               if (ctx->pos > off) {
-                       sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-                       continue;
-               }
-
-               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
-               ctx->pos = off & 0x7fffffff;
-               if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
-                           ino, DT_UNKNOWN))
-                       return 0;
-               sfep = xfs_dir2_sf_nextentry(sfp, sfep);
-       }
-
-       ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
-                       0x7fffffff;
-       return 0;
-}
-
 /*
  * Lookup an entry in a shortform directory.
  * Returns EEXIST if found, ENOENT if not found.
@@ -898,6 +830,7 @@ xfs_dir2_sf_lookup(
        if (args->namelen == 1 && args->name[0] == '.') {
                args->inumber = dp->i_ino;
                args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
                return XFS_ERROR(EEXIST);
        }
        /*
@@ -907,6 +840,7 @@ xfs_dir2_sf_lookup(
            args->name[0] == '.' && args->name[1] == '.') {
                args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
                args->cmpresult = XFS_CMP_EXACT;
+               args->filetype = XFS_DIR3_FT_DIR;
                return XFS_ERROR(EEXIST);
        }
        /*
@@ -914,7 +848,7 @@ xfs_dir2_sf_lookup(
         */
        ci_sfep = NULL;
        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                /*
                 * Compare name and if it's an exact match, return the inode
                 * number. If it's the first case-insensitive match, store the
@@ -924,7 +858,10 @@ xfs_dir2_sf_lookup(
                                                                sfep->namelen);
                if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
                        args->cmpresult = cmp;
-                       args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep);
+                       args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
+                                                            sfp, sfep);
+                       args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
+                                                               sfp, sfep);
                        if (cmp == XFS_CMP_EXACT)
                                return XFS_ERROR(EEXIST);
                        ci_sfep = sfep;
@@ -980,10 +917,10 @@ xfs_dir2_sf_removename(
         * Find the one we're deleting.
         */
        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
                                                                XFS_CMP_EXACT) {
-                       ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) ==
+                       ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
                               args->inumber);
                        break;
                }
@@ -997,7 +934,7 @@ xfs_dir2_sf_removename(
         * Calculate sizes.
         */
        byteoff = (int)((char *)sfep - (char *)sfp);
-       entsize = xfs_dir2_sf_entsize(sfp, args->namelen);
+       entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
        newsize = oldsize - entsize;
        /*
         * Copy the part if any after the removed entry, sliding it down.
@@ -1113,16 +1050,19 @@ xfs_dir2_sf_replace(
         * Normal entry, look for the name.
         */
        else {
-               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-                               i < sfp->count;
-                               i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+               for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
+                    i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
                        if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
                                                                XFS_CMP_EXACT) {
 #if XFS_BIG_INUMS || defined(DEBUG)
-                               ino = xfs_dir2_sfe_get_ino(sfp, sfep);
+                               ino = xfs_dir3_sfe_get_ino(dp->i_mount,
+                                                          sfp, sfep);
                                ASSERT(args->inumber != ino);
 #endif
-                               xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber);
+                               xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
+                                                    args->inumber);
+                               xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
+                                                      args->filetype);
                                break;
                        }
                }
@@ -1189,10 +1129,12 @@ xfs_dir2_sf_toino4(
        int                     oldsize;        /* old inode size */
        xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
        xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
 
        trace_xfs_dir2_sf_toino4(args);
 
        dp = args->dp;
+       mp = dp->i_mount;
 
        /*
         * Copy the old directory to the buffer.
@@ -1230,13 +1172,15 @@ xfs_dir2_sf_toino4(
        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
                    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
             i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-                 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+                 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
                sfep->offset = oldsfep->offset;
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               xfs_dir2_sfe_put_ino(sfp, sfep,
-                       xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
        }
        /*
         * Clean up the inode.
@@ -1264,10 +1208,12 @@ xfs_dir2_sf_toino8(
        int                     oldsize;        /* old inode size */
        xfs_dir2_sf_entry_t     *sfep;          /* new sf entry */
        xfs_dir2_sf_hdr_t       *sfp;           /* new sf directory */
+       struct xfs_mount        *mp;
 
        trace_xfs_dir2_sf_toino8(args);
 
        dp = args->dp;
+       mp = dp->i_mount;
 
        /*
         * Copy the old directory to the buffer.
@@ -1305,13 +1251,15 @@ xfs_dir2_sf_toino8(
        for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
                    oldsfep = xfs_dir2_sf_firstentry(oldsfp);
             i < sfp->count;
-            i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep),
-                 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) {
+            i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
+                 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
                sfep->namelen = oldsfep->namelen;
                sfep->offset = oldsfep->offset;
                memcpy(sfep->name, oldsfep->name, sfep->namelen);
-               xfs_dir2_sfe_put_ino(sfp, sfep,
-                       xfs_dir2_sfe_get_ino(oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ino(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
+               xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
+                       xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
        }
        /*
         * Clean up the inode.
index 69cf4fcde03e2d31266f70f6dfee1b73fe71b4a7..45560ee1a4ba8b1ccfdc36f9616cc5355e03558d 100644 (file)
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
-#include "xfs_trans.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
index 0adf27ecf3f1cd4e98d8fcc06a732e4d476437db..251c66632e5e7d926e92942d764e49220237dd11 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
@@ -28,6 +29,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
@@ -710,10 +712,8 @@ xfs_qm_dqread(
 
        if (flags & XFS_QMOPT_DQALLOC) {
                tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-               error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
-                                         XFS_QM_DQALLOC_LOG_RES(mp), 0,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
+                                         XFS_QM_DQALLOC_SPACE_RES(mp), 0);
                if (error)
                        goto error1;
                cancelflags = XFS_TRANS_RELEASE_LOG_RES;
index 57aa4b03720cb7440acef44a920d18f81955ba72..60c6e1f126952acc43e1bbe2a1d065f304ed484d 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -43,14 +44,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
 /*
  * returns the number of iovecs needed to log the given dquot item.
  */
-STATIC uint
+STATIC void
 xfs_qm_dquot_logitem_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,         /* in/out: iovec count, added to */
+       int                     *nbytes)        /* in/out: byte count, added to */
 {
-       /*
-        * we need only two iovecs, one for the format, one for the real thing
-        */
-       return 2;
+       *nvecs += 2;    /* one iovec for the log format, one for the disk dquot */
+       *nbytes += sizeof(struct xfs_dq_logformat) +
+                  sizeof(struct xfs_disk_dquot);
 }
 
 /*
@@ -285,11 +287,14 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
  * We only need 1 iovec for an quotaoff item.  It just logs the
  * quotaoff_log_format structure.
  */
-STATIC uint
+STATIC void
 xfs_qm_qoff_logitem_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,         /* in/out: iovec count, added to */
+       int                     *nbytes)        /* in/out: byte count, added to */
 {
-       return 1;
+       *nvecs += 1;
+       *nbytes += sizeof(struct xfs_qoff_logitem); /* NOTE(review): size of the logitem, not of the log format - confirm */
 }
 
 /*
index 35d3f5b041ddc0977f47981cb88991edee857245..1123d93ff79546efe3a9d962460ab1e44e2e1bac 100644 (file)
@@ -26,7 +26,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_utils.h"
 #include "xfs_error.h"
 
 #ifdef DEBUG
index c585bc646395e04c5497621eeb4ec34bd1f262e7..066df425c14ffca5b4dacb20f7b3c6fcdd139acb 100644 (file)
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_export.h"
-#include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
index 85e9f87a1a7ce7945da58e305a21818262542cfa..86f559f6e5d3c3f825df5aeffbf6967a7654f055 100644 (file)
@@ -147,7 +147,7 @@ xfs_extent_busy_search(
  * extent.  If the overlap covers the beginning, the end, or all of the busy
  * extent, the overlapping portion can be made unbusy and used for the
  * allocation.  We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
+ * transaction/CIL context busy list, but we can update an entry's block
  * number or length.
  *
  * Returns true if the extent can safely be reused, or false if the search
index 452920a3f03fb2e4405ce52e34587e55acfb7abe..dc53e8febbbeaa54812b4e72dc25938718da69c1 100644 (file)
@@ -73,11 +73,22 @@ __xfs_efi_release(
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
  * structure.
  */
-STATIC uint
+static inline int
+xfs_efi_item_sizeof(
+       struct xfs_efi_log_item *efip)
+{
+       return sizeof(struct xfs_efi_log_format) +
+              (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
 xfs_efi_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
 {
-       return 1;
+       *nvecs += 1;
+       *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
 }
 
 /*
@@ -93,21 +104,17 @@ xfs_efi_item_format(
        struct xfs_log_iovec    *log_vector)
 {
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-       uint                    size;
 
        ASSERT(atomic_read(&efip->efi_next_extent) ==
                                efip->efi_format.efi_nextents);
 
        efip->efi_format.efi_type = XFS_LI_EFI;
-
-       size = sizeof(xfs_efi_log_format_t);
-       size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
        efip->efi_format.efi_size = 1;
 
        log_vector->i_addr = &efip->efi_format;
-       log_vector->i_len = size;
+       log_vector->i_len = xfs_efi_item_sizeof(efip);
        log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
-       ASSERT(size >= sizeof(xfs_efi_log_format_t));
+       ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
 }
 
 
@@ -333,11 +340,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
  * We only need 1 iovec for an efd item.  It just logs the efd_log_format
  * structure.
  */
-STATIC uint
+static inline int
+xfs_efd_item_sizeof(
+       struct xfs_efd_log_item *efdp)
+{
+       return sizeof(xfs_efd_log_format_t) +
+              (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+}
+
+STATIC void
 xfs_efd_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
 {
-       return 1;
+       *nvecs += 1;
+       *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
 }
 
 /*
@@ -353,20 +371,16 @@ xfs_efd_item_format(
        struct xfs_log_iovec    *log_vector)
 {
        struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
-       uint                    size;
 
        ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
 
        efdp->efd_format.efd_type = XFS_LI_EFD;
-
-       size = sizeof(xfs_efd_log_format_t);
-       size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
        efdp->efd_format.efd_size = 1;
 
        log_vector->i_addr = &efdp->efd_format;
-       log_vector->i_len = size;
+       log_vector->i_len = xfs_efd_item_sizeof(efdp);
        log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
-       ASSERT(size >= sizeof(xfs_efd_log_format_t));
+       ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
 }
 
 /*
index 432222418c566f6d7f30bed9951289ea9c5f1d10..0ffbce32d5693e05e042de8983c8274942616196 100644 (file)
 #ifndef        __XFS_EXTFREE_ITEM_H__
 #define        __XFS_EXTFREE_ITEM_H__
 
+/* kernel only EFI/EFD definitions */
+
 struct xfs_mount;
 struct kmem_zone;
 
-typedef struct xfs_extent {
-       xfs_dfsbno_t    ext_start;
-       xfs_extlen_t    ext_len;
-} xfs_extent_t;
-
-/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
- */
-
-typedef struct xfs_extent_32 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
-
-typedef struct xfs_extent_64 {
-       __uint64_t      ext_start;
-       __uint32_t      ext_len;
-       __uint32_t      ext_pad;
-} xfs_extent_64_t;
-
-/*
- * This is the structure used to lay out an efi log item in the
- * log.  The efi_extents field is a variable size array whose
- * size is given by efi_nextents.
- */
-typedef struct xfs_efi_log_format {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_t            efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_t;
-
-typedef struct xfs_efi_log_format_32 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
-
-typedef struct xfs_efi_log_format_64 {
-       __uint16_t              efi_type;       /* efi log item type */
-       __uint16_t              efi_size;       /* size of this item */
-       __uint32_t              efi_nextents;   /* # extents to free */
-       __uint64_t              efi_id;         /* efi identifier */
-       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
-
-/*
- * This is the structure used to lay out an efd log item in the
- * log.  The efd_extents array is a variable size array whose
- * size is given by efd_nextents;
- */
-typedef struct xfs_efd_log_format {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_t            efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_t;
-
-typedef struct xfs_efd_log_format_32 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
-
-typedef struct xfs_efd_log_format_64 {
-       __uint16_t              efd_type;       /* efd log item type */
-       __uint16_t              efd_size;       /* size of this item */
-       __uint32_t              efd_nextents;   /* # of extents freed */
-       __uint64_t              efd_efi_id;     /* id of corresponding efi */
-       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
-
-
-#ifdef __KERNEL__
-
 /*
  * Max number of extents in fast allocation path.
  */
@@ -160,6 +78,4 @@ int                  xfs_efi_copy_format(xfs_log_iovec_t *buf,
                                            xfs_efi_log_format_t *dst_efi_fmt);
 void                   xfs_efi_item_free(xfs_efi_log_item_t *);
 
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_EXTFREE_ITEM_H__ */
index de3dc98f4e8f76067c1e7d0ee4631a8638d87988..4c749ab543d0de17646993a282f925ebd0314ccf 100644 (file)
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_error.h"
-#include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
index 5170306a1009e22e287c425b1968d97b7a885a82..ce78e654d37b73693aa4c637e021dda9154ad5d6 100644 (file)
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
+#include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inum.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
-#include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_alloc.h"
-#include "xfs_utils.h"
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_trace.h"
@@ -668,8 +668,8 @@ exit:
  */
 int
 xfs_filestream_new_ag(
-       xfs_bmalloca_t  *ap,
-       xfs_agnumber_t  *agp)
+       struct xfs_bmalloca     *ap,
+       xfs_agnumber_t          *agp)
 {
        int             flags, err;
        xfs_inode_t     *ip, *pip = NULL;
index 09dd9af454349b57fe2a8a7bec0f3c5c3c7e4e3e..6d61dbee8564b12cca254721bf78685975799a74 100644 (file)
@@ -18,8 +18,6 @@
 #ifndef __XFS_FILESTREAM_H__
 #define __XFS_FILESTREAM_H__
 
-#ifdef __KERNEL__
-
 struct xfs_mount;
 struct xfs_inode;
 struct xfs_perag;
@@ -69,6 +67,4 @@ xfs_inode_is_filestream(
                (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
 }
 
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
new file mode 100644 (file)
index 0000000..35c08ff
--- /dev/null
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_FORMAT_H__
+#define __XFS_FORMAT_H__
+
+/*
+ * XFS On Disk Format Definitions
+ *
+ * This header file defines all the on-disk format definitions for 
+ * general XFS objects. Directory and attribute related objects are defined in
+ * xfs_da_format.h, while log and log item formats are defined in
+ * xfs_log_format.h. Everything else goes here.
+ */
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+
+/*
+ * RealTime Device format definitions
+ */
+
+/* Min and max rt extent sizes, specified in bytes */
+#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
+#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
+#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
+
+#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
+#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
+#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
+#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
+
+/*
+ * RT Summary and bit manipulation macros.
+ */
+#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
+       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define        XFS_SUMPTR(mp,bp,so)    \
+       ((xfs_suminfo_t *)((bp)->b_addr + \
+               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
+#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
+#define        XFS_BITTOWORD(mp,bi)    \
+       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
+#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
+
+#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
+#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
+
+#if XFS_BIG_BLKNOS
+#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
+#else
+#define        XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
+#endif
+
+/*
+ * Dquot and dquot block format definitions
+ */
+#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
+#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct xfs_disk_dquot {
+       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
+       __u8            d_version;      /* dquot version */
+       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
+       __be32          d_id;           /* user,project,group id */
+       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
+       __be64          d_blk_softlimit;/* preferred limit on disk blks */
+       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
+       __be64          d_ino_softlimit;/* preferred inode limit */
+       __be64          d_bcount;       /* disk blocks owned by the user */
+       __be64          d_icount;       /* inodes owned by the user */
+       __be32          d_itimer;       /* zero if within inode limits if not,
+                                          this is when we refuse service */
+       __be32          d_btimer;       /* similar to above; for disk blocks */
+       __be16          d_iwarns;       /* warnings issued wrt num inodes */
+       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
+       __be32          d_pad0;         /* 64 bit align */
+       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
+       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
+       __be64          d_rtbcount;     /* realtime blocks owned */
+       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
+       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
+       __be16          d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
+       char              dd_fill[4];   /* filling for posterity */
+
+       /*
+        * These two are only present on filesystems with the CRC bits set.
+        */
+       __be32            dd_crc;       /* checksum */
+       __be64            dd_lsn;       /* last modification in log */
+       uuid_t            dd_uuid;      /* location information */
+} xfs_dqblk_t;
+
+#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
+
+/*
+ * Remote symlink format and access functions.
+ */
+#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
+
+struct xfs_dsymlink_hdr {
+       __be32  sl_magic;
+       __be32  sl_offset;
+       __be32  sl_bytes;
+       __be32  sl_crc;
+       uuid_t  sl_uuid;
+       __be64  sl_owner;
+       __be64  sl_blkno;
+       __be64  sl_lsn;
+};
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
+       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+                       sizeof(struct xfs_dsymlink_hdr) : 0))
+
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+                       uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+                                struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+#endif /* __XFS_FORMAT_H__ */
index d04695545397308a6596f5109a72f8923fd79419..1edb5cc3e5f495fdca3059054d1f2e7bd5d9fd58 100644 (file)
@@ -240,7 +240,9 @@ typedef struct xfs_fsop_resblks {
 
 
 /*
- * Minimum and maximum sizes need for growth checks
+ * Minimum and maximum sizes needed for growth checks.
+ *
+ * Block counts are in units of filesystem blocks, not basic blocks.
  */
 #define XFS_MIN_AG_BLOCKS      64
 #define XFS_MIN_LOG_BLOCKS     512ULL
@@ -310,6 +312,17 @@ typedef struct xfs_bstat {
        __u16           bs_aextents;    /* attribute number of extents  */
 } xfs_bstat_t;
 
+/*
+ * Project quota id helpers (previously projid was 16bit only
+ * and using two 16bit values to hold new 32bit projid was chosen
+ * to retain compatibility with "old" filesystems).
+ */
+static inline __uint32_t
+bstat_get_projid(struct xfs_bstat *bs)
+{
+       return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
+}
+
 /*
  * The user-level BulkStat Request interface structure.
  */
@@ -344,7 +357,7 @@ typedef struct xfs_error_injection {
  * Speculative preallocation trimming.
  */
 #define XFS_EOFBLOCKS_VERSION          1
-struct xfs_eofblocks {
+struct xfs_fs_eofblocks {
        __u32           eof_version;
        __u32           eof_flags;
        uid_t           eof_uid;
@@ -449,6 +462,21 @@ typedef struct xfs_handle {
                                 - (char *) &(handle))                    \
                                 + (handle).ha_fid.fid_len)
 
+/*
+ * Structure passed to XFS_IOC_SWAPEXT
+ */
+typedef struct xfs_swapext
+{
+       __int64_t       sx_version;     /* version */
+#define XFS_SX_VERSION         0
+       __int64_t       sx_fdtarget;    /* fd of target file */
+       __int64_t       sx_fdtmp;       /* fd of tmp file */
+       xfs_off_t       sx_offset;      /* offset into file */
+       xfs_off_t       sx_length;      /* length from offset */
+       char            sx_pad[16];     /* pad space, unused */
+       xfs_bstat_t     sx_stat;        /* stat of target b4 copy */
+} xfs_swapext_t;
+
 /*
  * Flags for going down operation
  */
@@ -511,8 +539,14 @@ typedef struct xfs_handle {
 #define XFS_IOC_ERROR_INJECTION             _IOW ('X', 116, struct xfs_error_injection)
 #define XFS_IOC_ERROR_CLEARALL      _IOW ('X', 117, struct xfs_error_injection)
 /*     XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118      */
+
 /*     XFS_IOC_FREEZE            -- FIFREEZE   119      */
 /*     XFS_IOC_THAW              -- FITHAW     120      */
+#ifndef FIFREEZE
+#define XFS_IOC_FREEZE              _IOWR('X', 119, int)
+#define XFS_IOC_THAW                _IOWR('X', 120, int)
+#endif
+
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
index 614eb0cc360860214ce08443ff84993afa531143..e64ee5288b86be2d0c0267b383d3f6f9297e60a1 100644 (file)
@@ -203,8 +203,9 @@ xfs_growfs_data_private(
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
        tp->t_flags |= XFS_TRANS_RESERVE;
-       if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
-                       XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+                                 XFS_GROWFS_SPACE_RES(mp), 0);
+       if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
        }
@@ -739,8 +740,7 @@ xfs_fs_log_dummy(
        int             error;
 
        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
index 7a0c17d7ec0974354cfd645695e9f4f7778704fa..ccf2fb1439629fae273a239625f97f6879192878 100644 (file)
@@ -39,6 +39,7 @@
 #include "xfs_cksum.h"
 #include "xfs_buf_item.h"
 #include "xfs_icreate_item.h"
+#include "xfs_icache.h"
 
 
 /*
@@ -506,7 +507,7 @@ xfs_ialloc_next_ag(
 
 /*
  * Select an allocation group to look for a free inode in, based on the parent
- * inode and then mode.  Return the allocation group buffer.
+ * inode and the mode.  Return the allocation group buffer.
  */
 STATIC xfs_agnumber_t
 xfs_ialloc_ag_select(
@@ -728,7 +729,7 @@ xfs_dialloc_ag(
                error = xfs_inobt_get_rec(cur, &rec, &j);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
 
                if (rec.ir_freecount > 0) {
                        /*
@@ -1341,7 +1342,7 @@ xfs_imap(
        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
        int             error;  /* error code */
        int             offset; /* index of inode in its buffer */
-       int             offset_agbno;   /* blks from chunk start to inode */
+       xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
 
        ASSERT(ino != NULLFSINO);
 
index 3f90e1ceb8d68c4655bb033da592f495b91e9772..16219b9c67909a6a483678fb761db701a8cb00e6 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_types.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_dinode.h"
 #include "xfs_error.h"
 #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
 #include "xfs_icache.h"
+#include "xfs_bmap_util.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -619,7 +620,7 @@ restart:
 
 /*
  * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
 STATIC void
 xfs_queue_eofblocks(
@@ -1203,15 +1204,15 @@ xfs_inode_match_id(
        struct xfs_inode        *ip,
        struct xfs_eofblocks    *eofb)
 {
-       if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
-           ip->i_d.di_uid != eofb->eof_uid)
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
+           !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
                return 0;
 
-       if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
-           ip->i_d.di_gid != eofb->eof_gid)
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
+           !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
                return 0;
 
-       if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+       if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
            xfs_get_projid(ip) != eofb->eof_prid)
                return 0;
 
index a01afbb3909a465a6e94f23bdc819530a3f22140..8a89f7d791bd9df3184fd9f66a15467f0f30f8dc 100644 (file)
 struct xfs_mount;
 struct xfs_perag;
 
+struct xfs_eofblocks {
+       __u32           eof_flags;
+       kuid_t          eof_uid;
+       kgid_t          eof_gid;
+       prid_t          eof_prid;
+       __u64           eof_min_file_size;
+};
+
 #define SYNC_WAIT              0x0001  /* wait for i/o to complete */
 #define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
 
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE                0x1
+#define XFS_IGET_UNTRUSTED     0x2
+#define XFS_IGET_DONTCACHE     0x4
+
 int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
             uint flags, uint lock_flags, xfs_inode_t **ipp);
 
@@ -49,4 +64,39 @@ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
                int flags, void *args),
        int flags, void *args, int tag);
 
+static inline int
+xfs_fs_eofblocks_from_user(
+       struct xfs_fs_eofblocks         *src,
+       struct xfs_eofblocks            *dst)
+{
+       if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+               return EINVAL;
+
+       if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+               return EINVAL;
+
+       if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+           memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+               return EINVAL;
+
+       dst->eof_flags = src->eof_flags;
+       dst->eof_prid = src->eof_prid;
+       dst->eof_min_file_size = src->eof_min_file_size;
+
+       dst->eof_uid = INVALID_UID;
+       if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+               dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+               if (!uid_valid(dst->eof_uid))
+                       return EINVAL;
+       }
+
+       dst->eof_gid = INVALID_GID;
+       if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+               dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+               if (!gid_valid(dst->eof_gid))
+                       return EINVAL;
+       }
+       return 0;
+}
+
 #endif
index 7716a4e7375e296e926ef402f8687169dcc295c3..5a5a593994d4196d3b18c2e959df90dc28c7588f 100644 (file)
 #include "xfs_types.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
-#include "xfs_inum.h"
 #include "xfs_trans.h"
-#include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_error.h"
 #include "xfs_icreate_item.h"
 
@@ -52,11 +40,14 @@ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
  *
  * We only need one iovec for the icreate log structure.
  */
-STATIC uint
+STATIC void
 xfs_icreate_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
 {
-       return 1;
+       *nvecs += 1;
+       *nbytes += sizeof(struct xfs_icreate_log);
 }
 
 /*
index 88ba8aa0bc41c0f3aa291da6b621dfea7445f56b..59e89f87c09b3fb8483326a30a9fcb879a4c4665 100644 (file)
 #ifndef XFS_ICREATE_ITEM_H
 #define XFS_ICREATE_ITEM_H     1
 
-/*
- * on disk log item structure
- *
- * Log recovery assumes the first two entries are the type and size and they fit
- * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
- * decoding can be done correctly.
- */
-struct xfs_icreate_log {
-       __uint16_t      icl_type;       /* type of log format structure */
-       __uint16_t      icl_size;       /* size of log format structure */
-       __be32          icl_ag;         /* ag being allocated in */
-       __be32          icl_agbno;      /* start block of inode range */
-       __be32          icl_count;      /* number of inodes to initialise */
-       __be32          icl_isize;      /* size of inodes */
-       __be32          icl_length;     /* length of extent to initialise */
-       __be32          icl_gen;        /* inode generation number to use */
-};
-
 /* in memory log item structure */
 struct xfs_icreate_item {
        struct xfs_log_item     ic_item;
index bb262c25c8de463276e9282d64b299c975eceb7b..e3d75385aa76a6e45b7711a65bb39c268c9f689b 100644 (file)
 
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
+#include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
+#include "xfs_attr.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_buf_item.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_error.h"
-#include "xfs_utils.h"
 #include "xfs_quota.h"
 #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_symlink.h"
 
-kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
 
 /*
@@ -58,9 +62,6 @@ kmem_zone_t *xfs_inode_zone;
 #define        XFS_ITRUNC_MAX_EXTENTS  2
 
 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
 
 /*
  * helper function to extract extent size hint from inode
@@ -310,623 +311,202 @@ xfs_isilocked(
 }
 #endif
 
-void
-__xfs_iflock(
-       struct xfs_inode        *ip)
-{
-       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-       do {
-               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-               if (xfs_isiflocked(ip))
-                       io_schedule();
-       } while (!xfs_iflock_nowait(ip));
-
-       finish_wait(wq, &wait.wait);
-}
-
 #ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
 /*
- * Make sure that the extents in the given memory buffer
- * are valid.
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with
+ * a different value
  */
-STATIC void
-xfs_validate_extents(
-       xfs_ifork_t             *ifp,
-       int                     nrecs,
-       xfs_exntfmt_t           fmt)
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
 {
-       xfs_bmbt_irec_t         irec;
-       xfs_bmbt_rec_host_t     rec;
-       int                     i;
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               rec.l0 = get_unaligned(&ep->l0);
-               rec.l1 = get_unaligned(&ep->l1);
-               xfs_bmbt_get_all(&rec, &irec);
-               if (fmt == XFS_EXTFMT_NOSTATE)
-                       ASSERT(irec.br_state == XFS_EXT_NORM);
-       }
+       return lock_mode;
 }
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
 
 /*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
  */
-#if defined(DEBUG)
 void
-xfs_inobp_check(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp)
-{
-       int             i;
-       int             j;
-       xfs_dinode_t    *dip;
-
-       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
-       for (i = 0; i < j; i++) {
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       i * mp->m_sb.sb_inodesize);
-               if (!dip->di_next_unlinked)  {
-                       xfs_alert(mp,
-       "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
-                               bp);
-                       ASSERT(dip->di_next_unlinked);
-               }
-       }
-}
-#endif
-
-static void
-xfs_inode_buf_verify(
-       struct xfs_buf  *bp)
+xfs_lock_inodes(
+       xfs_inode_t     **ips,
+       int             inodes,
+       uint            lock_mode)
 {
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       int             i;
-       int             ni;
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        */
-       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
-
-               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       xfs_buf_ioerror(bp, EFSCORRUPTED);
-                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-                                            mp, dip);
-#ifdef DEBUG
-                       xfs_emerg(mp,
-                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
-                               (unsigned long long)bp->b_bn, i,
-                               be16_to_cpu(dip->di_magic));
-                       ASSERT(0);
-#endif
-               }
-       }
-       xfs_inobp_check(mp, bp);
-}
+       int             attempts = 0, i, j, try_lock;
+       xfs_log_item_t  *lp;
 
+       ASSERT(ips && (inodes >= 2)); /* we need at least two */
 
-static void
-xfs_inode_buf_read_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp);
-}
-
-static void
-xfs_inode_buf_write_verify(
-       struct xfs_buf  *bp)
-{
-       xfs_inode_buf_verify(bp);
-}
+       try_lock = 0;
+       i = 0;
 
-const struct xfs_buf_ops xfs_inode_buf_ops = {
-       .verify_read = xfs_inode_buf_read_verify,
-       .verify_write = xfs_inode_buf_write_verify,
-};
+again:
+       for (; i < inodes; i++) {
+               ASSERT(ips[i]);
 
+               if (i && (ips[i] == ips[i-1]))  /* Already locked */
+                       continue;
 
-/*
- * This routine is called to map an inode to the buffer containing the on-disk
- * version of the inode.  It returns a pointer to the buffer containing the
- * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
- * pointer to the on-disk inode within that buffer.
- *
- * If a non-zero error is returned, then the contents of bpp and dipp are
- * undefined.
- */
-int
-xfs_imap_to_bp(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_imap         *imap,
-       struct xfs_dinode       **dipp,
-       struct xfs_buf          **bpp,
-       uint                    buf_flags,
-       uint                    iget_flags)
-{
-       struct xfs_buf          *bp;
-       int                     error;
+               /*
+                * If try_lock is not set yet, make sure all locked inodes
+                * are not in the AIL.
+                * If any are, set try_lock to be used later.
+                */
 
-       buf_flags |= XBF_UNMAPPED;
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, buf_flags, &bp,
-                                  &xfs_inode_buf_ops);
-       if (error) {
-               if (error == EAGAIN) {
-                       ASSERT(buf_flags & XBF_TRYLOCK);
-                       return error;
+               if (!try_lock) {
+                       for (j = (i - 1); j >= 0 && !try_lock; j--) {
+                               lp = (xfs_log_item_t *)ips[j]->i_itemp;
+                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+                                       try_lock++;
+                               }
+                       }
                }
 
-               if (error == EFSCORRUPTED &&
-                   (iget_flags & XFS_IGET_UNTRUSTED))
-                       return XFS_ERROR(EINVAL);
-
-               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
-                       __func__, error);
-               return error;
-       }
-
-       *bpp = bp;
-       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
-       return 0;
-}
-
-/*
- * Move inode type and inode format specific information from the
- * on-disk inode to the in-core inode.  For fifos, devs, and sockets
- * this means set if_rdev to the proper value.  For files, directories,
- * and symlinks this means to bring in the in-line data or extent
- * pointers.  For a file in B-tree format, only the root is immediately
- * brought in-core.  The rest will be in-lined in if_extents when it
- * is first referenced (see xfs_iread_extents()).
- */
-STATIC int
-xfs_iformat(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip)
-{
-       xfs_attr_shortform_t    *atp;
-       int                     size;
-       int                     error = 0;
-       xfs_fsize_t             di_size;
-
-       if (unlikely(be32_to_cpu(dip->di_nextents) +
-                    be16_to_cpu(dip->di_anextents) >
-                    be64_to_cpu(dip->di_nblocks))) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
-                       (unsigned long long)ip->i_ino,
-                       (int)(be32_to_cpu(dip->di_nextents) +
-                             be16_to_cpu(dip->di_anextents)),
-                       (unsigned long long)
-                               be64_to_cpu(dip->di_nblocks));
-               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
-               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
-                       (unsigned long long)ip->i_ino,
-                       dip->di_forkoff);
-               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
-                    !ip->i_mount->m_rtdev_targp)) {
-               xfs_warn(ip->i_mount,
-                       "corrupt dinode %Lu, has realtime flag set.",
-                       ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
-                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFIFO:
-       case S_IFCHR:
-       case S_IFBLK:
-       case S_IFSOCK:
-               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
-                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
-                                             ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-               ip->i_d.di_size = 0;
-               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
-               break;
+               /*
+                * If any of the previous locks we have locked is in the AIL,
+                * we must TRY to get the second and subsequent locks. If
+                * we can't get any, we must release all we have
+                * and try again.
+                */
 
-       case S_IFREG:
-       case S_IFLNK:
-       case S_IFDIR:
-               switch (dip->di_format) {
-               case XFS_DINODE_FMT_LOCAL:
+               if (try_lock) {
+                       /* try_lock must be 0 if i is 0. */
                        /*
-                        * no local regular files yet
+                        * try_lock means we have an inode locked
+                        * that is in the AIL.
                         */
-                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (local format for regular file).",
-                                       (unsigned long long) ip->i_ino);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
-                       }
+                       ASSERT(i != 0);
+                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+                               attempts++;
+
+                               /*
+                                * Unlock all previous guys and try again.
+                                * xfs_iunlock will try to push the tail
+                                * if the inode is in the AIL.
+                                */
+
+                               for(j = i - 1; j >= 0; j--) {
+
+                                       /*
+                                        * Check to see if we've already
+                                        * unlocked this one.
+                                        * Not the first one going back,
+                                        * and the inode ptr is the same.
+                                        */
+                                       if ((j != (i - 1)) && ips[j] ==
+                                                               ips[j+1])
+                                               continue;
+
+                                       xfs_iunlock(ips[j], lock_mode);
+                               }
 
-                       di_size = be64_to_cpu(dip->di_size);
-                       if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
-                               xfs_warn(ip->i_mount,
-                       "corrupt inode %Lu (bad size %Ld for local inode).",
-                                       (unsigned long long) ip->i_ino,
-                                       (long long) di_size);
-                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
-                                                    XFS_ERRLEVEL_LOW,
-                                                    ip->i_mount, dip);
-                               return XFS_ERROR(EFSCORRUPTED);
+                               if ((attempts % 5) == 0) {
+                                       delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+                                       xfs_lock_delays++;
+#endif
+                               }
+                               i = 0;
+                               try_lock = 0;
+                               goto again;
                        }
-
-                       size = (int)di_size;
-                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
-                       break;
-               case XFS_DINODE_FMT_EXTENTS:
-                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
-                       break;
-               case XFS_DINODE_FMT_BTREE:
-                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
-                       break;
-               default:
-                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
-                                        ip->i_mount);
-                       return XFS_ERROR(EFSCORRUPTED);
+               } else {
+                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                }
-               break;
-
-       default:
-               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       if (error) {
-               return error;
        }
-       if (!XFS_DFORK_Q(dip))
-               return 0;
-
-       ASSERT(ip->i_afp == NULL);
-       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-
-       switch (dip->di_aformat) {
-       case XFS_DINODE_FMT_LOCAL:
-               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
-               size = be16_to_cpu(atp->hdr.totsize);
-
-               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
-                       xfs_warn(ip->i_mount,
-                               "corrupt inode %Lu (bad attr fork size %Ld).",
-                               (unsigned long long) ip->i_ino,
-                               (long long) size);
-                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
-                                            XFS_ERRLEVEL_LOW,
-                                            ip->i_mount, dip);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
 
-               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
-               break;
-       case XFS_DINODE_FMT_EXTENTS:
-               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
-               break;
-       case XFS_DINODE_FMT_BTREE:
-               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
-               break;
-       default:
-               error = XFS_ERROR(EFSCORRUPTED);
-               break;
-       }
-       if (error) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+#ifdef DEBUG
+       if (attempts) {
+               if (attempts < 5) xfs_small_retries++;
+               else if (attempts < 100) xfs_middle_retries++;
+               else xfs_lots_retries++;
+       } else {
+               xfs_locked_n++;
        }
-       return error;
+#endif
 }
 
 /*
- * The file is in-lined in the on-disk inode.
- * If it fits into if_inline_data, then copy
- * it there, otherwise allocate a buffer for it
- * and copy the data there.  Either way, set
- * if_data to point at the data.
- * If we allocate a buffer for the data, make
- * sure that its size is a multiple of 4 and
- * record the real size in i_real_bytes.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
  */
-STATIC int
-xfs_iformat_local(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork,
-       int             size)
+void
+xfs_lock_two_inodes(
+       xfs_inode_t             *ip0,
+       xfs_inode_t             *ip1,
+       uint                    lock_mode)
 {
-       xfs_ifork_t     *ifp;
-       int             real_size;
-
-       /*
-        * If the size is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount,
-       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
-                       (unsigned long long) ip->i_ino, size,
-                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
-               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       real_size = 0;
-       if (size == 0)
-               ifp->if_u1.if_data = NULL;
-       else if (size <= sizeof(ifp->if_u2.if_inline_data))
-               ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-       else {
-               real_size = roundup(size, 4);
-               ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
-       }
-       ifp->if_bytes = size;
-       ifp->if_real_bytes = real_size;
-       if (size)
-               memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFINLINE;
-       return 0;
-}
+       xfs_inode_t             *temp;
+       int                     attempts = 0;
+       xfs_log_item_t          *lp;
 
-/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it.  Either way, set if_extents
- * to point at the extents.
- */
-STATIC int
-xfs_iformat_extents(
-       xfs_inode_t     *ip,
-       xfs_dinode_t    *dip,
-       int             whichfork)
-{
-       xfs_bmbt_rec_t  *dp;
-       xfs_ifork_t     *ifp;
-       int             nex;
-       int             size;
-       int             i;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
-       size = nex * (uint)sizeof(xfs_bmbt_rec_t);
-
-       /*
-        * If the number of extents is unreasonable, then something
-        * is wrong and we just bail out rather than crash in
-        * kmem_alloc() or memcpy() below.
-        */
-       if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
-                       (unsigned long long) ip->i_ino, nex);
-               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
-                                    ip->i_mount, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_real_bytes = 0;
-       if (nex == 0)
-               ifp->if_u1.if_extents = NULL;
-       else if (nex <= XFS_INLINE_EXTS)
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       else
-               xfs_iext_add(ifp, 0, nex);
-
-       ifp->if_bytes = size;
-       if (size) {
-               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
-               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
-               for (i = 0; i < nex; i++, dp++) {
-                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-                       ep->l0 = get_unaligned_be64(&dp->l0);
-                       ep->l1 = get_unaligned_be64(&dp->l1);
-               }
-               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
-               if (whichfork != XFS_DATA_FORK ||
-                       XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
-                               if (unlikely(xfs_check_nostate_extents(
-                                   ifp, 0, nex))) {
-                                       XFS_ERROR_REPORT("xfs_iformat_extents(2)",
-                                                        XFS_ERRLEVEL_LOW,
-                                                        ip->i_mount);
-                                       return XFS_ERROR(EFSCORRUPTED);
-                               }
-       }
-       ifp->if_flags |= XFS_IFEXTENTS;
-       return 0;
-}
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+       ASSERT(ip0->i_ino != ip1->i_ino);
 
-/*
- * The file has too many extents to fit into
- * the inode, so they are in B-tree format.
- * Allocate a buffer for the root of the B-tree
- * and copy the root into it.  The i_extents
- * field will remain NULL until all of the
- * extents are read in (when they are needed).
- */
-STATIC int
-xfs_iformat_btree(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_bmdr_block_t        *dfp;
-       xfs_ifork_t             *ifp;
-       /* REFERENCED */
-       int                     nrecs;
-       int                     size;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-       size = XFS_BMAP_BROOT_SPACE(mp, dfp);
-       nrecs = be16_to_cpu(dfp->bb_numrecs);
-
-       /*
-        * blow out if -- fork has less extents than can fit in
-        * fork (fork shouldn't be a btree format), root btree
-        * block has more records than can fit into the fork,
-        * or the number of extents is greater than the number of
-        * blocks.
-        */
-       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
-                                       XFS_IFORK_MAXEXT(ip, whichfork) ||
-                    XFS_BMDR_SPACE_CALC(nrecs) >
-                                       XFS_DFORK_SIZE(dip, mp, whichfork) ||
-                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-               xfs_warn(mp, "corrupt inode %Lu (btree).",
-                                       (unsigned long long) ip->i_ino);
-               XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-                                        mp, dip);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       ifp->if_broot_bytes = size;
-       ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
-       ASSERT(ifp->if_broot != NULL);
-       /*
-        * Copy and convert from the on-disk structure
-        * to the in-memory structure.
-        */
-       xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-                        ifp->if_broot, size);
-       ifp->if_flags &= ~XFS_IFEXTENTS;
-       ifp->if_flags |= XFS_IFBROOT;
+       if (ip0->i_ino > ip1->i_ino) {
+               temp = ip0;
+               ip0 = ip1;
+               ip1 = temp;
+       }
 
-       return 0;
-}
+ again:
+       xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
 
-STATIC void
-xfs_dinode_from_disk(
-       xfs_icdinode_t          *to,
-       xfs_dinode_t            *from)
-{
-       to->di_magic = be16_to_cpu(from->di_magic);
-       to->di_mode = be16_to_cpu(from->di_mode);
-       to->di_version = from ->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = be16_to_cpu(from->di_onlink);
-       to->di_uid = be32_to_cpu(from->di_uid);
-       to->di_gid = be32_to_cpu(from->di_gid);
-       to->di_nlink = be32_to_cpu(from->di_nlink);
-       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
-       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_flushiter = be16_to_cpu(from->di_flushiter);
-       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
-       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
-       to->di_size = be64_to_cpu(from->di_size);
-       to->di_nblocks = be64_to_cpu(from->di_nblocks);
-       to->di_extsize = be32_to_cpu(from->di_extsize);
-       to->di_nextents = be32_to_cpu(from->di_nextents);
-       to->di_anextents = be16_to_cpu(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat  = from->di_aformat;
-       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
-       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
-       to->di_flags    = be16_to_cpu(from->di_flags);
-       to->di_gen      = be32_to_cpu(from->di_gen);
-
-       if (to->di_version == 3) {
-               to->di_changecount = be64_to_cpu(from->di_changecount);
-               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
-               to->di_flags2 = be64_to_cpu(from->di_flags2);
-               to->di_ino = be64_to_cpu(from->di_ino);
-               to->di_lsn = be64_to_cpu(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
+       /*
+        * If the first lock we have locked is in the AIL, we must TRY to get
+        * the second lock. If we can't get it, we must release the first one
+        * and try again.
+        */
+       lp = (xfs_log_item_t *)ip0->i_itemp;
+       if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+               if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+                       xfs_iunlock(ip0, lock_mode);
+                       if ((++attempts % 5) == 0)
+                               delay(1); /* Don't just spin the CPU */
+                       goto again;
+               }
+       } else {
+               xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
        }
 }
 
+
 void
-xfs_dinode_to_disk(
-       xfs_dinode_t            *to,
-       xfs_icdinode_t          *from)
+__xfs_iflock(
+       struct xfs_inode        *ip)
 {
-       to->di_magic = cpu_to_be16(from->di_magic);
-       to->di_mode = cpu_to_be16(from->di_mode);
-       to->di_version = from ->di_version;
-       to->di_format = from->di_format;
-       to->di_onlink = cpu_to_be16(from->di_onlink);
-       to->di_uid = cpu_to_be32(from->di_uid);
-       to->di_gid = cpu_to_be32(from->di_gid);
-       to->di_nlink = cpu_to_be32(from->di_nlink);
-       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
-       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
-       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
-       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
-       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
-       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
-       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
-       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-       to->di_size = cpu_to_be64(from->di_size);
-       to->di_nblocks = cpu_to_be64(from->di_nblocks);
-       to->di_extsize = cpu_to_be32(from->di_extsize);
-       to->di_nextents = cpu_to_be32(from->di_nextents);
-       to->di_anextents = cpu_to_be16(from->di_anextents);
-       to->di_forkoff = from->di_forkoff;
-       to->di_aformat = from->di_aformat;
-       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
-       to->di_dmstate = cpu_to_be16(from->di_dmstate);
-       to->di_flags = cpu_to_be16(from->di_flags);
-       to->di_gen = cpu_to_be32(from->di_gen);
-
-       if (from->di_version == 3) {
-               to->di_changecount = cpu_to_be64(from->di_changecount);
-               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
-               to->di_flags2 = cpu_to_be64(from->di_flags2);
-               to->di_ino = cpu_to_be64(from->di_ino);
-               to->di_lsn = cpu_to_be64(from->di_lsn);
-               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
-               uuid_copy(&to->di_uuid, &from->di_uuid);
-               to->di_flushiter = 0;
-       } else {
-               to->di_flushiter = cpu_to_be16(from->di_flushiter);
-       }
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
 }
 
 STATIC uint
@@ -987,234 +567,49 @@ xfs_dic2xflags(
                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 }
 
-static bool
-xfs_dinode_verify(
-       struct xfs_mount        *mp,
-       struct xfs_inode        *ip,
-       struct xfs_dinode       *dip)
-{
-       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
-               return false;
-
-       /* only version 3 or greater inodes are extensively verified here */
-       if (dip->di_version < 3)
-               return true;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             offsetof(struct xfs_dinode, di_crc)))
-               return false;
-       if (be64_to_cpu(dip->di_ino) != ip->i_ino)
-               return false;
-       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       return true;
-}
-
-void
-xfs_dinode_calc_crc(
-       struct xfs_mount        *mp,
-       struct xfs_dinode       *dip)
-{
-       __uint32_t              crc;
-
-       if (dip->di_version < 3)
-               return;
-
-       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
-       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
-                             offsetof(struct xfs_dinode, di_crc));
-       dip->di_crc = xfs_end_cksum(crc);
-}
-
 /*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
+ * Looks up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
  */
 int
-xfs_iread(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       uint            iget_flags)
+xfs_lookup(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       xfs_inode_t             **ipp,
+       struct xfs_name         *ci_name)
 {
-       xfs_buf_t       *bp;
-       xfs_dinode_t    *dip;
-       int             error;
-
-       /*
-        * Fill in the location information in the in-core inode.
-        */
-       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
-       if (error)
-               return error;
-
-       /* shortcut IO on inode allocation if possible */
-       if ((iget_flags & XFS_IGET_CREATE) &&
-           xfs_sb_version_hascrc(&mp->m_sb) &&
-           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
-               /* initialise the on-disk inode core */
-               memset(&ip->i_d, 0, sizeof(ip->i_d));
-               ip->i_d.di_magic = XFS_DINODE_MAGIC;
-               ip->i_d.di_gen = prandom_u32();
-               if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                       ip->i_d.di_version = 3;
-                       ip->i_d.di_ino = ip->i_ino;
-                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
-               } else
-                       ip->i_d.di_version = 2;
-               return 0;
-       }
-
-       /*
-        * Get pointers to the on-disk inode and the buffer containing it.
-        */
-       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
-       if (error)
-               return error;
-
-       /* even unallocated inodes are verified */
-       if (!xfs_dinode_verify(mp, ip, dip)) {
-               xfs_alert(mp, "%s: validation failed for inode %lld failed",
-                               __func__, ip->i_ino);
+       xfs_ino_t               inum;
+       int                     error;
+       uint                    lock_mode;
 
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-               error = XFS_ERROR(EFSCORRUPTED);
-               goto out_brelse;
-       }
+       trace_xfs_lookup(dp, name);
 
-       /*
-        * If the on-disk inode is already linked to a directory
-        * entry, copy all of the inode into the in-core inode.
-        * xfs_iformat() handles copying in the inode format
-        * specific information.
-        * Otherwise, just get the truly permanent information.
-        */
-       if (dip->di_mode) {
-               xfs_dinode_from_disk(&ip->i_d, dip);
-               error = xfs_iformat(ip, dip);
-               if (error)  {
-#ifdef DEBUG
-                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
-                               __func__, error);
-#endif /* DEBUG */
-                       goto out_brelse;
-               }
-       } else {
-               /*
-                * Partial initialisation of the in-core inode. Just the bits
-                * that xfs_ialloc won't overwrite or relies on being correct.
-                */
-               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
-               ip->i_d.di_version = dip->di_version;
-               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
-               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
-               if (dip->di_version == 3) {
-                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
-                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
-               }
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return XFS_ERROR(EIO);
 
-               /*
-                * Make sure to pull in the mode here as well in
-                * case the inode is released without being used.
-                * This ensures that xfs_inactive() will see that
-                * the inode is already free and not try to mess
-                * with the uninitialized part of it.
-                */
-               ip->i_d.di_mode = 0;
-       }
+       lock_mode = xfs_ilock_map_shared(dp);
+       error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+       xfs_iunlock_map_shared(dp, lock_mode);
 
-       /*
-        * The inode format changed when we moved the link count and
-        * made it 32 bits long.  If this is an old format inode,
-        * convert it in memory to look like a new one.  If it gets
-        * flushed to disk we will convert back before flushing or
-        * logging it.  We zero out the new projid field and the old link
-        * count field.  We'll handle clearing the pad field (the remains
-        * of the old uuid field) when we actually convert the inode to
-        * the new format. We don't change the version number so that we
-        * can distinguish this from a real new format inode.
-        */
-       if (ip->i_d.di_version == 1) {
-               ip->i_d.di_nlink = ip->i_d.di_onlink;
-               ip->i_d.di_onlink = 0;
-               xfs_set_projid(ip, 0);
-       }
+       if (error)
+               goto out;
 
-       ip->i_delayed_blks = 0;
+       error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+       if (error)
+               goto out_free_name;
 
-       /*
-        * Mark the buffer containing the inode as something to keep
-        * around for a while.  This helps to keep recently accessed
-        * meta-data in-core longer.
-        */
-       xfs_buf_set_ref(bp, XFS_INO_REF);
+       return 0;
 
-       /*
-        * Use xfs_trans_brelse() to release the buffer containing the on-disk
-        * inode, because it was acquired with xfs_trans_read_buf() in
-        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
-        * brelse().  If we're within a transaction, then xfs_trans_brelse()
-        * will only release the buffer if it is not dirty within the
-        * transaction.  It will be OK to release the buffer in this case,
-        * because inodes on disk are never destroyed and we will be locking the
-        * new in-core inode before putting it in the cache where other
-        * processes can find it.  Thus we don't have to worry about the inode
-        * being changed just because we released the buffer.
-        */
- out_brelse:
-       xfs_trans_brelse(tp, bp);
+out_free_name:
+       if (ci_name)
+               kmem_free(ci_name->name);
+out:
+       *ipp = NULL;
        return error;
 }
 
-/*
- * Read in extents from a btree-format inode.
- * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
- */
-int
-xfs_iread_extents(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       int             error;
-       xfs_ifork_t     *ifp;
-       xfs_extnum_t    nextents;
-
-       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
-               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
-                                ip->i_mount);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-       nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-
-       /*
-        * We know that the size is valid (it's checked in iformat_btree)
-        */
-       ifp->if_bytes = ifp->if_real_bytes = 0;
-       ifp->if_flags |= XFS_IFEXTENTS;
-       xfs_iext_add(ifp, 0, nextents);
-       error = xfs_bmap_read_extents(tp, ip, whichfork);
-       if (error) {
-               xfs_iext_destroy(ifp);
-               ifp->if_flags &= ~XFS_IFEXTENTS;
-               return error;
-       }
-       xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
-       return 0;
-}
-
 /*
  * Allocate an inode on disk and return a copy of its in-core version.
  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
@@ -1295,8 +690,8 @@ xfs_ialloc(
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
        ASSERT(ip->i_d.di_nlink == nlink);
-       ip->i_d.di_uid = current_fsuid();
-       ip->i_d.di_gid = current_fsgid();
+       ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+       ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
        xfs_set_projid(ip, prid);
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1335,7 +730,7 @@ xfs_ialloc(
         */
        if ((irix_sgid_inherit) &&
            (ip->i_d.di_mode & S_ISGID) &&
-           (!in_group_p((gid_t)ip->i_d.di_gid))) {
+           (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
                ip->i_d.di_mode &= ~S_ISGID;
        }
 
@@ -1467,31 +862,608 @@ xfs_ialloc(
 }
 
 /*
- * Free up the underlying blocks past new_size.  The new size must be smaller
- * than the current size.  This routine can be used both for the attribute and
- * data fork, and does not modify the inode size, which is left to the caller.
+ * Allocates a new inode from disk and returns a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
  *
- * The transaction passed to this routine must have made a permanent log
- * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
- * given transaction and start new ones, so make sure everything involved in
- * the transaction is tidy before calling here.  Some transaction will be
- * returned to the caller to be committed.  The incoming transaction must
- * already include the inode, and both inode locks must be held exclusively.
- * The inode must also be "held" within the transaction.  On return the inode
- * will be "held" within the returned transaction.  This routine does NOT
- * require any disk space to be reserved for it within the transaction.
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
  *
- * If we get an error, we must return with the inode locked and linked into the
- * current transaction. This keeps things simple for the higher level code,
- * because it always knows that the inode is locked and held in the transaction
- * that returns to it whether errors occur or not.  We don't mark the inode
- * dirty on error so that transactions can be easily aborted if possible.
  */
 int
-xfs_itruncate_extents(
-       struct xfs_trans        **tpp,
-       struct xfs_inode        *ip,
-       int                     whichfork,
+xfs_dir_ialloc(
+       xfs_trans_t     **tpp,          /* input: current transaction;
+                                          output: may be a new transaction. */
+       xfs_inode_t     *dp,            /* directory within which to allocate
+                                          the inode. */
+       umode_t         mode,
+       xfs_nlink_t     nlink,
+       xfs_dev_t       rdev,
+       prid_t          prid,           /* project id */
+       int             okalloc,        /* ok to allocate new space */
+       xfs_inode_t     **ipp,          /* pointer to inode; it will be
+                                          locked. */
+       int             *committed)
+
+{
+       xfs_trans_t     *tp;
+       xfs_trans_t     *ntp;
+       xfs_inode_t     *ip;
+       xfs_buf_t       *ialloc_context = NULL;
+       int             code;
+       void            *dqinfo;
+       uint            tflags;
+
+       tp = *tpp;
+       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+       /*
+        * xfs_ialloc will return a pointer to an incore inode if
+        * the Space Manager has an available inode on the free
+        * list. Otherwise, it will do an allocation and replenish
+        * the freelist.  Since we can only do one allocation per
+        * transaction without deadlocks, we will need to commit the
+        * current transaction and start a new one.  We will then
+        * need to call xfs_ialloc again to get the inode.
+        *
+        * If xfs_ialloc did an allocation to replenish the freelist,
+        * it returns the bp containing the head of the freelist as
+        * ialloc_context. We will hold a lock on it across the
+        * transaction commit so that no other process can steal
+        * the inode(s) that we've just allocated.
+        */
+       code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
+                         &ialloc_context, &ip);
+
+       /*
+        * Return an error if we were unable to allocate a new inode.
+        * This should only happen if we run out of space on disk or
+        * encounter a disk error.
+        */
+       if (code) {
+               *ipp = NULL;
+               return code;
+       }
+       if (!ialloc_context && !ip) {
+               *ipp = NULL;
+               return XFS_ERROR(ENOSPC);
+       }
+
+       /*
+        * If the AGI buffer is non-NULL, then we were unable to get an
+        * inode in one operation.  We need to commit the current
+        * transaction and call xfs_ialloc() again.  It is guaranteed
+        * to succeed the second time.
+        */
+       if (ialloc_context) {
+               struct xfs_trans_res tres;
+
+               /*
+                * Normally, xfs_trans_commit releases all the locks.
+                * We call bhold to hang on to the ialloc_context across
+                * the commit.  Holding this buffer prevents any other
+                * processes from doing any allocations in this
+                * allocation group.
+                */
+               xfs_trans_bhold(tp, ialloc_context);
+               /*
+                * Save the log reservation so we can use
+                * them in the next transaction.
+                */
+               tres.tr_logres = xfs_trans_get_log_res(tp);
+               tres.tr_logcount = xfs_trans_get_log_count(tp);
+
+               /*
+                * We want the quota changes to be associated with the next
+                * transaction, NOT this one. So, detach the dqinfo from this
+                * and attach it to the next transaction.
+                */
+               dqinfo = NULL;
+               tflags = 0;
+               if (tp->t_dqinfo) {
+                       dqinfo = (void *)tp->t_dqinfo;
+                       tp->t_dqinfo = NULL;
+                       tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+                       tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+               }
+
+               ntp = xfs_trans_dup(tp);
+               code = xfs_trans_commit(tp, 0);
+               tp = ntp;
+               if (committed != NULL) {
+                       *committed = 1;
+               }
+               /*
+                * If we get an error during the commit processing,
+                * release the buffer that is still held and return
+                * to the caller.
+                */
+               if (code) {
+                       xfs_buf_relse(ialloc_context);
+                       if (dqinfo) {
+                               tp->t_dqinfo = dqinfo;
+                               xfs_trans_free_dqinfo(tp);
+                       }
+                       *tpp = ntp;
+                       *ipp = NULL;
+                       return code;
+               }
+
+               /*
+                * transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(tp->t_ticket);
+               tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+               code = xfs_trans_reserve(tp, &tres, 0, 0);
+
+               /*
+                * Re-attach the quota info that we detached from prev trx.
+                */
+               if (dqinfo) {
+                       tp->t_dqinfo = dqinfo;
+                       tp->t_flags |= tflags;
+               }
+
+               if (code) {
+                       xfs_buf_relse(ialloc_context);
+                       *tpp = ntp;
+                       *ipp = NULL;
+                       return code;
+               }
+               xfs_trans_bjoin(tp, ialloc_context);
+
+               /*
+                * Call ialloc again. Since we've locked out all
+                * other allocations in this allocation group,
+                * this call should always succeed.
+                */
+               code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
+                                 okalloc, &ialloc_context, &ip);
+
+               /*
+                * If we get an error at this point, return to the caller
+                * so that the current transaction can be aborted.
+                */
+               if (code) {
+                       *tpp = tp;
+                       *ipp = NULL;
+                       return code;
+               }
+               ASSERT(!ialloc_context && ip);
+
+       } else {
+               if (committed != NULL)
+                       *committed = 0;
+       }
+
+       *ipp = ip;
+       *tpp = tp;
+
+       return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int                            /* error */
+xfs_droplink(
+       xfs_trans_t *tp,
+       xfs_inode_t *ip)
+{
+       int     error;
+
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+       ASSERT (ip->i_d.di_nlink > 0);
+       ip->i_d.di_nlink--;
+       drop_nlink(VFS_I(ip));
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       error = 0;
+       if (ip->i_d.di_nlink == 0) {
+               /*
+                * We're dropping the last link to this file.
+                * Move the on-disk inode to the AGI unlinked list.
+                * From xfs_inactive() we will pull the inode from
+                * the list and free it.
+                */
+               error = xfs_iunlink(tp, ip);
+       }
+       return error;
+}
+
+/*
+ * This gets called when the inode's version needs to be changed from 1 to 2.
+ * Currently this happens when the nlink field overflows the old 16-bit value
+ * or when chproj is called to change the project for the first time.
+ * As a side effect the superblock version will also get rev'd
+ * to contain the NLINK bit.
+ */
+void
+xfs_bump_ino_vers2(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(ip->i_d.di_version == 1);
+
+       ip->i_d.di_version = 2;
+       ip->i_d.di_onlink = 0;
+       memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
+       mp = tp->t_mountp;
+       if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+               spin_lock(&mp->m_sb_lock);
+               if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
+                       xfs_sb_version_addnlink(&mp->m_sb);
+                       spin_unlock(&mp->m_sb_lock);
+                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+               } else {
+                       spin_unlock(&mp->m_sb_lock);
+               }
+       }
+       /* Caller must log the inode */
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+       xfs_trans_t *tp,
+       xfs_inode_t *ip)
+{
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+       ASSERT(ip->i_d.di_nlink > 0);
+       ip->i_d.di_nlink++;
+       inc_nlink(VFS_I(ip));
+       if ((ip->i_d.di_version == 1) &&
+           (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
+               /*
+                * The inode has increased its number of links beyond
+                * what can fit in an old format inode.  It now needs
+                * to be converted to a version 2 inode with a 32 bit
+                * link count.  If this is the first inode in the file
+                * system to do this, then we need to bump the superblock
+                * version number as well.
+                */
+               xfs_bump_ino_vers2(tp, ip);
+       }
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       return 0;
+}
+
+int
+xfs_create(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       umode_t                 mode,
+       xfs_dev_t               rdev,
+       xfs_inode_t             **ipp)
+{
+       int                     is_dir = S_ISDIR(mode);
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_inode        *ip = NULL;
+       struct xfs_trans        *tp = NULL;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       bool                    unlock_dp_on_error = false;
+       uint                    cancel_flags;
+       int                     committed;
+       prid_t                  prid;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
+       struct xfs_dquot        *pdqp = NULL;
+       struct xfs_trans_res    tres;
+       uint                    resblks;
+
+       trace_xfs_create(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+               prid = xfs_get_projid(dp);
+       else
+               prid = XFS_PROJID_DEFAULT;
+
+       /*
+        * Make sure that we have allocated dquot(s) on disk.
+        */
+       error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+                                       xfs_kgid_to_gid(current_fsgid()), prid,
+                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                                       &udqp, &gdqp, &pdqp);
+       if (error)
+               return error;
+
+       if (is_dir) {
+               rdev = 0;
+               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+               tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
+               tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+       } else {
+               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+               tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
+               tres.tr_logcount = XFS_CREATE_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+       }
+
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+       /*
+        * Initially assume that the file does not exist and
+        * reserve the resources for that case.  If that is not
+        * the case we'll drop the one we have and get a more
+        * appropriate transaction later.
+        */
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(tp, &tres, resblks, 0);
+       if (error == ENOSPC) {
+               /* flush outstanding delalloc blocks and retry */
+               xfs_flush_inodes(mp);
+               error = xfs_trans_reserve(tp, &tres, resblks, 0);
+       }
+       if (error == ENOSPC) {
+               /* No space at all so try a "no-allocation" reservation */
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &tres, 0, 0);
+       }
+       if (error) {
+               cancel_flags = 0;
+               goto out_trans_cancel;
+       }
+
+       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       unlock_dp_on_error = true;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       /*
+        * Reserve disk quota and the inode.
+        */
+       error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+                                               pdqp, resblks, 1, 0);
+       if (error)
+               goto out_trans_cancel;
+
+       error = xfs_dir_canenter(tp, dp, name, resblks);
+       if (error)
+               goto out_trans_cancel;
+
+       /*
+        * A newly created regular or special file just has one directory
+        * entry pointing to it, but a directory also has the "." entry
+        * pointing to itself.
+        */
+       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+                              prid, resblks > 0, &ip, &committed);
+       if (error) {
+               if (error == ENOSPC)
+                       goto out_trans_cancel;
+               goto out_trans_abort;
+       }
+
+       /*
+        * Now we join the directory inode to the transaction.  We do not do it
+        * earlier because xfs_dir_ialloc might commit the previous transaction
+        * (and release all the locks).  An error from here on will result in
+        * the transaction cancel unlocking dp so don't do it explicitly in the
+        * error path.
+        */
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       unlock_dp_on_error = false;
+
+       error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks ?
+                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+       if (error) {
+               ASSERT(error != ENOSPC);
+               goto out_trans_abort;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+       if (is_dir) {
+               error = xfs_dir_init(tp, ip, dp);
+               if (error)
+                       goto out_bmap_cancel;
+
+               error = xfs_bumplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
+       }
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * create transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       /*
+        * Attach the dquot(s) to the inodes and modify them incore.
+        * These ids of the inode couldn't have changed since the new
+        * inode has been locked ever since it was created.
+        */
+       xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto out_release_inode;
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       *ipp = ip;
+       return 0;
+
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+       cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+       /*
+        * Wait until after the current transaction is aborted to
+        * release the inode.  This prevents recursive transactions
+        * and deadlocks from xfs_inactive.
+        */
+       if (ip)
+               IRELE(ip);
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       if (unlock_dp_on_error)
+               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return error;
+}
+
+int
+xfs_link(
+       xfs_inode_t             *tdp,
+       xfs_inode_t             *sip,
+       struct xfs_name         *target_name)
+{
+       xfs_mount_t             *mp = tdp->i_mount;
+       xfs_trans_t             *tp;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;
+       int                     committed;
+       int                     resblks;
+
+       trace_xfs_link(tdp, target_name);
+
+       ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(sip, 0);
+       if (error)
+               goto std_return;
+
+       error = xfs_qm_dqattach(tdp, 0);
+       if (error)
+               goto std_return;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
+       if (error == ENOSPC) {
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
+       }
+       if (error) {
+               cancel_flags = 0;
+               goto error_return;
+       }
+
+       xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+       /*
+        * If we are using project inheritance, we only allow hard link
+        * creation in our tree when the project IDs are the same; else
+        * the tree quota mechanism could be circumvented.
+        */
+       if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
+       error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+       if (error)
+               goto error_return;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error)
+               goto abort_return;
+       xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+       error = xfs_bumplink(tp, sip);
+       if (error)
+               goto abort_return;
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * link transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+               xfs_trans_set_sync(tp);
+       }
+
+       error = xfs_bmap_finish (&tp, &free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(&free_list);
+               goto abort_return;
+       }
+
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+       cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
+}
+
+/*
+ * Free up the underlying blocks past new_size.  The new size must be smaller
+ * than the current size.  This routine can be used both for the attribute and
+ * data fork, and does not modify the inode size, which is left to the caller.
+ *
+ * The transaction passed to this routine must have made a permanent log
+ * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
+ * given transaction and start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.  Some transaction will be
+ * returned to the caller to be committed.  The incoming transaction must
+ * already include the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On return the inode
+ * will be "held" within the returned transaction.  This routine does NOT
+ * require any disk space to be reserved for it within the transaction.
+ *
+ * If we get an error, we must return with the inode locked and linked into the
+ * current transaction. This keeps things simple for the higher level code,
+ * because it always knows that the inode is locked and held in the transaction
+ * that returns to it whether errors occur or not.  We don't mark the inode
+ * dirty on error so that transactions can be easily aborted if possible.
+ */
+int
+xfs_itruncate_extents(
+       struct xfs_trans        **tpp,
+       struct xfs_inode        *ip,
+       int                     whichfork,
        xfs_fsize_t             new_size)
 {
        struct xfs_mount        *mp = ip->i_mount;
@@ -1572,37 +1544,299 @@ xfs_itruncate_extents(
                        goto out;
 
                /*
-                * Transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
+                * Transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(tp->t_ticket);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+               if (error)
+                       goto out;
+       }
+
+       /*
+        * Always re-log the inode so that our permanent transaction can keep
+        * on rolling it forward in the log.
+        */
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       trace_xfs_itruncate_extents_end(ip, new_size);
+
+out:
+       *tpp = tp;
+       return error;
+out_bmap_cancel:
+       /*
+        * If the bunmapi call encounters an error, return to the caller where
+        * the transaction can be properly aborted.  We just need to make sure
+        * we're not holding any resources that we were not when we came in.
+        */
+       xfs_bmap_cancel(&free_list);
+       goto out;
+}
+
+int
+xfs_release(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       int             error;
+
+       if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+               return 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return 0;
+
+       if (!XFS_FORCED_SHUTDOWN(mp)) {
+               int truncated;
+
+               /*
+                * If we are using filestreams, and we have an unlinked
+                * file that we are processing the last close on, then nothing
+                * will be able to reopen and write to this file. Purge this
+                * inode from the filestreams cache so that it doesn't delay
+                * teardown of the inode.
+                */
+               if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
+                       xfs_filestream_deassociate(ip);
+
+               /*
+                * If we previously truncated this file and removed old data
+                * in the process, we want to initiate "early" writeout on
+                * the last close.  This is an attempt to combat the notorious
+                * NULL files problem which is particularly noticeable from a
+                * truncate down, buffered (re-)write (delalloc), followed by
+                * a crash.  What we are effectively doing here is
+                * significantly reducing the time window where we'd otherwise
+                * be exposed to that problem.
+                */
+               truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+               if (truncated) {
+                       xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+                               error = -filemap_flush(VFS_I(ip)->i_mapping);
+                               if (error)
+                                       return error;
+                       }
+               }
+       }
+
+       if (ip->i_d.di_nlink == 0)
+               return 0;
+
+       if (xfs_can_free_eofblocks(ip, false)) {
+
+               /*
+                * If we can't get the iolock just skip truncating the blocks
+                * past EOF because we could deadlock with the mmap_sem
+                * otherwise.  We'll get another chance to drop them once the
+                * last reference to the inode is dropped, so we'll never leak
+                * blocks permanently.
+                *
+                * Further, check if the inode is being opened, written and
+                * closed frequently and we have delayed allocation blocks
+                * outstanding (e.g. streaming writes from the NFS server),
+                * truncating the blocks past EOF will cause fragmentation to
+                * occur.
+                *
+                * In this case don't do the truncation, either, but we have to
+                * be careful how we detect this case. Blocks beyond EOF show
+                * up as i_delayed_blks even when the inode is clean, so we
+                * need to truncate them away first before checking for a dirty
+                * release. Hence on the first dirty close we will still remove
+                * the speculative allocation, but after that we will leave it
+                * in place.
+                */
+               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                       return 0;
+
+               error = xfs_free_eofblocks(mp, ip, true);
+               if (error && error != EAGAIN)
+                       return error;
+
+               /* delalloc blocks after truncation means it really is dirty */
+               if (ip->i_delayed_blks)
+                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+       }
+       return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero.  If the file has been unlinked, then it must
+ * now be truncated.  Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+int
+xfs_inactive(
+       xfs_inode_t     *ip)
+{
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     committed;
+       struct xfs_trans        *tp;
+       struct xfs_mount        *mp;
+       struct xfs_trans_res    *resp;
+       int                     error;
+       int                     truncate = 0;
+
+       /*
+        * If the inode is already free, then there can be nothing
+        * to clean up here.
+        */
+       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+               ASSERT(ip->i_df.if_real_bytes == 0);
+               ASSERT(ip->i_df.if_broot_bytes == 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       mp = ip->i_mount;
+
+       error = 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               goto out;
+
+       if (ip->i_d.di_nlink != 0) {
+               /*
+                * force is true because we are evicting an inode from the
+                * cache. Post-eof blocks must be freed, lest we end up with
+                * broken free space accounting.
+                */
+               if (xfs_can_free_eofblocks(ip, true)) {
+                       error = xfs_free_eofblocks(mp, ip, false);
+                       if (error)
+                               return VN_INACTIVE_CACHE;
+               }
+               goto out;
+       }
+
+       if (S_ISREG(ip->i_d.di_mode) &&
+           (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+            ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+               truncate = 1;
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return VN_INACTIVE_CACHE;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+       resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
+               &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
+
+       error = xfs_trans_reserve(tp, resp, 0, 0);
+       if (error) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               xfs_trans_cancel(tp, 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       if (S_ISLNK(ip->i_d.di_mode)) {
+               error = xfs_inactive_symlink(ip, &tp);
+               if (error)
+                       goto out_cancel;
+       } else if (truncate) {
+               ip->i_d.di_size = 0;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+               if (error)
+                       goto out_cancel;
+
+               ASSERT(ip->i_d.di_nextents == 0);
+       }
+
+       /*
+        * If there are attributes associated with the file then blow them away
+        * now.  The code calls a routine that recursively deconstructs the
+        * attribute fork.  We need to just commit the current transaction
+        * because we can't use it for xfs_attr_inactive().
+        */
+       if (ip->i_d.di_anextents > 0) {
+               ASSERT(ip->i_d.di_forkoff != 0);
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       goto out_unlock;
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               error = xfs_attr_inactive(ip);
+               if (error)
+                       goto out;
+
+               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       goto out;
+               }
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+       }
+
+       if (ip->i_afp)
+               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+       ASSERT(ip->i_d.di_anextents == 0);
+
+       /*
+        * Free the inode.
+        */
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_ifree(tp, ip, &free_list);
+       if (error) {
+               /*
+                * If we fail to free the inode, shut down.  The cancel
+                * might do that, we need to make sure.  Otherwise the
+                * inode might be lost for a long time or forever.
+                */
+               if (!XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_notice(mp, "%s: xfs_ifree returned error %d",
+                               __func__, error);
+                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               }
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       } else {
+               /*
+                * Credit the quota account(s). The inode is gone.
+                */
+               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+               /*
+                * Just ignore errors at this point.  There is nothing we can
+                * do except to try to keep going. Make sure it's not a silent
+                * error.
                 */
-               xfs_log_ticket_put(tp->t_ticket);
-               error = xfs_trans_reserve(tp, 0,
-                                       XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                       XFS_TRANS_PERM_LOG_RES,
-                                       XFS_ITRUNCATE_LOG_COUNT);
+               error = xfs_bmap_finish(&tp,  &free_list, &committed);
                if (error)
-                       goto out;
+                       xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+                               __func__, error);
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+                               __func__, error);
        }
 
        /*
-        * Always re-log the inode so that our permanent transaction can keep
-        * on rolling it forward in the log.
+        * Release the dquots held by inode, if any.
         */
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       trace_xfs_itruncate_extents_end(ip, new_size);
-
+       xfs_qm_dqdetach(ip);
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
-       *tpp = tp;
-       return error;
-out_bmap_cancel:
-       /*
-        * If the bunmapi call encounters an error, return to the caller where
-        * the transaction can be properly aborted.  We just need to make sure
-        * we're not holding any resources that we were not when we came in.
-        */
-       xfs_bmap_cancel(&free_list);
-       goto out;
+       return VN_INACTIVE_CACHE;
+out_cancel:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       goto out_unlock;
 }
 
 /*
@@ -1861,7 +2095,7 @@ xfs_iunlink_remove(
 }
 
 /*
- * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * A big issue when freeing the inode cluster is that we _cannot_ skip any
  * inodes that are in memory - they all must be marked stale and attached to
  * the cluster buffer.
  */
@@ -2093,272 +2327,6 @@ xfs_ifree(
        return error;
 }
 
-/*
- * Reallocate the space for if_broot based on the number of records
- * being added or deleted as indicated in rec_diff.  Move the records
- * and pointers in if_broot to fit the new size.  When shrinking this
- * will eliminate holes between the records and pointers created by
- * the caller.  When growing this will create holes to be filled in
- * by the caller.
- *
- * The caller must not request to add more records than would fit in
- * the on-disk inode root.  If the if_broot is currently NULL, then
- * if we adding records one will be allocated.  The caller must also
- * not request that the number of records go below zero, although
- * it can go to zero.
- *
- * ip -- the inode whose if_broot area is changing
- * ext_diff -- the change in the number of records, positive or negative,
- *      requested for the if_broot array.
- */
-void
-xfs_iroot_realloc(
-       xfs_inode_t             *ip,
-       int                     rec_diff,
-       int                     whichfork)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     cur_max;
-       xfs_ifork_t             *ifp;
-       struct xfs_btree_block  *new_broot;
-       int                     new_max;
-       size_t                  new_size;
-       char                    *np;
-       char                    *op;
-
-       /*
-        * Handle the degenerate case quietly.
-        */
-       if (rec_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (rec_diff > 0) {
-               /*
-                * If there wasn't any memory allocated before, just
-                * allocate it now and get out.
-                */
-               if (ifp->if_broot_bytes == 0) {
-                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-                       ifp->if_broot_bytes = (int)new_size;
-                       return;
-               }
-
-               /*
-                * If there is already an existing if_broot, then we need
-                * to realloc() it and shift the pointers to their new
-                * location.  The records don't change location because
-                * they are kept butted up against the btree block header.
-                */
-               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-               new_max = cur_max + rec_diff;
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
-                               KM_SLEEP | KM_NOFS);
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    (int)new_size);
-               ifp->if_broot_bytes = (int)new_size;
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-               memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
-               return;
-       }
-
-       /*
-        * rec_diff is less than 0.  In this case, we are shrinking the
-        * if_broot buffer.  It must already exist.  If we go to zero
-        * records, just get rid of the root and clear the status bit.
-        */
-       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
-       new_max = cur_max + rec_diff;
-       ASSERT(new_max >= 0);
-       if (new_max > 0)
-               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
-       else
-               new_size = 0;
-       if (new_size > 0) {
-               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
-               /*
-                * First copy over the btree block header.
-                */
-               memcpy(new_broot, ifp->if_broot,
-                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
-       } else {
-               new_broot = NULL;
-               ifp->if_flags &= ~XFS_IFBROOT;
-       }
-
-       /*
-        * Only copy the records and pointers if there are any.
-        */
-       if (new_max > 0) {
-               /*
-                * First copy the records.
-                */
-               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
-               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
-
-               /*
-                * Then copy the pointers.
-                */
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
-                                                    (int)new_size);
-               memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
-       }
-       kmem_free(ifp->if_broot);
-       ifp->if_broot = new_broot;
-       ifp->if_broot_bytes = (int)new_size;
-       if (ifp->if_broot)
-               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                       XFS_IFORK_SIZE(ip, whichfork));
-       return;
-}
-
-
-/*
- * This is called when the amount of space needed for if_data
- * is increased or decreased.  The change in size is indicated by
- * the number of bytes that need to be added or deleted in the
- * byte_diff parameter.
- *
- * If the amount of space needed has decreased below the size of the
- * inline buffer, then switch to using the inline buffer.  Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
- * to what is needed.
- *
- * ip -- the inode whose if_data area is changing
- * byte_diff -- the change in the number of bytes, positive or negative,
- *      requested for the if_data array.
- */
-void
-xfs_idata_realloc(
-       xfs_inode_t     *ip,
-       int             byte_diff,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-       int             new_size;
-       int             real_size;
-
-       if (byte_diff == 0) {
-               return;
-       }
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       new_size = (int)ifp->if_bytes + byte_diff;
-       ASSERT(new_size >= 0);
-
-       if (new_size == 0) {
-               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       kmem_free(ifp->if_u1.if_data);
-               }
-               ifp->if_u1.if_data = NULL;
-               real_size = 0;
-       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
-               /*
-                * If the valid extents/data can fit in if_inline_ext/data,
-                * copy them from the malloc'd vector and free it.
-                */
-               if (ifp->if_u1.if_data == NULL) {
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
-                             new_size);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
-               }
-               real_size = 0;
-       } else {
-               /*
-                * Stuck with malloc/realloc.
-                * For inline data, the underlying buffer must be
-                * a multiple of 4 bytes in size so that it can be
-                * logged and stay on word boundaries.  We enforce
-                * that here.
-                */
-               real_size = roundup(new_size, 4);
-               if (ifp->if_u1.if_data == NULL) {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-                       /*
-                        * Only do the realloc if the underlying size
-                        * is really changing.
-                        */
-                       if (ifp->if_real_bytes != real_size) {
-                               ifp->if_u1.if_data =
-                                       kmem_realloc(ifp->if_u1.if_data,
-                                                       real_size,
-                                                       ifp->if_real_bytes,
-                                                       KM_SLEEP | KM_NOFS);
-                       }
-               } else {
-                       ASSERT(ifp->if_real_bytes == 0);
-                       ifp->if_u1.if_data = kmem_alloc(real_size,
-                                                       KM_SLEEP | KM_NOFS);
-                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
-                               ifp->if_bytes);
-               }
-       }
-       ifp->if_real_bytes = real_size;
-       ifp->if_bytes = new_size;
-       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-}
-
-void
-xfs_idestroy_fork(
-       xfs_inode_t     *ip,
-       int             whichfork)
-{
-       xfs_ifork_t     *ifp;
-
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       if (ifp->if_broot != NULL) {
-               kmem_free(ifp->if_broot);
-               ifp->if_broot = NULL;
-       }
-
-       /*
-        * If the format is local, then we can't have an extents
-        * array so just look for an inline data array.  If we're
-        * not local then we may or may not have an extents list,
-        * so check and free it up if we do.
-        */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
-                   (ifp->if_u1.if_data != NULL)) {
-                       ASSERT(ifp->if_real_bytes != 0);
-                       kmem_free(ifp->if_u1.if_data);
-                       ifp->if_u1.if_data = NULL;
-                       ifp->if_real_bytes = 0;
-               }
-       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
-                  ((ifp->if_flags & XFS_IFEXTIREC) ||
-                   ((ifp->if_u1.if_extents != NULL) &&
-                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
-               ASSERT(ifp->if_real_bytes != 0);
-               xfs_iext_destroy(ifp);
-       }
-       ASSERT(ifp->if_u1.if_extents == NULL ||
-              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
-       ASSERT(ifp->if_real_bytes == 0);
-       if (whichfork == XFS_ATTR_FORK) {
-               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
-               ip->i_afp = NULL;
-       }
-}
-
 /*
  * This is called to unpin an inode.  The caller must have the inode locked
  * in at least shared mode so that the buffer cannot be subsequently pinned
@@ -2402,162 +2370,471 @@ xfs_iunpin_wait(
                __xfs_iunpin_wait(ip);
 }
 
-/*
- * xfs_iextents_copy()
- *
- * This is called to copy the REAL extents (as opposed to the delayed
- * allocation extents) from the inode into the given buffer.  It
- * returns the number of bytes copied into the buffer.
- *
- * If there are no delayed allocation extents, then we can just
- * memcpy() the extents into the buffer.  Otherwise, we need to
- * examine each extent in turn and skip those which are delayed.
- */
 int
-xfs_iextents_copy(
-       xfs_inode_t             *ip,
-       xfs_bmbt_rec_t          *dp,
-       int                     whichfork)
+xfs_remove(
+       xfs_inode_t             *dp,
+       struct xfs_name         *name,
+       xfs_inode_t             *ip)
 {
-       int                     copied;
-       int                     i;
-       xfs_ifork_t             *ifp;
-       int                     nrecs;
-       xfs_fsblock_t           start_block;
+       xfs_mount_t             *mp = dp->i_mount;
+       xfs_trans_t             *tp = NULL;
+       int                     is_dir = S_ISDIR(ip->i_d.di_mode);
+       int                     error = 0;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;
+       int                     committed;
+       int                     link_zero;
+       uint                    resblks;
+       uint                    log_count;
 
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(ifp->if_bytes > 0);
+       trace_xfs_remove(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(dp, 0);
+       if (error)
+               goto std_return;
 
-       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
-       ASSERT(nrecs > 0);
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               goto std_return;
+
+       if (is_dir) {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+               log_count = XFS_DEFAULT_LOG_COUNT;
+       } else {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+               log_count = XFS_REMOVE_LOG_COUNT;
+       }
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
        /*
-        * There are some delayed allocation extents in the
-        * inode, so copy the extents one at a time and skip
-        * the delayed ones.  There must be at least one
-        * non-delayed extent.
+        * We try to get the real space reservation first,
+        * allowing for directory btree deletion(s) implying
+        * possible bmap insert(s).  If we can't get the space
+        * reservation then we use 0 instead, and avoid the bmap
+        * btree insert(s) in the directory code by, if the bmap
+        * insert tries to happen, instead trimming the LAST
+        * block from the directory.
         */
-       copied = 0;
-       for (i = 0; i < nrecs; i++) {
-               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-               start_block = xfs_bmbt_get_startblock(ep);
-               if (isnullstartblock(start_block)) {
-                       /*
-                        * It's a delayed allocation extent, so skip it.
-                        */
-                       continue;
+       resblks = XFS_REMOVE_SPACE_RES(mp);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
+       if (error == ENOSPC) {
+               resblks = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
+       }
+       if (error) {
+               ASSERT(error != ENOSPC);
+               cancel_flags = 0;
+               goto out_trans_cancel;
+       }
+
+       xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we're removing a directory perform some additional validation.
+        */
+       if (is_dir) {
+               ASSERT(ip->i_d.di_nlink >= 2);
+               if (ip->i_d.di_nlink != 2) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
                }
+               if (!xfs_dir_isempty(ip)) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
+               }
+       }
+
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error) {
+               ASSERT(error != ENOENT);
+               goto out_bmap_cancel;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+       if (is_dir) {
+               /*
+                * Drop the link from ip's "..".
+                */
+               error = xfs_droplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
 
-               /* Translate to on disk format */
-               put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
-               put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
-               dp++;
-               copied++;
+               /*
+                * Drop the "." link from ip to self.
+                */
+               error = xfs_droplink(tp, ip);
+               if (error)
+                       goto out_bmap_cancel;
+       } else {
+               /*
+                * When removing a non-directory we need to log the parent
+                * inode here.  For a directory this is done implicitly
+                * by the xfs_droplink call for the ".." entry.
+                */
+               xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        }
-       ASSERT(copied != 0);
-       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
 
-       return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+       /*
+        * Drop the link from dp to ip.
+        */
+       error = xfs_droplink(tp, ip);
+       if (error)
+               goto out_bmap_cancel;
+
+       /*
+        * Determine if this is the last link while
+        * we are in the transaction.
+        */
+       link_zero = (ip->i_d.di_nlink == 0);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * remove transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto std_return;
+
+       /*
+        * If we are using filestreams, kill the stream association.
+        * If the file is still open it may get a new one but that
+        * will get killed on last close in xfs_close() so we don't
+        * have to worry about that.
+        */
+       if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+               xfs_filestream_deassociate(ip);
+
+       return 0;
+
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+       cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
 }
 
 /*
- * Each of the following cases stores data into the same region
- * of the on-disk inode, so only one of them can be valid at
- * any given time. While it is possible to have conflicting formats
- * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
- * in EXTENTS format, this can only happen when the fork has
- * changed formats after being modified but before being flushed.
- * In these cases, the format always takes precedence, because the
- * format indicates the current state of the fork.
+ * Enter all inodes for a rename transaction into a sorted array.
  */
-/*ARGSUSED*/
 STATIC void
-xfs_iflush_fork(
-       xfs_inode_t             *ip,
-       xfs_dinode_t            *dip,
-       xfs_inode_log_item_t    *iip,
-       int                     whichfork,
-       xfs_buf_t               *bp)
-{
-       char                    *cp;
-       xfs_ifork_t             *ifp;
-       xfs_mount_t             *mp;
-       static const short      brootflag[2] =
-               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
-       static const short      dataflag[2] =
-               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
-       static const short      extflag[2] =
-               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
-
-       if (!iip)
-               return;
-       ifp = XFS_IFORK_PTR(ip, whichfork);
-       /*
-        * This can happen if we gave up in iformat in an error path,
-        * for the attribute fork.
-        */
-       if (!ifp) {
-               ASSERT(whichfork == XFS_ATTR_FORK);
-               return;
-       }
-       cp = XFS_DFORK_PTR(dip, whichfork);
-       mp = ip->i_mount;
-       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
-       case XFS_DINODE_FMT_LOCAL:
-               if ((iip->ili_fields & dataflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(ifp->if_u1.if_data != NULL);
-                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
-                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+xfs_sort_for_rename(
+       xfs_inode_t     *dp1,   /* in: old (source) directory inode */
+       xfs_inode_t     *dp2,   /* in: new (target) directory inode */
+       xfs_inode_t     *ip1,   /* in: inode of old entry */
+       xfs_inode_t     *ip2,   /* in: inode of new entry, if it
+                                  already exists, NULL otherwise. */
+       xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
+       int             *num_inodes)  /* out: number of inodes in array */
+{
+       xfs_inode_t             *temp;
+       int                     i, j;
+
+       /*
+        * i_tab contains a list of pointers to inodes.  We initialize
+        * the table here & we'll sort it.  We will then use it to
+        * order the acquisition of the inode locks.
+        *
+        * Note that the table may contain duplicates.  e.g., dp1 == dp2.
+        */
+       i_tab[0] = dp1;
+       i_tab[1] = dp2;
+       i_tab[2] = ip1;
+       if (ip2) {
+               *num_inodes = 4;
+               i_tab[3] = ip2;
+       } else {
+               *num_inodes = 3;
+               i_tab[3] = NULL;
+       }
+
+       /*
+        * Sort the elements via bubble sort.  (Remember, there are at
+        * most 4 elements to sort, so this is adequate.)
+        */
+       for (i = 0; i < *num_inodes; i++) {
+               for (j = 1; j < *num_inodes; j++) {
+                       if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+                               temp = i_tab[j];
+                               i_tab[j] = i_tab[j-1];
+                               i_tab[j-1] = temp;
+                       }
                }
-               break;
+       }
+}
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+       xfs_inode_t     *src_dp,
+       struct xfs_name *src_name,
+       xfs_inode_t     *src_ip,
+       xfs_inode_t     *target_dp,
+       struct xfs_name *target_name,
+       xfs_inode_t     *target_ip)
+{
+       xfs_trans_t     *tp = NULL;
+       xfs_mount_t     *mp = src_dp->i_mount;
+       int             new_parent;             /* moving to a new dir */
+       int             src_is_directory;       /* src_name is a directory */
+       int             error;
+       xfs_bmap_free_t free_list;
+       xfs_fsblock_t   first_block;
+       int             cancel_flags;
+       int             committed;
+       xfs_inode_t     *inodes[4];
+       int             spaceres;
+       int             num_inodes;
+
+       trace_xfs_rename(src_dp, target_dp, src_name, target_name);
+
+       new_parent = (src_dp != target_dp);
+       src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+
+       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+                               inodes, &num_inodes);
+
+       xfs_bmap_init(&free_list, &first_block);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
+       if (error == ENOSPC) {
+               spaceres = 0;
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
+       }
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               goto std_return;
+       }
+
+       /*
+        * Attach the dquots to the inodes
+        */
+       error = xfs_qm_vop_rename_dqattach(inodes);
+       if (error) {
+               xfs_trans_cancel(tp, cancel_flags);
+               goto std_return;
+       }
+
+       /*
+        * Lock all the participating inodes. Depending upon whether
+        * the target_name exists in the target directory, and
+        * whether the target directory is the same as the source
+        * directory, we can lock from 2 to 4 inodes.
+        */
+       xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
+
+       /*
+        * Join all the inodes to the transaction. From this point on,
+        * we can rely on either trans_commit or trans_cancel to unlock
+        * them.
+        */
+       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+       if (new_parent)
+               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+       if (target_ip)
+               xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we are using project inheritance, we only allow renames
+        * into our tree when the project IDs are the same; else the
+        * tree quota mechanism would be circumvented.
+        */
+       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
+       /*
+        * Set up the target.
+        */
+       if (target_ip == NULL) {
+               /*
+                * If there's no space reservation, check the entry will
+                * fit before actually inserting it.
+                */
+               error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
+               if (error)
+                       goto error_return;
+               /*
+                * If target does not exist and the rename crosses
+                * directories, adjust the target directory link count
+                * to account for the ".." reference from the new entry.
+                */
+               error = xfs_dir_createname(tp, target_dp, target_name,
+                                               src_ip->i_ino, &first_block,
+                                               &free_list, spaceres);
+               if (error == ENOSPC)
+                       goto error_return;
+               if (error)
+                       goto abort_return;
+
+               xfs_trans_ichgtime(tp, target_dp,
+                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
-       case XFS_DINODE_FMT_EXTENTS:
-               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-                      !(iip->ili_fields & extflag[whichfork]));
-               if ((iip->ili_fields & extflag[whichfork]) &&
-                   (ifp->if_bytes > 0)) {
-                       ASSERT(xfs_iext_get_ext(ifp, 0));
-                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
-                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
-                               whichfork);
+               if (new_parent && src_is_directory) {
+                       error = xfs_bumplink(tp, target_dp);
+                       if (error)
+                               goto abort_return;
                }
-               break;
+       } else { /* target_ip != NULL */
+               /*
+                * If target exists and it's a directory, check that both
+                * target and source are directories and that target can be
+                * destroyed, or that neither is a directory.
+                */
+               if (S_ISDIR(target_ip->i_d.di_mode)) {
+                       /*
+                        * Make sure target dir is empty.
+                        */
+                       if (!(xfs_dir_isempty(target_ip)) ||
+                           (target_ip->i_d.di_nlink > 2)) {
+                               error = XFS_ERROR(EEXIST);
+                               goto error_return;
+                       }
+               }
+
+               /*
+                * Link the source inode under the target name.
+                * If the source inode is a directory and we are moving
+                * it across directories, its ".." entry will be
+                * inconsistent until we replace that down below.
+                *
+                * In case there is already an entry with the same
+                * name at the destination directory, remove it first.
+                */
+               error = xfs_dir_replace(tp, target_dp, target_name,
+                                       src_ip->i_ino,
+                                       &first_block, &free_list, spaceres);
+               if (error)
+                       goto abort_return;
+
+               xfs_trans_ichgtime(tp, target_dp,
+                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+               /*
+                * Decrement the link count on the target since the target
+                * dir no longer points to it.
+                */
+               error = xfs_droplink(tp, target_ip);
+               if (error)
+                       goto abort_return;
+
+               if (src_is_directory) {
+                       /*
+                        * Drop the link from the old "." entry.
+                        */
+                       error = xfs_droplink(tp, target_ip);
+                       if (error)
+                               goto abort_return;
+               }
+       } /* target_ip != NULL */
+
+       /*
+        * Remove the source.
+        */
+       if (new_parent && src_is_directory) {
+               /*
+                * Rewrite the ".." entry to point to the new
+                * directory.
+                */
+               error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+                                       target_dp->i_ino,
+                                       &first_block, &free_list, spaceres);
+               ASSERT(error != EEXIST);
+               if (error)
+                       goto abort_return;
+       }
+
+       /*
+        * We always want to hit the ctime on the source inode.
+        *
+        * This isn't strictly required by the standards since the source
+        * inode isn't really being changed, but old unix file systems did
+        * it and some incremental backup programs won't work without it.
+        */
+       xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+       /*
+        * Adjust the link count on src_dp.  This is necessary when
+        * renaming a directory, either within one parent when
+        * the target existed, or across two parent directories.
+        */
+       if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+               /*
+                * Decrement link count on src_directory since the
+                * entry that's moved no longer points to it.
+                */
+               error = xfs_droplink(tp, src_dp);
+               if (error)
+                       goto abort_return;
+       }
 
-       case XFS_DINODE_FMT_BTREE:
-               if ((iip->ili_fields & brootflag[whichfork]) &&
-                   (ifp->if_broot_bytes > 0)) {
-                       ASSERT(ifp->if_broot != NULL);
-                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
-                               XFS_IFORK_SIZE(ip, whichfork));
-                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
-                               (xfs_bmdr_block_t *)cp,
-                               XFS_DFORK_SIZE(dip, mp, whichfork));
-               }
-               break;
+       error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+                                       &first_block, &free_list, spaceres);
+       if (error)
+               goto abort_return;
 
-       case XFS_DINODE_FMT_DEV:
-               if (iip->ili_fields & XFS_ILOG_DEV) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
-               }
-               break;
+       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+       if (new_parent)
+               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
 
-       case XFS_DINODE_FMT_UUID:
-               if (iip->ili_fields & XFS_ILOG_UUID) {
-                       ASSERT(whichfork == XFS_DATA_FORK);
-                       memcpy(XFS_DFORK_DPTR(dip),
-                              &ip->i_df.if_u2.if_uuid,
-                              sizeof(uuid_t));
-               }
-               break;
+       /*
+        * If this is a synchronous mount, make sure that the
+        * rename transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+               xfs_trans_set_sync(tp);
+       }
 
-       default:
-               ASSERT(0);
-               break;
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(&free_list);
+               xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+                                XFS_TRANS_ABORT));
+               goto std_return;
        }
+
+       /*
+        * trans_commit will unlock src_ip, target_ip & decrement
+        * the vnode references.
+        */
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+       cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+       xfs_bmap_cancel(&free_list);
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
 }
 
 STATIC int
@@ -2816,7 +3093,6 @@ abort_out:
        return error;
 }
 
-
 STATIC int
 xfs_iflush_int(
        struct xfs_inode        *ip,
@@ -3004,1072 +3280,3 @@ xfs_iflush_int(
 corrupt_out:
        return XFS_ERROR(EFSCORRUPTED);
 }
-
-/*
- * Return a pointer to the extent record at file index idx.
- */
-xfs_bmbt_rec_host_t *
-xfs_iext_get_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx)            /* index of target extent */
-{
-       ASSERT(idx >= 0);
-       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-
-       if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
-               return ifp->if_u1.if_ext_irec->er_extbuf;
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_ext_irec_t  *erp;           /* irec pointer */
-               int             erp_idx = 0;    /* irec index */
-               xfs_extnum_t    page_idx = idx; /* ext index in target list */
-
-               erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
-               return &erp->er_extbuf[page_idx];
-       } else if (ifp->if_bytes) {
-               return &ifp->if_u1.if_extents[idx];
-       } else {
-               return NULL;
-       }
-}
-
-/*
- * Insert new item(s) into the extent records for incore inode
- * fork 'ifp'.  'count' new items are inserted at index 'idx'.
- */
-void
-xfs_iext_insert(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* starting index of new items */
-       xfs_extnum_t    count,          /* number of inserted items */
-       xfs_bmbt_irec_t *new,           /* items to insert */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    i;              /* extent record index */
-
-       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
-
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       xfs_iext_add(ifp, idx, count);
-       for (i = idx; i < idx + count; i++, new++)
-               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be increased. The ext_diff parameter stores the
- * number of new extents being added and the idx parameter contains
- * the extent index where the new extents will be added. If the new
- * extents are being appended, then we just need to (re)allocate and
- * initialize the space. Otherwise, if the new extents are being
- * inserted into the middle of the existing entries, a bit more work
- * is required to make room for the new extents to be inserted. The
- * caller is responsible for filling in the new extent entries upon
- * return.
- */
-void
-xfs_iext_add(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin adding exts */
-       int             ext_diff)       /* number of extents to add */
-{
-       int             byte_diff;      /* new bytes being added */
-       int             new_size;       /* size of extents after adding */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT((idx >= 0) && (idx <= nextents));
-       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
-       new_size = ifp->if_bytes + byte_diff;
-       /*
-        * If the new number of extents (nextents + ext_diff)
-        * fits inside the inode, then continue to use the inline
-        * extent buffer.
-        */
-       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
-               if (idx < nextents) {
-                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
-                               &ifp->if_u2.if_inline_ext[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
-               }
-               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-               ifp->if_real_bytes = 0;
-       }
-       /*
-        * Otherwise use a linear (direct) extent list.
-        * If the extents are currently inside the inode,
-        * xfs_iext_realloc_direct will switch us from
-        * inline to direct extent allocation mode.
-        */
-       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, new_size);
-               if (idx < nextents) {
-                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
-                               &ifp->if_u1.if_extents[idx],
-                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
-                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
-               }
-       }
-       /* Indirection array */
-       else {
-               xfs_ext_irec_t  *erp;
-               int             erp_idx = 0;
-               int             page_idx = idx;
-
-               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
-               if (ifp->if_flags & XFS_IFEXTIREC) {
-                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
-               } else {
-                       xfs_iext_irec_init(ifp);
-                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-                       erp = ifp->if_u1.if_ext_irec;
-               }
-               /* Extents fit in target extent page */
-               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
-                       if (page_idx < erp->er_extcount) {
-                               memmove(&erp->er_extbuf[page_idx + ext_diff],
-                                       &erp->er_extbuf[page_idx],
-                                       (erp->er_extcount - page_idx) *
-                                       sizeof(xfs_bmbt_rec_t));
-                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
-                       }
-                       erp->er_extcount += ext_diff;
-                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               }
-               /* Insert a new extent page */
-               else if (erp) {
-                       xfs_iext_add_indirect_multi(ifp,
-                               erp_idx, page_idx, ext_diff);
-               }
-               /*
-                * If extent(s) are being appended to the last page in
-                * the indirection array and the new extent(s) don't fit
-                * in the page, then erp is NULL and erp_idx is set to
-                * the next index needed in the indirection array.
-                */
-               else {
-                       int     count = ext_diff;
-
-                       while (count) {
-                               erp = xfs_iext_irec_new(ifp, erp_idx);
-                               erp->er_extcount = count;
-                               count -= MIN(count, (int)XFS_LINEAR_EXTS);
-                               if (count) {
-                                       erp_idx++;
-                               }
-                       }
-               }
-       }
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being added to the indirection
- * array and the new extents do not fit in the target extent list. The
- * erp_idx parameter contains the irec index for the target extent list
- * in the indirection array, and the idx parameter contains the extent
- * index within the list. The number of extents being added is stored
- * in the count parameter.
- *
- *    |-------|   |-------|
- *    |       |   |       |    idx - number of extents before idx
- *    |  idx  |   | count |
- *    |       |   |       |    count - number of extents being inserted at idx
- *    |-------|   |-------|
- *    | count |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_add_indirect_multi(
-       xfs_ifork_t     *ifp,                   /* inode fork pointer */
-       int             erp_idx,                /* target extent irec index */
-       xfs_extnum_t    idx,                    /* index within target list */
-       int             count)                  /* new extents being added */
-{
-       int             byte_diff;              /* new bytes being added */
-       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
-       xfs_extnum_t    ext_diff;               /* number of extents to add */
-       xfs_extnum_t    ext_cnt;                /* new extents still needed */
-       xfs_extnum_t    nex2;                   /* extents after idx + count */
-       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
-       int             nlists;                 /* number of irec's (lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       nex2 = erp->er_extcount - idx;
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /*
-        * Save second part of target extent list
-        * (all extents past */
-       if (nex2) {
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
-               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
-               erp->er_extcount -= nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
-               memset(&erp->er_extbuf[idx], 0, byte_diff);
-       }
-
-       /*
-        * Add the new extents to the end of the target
-        * list, then allocate new irec record(s) and
-        * extent buffer(s) as needed to store the rest
-        * of the new extents.
-        */
-       ext_cnt = count;
-       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
-       if (ext_diff) {
-               erp->er_extcount += ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-       while (ext_cnt) {
-               erp_idx++;
-               erp = xfs_iext_irec_new(ifp, erp_idx);
-               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
-               erp->er_extcount = ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
-               ext_cnt -= ext_diff;
-       }
-
-       /* Add nex2 extents back to indirection array */
-       if (nex2) {
-               xfs_extnum_t    ext_avail;
-               int             i;
-
-               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-               i = 0;
-               /*
-                * If nex2 extents fit in the current page, append
-                * nex2_ep after the new extents.
-                */
-               if (nex2 <= ext_avail) {
-                       i = erp->er_extcount;
-               }
-               /*
-                * Otherwise, check if space is available in the
-                * next page.
-                */
-               else if ((erp_idx < nlists - 1) &&
-                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
-                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
-                       erp_idx++;
-                       erp++;
-                       /* Create a hole for nex2 extents */
-                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
-                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
-               }
-               /*
-                * Final choice, create a new extent page for
-                * nex2 extents.
-                */
-               else {
-                       erp_idx++;
-                       erp = xfs_iext_irec_new(ifp, erp_idx);
-               }
-               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
-               kmem_free(nex2_ep);
-               erp->er_extcount += nex2;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
-       }
-}
-
-/*
- * This is called when the amount of space required for incore file
- * extents needs to be decreased. The ext_diff parameter stores the
- * number of extents to be removed and the idx parameter contains
- * the extent index where the extents will be removed from.
- *
- * If the amount of space needed has decreased below the linear
- * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
- * extent array.  Otherwise, use kmem_realloc() to adjust the
- * size to what is needed.
- */
-void
-xfs_iext_remove(
-       xfs_inode_t     *ip,            /* incore inode pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff,       /* number of extents to remove */
-       int             state)          /* type of extent conversion */
-{
-       xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
-
-       ASSERT(ext_diff > 0);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (ifp->if_flags & XFS_IFEXTIREC) {
-               xfs_iext_remove_indirect(ifp, idx, ext_diff);
-       } else if (ifp->if_real_bytes) {
-               xfs_iext_remove_direct(ifp, idx, ext_diff);
-       } else {
-               xfs_iext_remove_inline(ifp, idx, ext_diff);
-       }
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This removes ext_diff extents from the inline buffer, beginning
- * at extent index idx.
- */
-void
-xfs_iext_remove_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       int             nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       ASSERT(idx < XFS_INLINE_EXTS);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(((nextents - ext_diff) > 0) &&
-               (nextents - ext_diff) < XFS_INLINE_EXTS);
-
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u2.if_inline_ext[idx],
-                       &ifp->if_u2.if_inline_ext[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-               memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
-                       0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       } else {
-               memset(&ifp->if_u2.if_inline_ext[idx], 0,
-                       ext_diff * sizeof(xfs_bmbt_rec_t));
-       }
-}
-
-/*
- * This removes ext_diff extents from a linear (direct) extent list,
- * beginning at extent index idx. If the extents are being removed
- * from the end of the list (ie. truncate) then we just need to re-
- * allocate the list to remove the extra space. Otherwise, if the
- * extents are being removed from the middle of the existing extent
- * entries, then we first need to move the extent records beginning
- * at idx + ext_diff up in the list to overwrite the records being
- * removed, then remove the extra space via kmem_realloc.
- */
-void
-xfs_iext_remove_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing exts */
-       int             ext_diff)       /* number of extents to remove */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             new_size;       /* size of extents after removal */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       new_size = ifp->if_bytes -
-               (ext_diff * sizeof(xfs_bmbt_rec_t));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-               return;
-       }
-       /* Move extents up in the list (if needed) */
-       if (idx + ext_diff < nextents) {
-               memmove(&ifp->if_u1.if_extents[idx],
-                       &ifp->if_u1.if_extents[idx + ext_diff],
-                       (nextents - (idx + ext_diff)) *
-                        sizeof(xfs_bmbt_rec_t));
-       }
-       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
-               0, ext_diff * sizeof(xfs_bmbt_rec_t));
-       /*
-        * Reallocate the direct extent list. If the extents
-        * will fit inside the inode then xfs_iext_realloc_direct
-        * will switch from direct to inline extent allocation
-        * mode for us.
-        */
-       xfs_iext_realloc_direct(ifp, new_size);
-       ifp->if_bytes = new_size;
-}
-
-/*
- * This is called when incore extents are being removed from the
- * indirection array and the extents being removed span multiple extent
- * buffers. The idx parameter contains the file extent index where we
- * want to begin removing extents, and the count parameter contains
- * how many extents need to be removed.
- *
- *    |-------|   |-------|
- *    | nex1  |   |       |    nex1 - number of extents before idx
- *    |-------|   | count |
- *    |       |   |       |    count - number of extents being removed at idx
- *    | count |   |-------|
- *    |       |   | nex2  |    nex2 - number of extents after idx + count
- *    |-------|   |-------|
- */
-void
-xfs_iext_remove_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    idx,            /* index to begin removing extents */
-       int             count)          /* number of extents to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             erp_idx = 0;    /* indirection array index */
-       xfs_extnum_t    ext_cnt;        /* extents left to remove */
-       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
-       xfs_extnum_t    nex1;           /* number of extents before idx */
-       xfs_extnum_t    nex2;           /* extents after idx + count */
-       int             page_idx = idx; /* index in target extent list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
-       ASSERT(erp != NULL);
-       nex1 = page_idx;
-       ext_cnt = count;
-       while (ext_cnt) {
-               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
-               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
-               /*
-                * Check for deletion of entire list;
-                * xfs_iext_irec_remove() updates extent offsets.
-                */
-               if (ext_diff == erp->er_extcount) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-                       ext_cnt -= ext_diff;
-                       nex1 = 0;
-                       if (ext_cnt) {
-                               ASSERT(erp_idx < ifp->if_real_bytes /
-                                       XFS_IEXT_BUFSZ);
-                               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-                               nex1 = 0;
-                               continue;
-                       } else {
-                               break;
-                       }
-               }
-               /* Move extents up (if needed) */
-               if (nex2) {
-                       memmove(&erp->er_extbuf[nex1],
-                               &erp->er_extbuf[nex1 + ext_diff],
-                               nex2 * sizeof(xfs_bmbt_rec_t));
-               }
-               /* Zero out rest of page */
-               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
-                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
-               /* Update remaining counters */
-               erp->er_extcount -= ext_diff;
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
-               ext_cnt -= ext_diff;
-               nex1 = 0;
-               erp_idx++;
-               erp++;
-       }
-       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
-       xfs_iext_irec_compact(ifp);
-}
-
-/*
- * Create, destroy, or resize a linear (direct) block of extents.
- */
-void
-xfs_iext_realloc_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new size of extents */
-{
-       int             rnew_size;      /* real new size of extents */
-
-       rnew_size = new_size;
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
-               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
-                (new_size != ifp->if_real_bytes)));
-
-       /* Free extent records */
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       }
-       /* Resize direct extent list and zero any new bytes */
-       else if (ifp->if_real_bytes) {
-               /* Check if extents will fit inside the inode */
-               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
-                       xfs_iext_direct_to_inline(ifp, new_size /
-                               (uint)sizeof(xfs_bmbt_rec_t));
-                       ifp->if_bytes = new_size;
-                       return;
-               }
-               if (!is_power_of_2(new_size)){
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               if (rnew_size != ifp->if_real_bytes) {
-                       ifp->if_u1.if_extents =
-                               kmem_realloc(ifp->if_u1.if_extents,
-                                               rnew_size,
-                                               ifp->if_real_bytes, KM_NOFS);
-               }
-               if (rnew_size > ifp->if_real_bytes) {
-                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
-                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
-                               rnew_size - ifp->if_real_bytes);
-               }
-       }
-       /*
-        * Switch from the inline extent buffer to a direct
-        * extent list. Be sure to include the inline extent
-        * bytes in new_size.
-        */
-       else {
-               new_size += ifp->if_bytes;
-               if (!is_power_of_2(new_size)) {
-                       rnew_size = roundup_pow_of_two(new_size);
-               }
-               xfs_iext_inline_to_direct(ifp, rnew_size);
-       }
-       ifp->if_real_bytes = rnew_size;
-       ifp->if_bytes = new_size;
-}
-
-/*
- * Switch from linear (direct) extent records to inline buffer.
- */
-void
-xfs_iext_direct_to_inline(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    nextents)       /* number of extents in file */
-{
-       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-       ASSERT(nextents <= XFS_INLINE_EXTS);
-       /*
-        * The inline buffer was zeroed when we switched
-        * from inline to direct extent allocation mode,
-        * so we don't need to clear it here.
-        */
-       memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
-               nextents * sizeof(xfs_bmbt_rec_t));
-       kmem_free(ifp->if_u1.if_extents);
-       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
-       ifp->if_real_bytes = 0;
-}
-
-/*
- * Switch from inline buffer to linear (direct) extent records.
- * new_size should already be rounded up to the next power of 2
- * by the caller (when appropriate), so use new_size as it is.
- * However, since new_size may be rounded up, we can't update
- * if_bytes here. It is the caller's responsibility to update
- * if_bytes upon return.
- */
-void
-xfs_iext_inline_to_direct(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* number of extents in file */
-{
-       ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
-       memset(ifp->if_u1.if_extents, 0, new_size);
-       if (ifp->if_bytes) {
-               memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
-                       ifp->if_bytes);
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_real_bytes = new_size;
-}
-
-/*
- * Resize an extent indirection array to new_size bytes.
- */
-STATIC void
-xfs_iext_realloc_indirect(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             new_size)       /* new indirection array size */
-{
-       int             nlists;         /* number of irec's (ex lists) */
-       int             size;           /* current indirection array size */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       size = nlists * sizeof(xfs_ext_irec_t);
-       ASSERT(ifp->if_real_bytes);
-       ASSERT((new_size >= 0) && (new_size != size));
-       if (new_size == 0) {
-               xfs_iext_destroy(ifp);
-       } else {
-               ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
-                       kmem_realloc(ifp->if_u1.if_ext_irec,
-                               new_size, size, KM_NOFS);
-       }
-}
-
-/*
- * Switch from indirection array to linear (direct) extent allocations.
- */
-STATIC void
-xfs_iext_indirect_to_direct(
-        xfs_ifork_t    *ifp)           /* inode fork pointer */
-{
-       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             size;           /* size of file extents */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-       size = nextents * sizeof(xfs_bmbt_rec_t);
-
-       xfs_iext_irec_compact_pages(ifp);
-       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
-
-       ep = ifp->if_u1.if_ext_irec->er_extbuf;
-       kmem_free(ifp->if_u1.if_ext_irec);
-       ifp->if_flags &= ~XFS_IFEXTIREC;
-       ifp->if_u1.if_extents = ep;
-       ifp->if_bytes = size;
-       if (nextents < XFS_LINEAR_EXTS) {
-               xfs_iext_realloc_direct(ifp, size);
-       }
-}
-
-/*
- * Free incore file extents.
- */
-void
-xfs_iext_destroy(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               int     erp_idx;
-               int     nlists;
-
-               nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
-                       xfs_iext_irec_remove(ifp, erp_idx);
-               }
-               ifp->if_flags &= ~XFS_IFEXTIREC;
-       } else if (ifp->if_real_bytes) {
-               kmem_free(ifp->if_u1.if_extents);
-       } else if (ifp->if_bytes) {
-               memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
-                       sizeof(xfs_bmbt_rec_t));
-       }
-       ifp->if_u1.if_extents = NULL;
-       ifp->if_real_bytes = 0;
-       ifp->if_bytes = 0;
-}
-
-/*
- * Return a pointer to the extent record for file system block bno.
- */
-xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
-xfs_iext_bno_to_ext(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       xfs_extnum_t    *idxp)          /* index of target extent */
-{
-       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
-       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
-       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       int             high;           /* upper boundary in search */
-       xfs_extnum_t    idx = 0;        /* index of target extent */
-       int             low;            /* lower boundary in search */
-       xfs_extnum_t    nextents;       /* number of file extents */
-       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
-
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       if (nextents == 0) {
-               *idxp = 0;
-               return NULL;
-       }
-       low = 0;
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               /* Find target extent list */
-               int     erp_idx = 0;
-               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
-               base = erp->er_extbuf;
-               high = erp->er_extcount - 1;
-       } else {
-               base = ifp->if_u1.if_extents;
-               high = nextents - 1;
-       }
-       /* Binary search extent records */
-       while (low <= high) {
-               idx = (low + high) >> 1;
-               ep = base + idx;
-               startoff = xfs_bmbt_get_startoff(ep);
-               blockcount = xfs_bmbt_get_blockcount(ep);
-               if (bno < startoff) {
-                       high = idx - 1;
-               } else if (bno >= startoff + blockcount) {
-                       low = idx + 1;
-               } else {
-                       /* Convert back to file-based extent index */
-                       if (ifp->if_flags & XFS_IFEXTIREC) {
-                               idx += erp->er_extoff;
-                       }
-                       *idxp = idx;
-                       return ep;
-               }
-       }
-       /* Convert back to file-based extent index */
-       if (ifp->if_flags & XFS_IFEXTIREC) {
-               idx += erp->er_extoff;
-       }
-       if (bno >= startoff + blockcount) {
-               if (++idx == nextents) {
-                       ep = NULL;
-               } else {
-                       ep = xfs_iext_get_ext(ifp, idx);
-               }
-       }
-       *idxp = idx;
-       return ep;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record for filesystem block bno. Store the index of the
- * target irec in *erp_idxp.
- */
-xfs_ext_irec_t *                       /* pointer to found extent record */
-xfs_iext_bno_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_fileoff_t   bno,            /* block number to search for */
-       int             *erp_idxp)      /* irec index of target ext list */
-{
-       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
-       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of extent irec's (lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
-               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
-                       high = erp_idx - 1;
-               } else if (erp_next && bno >=
-                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
-                       low = erp_idx + 1;
-               } else {
-                       break;
-               }
-       }
-       *erp_idxp = erp_idx;
-       return erp;
-}
-
-/*
- * Return a pointer to the indirection array entry containing the
- * extent record at file extent index *idxp. Store the index of the
- * target irec in *erp_idxp and store the page index of the target
- * extent record in *idxp.
- */
-xfs_ext_irec_t *
-xfs_iext_idx_to_irec(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
-       int             *erp_idxp,      /* pointer to target irec */
-       int             realloc)        /* new bytes were just added */
-{
-       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
-       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
-       int             erp_idx;        /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-       int             high;           /* binary search upper limit */
-       int             low;            /* binary search lower limit */
-       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       ASSERT(page_idx >= 0);
-       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
-       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
-
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp_idx = 0;
-       low = 0;
-       high = nlists - 1;
-
-       /* Binary search extent irec's */
-       while (low <= high) {
-               erp_idx = (low + high) >> 1;
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               prev = erp_idx > 0 ? erp - 1 : NULL;
-               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
-                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
-                       high = erp_idx - 1;
-               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
-                          (page_idx == erp->er_extoff + erp->er_extcount &&
-                           !realloc)) {
-                       low = erp_idx + 1;
-               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
-                          erp->er_extcount == XFS_LINEAR_EXTS) {
-                       ASSERT(realloc);
-                       page_idx = 0;
-                       erp_idx++;
-                       erp = erp_idx < nlists ? erp + 1 : NULL;
-                       break;
-               } else {
-                       page_idx -= erp->er_extoff;
-                       break;
-               }
-       }
-       *idxp = page_idx;
-       *erp_idxp = erp_idx;
-       return(erp);
-}
-
-/*
- * Allocate and initialize an indirection array once the space needed
- * for incore extents increases above XFS_IEXT_BUFSZ.
- */
-void
-xfs_iext_irec_init(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       xfs_extnum_t    nextents;       /* number of extents in file */
-
-       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       ASSERT(nextents <= XFS_LINEAR_EXTS);
-
-       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
-
-       if (nextents == 0) {
-               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       } else if (!ifp->if_real_bytes) {
-               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
-       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
-               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
-       }
-       erp->er_extbuf = ifp->if_u1.if_extents;
-       erp->er_extcount = nextents;
-       erp->er_extoff = 0;
-
-       ifp->if_flags |= XFS_IFEXTIREC;
-       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
-       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
-       ifp->if_u1.if_ext_irec = erp;
-
-       return;
-}
-
-/*
- * Allocate and initialize a new entry in the indirection array.
- */
-xfs_ext_irec_t *
-xfs_iext_irec_new(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* index for new irec */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-
-       /* Resize indirection array */
-       xfs_iext_realloc_indirect(ifp, ++nlists *
-                                 sizeof(xfs_ext_irec_t));
-       /*
-        * Move records down in the array so the
-        * new page can use erp_idx.
-        */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = nlists - 1; i > erp_idx; i--) {
-               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
-       }
-       ASSERT(i == erp_idx);
-
-       /* Initialize new extent record */
-       erp = ifp->if_u1.if_ext_irec;
-       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
-       erp[erp_idx].er_extcount = 0;
-       erp[erp_idx].er_extoff = erp_idx > 0 ?
-               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
-       return (&erp[erp_idx]);
-}
-
-/*
- * Remove a record from the indirection array.
- */
-void
-xfs_iext_irec_remove(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx)        /* irec index to remove */
-{
-       xfs_ext_irec_t  *erp;           /* indirection array pointer */
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       erp = &ifp->if_u1.if_ext_irec[erp_idx];
-       if (erp->er_extbuf) {
-               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
-                       -erp->er_extcount);
-               kmem_free(erp->er_extbuf);
-       }
-       /* Compact extent records */
-       erp = ifp->if_u1.if_ext_irec;
-       for (i = erp_idx; i < nlists - 1; i++) {
-               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
-       }
-       /*
-        * Manually free the last extent record from the indirection
-        * array.  A call to xfs_iext_realloc_indirect() with a size
-        * of zero would result in a call to xfs_iext_destroy() which
-        * would in turn call this function again, creating a nasty
-        * infinite loop.
-        */
-       if (--nlists) {
-               xfs_iext_realloc_indirect(ifp,
-                       nlists * sizeof(xfs_ext_irec_t));
-       } else {
-               kmem_free(ifp->if_u1.if_ext_irec);
-       }
-       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
-}
-
-/*
- * This is called to clean up large amounts of unused memory allocated
- * by the indirection array.  Before compacting anything though, verify
- * that the indirection array is still needed and switch back to the
- * linear extent list (or even the inline buffer) if possible.  The
- * compaction policy is as follows:
- *
- *    Full Compaction: Extents fit into a single page (or inline buffer)
- * Partial Compaction: Extents occupy less than 50% of allocated space
- *      No Compaction: Extents occupy at least 50% of allocated space
- */
-void
-xfs_iext_irec_compact(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_extnum_t    nextents;       /* number of extents in file */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-
-       if (nextents == 0) {
-               xfs_iext_destroy(ifp);
-       } else if (nextents <= XFS_INLINE_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-               xfs_iext_direct_to_inline(ifp, nextents);
-       } else if (nextents <= XFS_LINEAR_EXTS) {
-               xfs_iext_indirect_to_direct(ifp);
-       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
-               xfs_iext_irec_compact_pages(ifp);
-       }
-}
-
-/*
- * Combine extents from neighboring extent pages.
- */
-void
-xfs_iext_irec_compact_pages(
-       xfs_ifork_t     *ifp)           /* inode fork pointer */
-{
-       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
-       int             erp_idx = 0;    /* indirection array index */
-       int             nlists;         /* number of irec's (ex lists) */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       while (erp_idx < nlists - 1) {
-               erp = &ifp->if_u1.if_ext_irec[erp_idx];
-               erp_next = erp + 1;
-               if (erp_next->er_extcount <=
-                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
-                       memcpy(&erp->er_extbuf[erp->er_extcount],
-                               erp_next->er_extbuf, erp_next->er_extcount *
-                               sizeof(xfs_bmbt_rec_t));
-                       erp->er_extcount += erp_next->er_extcount;
-                       /*
-                        * Free page before removing extent record
-                        * so er_extoffs don't get modified in
-                        * xfs_iext_irec_remove.
-                        */
-                       kmem_free(erp_next->er_extbuf);
-                       erp_next->er_extbuf = NULL;
-                       xfs_iext_irec_remove(ifp, erp_idx + 1);
-                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-               } else {
-                       erp_idx++;
-               }
-       }
-}
-
-/*
- * This is called to update the er_extoff field in the indirection
- * array when extents have been added or removed from one of the
- * extent lists. erp_idx contains the irec index to begin updating
- * at and ext_diff contains the number of extents that were added
- * or removed.
- */
-void
-xfs_iext_irec_update_extoffs(
-       xfs_ifork_t     *ifp,           /* inode fork pointer */
-       int             erp_idx,        /* irec index to update */
-       int             ext_diff)       /* number of new extents */
-{
-       int             i;              /* loop counter */
-       int             nlists;         /* number of irec's (ex lists */
-
-       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-       for (i = erp_idx; i < nlists; i++) {
-               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
-       }
-}
-
-/*
- * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
- */
-bool
-xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
-{
-       /* prealloc/delalloc exists only on regular files */
-       if (!S_ISREG(ip->i_d.di_mode))
-               return false;
-
-       /*
-        * Zero sized files with no cached pages and delalloc blocks will not
-        * have speculative prealloc/delalloc blocks to remove.
-        */
-       if (VFS_I(ip)->i_size == 0 &&
-           VN_CACHED(VFS_I(ip)) == 0 &&
-           ip->i_delayed_blks == 0)
-               return false;
-
-       /* If we haven't read in the extent list, then don't do it now. */
-       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
-               return false;
-
-       /*
-        * Do not free real preallocated or append-only files unless the file
-        * has delalloc blocks and we are forced to remove them.
-        */
-       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-               if (!force || ip->i_delayed_blks == 0)
-                       return false;
-
-       return true;
-}
-
index b55fd347ab5b9b9ff51fd555115c854d7cfd8084..4a91358c1470b9ac029d451dd38c3fa77daf6fc0 100644 (file)
 #ifndef        __XFS_INODE_H__
 #define        __XFS_INODE_H__
 
-struct posix_acl;
-struct xfs_dinode;
-struct xfs_inode;
-
-/*
- * Fork identifiers.
- */
-#define        XFS_DATA_FORK   0
-#define        XFS_ATTR_FORK   1
-
-/*
- * The following xfs_ext_irec_t struct introduces a second (top) level
- * to the in-core extent allocation scheme. These structs are allocated
- * in a contiguous block, creating an indirection array where each entry
- * (irec) contains a pointer to a buffer of in-core extent records which
- * it manages. Each extent buffer is 4k in size, since 4k is the system
- * page size on Linux i386 and systems with larger page sizes don't seem
- * to gain much, if anything, by using their native page size as the
- * extent buffer size. Also, using 4k extent buffers everywhere provides
- * a consistent interface for CXFS across different platforms.
- *
- * There is currently no limit on the number of irec's (extent lists)
- * allowed, so heavily fragmented files may require an indirection array
- * which spans multiple system pages of memory. The number of extents
- * which would require this amount of contiguous memory is very large
- * and should not cause problems in the foreseeable future. However,
- * if the memory needed for the contiguous array ever becomes a problem,
- * it is possible that a third level of indirection may be required.
- */
-typedef struct xfs_ext_irec {
-       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
-       xfs_extnum_t    er_extoff;      /* extent offset in file */
-       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
-} xfs_ext_irec_t;
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
 
 /*
- * File incore extent information, present for each of data & attr forks.
+ * Kernel only inode definitions
  */
-#define        XFS_IEXT_BUFSZ          4096
-#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
-#define        XFS_INLINE_EXTS         2
-#define        XFS_INLINE_DATA         32
-typedef struct xfs_ifork {
-       int                     if_bytes;       /* bytes in if_u1 */
-       int                     if_real_bytes;  /* bytes allocated in if_u1 */
-       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
-       short                   if_broot_bytes; /* bytes allocated for root */
-       unsigned char           if_flags;       /* per-fork flags */
-       union {
-               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
-               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
-               char            *if_data;       /* inline file data */
-       } if_u1;
-       union {
-               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
-                                               /* very small file extents */
-               char            if_inline_data[XFS_INLINE_DATA];
-                                               /* very small file data */
-               xfs_dev_t       if_rdev;        /* dev number if special */
-               uuid_t          if_uuid;        /* mount point value */
-       } if_u2;
-} xfs_ifork_t;
-
-/*
- * Inode location information.  Stored in the inode and passed to
- * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
- */
-struct xfs_imap {
-       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
-       ushort          im_len;         /* length in BBs of inode chunk */
-       ushort          im_boffset;     /* inode offset in block in bytes */
-};
-
-/*
- * This is the xfs in-core inode structure.
- * Most of the on-disk inode is embedded in the i_d field.
- *
- * The extent pointers/inline file space, however, are managed
- * separately.  The memory for this information is pointed to by
- * the if_u1 unions depending on the type of the data.
- * This is used to linearize the array of extents for fast in-core
- * access.  This is used until the file's number of extents
- * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
- * are accessed through the buffer cache.
- *
- * Other state kept in the in-core inode is used for identification,
- * locking, transactional updating, etc of the inode.
- *
- * Generally, we do not want to hold the i_rlock while holding the
- * i_ilock. Hierarchy is i_iolock followed by i_rlock.
- *
- * xfs_iptr_t contains all the inode fields up to and including the
- * i_mnext and i_mprev fields, it is used as a marker in the inode
- * chain off the mount structure by xfs_sync calls.
- */
-
-typedef struct xfs_ictimestamp {
-       __int32_t       t_sec;          /* timestamp seconds */
-       __int32_t       t_nsec;         /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
-
-/*
- * NOTE:  This structure must be kept identical to struct xfs_dinode
- *       in xfs_dinode.h except for the endianness annotations.
- */
-typedef struct xfs_icdinode {
-       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
-       __uint16_t      di_mode;        /* mode and type of file */
-       __int8_t        di_version;     /* inode version */
-       __int8_t        di_format;      /* format of di_c data */
-       __uint16_t      di_onlink;      /* old number of links to file */
-       __uint32_t      di_uid;         /* owner's user id */
-       __uint32_t      di_gid;         /* owner's group id */
-       __uint32_t      di_nlink;       /* number of links to file */
-       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
-       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
-       __uint8_t       di_pad[6];      /* unused, zeroed space */
-       __uint16_t      di_flushiter;   /* incremented on flush */
-       xfs_ictimestamp_t di_atime;     /* time last accessed */
-       xfs_ictimestamp_t di_mtime;     /* time last modified */
-       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
-       xfs_fsize_t     di_size;        /* number of bytes in file */
-       xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
-       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
-       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
-       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
-       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
-       __int8_t        di_aformat;     /* format of attr fork's data */
-       __uint32_t      di_dmevmask;    /* DMIG event mask */
-       __uint16_t      di_dmstate;     /* DMIG state info */
-       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
-       __uint32_t      di_gen;         /* generation number */
-
-       /* di_next_unlinked is the only non-core field in the old dinode */
-       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
-
-       /* start of the extended dinode, writable fields */
-       __uint32_t      di_crc;         /* CRC of the inode */
-       __uint64_t      di_changecount; /* number of attribute changes */
-       xfs_lsn_t       di_lsn;         /* flush sequence */
-       __uint64_t      di_flags2;      /* more random flags */
-       __uint8_t       di_pad2[16];    /* more padding for future expansion */
-
-       /* fields only written to during inode creation */
-       xfs_ictimestamp_t di_crtime;    /* time created */
-       xfs_ino_t       di_ino;         /* inode number */
-       uuid_t          di_uuid;        /* UUID of the filesystem */
-
-       /* structure must be padded to 64 bit alignment */
-} xfs_icdinode_t;
-
-static inline uint xfs_icdinode_size(int version)
-{
-       if (version == 3)
-               return sizeof(struct xfs_icdinode);
-       return offsetof(struct xfs_icdinode, di_next_unlinked);
-}
-
-/*
- * Flags for xfs_ichgtime().
- */
-#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
-
-/*
- * Per-fork incore inode flags.
- */
-#define        XFS_IFINLINE    0x01    /* Inline data is read in */
-#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
-#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
-#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
-
-/*
- * Fork handling.
- */
-
-#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)            \
-       ((w) == XFS_DATA_FORK ? \
-               &(ip)->i_df : \
-               (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_IFORK_BOFF(ip) : \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
-#define XFS_IFORK_ASIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
-                       XFS_IFORK_BOFF(ip) : \
-               0)
-#define XFS_IFORK_SIZE(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_IFORK_DSIZE(ip) : \
-               XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_format : \
-               (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_format = (n)) : \
-               ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_nextents : \
-               (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_nextents = (n)) : \
-               ((ip)->i_d.di_anextents = (n)))
-#define XFS_IFORK_MAXEXT(ip, w) \
-       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
-
-
-#ifdef __KERNEL__
 
+struct xfs_dinode;
+struct xfs_inode;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -525,9 +315,21 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
         ((pip)->i_d.di_mode & S_ISGID))
 
 
-/*
- * xfs_inode.c prototypes.
- */
+int            xfs_release(struct xfs_inode *ip);
+int            xfs_inactive(struct xfs_inode *ip);
+int            xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+                          struct xfs_inode **ipp, struct xfs_name *ci_name);
+int            xfs_create(struct xfs_inode *dp, struct xfs_name *name,
+                          umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int            xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
+                          struct xfs_inode *ip);
+int            xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
+                        struct xfs_name *target_name);
+int            xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
+                          struct xfs_inode *src_ip, struct xfs_inode *target_dp,
+                          struct xfs_name *target_name,
+                          struct xfs_inode *target_ip);
+
 void           xfs_ilock(xfs_inode_t *, uint);
 int            xfs_ilock_nowait(xfs_inode_t *, uint);
 void           xfs_iunlock(xfs_inode_t *, uint);
@@ -548,13 +350,28 @@ int               xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
 int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
+
 void           xfs_iunpin_wait(xfs_inode_t *);
+#define xfs_ipincount(ip)      ((unsigned int) atomic_read(&(ip)->i_pincount)) /* pin count of ip; argument parenthesized so any pointer expression works */
+
 int            xfs_iflush(struct xfs_inode *, struct xfs_buf **);
 void           xfs_lock_inodes(xfs_inode_t **, int, uint);
 void           xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 xfs_extlen_t   xfs_get_extsz_hint(struct xfs_inode *ip);
 
+int            xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
+                              xfs_nlink_t, xfs_dev_t, prid_t, int,
+                              struct xfs_inode **, int *);
+int            xfs_droplink(struct xfs_trans *, struct xfs_inode *);
+int            xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
+void           xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
+
+/* from xfs_file.c */
+int            xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int            xfs_iozero(struct xfs_inode *, loff_t, size_t);
+
+
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -568,65 +385,6 @@ do { \
        iput(VFS_I(ip)); \
 } while (0)
 
-#endif /* __KERNEL__ */
-
-/*
- * Flags for xfs_iget()
- */
-#define XFS_IGET_CREATE                0x1
-#define XFS_IGET_UNTRUSTED     0x2
-#define XFS_IGET_DONTCACHE     0x4
-
-int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-                              struct xfs_imap *, struct xfs_dinode **,
-                              struct xfs_buf **, uint, uint);
-int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
-                         struct xfs_inode *, uint);
-void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void           xfs_dinode_to_disk(struct xfs_dinode *,
-                                  struct xfs_icdinode *);
-void           xfs_idestroy_fork(struct xfs_inode *, int);
-void           xfs_idata_realloc(struct xfs_inode *, int, int);
-void           xfs_iroot_realloc(struct xfs_inode *, int, int);
-int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
-int            xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
-
-xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
-void           xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
-                               xfs_bmbt_irec_t *, int);
-void           xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
-void           xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
-void           xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
-void           xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void           xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
-void           xfs_iext_inline_to_direct(xfs_ifork_t *, int);
-void           xfs_iext_destroy(xfs_ifork_t *);
-xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
-xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
-void           xfs_iext_irec_init(xfs_ifork_t *);
-xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
-void           xfs_iext_irec_remove(xfs_ifork_t *, int);
-void           xfs_iext_irec_compact(xfs_ifork_t *);
-void           xfs_iext_irec_compact_pages(xfs_ifork_t *);
-void           xfs_iext_irec_compact_full(xfs_ifork_t *);
-void           xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
-bool           xfs_can_free_eofblocks(struct xfs_inode *, bool);
-
-#define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
-
-#if defined(DEBUG)
-void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define        xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
-
-extern struct kmem_zone        *xfs_ifork_zone;
 extern struct kmem_zone        *xfs_inode_zone;
-extern struct kmem_zone        *xfs_ili_zone;
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
 
 #endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
new file mode 100644 (file)
index 0000000..e011d59
--- /dev/null
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_icache.h"
+#include "xfs_ialloc.h"
+
+/*
+ * Debug-only: check that none of the inodes in the buffer have a next
+ * unlinked field of 0; alert and ASSERT if one does.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+       xfs_mount_t     *mp,
+       xfs_buf_t       *bp)
+{
+       int             i;
+       int             j;              /* number of inodes scanned below */
+       xfs_dinode_t    *dip;
+
+       j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+       for (i = 0; i < j; i++) {
+               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                       i * mp->m_sb.sb_inodesize);
+               if (!dip->di_next_unlinked)  {
+                       xfs_alert(mp,
+       "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
+                               bp);
+                       ASSERT(dip->di_next_unlinked);
+               }
+       }
+}
+#endif /* DEBUG */
+
+/*
+ * If we are doing readahead on an inode buffer, we might be in log recovery
+ * reading an inode allocation buffer that hasn't yet been replayed, and hence
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+ * If the readahead buffer is invalid, we don't want to mark it with an error,
+ * but we do want to clear the DONE status of the buffer so that a followup read
+ * will re-read it from disk. This will ensure that we don't get unnecessary
+ * warnings during log recovery and we don't get unnecessary panics on debug
+ * kernels.
+ */
+static void
+xfs_inode_buf_verify(
+       struct xfs_buf  *bp,
+       bool            readahead)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       int             i;
+       int             ni;             /* inodes in buffer: fs blocks * inodes per block */
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        */
+       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       if (readahead) {
+                               bp->b_flags &= ~XBF_DONE;       /* no error is set on bp */
+                               return;         /* also skips xfs_inobp_check() below */
+                       }
+
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);      /* loop continues: later inodes are still checked */
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+                                            mp, dip);
+#ifdef DEBUG
+                       xfs_emerg(mp,
+                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                               (unsigned long long)bp->b_bn, i,
+                               be16_to_cpu(dip->di_magic));
+                       ASSERT(0);
+#endif /* DEBUG */
+               }
+       }
+       xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);        /* strict: a bad inode sets EFSCORRUPTED on bp */
+}
+
+static void
+xfs_inode_buf_readahead_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, true);         /* lenient: a bad inode only clears XBF_DONE */
+}
+
+static void
+xfs_inode_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp, false);        /* strict, same checks as the read verifier */
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+       .verify_read = xfs_inode_buf_read_verify,       /* strict read-side verifier */
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+       .verify_read = xfs_inode_buf_readahead_verify,  /* readahead: invalid inodes are not an error */
+       .verify_write = xfs_inode_buf_write_verify,     /* writes use the same strict verifier as xfs_inode_buf_ops */
+};
+
+
+/*
+ * This routine is called to map an inode to the buffer containing the on-disk
+ * version of the inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
+ * pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and dipp are
+ * undefined.
+ */
+int
+xfs_imap_to_bp(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_imap         *imap,
+       struct xfs_dinode       **dipp,
+       struct xfs_buf          **bpp,
+       uint                    buf_flags,
+       uint                    iget_flags)
+{
+       struct xfs_buf          *bp;
+       int                     error;
+
+       buf_flags |= XBF_UNMAPPED;      /* always added to whatever the caller passed */
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                  (int)imap->im_len, buf_flags, &bp,
+                                  &xfs_inode_buf_ops);
+       if (error) {
+               if (error == EAGAIN) {          /* returned quietly, no warning logged */
+                       ASSERT(buf_flags & XBF_TRYLOCK);
+                       return error;
+               }
+
+               if (error == EFSCORRUPTED &&
+                   (iget_flags & XFS_IGET_UNTRUSTED))
+                       return XFS_ERROR(EINVAL);       /* untrusted lookup: EINVAL instead of corruption, no warning */
+
+               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                       __func__, error);
+               return error;
+       }
+
+       *bpp = bp;
+       *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+       return 0;
+}
+
+STATIC void                            /* copy on-disk (big-endian) inode fields into the in-core, CPU-endian core */
+xfs_dinode_from_disk(
+       xfs_icdinode_t          *to,
+       xfs_dinode_t            *from)
+{
+       to->di_magic = be16_to_cpu(from->di_magic);
+       to->di_mode = be16_to_cpu(from->di_mode);
+       to->di_version = from ->di_version;     /* copied as-is, no byte swap */
+       to->di_format = from->di_format;
+       to->di_onlink = be16_to_cpu(from->di_onlink);
+       to->di_uid = be32_to_cpu(from->di_uid);
+       to->di_gid = be32_to_cpu(from->di_gid);
+       to->di_nlink = be32_to_cpu(from->di_nlink);
+       to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+       to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_flushiter = be16_to_cpu(from->di_flushiter);
+       to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
+       to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+       to->di_size = be64_to_cpu(from->di_size);
+       to->di_nblocks = be64_to_cpu(from->di_nblocks);
+       to->di_extsize = be32_to_cpu(from->di_extsize);
+       to->di_nextents = be32_to_cpu(from->di_nextents);
+       to->di_anextents = be16_to_cpu(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat  = from->di_aformat;
+       to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
+       to->di_dmstate  = be16_to_cpu(from->di_dmstate);
+       to->di_flags    = be16_to_cpu(from->di_flags);
+       to->di_gen      = be32_to_cpu(from->di_gen);
+
+       if (to->di_version == 3) {      /* extended fields are copied for v3 inodes only */
+               to->di_changecount = be64_to_cpu(from->di_changecount);
+               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+               to->di_flags2 = be64_to_cpu(from->di_flags2);
+               to->di_ino = be64_to_cpu(from->di_ino);
+               to->di_lsn = be64_to_cpu(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+       }
+}
+
+void                                   /* copy the in-core, CPU-endian inode core into on-disk (big-endian) form */
+xfs_dinode_to_disk(
+       xfs_dinode_t            *to,
+       xfs_icdinode_t          *from)
+{
+       to->di_magic = cpu_to_be16(from->di_magic);
+       to->di_mode = cpu_to_be16(from->di_mode);
+       to->di_version = from ->di_version;     /* copied as-is, no byte swap */
+       to->di_format = from->di_format;
+       to->di_onlink = cpu_to_be16(from->di_onlink);
+       to->di_uid = cpu_to_be32(from->di_uid);
+       to->di_gid = cpu_to_be32(from->di_gid);
+       to->di_nlink = cpu_to_be32(from->di_nlink);
+       to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
+       to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
+       to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
+       to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+       to->di_size = cpu_to_be64(from->di_size);
+       to->di_nblocks = cpu_to_be64(from->di_nblocks);
+       to->di_extsize = cpu_to_be32(from->di_extsize);
+       to->di_nextents = cpu_to_be32(from->di_nextents);
+       to->di_anextents = cpu_to_be16(from->di_anextents);
+       to->di_forkoff = from->di_forkoff;
+       to->di_aformat = from->di_aformat;
+       to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+       to->di_dmstate = cpu_to_be16(from->di_dmstate);
+       to->di_flags = cpu_to_be16(from->di_flags);
+       to->di_gen = cpu_to_be32(from->di_gen);
+
+       if (from->di_version == 3) {    /* extended fields are written for v3 inodes only */
+               to->di_changecount = cpu_to_be64(from->di_changecount);
+               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+               to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+               to->di_flags2 = cpu_to_be64(from->di_flags2);
+               to->di_ino = cpu_to_be64(from->di_ino);
+               to->di_lsn = cpu_to_be64(from->di_lsn);
+               memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+               uuid_copy(&to->di_uuid, &from->di_uuid);
+               to->di_flushiter = 0;   /* v3 inodes always get a zero flushiter on disk */
+       } else {
+               to->di_flushiter = cpu_to_be16(from->di_flushiter);
+       }
+}
+
+static bool                            /* true if the on-disk inode passes the checks below */
+xfs_dinode_verify(
+       struct xfs_mount        *mp,
+       struct xfs_inode        *ip,
+       struct xfs_dinode       *dip)
+{
+       if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+               return false;
+
+       /* only version 3 or greater inodes are extensively verified here */
+       if (dip->di_version < 3)
+               return true;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))  /* v3 inode is rejected when the superblock lacks CRC support */
+               return false;
+       if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             offsetof(struct xfs_dinode, di_crc)))
+               return false;
+       if (be64_to_cpu(dip->di_ino) != ip->i_ino)      /* must record its own inode number */
+               return false;
+       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))      /* must carry the superblock's UUID */
+               return false;
+       return true;
+}
+
+void                                   /* compute and store di_crc for a v3 on-disk inode */
+xfs_dinode_calc_crc(
+       struct xfs_mount        *mp,
+       struct xfs_dinode       *dip)
+{
+       __uint32_t              crc;
+
+       if (dip->di_version < 3)        /* nothing is written for pre-v3 inodes */
+               return;
+
+       ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+       crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+                             offsetof(struct xfs_dinode, di_crc));
+       dip->di_crc = xfs_end_cksum(crc);
+}
+
+/*
+ * Read the disk inode attributes into the in-core inode structure.
+ *
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
+ */
+int
+xfs_iread(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       uint            iget_flags)
+{
+       xfs_buf_t       *bp;
+       xfs_dinode_t    *dip;
+       int             error;
+
+       /*
+        * Fill in the location information in the in-core inode.
+        */
+       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+       if (error)
+               return error;
+
+       /* shortcut IO on inode allocation if possible */
+       if ((iget_flags & XFS_IGET_CREATE) &&
+           xfs_sb_version_hascrc(&mp->m_sb) &&
+           !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+               /* initialise the on-disk inode core */
+               memset(&ip->i_d, 0, sizeof(ip->i_d));
+               ip->i_d.di_magic = XFS_DINODE_MAGIC;
+               ip->i_d.di_gen = prandom_u32();
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       ip->i_d.di_version = 3;
+                       ip->i_d.di_ino = ip->i_ino;
+                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+               } else  /* unreachable: hascrc is already required by the enclosing test */
+                       ip->i_d.di_version = 2;
+               return 0;       /* no buffer was read, so there is nothing to release */
+       }
+
+       /*
+        * Get pointers to the on-disk inode and the buffer containing it.
+        */
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
+       if (error)
+               return error;
+
+       /* even unallocated inodes are verified */
+       if (!xfs_dinode_verify(mp, ip, dip)) {
+               xfs_alert(mp, "%s: validation failed for inode %lld",
+                               __func__, ip->i_ino);
+
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+               error = XFS_ERROR(EFSCORRUPTED);
+               goto out_brelse;
+       }
+
+       /*
+        * If the on-disk inode is already linked to a directory
+        * entry, copy all of the inode into the in-core inode.
+        * xfs_iformat_fork() handles copying in the inode format
+        * specific information.
+        * Otherwise, just get the truly permanent information.
+        */
+       if (dip->di_mode) {
+               xfs_dinode_from_disk(&ip->i_d, dip);
+               error = xfs_iformat_fork(ip, dip);
+               if (error)  {
+#ifdef DEBUG
+                       xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+                               __func__, error);
+#endif /* DEBUG */
+                       goto out_brelse;
+               }
+       } else {
+               /*
+                * Partial initialisation of the in-core inode. Just the bits
+                * that xfs_ialloc won't overwrite or relies on being correct.
+                */
+               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+               ip->i_d.di_version = dip->di_version;
+               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+               if (dip->di_version == 3) {
+                       ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+                       uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+               }
+
+               /*
+                * Make sure to pull in the mode here as well in
+                * case the inode is released without being used.
+                * This ensures that xfs_inactive() will see that
+                * the inode is already free and not try to mess
+                * with the uninitialized part of it.
+                */
+               ip->i_d.di_mode = 0;
+       }
+
+       /*
+        * The inode format changed when we moved the link count and
+        * made it 32 bits long.  If this is an old format inode,
+        * convert it in memory to look like a new one.  If it gets
+        * flushed to disk we will convert back before flushing or
+        * logging it.  We zero out the new projid field and the old link
+        * count field.  We'll handle clearing the pad field (the remains
+        * of the old uuid field) when we actually convert the inode to
+        * the new format. We don't change the version number so that we
+        * can distinguish this from a real new format inode.
+        */
+       if (ip->i_d.di_version == 1) {
+               ip->i_d.di_nlink = ip->i_d.di_onlink;
+               ip->i_d.di_onlink = 0;
+               xfs_set_projid(ip, 0);
+       }
+
+       ip->i_delayed_blks = 0;
+
+       /*
+        * Mark the buffer containing the inode as something to keep
+        * around for a while.  This helps to keep recently accessed
+        * meta-data in-core longer.
+        */
+       xfs_buf_set_ref(bp, XFS_INO_REF);
+
+       /*
+        * Use xfs_trans_brelse() to release the buffer containing the on-disk
+        * inode, because it was acquired with xfs_trans_read_buf() in
+        * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+        * brelse().  If we're within a transaction, then xfs_trans_brelse()
+        * will only release the buffer if it is not dirty within the
+        * transaction.  It will be OK to release the buffer in this case,
+        * because inodes on disk are never destroyed and we will be locking the
+        * new in-core inode before putting it in the cache where other
+        * processes can find it.  Thus we don't have to worry about the inode
+        * being changed just because we released the buffer.
+        */
+ out_brelse:
+       xfs_trans_brelse(tp, bp);       /* shared by the success path and the error paths above */
+       return error;
+}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
new file mode 100644 (file)
index 0000000..599e6c0
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_BUF_H__
+#define        __XFS_INODE_BUF_H__
+
+struct xfs_inode;
+struct xfs_dinode;
+struct xfs_icdinode;
+
+/*
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
+ */
+struct xfs_imap {
+       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
+       ushort          im_len;         /* length in BBs of inode chunk */
+       ushort          im_boffset;     /* inode offset in block in bytes */
+};
+
+int            xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+                              struct xfs_imap *, struct xfs_dinode **,
+                              struct xfs_buf **, uint, uint);
+int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                         struct xfs_inode *, uint);
+void           xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void           xfs_dinode_to_disk(struct xfs_dinode *,
+                                  struct xfs_icdinode *);
+
+#if defined(DEBUG)
+void           xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define        xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+
+#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
new file mode 100644 (file)
index 0000000..02f1083
--- /dev/null
@@ -0,0 +1,1920 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/log2.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_filestream.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+
+/*
+ * Zone used by xfs_iformat_fork() to allocate the attribute fork structure
+ * (ip->i_afp) and by the teardown paths to free it again.
+ */
+kmem_zone_t *xfs_ifork_zone;
+
+/*
+ * Per-format helpers called from xfs_iformat_fork(); one for each on-disk
+ * fork format (local, extents, btree).
+ */
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+#ifdef DEBUG
+/*
+ * Debug-only sanity pass over the first nrecs in-core extent records of
+ * a fork.  Each record is decoded; when the extent format carries no
+ * state flag (XFS_EXTFMT_NOSTATE) every decoded extent must be in the
+ * normal state.
+ */
+void
+xfs_validate_extents(
+       xfs_ifork_t             *ifp,
+       int                     nrecs,
+       xfs_exntfmt_t           fmt)
+{
+       int                     idx;
+
+       for (idx = 0; idx < nrecs; idx++) {
+               xfs_bmbt_rec_host_t     *src = xfs_iext_get_ext(ifp, idx);
+               xfs_bmbt_rec_host_t     copy;
+               xfs_bmbt_irec_t         decoded;
+
+               /* Take an aligned private copy before decoding it. */
+               copy.l0 = get_unaligned(&src->l0);
+               copy.l1 = get_unaligned(&src->l1);
+               xfs_bmbt_get_all(&copy, &decoded);
+
+               if (fmt != XFS_EXTFMT_NOSTATE)
+                       continue;
+               ASSERT(decoded.br_state == XFS_EXT_NORM);
+       }
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ifp, nrecs, fmt)
+#endif /* DEBUG */
+
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.  For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.  For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ *
+ * If the on-disk inode has an attribute fork, ip->i_afp is allocated and
+ * formatted the same way.
+ *
+ * ip  -- the in-core inode being initialised
+ * dip -- the on-disk inode it is initialised from
+ *
+ * Returns 0 on success, EFSCORRUPTED (via XFS_ERROR) when the on-disk
+ * inode fails a sanity check, or the error of the per-format helper.
+ * On any attribute fork failure ip->i_afp is freed and reset to NULL and
+ * the data fork is torn down again before returning.
+ */
+int
+xfs_iformat_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip)
+{
+       xfs_attr_shortform_t    *atp;
+       int                     size;
+       int                     error = 0;
+       xfs_fsize_t             di_size;
+
+       /* The two forks together cannot map more extents than blocks. */
+       if (unlikely(be32_to_cpu(dip->di_nextents) +
+                    be16_to_cpu(dip->di_anextents) >
+                    be64_to_cpu(dip->di_nblocks))) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
+                       (unsigned long long)ip->i_ino,
+                       (int)(be32_to_cpu(dip->di_nextents) +
+                             be16_to_cpu(dip->di_anextents)),
+                       (unsigned long long)
+                               be64_to_cpu(dip->di_nblocks));
+               XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+               xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
+                       (unsigned long long)ip->i_ino,
+                       dip->di_forkoff);
+               XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /* A realtime inode needs a realtime device to live on. */
+       if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                    !ip->i_mount->m_rtdev_targp)) {
+               xfs_warn(ip->i_mount,
+                       "corrupt dinode %Lu, has realtime flag set.",
+                       (unsigned long long)ip->i_ino);
+               XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                    XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       switch (ip->i_d.di_mode & S_IFMT) {
+       case S_IFIFO:
+       case S_IFCHR:
+       case S_IFBLK:
+       case S_IFSOCK:
+               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
+                       XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
+                                             ip->i_mount, dip);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+               ip->i_d.di_size = 0;
+               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
+               break;
+
+       case S_IFREG:
+       case S_IFLNK:
+       case S_IFDIR:
+               switch (dip->di_format) {
+               case XFS_DINODE_FMT_LOCAL:
+                       /*
+                        * no local regular files yet
+                        */
+                       if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (local format for regular file).",
+                                       (unsigned long long) ip->i_ino);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(4)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       /* Local data must fit in the on-disk data fork. */
+                       di_size = be64_to_cpu(dip->di_size);
+                       if (unlikely(di_size < 0 ||
+                                    di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
+                               xfs_warn(ip->i_mount,
+                       "corrupt inode %Lu (bad size %Ld for local inode).",
+                                       (unsigned long long) ip->i_ino,
+                                       (long long) di_size);
+                               XFS_CORRUPTION_ERROR("xfs_iformat(5)",
+                                                    XFS_ERRLEVEL_LOW,
+                                                    ip->i_mount, dip);
+                               return XFS_ERROR(EFSCORRUPTED);
+                       }
+
+                       size = (int)di_size;
+                       error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+                       break;
+               case XFS_DINODE_FMT_EXTENTS:
+                       error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+                       break;
+               case XFS_DINODE_FMT_BTREE:
+                       error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+                       break;
+               default:
+                       XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+               break;
+
+       default:
+               XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+       if (error)
+               return error;
+
+       /* No attribute fork on disk: the data fork is all there is. */
+       if (!XFS_DFORK_Q(dip))
+               return 0;
+
+       ASSERT(ip->i_afp == NULL);
+       ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+
+       switch (dip->di_aformat) {
+       case XFS_DINODE_FMT_LOCAL:
+               atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
+               size = be16_to_cpu(atp->hdr.totsize);
+
+               if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+                       xfs_warn(ip->i_mount,
+                               "corrupt inode %Lu (bad attr fork size %Ld).",
+                               (unsigned long long) ip->i_ino,
+                               (long long) size);
+                       XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+                                            XFS_ERRLEVEL_LOW,
+                                            ip->i_mount, dip);
+                       /*
+                        * Do not return from here: i_afp has already been
+                        * allocated, so leave through the common cleanup
+                        * below like every other attr fork failure.
+                        */
+                       error = XFS_ERROR(EFSCORRUPTED);
+                       break;
+               }
+
+               error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+               break;
+       case XFS_DINODE_FMT_EXTENTS:
+               error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+               break;
+       case XFS_DINODE_FMT_BTREE:
+               error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+               break;
+       default:
+               error = XFS_ERROR(EFSCORRUPTED);
+               break;
+       }
+       if (error) {
+               /* Undo the attr fork allocation and the data fork setup. */
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+       }
+       return error;
+}
+
+/*
+ * Set up a local-format fork: the fork contents live in the on-disk
+ * inode itself and are copied into the in-core fork.
+ *
+ * Small contents go into the fork's embedded if_inline_data area;
+ * anything larger gets a separately allocated buffer whose length is
+ * rounded up to a multiple of 4.  if_data points at whichever buffer is
+ * used (NULL for an empty fork), if_bytes holds the logical size and
+ * if_real_bytes the allocated size (0 when nothing was allocated).
+ *
+ * Returns 0, or EFSCORRUPTED when size exceeds the on-disk fork size.
+ */
+STATIC int
+xfs_iformat_local(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork,
+       int             size)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_ifork_t             *fork;
+       int                     alloc_bytes = 0;
+
+       /*
+        * Reject an unreasonable size up front rather than crash in
+        * kmem_alloc() or memcpy() further down.
+        */
+       if (unlikely(size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
+               xfs_warn(mp,
+       "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+                       (unsigned long long) ip->i_ino, size,
+                       XFS_DFORK_SIZE(dip, mp, whichfork));
+               XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
+                                    mp, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       fork = XFS_IFORK_PTR(ip, whichfork);
+       if (size == 0) {
+               fork->if_u1.if_data = NULL;
+       } else {
+               if (size > sizeof(fork->if_u2.if_inline_data)) {
+                       /* Too big for the embedded buffer. */
+                       alloc_bytes = roundup(size, 4);
+                       fork->if_u1.if_data = kmem_alloc(alloc_bytes,
+                                                        KM_SLEEP | KM_NOFS);
+               } else {
+                       fork->if_u1.if_data = fork->if_u2.if_inline_data;
+               }
+               memcpy(fork->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork),
+                      size);
+       }
+
+       fork->if_bytes = size;
+       fork->if_real_bytes = alloc_bytes;
+       fork->if_flags &= ~XFS_IFEXTENTS;
+       fork->if_flags |= XFS_IFINLINE;
+       return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ *
+ * ip        -- the in-core inode being set up
+ * dip       -- the on-disk inode holding the extent records
+ * whichfork -- XFS_DATA_FORK or XFS_ATTR_FORK
+ *
+ * Returns 0 on success or EFSCORRUPTED when the extent count does not fit
+ * the on-disk fork or the records fail the no-state check.  On failure no
+ * extent list is left attached to the fork.
+ */
+STATIC int
+xfs_iformat_extents(
+       xfs_inode_t     *ip,
+       xfs_dinode_t    *dip,
+       int             whichfork)
+{
+       xfs_bmbt_rec_t  *dp;
+       xfs_ifork_t     *ifp;
+       int             fork_size;
+       int             nex;
+       int             size;
+       int             i;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+       fork_size = XFS_DFORK_SIZE(dip, ip->i_mount, whichfork);
+
+       /*
+        * If the number of extents is unreasonable, then something
+        * is wrong and we just bail out rather than crash in
+        * kmem_alloc() or memcpy() below.
+        *
+        * The count is validated by division, before it is multiplied by
+        * the record size: a huge on-disk count could otherwise wrap the
+        * byte size around to a small value that looks acceptable while
+        * the copy loop below still walks nex records.
+        */
+       if (unlikely(nex < 0 || fork_size < 0 ||
+                    nex > fork_size / (int)sizeof(xfs_bmbt_rec_t))) {
+               xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
+                       (unsigned long long) ip->i_ino, nex);
+               XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
+                                    ip->i_mount, dip);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /* Cannot overflow: nex is bounded by the fork size checked above. */
+       size = nex * (int)sizeof(xfs_bmbt_rec_t);
+
+       ifp->if_real_bytes = 0;
+       if (nex == 0)
+               ifp->if_u1.if_extents = NULL;
+       else if (nex <= XFS_INLINE_EXTS)
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       else
+               xfs_iext_add(ifp, 0, nex);
+
+       ifp->if_bytes = size;
+       if (size) {
+               /* Convert each big-endian on-disk record to host format. */
+               dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
+               xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
+               for (i = 0; i < nex; i++, dp++) {
+                       xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+                       ep->l0 = get_unaligned_be64(&dp->l0);
+                       ep->l1 = get_unaligned_be64(&dp->l1);
+               }
+               XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
+               if ((whichfork != XFS_DATA_FORK ||
+                    XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) &&
+                   unlikely(xfs_check_nostate_extents(ifp, 0, nex))) {
+                       XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+                                        XFS_ERRLEVEL_LOW,
+                                        ip->i_mount);
+                       /*
+                        * XFS_IFEXTENTS is not set yet, so
+                        * xfs_idestroy_fork() would not release the list
+                        * built above; drop it here before failing.
+                        */
+                       xfs_iext_destroy(ifp);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+       }
+       ifp->if_flags |= XFS_IFEXTENTS;
+       return 0;
+}
+
+/*
+ * Set up a btree-format fork.  Only the btree root stored in the on-disk
+ * inode is brought in-core here: a buffer is allocated for it and the
+ * on-disk root is converted into the in-memory layout.  The extent list
+ * itself stays unread (XFS_IFEXTENTS is cleared) until it is needed.
+ *
+ * Returns 0, or EFSCORRUPTED when the fork fails the sanity checks.
+ */
+STATIC int
+xfs_iformat_btree(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_ifork_t             *fork = XFS_IFORK_PTR(ip, whichfork);
+       xfs_bmdr_block_t        *droot =
+                       (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+       int                     broot_bytes = XFS_BMAP_BROOT_SPACE(mp, droot);
+       /* REFERENCED */
+       int                     root_recs = be16_to_cpu(droot->bb_numrecs);
+
+       /*
+        * The extents would have fit in the fork directly, so this
+        * should never have been a btree.
+        */
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                                       XFS_IFORK_MAXEXT(ip, whichfork)))
+               goto corrupt;
+
+       /* The root claims more records than the fork has room for. */
+       if (unlikely(XFS_BMDR_SPACE_CALC(root_recs) >
+                                       XFS_DFORK_SIZE(dip, mp, whichfork)))
+               goto corrupt;
+
+       /* There cannot be more extents than blocks. */
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks))
+               goto corrupt;
+
+       fork->if_broot_bytes = broot_bytes;
+       fork->if_broot = kmem_alloc(broot_bytes, KM_SLEEP | KM_NOFS);
+       ASSERT(fork->if_broot != NULL);
+
+       /* Convert the on-disk root into the in-memory root block. */
+       xfs_bmdr_to_bmbt(ip, droot, XFS_DFORK_SIZE(dip, mp, whichfork),
+                        fork->if_broot, broot_bytes);
+       fork->if_flags &= ~XFS_IFEXTENTS;
+       fork->if_flags |= XFS_IFBROOT;
+       return 0;
+
+corrupt:
+       xfs_warn(mp, "corrupt inode %Lu (btree).",
+                               (unsigned long long) ip->i_ino);
+       XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+                                mp, dip);
+       return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * Bring the full extent list of a btree-format fork in-core.
+ * Room for all extents is reserved in if_extents here; the records are
+ * then filled in by xfs_bmap_read_extents().  If that fails, the list is
+ * torn down again and XFS_IFEXTENTS is cleared.
+ *
+ * Returns 0, EFSCORRUPTED if the fork is not in btree format, or the
+ * error reported by xfs_bmap_read_extents().
+ */
+int
+xfs_iread_extents(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       xfs_ifork_t     *fork;
+       xfs_extnum_t    count;
+       int             err;
+
+       if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+               XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+                                ip->i_mount);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       fork = XFS_IFORK_PTR(ip, whichfork);
+       count = XFS_IFORK_NEXTENTS(ip, whichfork);
+
+       /*
+        * The extent count was already checked in xfs_iformat_btree(),
+        * so start from an empty list and size it for every extent.
+        */
+       fork->if_real_bytes = 0;
+       fork->if_bytes = 0;
+       fork->if_flags |= XFS_IFEXTENTS;
+       xfs_iext_add(fork, 0, count);
+
+       err = xfs_bmap_read_extents(tp, ip, whichfork);
+       if (!err) {
+               xfs_validate_extents(fork, count, XFS_EXTFMT_INODE(ip));
+               return 0;
+       }
+
+       xfs_iext_destroy(fork);
+       fork->if_flags &= ~XFS_IFEXTENTS;
+       return err;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.  When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated.  The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * rec_diff -- the change in the number of records, positive or negative,
+ *      requested for the if_broot array.
+ * whichfork -- selects the fork of ip whose if_broot is resized
+ */
+void
+xfs_iroot_realloc(
+       xfs_inode_t             *ip,
+       int                     rec_diff,
+       int                     whichfork)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     cur_max;
+       xfs_ifork_t             *ifp;
+       struct xfs_btree_block  *new_broot;
+       int                     new_max;
+       size_t                  new_size;
+       char                    *np;
+       char                    *op;
+
+       /*
+        * Handle the degenerate case quietly.
+        */
+       if (rec_diff == 0) {
+               return;
+       }
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (rec_diff > 0) {
+               /*
+                * If there wasn't any memory allocated before, just
+                * allocate it now and get out.
+                */
+               if (ifp->if_broot_bytes == 0) {
+                       new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+                       ifp->if_broot_bytes = (int)new_size;
+                       return;
+               }
+
+               /*
+                * If there is already an existing if_broot, then we need
+                * to realloc() it and shift the pointers to their new
+                * location.  The records don't change location because
+                * they are kept butted up against the btree block header.
+                */
+               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+               new_max = cur_max + rec_diff;
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+                               XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+                               KM_SLEEP | KM_NOFS);
+               /*
+                * Both addresses are taken in the reallocated buffer: op
+                * is where the pointers sit for the old size, np where
+                * they belong for the new size.  if_broot_bytes must still
+                * hold the old size while op is computed.
+                */
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    (int)new_size);
+               ifp->if_broot_bytes = (int)new_size;
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+               /* Source and destination share one buffer: memmove. */
+               memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+               return;
+       }
+
+       /*
+        * rec_diff is less than 0.  In this case, we are shrinking the
+        * if_broot buffer.  It must already exist.  If we go to zero
+        * records, just get rid of the root and clear the status bit.
+        */
+       ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+       new_max = cur_max + rec_diff;
+       ASSERT(new_max >= 0);
+       if (new_max > 0)
+               new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+       else
+               new_size = 0;
+       if (new_size > 0) {
+               new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+               /*
+                * First copy over the btree block header.
+                */
+               memcpy(new_broot, ifp->if_broot,
+                       XFS_BMBT_BLOCK_LEN(ip->i_mount));
+       } else {
+               new_broot = NULL;
+               ifp->if_flags &= ~XFS_IFBROOT;
+       }
+
+       /*
+        * Only copy the records and pointers if there are any.
+        */
+       if (new_max > 0) {
+               /*
+                * First copy the records.
+                */
+               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+               /*
+                * Then copy the pointers.
+                */
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+                                                    (int)new_size);
+               memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+       }
+       /* Swap in the smaller root (or NULL) and release the old one. */
+       kmem_free(ifp->if_broot);
+       ifp->if_broot = new_broot;
+       ifp->if_broot_bytes = (int)new_size;
+       if (ifp->if_broot)
+               ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                       XFS_IFORK_SIZE(ip, whichfork));
+       return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * On return if_bytes holds the new logical size and if_real_bytes the
+ * size of the separately allocated buffer, or 0 when if_data is NULL or
+ * points at the inline buffer.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *      requested for the if_data array.
+ * whichfork -- selects the fork of ip whose if_data is resized
+ */
+void
+xfs_idata_realloc(
+       xfs_inode_t     *ip,
+       int             byte_diff,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+       int             new_size;
+       int             real_size;
+
+       if (byte_diff == 0) {
+               return;
+       }
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       new_size = (int)ifp->if_bytes + byte_diff;
+       ASSERT(new_size >= 0);
+
+       if (new_size == 0) {
+               /* Nothing left: free a separately allocated buffer, if any. */
+               if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       kmem_free(ifp->if_u1.if_data);
+               }
+               ifp->if_u1.if_data = NULL;
+               real_size = 0;
+       } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+               /*
+                * If the valid extents/data can fit in if_inline_ext/data,
+                * copy them from the malloc'd vector and free it.
+                */
+               if (ifp->if_u1.if_data == NULL) {
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+                             new_size);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+               }
+               real_size = 0;
+       } else {
+               /*
+                * Stuck with malloc/realloc.
+                * For inline data, the underlying buffer must be
+                * a multiple of 4 bytes in size so that it can be
+                * logged and stay on word boundaries.  We enforce
+                * that here.
+                */
+               real_size = roundup(new_size, 4);
+               if (ifp->if_u1.if_data == NULL) {
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+               } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+                       /*
+                        * Only do the realloc if the underlying size
+                        * is really changing.
+                        */
+                       if (ifp->if_real_bytes != real_size) {
+                               ifp->if_u1.if_data =
+                                       kmem_realloc(ifp->if_u1.if_data,
+                                                       real_size,
+                                                       ifp->if_real_bytes,
+                                                       KM_SLEEP | KM_NOFS);
+                       }
+               } else {
+                       /*
+                        * The data has outgrown the inline buffer: move
+                        * the existing if_bytes of it into a new buffer.
+                        */
+                       ASSERT(ifp->if_real_bytes == 0);
+                       ifp->if_u1.if_data = kmem_alloc(real_size,
+                                                       KM_SLEEP | KM_NOFS);
+                       memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+                               ifp->if_bytes);
+               }
+       }
+       ifp->if_real_bytes = real_size;
+       ifp->if_bytes = new_size;
+       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+/*
+ * Release the in-core memory hanging off one fork of an inode: the btree
+ * root, and then either the separately allocated local data buffer or
+ * the extent list, depending on the fork format.  For the attribute fork
+ * the fork structure itself is also returned to xfs_ifork_zone and
+ * ip->i_afp is reset to NULL.
+ *
+ * ip        -- the inode whose fork is being torn down
+ * whichfork -- XFS_ATTR_FORK additionally frees ip->i_afp
+ */
+void
+xfs_idestroy_fork(
+       xfs_inode_t     *ip,
+       int             whichfork)
+{
+       xfs_ifork_t     *ifp;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (ifp->if_broot != NULL) {
+               kmem_free(ifp->if_broot);
+               ifp->if_broot = NULL;
+       }
+
+       /*
+        * If the format is local, then we can't have an extents
+        * array so just look for an inline data array.  If we're
+        * not local then we may or may not have an extents list,
+        * so check and free it up if we do.
+        */
+       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+               if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+                   (ifp->if_u1.if_data != NULL)) {
+                       ASSERT(ifp->if_real_bytes != 0);
+                       kmem_free(ifp->if_u1.if_data);
+                       ifp->if_u1.if_data = NULL;
+                       ifp->if_real_bytes = 0;
+               }
+       } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+                  ((ifp->if_flags & XFS_IFEXTIREC) ||
+                   ((ifp->if_u1.if_extents != NULL) &&
+                    (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+               /*
+                * Only reached with XFS_IFEXTENTS set; an extent list on
+                * a fork without that flag is not released here.
+                */
+               ASSERT(ifp->if_real_bytes != 0);
+               xfs_iext_destroy(ifp);
+       }
+       ASSERT(ifp->if_u1.if_extents == NULL ||
+              ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+       ASSERT(ifp->if_real_bytes == 0);
+       if (whichfork == XFS_ATTR_FORK) {
+               kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+               ip->i_afp = NULL;
+       }
+}
+
+/*
+ * xfs_iextents_copy()
+ *
+ * This is called to copy the REAL extents (as opposed to the delayed
+ * allocation extents) from the inode into the given buffer.  It
+ * returns the number of bytes copied into the buffer.
+ *
+ * Every in-core extent is examined in turn: delayed allocation extents
+ * are skipped, all others are converted to the big-endian on-disk
+ * format and appended to the buffer.
+ *
+ * ip        -- the inode whose extents are copied; the ASSERT below expects
+ *              its ilock to be held
+ * dp        -- destination buffer for the on-disk format records
+ * whichfork -- selects the fork of ip to copy from
+ */
+int
+xfs_iextents_copy(
+       xfs_inode_t             *ip,
+       xfs_bmbt_rec_t          *dp,
+       int                     whichfork)
+{
+       int                     copied;
+       int                     i;
+       xfs_ifork_t             *ifp;
+       int                     nrecs;
+       xfs_fsblock_t           start_block;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+       ASSERT(ifp->if_bytes > 0);
+
+       nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+       ASSERT(nrecs > 0);
+
+       /*
+        * There may be delayed allocation extents in the
+        * inode, so copy the extents one at a time and skip
+        * the delayed ones.  There must be at least one
+        * non-delayed extent.
+        */
+       copied = 0;
+       for (i = 0; i < nrecs; i++) {
+               xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+               start_block = xfs_bmbt_get_startblock(ep);
+               if (isnullstartblock(start_block)) {
+                       /*
+                        * It's a delayed allocation extent, so skip it.
+                        */
+                       continue;
+               }
+
+               /* Translate to on disk format */
+               put_unaligned_be64(ep->l0, &dp->l0);
+               put_unaligned_be64(ep->l1, &dp->l1);
+               dp++;
+               copied++;
+       }
+       ASSERT(copied != 0);
+       /*
+        * NOTE(review): this validates the first "copied" in-core records
+        * of ifp, not the records just written to dp -- confirm intent.
+        */
+       xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+       return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Flush one fork of an in-core inode into the on-disk inode.
+ *
+ *   ip        - in-core inode whose fork is written out
+ *   dip       - on-disk inode; the destination is XFS_DFORK_PTR(dip, whichfork)
+ *   iip       - inode log item; its ili_fields flags decide what is copied.
+ *               If NULL, nothing is done.
+ *   whichfork - fork selector, also used to index the per-fork flag tables
+ *               below (presumably data fork at index 0, attr fork at 1)
+ *   bp        - not referenced by this function
+ *
+ * Nothing is copied unless the log flag matching the fork's current
+ * format is set in iip->ili_fields.
+ *
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+       xfs_inode_t             *ip,
+       xfs_dinode_t            *dip,
+       xfs_inode_log_item_t    *iip,
+       int                     whichfork,
+       xfs_buf_t               *bp)
+{
+       char                    *cp;
+       xfs_ifork_t             *ifp;
+       xfs_mount_t             *mp;
+       static const short      brootflag[2] =
+               { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+       static const short      dataflag[2] =
+               { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+       static const short      extflag[2] =
+               { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+       if (!iip)
+               return;
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       /*
+        * This can happen if we gave up in iformat in an error path,
+        * for the attribute fork.
+        */
+       if (!ifp) {
+               ASSERT(whichfork == XFS_ATTR_FORK);
+               return;
+       }
+       cp = XFS_DFORK_PTR(dip, whichfork);     /* destination in the on-disk inode */
+       mp = ip->i_mount;
+       switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+       case XFS_DINODE_FMT_LOCAL:              /* copy the if_data bytes verbatim */
+               if ((iip->ili_fields & dataflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(ifp->if_u1.if_data != NULL);
+                       ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+                       memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+               }
+               break;
+
+       case XFS_DINODE_FMT_EXTENTS:            /* copy out via xfs_iextents_copy() */
+               ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+                      !(iip->ili_fields & extflag[whichfork]));
+               if ((iip->ili_fields & extflag[whichfork]) &&
+                   (ifp->if_bytes > 0)) {
+                       ASSERT(xfs_iext_get_ext(ifp, 0));
+                       ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+                       (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+                               whichfork);
+               }
+               break;
+
+       case XFS_DINODE_FMT_BTREE:              /* convert if_broot via xfs_bmbt_to_bmdr() */
+               if ((iip->ili_fields & brootflag[whichfork]) &&
+                   (ifp->if_broot_bytes > 0)) {
+                       ASSERT(ifp->if_broot != NULL);
+                       ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+                               XFS_IFORK_SIZE(ip, whichfork));
+                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+                               (xfs_bmdr_block_t *)cp,
+                               XFS_DFORK_SIZE(dip, mp, whichfork));
+               }
+               break;
+
+       case XFS_DINODE_FMT_DEV:                /* data fork only: store if_rdev */
+               if (iip->ili_fields & XFS_ILOG_DEV) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+               }
+               break;
+
+       case XFS_DINODE_FMT_UUID:               /* data fork only: store if_uuid */
+               if (iip->ili_fields & XFS_ILOG_UUID) {
+                       ASSERT(whichfork == XFS_DATA_FORK);
+                       memcpy(XFS_DFORK_DPTR(dip),
+                              &ip->i_df.if_u2.if_uuid,
+                              sizeof(uuid_t));
+               }
+               break;
+
+       default:                                /* unknown fork format */
+               ASSERT(0);
+               break;
+       }
+}
+
+/*
+ * Look up the in-core extent record stored at file extent index idx.
+ *
+ * When the fork uses an indirection array (XFS_IFEXTIREC) the record is
+ * located through xfs_iext_idx_to_irec(), except for index 0, which is
+ * taken directly from the first irec's buffer.  Otherwise the record
+ * comes straight from the linear list, or NULL is returned when the
+ * fork holds no extent bytes at all.
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx)            /* index of target extent */
+{
+       ASSERT(idx >= 0);
+       ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               xfs_ext_irec_t  *irec;          /* irec holding the record */
+               int             irec_idx = 0;   /* irec index */
+               xfs_extnum_t    pos = idx;      /* index within that irec */
+
+               /* Index 0 needs no search: it heads the first irec. */
+               if (idx == 0)
+                       return ifp->if_u1.if_ext_irec->er_extbuf;
+
+               irec = xfs_iext_idx_to_irec(ifp, &pos, &irec_idx, 0);
+               return &irec->er_extbuf[pos];
+       }
+
+       if (!ifp->if_bytes)
+               return NULL;
+       return &ifp->if_u1.if_extents[idx];
+}
+
+/*
+ * Insert 'count' extent records, taken from the array 'new', into the
+ * in-core extent list of an inode fork starting at index 'idx'.  The
+ * fork is the attribute fork when BMAP_ATTRFORK is set in 'state',
+ * otherwise the data fork.  Room is made with xfs_iext_add() and each
+ * new slot is then filled in with xfs_bmbt_set_all().
+ */
+void
+xfs_iext_insert(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* starting index of new items */
+       xfs_extnum_t    count,          /* number of inserted items */
+       xfs_bmbt_irec_t *new,           /* items to insert */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp;           /* fork being modified */
+       xfs_extnum_t    n;              /* offset into the new items */
+
+       if (state & BMAP_ATTRFORK)
+               ifp = ip->i_afp;
+       else
+               ifp = &ip->i_df;
+
+       trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+
+       /* Open up the slots first, then populate them one by one. */
+       xfs_iext_add(ifp, idx, count);
+       for (n = 0; n < count; n++)
+               xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx + n), &new[n]);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased. The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ *
+ * Three storage modes are handled, chosen by the resulting extent
+ * count: the inline buffer (up to XFS_INLINE_EXTS), a single linear
+ * list (up to XFS_LINEAR_EXTS), and an indirection array of extent
+ * pages beyond that.  On return ifp->if_bytes reflects the new size.
+ */
+void
+xfs_iext_add(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin adding exts */
+       int             ext_diff)       /* number of extents to add */
+{
+       int             byte_diff;      /* new bytes being added */
+       int             new_size;       /* size of extents after adding */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT((idx >= 0) && (idx <= nextents));
+       byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+       new_size = ifp->if_bytes + byte_diff;
+       /*
+        * If the new number of extents (nextents + ext_diff)
+        * fits inside the inode, then continue to use the inline
+        * extent buffer.
+        */
+       if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+               if (idx < nextents) {
+                       /* Shift the tail up and clear the opened slots. */
+                       memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+                               &ifp->if_u2.if_inline_ext[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+               }
+               ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+               ifp->if_real_bytes = 0;
+       }
+       /*
+        * Otherwise use a linear (direct) extent list.
+        * If the extents are currently inside the inode,
+        * xfs_iext_realloc_direct will switch us from
+        * inline to direct extent allocation mode.
+        */
+       else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+               xfs_iext_realloc_direct(ifp, new_size);
+               if (idx < nextents) {
+                       memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+                               &ifp->if_u1.if_extents[idx],
+                               (nextents - idx) * sizeof(xfs_bmbt_rec_t));
+                       memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+               }
+       }
+       /* Indirection array */
+       else {
+               xfs_ext_irec_t  *erp;
+               int             erp_idx = 0;
+               int             page_idx = idx;
+
+               ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+               if (ifp->if_flags & XFS_IFEXTIREC) {
+                       erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+               } else {
+                       xfs_iext_irec_init(ifp);
+                       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+                       erp = ifp->if_u1.if_ext_irec;
+               }
+               /* Extents fit in target extent page */
+               if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+                       if (page_idx < erp->er_extcount) {
+                               memmove(&erp->er_extbuf[page_idx + ext_diff],
+                                       &erp->er_extbuf[page_idx],
+                                       (erp->er_extcount - page_idx) *
+                                       sizeof(xfs_bmbt_rec_t));
+                               memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+                       }
+                       erp->er_extcount += ext_diff;
+                       xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               }
+               /* Insert a new extent page */
+               else if (erp) {
+                       xfs_iext_add_indirect_multi(ifp,
+                               erp_idx, page_idx, ext_diff);
+               }
+               /*
+                * If extent(s) are being appended to the last page in
+                * the indirection array and the new extent(s) don't fit
+                * in the page, then erp is NULL and erp_idx is set to
+                * the next index needed in the indirection array.
+                */
+               else {
+                       int     count = ext_diff;
+
+                       while (count) {
+                               erp = xfs_iext_irec_new(ifp, erp_idx);
+                               /*
+                                * A page holds at most XFS_LINEAR_EXTS
+                                * records, so never account more than
+                                * that to a single new page; the rest
+                                * goes into the next one.
+                                */
+                               erp->er_extcount = MIN(count,
+                                               (int)XFS_LINEAR_EXTS);
+                               count -= erp->er_extcount;
+                               if (count) {
+                                       erp_idx++;
+                               }
+                       }
+               }
+       }
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list. The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ * The records from idx to the end of the target list (nex2 of them) are
+ * saved in a temporary buffer, room for the new extents is accounted
+ * for (allocating new irecs as needed), and the saved records are then
+ * put back after the new extents.  The new slots are left for the
+ * caller to fill in.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+       xfs_ifork_t     *ifp,                   /* inode fork pointer */
+       int             erp_idx,                /* target extent irec index */
+       xfs_extnum_t    idx,                    /* index within target list */
+       int             count)                  /* new extents being added */
+{
+       int             byte_diff;              /* new bytes being added */
+       xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
+       xfs_extnum_t    ext_diff;               /* number of extents to add */
+       xfs_extnum_t    ext_cnt;                /* new extents still needed */
+       xfs_extnum_t    nex2;                   /* extents after idx + count */
+       xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
+       int             nlists;                 /* number of irec's (lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       nex2 = erp->er_extcount - idx;
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;  /* NOTE(review): sampled before any new irec is added below */
+
+       /*
+        * Save second part of target extent list
+        * (all extents from index idx to the end of the list)
+        * in a temporary buffer and clear their old slots.
+        */
+       if (nex2) {
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+               memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+               erp->er_extcount -= nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+               memset(&erp->er_extbuf[idx], 0, byte_diff);
+       }
+
+       /*
+        * Add the new extents to the end of the target
+        * list, then allocate new irec record(s) and
+        * extent buffer(s) as needed to store the rest
+        * of the new extents.
+        */
+       ext_cnt = count;
+       ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+       if (ext_diff) {
+               erp->er_extcount += ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+       while (ext_cnt) {
+               erp_idx++;
+               erp = xfs_iext_irec_new(ifp, erp_idx);
+               ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+               erp->er_extcount = ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+               ext_cnt -= ext_diff;
+       }
+
+       /* Add nex2 extents back to indirection array */
+       if (nex2) {
+               xfs_extnum_t    ext_avail;
+               int             i;
+
+               byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+               ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+               i = 0;          /* destination slot; stays 0 unless appending */
+               /*
+                * If nex2 extents fit in the current page, append
+                * nex2_ep after the new extents.
+                */
+               if (nex2 <= ext_avail) {
+                       i = erp->er_extcount;
+               }
+               /*
+                * Otherwise, check if space is available in the
+                * next page.
+                */
+               else if ((erp_idx < nlists - 1) &&
+                        (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+                         ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+                       erp_idx++;
+                       erp++;
+                       /* Create a hole for nex2 extents */
+                       memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+                               erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+               }
+               /*
+                * Final choice, create a new extent page for
+                * nex2 extents.
+                */
+               else {
+                       erp_idx++;
+                       erp = xfs_iext_irec_new(ifp, erp_idx);
+               }
+               memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+               kmem_free(nex2_ep);     /* temporary copy no longer needed */
+               erp->er_extcount += nex2;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+       }
+}
+
+/*
+ * Remove ext_diff extent records, starting at index idx, from the
+ * in-core extent list of an inode fork (the attribute fork when
+ * BMAP_ATTRFORK is set in state, the data fork otherwise).
+ *
+ * If nothing would remain, the whole list is torn down with
+ * xfs_iext_destroy().  Otherwise the work is handed to the helper
+ * matching the current storage mode: indirection array, linear
+ * (direct) list, or inline buffer.  if_bytes is set to the new size
+ * before returning.
+ */
+void
+xfs_iext_remove(
+       xfs_inode_t     *ip,            /* incore inode pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff,       /* number of extents to remove */
+       int             state)          /* type of extent conversion */
+{
+       xfs_ifork_t     *ifp;           /* fork being modified */
+       xfs_extnum_t    remaining;      /* extents left after removal */
+       int             new_size;       /* size of extents after removal */
+
+       if (state & BMAP_ATTRFORK)
+               ifp = ip->i_afp;
+       else
+               ifp = &ip->i_df;
+
+       trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+       ASSERT(ext_diff > 0);
+       remaining = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - ext_diff;
+       new_size = remaining * sizeof(xfs_bmbt_rec_t);
+
+       if (new_size != 0) {
+               if (ifp->if_flags & XFS_IFEXTIREC)
+                       xfs_iext_remove_indirect(ifp, idx, ext_diff);
+               else if (!ifp->if_real_bytes)
+                       xfs_iext_remove_inline(ifp, idx, ext_diff);
+               else
+                       xfs_iext_remove_direct(ifp, idx, ext_diff);
+       } else {
+               /* Everything goes away: free the whole list. */
+               xfs_iext_destroy(ifp);
+       }
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * Drop ext_diff extent records from the inline extent buffer, starting
+ * at index idx.  Records following the removed range are slid down over
+ * it, and the ext_diff slots that become unused are zeroed.  if_bytes
+ * is not touched here.
+ */
+void
+xfs_iext_remove_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       int             nextents;       /* number of extents in file */
+       int             tail;           /* first index past removed range */
+       int             zero_from;      /* first slot to clear */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       ASSERT(idx < XFS_INLINE_EXTS);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(((nextents - ext_diff) > 0) &&
+               (nextents - ext_diff) < XFS_INLINE_EXTS);
+
+       tail = idx + ext_diff;
+       zero_from = idx;
+       if (tail < nextents) {
+               /* Close the gap; the vacated slots are then at the end. */
+               memmove(&ifp->if_u2.if_inline_ext[idx],
+                       &ifp->if_u2.if_inline_ext[tail],
+                       (nextents - tail) * sizeof(xfs_bmbt_rec_t));
+               zero_from = nextents - ext_diff;
+       }
+       memset(&ifp->if_u2.if_inline_ext[zero_from], 0,
+               ext_diff * sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Remove ext_diff extent records, starting at index idx, from a linear
+ * (direct) extent list.
+ *
+ * If no records would remain the list is destroyed outright.  Otherwise
+ * any records beyond the removed range are moved down over it, the
+ * ext_diff slots left at the end are zeroed, and the list is resized
+ * with xfs_iext_realloc_direct(), which also takes care of switching
+ * back to the inline buffer when the remainder fits there.
+ */
+void
+xfs_iext_remove_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing exts */
+       int             ext_diff)       /* number of extents to remove */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       xfs_extnum_t    tail;           /* first index past removed range */
+       int             new_size;       /* size of extents after removal */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       new_size = ifp->if_bytes -
+               (ext_diff * sizeof(xfs_bmbt_rec_t));
+
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+               return;
+       }
+
+       /* Slide the surviving tail down over the removed records. */
+       tail = idx + ext_diff;
+       if (tail < nextents) {
+               memmove(&ifp->if_u1.if_extents[idx],
+                       &ifp->if_u1.if_extents[tail],
+                       (nextents - tail) * sizeof(xfs_bmbt_rec_t));
+       }
+
+       /* The last ext_diff slots are now unused; clear them. */
+       memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+               0, ext_diff * sizeof(xfs_bmbt_rec_t));
+
+       /* Shrink the allocation (or fall back to the inline buffer). */
+       xfs_iext_realloc_direct(ifp, new_size);
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ * The loop walks the extent pages starting at the one containing idx.
+ * A page whose records are all removed is dropped with
+ * xfs_iext_irec_remove(); a partially emptied page has its surviving
+ * tail moved down and the rest of the page zeroed.  Afterwards if_bytes
+ * is reduced by count records and xfs_iext_irec_compact() is called.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    idx,            /* index to begin removing extents */
+       int             count)          /* number of extents to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             erp_idx = 0;    /* indirection array index */
+       xfs_extnum_t    ext_cnt;        /* extents left to remove */
+       xfs_extnum_t    ext_diff;       /* extents to remove in current list */
+       xfs_extnum_t    nex1;           /* number of extents before idx */
+       xfs_extnum_t    nex2;           /* extents after idx + count */
+       int             page_idx = idx; /* index in target extent list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
+       ASSERT(erp != NULL);
+       nex1 = page_idx;        /* records kept at the front of the first page */
+       ext_cnt = count;
+       while (ext_cnt) {
+               nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+               ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+               /*
+                * Check for deletion of entire list;
+                * xfs_iext_irec_remove() updates extent offsets.
+                */
+               if (ext_diff == erp->er_extcount) {
+                       xfs_iext_irec_remove(ifp, erp_idx);
+                       ext_cnt -= ext_diff;
+                       nex1 = 0;
+                       if (ext_cnt) {
+                               ASSERT(erp_idx < ifp->if_real_bytes /
+                                       XFS_IEXT_BUFSZ);
+                               erp = &ifp->if_u1.if_ext_irec[erp_idx]; /* same index: presumably the next page moved into this slot */
+                               nex1 = 0;
+                               continue;
+                       } else {
+                               break;
+                       }
+               }
+               /* Move extents up (if needed) */
+               if (nex2) {
+                       memmove(&erp->er_extbuf[nex1],
+                               &erp->er_extbuf[nex1 + ext_diff],
+                               nex2 * sizeof(xfs_bmbt_rec_t));
+               }
+               /* Zero out rest of page */
+               memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+                       ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+               /* Update remaining counters */
+               erp->er_extcount -= ext_diff;
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+               ext_cnt -= ext_diff;
+               nex1 = 0;       /* later pages are trimmed from their start */
+               erp_idx++;
+               erp++;
+       }
+       ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+       xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ *
+ * new_size is the requested size in bytes.  Depending on the current
+ * state of the fork this function:
+ *   - frees all extent records when new_size is 0;
+ *   - when a direct list already exists (if_real_bytes != 0), either
+ *     switches back to the inline buffer if new_size fits there, or
+ *     reallocates the list to new_size rounded up to a power of two;
+ *   - otherwise converts the inline buffer into a newly allocated
+ *     direct list.
+ * if_real_bytes and if_bytes are updated before returning.
+ */
+void
+xfs_iext_realloc_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new size of extents */
+{
+       int             rnew_size;      /* real new size of extents */
+
+       rnew_size = new_size;
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+               ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+                (new_size != ifp->if_real_bytes)));
+
+       /* Free extent records */
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       }
+       /* Resize direct extent list and zero any new bytes */
+       else if (ifp->if_real_bytes) {
+               /* Check if extents will fit inside the inode */
+               if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+                       xfs_iext_direct_to_inline(ifp, new_size /
+                               (uint)sizeof(xfs_bmbt_rec_t));
+                       ifp->if_bytes = new_size;
+                       return;
+               }
+               if (!is_power_of_2(new_size)){
+                       rnew_size = roundup_pow_of_two(new_size);
+               }
+               if (rnew_size != ifp->if_real_bytes) {
+                       ifp->if_u1.if_extents =
+                               kmem_realloc(ifp->if_u1.if_extents,
+                                               rnew_size,
+                                               ifp->if_real_bytes, KM_NOFS);
+               }
+               if (rnew_size > ifp->if_real_bytes) {
+                       memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+                               (uint)sizeof(xfs_bmbt_rec_t)], 0,
+                               rnew_size - ifp->if_real_bytes);
+               }
+       }
+       /*
+        * Switch from the inline extent buffer to a direct
+        * extent list. Be sure to include the inline extent
+        * bytes in new_size.
+        * NOTE(review): this adds the current if_bytes on top of
+        * the new_size passed in; confirm that is intended for
+        * callers that already pass the full new size.
+        */
+       else {
+               new_size += ifp->if_bytes;
+               if (!is_power_of_2(new_size)) {
+                       rnew_size = roundup_pow_of_two(new_size);
+               }
+               xfs_iext_inline_to_direct(ifp, rnew_size);
+       }
+       ifp->if_real_bytes = rnew_size;
+       ifp->if_bytes = new_size;
+}
+
+/*
+ * Move the extent records of a linear (direct) list back into the
+ * inode's inline buffer and release the direct allocation.  Afterwards
+ * if_extents points at the inline buffer and if_real_bytes is 0.
+ */
+void
+xfs_iext_direct_to_inline(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    nextents)       /* number of extents in file */
+{
+       void            *direct = ifp->if_u1.if_extents; /* list to free */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+       ASSERT(nextents <= XFS_INLINE_EXTS);
+
+       /*
+        * No need to clear the inline buffer first: it was zeroed
+        * when the fork switched from inline to direct mode.
+        */
+       memcpy(ifp->if_u2.if_inline_ext, direct,
+               nextents * sizeof(xfs_bmbt_rec_t));
+
+       ifp->if_real_bytes = 0;
+       ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+       kmem_free(direct);
+}
+
+/*
+ * Convert the inline extent buffer into a separately allocated linear
+ * (direct) extent list of new_size bytes.
+ *
+ * The caller is expected to have rounded new_size up to a power of two
+ * where appropriate; it is used here unchanged.  Because of that
+ * rounding, if_bytes cannot be set here and remains the caller's
+ * responsibility; only if_extents and if_real_bytes are updated.
+ */
+void
+xfs_iext_inline_to_direct(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* size in bytes of the new list */
+{
+       void            *direct;        /* newly allocated, zeroed list */
+
+       direct = kmem_alloc(new_size, KM_NOFS);
+       memset(direct, 0, new_size);
+
+       if (ifp->if_bytes != 0) {
+               /* Carry the inline records over, then wipe the inline buffer. */
+               memcpy(direct, ifp->if_u2.if_inline_ext, ifp->if_bytes);
+               memset(ifp->if_u2.if_inline_ext, 0,
+                       XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t));
+       }
+
+       ifp->if_u1.if_extents = direct;
+       ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Change the size of the extent indirection array to new_size bytes.
+ * A new_size of zero frees all in-core extents via xfs_iext_destroy();
+ * any other value reallocates the array of irec entries.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new indirection array size */
+{
+       int             size;           /* current indirection array size */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       /* One irec entry per XFS_IEXT_BUFSZ bytes accounted in if_real_bytes. */
+       size = (ifp->if_real_bytes / XFS_IEXT_BUFSZ) * sizeof(xfs_ext_irec_t);
+       ASSERT(ifp->if_real_bytes);
+       ASSERT((new_size >= 0) && (new_size != size));
+
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+               return;
+       }
+
+       ifp->if_u1.if_ext_irec = kmem_realloc(ifp->if_u1.if_ext_irec,
+                                       new_size, size, KM_NOFS);
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ *
+ * The extent pages are first compacted with
+ * xfs_iext_irec_compact_pages(), after which exactly one page is
+ * expected to remain.  That page's extent buffer becomes the direct
+ * list, the indirection array itself is freed, and XFS_IFEXTIREC is
+ * cleared.  If the list is not full it is then resized through
+ * xfs_iext_realloc_direct().
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+        xfs_ifork_t    *ifp)           /* inode fork pointer */
+{
+       xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             size;           /* size of file extents */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+       size = nextents * sizeof(xfs_bmbt_rec_t);
+
+       xfs_iext_irec_compact_pages(ifp);
+       ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+       ep = ifp->if_u1.if_ext_irec->er_extbuf; /* keep the page; only the irec array is freed */
+       kmem_free(ifp->if_u1.if_ext_irec);
+       ifp->if_flags &= ~XFS_IFEXTIREC;
+       ifp->if_u1.if_extents = ep;
+       ifp->if_bytes = size;
+       if (nextents < XFS_LINEAR_EXTS) {
+               xfs_iext_realloc_direct(ifp, size);
+       }
+}
+
+/*
+ * Release all in-core extent records of a fork, whatever storage mode
+ * is in use: every irec of an indirection array is removed (last to
+ * first), a direct list is freed, and a non-empty inline buffer is
+ * zeroed.  The fork is left with no extent pointer and zero sizes.
+ */
+void
+xfs_iext_destroy(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               /* Start one past the last irec and walk backwards. */
+               int     erp_idx = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+               while (--erp_idx >= 0)
+                       xfs_iext_irec_remove(ifp, erp_idx);
+               ifp->if_flags &= ~XFS_IFEXTIREC;
+       } else if (ifp->if_real_bytes) {
+               kmem_free(ifp->if_u1.if_extents);
+       } else if (ifp->if_bytes) {
+               memset(ifp->if_u2.if_inline_ext, 0,
+                       XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t));
+       }
+
+       ifp->if_bytes = 0;
+       ifp->if_real_bytes = 0;
+       ifp->if_u1.if_extents = NULL;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ *
+ * The extent list (or, with an indirection array, the extent page
+ * selected by xfs_iext_bno_to_irec()) is binary searched by start
+ * offset.  If an extent contains bno it is returned and *idxp is its
+ * file-based extent index.  If no extent contains bno, the result is
+ * the extent following bno, or NULL with *idxp == number of extents
+ * when bno lies beyond the last extent.  For an empty list NULL is
+ * returned and *idxp is 0.
+ */
+xfs_bmbt_rec_host_t *                  /* pointer to found extent record */
+xfs_iext_bno_to_ext(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       xfs_extnum_t    *idxp)          /* index of target extent */
+{
+       xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
+       xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
+       xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       int             high;           /* upper boundary in search */
+       xfs_extnum_t    idx = 0;        /* index of target extent */
+       int             low;            /* lower boundary in search */
+       xfs_extnum_t    nextents;       /* number of file extents */
+       xfs_fileoff_t   startoff = 0;   /* start offset of extent */
+
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       if (nextents == 0) {
+               *idxp = 0;
+               return NULL;
+       }
+       low = 0;
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               /* Find target extent list */
+               int     erp_idx = 0;
+               erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+               base = erp->er_extbuf;
+               high = erp->er_extcount - 1;
+       } else {
+               base = ifp->if_u1.if_extents;
+               high = nextents - 1;
+       }
+       /* Binary search extent records */
+       while (low <= high) {
+               idx = (low + high) >> 1;
+               ep = base + idx;
+               startoff = xfs_bmbt_get_startoff(ep);
+               blockcount = xfs_bmbt_get_blockcount(ep);
+               if (bno < startoff) {
+                       high = idx - 1;
+               } else if (bno >= startoff + blockcount) {
+                       low = idx + 1;
+               } else {
+                       /* Convert back to file-based extent index */
+                       if (ifp->if_flags & XFS_IFEXTIREC) {
+                               idx += erp->er_extoff;
+                       }
+                       *idxp = idx;
+                       return ep;
+               }
+       }
+       /*
+        * No extent contains bno.  idx, ep, startoff and blockcount
+        * still describe the last record probed by the search.
+        * Convert back to file-based extent index
+        */
+       if (ifp->if_flags & XFS_IFEXTIREC) {
+               idx += erp->er_extoff;
+       }
+       if (bno >= startoff + blockcount) {     /* bno is past the probed record: step to the next one */
+               if (++idx == nextents) {
+                       ep = NULL;
+               } else {
+                       ep = xfs_iext_get_ext(ifp, idx);
+               }
+       }
+       *idxp = idx;
+       return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ *
+ * Each irec is compared by the start offset of the first record in its
+ * extent buffer: the search stops at the irec whose first record starts
+ * at or before bno while the next irec (if any) starts after bno.  If
+ * bno is below the first record of every irec the loop runs out and the
+ * last irec probed is returned.  The result is NULL only when the
+ * indirection array has no entries (the loop body never runs).
+ */
+xfs_ext_irec_t *                       /* pointer to found irec */
+xfs_iext_bno_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_fileoff_t   bno,            /* block number to search for */
+       int             *erp_idxp)      /* irec index of target ext list */
+{
+       xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
+       xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of extent irec's (lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       /* In irec mode if_real_bytes counts one XFS_IEXT_BUFSZ per irec. */
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               /* The last irec has no successor to compare against. */
+               erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+               if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+                       high = erp_idx - 1;
+               } else if (erp_next && bno >=
+                          xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+                       low = erp_idx + 1;
+               } else {
+                       break;
+               }
+       }
+       *erp_idxp = erp_idx;
+       return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp. Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ *
+ * When realloc is set, *idxp may equal the total number of records in
+ * the fork (see the assertions below).  In that case the returned irec
+ * may be NULL: this happens when the index falls just past a full last
+ * irec, and *erp_idxp then names the slot a new irec would occupy.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       xfs_extnum_t    *idxp,          /* extent index (file -> page) */
+       int             *erp_idxp,      /* pointer to target irec */
+       int             realloc)        /* new bytes were just added */
+{
+       xfs_ext_irec_t  *prev;          /* pointer to previous irec */
+       xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
+       int             erp_idx;        /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+       int             high;           /* binary search upper limit */
+       int             low;            /* binary search lower limit */
+       xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       ASSERT(page_idx >= 0);
+       ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+       ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp_idx = 0;
+       low = 0;
+       high = nlists - 1;
+
+       /* Binary search extent irec's */
+       while (low <= high) {
+               erp_idx = (low + high) >> 1;
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               prev = erp_idx > 0 ? erp - 1 : NULL;
+               if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+                    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+                       /*
+                        * Index is before this irec, or (realloc only) sits
+                        * on its first slot while the previous irec still
+                        * has room: search to the left.
+                        */
+                       high = erp_idx - 1;
+               } else if (page_idx > erp->er_extoff + erp->er_extcount ||
+                          (page_idx == erp->er_extoff + erp->er_extcount &&
+                           !realloc)) {
+                       /*
+                        * Index is past this irec; one-past-the-end only
+                        * counts as "inside" when realloc is set.
+                        */
+                       low = erp_idx + 1;
+               } else if (page_idx == erp->er_extoff + erp->er_extcount &&
+                          erp->er_extcount == XFS_LINEAR_EXTS) {
+                       /*
+                        * One past the end of a full irec: the target is
+                        * slot 0 of the following irec, or NULL if this
+                        * was the last entry in the indirection array.
+                        */
+                       ASSERT(realloc);
+                       page_idx = 0;
+                       erp_idx++;
+                       erp = erp_idx < nlists ? erp + 1 : NULL;
+                       break;
+               } else {
+                       /* Found: make the index relative to this irec. */
+                       page_idx -= erp->er_extoff;
+                       break;
+               }
+       }
+       *idxp = page_idx;
+       *erp_idxp = erp_idx;
+       return(erp);
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ *
+ * The fork must not already be in irec mode and must hold at most
+ * XFS_LINEAR_EXTS records.  On return the fork has XFS_IFEXTIREC set
+ * and a one-entry indirection array whose extent buffer is the fork's
+ * previous extent list.
+ */
+void
+xfs_iext_irec_init(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       xfs_extnum_t    nextents;       /* number of extents in file */
+
+       ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+       erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+       /*
+        * Make if_u1.if_extents refer to a buffer of XFS_IEXT_BUFSZ bytes:
+        * allocate one when there are no records, otherwise go through the
+        * inline->direct or realloc helpers (presumably these grow the
+        * existing list to the requested size -- their bodies are not in
+        * this part of the file).  A direct list that is already at least
+        * XFS_IEXT_BUFSZ bytes is left untouched.
+        */
+       if (nextents == 0) {
+               ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       } else if (!ifp->if_real_bytes) {
+               xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+       } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
+               xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+       }
+       /* The first (and only) irec takes over the existing buffer. */
+       erp->er_extbuf = ifp->if_u1.if_extents;
+       erp->er_extcount = nextents;
+       erp->er_extoff = 0;
+
+       ifp->if_flags |= XFS_IFEXTIREC;
+       ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+       ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+       /* if_u1 is a union: from here on it holds the irec array pointer. */
+       ifp->if_u1.if_ext_irec = erp;
+
+       return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ *
+ * The array is grown by one entry, the entries at erp_idx and above are
+ * shifted up by one slot, and the new entry at erp_idx gets a zeroed
+ * XFS_IEXT_BUFSZ extent buffer with er_extcount 0.  Returns a pointer
+ * to the new entry.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* index for new irec */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+       /* Resize indirection array */
+       /* nlists is incremented first, so the new size has one more slot. */
+       xfs_iext_realloc_indirect(ifp, ++nlists *
+                                 sizeof(xfs_ext_irec_t));
+       /*
+        * Move records down in the array so the
+        * new page can use erp_idx.
+        */
+       /* Re-read the array pointer after the resize above. */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = nlists - 1; i > erp_idx; i--) {
+               memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+       }
+       ASSERT(i == erp_idx);
+
+       /* Initialize new extent record */
+       erp = ifp->if_u1.if_ext_irec;
+       erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+       memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+       erp[erp_idx].er_extcount = 0;
+       /* The new irec starts where the previous one ends (0 if first). */
+       erp[erp_idx].er_extoff = erp_idx > 0 ?
+               erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+       return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ *
+ * If the entry still owns an extent buffer, the er_extoff of every
+ * following entry is reduced by the removed entry's extent count and
+ * the buffer is freed.  A caller that has already freed the buffer and
+ * set er_extbuf to NULL (as xfs_iext_irec_compact_pages() does) skips
+ * both steps.  The remaining entries are then compacted and the array
+ * is shrunk by one slot.
+ */
+void
+xfs_iext_irec_remove(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx)        /* irec index to remove */
+{
+       xfs_ext_irec_t  *erp;           /* indirection array pointer */
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       erp = &ifp->if_u1.if_ext_irec[erp_idx];
+       if (erp->er_extbuf) {
+               xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+                       -erp->er_extcount);
+               kmem_free(erp->er_extbuf);
+       }
+       /* Compact extent records */
+       erp = ifp->if_u1.if_ext_irec;
+       for (i = erp_idx; i < nlists - 1; i++) {
+               memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+       }
+       /*
+        * Manually free the last extent record from the indirection
+        * array.  A call to xfs_iext_realloc_indirect() with a size
+        * of zero would result in a call to xfs_iext_destroy() which
+        * would in turn call this function again, creating a nasty
+        * infinite loop.
+        */
+       if (--nlists) {
+               xfs_iext_realloc_indirect(ifp,
+                       nlists * sizeof(xfs_ext_irec_t));
+       } else {
+               kmem_free(ifp->if_u1.if_ext_irec);
+       }
+       /* Keep if_real_bytes in step with the new number of irecs. */
+       ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.  Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible.  The
+ * compaction policy is as follows:
+ *
+ *    Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ *      No Compaction: Extents occupy at least 50% of allocated space
+ *
+ * A fork with no extents at all is handed to xfs_iext_destroy().
+ */
+void
+xfs_iext_irec_compact(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_extnum_t    nextents;       /* number of extents in file */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+       if (nextents == 0) {
+               xfs_iext_destroy(ifp);
+       } else if (nextents <= XFS_INLINE_EXTS) {
+               /* Small enough for the inline buffer: go via direct. */
+               xfs_iext_indirect_to_direct(ifp);
+               xfs_iext_direct_to_inline(ifp, nextents);
+       } else if (nextents <= XFS_LINEAR_EXTS) {
+               /* Fits in one extent buffer: drop the indirection array. */
+               xfs_iext_indirect_to_direct(ifp);
+       } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
+               /*
+                * ">>" binds tighter than "<": this compares nextents
+                * against half of the total record capacity.
+                */
+               xfs_iext_irec_compact_pages(ifp);
+       }
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ *
+ * Walks the indirection array from the front.  Whenever all records of
+ * the next irec fit into the free space of the current one, they are
+ * appended to the current irec and the next irec is removed; the same
+ * current irec is then tried again against its new neighbour.
+ */
+void
+xfs_iext_irec_compact_pages(
+       xfs_ifork_t     *ifp)           /* inode fork pointer */
+{
+       xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
+       int             erp_idx = 0;    /* indirection array index */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       while (erp_idx < nlists - 1) {
+               erp = &ifp->if_u1.if_ext_irec[erp_idx];
+               erp_next = erp + 1;
+               if (erp_next->er_extcount <=
+                   (XFS_LINEAR_EXTS - erp->er_extcount)) {
+                       memcpy(&erp->er_extbuf[erp->er_extcount],
+                               erp_next->er_extbuf, erp_next->er_extcount *
+                               sizeof(xfs_bmbt_rec_t));
+                       erp->er_extcount += erp_next->er_extcount;
+                       /*
+                        * Free page before removing extent record
+                        * so er_extoffs don't get modified in
+                        * xfs_iext_irec_remove.
+                        */
+                       kmem_free(erp_next->er_extbuf);
+                       erp_next->er_extbuf = NULL;
+                       xfs_iext_irec_remove(ifp, erp_idx + 1);
+                       /* The removal changed if_real_bytes; recount. */
+                       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+               } else {
+                       erp_idx++;
+               }
+       }
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ *
+ * ext_diff is added to er_extoff of every irec from erp_idx through
+ * the end of the array; pass a negative value for a removal.
+ */
+void
+xfs_iext_irec_update_extoffs(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             erp_idx,        /* irec index to update */
+       int             ext_diff)       /* number of new extents */
+{
+       int             i;              /* loop counter */
+       int             nlists;         /* number of irec's (ex lists) */
+
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+       nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+       for (i = erp_idx; i < nlists; i++) {
+               ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+       }
+}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
new file mode 100644 (file)
index 0000000..28661a0
--- /dev/null
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_INODE_FORK_H__
+#define        __XFS_INODE_FORK_H__
+
+struct xfs_inode_log_item;
+
+/*
+ * The following xfs_ext_irec_t struct introduces a second (top) level
+ * to the in-core extent allocation scheme. These structs are allocated
+ * in a contiguous block, creating an indirection array where each entry
+ * (irec) contains a pointer to a buffer of in-core extent records which
+ * it manages. Each extent buffer is 4k in size, since 4k is the system
+ * page size on Linux i386 and systems with larger page sizes don't seem
+ * to gain much, if anything, by using their native page size as the
+ * extent buffer size. Also, using 4k extent buffers everywhere provides
+ * a consistent interface for CXFS across different platforms.
+ *
+ * There is currently no limit on the number of irec's (extent lists)
+ * allowed, so heavily fragmented files may require an indirection array
+ * which spans multiple system pages of memory. The number of extents
+ * which would require this amount of contiguous memory is very large
+ * and should not cause problems in the foreseeable future. However,
+ * if the memory needed for the contiguous array ever becomes a problem,
+ * it is possible that a third level of indirection may be required.
+ */
+typedef struct xfs_ext_irec {
+       xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
+       /* file-based index of the first record held in er_extbuf */
+       xfs_extnum_t    er_extoff;      /* extent offset in file */
+       xfs_extnum_t    er_extcount;    /* number of extents in page/block */
+} xfs_ext_irec_t;
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ *
+ * XFS_IEXT_BUFSZ  - size in bytes of one extent buffer
+ * XFS_LINEAR_EXTS - number of xfs_bmbt_rec_t records in one such buffer
+ * XFS_INLINE_EXTS - number of records in the if_inline_ext array
+ * XFS_INLINE_DATA - number of bytes in the if_inline_data array
+ */
+#define        XFS_IEXT_BUFSZ          4096
+#define        XFS_LINEAR_EXTS         (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
+#define        XFS_INLINE_EXTS         2
+#define        XFS_INLINE_DATA         32
+typedef struct xfs_ifork {
+       int                     if_bytes;       /* bytes in if_u1 */
+       int                     if_real_bytes;  /* bytes allocated in if_u1 */
+       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
+       short                   if_broot_bytes; /* bytes allocated for root */
+       unsigned char           if_flags;       /* per-fork flags */
+       /* if_u1: exactly one of these pointers is in use at a time */
+       union {
+               xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
+               xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
+               char            *if_data;       /* inline file data */
+       } if_u1;
+       /* if_u2: small storage embedded in the fork itself */
+       union {
+               xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
+                                               /* very small file extents */
+               char            if_inline_data[XFS_INLINE_DATA];
+                                               /* very small file data */
+               xfs_dev_t       if_rdev;        /* dev number if special */
+               uuid_t          if_uuid;        /* mount point value */
+       } if_u2;
+} xfs_ifork_t;
+
+/*
+ * Per-fork incore inode flags.
+ * Bit values stored in the if_flags field of struct xfs_ifork.
+ */
+#define        XFS_IFINLINE    0x01    /* Inline data is read in */
+#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define        XFS_IFBROOT     0x04    /* if_broot points to the bmap b-tree root */
+#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ *
+ * In the macros below, ip is an inode pointer and w selects the fork:
+ * XFS_DATA_FORK picks the data fork, any other value the attribute fork.
+ */
+
+/* Non-zero when di_forkoff is set. */
+#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
+/* di_forkoff multiplied by 8 (shifted left by 3), as an int. */
+#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
+
+/* Pointer to the selected fork: &i_df for data, i_afp otherwise. */
+#define XFS_IFORK_PTR(ip,w)            \
+       ((w) == XFS_DATA_FORK ? \
+               &(ip)->i_df : \
+               (ip)->i_afp)
+/* Data fork size: XFS_IFORK_BOFF if di_forkoff is set, else XFS_LITINO. */
+#define XFS_IFORK_DSIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_IFORK_BOFF(ip) : \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+/* Attr fork size: XFS_LITINO minus XFS_IFORK_BOFF, or 0 without di_forkoff. */
+#define XFS_IFORK_ASIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+                       XFS_IFORK_BOFF(ip) : \
+               0)
+/* Size of the selected fork (DSIZE or ASIZE). */
+#define XFS_IFORK_SIZE(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_IFORK_DSIZE(ip) : \
+               XFS_IFORK_ASIZE(ip))
+/* Read di_format (data) or di_aformat (attr). */
+#define XFS_IFORK_FORMAT(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_format : \
+               (ip)->i_d.di_aformat)
+/* Assign n to di_format (data) or di_aformat (attr). */
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_format = (n)) : \
+               ((ip)->i_d.di_aformat = (n)))
+/* Read di_nextents (data) or di_anextents (attr). */
+#define XFS_IFORK_NEXTENTS(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_nextents : \
+               (ip)->i_d.di_anextents)
+/* Assign n to di_nextents (data) or di_anextents (attr). */
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_nextents = (n)) : \
+               ((ip)->i_d.di_anextents = (n)))
+/* Selected fork size divided by the size of one xfs_bmbt_rec_t. */
+#define XFS_IFORK_MAXEXT(ip, w) \
+       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+
+/* Whole-fork operations. */
+int            xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+void           xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+                               struct xfs_inode_log_item *, int,
+                               struct xfs_buf *);
+void           xfs_idestroy_fork(struct xfs_inode *, int);
+void           xfs_idata_realloc(struct xfs_inode *, int, int);
+void           xfs_iroot_realloc(struct xfs_inode *, int, int);
+int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int            xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
+                                 int);
+
+/* In-core extent list (xfs_iext_*) operations. */
+struct xfs_bmbt_rec_host *
+               xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
+                               struct xfs_bmbt_irec *, int);
+void           xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
+                                           xfs_extnum_t, int);
+void           xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
+void           xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
+void           xfs_iext_realloc_direct(struct xfs_ifork *, int);
+void           xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
+void           xfs_iext_inline_to_direct(struct xfs_ifork *, int);
+void           xfs_iext_destroy(struct xfs_ifork *);
+/*
+ * NOTE(review): the definition of xfs_iext_bno_to_ext() takes
+ * "xfs_extnum_t *idxp" while this prototype says "int *" -- presumably
+ * xfs_extnum_t is an int-sized typedef; confirm and consider aligning.
+ */
+struct xfs_bmbt_rec_host *
+               xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
+struct xfs_ext_irec *
+               xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
+                                    int);
+/* Indirection array (irec) management. */
+void           xfs_iext_irec_init(struct xfs_ifork *);
+struct xfs_ext_irec *
+               xfs_iext_irec_new(struct xfs_ifork *, int);
+void           xfs_iext_irec_remove(struct xfs_ifork *, int);
+void           xfs_iext_irec_compact(struct xfs_ifork *);
+void           xfs_iext_irec_compact_pages(struct xfs_ifork *);
+/*
+ * NOTE(review): no definition of xfs_iext_irec_compact_full() appears
+ * next to the other xfs_iext_irec_* functions -- verify it still exists.
+ */
+void           xfs_iext_irec_compact_full(struct xfs_ifork *);
+void           xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+
+extern struct kmem_zone        *xfs_ifork_zone;
+
+#endif /* __XFS_INODE_FORK_H__ */
index f76ff52e43c0a4f5536163a61230f1ac89f77358..378081109844b09b2bbfd07dcb4214027fefe2c2 100644 (file)
@@ -47,32 +47,44 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
  * inode core, and possibly one for the inode data/extents/b-tree root
  * and one for the inode attribute data/extents/b-tree root.
  */
-STATIC uint
+STATIC void
 xfs_inode_item_size(
-       struct xfs_log_item     *lip)
+       struct xfs_log_item     *lip,
+       int                     *nvecs,
+       int                     *nbytes)
 {
        struct xfs_inode_log_item *iip = INODE_ITEM(lip);
        struct xfs_inode        *ip = iip->ili_inode;
-       uint                    nvecs = 2;
+
+       *nvecs += 2;
+       *nbytes += sizeof(struct xfs_inode_log_format) +
+                  xfs_icdinode_size(ip->i_d.di_version);
 
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_EXTENTS:
                if ((iip->ili_fields & XFS_ILOG_DEXT) &&
                    ip->i_d.di_nextents > 0 &&
-                   ip->i_df.if_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_bytes > 0) {
+                       /* worst case, doesn't subtract delalloc extents */
+                       *nbytes += XFS_IFORK_DSIZE(ip);
+                       *nvecs += 1;
+               }
                break;
 
        case XFS_DINODE_FMT_BTREE:
                if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
-                   ip->i_df.if_broot_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_broot_bytes > 0) {
+                       *nbytes += ip->i_df.if_broot_bytes;
+                       *nvecs += 1;
+               }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
                if ((iip->ili_fields & XFS_ILOG_DDATA) &&
-                   ip->i_df.if_bytes > 0)
-                       nvecs++;
+                   ip->i_df.if_bytes > 0) {
+                       *nbytes += roundup(ip->i_df.if_bytes, 4);
+                       *nvecs += 1;
+               }
                break;
 
        case XFS_DINODE_FMT_DEV:
@@ -85,7 +97,7 @@ xfs_inode_item_size(
        }
 
        if (!XFS_IFORK_Q(ip))
-               return nvecs;
+               return;
 
 
        /*
@@ -95,28 +107,33 @@ xfs_inode_item_size(
        case XFS_DINODE_FMT_EXTENTS:
                if ((iip->ili_fields & XFS_ILOG_AEXT) &&
                    ip->i_d.di_anextents > 0 &&
-                   ip->i_afp->if_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_bytes > 0) {
+                       /* worst case, doesn't subtract unused space */
+                       *nbytes += XFS_IFORK_ASIZE(ip);
+                       *nvecs += 1;
+               }
                break;
 
        case XFS_DINODE_FMT_BTREE:
                if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
-                   ip->i_afp->if_broot_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_broot_bytes > 0) {
+                       *nbytes += ip->i_afp->if_broot_bytes;
+                       *nvecs += 1;
+               }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
                if ((iip->ili_fields & XFS_ILOG_ADATA) &&
-                   ip->i_afp->if_bytes > 0)
-                       nvecs++;
+                   ip->i_afp->if_bytes > 0) {
+                       *nbytes += roundup(ip->i_afp->if_bytes, 4);
+                       *nvecs += 1;
+               }
                break;
 
        default:
                ASSERT(0);
                break;
        }
-
-       return nvecs;
 }
 
 /*
index 779812fb3d80b27c94d97f288fea9195db8be4fb..dce4d656768c32888521dc0c9007c6c2961b7c4e 100644 (file)
 #ifndef        __XFS_INODE_ITEM_H__
 #define        __XFS_INODE_ITEM_H__
 
-/*
- * This is the structure used to lay out an inode log item in the
- * log.  The size of the inline data/extents/b-tree root to be logged
- * (if any) is indicated in the ilf_dsize field.  Changes to this structure
- * must be added on to the end.
- */
-typedef struct xfs_inode_log_format {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_t;
-
-typedef struct xfs_inode_log_format_32 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} __attribute__((packed)) xfs_inode_log_format_32_t;
-
-typedef struct xfs_inode_log_format_64 {
-       __uint16_t              ilf_type;       /* inode log item type */
-       __uint16_t              ilf_size;       /* size of this item */
-       __uint32_t              ilf_fields;     /* flags for fields logged */
-       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
-       __uint16_t              ilf_dsize;      /* size of data/ext/root */
-       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
-       __uint64_t              ilf_ino;        /* inode number */
-       union {
-               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
-               uuid_t          ilfu_uuid;      /* mount point value */
-       } ilf_u;
-       __int64_t               ilf_blkno;      /* blkno of inode buffer */
-       __int32_t               ilf_len;        /* len of inode buffer */
-       __int32_t               ilf_boffset;    /* off of inode in buffer */
-} xfs_inode_log_format_64_t;
-
-/*
- * Flags for xfs_trans_log_inode flags field.
- */
-#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
-#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
-#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
-#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
-#define        XFS_ILOG_DEV    0x010   /* log the dev field */
-#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
-#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
-#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
-#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
-
-
-/*
- * The timestamps are dirty, but not necessarily anything else in the inode
- * core.  Unlike the other fields above this one must never make it to disk
- * in the ilf_fields of the inode_log_format, but is purely store in-memory in
- * ili_fields in the inode_log_item.
- */
-#define XFS_ILOG_TIMESTAMP     0x4000
-
-#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
-                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
-
-#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
-                                XFS_ILOG_DBROOT)
-
-#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT)
-
-#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
-                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
-                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
-                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
-
-static inline int xfs_ilog_fbroot(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-static inline int xfs_ilog_fext(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
-static inline int xfs_ilog_fdata(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#ifdef __KERNEL__
+/* kernel only definitions */
 
 struct xfs_buf;
 struct xfs_bmbt_rec;
 struct xfs_inode;
 struct xfs_mount;
 
-
 typedef struct xfs_inode_log_item {
        xfs_log_item_t          ili_item;          /* common portion */
        struct xfs_inode        *ili_inode;        /* inode ptr */
@@ -151,7 +41,6 @@ typedef struct xfs_inode_log_item {
        xfs_inode_log_format_t  ili_format;        /* logged structure */
 } xfs_inode_log_item_t;
 
-
 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
        return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
@@ -165,6 +54,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool);
 extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
                                         xfs_inode_log_format_t *);
 
-#endif /* __KERNEL__ */
+extern struct kmem_zone        *xfs_ili_zone;
 
 #endif /* __XFS_INODE_ITEM_H__ */
index 6e2bca5d44d67acb52a58115a6b9482fc04a5bc1..bdebc21078d7e83bac4347ad13a5f569ed4d6a0f 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_error.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
 #include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
 #include "xfs_discard.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_symlink.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -350,6 +350,40 @@ xfs_readlink_by_handle(
        return error;
 }
 
+int
+xfs_set_dmattrs(
+       xfs_inode_t     *ip,
+       u_int           evmask,
+       u_int16_t       state)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_trans_t     *tp;
+       int             error;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return XFS_ERROR(EPERM);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       ip->i_d.di_dmevmask = evmask;
+       ip->i_d.di_dmstate  = state;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       error = xfs_trans_commit(tp, 0);
+
+       return error;
+}
+
 STATIC int
 xfs_fssetdm_by_handle(
        struct file             *parfilp,
@@ -967,7 +1001,7 @@ xfs_ioctl_setattr(
         * first do an error checking pass.
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (code)
                goto error_return;
 
@@ -981,15 +1015,22 @@ xfs_ioctl_setattr(
         * to the file owner ID, except in cases where the
         * CAP_FSETID capability is applicable.
         */
-       if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+       if (!inode_owner_or_capable(VFS_I(ip))) {
                code = XFS_ERROR(EPERM);
                goto error_return;
        }
 
        /*
         * Do a quota reservation only if projid is actually going to change.
+        * Only allow changing of projid from init_user_ns since projid is
+        * not a user namespace aware identifier.
         */
        if (mask & FSX_PROJID) {
+               if (current_user_ns() != &init_user_ns) {
+                       code = XFS_ERROR(EINVAL);
+                       goto error_return;
+               }
+
                if (XFS_IS_QUOTA_RUNNING(mp) &&
                    XFS_IS_PQUOTA_ON(mp) &&
                    xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1103,7 +1144,7 @@ xfs_ioctl_setattr(
                 * cleared upon successful return from chown()
                 */
                if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-                   !capable(CAP_FSETID))
+                   !inode_capable(VFS_I(ip), CAP_FSETID))
                        ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 
                /*
@@ -1328,6 +1369,75 @@ xfs_ioc_getbmapx(
        return 0;
 }
 
+int
+xfs_ioc_swapext(
+       xfs_swapext_t   *sxp)
+{
+       xfs_inode_t     *ip, *tip;
+       struct fd       f, tmp;
+       int             error = 0;
+
+       /* Pull information for the target fd */
+       f = fdget((int)sxp->sx_fdtarget);
+       if (!f.file) {
+               error = XFS_ERROR(EINVAL);
+               goto out;
+       }
+
+       if (!(f.file->f_mode & FMODE_WRITE) ||
+           !(f.file->f_mode & FMODE_READ) ||
+           (f.file->f_flags & O_APPEND)) {
+               error = XFS_ERROR(EBADF);
+               goto out_put_file;
+       }
+
+       tmp = fdget((int)sxp->sx_fdtmp);
+       if (!tmp.file) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_file;
+       }
+
+       if (!(tmp.file->f_mode & FMODE_WRITE) ||
+           !(tmp.file->f_mode & FMODE_READ) ||
+           (tmp.file->f_flags & O_APPEND)) {
+               error = XFS_ERROR(EBADF);
+               goto out_put_tmp_file;
+       }
+
+       if (IS_SWAPFILE(file_inode(f.file)) ||
+           IS_SWAPFILE(file_inode(tmp.file))) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       ip = XFS_I(file_inode(f.file));
+       tip = XFS_I(file_inode(tmp.file));
+
+       if (ip->i_mount != tip->i_mount) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       if (ip->i_ino == tip->i_ino) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_tmp_file;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               error = XFS_ERROR(EIO);
+               goto out_put_tmp_file;
+       }
+
+       error = xfs_swap_extents(ip, tip, sxp);
+
+ out_put_tmp_file:
+       fdput(tmp);
+ out_put_file:
+       fdput(f);
+ out:
+       return error;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1472,7 +1582,7 @@ xfs_file_ioctl(
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
-               error = xfs_swapext(&sxp);
+               error = xfs_ioc_swapext(&sxp);
                mnt_drop_write_file(filp);
                return -error;
        }
@@ -1610,23 +1720,23 @@ xfs_file_ioctl(
                return -error;
 
        case XFS_IOC_FREE_EOFBLOCKS: {
-               struct xfs_eofblocks eofb;
+               struct xfs_fs_eofblocks eofb;
+               struct xfs_eofblocks keofb;
 
-               if (copy_from_user(&eofb, arg, sizeof(eofb)))
-                       return -XFS_ERROR(EFAULT);
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
 
-               if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
-                       return -XFS_ERROR(EINVAL);
+               if (mp->m_flags & XFS_MOUNT_RDONLY)
+                       return -XFS_ERROR(EROFS);
 
-               if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
-                       return -XFS_ERROR(EINVAL);
+               if (copy_from_user(&eofb, arg, sizeof(eofb)))
+                       return -XFS_ERROR(EFAULT);
 
-               if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
-                   memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
-                       return -XFS_ERROR(EINVAL);
+               error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
+               if (error)
+                       return -error;
 
-               error = xfs_icache_free_eofblocks(mp, &eofb);
-               return -error;
+               return -xfs_icache_free_eofblocks(mp, &keofb);
        }
 
        default:
index d56173b34a2a55662575f79a4ba5426662b647c9..77c02c7900b6eb8d78276abdd7d0b6ec03c12e29 100644 (file)
@@ -27,6 +27,10 @@ xfs_ioc_space(
        unsigned int            cmd,
        xfs_flock64_t           *bf);
 
+int
+xfs_ioc_swapext(
+       xfs_swapext_t   *sxp);
+
 extern int
 xfs_find_handle(
        unsigned int            cmd,
@@ -82,4 +86,10 @@ xfs_file_compat_ioctl(
        unsigned int            cmd,
        unsigned long           arg);
 
+extern int
+xfs_set_dmattrs(
+       struct xfs_inode        *ip,
+       u_int                   evmask,
+       u_int16_t               state);
+
 #endif
index c0c66259cc913d3a19df61efeab3bb86cb028aaf..d3ab9534307fcaa7e863be8965f80311090ce499 100644 (file)
@@ -33,8 +33,6 @@
 #include "xfs_inode.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
-#include "xfs_dfrag.h"
-#include "xfs_vnodeops.h"
 #include "xfs_fsops.h"
 #include "xfs_alloc.h"
 #include "xfs_rtalloc.h"
@@ -644,7 +642,7 @@ xfs_file_compat_ioctl(
                error = mnt_want_write_file(filp);
                if (error)
                        return error;
-               error = xfs_swapext(&sxp);
+               error = xfs_ioc_swapext(&sxp);
                mnt_drop_write_file(filp);
                return -error;
        }
index 6a7096422295d1d821f1e0bab397041c42f53de1..8d4d49b6fbf347b3add01ed4675b489a653a6dfb 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
-#include "xfs_utils.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -187,10 +188,8 @@ xfs_iomap_write_direct(
         * Allocate and setup the transaction
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-       error = xfs_trans_reserve(tp, resblks,
-                       XFS_WRITE_LOG_RES(mp), resrtextents,
-                       XFS_TRANS_PERM_LOG_RES,
-                       XFS_WRITE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                 resblks, resrtextents);
        /*
         * Check for running out of space, note: need lock to return
         */
@@ -698,10 +697,8 @@ xfs_iomap_write_allocate(
                        tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
                        tp->t_flags |= XFS_TRANS_RESERVE;
                        nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-                       error = xfs_trans_reserve(tp, nres,
-                                       XFS_WRITE_LOG_RES(mp),
-                                       0, XFS_TRANS_PERM_LOG_RES,
-                                       XFS_WRITE_LOG_COUNT);
+                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                                 nres, 0);
                        if (error) {
                                xfs_trans_cancel(tp, 0);
                                return XFS_ERROR(error);
@@ -864,10 +861,8 @@ xfs_iomap_write_unwritten(
                sb_start_intwrite(mp->m_super);
                tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
                tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
-               error = xfs_trans_reserve(tp, resblks,
-                               XFS_WRITE_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_WRITE_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                                         resblks, 0);
                if (error) {
                        xfs_trans_cancel(tp, 0);
                        return XFS_ERROR(error);
index 96dda62d497b7e04a68a1a6ddfc579aececf8374..2b8952d9199bbd145473a48b326120b8d43ed9b6 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_acl.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -87,10 +91,12 @@ xfs_init_security(
 static void
 xfs_dentry_to_name(
        struct xfs_name *namep,
-       struct dentry   *dentry)
+       struct dentry   *dentry,
+       int             mode)
 {
        namep->name = dentry->d_name.name;
        namep->len = dentry->d_name.len;
+       namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
 }
 
 STATIC void
@@ -106,7 +112,7 @@ xfs_cleanup_inode(
         * xfs_init_security we must back out.
         * ENOSPC can hit here, among other things.
         */
-       xfs_dentry_to_name(&teardown, dentry);
+       xfs_dentry_to_name(&teardown, dentry, 0);
 
        xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
        iput(inode);
@@ -146,7 +152,7 @@ xfs_vn_mknod(
                        mode &= ~current_umask();
        }
 
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, mode);
        error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
        if (unlikely(error))
                goto out_free_acl;
@@ -207,7 +213,7 @@ xfs_vn_lookup(
        if (dentry->d_name.len >= MAXNAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
 
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, 0);
        error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
        if (unlikely(error)) {
                if (unlikely(error != ENOENT))
@@ -234,7 +240,7 @@ xfs_vn_ci_lookup(
        if (dentry->d_name.len >= MAXNAMELEN)
                return ERR_PTR(-ENAMETOOLONG);
 
-       xfs_dentry_to_name(&xname, dentry);
+       xfs_dentry_to_name(&xname, dentry, 0);
        error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
        if (unlikely(error)) {
                if (unlikely(error != ENOENT))
@@ -269,7 +275,7 @@ xfs_vn_link(
        struct xfs_name name;
        int             error;
 
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, inode->i_mode);
 
        error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
        if (unlikely(error))
@@ -288,7 +294,7 @@ xfs_vn_unlink(
        struct xfs_name name;
        int             error;
 
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, 0);
 
        error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
        if (error)
@@ -318,7 +324,7 @@ xfs_vn_symlink(
 
        mode = S_IFLNK |
                (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
-       xfs_dentry_to_name(&name, dentry);
+       xfs_dentry_to_name(&name, dentry, mode);
 
        error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
        if (unlikely(error))
@@ -350,12 +356,12 @@ xfs_vn_rename(
        struct xfs_name oname;
        struct xfs_name nname;
 
-       xfs_dentry_to_name(&oname, odentry);
-       xfs_dentry_to_name(&nname, ndentry);
+       xfs_dentry_to_name(&oname, odentry, 0);
+       xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
 
        return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
                           XFS_I(ndir), &nname, new_inode ?
-                                               XFS_I(new_inode) : NULL);
+                                               XFS_I(new_inode) : NULL);
 }
 
 /*
@@ -420,8 +426,8 @@ xfs_vn_getattr(
        stat->dev = inode->i_sb->s_dev;
        stat->mode = ip->i_d.di_mode;
        stat->nlink = ip->i_d.di_nlink;
-       stat->uid = ip->i_d.di_uid;
-       stat->gid = ip->i_d.di_gid;
+       stat->uid = inode->i_uid;
+       stat->gid = inode->i_gid;
        stat->ino = ip->i_ino;
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
@@ -485,8 +491,8 @@ xfs_setattr_nonsize(
        int                     mask = iattr->ia_valid;
        xfs_trans_t             *tp;
        int                     error;
-       uid_t                   uid = 0, iuid = 0;
-       gid_t                   gid = 0, igid = 0;
+       kuid_t                  uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+       kgid_t                  gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
        struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
        struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
 
@@ -522,13 +528,13 @@ xfs_setattr_nonsize(
                        uid = iattr->ia_uid;
                        qflags |= XFS_QMOPT_UQUOTA;
                } else {
-                       uid = ip->i_d.di_uid;
+                       uid = inode->i_uid;
                }
                if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
                        gid = iattr->ia_gid;
                        qflags |= XFS_QMOPT_GQUOTA;
                }  else {
-                       gid = ip->i_d.di_gid;
+                       gid = inode->i_gid;
                }
 
                /*
@@ -538,14 +544,16 @@ xfs_setattr_nonsize(
                 */
                ASSERT(udqp == NULL);
                ASSERT(gdqp == NULL);
-               error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-                                        qflags, &udqp, &gdqp, NULL);
+               error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
+                                          xfs_kgid_to_gid(gid),
+                                          xfs_get_projid(ip),
+                                          qflags, &udqp, &gdqp, NULL);
                if (error)
                        return error;
        }
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error)
                goto out_dqrele;
 
@@ -561,8 +569,8 @@ xfs_setattr_nonsize(
                 * while we didn't have the inode locked, inode's dquot(s)
                 * would have changed also.
                 */
-               iuid = ip->i_d.di_uid;
-               igid = ip->i_d.di_gid;
+               iuid = inode->i_uid;
+               igid = inode->i_gid;
                gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
                uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
@@ -571,8 +579,8 @@ xfs_setattr_nonsize(
                 * going to change.
                 */
                if (XFS_IS_QUOTA_RUNNING(mp) &&
-                   ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+                   ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+                    (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
                        ASSERT(tp);
                        error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                NULL, capable(CAP_FOWNER) ?
@@ -602,17 +610,17 @@ xfs_setattr_nonsize(
                 * Change the ownerships and register quota modifications
                 * in the transaction.
                 */
-               if (iuid != uid) {
+               if (!uid_eq(iuid, uid)) {
                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
                                ASSERT(mask & ATTR_UID);
                                ASSERT(udqp);
                                olddquot1 = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_udquot, udqp);
                        }
-                       ip->i_d.di_uid = uid;
+                       ip->i_d.di_uid = xfs_kuid_to_uid(uid);
                        inode->i_uid = uid;
                }
-               if (igid != gid) {
+               if (!gid_eq(igid, gid)) {
                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
                                ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                ASSERT(mask & ATTR_GID);
@@ -620,7 +628,7 @@ xfs_setattr_nonsize(
                                olddquot2 = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
-                       ip->i_d.di_gid = gid;
+                       ip->i_d.di_gid = xfs_kgid_to_gid(gid);
                        inode->i_gid = gid;
                }
        }
@@ -807,9 +815,7 @@ xfs_setattr_size(
                goto out_unlock;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                XFS_TRANS_PERM_LOG_RES,
-                                XFS_ITRUNCATE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
                goto out_trans_cancel;
 
@@ -932,7 +938,7 @@ xfs_vn_update_time(
        trace_xfs_update_time(ip);
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return -error;
@@ -1173,8 +1179,8 @@ xfs_setup_inode(
 
        inode->i_mode   = ip->i_d.di_mode;
        set_nlink(inode, ip->i_d.di_nlink);
-       inode->i_uid    = ip->i_d.di_uid;
-       inode->i_gid    = ip->i_d.di_gid;
+       inode->i_uid    = xfs_uid_to_kuid(ip->i_d.di_uid);
+       inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFBLK:
index ef41c92ce66e9e8159a2c3b1319774e589877ce6..d81fb41205ec97b9a00ecc0789e99763adc5af71 100644 (file)
@@ -27,4 +27,17 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
 
 extern void xfs_setup_inode(struct xfs_inode *);
 
+/*
+ * Internal setattr interfaces.
+ */
+#define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
+#define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if op would block */
+#define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL         0x08    /* Don't call xfs_acl_chmod */
+#define XFS_ATTR_SYNC          0x10    /* synchronous operation required */
+
+extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
+                              int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
+
 #endif /* __XFS_IOPS_H__ */
index 800f896a6cc48cdffc19f85d6a9dd4c30ef897d5..f9bb590acc0ebfd38a4aaaee3baaa5b0bf6ec505 100644 (file)
 # define XFS_BIG_INUMS 0
 #endif
 
+/*
+ * Kernel specific type declarations for XFS
+ */
+typedef signed char            __int8_t;
+typedef unsigned char          __uint8_t;
+typedef signed short int       __int16_t;
+typedef unsigned short int     __uint16_t;
+typedef signed int             __int32_t;
+typedef unsigned int           __uint32_t;
+typedef signed long long int   __int64_t;
+typedef unsigned long long int __uint64_t;
+
+typedef __uint32_t             inst_t;         /* an instruction */
+
+typedef __s64                  xfs_off_t;      /* <file offset> type */
+typedef unsigned long long     xfs_ino_t;      /* <inode> type */
+typedef __s64                  xfs_daddr_t;    /* <disk address> type */
+typedef char *                 xfs_caddr_t;    /* <core address> type */
+typedef __u32                  xfs_dev_t;
+typedef __u32                  xfs_nlink_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
 #include "xfs_types.h"
 
 #include "kmem.h"
 #define xfs_inherit_sync       xfs_params.inherit_sync.val
 #define xfs_inherit_nodump     xfs_params.inherit_nodump.val
 #define xfs_inherit_noatime    xfs_params.inherit_noatim.val
-#define xfs_buf_timer_centisecs        xfs_params.xfs_buf_timer.val
-#define xfs_buf_age_centisecs  xfs_params.xfs_buf_age.val
 #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
 #define xfs_rotorstep          xfs_params.rotorstep.val
 #define xfs_inherit_nodefrag   xfs_params.inherit_nodfrg.val
 #define MAX(a,b)       (max(a,b))
 #define howmany(x, y)  (((x)+((y)-1))/(y))
 
+/* Kernel uid/gid conversion. These are used to convert between the on disk
+ * uid_t/gid_t types and the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only; the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+{
+       return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+{
+       return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+{
+       return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+{
+       return make_kgid(&init_user_ns, gid);
+}
+
 /*
  * Various platform dependent calls that don't fit anywhere else
  */
index d852a2b3e1fdfae0c4fb5bf18452ec79fd03ab01..5372d58ef93a26220f0d916763e7f37de63d0050 100644 (file)
@@ -614,7 +614,8 @@ xfs_log_mount(
        xfs_daddr_t     blk_offset,
        int             num_bblks)
 {
-       int             error;
+       int             error = 0;
+       int             min_logfsbs;
 
        if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
                xfs_notice(mp, "Mounting Filesystem");
@@ -630,6 +631,50 @@ xfs_log_mount(
                goto out;
        }
 
+       /*
+        * Validate the given log space and drop a critical message via syslog
+        * if the log size is so small that it would lead to unexpected
+        * situations in the transaction log space reservation stage.
+        *
+        * Note: we can't just reject the mount if the validation fails.  This
+        * would mean that people would have to downgrade their kernel just to
+        * remedy the situation as there is no way to grow the log (short of
+        * black magic surgery with xfs_db).
+        *
+        * We can, however, reject mounts for CRC format filesystems, as the
+        * mkfs binary being used to make the filesystem should never create a
+        * filesystem with a log that is too small.
+        */
+       min_logfsbs = xfs_log_calc_minimum_size(mp);
+
+       if (mp->m_sb.sb_logblocks < min_logfsbs) {
+               xfs_warn(mp,
+               "Log size %d blocks too small, minimum size is %d blocks",
+                        mp->m_sb.sb_logblocks, min_logfsbs);
+               error = EINVAL;
+       } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+               xfs_warn(mp,
+               "Log size %d blocks too large, maximum size is %lld blocks",
+                        mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+               error = EINVAL;
+       } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+               xfs_warn(mp,
+               "log size %lld bytes too large, maximum size is %lld bytes",
+                        XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+                        XFS_MAX_LOG_BYTES);
+               error = EINVAL;
+       }
+       if (error) {
+               if (xfs_sb_version_hascrc(&mp->m_sb)) {
+                       xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+                       ASSERT(0);
+                       goto out_free_log;
+               }
+               xfs_crit(mp,
+"Log size out of supported range. Continuing onwards, but if log hangs are\n"
+"experienced then please report this message in the bug report.");
+       }
+
        /*
         * Initialize the AIL now we have a log.
         */
@@ -720,7 +765,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
  * Unmount record used to have a string "Unmount filesystem--" in the
  * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
  * We just write the magic number now since that particular field isn't
- * currently architecture converted and "nUmount" is a bit foo.
+ * currently architecture converted and "Unmount" is a bit foo.
  * As far as I know, there weren't any dependencies on the old behaviour.
  */
 
@@ -1941,7 +1986,7 @@ xlog_print_tic_res(
 
        xfs_alert_tag(mp, XFS_PTAG_LOGRES,
                "xlog_write: reservation ran out. Need to up reservation");
-       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+       xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 }
 
 /*
@@ -2044,7 +2089,7 @@ xlog_write_setup_ophdr(
  * Set up the parameters of the region copy into the log. This has
  * to handle region write split across multiple log buffers - this
  * state is kept external to this function so that this code can
- * can be written in an obvious, self documenting manner.
+ * be written in an obvious, self documenting manner.
  */
 static int
 xlog_write_setup_copy(
@@ -3391,24 +3436,17 @@ xfs_log_ticket_get(
 }
 
 /*
- * Allocate and initialise a new log ticket.
+ * Figure out the total log space unit (in bytes) that would be
+ * required for a log ticket.
  */
-struct xlog_ticket *
-xlog_ticket_alloc(
-       struct xlog     *log,
-       int             unit_bytes,
-       int             cnt,
-       char            client,
-       bool            permanent,
-       xfs_km_flags_t  alloc_flags)
+int
+xfs_log_calc_unit_res(
+       struct xfs_mount        *mp,
+       int                     unit_bytes)
 {
-       struct xlog_ticket *tic;
-       uint            num_headers;
-       int             iclog_space;
-
-       tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
-       if (!tic)
-               return NULL;
+       struct xlog             *log = mp->m_log;
+       int                     iclog_space;
+       uint                    num_headers;
 
        /*
         * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3483,20 +3521,43 @@ xlog_ticket_alloc(
        unit_bytes += log->l_iclog_hsize;
 
        /* for roundoff padding for transaction data and one for commit record */
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
-           log->l_mp->m_sb.sb_logsunit > 1) {
+       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
                /* log su roundoff */
-               unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
+               unit_bytes += 2 * mp->m_sb.sb_logsunit;
        } else {
                /* BB roundoff */
-               unit_bytes += 2*BBSIZE;
+               unit_bytes += 2 * BBSIZE;
         }
 
+       return unit_bytes;
+}
+
+/*
+ * Allocate and initialise a new log ticket.
+ */
+struct xlog_ticket *
+xlog_ticket_alloc(
+       struct xlog             *log,
+       int                     unit_bytes,
+       int                     cnt,
+       char                    client,
+       bool                    permanent,
+       xfs_km_flags_t          alloc_flags)
+{
+       struct xlog_ticket      *tic;
+       int                     unit_res;
+
+       tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
+       if (!tic)
+               return NULL;
+
+       unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
+
        atomic_set(&tic->t_ref, 1);
        tic->t_task             = current;
        INIT_LIST_HEAD(&tic->t_queue);
-       tic->t_unit_res         = unit_bytes;
-       tic->t_curr_res         = unit_bytes;
+       tic->t_unit_res         = unit_res;
+       tic->t_curr_res         = unit_res;
        tic->t_cnt              = cnt;
        tic->t_ocnt             = cnt;
        tic->t_tid              = prandom_u32();
index fb630e496c12406c558b7cc53854bf0cd123ccaf..1c458487f000a42306cb44f14509d80fea0c2f02 100644 (file)
 #ifndef        __XFS_LOG_H__
 #define __XFS_LOG_H__
 
-/* get lsn fields */
-#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
-#define BLOCK_LSN(lsn) ((uint)(lsn))
+#include "xfs_log_format.h"
 
-/* this is used in a spot where we might otherwise double-endian-flip */
-#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+struct xfs_log_vec {
+       struct xfs_log_vec      *lv_next;       /* next lv in build list */
+       int                     lv_niovecs;     /* number of iovecs in lv */
+       struct xfs_log_iovec    *lv_iovecp;     /* iovec array */
+       struct xfs_log_item     *lv_item;       /* owner */
+       char                    *lv_buf;        /* formatted buffer */
+       int                     lv_buf_len;     /* size of formatted buffer */
+       int                     lv_size;        /* size of allocated lv */
+};
+
+#define XFS_LOG_VEC_ORDERED    (-1)
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+       struct xfs_log_callback *cb_next;
+       void                    (*cb_func)(void *, int);
+       void                    *cb_arg;
+} xfs_log_callback_t;
 
-#ifdef __KERNEL__
 /*
  * By comparing each component, we don't have to worry about extra
  * endian issues in treating two 32 bit numbers as one 64 bit number
@@ -59,67 +75,6 @@ static inline xfs_lsn_t      _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  */
 #define XFS_LOG_SYNC           0x1
 
-#endif /* __KERNEL__ */
-
-
-/* Log Clients */
-#define XFS_TRANSACTION                0x69
-#define XFS_VOLUME             0x2
-#define XFS_LOG                        0xaa
-
-
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT          1
-#define XLOG_REG_TYPE_BCHUNK           2
-#define XLOG_REG_TYPE_EFI_FORMAT       3
-#define XLOG_REG_TYPE_EFD_FORMAT       4
-#define XLOG_REG_TYPE_IFORMAT          5
-#define XLOG_REG_TYPE_ICORE            6
-#define XLOG_REG_TYPE_IEXT             7
-#define XLOG_REG_TYPE_IBROOT           8
-#define XLOG_REG_TYPE_ILOCAL           9
-#define XLOG_REG_TYPE_IATTR_EXT                10
-#define XLOG_REG_TYPE_IATTR_BROOT      11
-#define XLOG_REG_TYPE_IATTR_LOCAL      12
-#define XLOG_REG_TYPE_QFORMAT          13
-#define XLOG_REG_TYPE_DQUOT            14
-#define XLOG_REG_TYPE_QUOTAOFF         15
-#define XLOG_REG_TYPE_LRHEADER         16
-#define XLOG_REG_TYPE_UNMOUNT          17
-#define XLOG_REG_TYPE_COMMIT           18
-#define XLOG_REG_TYPE_TRANSHDR         19
-#define XLOG_REG_TYPE_ICREATE          20
-#define XLOG_REG_TYPE_MAX              20
-
-typedef struct xfs_log_iovec {
-       void            *i_addr;        /* beginning address of region */
-       int             i_len;          /* length in bytes of region */
-       uint            i_type;         /* type of region */
-} xfs_log_iovec_t;
-
-struct xfs_log_vec {
-       struct xfs_log_vec      *lv_next;       /* next lv in build list */
-       int                     lv_niovecs;     /* number of iovecs in lv */
-       struct xfs_log_iovec    *lv_iovecp;     /* iovec array */
-       struct xfs_log_item     *lv_item;       /* owner */
-       char                    *lv_buf;        /* formatted buffer */
-       int                     lv_buf_len;     /* size of formatted buffer */
-};
-
-#define XFS_LOG_VEC_ORDERED    (-1)
-
-/*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
-       struct xfs_log_callback *cb_next;
-       void                    (*cb_func)(void *, int);
-       void                    *cb_arg;
-} xfs_log_callback_t;
-
-
-#ifdef __KERNEL__
 /* Log manager interfaces */
 struct xfs_mount;
 struct xlog_in_core;
@@ -188,5 +143,4 @@ void        xfs_log_work_queue(struct xfs_mount *mp);
 void   xfs_log_worker(struct work_struct *work);
 void   xfs_log_quiesce(struct xfs_mount *mp);
 
-#endif
 #endif /* __XFS_LOG_H__ */
index 02b9cf3f8252baeade5d4e99b3e88853a7b50b98..cfe97973ba36d1d586c3704b536aebce2e391af1 100644 (file)
@@ -80,6 +80,83 @@ xlog_cil_init_post_recovery(
                                                                log->l_curr_block);
 }
 
+STATIC int
+xlog_cil_lv_item_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_vec      *lv)
+{
+       int     index;
+       char    *ptr;
+
+       /* format new vectors into array */
+       lip->li_ops->iop_format(lip, lv->lv_iovecp);
+
+       /* copy data into existing array */
+       ptr = lv->lv_buf;
+       for (index = 0; index < lv->lv_niovecs; index++) {
+               struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+               memcpy(ptr, vec->i_addr, vec->i_len);
+               vec->i_addr = ptr;
+               ptr += vec->i_len;
+       }
+
+       /*
+        * some size calculations for log vectors over-estimate, so the caller
+        * doesn't know the amount of space actually used by the item. Return
+        * the byte count to the caller so they can check and store it
+        * appropriately.
+        */
+       return ptr - lv->lv_buf;
+}
+
+/*
+ * Prepare the log item for insertion into the CIL. Calculate the difference in
+ * log space and vectors it will consume, and if it is a new item pin it as
+ * well.
+ */
+STATIC void
+xfs_cil_prepare_item(
+       struct xlog             *log,
+       struct xfs_log_vec      *lv,
+       struct xfs_log_vec      *old_lv,
+       int                     *diff_len,
+       int                     *diff_iovecs)
+{
+       /* Account for the new LV being passed in */
+       if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+               *diff_len += lv->lv_buf_len;
+               *diff_iovecs += lv->lv_niovecs;
+       }
+
+       /*
+        * If there is no old LV, this is the first time we've seen the item in
+        * this CIL context and so we need to pin it. If we are replacing the
+        * old_lv, then remove the space it accounts for and free it.
+        */
+       if (!old_lv)
+               lv->lv_item->li_ops->iop_pin(lv->lv_item);
+       else if (old_lv != lv) {
+               ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+
+               *diff_len -= old_lv->lv_buf_len;
+               *diff_iovecs -= old_lv->lv_niovecs;
+               kmem_free(old_lv);
+       }
+
+       /* attach new log vector to log item */
+       lv->lv_item->li_lv = lv;
+
+       /*
+        * If this is the first time the item is being committed to the
+        * CIL, store the sequence number on the log item so we can
+        * tell in future commits whether this is the first checkpoint
+        * the item is being committed into.
+        */
+       if (!lv->lv_item->li_seq)
+               lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+}
+
 /*
  * Format log item into a flat buffers
  *
@@ -106,35 +183,39 @@ xlog_cil_init_post_recovery(
  * format the regions into the iclog as though they are being formatted
  * directly out of the objects themselves.
  */
-static struct xfs_log_vec *
-xlog_cil_prepare_log_vecs(
-       struct xfs_trans        *tp)
+static void
+xlog_cil_insert_format_items(
+       struct xlog             *log,
+       struct xfs_trans        *tp,
+       int                     *diff_len,
+       int                     *diff_iovecs)
 {
        struct xfs_log_item_desc *lidp;
-       struct xfs_log_vec      *lv = NULL;
-       struct xfs_log_vec      *ret_lv = NULL;
 
 
        /* Bail out if we didn't find a log item.  */
        if (list_empty(&tp->t_items)) {
                ASSERT(0);
-               return NULL;
+               return;
        }
 
        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-               struct xfs_log_vec *new_lv;
-               void    *ptr;
-               int     index;
-               int     len = 0;
-               uint    niovecs;
+               struct xfs_log_item *lip = lidp->lid_item;
+               struct xfs_log_vec *lv;
+               struct xfs_log_vec *old_lv;
+               int     niovecs = 0;
+               int     nbytes = 0;
+               int     buf_size;
                bool    ordered = false;
 
                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;
 
+               /* get number of vecs and size of data to be stored */
+               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
                /* Skip items that do not have any vectors for writing */
-               niovecs = IOP_SIZE(lidp->lid_item);
                if (!niovecs)
                        continue;
 
@@ -146,109 +227,63 @@ xlog_cil_prepare_log_vecs(
                if (niovecs == XFS_LOG_VEC_ORDERED) {
                        ordered = true;
                        niovecs = 0;
+                       nbytes = 0;
                }
 
-               new_lv = kmem_zalloc(sizeof(*new_lv) +
-                               niovecs * sizeof(struct xfs_log_iovec),
-                               KM_SLEEP|KM_NOFS);
-
-               new_lv->lv_item = lidp->lid_item;
-               new_lv->lv_niovecs = niovecs;
-               if (ordered) {
-                       /* track as an ordered logvec */
-                       new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
-                       goto next;
-               }
-
-               /* The allocated iovec region lies beyond the log vector. */
-               new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+               /* grab the old item if it exists for reservation accounting */
+               old_lv = lip->li_lv;
 
-               /* build the vector array and calculate it's length */
-               IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
-               for (index = 0; index < new_lv->lv_niovecs; index++)
-                       len += new_lv->lv_iovecp[index].i_len;
+               /* calc buffer size */
+               buf_size = sizeof(struct xfs_log_vec) + nbytes +
+                               niovecs * sizeof(struct xfs_log_iovec);
 
-               new_lv->lv_buf_len = len;
-               new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
-                               KM_SLEEP|KM_NOFS);
-               ptr = new_lv->lv_buf;
+               /* compare to existing item size */
+               if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+                       /* same or smaller, optimise common overwrite case */
+                       lv = lip->li_lv;
+                       lv->lv_next = NULL;
 
-               for (index = 0; index < new_lv->lv_niovecs; index++) {
-                       struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];
+                       if (ordered)
+                               goto insert;
 
-                       memcpy(ptr, vec->i_addr, vec->i_len);
-                       vec->i_addr = ptr;
-                       ptr += vec->i_len;
-               }
-               ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
-
-next:
-               if (!ret_lv)
-                       ret_lv = new_lv;
-               else
-                       lv->lv_next = new_lv;
-               lv = new_lv;
-       }
-
-       return ret_lv;
-}
-
-/*
- * Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
- */
-STATIC void
-xfs_cil_prepare_item(
-       struct xlog             *log,
-       struct xfs_log_vec      *lv,
-       int                     *len,
-       int                     *diff_iovecs)
-{
-       struct xfs_log_vec      *old = lv->lv_item->li_lv;
+                       /*
+                        * set the item up as though it is a new insertion so
+                        * that the space reservation accounting is correct.
+                        */
+                       *diff_iovecs -= lv->lv_niovecs;
+                       *diff_len -= lv->lv_buf_len;
 
-       if (old) {
-               /* existing lv on log item, space used is a delta */
-               ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
-                       old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+                       /* Ensure the lv is set up according to ->iop_size */
+                       lv->lv_niovecs = niovecs;
+                       lv->lv_buf = (char *)lv + buf_size - nbytes;
 
-               /*
-                * If the new item is ordered, keep the old one that is already
-                * tracking dirty or ordered regions
-                */
-               if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
-                       ASSERT(!lv->lv_buf);
-                       kmem_free(lv);
-                       return;
+                       lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+                       goto insert;
                }
 
-               *len += lv->lv_buf_len - old->lv_buf_len;
-               *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
-               kmem_free(old->lv_buf);
-               kmem_free(old);
-       } else {
-               /* new lv, must pin the log item */
-               ASSERT(!lv->lv_item->li_lv);
-
-               if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
-                       *len += lv->lv_buf_len;
-                       *diff_iovecs += lv->lv_niovecs;
+               /* allocate new data chunk */
+               lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+               lv->lv_item = lip;
+               lv->lv_size = buf_size;
+               lv->lv_niovecs = niovecs;
+               if (ordered) {
+                       /* track as an ordered logvec */
+                       ASSERT(lip->li_lv == NULL);
+                       lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       goto insert;
                }
-               IOP_PIN(lv->lv_item);
 
-       }
+               /* The allocated iovec region lies beyond the log vector. */
+               lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 
-       /* attach new log vector to log item */
-       lv->lv_item->li_lv = lv;
+               /* The allocated data region lies beyond the iovec region */
+               lv->lv_buf = (char *)lv + buf_size - nbytes;
 
-       /*
-        * If this is the first time the item is being committed to the
-        * CIL, store the sequence number on the log item so we can
-        * tell in future commits whether this is the first checkpoint
-        * the item is being committed into.
-        */
-       if (!lv->lv_item->li_seq)
-               lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
+               lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+insert:
+               ASSERT(lv->lv_buf_len <= nbytes);
+               xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+       }
 }
 
 /*
@@ -261,53 +296,47 @@ xfs_cil_prepare_item(
 static void
 xlog_cil_insert_items(
        struct xlog             *log,
-       struct xfs_log_vec      *log_vector,
-       struct xlog_ticket      *ticket)
+       struct xfs_trans        *tp)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
-       struct xfs_log_vec      *lv;
+       struct xfs_log_item_desc *lidp;
        int                     len = 0;
        int                     diff_iovecs = 0;
        int                     iclog_space;
 
-       ASSERT(log_vector);
+       ASSERT(tp);
 
        /*
-        * Do all the accounting aggregation and switching of log vectors
-        * around in a separate loop to the insertion of items into the CIL.
-        * Then we can do a separate loop to update the CIL within a single
-        * lock/unlock pair. This reduces the number of round trips on the CIL
-        * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
-        * hold time for the transaction commit.
-        *
-        * If this is the first time the item is being placed into the CIL in
-        * this context, pin it so it can't be written to disk until the CIL is
-        * flushed to the iclog and the iclog written to disk.
-        *
         * We can do this safely because the context can't checkpoint until we
         * are done so it doesn't matter exactly how we update the CIL.
         */
+       xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+
+       /*
+        * Now (re-)position everything modified at the tail of the CIL.
+        * We do this here so we only need to take the CIL lock once during
+        * the transaction commit.
+        */
        spin_lock(&cil->xc_cil_lock);
-       for (lv = log_vector; lv; ) {
-               struct xfs_log_vec *next = lv->lv_next;
+       list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+               struct xfs_log_item     *lip = lidp->lid_item;
 
-               ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
-               lv->lv_next = NULL;
+               /* Skip items which aren't dirty in this transaction. */
+               if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                       continue;
 
-               /*
-                * xfs_cil_prepare_item() may free the lv, so move the item on
-                * the CIL first.
-                */
-               list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
-               xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-               lv = next;
+               list_move_tail(&lip->li_cil, &cil->xc_cil);
        }
 
        /* account for space used by new iovec headers  */
        len += diff_iovecs * sizeof(xlog_op_header_t);
        ctx->nvecs += diff_iovecs;
 
+       /* attach the transaction to the CIL if it has any busy extents */
+       if (!list_empty(&tp->t_busy))
+               list_splice_init(&tp->t_busy, &ctx->busy_extents);
+
        /*
         * Now transfer enough transaction reservation to the context ticket
         * for the checkpoint. The context ticket is special - the unit
@@ -316,10 +345,8 @@ xlog_cil_insert_items(
         * during the transaction commit.
         */
        if (ctx->ticket->t_curr_res == 0) {
-               /* first commit in checkpoint, steal the header reservation */
-               ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
-               ticket->t_curr_res -= ctx->ticket->t_unit_res;
+               tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
        }
 
        /* do we need space for more log record headers? */
@@ -333,10 +360,10 @@ xlog_cil_insert_items(
                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
                ctx->ticket->t_unit_res += hdrs;
                ctx->ticket->t_curr_res += hdrs;
-               ticket->t_curr_res -= hdrs;
-               ASSERT(ticket->t_curr_res >= len);
+               tp->t_ticket->t_curr_res -= hdrs;
+               ASSERT(tp->t_ticket->t_curr_res >= len);
        }
-       ticket->t_curr_res -= len;
+       tp->t_ticket->t_curr_res -= len;
        ctx->space_used += len;
 
        spin_unlock(&cil->xc_cil_lock);
@@ -350,7 +377,6 @@ xlog_cil_free_logvec(
 
        for (lv = log_vector; lv; ) {
                struct xfs_log_vec *next = lv->lv_next;
-               kmem_free(lv->lv_buf);
                kmem_free(lv);
                lv = next;
        }
@@ -376,9 +402,9 @@ xlog_cil_committed(
        xfs_extent_busy_clear(mp, &ctx->busy_extents,
                             (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
 
-       spin_lock(&ctx->cil->xc_cil_lock);
+       spin_lock(&ctx->cil->xc_push_lock);
        list_del(&ctx->committing);
-       spin_unlock(&ctx->cil->xc_cil_lock);
+       spin_unlock(&ctx->cil->xc_push_lock);
 
        xlog_cil_free_logvec(ctx->lv_chain);
 
@@ -433,7 +459,7 @@ xlog_cil_push(
        down_write(&cil->xc_ctx_lock);
        ctx = cil->xc_ctx;
 
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        push_seq = cil->xc_push_seq;
        ASSERT(push_seq <= ctx->sequence);
 
@@ -444,10 +470,10 @@ xlog_cil_push(
         */
        if (list_empty(&cil->xc_cil)) {
                cil->xc_push_seq = 0;
-               spin_unlock(&cil->xc_cil_lock);
+               spin_unlock(&cil->xc_push_lock);
                goto out_skip;
        }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
 
 
        /* check for a previously pushed seqeunce */
@@ -515,9 +541,9 @@ xlog_cil_push(
         * that higher sequences will wait for us to write out a commit record
         * before they do.
         */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        list_add(&ctx->committing, &cil->xc_committing);
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
        up_write(&cil->xc_ctx_lock);
 
        /*
@@ -552,7 +578,7 @@ xlog_cil_push(
         * order the commit records so replay will get them in the right order.
         */
 restart:
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
                /*
                 * Higher sequences will wait for this one so skip them.
@@ -565,11 +591,11 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
                        goto restart;
                }
        }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
 
        /* xfs_log_done always frees the ticket on error. */
        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
@@ -588,10 +614,10 @@ restart:
         * callbacks to the iclog we can assign the commit LSN to the context
         * and wake up anyone who is waiting for the commit to complete.
         */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        ctx->commit_lsn = commit_lsn;
        wake_up_all(&cil->xc_commit_wait);
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
 
        /* release the hounds! */
        return xfs_log_release_iclog(log->l_mp, commit_iclog);
@@ -644,12 +670,12 @@ xlog_cil_push_background(
        if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
                return;
 
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
                queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
        }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
 
 }
 
@@ -672,14 +698,14 @@ xlog_cil_push_foreground(
         * If the CIL is empty or we've already pushed the sequence then
         * there's no work we need to do.
         */
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
-               spin_unlock(&cil->xc_cil_lock);
+               spin_unlock(&cil->xc_push_lock);
                return;
        }
 
        cil->xc_push_seq = push_seq;
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
 
        /* do the push now */
        xlog_cil_push(log);
@@ -706,43 +732,25 @@ xfs_log_commit_cil(
        int                     flags)
 {
        struct xlog             *log = mp->m_log;
+       struct xfs_cil          *cil = log->l_cilp;
        int                     log_flags = 0;
-       struct xfs_log_vec      *log_vector;
 
        if (flags & XFS_TRANS_RELEASE_LOG_RES)
                log_flags = XFS_LOG_REL_PERM_RESERV;
 
-       /*
-        * Do all the hard work of formatting items (including memory
-        * allocation) outside the CIL context lock. This prevents stalling CIL
-        * pushes when we are low on memory and a transaction commit spends a
-        * lot of time in memory reclaim.
-        */
-       log_vector = xlog_cil_prepare_log_vecs(tp);
-       if (!log_vector)
-               return ENOMEM;
-
        /* lock out background commit */
-       down_read(&log->l_cilp->xc_ctx_lock);
-       if (commit_lsn)
-               *commit_lsn = log->l_cilp->xc_ctx->sequence;
+       down_read(&cil->xc_ctx_lock);
 
-       /* xlog_cil_insert_items() destroys log_vector list */
-       xlog_cil_insert_items(log, log_vector, tp->t_ticket);
+       xlog_cil_insert_items(log, tp);
 
        /* check we didn't blow the reservation */
        if (tp->t_ticket->t_curr_res < 0)
-               xlog_print_tic_res(log->l_mp, tp->t_ticket);
+               xlog_print_tic_res(mp, tp->t_ticket);
 
-       /* attach the transaction to the CIL if it has any busy extents */
-       if (!list_empty(&tp->t_busy)) {
-               spin_lock(&log->l_cilp->xc_cil_lock);
-               list_splice_init(&tp->t_busy,
-                                       &log->l_cilp->xc_ctx->busy_extents);
-               spin_unlock(&log->l_cilp->xc_cil_lock);
-       }
+       tp->t_commit_lsn = cil->xc_ctx->sequence;
+       if (commit_lsn)
+               *commit_lsn = tp->t_commit_lsn;
 
-       tp->t_commit_lsn = *commit_lsn;
        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
        xfs_trans_unreserve_and_mod_sb(tp);
 
@@ -757,11 +765,11 @@ xfs_log_commit_cil(
         * the log items. This affects (at least) processing of stale buffers,
         * inodes and EFIs.
         */
-       xfs_trans_free_items(tp, *commit_lsn, 0);
+       xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
 
        xlog_cil_push_background(log);
 
-       up_read(&log->l_cilp->xc_ctx_lock);
+       up_read(&cil->xc_ctx_lock);
        return 0;
 }
 
@@ -800,7 +808,7 @@ xlog_cil_force_lsn(
         * on commits for those as well.
         */
 restart:
-       spin_lock(&cil->xc_cil_lock);
+       spin_lock(&cil->xc_push_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
                if (ctx->sequence > sequence)
                        continue;
@@ -809,7 +817,7 @@ restart:
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
-                       xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
+                       xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
@@ -817,7 +825,7 @@ restart:
                /* found it! */
                commit_lsn = ctx->commit_lsn;
        }
-       spin_unlock(&cil->xc_cil_lock);
+       spin_unlock(&cil->xc_push_lock);
        return commit_lsn;
 }
 
@@ -875,6 +883,7 @@ xlog_cil_init(
        INIT_LIST_HEAD(&cil->xc_cil);
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
+       spin_lock_init(&cil->xc_push_lock);
        init_rwsem(&cil->xc_ctx_lock);
        init_waitqueue_head(&cil->xc_commit_wait);
 
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
new file mode 100644 (file)
index 0000000..31e3a06
--- /dev/null
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_LOG_FORMAT_H__
+#define __XFS_LOG_FORMAT_H__
+
+struct xfs_mount;
+struct xfs_trans_res;
+
+/*
+ * On-disk Log Format definitions.
+ *
+ * This file contains all the on-disk format definitions used within the log. It
+ * includes the physical log structure itself, as well as all the log item
+ * format structures that are written into the log and interpreted by log
+ * recovery. We start with the physical log format definitions, and then work
+ * through all the log items definitions and everything they encode into the
+ * log.
+ */
+typedef __uint32_t xlog_tid_t;
+
+#define XLOG_MIN_ICLOGS                2
+#define XLOG_MAX_ICLOGS                8
+#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
+#define XLOG_VERSION_1         1
+#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE  (256*1024)
+#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
+#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
+#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
+                                 (log)->l_mp->m_sb.sb_logsunit)
+#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
+
+#define XLOG_HEADER_SIZE       512
+
+/* Minimum number of transactions that must fit in the log (defined by mkfs) */
+#define XFS_MIN_LOG_FACTOR     3
+
+#define XLOG_REC_SHIFT(log) \
+       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+#define XLOG_TOTAL_REC_SHIFT(log) \
+       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
+        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/* get lsn fields */
+#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
+#define BLOCK_LSN(lsn) ((uint)(lsn))
+
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
+
+static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
+{
+       return ((xfs_lsn_t)cycle << 32) | block;
+}
+
+static inline uint xlog_get_cycle(char *ptr)
+{
+       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
+               return be32_to_cpu(*((__be32 *)ptr + 1));
+       else
+               return be32_to_cpu(*(__be32 *)ptr);
+}
+
+/* Log Clients */
+#define XFS_TRANSACTION                0x69
+#define XFS_VOLUME             0x2
+#define XFS_LOG                        0xaa
+
+#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
+
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT          1
+#define XLOG_REG_TYPE_BCHUNK           2
+#define XLOG_REG_TYPE_EFI_FORMAT       3
+#define XLOG_REG_TYPE_EFD_FORMAT       4
+#define XLOG_REG_TYPE_IFORMAT          5
+#define XLOG_REG_TYPE_ICORE            6
+#define XLOG_REG_TYPE_IEXT             7
+#define XLOG_REG_TYPE_IBROOT           8
+#define XLOG_REG_TYPE_ILOCAL           9
+#define XLOG_REG_TYPE_IATTR_EXT                10
+#define XLOG_REG_TYPE_IATTR_BROOT      11
+#define XLOG_REG_TYPE_IATTR_LOCAL      12
+#define XLOG_REG_TYPE_QFORMAT          13
+#define XLOG_REG_TYPE_DQUOT            14
+#define XLOG_REG_TYPE_QUOTAOFF         15
+#define XLOG_REG_TYPE_LRHEADER         16
+#define XLOG_REG_TYPE_UNMOUNT          17
+#define XLOG_REG_TYPE_COMMIT           18
+#define XLOG_REG_TYPE_TRANSHDR         19
+#define XLOG_REG_TYPE_ICREATE          20
+#define XLOG_REG_TYPE_MAX              20
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS       0x01    /* Start a new transaction */
+#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
+#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS    0x08    /* Continued from a previous region */
+#define XLOG_END_TRANS         0x10    /* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
+
+
+typedef struct xlog_op_header {
+       __be32     oh_tid;      /* transaction id of operation  :  4 b */
+       __be32     oh_len;      /* bytes in data region         :  4 b */
+       __u8       oh_clientid; /* who sent me this             :  1 b */
+       __u8       oh_flags;    /*                              :  1 b */
+       __u16      oh_res2;     /* 32 bit align                 :  2 b */
+} xlog_op_header_t;
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#ifdef XFS_NATIVE_HOST
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#endif
+
+typedef struct xlog_rec_header {
+       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
+       __be32    h_cycle;      /* write cycle of log                   :  4 */
+       __be32    h_version;    /* LR version                           :  4 */
+       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
+       __be64    h_lsn;        /* lsn of this LR                       :  8 */
+       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
+       __le32    h_crc;        /* crc of log record                    :  4 */
+       __be32    h_prev_block; /* block number to previous LR          :  4 */
+       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
+       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+       /* new fields */
+       __be32    h_fmt;        /* format of log record                 :  4 */
+       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
+       __be32    h_size;       /* iclog size                           :  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+       __be32    xh_cycle;     /* write cycle of log                   : 4 */
+       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
+} xlog_rec_ext_header_t;
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+       xlog_rec_header_t       hic_header;
+       xlog_rec_ext_header_t   hic_xheader;
+       char                    hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
+/* not an on-disk structure, but needed by log recovery in userspace */
+typedef struct xfs_log_iovec {
+       void            *i_addr;        /* beginning address of region */
+       int             i_len;          /* length in bytes of region */
+       uint            i_type;         /* type of region */
+} xfs_log_iovec_t;
+
+
+/*
+ * Transaction Header definitions.
+ *
+ * This is the structure written in the log at the head of every transaction. It
+ * identifies the type and id of the transaction, and contains the number of
+ * items logged by the transaction so we know how many to expect during
+ * recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+       uint            th_magic;               /* magic number */
+       uint            th_type;                /* transaction type */
+       __int32_t       th_tid;                 /* transaction id (unused) */
+       uint            th_num_items;           /* num items logged by trans */
+} xfs_trans_header_t;
+
+#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
+
+/*
+ * Log item types.
+ */
+#define        XFS_LI_EFI              0x1236
+#define        XFS_LI_EFD              0x1237
+#define        XFS_LI_IUNLINK          0x1238
+#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
+#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
+#define        XFS_LI_DQUOT            0x123d
+#define        XFS_LI_QUOTAOFF         0x123e
+#define        XFS_LI_ICREATE          0x123f
+
+#define XFS_LI_TYPE_DESC \
+       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
+       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
+       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
+       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
+       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
+       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
+       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }, \
+       { XFS_LI_ICREATE,       "XFS_LI_ICREATE" }
+
+/*
+ * Transaction types.  Used to distinguish types of buffers.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE     1
+#define XFS_TRANS_SETATTR_SIZE         2
+#define XFS_TRANS_INACTIVE             3
+#define XFS_TRANS_CREATE               4
+#define XFS_TRANS_CREATE_TRUNC         5
+#define XFS_TRANS_TRUNCATE_FILE                6
+#define XFS_TRANS_REMOVE               7
+#define XFS_TRANS_LINK                 8
+#define XFS_TRANS_RENAME               9
+#define XFS_TRANS_MKDIR                        10
+#define XFS_TRANS_RMDIR                        11
+#define XFS_TRANS_SYMLINK              12
+#define XFS_TRANS_SET_DMATTRS          13
+#define XFS_TRANS_GROWFS               14
+#define XFS_TRANS_STRAT_WRITE          15
+#define XFS_TRANS_DIOSTRAT             16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define        XFS_TRANS_WRITEID               18
+#define        XFS_TRANS_ADDAFORK              19
+#define        XFS_TRANS_ATTRINVAL             20
+#define        XFS_TRANS_ATRUNCATE             21
+#define        XFS_TRANS_ATTR_SET              22
+#define        XFS_TRANS_ATTR_RM               23
+#define        XFS_TRANS_ATTR_FLAG             24
+#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
+#define XFS_TRANS_QM_SBCHANGE          26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1               27
+#define XFS_TRANS_DUMMY2               28
+#define XFS_TRANS_QM_QUOTAOFF          29
+#define XFS_TRANS_QM_DQALLOC           30
+#define XFS_TRANS_QM_SETQLIM           31
+#define XFS_TRANS_QM_DQCLUSTER         32
+#define XFS_TRANS_QM_QINOCREATE                33
+#define XFS_TRANS_QM_QUOTAOFF_END      34
+#define XFS_TRANS_SB_UNIT              35
+#define XFS_TRANS_FSYNC_TS             36
+#define        XFS_TRANS_GROWFSRT_ALLOC        37
+#define        XFS_TRANS_GROWFSRT_ZERO         38
+#define        XFS_TRANS_GROWFSRT_FREE         39
+#define        XFS_TRANS_SWAPEXT               40
+#define        XFS_TRANS_SB_COUNT              41
+#define        XFS_TRANS_CHECKPOINT            42
+#define        XFS_TRANS_ICREATE               43
+#define        XFS_TRANS_TYPE_MAX              43
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
+       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
+       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
+       { XFS_TRANS_CREATE,             "CREATE" }, \
+       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
+       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
+       { XFS_TRANS_REMOVE,             "REMOVE" }, \
+       { XFS_TRANS_LINK,               "LINK" }, \
+       { XFS_TRANS_RENAME,             "RENAME" }, \
+       { XFS_TRANS_MKDIR,              "MKDIR" }, \
+       { XFS_TRANS_RMDIR,              "RMDIR" }, \
+       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
+       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
+       { XFS_TRANS_GROWFS,             "GROWFS" }, \
+       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
+       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
+       { XFS_TRANS_WRITEID,            "WRITEID" }, \
+       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
+       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
+       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
+       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
+       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
+       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
+       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
+       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
+       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
+       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
+       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
+       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
+       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
+       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
+       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
+       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
+       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
+       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
+       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
+       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
+       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
+       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
+       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
+       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+       struct xfs_log_item     *lid_item;
+       struct list_head        lid_trans;
+       unsigned char           lid_flags;
+};
+
+#define XFS_LID_DIRTY          0x1
+
+/*
+ * Values for t_flags.
+ */
+#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
+#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
+#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
+#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
+                                          count in superblock */
+
+/*
+ * Values for call flags parameter.
+ */
+#define        XFS_TRANS_RELEASE_LOG_RES       0x4
+#define        XFS_TRANS_ABORT                 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define        XFS_TRANS_SB_ICOUNT             0x00000001
+#define        XFS_TRANS_SB_IFREE              0x00000002
+#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
+#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
+#define        XFS_TRANS_SB_FREXTENTS          0x00000010
+#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
+#define        XFS_TRANS_SB_DBLOCKS            0x00000040
+#define        XFS_TRANS_SB_AGCOUNT            0x00000080
+#define        XFS_TRANS_SB_IMAXPCT            0x00000100
+#define        XFS_TRANS_SB_REXTSIZE           0x00000200
+#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
+#define        XFS_TRANS_SB_RBLOCKS            0x00000800
+#define        XFS_TRANS_SB_REXTENTS           0x00001000
+#define        XFS_TRANS_SB_REXTSLOG           0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer
+ * reference count values.  This determines how hard the buffer
+ * cache tries to hold onto the buffer.
+ */
+#define        XFS_AGF_REF             4
+#define        XFS_AGI_REF             4
+#define        XFS_AGFL_REF            3
+#define        XFS_INO_BTREE_REF       3
+#define        XFS_ALLOC_BTREE_REF     2
+#define        XFS_BMAP_BTREE_REF      2
+#define        XFS_DIR_BTREE_REF       2
+#define        XFS_INO_REF             2
+#define        XFS_ATTR_BTREE_REF      1
+#define        XFS_DQUOT_REF           1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+#define        XFS_ICHGTIME_CREATE     0x4     /* inode create timestamp */
+
+
+/*
+ * Inode Log Item Format definitions.
+ *
+ * This is the structure used to lay out an inode log item in the
+ * log.  The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ */
+typedef struct xfs_inode_log_format {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+typedef struct xfs_inode_log_format_32 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+       __uint16_t              ilf_type;       /* inode log item type */
+       __uint16_t              ilf_size;       /* size of this item */
+       __uint32_t              ilf_fields;     /* flags for fields logged */
+       __uint16_t              ilf_asize;      /* size of attr d/ext/root */
+       __uint16_t              ilf_dsize;      /* size of data/ext/root */
+       __uint32_t              ilf_pad;        /* pad for 64 bit boundary */
+       __uint64_t              ilf_ino;        /* inode number */
+       union {
+               __uint32_t      ilfu_rdev;      /* rdev value for dev inode*/
+               uuid_t          ilfu_uuid;      /* mount point value */
+       } ilf_u;
+       __int64_t               ilf_blkno;      /* blkno of inode buffer */
+       __int32_t               ilf_len;        /* len of inode buffer */
+       __int32_t               ilf_boffset;    /* off of inode in buffer */
+} xfs_inode_log_format_64_t;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define        XFS_ILOG_CORE   0x001   /* log standard inode fields */
+#define        XFS_ILOG_DDATA  0x002   /* log i_df.if_data */
+#define        XFS_ILOG_DEXT   0x004   /* log i_df.if_extents */
+#define        XFS_ILOG_DBROOT 0x008   /* log i_df.i_broot */
+#define        XFS_ILOG_DEV    0x010   /* log the dev field */
+#define        XFS_ILOG_UUID   0x020   /* log the uuid field */
+#define        XFS_ILOG_ADATA  0x040   /* log i_af.if_data */
+#define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
+#define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
+
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely stored in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP     0x4000
+
+#define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+                                XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+                                XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+
+#define        XFS_ILOG_DFORK          (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+                                XFS_ILOG_DBROOT)
+
+#define        XFS_ILOG_AFORK          (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT)
+
+#define        XFS_ILOG_ALL            (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+                                XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+                                XFS_ILOG_DEV | XFS_ILOG_UUID | \
+                                XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+
+static inline int xfs_ilog_fbroot(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+static inline int xfs_ilog_fext(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+static inline int xfs_ilog_fdata(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
+/*
+ * Incore version of the on-disk inode core structures. We log this directly
+ * into the journal in host CPU format (for better or worse) and as such
+ * directly mirrors the xfs_dinode structure as it must contain all the same
+ * information.
+ */
+typedef struct xfs_ictimestamp {
+       __int32_t       t_sec;          /* timestamp seconds */
+       __int32_t       t_nsec;         /* timestamp nanoseconds */
+} xfs_ictimestamp_t;
+
+/*
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
+ *       in xfs_dinode.h except for the endianness annotations.
+ */
+typedef struct xfs_icdinode {
+       __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
+       __uint16_t      di_mode;        /* mode and type of file */
+       __int8_t        di_version;     /* inode version */
+       __int8_t        di_format;      /* format of di_c data */
+       __uint16_t      di_onlink;      /* old number of links to file */
+       __uint32_t      di_uid;         /* owner's user id */
+       __uint32_t      di_gid;         /* owner's group id */
+       __uint32_t      di_nlink;       /* number of links to file */
+       __uint16_t      di_projid_lo;   /* lower part of owner's project id */
+       __uint16_t      di_projid_hi;   /* higher part of owner's project id */
+       __uint8_t       di_pad[6];      /* unused, zeroed space */
+       __uint16_t      di_flushiter;   /* incremented on flush */
+       xfs_ictimestamp_t di_atime;     /* time last accessed */
+       xfs_ictimestamp_t di_mtime;     /* time last modified */
+       xfs_ictimestamp_t di_ctime;     /* time created/inode modified */
+       xfs_fsize_t     di_size;        /* number of bytes in file */
+       xfs_drfsbno_t   di_nblocks;     /* # of direct & btree blocks used */
+       xfs_extlen_t    di_extsize;     /* basic/minimum extent size for file */
+       xfs_extnum_t    di_nextents;    /* number of extents in data fork */
+       xfs_aextnum_t   di_anextents;   /* number of extents in attribute fork*/
+       __uint8_t       di_forkoff;     /* attr fork offs, <<3 for 64b align */
+       __int8_t        di_aformat;     /* format of attr fork's data */
+       __uint32_t      di_dmevmask;    /* DMIG event mask */
+       __uint16_t      di_dmstate;     /* DMIG state info */
+       __uint16_t      di_flags;       /* random flags, XFS_DIFLAG_... */
+       __uint32_t      di_gen;         /* generation number */
+
+       /* di_next_unlinked is the only non-core field in the old dinode */
+       xfs_agino_t     di_next_unlinked;/* agi unlinked list ptr */
+
+       /* start of the extended dinode, writable fields */
+       __uint32_t      di_crc;         /* CRC of the inode */
+       __uint64_t      di_changecount; /* number of attribute changes */
+       xfs_lsn_t       di_lsn;         /* flush sequence */
+       __uint64_t      di_flags2;      /* more random flags */
+       __uint8_t       di_pad2[16];    /* more padding for future expansion */
+
+       /* fields only written to during inode creation */
+       xfs_ictimestamp_t di_crtime;    /* time created */
+       xfs_ino_t       di_ino;         /* inode number */
+       uuid_t          di_uuid;        /* UUID of the filesystem */
+
+       /* structure must be padded to 64 bit alignment */
+} xfs_icdinode_t;
+
+static inline uint xfs_icdinode_size(int version)
+{
+       if (version == 3)
+               return sizeof(struct xfs_icdinode);
+       return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
+/*
+ * Buffer Log Format definitions
+ *
+ * These are the physical dirty bitmap definitions for the log format structure.
+ */
+#define        XFS_BLF_CHUNK           128
+#define        XFS_BLF_SHIFT           7
+#define        BIT_TO_WORD_SHIFT       5
+#define        NBWORD                  (NBBY * sizeof(unsigned int))
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define        XFS_BLF_INODE_BUF       (1<<0)
+
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define        XFS_BLF_CANCEL          (1<<1)
+
+/*
+ * These flags indicate that the buffer contains on disk user,
+ * project or group dquots and may require special recovery handling.
+ */
+#define        XFS_BLF_UDQUOT_BUF      (1<<2)
+#define XFS_BLF_PDQUOT_BUF     (1<<3)
+#define        XFS_BLF_GDQUOT_BUF      (1<<4)
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.  The data map describes which 128 byte chunks of the buffer
+ * have been logged.
+ */
+#define XFS_BLF_DATAMAP_SIZE   ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+
+typedef struct xfs_buf_log_format {
+       unsigned short  blf_type;       /* buf log item type indicator */
+       unsigned short  blf_size;       /* size of this item */
+       ushort          blf_flags;      /* misc state */
+       ushort          blf_len;        /* number of blocks in this buf */
+       __int64_t       blf_blkno;      /* starting blkno of this buf */
+       unsigned int    blf_map_size;   /* used size of data bitmap in words */
+       unsigned int    blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
+} xfs_buf_log_format_t;
+
+/*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS  5
+#define XFS_BLFT_SHIFT 11
+#define XFS_BLFT_MASK  (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+       XFS_BLFT_UNKNOWN_BUF = 0,
+       XFS_BLFT_UDQUOT_BUF,
+       XFS_BLFT_PDQUOT_BUF,
+       XFS_BLFT_GDQUOT_BUF,
+       XFS_BLFT_BTREE_BUF,
+       XFS_BLFT_AGF_BUF,
+       XFS_BLFT_AGFL_BUF,
+       XFS_BLFT_AGI_BUF,
+       XFS_BLFT_DINO_BUF,
+       XFS_BLFT_SYMLINK_BUF,
+       XFS_BLFT_DIR_BLOCK_BUF,
+       XFS_BLFT_DIR_DATA_BUF,
+       XFS_BLFT_DIR_FREE_BUF,
+       XFS_BLFT_DIR_LEAF1_BUF,
+       XFS_BLFT_DIR_LEAFN_BUF,
+       XFS_BLFT_DA_NODE_BUF,
+       XFS_BLFT_ATTR_LEAF_BUF,
+       XFS_BLFT_ATTR_RMT_BUF,
+       XFS_BLFT_SB_BUF,
+       XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+       ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+       blf->blf_flags &= ~XFS_BLFT_MASK;
+       blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+       return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
+ * EFI/EFD log format definitions
+ */
+typedef struct xfs_extent {
+       xfs_dfsbno_t    ext_start;
+       xfs_extlen_t    ext_len;
+} xfs_extent_t;
+
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+typedef struct xfs_extent_32 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+       __uint64_t      ext_start;
+       __uint32_t      ext_len;
+       __uint32_t      ext_pad;
+} xfs_extent_64_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.  The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ *
+ * The three variants below have the same header fields and differ only in
+ * the element type of efi_extents: xfs_extent_t, xfs_extent_32_t (with the
+ * whole structure packed) or xfs_extent_64_t.  The array is declared with
+ * one element in each of them.
+ */
+typedef struct xfs_efi_log_format {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_t            efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+typedef struct xfs_efi_log_format_32 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+       __uint16_t              efi_type;       /* efi log item type */
+       __uint16_t              efi_size;       /* size of this item */
+       __uint32_t              efi_nextents;   /* # extents to free */
+       __uint64_t              efi_id;         /* efi identifier */
+       xfs_extent_64_t         efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_64_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.  The efd_extents array is a variable size array whose
+ * size is given by efd_nextents.
+ *
+ * The three variants below have the same header fields and differ only in
+ * the element type of efd_extents: xfs_extent_t, xfs_extent_32_t (with the
+ * whole structure packed) or xfs_extent_64_t.  The array is declared with
+ * one element in each of them.
+ */
+typedef struct xfs_efd_log_format {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_t            efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+typedef struct xfs_efd_log_format_32 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+       __uint16_t              efd_type;       /* efd log item type */
+       __uint16_t              efd_size;       /* size of this item */
+       __uint32_t              efd_nextents;   /* # of extents freed */
+       __uint64_t              efd_efi_id;     /* id of corresponding efi */
+       xfs_extent_64_t         efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_64_t;
+
+/*
+ * Dquot Log format definitions.
+ *
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * Here those are qlf_type and qlf_size, 16 bits each.
+ */
+typedef struct xfs_dq_logformat {
+       __uint16_t              qlf_type;      /* dquot log item type */
+       __uint16_t              qlf_size;      /* size of this item */
+       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
+       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
+       __int32_t               qlf_len;       /* len of dquot buffer */
+       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ *
+ * NOTE(review): unlike the other formats here, the fields are declared with
+ * plain C types rather than fixed-width ones, so the 32 bit requirement
+ * above holds only where unsigned short is 16 bits wide.
+ */
+typedef struct xfs_qoff_logformat {
+       unsigned short          qf_type;        /* quotaoff log item type */
+       unsigned short          qf_size;        /* size of this item */
+       unsigned int            qf_flags;       /* USR and/or GRP */
+       char                    qf_pad[12];     /* padding for future */
+} xfs_qoff_logformat_t;
+
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
+#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
+#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
+#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
+#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
+#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
+
+/*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ *
+ * The separate group and project flags below are the ones used to build
+ * the XFS_ALL_QUOTA_* and XFS_MOUNT_QUOTA_ALL masks; the combined
+ * XFS_OQUOTA_* flags are not part of any of those masks.
+ */
+#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
+
+#define XFS_ALL_QUOTA_ACCT     \
+               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD     \
+               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD     \
+               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
+
+#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+                                XFS_PQUOTA_CHKD)
+
+/*
+ * Inode create log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ *
+ * Only icl_type and icl_size are host order; every other field is declared
+ * big endian (__be32).
+ */
+struct xfs_icreate_log {
+       __uint16_t      icl_type;       /* type of log format structure */
+       __uint16_t      icl_size;       /* size of log format structure */
+       __be32          icl_ag;         /* ag being allocated in */
+       __be32          icl_agbno;      /* start block of inode range */
+       __be32          icl_count;      /* number of inodes to initialise */
+       __be32          icl_isize;      /* size of inodes */
+       __be32          icl_length;     /* length of extent to initialise */
+       __be32          icl_gen;        /* inode generation number to use */
+};
+
+int    xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int    xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+#endif /* __XFS_LOG_FORMAT_H__ */
index b9ea262dd1c2c7575fee738fba6e71575180924a..136654b9400df9b28a40415b1fbca3c9b7d6a958 100644 (file)
@@ -24,51 +24,13 @@ struct xlog_ticket;
 struct xfs_mount;
 
 /*
- * Macros, structures, prototypes for internal log manager use.
+ * Flags for log structure
  */
-
-#define XLOG_MIN_ICLOGS                2
-#define XLOG_MAX_ICLOGS                8
-#define XLOG_HEADER_MAGIC_NUM  0xFEEDbabe      /* Invalid cycle number */
-#define XLOG_VERSION_1         1
-#define XLOG_VERSION_2         2               /* Large IClogs, Log sunit */
-#define XLOG_VERSION_OKBITS    (XLOG_VERSION_1 | XLOG_VERSION_2)
-#define XLOG_MIN_RECORD_BSIZE  (16*1024)       /* eventually 32k */
-#define XLOG_BIG_RECORD_BSIZE  (32*1024)       /* 32k buffers */
-#define XLOG_MAX_RECORD_BSIZE  (256*1024)
-#define XLOG_HEADER_CYCLE_SIZE (32*1024)       /* cycle data in header */
-#define XLOG_MIN_RECORD_BSHIFT 14              /* 16384 == 1 << 14 */
-#define XLOG_BIG_RECORD_BSHIFT 15              /* 32k == 1 << 15 */
-#define XLOG_MAX_RECORD_BSHIFT 18              /* 256k == 1 << 18 */
-#define XLOG_BTOLSUNIT(log, b)  (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
-                                 (log)->l_mp->m_sb.sb_logsunit)
-#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
-
-#define XLOG_HEADER_SIZE       512
-
-#define XLOG_REC_SHIFT(log) \
-       BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-#define XLOG_TOTAL_REC_SHIFT(log) \
-       BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
-        XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
-
-static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
-{
-       return ((xfs_lsn_t)cycle << 32) | block;
-}
-
-static inline uint xlog_get_cycle(char *ptr)
-{
-       if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
-               return be32_to_cpu(*((__be32 *)ptr + 1));
-       else
-               return be32_to_cpu(*(__be32 *)ptr);
-}
-
-#define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
-
-#ifdef __KERNEL__
+#define XLOG_ACTIVE_RECOVERY   0x2     /* in the middle of recovery */
+#define        XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
+#define XLOG_IO_ERROR          0x8     /* log hit an I/O error, and being
+                                          shutdown */
+#define XLOG_TAIL_WARN         0x10    /* log tail verify warning issued */
 
 /*
  * get client id from packed copy.
@@ -101,27 +63,7 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
 #define XLOG_STATE_ALL      0x7FFF /* All possible valid flags */
 #define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
-#endif /* __KERNEL__ */
 
-/*
- * Flags to log operation header
- *
- * The first write of a new transaction will be preceded with a start
- * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
- * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
- * the remainder of the current active in-core log, it is split up into
- * multiple regions.  Each partial region will be marked with a
- * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
- *
- */
-#define XLOG_START_TRANS       0x01    /* Start a new transaction */
-#define XLOG_COMMIT_TRANS      0x02    /* Commit this transaction */
-#define XLOG_CONTINUE_TRANS    0x04    /* Cont this trans into new region */
-#define XLOG_WAS_CONT_TRANS    0x08    /* Cont this trans into new region */
-#define XLOG_END_TRANS         0x10    /* End a continued transaction */
-#define XLOG_UNMOUNT_TRANS     0x20    /* Unmount a filesystem transaction */
-
-#ifdef __KERNEL__
 /*
  * Flags to log ticket
  */
@@ -132,22 +74,6 @@ static inline uint xlog_get_client_id(__be32 i)
        { XLOG_TIC_INITED,      "XLOG_TIC_INITED" }, \
        { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
 
-#endif /* __KERNEL__ */
-
-#define XLOG_UNMOUNT_TYPE      0x556e  /* Un for Unmount */
-
-/*
- * Flags for log structure
- */
-#define XLOG_ACTIVE_RECOVERY   0x2     /* in the middle of recovery */
-#define        XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
-#define XLOG_IO_ERROR          0x8     /* log hit an I/O error, and being
-                                          shutdown */
-#define XLOG_TAIL_WARN         0x10    /* log tail verify warning issued */
-
-typedef __uint32_t xlog_tid_t;
-
-#ifdef __KERNEL__
 /*
  * Below are states for covering allocation transactions.
  * By covering, we mean changing the h_tail_lsn in the last on-disk
@@ -223,7 +149,6 @@ typedef __uint32_t xlog_tid_t;
 
 #define XLOG_COVER_OPS         5
 
-
 /* Ticket reservation region accounting */ 
 #define XLOG_TIC_LEN_MAX       15
 
@@ -258,64 +183,6 @@ typedef struct xlog_ticket {
        xlog_res_t         t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : 8 * 15 */ 
 } xlog_ticket_t;
 
-#endif
-
-
-typedef struct xlog_op_header {
-       __be32     oh_tid;      /* transaction id of operation  :  4 b */
-       __be32     oh_len;      /* bytes in data region         :  4 b */
-       __u8       oh_clientid; /* who sent me this             :  1 b */
-       __u8       oh_flags;    /*                              :  1 b */
-       __u16      oh_res2;     /* 32 bit align                 :  2 b */
-} xlog_op_header_t;
-
-
-/* valid values for h_fmt */
-#define XLOG_FMT_UNKNOWN  0
-#define XLOG_FMT_LINUX_LE 1
-#define XLOG_FMT_LINUX_BE 2
-#define XLOG_FMT_IRIX_BE  3
-
-/* our fmt */
-#ifdef XFS_NATIVE_HOST
-#define XLOG_FMT XLOG_FMT_LINUX_BE
-#else
-#define XLOG_FMT XLOG_FMT_LINUX_LE
-#endif
-
-typedef struct xlog_rec_header {
-       __be32    h_magicno;    /* log record (LR) identifier           :  4 */
-       __be32    h_cycle;      /* write cycle of log                   :  4 */
-       __be32    h_version;    /* LR version                           :  4 */
-       __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
-       __be64    h_lsn;        /* lsn of this LR                       :  8 */
-       __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
-       __le32    h_crc;        /* crc of log record                    :  4 */
-       __be32    h_prev_block; /* block number to previous LR          :  4 */
-       __be32    h_num_logops; /* number of log operations in this LR  :  4 */
-       __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
-       /* new fields */
-       __be32    h_fmt;        /* format of log record                 :  4 */
-       uuid_t    h_fs_uuid;    /* uuid of FS                           : 16 */
-       __be32    h_size;       /* iclog size                           :  4 */
-} xlog_rec_header_t;
-
-typedef struct xlog_rec_ext_header {
-       __be32    xh_cycle;     /* write cycle of log                   : 4 */
-       __be32    xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*    : 256 */
-} xlog_rec_ext_header_t;
-
-#ifdef __KERNEL__
-
-/*
- * Quite misnamed, because this union lays out the actual on-disk log buffer.
- */
-typedef union xlog_in_core2 {
-       xlog_rec_header_t       hic_header;
-       xlog_rec_ext_header_t   hic_xheader;
-       char                    hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
-
 /*
  * - A log record header is 512 bytes.  There is plenty of room to grow the
  *     xlog_rec_header_t into the reserved space.
@@ -411,14 +278,17 @@ struct xfs_cil {
        struct xlog             *xc_log;
        struct list_head        xc_cil;
        spinlock_t              xc_cil_lock;
+
+       struct rw_semaphore     xc_ctx_lock ____cacheline_aligned_in_smp;
        struct xfs_cil_ctx      *xc_ctx;
-       struct rw_semaphore     xc_ctx_lock;
+
+       spinlock_t              xc_push_lock ____cacheline_aligned_in_smp;
+       xfs_lsn_t               xc_push_seq;
        struct list_head        xc_committing;
        wait_queue_head_t       xc_commit_wait;
        xfs_lsn_t               xc_current_sequence;
        struct work_struct      xc_push_work;
-       xfs_lsn_t               xc_push_seq;
-};
+} ____cacheline_aligned_in_smp;
 
 /*
  * The amount of log space we allow the CIL to aggregate is difficult to size.
@@ -686,6 +556,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
        schedule();
        remove_wait_queue(wq, &wait);
 }
-#endif /* __KERNEL__ */
 
 #endif /* __XFS_LOG_PRIV_H__ */
index 7681b19aa5dc565a9807005ff3f1d6428a22f016..7c0c1fdc728b4ff6e18da1a4f0b46edde2337e4e 100644 (file)
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -41,7 +41,6 @@
 #include "xfs_extfree_item.h"
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
-#include "xfs_utils.h"
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_da_btree.h"
 #include "xfs_dir2_format.h"
-#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_attr_remote.h"
 
+#define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)      /* args not parenthesised */
+
 STATIC int
 xlog_find_zeroed(
        struct xlog     *,
@@ -607,7 +608,7 @@ out:
 
 /*
  * Head is defined to be the point of the log where the next log write
- * write could go.  This means that incomplete LR writes at the end are
+ * could go.  This means that incomplete LR writes at the end are
  * eliminated when calculating the head.  We aren't guaranteed that previous
  * LR have complete transactions.  We only know that a cycle number of
  * current cycle number -1 won't be present in the log if we start writing
@@ -963,6 +964,7 @@ xlog_find_tail(
        }
        if (!found) {
                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+               xlog_put_bp(bp);
                ASSERT(0);
                return XFS_ERROR(EIO);
        }
@@ -1144,7 +1146,8 @@ xlog_find_zeroed(
                 */
                xfs_warn(log->l_mp,
                        "Log inconsistent or not a log (last==0, first!=1)");
-               return XFS_ERROR(EINVAL);
+               error = XFS_ERROR(EINVAL);
+               goto bp_err;
        }
 
        /* we have a partially zeroed log */
@@ -1766,19 +1769,11 @@ xlog_recover_buffer_pass1(
 
 /*
  * Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table.  If it does then return 1
- * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it does, return the buffer
+ * cancel record to the caller.
  */
-STATIC int
-xlog_check_buffer_cancelled(
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
        struct xlog             *log,
        xfs_daddr_t             blkno,
        uint                    len,
@@ -1787,22 +1782,16 @@ xlog_check_buffer_cancelled(
        struct list_head        *bucket;
        struct xfs_buf_cancel   *bcp;
 
-       if (log->l_buf_cancel_table == NULL) {
-               /*
-                * There is nothing in the table built in pass one,
-                * so this buffer must not be cancelled.
-                */
+       if (!log->l_buf_cancel_table) {
+               /* empty table means no cancelled buffers in the log */
                ASSERT(!(flags & XFS_BLF_CANCEL));
-               return 0;
+               return NULL;
        }
 
-       /*
-        * Search for an entry in the  cancel table that matches our buffer.
-        */
        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
        list_for_each_entry(bcp, bucket, bc_list) {
                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
-                       goto found;
+                       return bcp;
        }
 
        /*
@@ -1810,9 +1799,32 @@ xlog_check_buffer_cancelled(
         * that the buffer is NOT cancelled.
         */
        ASSERT(!(flags & XFS_BLF_CANCEL));
-       return 0;
+       return NULL;
+}
+
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0.  If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
+STATIC int
+xlog_check_buffer_cancelled(
+       struct xlog             *log,
+       xfs_daddr_t             blkno,
+       uint                    len,
+       ushort                  flags)
+{
+       struct xfs_buf_cancel   *bcp;
+
+       bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+       if (!bcp)
+               return 0;
 
-found:
        /*
         * We've go a match, so return 1 so that the recovery of this buffer
         * is cancelled.  If this buffer is actually a buffer cancel log
@@ -1946,6 +1958,104 @@ xlog_recover_do_inode_buffer(
        return 0;
 }
 
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number.  If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * The magic is looked for in three places, in order: a 32 bit value at the
+ * start of the block, the 16 bit magic of a struct xfs_da_blkinfo header, and
+ * a 16 bit value at the start of the block.  -1 is also returned for
+ * filesystems without CRCs and for dquot and inode buffers.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       __uint32_t              magic32;
+       __uint16_t              magic16;
+       __uint16_t              magicda;
+       void                    *blk = bp->b_addr;
+
+       /* v4 filesystems always recover immediately */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               goto recover_immediately;
+
+       magic32 = be32_to_cpu(*(__be32 *)blk);
+       switch (magic32) {
+       case XFS_ABTB_CRC_MAGIC:
+       case XFS_ABTC_CRC_MAGIC:
+       case XFS_ABTB_MAGIC:
+       case XFS_ABTC_MAGIC:
+       case XFS_IBT_CRC_MAGIC:
+       case XFS_IBT_MAGIC:
+               return be64_to_cpu(
+                               ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
+       case XFS_BMAP_CRC_MAGIC:
+       case XFS_BMAP_MAGIC:
+               return be64_to_cpu(
+                               ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
+       case XFS_AGF_MAGIC:
+               return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+       case XFS_AGFL_MAGIC:
+               return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+       case XFS_AGI_MAGIC:
+               return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+       case XFS_SYMLINK_MAGIC:
+               return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+       case XFS_DIR3_BLOCK_MAGIC:
+       case XFS_DIR3_DATA_MAGIC:
+       case XFS_DIR3_FREE_MAGIC:
+               return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+       case XFS_ATTR3_RMT_MAGIC:
+               return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+       case XFS_SB_MAGIC:
+               return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+       default:
+               break;
+       }
+
+       magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+       switch (magicda) {
+       case XFS_DIR3_LEAF1_MAGIC:
+       case XFS_DIR3_LEAFN_MAGIC:
+       case XFS_DA3_NODE_MAGIC:
+               return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+       default:
+               break;
+       }
+
+       /*
+        * We do individual object checks on dquot and inode buffers as they
+        * have their own individual LSN records. Also, we could have a stale
+        * buffer here, so we have to at least recognise these buffer types.
+        *
+        * A noted complexity here is inode unlinked list processing - it logs
+        * the inode directly in the buffer, but we don't know which inodes have
+        * been modified, and there is no global buffer LSN. Hence we need to
+        * recover all inode buffer types immediately. This problem will be
+        * fixed by logical logging of the unlinked list modifications.
+        */
+       magic16 = be16_to_cpu(*(__be16 *)blk);
+       switch (magic16) {
+       case XFS_DQUOT_MAGIC:
+       case XFS_DINODE_MAGIC:
+               goto recover_immediately;
+       default:
+               break;
+       }
+
+       /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+       return (xfs_lsn_t)-1;
+
+}
+
 /*
  * Validate the recovered buffer is of the correct type and attach the
  * appropriate buffer operations to them for writeback. Magic numbers are in a
@@ -1955,7 +2065,7 @@ xlog_recover_do_inode_buffer(
  *     inside a struct xfs_da_blkinfo at the start of the buffer.
  */
 static void
-xlog_recovery_validate_buf_type(
+xlog_recover_validate_buf_type(
        struct xfs_mount        *mp,
        struct xfs_buf          *bp,
        xfs_buf_log_format_t    *buf_f)
@@ -2234,7 +2344,7 @@ xlog_recover_do_reg_buffer(
         * just avoid the verification stage for non-crc filesystems
         */
        if (xfs_sb_version_hascrc(&mp->m_sb))
-               xlog_recovery_validate_buf_type(mp, bp, buf_f);
+               xlog_recover_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2366,7 +2476,7 @@ xfs_qm_dqcheck(
 
 /*
  * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
  * Else, treat it as a regular buffer and do recovery.
  */
@@ -2425,20 +2535,22 @@ xlog_recover_do_dquot_buffer(
  * over the log during recovery.  During the first we build a table of
  * those buffers which have been cancelled, and during the second we
  * only replay those buffers which do not have corresponding cancel
- * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table.  See xlog_recover_buffer_pass[1,2] above
  * for more details on the implementation of the table of cancel records.
  */
 STATIC int
 xlog_recover_buffer_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
        int                     error;
        uint                    buf_flags;
+       xfs_lsn_t               lsn;
 
        /*
         * In this pass we only want to recover all the buffers which have
@@ -2463,10 +2575,17 @@ xlog_recover_buffer_pass2(
        error = bp->b_error;
        if (error) {
                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
-               xfs_buf_relse(bp);
-               return error;
+               goto out_release;
        }
 
+       /*
+        * Skip replay only if the buffer has a valid LSN (not 0 or -1) that is
+        * at or after the LSN of the transaction we are replaying.
+        */
+       lsn = xlog_recover_get_buf_lsn(mp, bp);
+       if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+               goto out_release;
+
        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
        } else if (buf_f->blf_flags &
@@ -2476,7 +2595,7 @@ xlog_recover_buffer_pass2(
                xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
        }
        if (error)
-               return XFS_ERROR(error);
+               goto out_release;
 
        /*
         * Perform delayed write on the buffer.  Asynchronous writes will be
@@ -2505,6 +2624,7 @@ xlog_recover_buffer_pass2(
                xfs_buf_delwri_queue(bp, buffer_list);
        }
 
+out_release:
        xfs_buf_relse(bp);
        return error;
 }
@@ -2513,7 +2633,8 @@ STATIC int
 xlog_recover_inode_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_inode_log_format_t  *in_f;
        xfs_mount_t             *mp = log->l_mp;
@@ -2592,6 +2713,20 @@ xlog_recover_inode_pass2(
                goto error;
        }
 
+       /*
+        * If the inode has an LSN in it, recover the inode only if it's less
+        * than the lsn of the transaction we are replaying.
+        */
+       if (dip->di_version >= 3) {
+               xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       trace_xfs_log_recover_inode_skip(log, in_f);
+                       error = 0;
+                       goto out_release;
+               }
+       }
+
        /*
         * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
         * are transactional and if ordering is necessary we can determine that
@@ -2781,6 +2916,8 @@ write_inode_buffer:
        ASSERT(bp->b_target->bt_mount == mp);
        bp->b_iodone = xlog_recover_iodone;
        xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
        xfs_buf_relse(bp);
 error:
        if (need_free)
@@ -2822,7 +2959,8 @@ STATIC int
 xlog_recover_dquot_pass2(
        struct xlog                     *log,
        struct list_head                *buffer_list,
-       struct xlog_recover_item        *item)
+       struct xlog_recover_item        *item,
+       xfs_lsn_t                       current_lsn)
 {
        xfs_mount_t             *mp = log->l_mp;
        xfs_buf_t               *bp;
@@ -2896,6 +3034,19 @@ xlog_recover_dquot_pass2(
                return XFS_ERROR(EIO);
        }
 
+       /*
+        * If the dquot has an LSN in it, recover the dquot only if it's less
+        * than the lsn of the transaction we are replaying.
+        */
+       if (xfs_sb_version_hascrc(&mp->m_sb)) {
+               struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+               xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
+
+               if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+                       goto out_release;
+               }
+       }
+
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2906,9 +3057,10 @@ xlog_recover_dquot_pass2(
        ASSERT(bp->b_target->bt_mount == mp);
        bp->b_iodone = xlog_recover_iodone;
        xfs_buf_delwri_queue(bp, buffer_list);
-       xfs_buf_relse(bp);
 
-       return (0);
+out_release:
+       xfs_buf_relse(bp);
+       return 0;
 }
 
 /*
@@ -3116,6 +3268,106 @@ xlog_recover_free_trans(
        kmem_free(trans);
 }
 
+STATIC void    /* readahead for an XFS_LI_BUF item; see xlog_recover_ra_pass2 */
+xlog_recover_buffer_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
+       struct xfs_mount                *mp = log->l_mp;
+
+       if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+                       buf_f->blf_len, buf_f->blf_flags)) {
+               return;         /* a cancel record exists: no readahead */
+       }
+
+       xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+                               buf_f->blf_len, NULL);
+}
+
+STATIC void    /* readahead for an XFS_LI_INODE item; see xlog_recover_ra_pass2 */
+xlog_recover_inode_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_inode_log_format     ilf_buf;        /* conversion target */
+       struct xfs_inode_log_format     *ilfp;
+       struct xfs_mount                *mp = log->l_mp;
+       int                     error;
+
+       if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+               ilfp = item->ri_buf[0].i_addr;  /* size matches: use in place */
+       } else {
+               ilfp = &ilf_buf;        /* otherwise convert into the local copy */
+               memset(ilfp, 0, sizeof(*ilfp));
+               error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+               if (error)
+                       return;         /* conversion failed: no readahead */
+       }
+
+       if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+               return;         /* a cancel record exists: no readahead */
+
+       xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+                               ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void    /* readahead for an XFS_LI_DQUOT item; see xlog_recover_ra_pass2 */
+xlog_recover_dquot_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       struct xfs_mount        *mp = log->l_mp;
+       struct xfs_disk_dquot   *recddq;
+       struct xfs_dq_logformat *dq_f;
+       uint                    type;
+
+
+       if (mp->m_qflags == 0)
+               return;         /* no quota flags set on this mount */
+
+       recddq = item->ri_buf[1].i_addr;        /* region 1: the disk dquot */
+       if (recddq == NULL)
+               return;
+       if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+               return;         /* region too short to hold a disk dquot */
+
+       type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+       ASSERT(type);
+       if (log->l_quotaoffs_flag & type)
+               return;         /* this type is set in l_quotaoffs_flag */
+
+       dq_f = item->ri_buf[0].i_addr;          /* region 0: the log format */
+       ASSERT(dq_f);
+       ASSERT(dq_f->qlf_len == 1);
+
+       xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+                         XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void    /* dispatch readahead for one recovery item by its item type */
+xlog_recover_ra_pass2(
+       struct xlog                     *log,
+       struct xlog_recover_item        *item)
+{
+       switch (ITEM_TYPE(item)) {
+       case XFS_LI_BUF:
+               xlog_recover_buffer_ra_pass2(log, item);
+               break;
+       case XFS_LI_INODE:
+               xlog_recover_inode_ra_pass2(log, item);
+               break;
+       case XFS_LI_DQUOT:
+               xlog_recover_dquot_ra_pass2(log, item);
+               break;
+       case XFS_LI_EFI:
+       case XFS_LI_EFD:
+       case XFS_LI_QUOTAOFF:
+       default:
+               break;          /* no readahead is issued for these types */
+       }
+}
+
 STATIC int
 xlog_recover_commit_pass1(
        struct xlog                     *log,
@@ -3155,15 +3407,18 @@ xlog_recover_commit_pass2(
 
        switch (ITEM_TYPE(item)) {
        case XFS_LI_BUF:
-               return xlog_recover_buffer_pass2(log, buffer_list, item);
+               return xlog_recover_buffer_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
        case XFS_LI_INODE:
-               return xlog_recover_inode_pass2(log, buffer_list, item);
+               return xlog_recover_inode_pass2(log, buffer_list, item,
+                                                trans->r_lsn);
        case XFS_LI_EFI:
                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
        case XFS_LI_EFD:
                return xlog_recover_efd_pass2(log, item);
        case XFS_LI_DQUOT:
-               return xlog_recover_dquot_pass2(log, buffer_list, item);
+               return xlog_recover_dquot_pass2(log, buffer_list, item,
+                                               trans->r_lsn);
        case XFS_LI_ICREATE:
                return xlog_recover_do_icreate_pass2(log, buffer_list, item);
        case XFS_LI_QUOTAOFF:
@@ -3177,6 +3432,26 @@ xlog_recover_commit_pass2(
        }
 }
 
+STATIC int
+xlog_recover_items_pass2(
+       struct xlog                     *log,
+       struct xlog_recover             *trans,
+       struct list_head                *buffer_list,
+       struct list_head                *item_list)
+{
+       struct xlog_recover_item        *item;
+       int                             error = 0;
+
+       list_for_each_entry(item, item_list, ri_list) {
+               error = xlog_recover_commit_pass2(log, trans,
+                                         buffer_list, item);
+               if (error)
+                       return error;
+       }
+
+       return error;
+}
+
 /*
  * Perform the transaction.
  *
@@ -3189,9 +3464,16 @@ xlog_recover_commit_trans(
        struct xlog_recover     *trans,
        int                     pass)
 {
-       int                     error = 0, error2;
-       xlog_recover_item_t     *item;
-       LIST_HEAD               (buffer_list);
+       int                             error = 0;
+       int                             error2;
+       int                             items_queued = 0;
+       struct xlog_recover_item        *item;
+       struct xlog_recover_item        *next;
+       LIST_HEAD                       (buffer_list);
+       LIST_HEAD                       (ra_list);
+       LIST_HEAD                       (done_list);
+
+       #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
 
        hlist_del(&trans->r_list);
 
@@ -3199,14 +3481,22 @@ xlog_recover_commit_trans(
        if (error)
                return error;
 
-       list_for_each_entry(item, &trans->r_itemq, ri_list) {
+       list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
                switch (pass) {
                case XLOG_RECOVER_PASS1:
                        error = xlog_recover_commit_pass1(log, trans, item);
                        break;
                case XLOG_RECOVER_PASS2:
-                       error = xlog_recover_commit_pass2(log, trans,
-                                                         &buffer_list, item);
+                       xlog_recover_ra_pass2(log, item);
+                       list_move_tail(&item->ri_list, &ra_list);
+                       items_queued++;
+                       if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+                               error = xlog_recover_items_pass2(log, trans,
+                                               &buffer_list, &ra_list);
+                               list_splice_tail_init(&ra_list, &done_list);
+                               items_queued = 0;
+                       }
+
                        break;
                default:
                        ASSERT(0);
@@ -3216,9 +3506,19 @@ xlog_recover_commit_trans(
                        goto out;
        }
 
+out:
+       if (!list_empty(&ra_list)) {
+               if (!error)
+                       error = xlog_recover_items_pass2(log, trans,
+                                       &buffer_list, &ra_list);
+               list_splice_tail_init(&ra_list, &done_list);
+       }
+
+       if (!list_empty(&done_list))
+               list_splice_init(&done_list, &trans->r_itemq);
+
        xlog_recover_free_trans(trans);
 
-out:
        error2 = xfs_buf_delwri_submit(&buffer_list);
        return error ? error : error2;
 }
@@ -3376,7 +3676,7 @@ xlog_recover_process_efi(
        }
 
        tp = xfs_trans_alloc(mp, 0);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
                goto abort_error;
        efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3482,8 +3782,7 @@ xlog_recover_clear_agi_bucket(
        int             error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
-                                 0, 0, 0);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
        if (error)
                goto out_abort;
 
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
new file mode 100644 (file)
index 0000000..bbcec0b
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2013 Jie Liu.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr_leaf.h"
+
+/*
+ * Calculate the maximum length in bytes that would be required for a local
+ * attribute value, as large attributes stored out of line are not logged.
+ */
+STATIC int
+xfs_log_calc_max_attrsetm_res(
+       struct xfs_mount        *mp)
+{
+       int                     size;
+       int                     nblks;
+
+       size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
+              MAXNAMELEN - 1;
+       nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+       nblks += XFS_B_TO_FSB(mp, size);
+       nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
+
+       return  M_RES(mp)->tr_attrsetm.tr_logres +
+               M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
+}
+
+/*
+ * Iterate over the log space reservation table to find and return the
+ * largest reservation, based on the values that were pre-calculated at
+ * mount time.
+ */
+STATIC void
+xfs_log_get_max_trans_res(
+       struct xfs_mount        *mp,
+       struct xfs_trans_res    *max_resp)
+{
+       struct xfs_trans_res    *resp;
+       struct xfs_trans_res    *end_resp;
+       int                     log_space = 0;
+       int                     attr_space;
+
+       attr_space = xfs_log_calc_max_attrsetm_res(mp);
+
+       resp = (struct xfs_trans_res *)M_RES(mp);
+       end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
+       for (; resp < end_resp; resp++) {
+               int             tmp = resp->tr_logcount > 1 ?
+                                     resp->tr_logres * resp->tr_logcount :
+                                     resp->tr_logres;
+               if (log_space < tmp) {
+                       log_space = tmp;
+                       *max_resp = *resp;              /* struct copy */
+               }
+       }
+
+       if (attr_space > log_space) {
+               *max_resp = M_RES(mp)->tr_attrsetm;     /* struct copy */
+               max_resp->tr_logres = attr_space;
+       }
+}
+
+/*
+ * Calculate the minimum valid log size for the given superblock configuration.
+ * Used to calculate the minimum log size at mkfs time, and to determine if
+ * the log is large enough or not at mount time. Returns the minimum size in
+ * filesystem block size units.
+ */
+int
+xfs_log_calc_minimum_size(
+       struct xfs_mount        *mp)
+{
+       struct xfs_trans_res    tres = {0};
+       int                     max_logres;
+       int                     min_logblks = 0;
+       int                     lsunit = 0;
+
+       xfs_log_get_max_trans_res(mp, &tres);
+
+       max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
+       if (tres.tr_logcount > 1)
+               max_logres *= tres.tr_logcount;
+
+       if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
+               lsunit = BTOBB(mp->m_sb.sb_logsunit);
+
+       /*
+        * Two factors should be taken into account for calculating the minimum
+        * log space.
+        * 1) The fundamental limitation is that no single transaction can be
+        *    larger than half the size of the log.
+        *
+        *    From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
+        *    define, which is set to 3. That means we can definitely fit
+        *    two maximally sized transactions in the log. We'll use this same
+        *    value here.
+        *
+        * 2) If the lsunit option is specified, a transaction requires 2 LSU
+        *    for the reservation because there are two log writes that can
+        *    require padding - the transaction data and the commit record which
+        *    are written separately and both can require padding to the LSU.
+        *    Consider that we can have an active CIL reservation holding 2*LSU,
+        *    but the CIL is not over a push threshold, in this case, if we
+        *    don't have enough log space for at least one new transaction,
+        *    which includes another 2*LSU in the reservation, we will run into
+        *    a dead loop situation in the log space grant procedure, i.e.
+        *    xlog_grant_head_wait().
+        *
+        *    Hence the log size needs to be able to contain two maximally sized
+        *    and padded transactions, which is (2 * (2 * LSU + maxlres)).
+        *
+        * Also, the log size should be a multiple of the log stripe unit, so
+        * round it up to an lsunit boundary if lsunit is specified.
+        */
+       if (lsunit) {
+               min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
+                             2 * lsunit;
+       } else
+               min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
+       min_logblks *= XFS_MIN_LOG_FACTOR;
+
+       return XFS_BB_TO_FSB(mp, min_logblks);
+}
index 2b0ba358165619b87523315f1ca3940b4e2606c0..5dcc68019d1bc8c49a799695436045d69d49167f 100644 (file)
@@ -17,7 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
@@ -40,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_fsops.h"
-#include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_cksum.h"
@@ -59,69 +60,6 @@ STATIC void  xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 #define xfs_icsb_balance_counter_locked(mp, a, b)      do { } while (0)
 #endif
 
-static const struct {
-       short offset;
-       short type;     /* 0 = integer
-                        * 1 = binary / string (no translation)
-                        */
-} xfs_sb_info[] = {
-    { offsetof(xfs_sb_t, sb_magicnum),   0 },
-    { offsetof(xfs_sb_t, sb_blocksize),  0 },
-    { offsetof(xfs_sb_t, sb_dblocks),    0 },
-    { offsetof(xfs_sb_t, sb_rblocks),    0 },
-    { offsetof(xfs_sb_t, sb_rextents),   0 },
-    { offsetof(xfs_sb_t, sb_uuid),       1 },
-    { offsetof(xfs_sb_t, sb_logstart),   0 },
-    { offsetof(xfs_sb_t, sb_rootino),    0 },
-    { offsetof(xfs_sb_t, sb_rbmino),     0 },
-    { offsetof(xfs_sb_t, sb_rsumino),    0 },
-    { offsetof(xfs_sb_t, sb_rextsize),   0 },
-    { offsetof(xfs_sb_t, sb_agblocks),   0 },
-    { offsetof(xfs_sb_t, sb_agcount),    0 },
-    { offsetof(xfs_sb_t, sb_rbmblocks),  0 },
-    { offsetof(xfs_sb_t, sb_logblocks),  0 },
-    { offsetof(xfs_sb_t, sb_versionnum), 0 },
-    { offsetof(xfs_sb_t, sb_sectsize),   0 },
-    { offsetof(xfs_sb_t, sb_inodesize),  0 },
-    { offsetof(xfs_sb_t, sb_inopblock),  0 },
-    { offsetof(xfs_sb_t, sb_fname[0]),   1 },
-    { offsetof(xfs_sb_t, sb_blocklog),   0 },
-    { offsetof(xfs_sb_t, sb_sectlog),    0 },
-    { offsetof(xfs_sb_t, sb_inodelog),   0 },
-    { offsetof(xfs_sb_t, sb_inopblog),   0 },
-    { offsetof(xfs_sb_t, sb_agblklog),   0 },
-    { offsetof(xfs_sb_t, sb_rextslog),   0 },
-    { offsetof(xfs_sb_t, sb_inprogress), 0 },
-    { offsetof(xfs_sb_t, sb_imax_pct),   0 },
-    { offsetof(xfs_sb_t, sb_icount),     0 },
-    { offsetof(xfs_sb_t, sb_ifree),      0 },
-    { offsetof(xfs_sb_t, sb_fdblocks),   0 },
-    { offsetof(xfs_sb_t, sb_frextents),  0 },
-    { offsetof(xfs_sb_t, sb_uquotino),   0 },
-    { offsetof(xfs_sb_t, sb_gquotino),   0 },
-    { offsetof(xfs_sb_t, sb_qflags),     0 },
-    { offsetof(xfs_sb_t, sb_flags),      0 },
-    { offsetof(xfs_sb_t, sb_shared_vn),  0 },
-    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
-    { offsetof(xfs_sb_t, sb_unit),      0 },
-    { offsetof(xfs_sb_t, sb_width),     0 },
-    { offsetof(xfs_sb_t, sb_dirblklog),         0 },
-    { offsetof(xfs_sb_t, sb_logsectlog), 0 },
-    { offsetof(xfs_sb_t, sb_logsectsize),0 },
-    { offsetof(xfs_sb_t, sb_logsunit),  0 },
-    { offsetof(xfs_sb_t, sb_features2),         0 },
-    { offsetof(xfs_sb_t, sb_bad_features2), 0 },
-    { offsetof(xfs_sb_t, sb_features_compat), 0 },
-    { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
-    { offsetof(xfs_sb_t, sb_features_incompat), 0 },
-    { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
-    { offsetof(xfs_sb_t, sb_crc),       0 },
-    { offsetof(xfs_sb_t, sb_pad),       0 },
-    { offsetof(xfs_sb_t, sb_pquotino),  0 },
-    { offsetof(xfs_sb_t, sb_lsn),       0 },
-    { sizeof(xfs_sb_t),                         0 }
-};
-
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
@@ -197,64 +135,6 @@ xfs_uuid_unmount(
 }
 
 
-/*
- * Reference counting access wrappers to the perag structures.
- * Because we never free per-ag structures, the only thing we
- * have to protect against changes is the tree structure itself.
- */
-struct xfs_perag *
-xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
-{
-       struct xfs_perag        *pag;
-       int                     ref = 0;
-
-       rcu_read_lock();
-       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
-       if (pag) {
-               ASSERT(atomic_read(&pag->pag_ref) >= 0);
-               ref = atomic_inc_return(&pag->pag_ref);
-       }
-       rcu_read_unlock();
-       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
-       return pag;
-}
-
-/*
- * search from @first to find the next perag with the given tag set.
- */
-struct xfs_perag *
-xfs_perag_get_tag(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          first,
-       int                     tag)
-{
-       struct xfs_perag        *pag;
-       int                     found;
-       int                     ref;
-
-       rcu_read_lock();
-       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
-                                       (void **)&pag, first, 1, tag);
-       if (found <= 0) {
-               rcu_read_unlock();
-               return NULL;
-       }
-       ref = atomic_inc_return(&pag->pag_ref);
-       rcu_read_unlock();
-       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
-       return pag;
-}
-
-void
-xfs_perag_put(struct xfs_perag *pag)
-{
-       int     ref;
-
-       ASSERT(atomic_read(&pag->pag_ref) > 0);
-       ref = atomic_dec_return(&pag->pag_ref);
-       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
-}
-
 STATIC void
 __xfs_free_perag(
        struct rcu_head *head)
@@ -307,184 +187,6 @@ xfs_sb_validate_fsb_count(
        return 0;
 }
 
-/*
- * Check the validity of the SB found.
- */
-STATIC int
-xfs_mount_validate_sb(
-       xfs_mount_t     *mp,
-       xfs_sb_t        *sbp,
-       bool            check_inprogress,
-       bool            check_version)
-{
-
-       /*
-        * If the log device and data device have the
-        * same device number, the log is internal.
-        * Consequently, the sb_logstart should be non-zero.  If
-        * we have a zero sb_logstart in this case, we may be trying to mount
-        * a volume filesystem in a non-volume manner.
-        */
-       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-               xfs_warn(mp, "bad magic number");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-
-       if (!xfs_sb_good_version(sbp)) {
-               xfs_warn(mp, "bad version");
-               return XFS_ERROR(EWRONGFS);
-       }
-
-       if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
-                       (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
-                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
-               xfs_notice(mp,
-"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Version 5 superblock feature mask validation. Reject combinations the
-        * kernel cannot support up front before checking anything else. For
-        * write validation, we don't need to check feature masks.
-        */
-       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
-               xfs_alert(mp,
-"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
-"Use of these features in this kernel is at your own risk!");
-
-               if (xfs_sb_has_compat_feature(sbp,
-                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
-                               (sbp->sb_features_compat &
-                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
-               }
-
-               if (xfs_sb_has_ro_compat_feature(sbp,
-                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
-                       xfs_alert(mp,
-"Superblock has unknown read-only compatible features (0x%x) enabled.",
-                               (sbp->sb_features_ro_compat &
-                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
-                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                               xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
-"Filesystem can only be safely mounted read only.");
-                               return XFS_ERROR(EINVAL);
-                       }
-               }
-               if (xfs_sb_has_incompat_feature(sbp,
-                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
-                       xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
-                               (sbp->sb_features_incompat &
-                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
-                       return XFS_ERROR(EINVAL);
-               }
-       }
-
-       if (unlikely(
-           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an external log; "
-               "specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       if (unlikely(
-           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-               xfs_warn(mp,
-               "filesystem is marked as having an internal log; "
-               "do not specify logdev on the mount command line.");
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * More sanity checking.  Most of these were stolen directly from
-        * xfs_repair.
-        */
-       if (unlikely(
-           sbp->sb_agcount <= 0                                        ||
-           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
-           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
-           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
-           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
-           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
-           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
-           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
-           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
-           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
-           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
-           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
-           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
-           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
-           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
-           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
-           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
-           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
-           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
-           sbp->sb_dblocks == 0                                        ||
-           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
-           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-               XFS_CORRUPTION_ERROR("SB sanity check failed",
-                               XFS_ERRLEVEL_LOW, mp, sbp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Until this is fixed only page-sized or smaller data blocks work.
-        */
-       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-               xfs_warn(mp,
-               "File system with blocksize %d bytes. "
-               "Only pagesize (%ld) or less will currently work.",
-                               sbp->sb_blocksize, PAGE_SIZE);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       /*
-        * Currently only very few inode sizes are supported.
-        */
-       switch (sbp->sb_inodesize) {
-       case 256:
-       case 512:
-       case 1024:
-       case 2048:
-               break;
-       default:
-               xfs_warn(mp, "inode size of %d bytes not supported",
-                               sbp->sb_inodesize);
-               return XFS_ERROR(ENOSYS);
-       }
-
-       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
-           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-               xfs_warn(mp,
-               "file system too large to be mounted on this system.");
-               return XFS_ERROR(EFBIG);
-       }
-
-       if (check_inprogress && sbp->sb_inprogress) {
-               xfs_warn(mp, "Offline file system operation in progress!");
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       /*
-        * Version 1 directory format has never worked on Linux.
-        */
-       if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
-               xfs_warn(mp, "file system using version 1 directory format");
-               return XFS_ERROR(ENOSYS);
-       }
-
-       return 0;
-}
-
 int
 xfs_initialize_perag(
        xfs_mount_t     *mp,
@@ -569,283 +271,15 @@ out_unwind:
        return error;
 }
 
-static void
-xfs_sb_quota_from_disk(struct xfs_sb *sbp)
-{
-       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
-       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
-               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
-                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
-       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
-}
-
-void
-xfs_sb_from_disk(
-       struct xfs_sb   *to,
-       xfs_dsb_t       *from)
-{
-       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
-       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
-       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
-       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
-       to->sb_rextents = be64_to_cpu(from->sb_rextents);
-       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
-       to->sb_logstart = be64_to_cpu(from->sb_logstart);
-       to->sb_rootino = be64_to_cpu(from->sb_rootino);
-       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
-       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
-       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
-       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
-       to->sb_agcount = be32_to_cpu(from->sb_agcount);
-       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
-       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
-       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
-       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
-       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
-       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
-       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
-       to->sb_blocklog = from->sb_blocklog;
-       to->sb_sectlog = from->sb_sectlog;
-       to->sb_inodelog = from->sb_inodelog;
-       to->sb_inopblog = from->sb_inopblog;
-       to->sb_agblklog = from->sb_agblklog;
-       to->sb_rextslog = from->sb_rextslog;
-       to->sb_inprogress = from->sb_inprogress;
-       to->sb_imax_pct = from->sb_imax_pct;
-       to->sb_icount = be64_to_cpu(from->sb_icount);
-       to->sb_ifree = be64_to_cpu(from->sb_ifree);
-       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
-       to->sb_frextents = be64_to_cpu(from->sb_frextents);
-       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
-       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
-       to->sb_qflags = be16_to_cpu(from->sb_qflags);
-       to->sb_flags = from->sb_flags;
-       to->sb_shared_vn = from->sb_shared_vn;
-       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
-       to->sb_unit = be32_to_cpu(from->sb_unit);
-       to->sb_width = be32_to_cpu(from->sb_width);
-       to->sb_dirblklog = from->sb_dirblklog;
-       to->sb_logsectlog = from->sb_logsectlog;
-       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
-       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
-       to->sb_features2 = be32_to_cpu(from->sb_features2);
-       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
-       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
-       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
-       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
-       to->sb_features_log_incompat =
-                               be32_to_cpu(from->sb_features_log_incompat);
-       to->sb_pad = 0;
-       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
-       to->sb_lsn = be64_to_cpu(from->sb_lsn);
-}
-
-static inline void
-xfs_sb_quota_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       *fields)
-{
-       __uint16_t      qflags = from->sb_qflags;
-
-       if (*fields & XFS_SB_QFLAGS) {
-               /*
-                * The in-core version of sb_qflags do not have
-                * XFS_OQUOTA_* flags, whereas the on-disk version
-                * does.  So, convert incore XFS_{PG}QUOTA_* flags
-                * to on-disk XFS_OQUOTA_* flags.
-                */
-               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-                       qflags |= XFS_OQUOTA_ENFD;
-               if (from->sb_qflags &
-                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-                       qflags |= XFS_OQUOTA_CHKD;
-               to->sb_qflags = cpu_to_be16(qflags);
-               *fields &= ~XFS_SB_QFLAGS;
-       }
-}
-
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
-void
-xfs_sb_to_disk(
-       xfs_dsb_t       *to,
-       xfs_sb_t        *from,
-       __int64_t       fields)
-{
-       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
-       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
-       xfs_sb_field_t  f;
-       int             first;
-       int             size;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-
-       xfs_sb_quota_to_disk(to, from, &fields);
-       while (fields) {
-               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-               first = xfs_sb_info[f].offset;
-               size = xfs_sb_info[f + 1].offset - first;
-
-               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
-               if (size == 1 || xfs_sb_info[f].type == 1) {
-                       memcpy(to_ptr + first, from_ptr + first, size);
-               } else {
-                       switch (size) {
-                       case 2:
-                               *(__be16 *)(to_ptr + first) =
-                                       cpu_to_be16(*(__u16 *)(from_ptr + first));
-                               break;
-                       case 4:
-                               *(__be32 *)(to_ptr + first) =
-                                       cpu_to_be32(*(__u32 *)(from_ptr + first));
-                               break;
-                       case 8:
-                               *(__be64 *)(to_ptr + first) =
-                                       cpu_to_be64(*(__u64 *)(from_ptr + first));
-                               break;
-                       default:
-                               ASSERT(0);
-                       }
-               }
-
-               fields &= ~(1LL << f);
-       }
-}
-
-static int
-xfs_sb_verify(
-       struct xfs_buf  *bp,
-       bool            check_version)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_sb   sb;
-
-       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
-
-       /*
-        * Only check the in progress field for the primary superblock as
-        * mkfs.xfs doesn't clear it from secondary superblocks.
-        */
-       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
-                                    check_version);
-}
-
-/*
- * If the superblock has the CRC feature bit set or the CRC field is non-null,
- * check that the CRC is valid.  We check the CRC field is non-null because a
- * single bit error could clear the feature bit and unused parts of the
- * superblock are supposed to be zero. Hence a non-null crc field indicates that
- * we've potentially lost a feature bit and we should check it anyway.
- */
-static void
-xfs_sb_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-       int             error;
-
-       /*
-        * open code the version check to avoid needing to convert the entire
-        * superblock from disk order just to check the version number
-        */
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
-           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
-                                               XFS_SB_VERSION_5) ||
-            dsb->sb_crc != 0)) {
-
-               if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
-                                     offsetof(struct xfs_sb, sb_crc))) {
-                       error = EFSCORRUPTED;
-                       goto out_error;
-               }
-       }
-       error = xfs_sb_verify(bp, true);
-
-out_error:
-       if (error) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, error);
-       }
-}
-
-/*
- * We may be probed for a filesystem match, so we may not want to emit
- * messages when the superblock buffer is not actually an XFS superblock.
- * If we find an XFS superblock, the run a normal, noisy mount because we are
- * really going to mount it and want to know about errors.
- */
-static void
-xfs_sb_quiet_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
-
-
-       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
-               /* XFS filesystem, verify noisily! */
-               xfs_sb_read_verify(bp);
-               return;
-       }
-       /* quietly fail */
-       xfs_buf_ioerror(bp, EWRONGFS);
-}
-
-static void
-xfs_sb_write_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-       int                     error;
-
-       error = xfs_sb_verify(bp, false);
-       if (error) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, error);
-               return;
-       }
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (bip)
-               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-
-       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-                        offsetof(struct xfs_sb, sb_crc));
-}
-
-const struct xfs_buf_ops xfs_sb_buf_ops = {
-       .verify_read = xfs_sb_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
-static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
-       .verify_read = xfs_sb_quiet_read_verify,
-       .verify_write = xfs_sb_write_verify,
-};
-
 /*
  * xfs_readsb
  *
  * Does the initial read of the superblock.
  */
 int
-xfs_readsb(xfs_mount_t *mp, int flags)
+xfs_readsb(
+       struct xfs_mount *mp,
+       int             flags)
 {
        unsigned int    sector_size;
        struct xfs_buf  *bp;
@@ -884,8 +318,8 @@ reread:
         * Initialize the mount structure from the superblock.
         */
        xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
-
        xfs_sb_quota_from_disk(&mp->m_sb);
+
        /*
         * We must be able to do sector-sized and sector-aligned IO.
         */
@@ -922,107 +356,6 @@ release_buf:
        return error;
 }
 
-
-/*
- * xfs_mount_common
- *
- * Mount initialization code establishing various mount
- * fields from the superblock associated with the given
- * mount structure
- */
-STATIC void
-xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
-{
-       mp->m_agfrotor = mp->m_agirotor = 0;
-       spin_lock_init(&mp->m_agirotor_lock);
-       mp->m_maxagi = mp->m_sb.sb_agcount;
-       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
-       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
-       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
-       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
-       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-       mp->m_blockmask = sbp->sb_blocksize - 1;
-       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
-       mp->m_blockwmask = mp->m_blockwsize - 1;
-
-       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
-       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
-
-       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
-       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
-       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
-       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
-       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
-       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
-
-       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
-       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
-                                       sbp->sb_inopblock);
-       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-}
-
-/*
- * xfs_initialize_perag_data
- *
- * Read in each per-ag structure so we can count up the number of
- * allocated inodes, free inodes and used filesystem blocks as this
- * information is no longer persistent in the superblock. Once we have
- * this information, write it into the in-core superblock structure.
- */
-STATIC int
-xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
-{
-       xfs_agnumber_t  index;
-       xfs_perag_t     *pag;
-       xfs_sb_t        *sbp = &mp->m_sb;
-       uint64_t        ifree = 0;
-       uint64_t        ialloc = 0;
-       uint64_t        bfree = 0;
-       uint64_t        bfreelst = 0;
-       uint64_t        btree = 0;
-       int             error;
-
-       for (index = 0; index < agcount; index++) {
-               /*
-                * read the agf, then the agi. This gets us
-                * all the information we need and populates the
-                * per-ag structures for us.
-                */
-               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
-               if (error)
-                       return error;
-
-               error = xfs_ialloc_pagi_init(mp, NULL, index);
-               if (error)
-                       return error;
-               pag = xfs_perag_get(mp, index);
-               ifree += pag->pagi_freecount;
-               ialloc += pag->pagi_count;
-               bfree += pag->pagf_freeblks;
-               bfreelst += pag->pagf_flcount;
-               btree += pag->pagf_btreeblks;
-               xfs_perag_put(pag);
-       }
-       /*
-        * Overwrite incore superblock counters with just-read data
-        */
-       spin_lock(&mp->m_sb_lock);
-       sbp->sb_ifree = ifree;
-       sbp->sb_icount = ialloc;
-       sbp->sb_fdblocks = bfree + bfreelst + btree;
-       spin_unlock(&mp->m_sb_lock);
-
-       /* Fixup the per-cpu counters as well. */
-       xfs_icsb_reinit_counters(mp);
-
-       return 0;
-}
-
 /*
  * Update alignment values based on mount options and sb values
  */
@@ -1194,7 +527,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
 }
 
 /*
- * Check that the data (and log if separate) are an ok size.
+ * Check that the data (and log if separate) is an ok size.
  */
 STATIC int
 xfs_check_sizes(xfs_mount_t *mp)
@@ -1264,8 +597,7 @@ xfs_mount_reset_sbqflags(
                return 0;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1315,7 +647,7 @@ xfs_mountfs(
        uint            quotaflags = 0;
        int             error = 0;
 
-       xfs_mount_common(mp, sbp);
+       xfs_sb_mount_common(mp, sbp);
 
        /*
         * Check for a mismatched features2 values.  Older kernels
@@ -1400,7 +732,7 @@ xfs_mountfs(
        xfs_set_inoalignment(mp);
 
        /*
-        * Check that the data (and log if separate) are an ok size.
+        * Check that the data (and log if separate) is an ok size.
         */
        error = xfs_check_sizes(mp);
        if (error)
@@ -1738,8 +1070,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
                return 0;
 
        tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
@@ -1752,49 +1083,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
 }
 
 /*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
- */
-void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
-{
-       xfs_buf_t       *bp;
-       int             first;
-       int             last;
-       xfs_mount_t     *mp;
-       xfs_sb_field_t  f;
-
-       ASSERT(fields);
-       if (!fields)
-               return;
-       mp = tp->t_mountp;
-       bp = xfs_trans_getsb(tp, mp, 0);
-       first = sizeof(xfs_sb_t);
-       last = 0;
-
-       /* translate/copy */
-
-       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
-
-       /* find modified range */
-       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       last = xfs_sb_info[f + 1].offset - 1;
-
-       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-       first = xfs_sb_info[f].offset;
-
-       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-       xfs_trans_log_buf(tp, bp, first, last);
-}
-
-
-/*
- * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
+ * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
  * a delta to a specified field in the in-core superblock.  Simply
  * switch on the field indicated and apply the delta to that field.
  * Fields are not allowed to dip below zero, so if the delta would
@@ -2101,8 +1390,7 @@ xfs_mount_log_sb(
                         XFS_SB_VERSIONNUM));
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-       error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
-                                 XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
@@ -2260,12 +1548,6 @@ xfs_icsb_init_counters(
        if (mp->m_sb_cnts == NULL)
                return -ENOMEM;
 
-#ifdef CONFIG_HOTPLUG_CPU
-       mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
-       mp->m_icsb_notifier.priority = 0;
-       register_hotcpu_notifier(&mp->m_icsb_notifier);
-#endif /* CONFIG_HOTPLUG_CPU */
-
        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
                memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
@@ -2278,6 +1560,13 @@ xfs_icsb_init_counters(
         * initial balance kicks us off correctly
         */
        mp->m_icsb_counters = -1;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
+       mp->m_icsb_notifier.priority = 0;
+       register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
+
        return 0;
 }
 
index 4e374d4a9189622bccb2b56aa331c7218567d1f2..1fa0584b5627830c77e4bf0fa02db9c55c066a2d 100644 (file)
 #ifndef __XFS_MOUNT_H__
 #define        __XFS_MOUNT_H__
 
-typedef struct xfs_trans_reservations {
-       uint    tr_write;       /* extent alloc trans */
-       uint    tr_itruncate;   /* truncate trans */
-       uint    tr_rename;      /* rename trans */
-       uint    tr_link;        /* link trans */
-       uint    tr_remove;      /* unlink trans */
-       uint    tr_symlink;     /* symlink trans */
-       uint    tr_create;      /* create trans */
-       uint    tr_mkdir;       /* mkdir trans */
-       uint    tr_ifree;       /* inode free trans */
-       uint    tr_ichange;     /* inode update trans */
-       uint    tr_growdata;    /* fs data section grow trans */
-       uint    tr_swrite;      /* sync write inode trans */
-       uint    tr_addafork;    /* cvt inode to attributed trans */
-       uint    tr_writeid;     /* write setuid/setgid file */
-       uint    tr_attrinval;   /* attr fork buffer invalidation */
-       uint    tr_attrsetm;    /* set/create an attribute at mount time */
-       uint    tr_attrsetrt;   /* set/create an attribute at runtime */
-       uint    tr_attrrm;      /* remove an attribute */
-       uint    tr_clearagi;    /* clear bad agi unlinked ino bucket */
-       uint    tr_growrtalloc; /* grow realtime allocations */
-       uint    tr_growrtzero;  /* grow realtime zeroing */
-       uint    tr_growrtfree;  /* grow realtime freeing */
-       uint    tr_qm_sbchange; /* change quota flags */
-       uint    tr_qm_setqlim;  /* adjust quota limits */
-       uint    tr_qm_dqalloc;  /* allocate quota on disk */
-       uint    tr_qm_quotaoff; /* turn quota off */
-       uint    tr_qm_equotaoff;/* end of turn quota off */
-       uint    tr_sb;          /* modify superblock */
-} xfs_trans_reservations_t;
-
-#ifndef __KERNEL__
-
-#define xfs_daddr_to_agno(mp,d) \
-       ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define xfs_daddr_to_agbno(mp,d) \
-       ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-
-#else /* __KERNEL__ */
+#ifdef __KERNEL__
 
 struct xlog;
 struct xfs_inode;
@@ -174,7 +136,7 @@ typedef struct xfs_mount {
        int                     m_ialloc_blks;  /* blocks in inode allocation */
        int                     m_inoalign_mask;/* mask sb_inoalignmt if used */
        uint                    m_qflags;       /* quota status flags */
-       xfs_trans_reservations_t m_reservations;/* precomputed res values */
+       struct xfs_trans_resv   m_resv;         /* precomputed res values */
        __uint64_t              m_maxicount;    /* maximum inode count */
        __uint64_t              m_resblks;      /* total reserved blocks */
        __uint64_t              m_resblks_avail;/* available reserved blocks */
@@ -329,14 +291,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
        return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
 }
 
-/*
- * perag get/put wrappers for ref counting
- */
-struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
-                                       int tag);
-void   xfs_perag_put(struct xfs_perag *pag);
-
 /*
  * Per-cpu superblock locking functions
  */
@@ -366,9 +320,63 @@ typedef struct xfs_mod_sb {
        int64_t         msb_delta;      /* Change to make to specified field */
 } xfs_mod_sb_t;
 
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection. This is defined for the kernel
+ * only, and hence is defined here instead of in xfs_ag.h. You need the struct
+ * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
+ * so this doesn't introduce any strange header file dependencies.
+ */
+typedef struct xfs_perag {
+       struct xfs_mount *pag_mount;    /* owner filesystem */
+       xfs_agnumber_t  pag_agno;       /* AG this structure belongs to */
+       atomic_t        pag_ref;        /* perag reference count */
+       char            pagf_init;      /* this agf's entry is initialized */
+       char            pagi_init;      /* this agi's entry is initialized */
+       char            pagf_metadata;  /* the agf is preferred to be metadata */
+       char            pagi_inodeok;   /* The agi is ok for inodes */
+       __uint8_t       pagf_levels[XFS_BTNUM_AGF];
+                                       /* # of levels in bno & cnt btree */
+       __uint32_t      pagf_flcount;   /* count of blocks in freelist */
+       xfs_extlen_t    pagf_freeblks;  /* total free blocks */
+       xfs_extlen_t    pagf_longest;   /* longest free space */
+       __uint32_t      pagf_btreeblks; /* # of blocks held in AGF btrees */
+       xfs_agino_t     pagi_freecount; /* number of free inodes */
+       xfs_agino_t     pagi_count;     /* number of allocated inodes */
+
+       /*
+        * Inode allocation search lookup optimisation.
+        * If the pagino matches, the search for new inodes
+        * doesn't need to search the near ones again straight away
+        */
+       xfs_agino_t     pagl_pagino;
+       xfs_agino_t     pagl_leftrec;
+       xfs_agino_t     pagl_rightrec;
+       spinlock_t      pagb_lock;      /* lock for pagb_tree */
+       struct rb_root  pagb_tree;      /* ordered tree of busy extents */
+
+       atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
+
+       spinlock_t      pag_ici_lock;   /* incore inode cache lock */
+       struct radix_tree_root pag_ici_root;    /* incore inode cache root */
+       int             pag_ici_reclaimable;    /* reclaimable inodes */
+       struct mutex    pag_ici_reclaim_lock;   /* serialisation point */
+       unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
+
+       /* buffer cache index */
+       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
+       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
+
+       /* for rcu-safe freeing */
+       struct rcu_head rcu_head;
+       int             pagb_count;     /* pagb slots in use */
+} xfs_perag_t;
+
 extern int     xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int     xfs_mountfs(xfs_mount_t *mp);
+extern int     xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
+                                    xfs_agnumber_t *maxagi);
 
 extern void    xfs_unmountfs(xfs_mount_t *);
 extern int     xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -387,13 +395,4 @@ extern void        xfs_set_low_space_thresholds(struct xfs_mount *);
 
 #endif /* __KERNEL__ */
 
-extern void    xfs_sb_calc_crc(struct xfs_buf  *);
-extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern int     xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
-                                       xfs_agnumber_t *);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
-
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-
 #endif /* __XFS_MOUNT_H__ */
index d320794d03ce233d93f7ccfcd0a6c2c3f186e9c2..6218a0aeeeea88449c4a1e29e54c0297ae8de6fb 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
@@ -37,7 +38,6 @@
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
-#include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -834,21 +834,52 @@ xfs_qm_qino_alloc(
        int             error;
        int             committed;
 
+       *ip = NULL;
+       /*
+        * With a superblock that doesn't have a separate pquotino, we
+        * share an inode between gquota and pquota. If the on-disk
+        * superblock has GQUOTA and the filesystem is now mounted
+        * with PQUOTA, just use sb_gquotino for sb_pquotino and
+        * vice-versa.
+        */
+       if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
+                       (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
+               xfs_ino_t ino = NULLFSINO;
+
+               if ((flags & XFS_QMOPT_PQUOTA) &&
+                            (mp->m_sb.sb_gquotino != NULLFSINO)) {
+                       ino = mp->m_sb.sb_gquotino;
+                       ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+               } else if ((flags & XFS_QMOPT_GQUOTA) &&
+                            (mp->m_sb.sb_pquotino != NULLFSINO)) {
+                       ino = mp->m_sb.sb_pquotino;
+                       ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+               }
+               if (ino != NULLFSINO) {
+                       error = xfs_iget(mp, NULL, ino, 0, 0, ip);
+                       if (error)
+                               return error;
+                       mp->m_sb.sb_gquotino = NULLFSINO;
+                       mp->m_sb.sb_pquotino = NULLFSINO;
+               }
+       }
+
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-       if ((error = xfs_trans_reserve(tp,
-                                     XFS_QM_QINOCREATE_SPACE_RES(mp),
-                                     XFS_CREATE_LOG_RES(mp), 0,
-                                     XFS_TRANS_PERM_LOG_RES,
-                                     XFS_CREATE_LOG_COUNT))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
+                                 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
+       if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
        }
 
-       error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
-       if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT);
-               return error;
+       if (!*ip) {
+               error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
+                                                               &committed);
+               if (error) {
+                       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+                                        XFS_TRANS_ABORT);
+                       return error;
+               }
        }
 
        /*
@@ -860,21 +891,25 @@ xfs_qm_qino_alloc(
        if (flags & XFS_QMOPT_SBVERSION) {
                ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
                ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                                  XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
-                      (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                       XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+                       XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
+                               (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+                                XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+                                XFS_SB_QFLAGS));
 
                xfs_sb_version_addquota(&mp->m_sb);
                mp->m_sb.sb_uquotino = NULLFSINO;
                mp->m_sb.sb_gquotino = NULLFSINO;
+               mp->m_sb.sb_pquotino = NULLFSINO;
 
-               /* qflags will get updated _after_ quotacheck */
-               mp->m_sb.sb_qflags = 0;
+               /* qflags will get updated fully _after_ quotacheck */
+               mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
        }
        if (flags & XFS_QMOPT_UQUOTA)
                mp->m_sb.sb_uquotino = (*ip)->i_ino;
-       else
+       else if (flags & XFS_QMOPT_GQUOTA)
                mp->m_sb.sb_gquotino = (*ip)->i_ino;
+       else
+               mp->m_sb.sb_pquotino = (*ip)->i_ino;
        spin_unlock(&mp->m_sb_lock);
        xfs_mod_sb(tp, sbfields);
 
@@ -1484,11 +1519,10 @@ xfs_qm_init_quotainos(
                        if (error)
                                goto error_rele;
                }
-               /* XXX: Use gquotino for now */
                if (XFS_IS_PQUOTA_ON(mp) &&
-                   mp->m_sb.sb_gquotino != NULLFSINO) {
-                       ASSERT(mp->m_sb.sb_gquotino > 0);
-                       error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                   mp->m_sb.sb_pquotino != NULLFSINO) {
+                       ASSERT(mp->m_sb.sb_pquotino > 0);
+                       error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
                                             0, 0, &pip);
                        if (error)
                                goto error_rele;
@@ -1496,7 +1530,8 @@ xfs_qm_init_quotainos(
        } else {
                flags |= XFS_QMOPT_SBVERSION;
                sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                           XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+                           XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
+                           XFS_SB_QFLAGS);
        }
 
        /*
@@ -1524,9 +1559,8 @@ xfs_qm_init_quotainos(
                flags &= ~XFS_QMOPT_SBVERSION;
        }
        if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
-               /* XXX: Use XFS_SB_GQUOTINO for now */
                error = xfs_qm_qino_alloc(mp, &pip,
-                                         sbflags | XFS_SB_GQUOTINO,
+                                         sbflags | XFS_SB_PQUOTINO,
                                          flags | XFS_QMOPT_PQUOTA);
                if (error)
                        goto error_rele;
@@ -1704,8 +1738,7 @@ xfs_qm_write_sb_changes(
        int             error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return error;
@@ -1734,8 +1767,8 @@ xfs_qm_write_sb_changes(
 int
 xfs_qm_vop_dqalloc(
        struct xfs_inode        *ip,
-       uid_t                   uid,
-       gid_t                   gid,
+       xfs_dqid_t              uid,
+       xfs_dqid_t              gid,
        prid_t                  prid,
        uint                    flags,
        struct xfs_dquot        **O_udqpp,
@@ -1782,7 +1815,7 @@ xfs_qm_vop_dqalloc(
                         * holding ilock.
                         */
                        xfs_iunlock(ip, lockflags);
-                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+                       error = xfs_qm_dqget(mp, NULL, uid,
                                                 XFS_DQ_USER,
                                                 XFS_QMOPT_DQALLOC |
                                                 XFS_QMOPT_DOWARN,
@@ -1809,7 +1842,7 @@ xfs_qm_vop_dqalloc(
        if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
                if (ip->i_d.di_gid != gid) {
                        xfs_iunlock(ip, lockflags);
-                       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+                       error = xfs_qm_dqget(mp, NULL, gid,
                                                 XFS_DQ_GROUP,
                                                 XFS_QMOPT_DQALLOC |
                                                 XFS_QMOPT_DOWARN,
@@ -1943,7 +1976,7 @@ xfs_qm_vop_chown_reserve(
                        XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
        if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-           ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+           ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
                udq_delblks = udqp;
                /*
                 * If there are delayed allocation blocks, then we have to
index 579d6a02a5b6ec5fd2e21c503f4ada8b6dfa54d6..670cd44640704eb4899f960045cb14f8f582acce 100644 (file)
@@ -160,6 +160,8 @@ extern int          xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
                                        struct fs_disk_quota *);
 extern int             xfs_qm_scall_getqstat(struct xfs_mount *,
                                        struct fs_quota_stat *);
+extern int             xfs_qm_scall_getqstatv(struct xfs_mount *,
+                                       struct fs_quota_statv *);
 extern int             xfs_qm_scall_quotaon(struct xfs_mount *, uint);
 extern int             xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
 
index 437a52d91f6d91ce9f48acbfc5a9928a9ee366ed..3af50ccdfac1a10da858ef26e80b040ad4a474f1 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
index e4f8b2d6f38ba1960beefe35cbd15b0066f530cf..8174aad0b38813ec836ea044d9a1b7b4dc06f300 100644 (file)
@@ -20,6 +20,7 @@
 
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
@@ -37,7 +38,6 @@
 #include "xfs_error.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
-#include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -247,9 +247,7 @@ xfs_qm_scall_trunc_qfile(
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES,
-                                 XFS_ITRUNCATE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -296,8 +294,10 @@ xfs_qm_scall_trunc_qfiles(
 
        if (flags & XFS_DQ_USER)
                error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-       if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+       if (flags & XFS_DQ_GROUP)
                error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+       if (flags & XFS_DQ_PROJ)
+               error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
        return error ? error : error2;
 }
@@ -404,6 +404,7 @@ xfs_qm_scall_quotaon(
 
 /*
  * Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTAT command.
  */
 int
 xfs_qm_scall_getqstat(
@@ -413,8 +414,10 @@ xfs_qm_scall_getqstat(
        struct xfs_quotainfo    *q = mp->m_quotainfo;
        struct xfs_inode        *uip = NULL;
        struct xfs_inode        *gip = NULL;
+       struct xfs_inode        *pip = NULL;
        bool                    tempuqip = false;
        bool                    tempgqip = false;
+       bool                    temppqip = false;
 
        memset(out, 0, sizeof(fs_quota_stat_t));
 
@@ -424,16 +427,106 @@ xfs_qm_scall_getqstat(
                out->qs_gquota.qfs_ino = NULLFSINO;
                return (0);
        }
+
+       out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+                                                       (XFS_ALL_QUOTA_ACCT|
+                                                        XFS_ALL_QUOTA_ENFD));
+       if (q) {
+               uip = q->qi_uquotaip;
+               gip = q->qi_gquotaip;
+               pip = q->qi_pquotaip;
+       }
+       if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+                                       0, 0, &uip) == 0)
+                       tempuqip = true;
+       }
+       if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                                       0, 0, &gip) == 0)
+                       tempgqip = true;
+       }
+       /*
+        * Q_XGETQSTAT doesn't have room for both group and project quotas.
+        * So, allow the project quota values to be copied out only if
+        * there is no group quota information available.
+        */
+       if (!gip) {
+               if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+                       if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+                                               0, 0, &pip) == 0)
+                               temppqip = true;
+               }
+       } else
+               pip = NULL;
+       if (uip) {
+               out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+               out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+               out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+               if (tempuqip)
+                       IRELE(uip);
+       }
+
+       if (gip) {
+               out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+               out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+               out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+               if (tempgqip)
+                       IRELE(gip);
+       }
+       if (pip) {      /* no group inode: project stats use the group slot */
+               out->qs_gquota.qfs_ino = mp->m_sb.sb_pquotino;
+               out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
+               out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
+               if (temppqip)
+                       IRELE(pip);
+       }
+       if (q) {
+               out->qs_incoredqs = q->qi_dquots;
+               out->qs_btimelimit = q->qi_btimelimit;
+               out->qs_itimelimit = q->qi_itimelimit;
+               out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+               out->qs_bwarnlimit = q->qi_bwarnlimit;
+               out->qs_iwarnlimit = q->qi_iwarnlimit;
+       }
+       return 0;
+}
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ * for Q_XGETQSTATV command, to support separate project quota field.
+ */
+int
+xfs_qm_scall_getqstatv(
+       struct xfs_mount        *mp,
+       struct fs_quota_statv   *out)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       struct xfs_inode        *uip = NULL;
+       struct xfs_inode        *gip = NULL;
+       struct xfs_inode        *pip = NULL;
+       bool                    tempuqip = false;
+       bool                    tempgqip = false;
+       bool                    temppqip = false;
+
+       if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+               out->qs_uquota.qfs_ino = NULLFSINO;
+               out->qs_gquota.qfs_ino = NULLFSINO;
+               out->qs_pquota.qfs_ino = NULLFSINO;
+               return (0);
+       }
+
        out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
                                                        (XFS_ALL_QUOTA_ACCT|
                                                         XFS_ALL_QUOTA_ENFD));
-       out->qs_pad = 0;
        out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
        out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+       out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
 
        if (q) {
                uip = q->qi_uquotaip;
                gip = q->qi_gquotaip;
+               pip = q->qi_pquotaip;
        }
        if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
                if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -445,18 +538,30 @@ xfs_qm_scall_getqstat(
                                        0, 0, &gip) == 0)
                        tempgqip = true;
        }
+       if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+                                       0, 0, &pip) == 0)
+                       temppqip = true;
+       }
        if (uip) {
                out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
                out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
                if (tempuqip)
                        IRELE(uip);
        }
+
        if (gip) {
                out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
                out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
                if (tempgqip)
                        IRELE(gip);
        }
+       if (pip) {
+               out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
+               out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
+               if (temppqip)
+                       IRELE(pip);
+       }
        if (q) {
                out->qs_incoredqs = q->qi_dquots;
                out->qs_btimelimit = q->qi_btimelimit;
@@ -515,8 +620,7 @@ xfs_qm_scall_setqlim(
        xfs_dqunlock(dqp);
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                goto out_rele;
@@ -650,8 +754,7 @@ xfs_qm_log_quotaoff_end(
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
 
-       error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
        if (error) {
                xfs_trans_cancel(tp, 0);
                return (error);
@@ -684,8 +787,7 @@ xfs_qm_log_quotaoff(
        uint                    oldsbqflag=0;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-       error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp),
-                                 0, 0, XFS_DEFAULT_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
        if (error)
                goto error0;
 
index b14f42c714b609b95eab2b993805a7e4e4e73c24..e7d84d2d86830a25a5cd9778521766d4a592c45b 100644 (file)
 #ifndef __XFS_QUOTA_H__
 #define __XFS_QUOTA_H__
 
-struct xfs_trans;
-
-/*
- * The ondisk form of a dquot structure.
- */
-#define XFS_DQUOT_MAGIC                0x4451          /* 'DQ' */
-#define XFS_DQUOT_VERSION      (u_int8_t)0x01  /* latest version number */
-
-/*
- * uid_t and gid_t are hard-coded to 32 bits in the inode.
- * Hence, an 'id' in a dquot is 32 bits..
- */
-typedef __uint32_t     xfs_dqid_t;
-
-/*
- * Even though users may not have quota limits occupying all 64-bits,
- * they may need 64-bit accounting. Hence, 64-bit quota-counters,
- * and quota-limits. This is a waste in the common case, but hey ...
- */
-typedef __uint64_t     xfs_qcnt_t;
-typedef __uint16_t     xfs_qwarncnt_t;
-
-/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
- */
-typedef struct xfs_disk_dquot {
-       __be16          d_magic;        /* dquot magic = XFS_DQUOT_MAGIC */
-       __u8            d_version;      /* dquot version */
-       __u8            d_flags;        /* XFS_DQ_USER/PROJ/GROUP */
-       __be32          d_id;           /* user,project,group id */
-       __be64          d_blk_hardlimit;/* absolute limit on disk blks */
-       __be64          d_blk_softlimit;/* preferred limit on disk blks */
-       __be64          d_ino_hardlimit;/* maximum # allocated inodes */
-       __be64          d_ino_softlimit;/* preferred inode limit */
-       __be64          d_bcount;       /* disk blocks owned by the user */
-       __be64          d_icount;       /* inodes owned by the user */
-       __be32          d_itimer;       /* zero if within inode limits if not,
-                                          this is when we refuse service */
-       __be32          d_btimer;       /* similar to above; for disk blocks */
-       __be16          d_iwarns;       /* warnings issued wrt num inodes */
-       __be16          d_bwarns;       /* warnings issued wrt disk blocks */
-       __be32          d_pad0;         /* 64 bit align */
-       __be64          d_rtb_hardlimit;/* absolute limit on realtime blks */
-       __be64          d_rtb_softlimit;/* preferred limit on RT disk blks */
-       __be64          d_rtbcount;     /* realtime blocks owned */
-       __be32          d_rtbtimer;     /* similar to above; for RT disk blocks */
-       __be16          d_rtbwarns;     /* warnings issued wrt RT disk blocks */
-       __be16          d_pad;
-} xfs_disk_dquot_t;
-
-/*
- * This is what goes on disk. This is separated from the xfs_disk_dquot because
- * carrying the unnecessary padding would be a waste of memory.
- */
-typedef struct xfs_dqblk {
-       xfs_disk_dquot_t  dd_diskdq;    /* portion that lives incore as well */
-       char              dd_fill[4];   /* filling for posterity */
-
-       /*
-        * These two are only present on filesystems with the CRC bits set.
-        */
-       __be32            dd_crc;       /* checksum */
-       __be64            dd_lsn;       /* last modification in log */
-       uuid_t            dd_uuid;      /* location information */
-} xfs_dqblk_t;
-
-#define XFS_DQUOT_CRC_OFF      offsetof(struct xfs_dqblk, dd_crc)
-
-/*
- * flags for q_flags field in the dquot.
- */
-#define XFS_DQ_USER            0x0001          /* a user quota */
-#define XFS_DQ_PROJ            0x0002          /* project quota */
-#define XFS_DQ_GROUP           0x0004          /* a group quota */
-#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
-#define XFS_DQ_FREEING         0x0010          /* dquot is beeing torn down */
-
-#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
-
-#define XFS_DQ_FLAGS \
-       { XFS_DQ_USER,          "USER" }, \
-       { XFS_DQ_PROJ,          "PROJ" }, \
-       { XFS_DQ_GROUP,         "GROUP" }, \
-       { XFS_DQ_DIRTY,         "DIRTY" }, \
-       { XFS_DQ_FREEING,       "FREEING" }
-
-/*
- * We have the possibility of all three quota types being active at once, and
- * hence free space modification requires modification of all three current
- * dquots in a single transaction. For this case we need to have a reservation
- * of at least 3 dquots.
- *
- * However, a chmod operation can change both UID and GID in a single
- * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
- * modified. Hence for this case we need to reserve space for at least 4 dquots.
- *
- * And in the worst case, there's a rename operation that can be modifying up to
- * 4 inodes with dquots attached to them. In reality, the only inodes that can
- * have their dquots modified are the source and destination directory inodes
- * due to directory name creation and removal. That can require space allocation
- * and/or freeing on both directory inodes, and hence all three dquots on each
- * inode can be modified. And if the directories are world writeable, all the
- * dquots can be unique and so 6 dquots can be modified....
- *
- * And, of course, we also need to take into account the dquot log format item
- * used to describe each dquot.
- */
-#define XFS_DQUOT_LOGRES(mp)   \
-       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
-
-/*
- * These are the structures used to lay out dquots and quotaoff
- * records on the log. Quite similar to those of inodes.
- */
-
-/*
- * log format struct for dquots.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- */
-typedef struct xfs_dq_logformat {
-       __uint16_t              qlf_type;      /* dquot log item type */
-       __uint16_t              qlf_size;      /* size of this item */
-       xfs_dqid_t              qlf_id;        /* usr/grp/proj id : 32 bits */
-       __int64_t               qlf_blkno;     /* blkno of dquot buffer */
-       __int32_t               qlf_len;       /* len of dquot buffer */
-       __uint32_t              qlf_boffset;   /* off of dquot in buffer */
-} xfs_dq_logformat_t;
-
-/*
- * log format struct for QUOTAOFF records.
- * The first two fields must be the type and size fitting into
- * 32 bits : log_recovery code assumes that.
- * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
- * to the first and ensures that the first logitem is taken out of the AIL
- * only when the last one is securely committed.
- */
-typedef struct xfs_qoff_logformat {
-       unsigned short          qf_type;        /* quotaoff log item type */
-       unsigned short          qf_size;        /* size of this item */
-       unsigned int            qf_flags;       /* USR and/or GRP */
-       char                    qf_pad[12];     /* padding for future */
-} xfs_qoff_logformat_t;
-
-
-/*
- * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
- */
-#define XFS_UQUOTA_ACCT        0x0001  /* user quota accounting ON */
-#define XFS_UQUOTA_ENFD        0x0002  /* user quota limits enforced */
-#define XFS_UQUOTA_CHKD        0x0004  /* quotacheck run on usr quotas */
-#define XFS_PQUOTA_ACCT        0x0008  /* project quota accounting ON */
-#define XFS_OQUOTA_ENFD        0x0010  /* other (grp/prj) quota limits enforced */
-#define XFS_OQUOTA_CHKD        0x0020  /* quotacheck run on other (grp/prj) quotas */
-#define XFS_GQUOTA_ACCT        0x0040  /* group quota accounting ON */
-
-/*
- * Conversion to and from the combined OQUOTA flag (if necessary)
- * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
- */
-#define XFS_GQUOTA_ENFD        0x0080  /* group quota limits enforced */
-#define XFS_GQUOTA_CHKD        0x0100  /* quotacheck run on group quotas */
-#define XFS_PQUOTA_ENFD        0x0200  /* project quota limits enforced */
-#define XFS_PQUOTA_CHKD        0x0400  /* quotacheck run on project quotas */
-
-/*
- * Quota Accounting/Enforcement flags
- */
-#define XFS_ALL_QUOTA_ACCT     \
-               (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD     \
-               (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD     \
-               (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
-
-#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
-#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
-#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
-
-/*
- * Incore only flags for quotaoff - these bits get cleared when quota(s)
- * are in the process of getting turned off. These flags are in m_qflags but
- * never in sb_qflags.
- */
-#define XFS_UQUOTA_ACTIVE      0x1000  /* uquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE      0x2000  /* gquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE      0x4000  /* pquotas are being turned off */
-#define XFS_ALL_QUOTA_ACTIVE   \
-       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+#include "xfs_quota_defs.h"
 
 /*
- * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
- * quota will be not be switched off as long as that inode lock is held.
+ * Kernel only quota definitions and functions
  */
-#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
-                                                  XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp)   ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
-                                                  XFS_PQUOTA_ACTIVE))
-#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
-#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
-#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
 
-/*
- * Flags to tell various functions what to do. Not all of these are meaningful
- * to a single function. None of these XFS_QMOPT_* flags are meant to have
- * persistent values (ie. their values can and will change between versions)
- */
-#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
-#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
-#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
-#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
-
-/*
- * flags to xfs_trans_mod_dquot to indicate which field needs to be
- * modified.
- */
-#define XFS_QMOPT_RES_REGBLKS  0x0010000
-#define XFS_QMOPT_RES_RTBLKS   0x0020000
-#define XFS_QMOPT_BCOUNT       0x0040000
-#define XFS_QMOPT_ICOUNT       0x0080000
-#define XFS_QMOPT_RTBCOUNT     0x0100000
-#define XFS_QMOPT_DELBCOUNT    0x0200000
-#define XFS_QMOPT_DELRTBCOUNT  0x0400000
-#define XFS_QMOPT_RES_INOS     0x0800000
-
-/*
- * flags for dqalloc.
- */
-#define XFS_QMOPT_INHERIT      0x1000000
-
-/*
- * flags to xfs_trans_mod_dquot.
- */
-#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
-#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
-#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
-#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
-#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
-#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
-#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
-#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
-
-
-#define XFS_QMOPT_QUOTALL      \
-               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
-#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+struct xfs_trans;
 
-#ifdef __KERNEL__
 /*
  * This check is done typically without holding the inode lock;
  * that may seem racy, but it is harmless in the context that it is used.
@@ -301,13 +48,6 @@ typedef struct xfs_qoff_logformat {
         (XFS_IS_PQUOTA_ON(mp) && \
                (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
 
-#define XFS_MOUNT_QUOTA_ALL    (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-                                XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
-                                XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
-                                XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
-                                XFS_PQUOTA_CHKD)
-
-
 /*
  * The structure kept inside the xfs_trans_t keep track of dquot changes
  * within a transaction and apply them later.
@@ -340,8 +80,9 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
                struct xfs_mount *, struct xfs_dquot *,
                struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
 
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-               struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+               prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
+               struct xfs_dquot **);
 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
                struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -362,9 +103,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
 #else
 static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-               uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
-               struct xfs_dquot **pdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+               prid_t prid, uint flags, struct xfs_dquot **udqp,
+               struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
 {
        *udqp = NULL;
        *gdqp = NULL;
@@ -415,5 +156,4 @@ extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
 
-#endif /* __KERNEL__ */
 #endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
new file mode 100644 (file)
index 0000000..e6b0d6e
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_DEFS_H__
+#define __XFS_QUOTA_DEFS_H__
+
+/*
+ * Quota definitions shared between user and kernel source trees.
+ */
+
+/*
+ * Even though users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t     xfs_qcnt_t;
+typedef __uint16_t     xfs_qwarncnt_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER            0x0001          /* a user quota */
+#define XFS_DQ_PROJ            0x0002          /* project quota */
+#define XFS_DQ_GROUP           0x0004          /* a group quota */
+#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
+#define XFS_DQ_FREEING         0x0010          /* dquot is being torn down */
+
+#define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
+
+#define XFS_DQ_FLAGS \
+       { XFS_DQ_USER,          "USER" }, \
+       { XFS_DQ_PROJ,          "PROJ" }, \
+       { XFS_DQ_GROUP,         "GROUP" }, \
+       { XFS_DQ_DIRTY,         "DIRTY" }, \
+       { XFS_DQ_FREEING,       "FREEING" }
+
+/*
+ * We have the possibility of all three quota types being active at once, and
+ * hence free space modification requires modification of all three current
+ * dquots in a single transaction. For this case we need to have a reservation
+ * of at least 3 dquots.
+ *
+ * However, a chown operation can change both UID and GID in a single
+ * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
+ * modified. Hence for this case we need to reserve space for at least 4 dquots.
+ *
+ * And in the worst case, there's a rename operation that can be modifying up to
+ * 4 inodes with dquots attached to them. In reality, the only inodes that can
+ * have their dquots modified are the source and destination directory inodes
+ * due to directory name creation and removal. That can require space allocation
+ * and/or freeing on both directory inodes, and hence all three dquots on each
+ * inode can be modified. And if the directories are world writeable, all the
+ * dquots can be unique and so 6 dquots can be modified....
+ *
+ * And, of course, we also need to take into account the dquot log format item
+ * used to describe each dquot.
+ */
+#define XFS_DQUOT_LOGRES(mp)   \
+       ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
+
+#define XFS_IS_QUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)      ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+#define XFS_IS_UQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)     ((mp)->m_qflags & XFS_PQUOTA_ENFD)
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE      0x1000  /* clear: uquotas being turned off */
+#define XFS_GQUOTA_ACTIVE      0x2000  /* clear: gquotas being turned off */
+#define XFS_PQUOTA_ACTIVE      0x4000  /* clear: pquotas being turned off */
+#define XFS_ALL_QUOTA_ACTIVE   \
+       (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)    ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+                                                  XFS_GQUOTA_ACTIVE | \
+                                                  XFS_PQUOTA_ACTIVE))
+#define XFS_IS_OQUOTA_ON(mp)   ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
+                                                  XFS_PQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)   ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)   ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+#define XFS_IS_PQUOTA_ON(mp)   ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
+#define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES    0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION    0x0000040 /* change superblock version num */
+#define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
+#define XFS_QMOPT_DQREPAIR     0x0001000 /* repair dquot if damaged */
+#define XFS_QMOPT_GQUOTA       0x0002000 /* group dquot requested */
+#define XFS_QMOPT_ENOSPC       0x0004000 /* enospc instead of edquot (prj) */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS  0x0010000
+#define XFS_QMOPT_RES_RTBLKS   0x0020000
+#define XFS_QMOPT_BCOUNT       0x0040000
+#define XFS_QMOPT_ICOUNT       0x0080000
+#define XFS_QMOPT_RTBCOUNT     0x0100000
+#define XFS_QMOPT_DELBCOUNT    0x0200000
+#define XFS_QMOPT_DELRTBCOUNT  0x0400000
+#define XFS_QMOPT_RES_INOS     0x0800000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT      0x1000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS  XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS        XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS  XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT    XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT    XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT  XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL      \
+               (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+#endif /* __XFS_QUOTA_DEFS_H__ */
index 20e30f93b0c7dab8b548527c46634c49486e02cd..1326d81596c2920b27f45021b67b76979e5ef387 100644 (file)
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_sb.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_log.h"
+#include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
@@ -53,6 +55,18 @@ xfs_fs_get_xstate(
        return -xfs_qm_scall_getqstat(mp, fqs);
 }
 
+STATIC int
+xfs_fs_get_xstatev(
+       struct super_block      *sb,
+       struct fs_quota_statv   *fqs)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       return -xfs_qm_scall_getqstatv(mp, fqs);
+}
+
 STATIC int
 xfs_fs_set_xstate(
        struct super_block      *sb,
@@ -133,6 +147,7 @@ xfs_fs_set_dqblk(
 }
 
 const struct quotactl_ops xfs_quotactl_operations = {
+       .get_xstatev            = xfs_fs_get_xstatev,
        .get_xstate             = xfs_fs_get_xstate,
        .set_xstate             = xfs_fs_set_xstate,
        .get_dqblk              = xfs_fs_get_dqblk,
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
deleted file mode 100644 (file)
index 30ff5f4..0000000
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_space.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-
-
-/*
- * Enter all inodes for a rename transaction into a sorted array.
- */
-STATIC void
-xfs_sort_for_rename(
-       xfs_inode_t     *dp1,   /* in: old (source) directory inode */
-       xfs_inode_t     *dp2,   /* in: new (target) directory inode */
-       xfs_inode_t     *ip1,   /* in: inode of old entry */
-       xfs_inode_t     *ip2,   /* in: inode of new entry, if it
-                                  already exists, NULL otherwise. */
-       xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
-       int             *num_inodes)  /* out: number of inodes in array */
-{
-       xfs_inode_t             *temp;
-       int                     i, j;
-
-       /*
-        * i_tab contains a list of pointers to inodes.  We initialize
-        * the table here & we'll sort it.  We will then use it to
-        * order the acquisition of the inode locks.
-        *
-        * Note that the table may contain duplicates.  e.g., dp1 == dp2.
-        */
-       i_tab[0] = dp1;
-       i_tab[1] = dp2;
-       i_tab[2] = ip1;
-       if (ip2) {
-               *num_inodes = 4;
-               i_tab[3] = ip2;
-       } else {
-               *num_inodes = 3;
-               i_tab[3] = NULL;
-       }
-
-       /*
-        * Sort the elements via bubble sort.  (Remember, there are at
-        * most 4 elements to sort, so this is adequate.)
-        */
-       for (i = 0; i < *num_inodes; i++) {
-               for (j = 1; j < *num_inodes; j++) {
-                       if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
-                               temp = i_tab[j];
-                               i_tab[j] = i_tab[j-1];
-                               i_tab[j-1] = temp;
-                       }
-               }
-       }
-}
-
-/*
- * xfs_rename
- */
-int
-xfs_rename(
-       xfs_inode_t     *src_dp,
-       struct xfs_name *src_name,
-       xfs_inode_t     *src_ip,
-       xfs_inode_t     *target_dp,
-       struct xfs_name *target_name,
-       xfs_inode_t     *target_ip)
-{
-       xfs_trans_t     *tp = NULL;
-       xfs_mount_t     *mp = src_dp->i_mount;
-       int             new_parent;             /* moving to a new dir */
-       int             src_is_directory;       /* src_name is a directory */
-       int             error;
-       xfs_bmap_free_t free_list;
-       xfs_fsblock_t   first_block;
-       int             cancel_flags;
-       int             committed;
-       xfs_inode_t     *inodes[4];
-       int             spaceres;
-       int             num_inodes;
-
-       trace_xfs_rename(src_dp, target_dp, src_name, target_name);
-
-       new_parent = (src_dp != target_dp);
-       src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
-
-       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
-                               inodes, &num_inodes);
-
-       xfs_bmap_init(&free_list, &first_block);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
-       if (error == ENOSPC) {
-               spaceres = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
-       }
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               goto std_return;
-       }
-
-       /*
-        * Attach the dquots to the inodes
-        */
-       error = xfs_qm_vop_rename_dqattach(inodes);
-       if (error) {
-               xfs_trans_cancel(tp, cancel_flags);
-               goto std_return;
-       }
-
-       /*
-        * Lock all the participating inodes. Depending upon whether
-        * the target_name exists in the target directory, and
-        * whether the target directory is the same as the source
-        * directory, we can lock from 2 to 4 inodes.
-        */
-       xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
-
-       /*
-        * Join all the inodes to the transaction. From this point on,
-        * we can rely on either trans_commit or trans_cancel to unlock
-        * them.
-        */
-       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
-       if (new_parent)
-               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
-       if (target_ip)
-               xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
-
-       /*
-        * If we are using project inheritance, we only allow renames
-        * into our tree when the project IDs are the same; else the
-        * tree quota mechanism would be circumvented.
-        */
-       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                    (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
-               error = XFS_ERROR(EXDEV);
-               goto error_return;
-       }
-
-       /*
-        * Set up the target.
-        */
-       if (target_ip == NULL) {
-               /*
-                * If there's no space reservation, check the entry will
-                * fit before actually inserting it.
-                */
-               error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
-               if (error)
-                       goto error_return;
-               /*
-                * If target does not exist and the rename crosses
-                * directories, adjust the target directory link count
-                * to account for the ".." reference from the new entry.
-                */
-               error = xfs_dir_createname(tp, target_dp, target_name,
-                                               src_ip->i_ino, &first_block,
-                                               &free_list, spaceres);
-               if (error == ENOSPC)
-                       goto error_return;
-               if (error)
-                       goto abort_return;
-
-               xfs_trans_ichgtime(tp, target_dp,
-                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-               if (new_parent && src_is_directory) {
-                       error = xfs_bumplink(tp, target_dp);
-                       if (error)
-                               goto abort_return;
-               }
-       } else { /* target_ip != NULL */
-               /*
-                * If target exists and it's a directory, check that both
-                * target and source are directories and that target can be
-                * destroyed, or that neither is a directory.
-                */
-               if (S_ISDIR(target_ip->i_d.di_mode)) {
-                       /*
-                        * Make sure target dir is empty.
-                        */
-                       if (!(xfs_dir_isempty(target_ip)) ||
-                           (target_ip->i_d.di_nlink > 2)) {
-                               error = XFS_ERROR(EEXIST);
-                               goto error_return;
-                       }
-               }
-
-               /*
-                * Link the source inode under the target name.
-                * If the source inode is a directory and we are moving
-                * it across directories, its ".." entry will be
-                * inconsistent until we replace that down below.
-                *
-                * In case there is already an entry with the same
-                * name at the destination directory, remove it first.
-                */
-               error = xfs_dir_replace(tp, target_dp, target_name,
-                                       src_ip->i_ino,
-                                       &first_block, &free_list, spaceres);
-               if (error)
-                       goto abort_return;
-
-               xfs_trans_ichgtime(tp, target_dp,
-                                       XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-               /*
-                * Decrement the link count on the target since the target
-                * dir no longer points to it.
-                */
-               error = xfs_droplink(tp, target_ip);
-               if (error)
-                       goto abort_return;
-
-               if (src_is_directory) {
-                       /*
-                        * Drop the link from the old "." entry.
-                        */
-                       error = xfs_droplink(tp, target_ip);
-                       if (error)
-                               goto abort_return;
-               }
-       } /* target_ip != NULL */
-
-       /*
-        * Remove the source.
-        */
-       if (new_parent && src_is_directory) {
-               /*
-                * Rewrite the ".." entry to point to the new
-                * directory.
-                */
-               error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
-                                       target_dp->i_ino,
-                                       &first_block, &free_list, spaceres);
-               ASSERT(error != EEXIST);
-               if (error)
-                       goto abort_return;
-       }
-
-       /*
-        * We always want to hit the ctime on the source inode.
-        *
-        * This isn't strictly required by the standards since the source
-        * inode isn't really being changed, but old unix file systems did
-        * it and some incremental backup programs won't work without it.
-        */
-       xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
-
-       /*
-        * Adjust the link count on src_dp.  This is necessary when
-        * renaming a directory, either within one parent when
-        * the target existed, or across two parent directories.
-        */
-       if (src_is_directory && (new_parent || target_ip != NULL)) {
-
-               /*
-                * Decrement link count on src_directory since the
-                * entry that's moved no longer points to it.
-                */
-               error = xfs_droplink(tp, src_dp);
-               if (error)
-                       goto abort_return;
-       }
-
-       error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
-                                       &first_block, &free_list, spaceres);
-       if (error)
-               goto abort_return;
-
-       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-       if (new_parent)
-               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * rename transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT));
-               goto std_return;
-       }
-
-       /*
-        * trans_commit will unlock src_ip, target_ip & decrement
-        * the vnode references.
-        */
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
index 98dc670d3ee04182da47b27e7db1695b71807434..6f9e63c9fc2617ab89966447083527f1d0c94257 100644 (file)
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_rtalloc.h"
 #include "xfs_fsops.h"
 #include "xfs_error.h"
 #include "xfs_inode_item.h"
 #include "xfs_trans_space.h"
-#include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
 #include "xfs_icache.h"
@@ -101,10 +100,9 @@ xfs_growfs_rt_alloc(
                /*
                 * Reserve space & log for one extent added to the file.
                 */
-               if ((error = xfs_trans_reserve(tp, resblks,
-                               XFS_GROWRTALLOC_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_DEFAULT_PERM_LOG_COUNT)))
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+                                         resblks, 0);
+               if (error)
                        goto error_cancel;
                cancelflags = XFS_TRANS_RELEASE_LOG_RES;
                /*
@@ -147,8 +145,9 @@ xfs_growfs_rt_alloc(
                        /*
                         * Reserve log for one block zeroing.
                         */
-                       if ((error = xfs_trans_reserve(tp, 0,
-                                       XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+                       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+                                                 0, 0);
+                       if (error)
                                goto error_cancel;
                        /*
                         * Lock the bitmap inode.
@@ -736,8 +735,8 @@ xfs_rtallocate_range(
 {
        xfs_rtblock_t   end;            /* end of the allocated extent */
        int             error;          /* error value */
-       xfs_rtblock_t   postblock;      /* first block allocated > end */
-       xfs_rtblock_t   preblock;       /* first block allocated < start */
+       xfs_rtblock_t   postblock = 0;  /* first block allocated > end */
+       xfs_rtblock_t   preblock = 0;   /* first block allocated < start */
 
        end = start + len - 1;
        /*
@@ -1958,8 +1957,9 @@ xfs_growfs_rt(
                 * Start a transaction, get the log reservation.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-               if ((error = xfs_trans_reserve(tp, 0,
-                               XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
+                                         0, 0);
+               if (error)
                        goto error_cancel;
                /*
                 * Lock out other callers by grabbing the bitmap inode lock.
@@ -2148,7 +2148,7 @@ xfs_rtfree_extent(
        ASSERT(mp->m_rbmip->i_itemp != NULL);
        ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 
-#if defined(__KERNEL__) && defined(DEBUG)
+#ifdef DEBUG
        /*
         * Check to see that this whole range is currently allocated.
         */
index f7f3a359c1c5a238afd4884b19f4a74363e36287..b2a1a24c0e2f3d8037cdd03f2b8deffc298d38c9 100644 (file)
 #ifndef __XFS_RTALLOC_H__
 #define        __XFS_RTALLOC_H__
 
+/* kernel only definitions and functions */
+
 struct xfs_mount;
 struct xfs_trans;
 
-/* Min and max rt extent sizes, specified in bytes */
-#define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
-#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
-#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
-
-/*
- * Constants for bit manipulations.
- */
-#define        XFS_NBBYLOG     3               /* log2(NBBY) */
-#define        XFS_WORDLOG     2               /* log2(sizeof(xfs_rtword_t)) */
-#define        XFS_NBWORDLOG   (XFS_NBBYLOG + XFS_WORDLOG)
-#define        XFS_NBWORD      (1 << XFS_NBWORDLOG)
-#define        XFS_WORDMASK    ((1 << XFS_WORDLOG) - 1)
-
-#define        XFS_BLOCKSIZE(mp)       ((mp)->m_sb.sb_blocksize)
-#define        XFS_BLOCKMASK(mp)       ((mp)->m_blockmask)
-#define        XFS_BLOCKWSIZE(mp)      ((mp)->m_blockwsize)
-#define        XFS_BLOCKWMASK(mp)      ((mp)->m_blockwmask)
-
-/*
- * Summary and bit manipulation macros.
- */
-#define        XFS_SUMOFFS(mp,ls,bb)   ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
-#define        XFS_SUMOFFSTOBLOCK(mp,s)        \
-       (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
-#define        XFS_SUMPTR(mp,bp,so)    \
-       ((xfs_suminfo_t *)((bp)->b_addr + \
-               (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
-
-#define        XFS_BITTOBLOCK(mp,bi)   ((bi) >> (mp)->m_blkbit_log)
-#define        XFS_BLOCKTOBIT(mp,bb)   ((bb) << (mp)->m_blkbit_log)
-#define        XFS_BITTOWORD(mp,bi)    \
-       ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
-
-#define        XFS_RTMIN(a,b)  ((a) < (b) ? (a) : (b))
-#define        XFS_RTMAX(a,b)  ((a) > (b) ? (a) : (b))
-
-#define        XFS_RTLOBIT(w)  xfs_lowbit32(w)
-#define        XFS_RTHIBIT(w)  xfs_highbit32(w)
-
-#if XFS_BIG_BLKNOS
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit64(b)
-#else
-#define        XFS_RTBLOCKLOG(b)       xfs_highbit32(b)
-#endif
-
-
-#ifdef __KERNEL__
-
 #ifdef CONFIG_XFS_RT
 /*
  * Function prototypes for exported functions.
@@ -161,6 +114,4 @@ xfs_rtmount_init(
 # define xfs_rtunmount_inodes(m)
 #endif /* CONFIG_XFS_RT */
 
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
new file mode 100644 (file)
index 0000000..a5b59d9
--- /dev/null
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_fsops.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+/*
+ * Physical superblock buffer manipulations. Shared with libxfs in userspace.
+ */
+
+static const struct {
+       short offset;   /* byte offset of the field within xfs_sb_t */
+       short type;     /* 0 = integer
+                        * 1 = binary / string (no translation)
+                        */
+} xfs_sb_info[] = {
+       { offsetof(xfs_sb_t, sb_magicnum),      0 },
+       { offsetof(xfs_sb_t, sb_blocksize),     0 },
+       { offsetof(xfs_sb_t, sb_dblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rblocks),       0 },
+       { offsetof(xfs_sb_t, sb_rextents),      0 },
+       { offsetof(xfs_sb_t, sb_uuid),          1 },
+       { offsetof(xfs_sb_t, sb_logstart),      0 },
+       { offsetof(xfs_sb_t, sb_rootino),       0 },
+       { offsetof(xfs_sb_t, sb_rbmino),        0 },
+       { offsetof(xfs_sb_t, sb_rsumino),       0 },
+       { offsetof(xfs_sb_t, sb_rextsize),      0 },
+       { offsetof(xfs_sb_t, sb_agblocks),      0 },
+       { offsetof(xfs_sb_t, sb_agcount),       0 },
+       { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
+       { offsetof(xfs_sb_t, sb_logblocks),     0 },
+       { offsetof(xfs_sb_t, sb_versionnum),    0 },
+       { offsetof(xfs_sb_t, sb_sectsize),      0 },
+       { offsetof(xfs_sb_t, sb_inodesize),     0 },
+       { offsetof(xfs_sb_t, sb_inopblock),     0 },
+       { offsetof(xfs_sb_t, sb_fname[0]),      1 },
+       { offsetof(xfs_sb_t, sb_blocklog),      0 },
+       { offsetof(xfs_sb_t, sb_sectlog),       0 },
+       { offsetof(xfs_sb_t, sb_inodelog),      0 },
+       { offsetof(xfs_sb_t, sb_inopblog),      0 },
+       { offsetof(xfs_sb_t, sb_agblklog),      0 },
+       { offsetof(xfs_sb_t, sb_rextslog),      0 },
+       { offsetof(xfs_sb_t, sb_inprogress),    0 },
+       { offsetof(xfs_sb_t, sb_imax_pct),      0 },
+       { offsetof(xfs_sb_t, sb_icount),        0 },
+       { offsetof(xfs_sb_t, sb_ifree),         0 },
+       { offsetof(xfs_sb_t, sb_fdblocks),      0 },
+       { offsetof(xfs_sb_t, sb_frextents),     0 },
+       { offsetof(xfs_sb_t, sb_uquotino),      0 },
+       { offsetof(xfs_sb_t, sb_gquotino),      0 },
+       { offsetof(xfs_sb_t, sb_qflags),        0 },
+       { offsetof(xfs_sb_t, sb_flags),         0 },
+       { offsetof(xfs_sb_t, sb_shared_vn),     0 },
+       { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
+       { offsetof(xfs_sb_t, sb_unit),          0 },
+       { offsetof(xfs_sb_t, sb_width),         0 },
+       { offsetof(xfs_sb_t, sb_dirblklog),     0 },
+       { offsetof(xfs_sb_t, sb_logsectlog),    0 },
+       { offsetof(xfs_sb_t, sb_logsectsize),   0 },
+       { offsetof(xfs_sb_t, sb_logsunit),      0 },
+       { offsetof(xfs_sb_t, sb_features2),     0 },
+       { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+       { offsetof(xfs_sb_t, sb_features_compat),       0 },
+       { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
+       { offsetof(xfs_sb_t, sb_features_incompat),     0 },
+       { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+       { offsetof(xfs_sb_t, sb_crc),           0 },
+       { offsetof(xfs_sb_t, sb_pad),           0 },
+       { offsetof(xfs_sb_t, sb_pquotino),      0 },
+       { offsetof(xfs_sb_t, sb_lsn),           0 },
+       { sizeof(xfs_sb_t),                     0 }     /* last entry: struct size, not a field offset */
+};
+
+/*
+ * Reference counting access wrappers to the perag structures.
+ * Because we never free per-ag structures, the only thing we
+ * have to protect against changes is the tree structure itself.
+ */
+struct xfs_perag *
+xfs_perag_get(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno)
+{
+       struct xfs_perag        *pag;
+       int                     ref = 0;        /* traced as 0 on a miss */
+
+       rcu_read_lock();
+       pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+       if (pag) {
+               ASSERT(atomic_read(&pag->pag_ref) >= 0);
+               ref = atomic_inc_return(&pag->pag_ref);
+       }
+       rcu_read_unlock();
+       trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+       return pag;     /* NULL when the lookup found nothing for agno */
+}
+
+/*
+ * Search from @first for the next perag with @tag set; NULL if none found.
+ */
+struct xfs_perag *
+xfs_perag_get_tag(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          first,
+       int                     tag)
+{
+       struct xfs_perag        *pag;
+       int                     found;  /* lookup result; <= 0 means no match */
+       int                     ref;    /* count after taking our reference */
+
+       rcu_read_lock();
+       found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+                                       (void **)&pag, first, 1, tag);
+       if (found <= 0) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       ref = atomic_inc_return(&pag->pag_ref);
+       rcu_read_unlock();
+       trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
+       return pag;
+}
+
+void
+xfs_perag_put(
+       struct xfs_perag        *pag)   /* perag whose reference is dropped */
+{
+       int     ref;    /* count after the decrement; only fed to the tracepoint */
+
+       ASSERT(atomic_read(&pag->pag_ref) > 0);
+       ref = atomic_dec_return(&pag->pag_ref);
+       trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
+/*
+ * Check the validity of the SB found; returns 0 if it passes every check.
+ */
+STATIC int
+xfs_mount_validate_sb(
+       xfs_mount_t     *mp,
+       xfs_sb_t        *sbp,
+       bool            check_inprogress,       /* fail if sb_inprogress is set */
+       bool            check_version)  /* run the v5 feature mask checks */
+{
+
+       /*
+        * Magic number and version are checked first.  Note for the log
+        * checks further down: if the log device and data device are the
+        * same, the log is internal and sb_logstart should be non-zero.
+        * A zero sb_logstart in that case may mean we are trying to mount
+        * a volume filesystem in a non-volume manner.
+        */
+       if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+               xfs_warn(mp, "bad magic number");
+               return XFS_ERROR(EWRONGFS);
+       }
+
+
+       if (!xfs_sb_good_version(sbp)) {
+               xfs_warn(mp, "bad version");
+               return XFS_ERROR(EWRONGFS);
+       }
+
+       /*
+        * Version 5 superblock feature mask validation. Reject combinations the
+        * kernel cannot support up front before checking anything else. For
+        * write validation, we don't need to check feature masks.
+        */
+       if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+               xfs_alert(mp,
+"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
+"Use of these features in this kernel is at your own risk!");
+
+               if (xfs_sb_has_compat_feature(sbp,
+                                       XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+                               (sbp->sb_features_compat &
+                                               XFS_SB_FEAT_COMPAT_UNKNOWN));
+               }
+
+               if (xfs_sb_has_ro_compat_feature(sbp,
+                                       XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+                       xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+                               (sbp->sb_features_ro_compat &
+                                               XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+                       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+                               xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+                               return XFS_ERROR(EINVAL);
+                       }
+               }
+               if (xfs_sb_has_incompat_feature(sbp,
+                                       XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+                       xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+                               (sbp->sb_features_incompat &
+                                               XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+                       return XFS_ERROR(EINVAL);
+               }
+       }
+
+       if (xfs_sb_version_has_pquotino(sbp)) {
+               if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
+                       xfs_notice(mp,
+                          "Version 5 of Super block has XFS_OQUOTA bits.\n");
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+       } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+                               XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
+                       xfs_notice(mp,
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
+                       return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       if (unlikely(
+           sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an external log; "
+               "specify logdev on the mount command line.");
+               return XFS_ERROR(EINVAL);
+       }
+
+       if (unlikely(
+           sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
+               xfs_warn(mp,
+               "filesystem is marked as having an internal log; "
+               "do not specify logdev on the mount command line.");
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * More sanity checking.  Most of these were stolen directly from
+        * xfs_repair.
+        */
+       if (unlikely(
+           sbp->sb_agcount <= 0                                        ||
+           sbp->sb_sectsize < XFS_MIN_SECTORSIZE                       ||
+           sbp->sb_sectsize > XFS_MAX_SECTORSIZE                       ||
+           sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG                    ||
+           sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG                    ||
+           sbp->sb_sectsize != (1 << sbp->sb_sectlog)                  ||
+           sbp->sb_blocksize < XFS_MIN_BLOCKSIZE                       ||
+           sbp->sb_blocksize > XFS_MAX_BLOCKSIZE                       ||
+           sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG                    ||
+           sbp->sb_blocksize != (1 << sbp->sb_blocklog)                ||
+           sbp->sb_inodesize < XFS_DINODE_MIN_SIZE                     ||
+           sbp->sb_inodesize > XFS_DINODE_MAX_SIZE                     ||
+           sbp->sb_inodelog < XFS_DINODE_MIN_LOG                       ||
+           sbp->sb_inodelog > XFS_DINODE_MAX_LOG                       ||
+           sbp->sb_inodesize != (1 << sbp->sb_inodelog)                ||
+           (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)   ||
+           (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)  ||
+           (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)  ||
+           (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)    ||
+           sbp->sb_dblocks == 0                                        ||
+           sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
+           sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
+               XFS_CORRUPTION_ERROR("SB sanity check failed",
+                               XFS_ERRLEVEL_LOW, mp, sbp);
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /*
+        * Until this is fixed only page-sized or smaller data blocks work.
+        */
+       if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+               xfs_warn(mp,
+               "File system with blocksize %d bytes. "
+               "Only pagesize (%ld) or less will currently work.",
+                               sbp->sb_blocksize, PAGE_SIZE);
+               return XFS_ERROR(ENOSYS);
+       }
+
+       /*
+        * Currently only very few inode sizes are supported.
+        */
+       switch (sbp->sb_inodesize) {
+       case 256:
+       case 512:
+       case 1024:
+       case 2048:
+               break;
+       default:
+               xfs_warn(mp, "inode size of %d bytes not supported",
+                               sbp->sb_inodesize);
+               return XFS_ERROR(ENOSYS);
+       }
+
+       if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+           xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
+               xfs_warn(mp,
+               "file system too large to be mounted on this system.");
+               return XFS_ERROR(EFBIG);
+       }
+
+       if (check_inprogress && sbp->sb_inprogress) {
+               xfs_warn(mp, "Offline file system operation in progress!");
+               return XFS_ERROR(EFSCORRUPTED);
+       }
+
+       /*
+        * Version 1 directory format has never worked on Linux.
+        */
+       if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
+               xfs_warn(mp, "file system using version 1 directory format");
+               return XFS_ERROR(ENOSYS);
+       }
+
+       return 0;
+}
+
+void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+       /*
+        * Older mkfs doesn't initialize quota inodes to NULLFSINO. This
+        * leads to in-core values having two different values for a quota
+        * inode to be invalid: 0 and NULLFSINO. Change it to a single value
+        * NULLFSINO.
+        *
+        * Note that this change affects only the in-core values. These
+        * values are not written back to disk unless any quota information
+        * is written to the disk. Even in that case, sb_pquotino field is
+        * not written to disk unless the superblock supports pquotino.
+        */
+       if (sbp->sb_uquotino == 0)
+               sbp->sb_uquotino = NULLFSINO;
+       if (sbp->sb_gquotino == 0)
+               sbp->sb_gquotino = NULLFSINO;
+       if (sbp->sb_pquotino == 0)
+               sbp->sb_pquotino = NULLFSINO;
+
+       /*
+        * We need to do these manipulations only if we are working
+        * with an older version of on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(sbp))
+               return;
+
+       if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+       if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+               sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+                                       XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+       sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+
+       if (sbp->sb_qflags & XFS_PQUOTA_ACCT)  {
+               /*
+                * In older version of superblock, on-disk superblock only
+                * has sb_gquotino, and in-core superblock has both sb_gquotino
+                * and sb_pquotino. But, only one of them is supported at any
+                * point of time. So, if PQUOTA is set in disk superblock,
+                * copy over sb_gquotino to sb_pquotino.
+                */
+               sbp->sb_pquotino = sbp->sb_gquotino;
+               sbp->sb_gquotino = NULLFSINO;
+       }
+}
+
+void
+xfs_sb_from_disk(
+       struct xfs_sb   *to,    /* in-core superblock to fill in */
+       xfs_dsb_t       *from)  /* on-disk superblock, big-endian fields */
+{
+       to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
+       to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
+       to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
+       to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
+       to->sb_rextents = be64_to_cpu(from->sb_rextents);
+       memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+       to->sb_logstart = be64_to_cpu(from->sb_logstart);
+       to->sb_rootino = be64_to_cpu(from->sb_rootino);
+       to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
+       to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
+       to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
+       to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
+       to->sb_agcount = be32_to_cpu(from->sb_agcount);
+       to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
+       to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
+       to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
+       to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
+       to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
+       to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
+       memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+       to->sb_blocklog = from->sb_blocklog;    /* copied as-is, no byte swap */
+       to->sb_sectlog = from->sb_sectlog;
+       to->sb_inodelog = from->sb_inodelog;
+       to->sb_inopblog = from->sb_inopblog;
+       to->sb_agblklog = from->sb_agblklog;
+       to->sb_rextslog = from->sb_rextslog;
+       to->sb_inprogress = from->sb_inprogress;
+       to->sb_imax_pct = from->sb_imax_pct;
+       to->sb_icount = be64_to_cpu(from->sb_icount);
+       to->sb_ifree = be64_to_cpu(from->sb_ifree);
+       to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
+       to->sb_frextents = be64_to_cpu(from->sb_frextents);
+       to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
+       to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
+       to->sb_qflags = be16_to_cpu(from->sb_qflags);
+       to->sb_flags = from->sb_flags;
+       to->sb_shared_vn = from->sb_shared_vn;
+       to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
+       to->sb_unit = be32_to_cpu(from->sb_unit);
+       to->sb_width = be32_to_cpu(from->sb_width);
+       to->sb_dirblklog = from->sb_dirblklog;
+       to->sb_logsectlog = from->sb_logsectlog;
+       to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
+       to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
+       to->sb_features2 = be32_to_cpu(from->sb_features2);
+       to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+       to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+       to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+       to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+       to->sb_features_log_incompat =
+                               be32_to_cpu(from->sb_features_log_incompat);
+       to->sb_pad = 0; /* always zeroed in-core; the disk value is ignored */
+       to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+       to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       *fields)
+{
+       __uint16_t      qflags = from->sb_qflags;
+
+       /*
+        * We need to do these manipulations only if we are working
+        * with an older version of on-disk superblock.
+        */
+       if (xfs_sb_version_has_pquotino(from))
+               return;
+
+       if (*fields & XFS_SB_QFLAGS) {
+               /*
+                * The in-core version of sb_qflags does not have
+                * XFS_OQUOTA_* flags, whereas the on-disk version
+                * does.  So, convert incore XFS_{PG}QUOTA_* flags
+                * to on-disk XFS_OQUOTA_* flags.
+                */
+               qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+                               XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+                       qflags |= XFS_OQUOTA_ENFD;
+               if (from->sb_qflags &
+                               (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+                       qflags |= XFS_OQUOTA_CHKD;
+               to->sb_qflags = cpu_to_be16(qflags);
+               *fields &= ~XFS_SB_QFLAGS;
+       }
+
+       /*
+        * GQUOTINO and PQUOTINO cannot be used together in versions
+        * of superblock that do not have pquotino. from->sb_qflags
+        * tells us which quota is active and should be copied to
+        * disk.
+        */
+       if ((*fields & XFS_SB_GQUOTINO) &&
+                               (from->sb_qflags & XFS_GQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+       else if ((*fields & XFS_SB_PQUOTINO) &&
+                               (from->sb_qflags & XFS_PQUOTA_ACCT))
+               to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+
+       *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+}
+
+/*
+ * Copy the in-core superblock to the on-disk one.
+ *
+ * The fields argument is a mask of superblock fields to copy.
+ */
+void
+xfs_sb_to_disk(
+       xfs_dsb_t       *to,
+       xfs_sb_t        *from,
+       __int64_t       fields)
+{
+       xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
+       xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
+       xfs_sb_field_t  f;
+       int             first;
+       int             size;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+
+       xfs_sb_quota_to_disk(to, from, &fields);
+       while (fields) {
+               f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+               first = xfs_sb_info[f].offset;
+               size = xfs_sb_info[f + 1].offset - first;
+
+               ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+               if (size == 1 || xfs_sb_info[f].type == 1) {
+                       memcpy(to_ptr + first, from_ptr + first, size);
+               } else {
+                       switch (size) {
+                       case 2:
+                               *(__be16 *)(to_ptr + first) =
+                                     cpu_to_be16(*(__u16 *)(from_ptr + first));
+                               break;
+                       case 4:
+                               *(__be32 *)(to_ptr + first) =
+                                     cpu_to_be32(*(__u32 *)(from_ptr + first));
+                               break;
+                       case 8:
+                               *(__be64 *)(to_ptr + first) =
+                                     cpu_to_be64(*(__u64 *)(from_ptr + first));
+                               break;
+                       default:
+                               ASSERT(0);
+                       }
+               }
+
+               fields &= ~(1LL << f);
+       }
+}
+
+static int
+xfs_sb_verify(
+       struct xfs_buf  *bp,
+       bool            check_version)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_sb   sb;
+
+       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+       /*
+        * Only check the in progress field for the primary superblock as
+        * mkfs.xfs doesn't clear it from secondary superblocks.
+        */
+       return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+                                    check_version);
+}
+
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid.  We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ */
+static void
+xfs_sb_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+       int             error;
+
+       /*
+        * open code the version check to avoid needing to convert the entire
+        * superblock from disk order just to check the version number
+        */
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+           (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+                                               XFS_SB_VERSION_5) ||
+            dsb->sb_crc != 0)) {
+
+               if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+                                     offsetof(struct xfs_sb, sb_crc))) {
+                       error = EFSCORRUPTED;
+                       goto out_error;
+               }
+       }
+       error = xfs_sb_verify(bp, true);
+
+out_error:
+       if (error) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+                                    mp, bp->b_addr);
+               xfs_buf_ioerror(bp, error);
+       }
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_dsb  *dsb = XFS_BUF_TO_SBP(bp);
+
+
+       if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
+               /* XFS filesystem, verify noisily! */
+               xfs_sb_read_verify(bp);
+               return;
+       }
+       /* quietly fail */
+       xfs_buf_ioerror(bp, EWRONGFS);
+}
+
+static void
+xfs_sb_write_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+       int                     error;
+
+       error = xfs_sb_verify(bp, false);
+       if (error) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+                                    mp, bp->b_addr);
+               xfs_buf_ioerror(bp, error);
+               return;
+       }
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (bip)
+               XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+                        offsetof(struct xfs_sb, sb_crc));
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+       .verify_read = xfs_sb_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+       .verify_read = xfs_sb_quiet_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+/*
+ * xfs_sb_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_sb_mount_common(
+       struct xfs_mount *mp,
+       struct xfs_sb   *sbp)
+{
+       mp->m_agfrotor = mp->m_agirotor = 0;
+       spin_lock_init(&mp->m_agirotor_lock);
+       mp->m_maxagi = mp->m_sb.sb_agcount;
+       mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+       mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+       mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
+       mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+       mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+       mp->m_blockmask = sbp->sb_blocksize - 1;
+       mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+       mp->m_blockwmask = mp->m_blockwsize - 1;
+
+       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
+
+       mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+       mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+                                       sbp->sb_inopblock);
+       mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+/*
+ * xfs_initialize_perag_data
+ *
+ * Read in each per-ag structure so we can count up the number of
+ * allocated inodes, free inodes and used filesystem blocks as this
+ * information is no longer persistent in the superblock. Once we have
+ * this information, write it into the in-core superblock structure.
+ */
+int
+xfs_initialize_perag_data(
+       struct xfs_mount *mp,
+       xfs_agnumber_t  agcount)
+{
+       xfs_agnumber_t  index;
+       xfs_perag_t     *pag;
+       xfs_sb_t        *sbp = &mp->m_sb;
+       uint64_t        ifree = 0;
+       uint64_t        ialloc = 0;
+       uint64_t        bfree = 0;
+       uint64_t        bfreelst = 0;
+       uint64_t        btree = 0;
+       int             error;
+
+       for (index = 0; index < agcount; index++) {
+               /*
+                * read the agf, then the agi. This gets us
+                * all the information we need and populates the
+                * per-ag structures for us.
+                */
+               error = xfs_alloc_pagf_init(mp, NULL, index, 0);
+               if (error)
+                       return error;
+
+               error = xfs_ialloc_pagi_init(mp, NULL, index);
+               if (error)
+                       return error;
+               pag = xfs_perag_get(mp, index);
+               ifree += pag->pagi_freecount;
+               ialloc += pag->pagi_count;
+               bfree += pag->pagf_freeblks;
+               bfreelst += pag->pagf_flcount;
+               btree += pag->pagf_btreeblks;
+               xfs_perag_put(pag);
+       }
+       /*
+        * Overwrite incore superblock counters with just-read data
+        */
+       spin_lock(&mp->m_sb_lock);
+       sbp->sb_ifree = ifree;
+       sbp->sb_icount = ialloc;
+       sbp->sb_fdblocks = bfree + bfreelst + btree;
+       spin_unlock(&mp->m_sb_lock);
+
+       /* Fixup the per-cpu counters as well. */
+       xfs_icsb_reinit_counters(mp);
+
+       return 0;
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+       xfs_buf_t       *bp;
+       int             first;
+       int             last;
+       xfs_mount_t     *mp;
+       xfs_sb_field_t  f;
+
+       ASSERT(fields);
+       if (!fields)
+               return;
+       mp = tp->t_mountp;
+       bp = xfs_trans_getsb(tp, mp, 0);
+       first = sizeof(xfs_sb_t);
+       last = 0;
+
+       /* translate/copy */
+
+       xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+
+       /* find modified range */
+       f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       last = xfs_sb_info[f + 1].offset - 1;
+
+       f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+       ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+       first = xfs_sb_info[f].offset;
+
+       xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+       xfs_trans_log_buf(tp, bp, first, last);
+}
index 78f9e70b80c7da8a64b92d528a9f7aa2ba49401e..6835b44f850e58e780c2712e24530dbb3e57f5f4 100644 (file)
@@ -26,6 +26,7 @@
 
 struct xfs_buf;
 struct xfs_mount;
+struct xfs_trans;
 
 #define        XFS_SB_MAGIC            0x58465342      /* 'XFSB' */
 #define        XFS_SB_VERSION_1        1               /* 5.3, 6.0.1, 6.1 */
@@ -83,11 +84,13 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
 #define XFS_SB_VERSION2_PROJID32BIT    0x00000080      /* 32 bit project id */
 #define XFS_SB_VERSION2_CRCBIT         0x00000100      /* metadata CRCs */
+#define XFS_SB_VERSION2_FTYPE          0x00000200      /* inode type in dir */
 
 #define        XFS_SB_VERSION2_OKREALFBITS     \
        (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
         XFS_SB_VERSION2_ATTR2BIT       | \
-        XFS_SB_VERSION2_PROJID32BIT)
+        XFS_SB_VERSION2_PROJID32BIT    | \
+        XFS_SB_VERSION2_FTYPE)
 #define        XFS_SB_VERSION2_OKSASHFBITS     \
        (0)
 #define XFS_SB_VERSION2_OKREALBITS     \
@@ -354,15 +357,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
                     (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
                        return 0;
 
-#ifdef __KERNEL__
                if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
                        return 0;
-#else
-               if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
-                   sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
-                       return 0;
-#endif
-
                return 1;
        }
        if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
@@ -554,12 +550,13 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
                (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
 }
 
-static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
 {
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+       sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
+       sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
 }
 
-
 /*
  * Extended v5 superblock feature masks. These are to be used for new v5
  * superblock features only.
@@ -598,7 +595,10 @@ xfs_sb_has_ro_compat_feature(
        return (sbp->sb_features_ro_compat & feature) != 0;
 }
 
-#define XFS_SB_FEAT_INCOMPAT_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_ALL \
+               (XFS_SB_FEAT_INCOMPAT_FTYPE)
+
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
 xfs_sb_has_incompat_feature(
@@ -618,16 +618,39 @@ xfs_sb_has_incompat_log_feature(
        return (sbp->sb_features_log_incompat & feature) != 0;
 }
 
-static inline bool
-xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+/*
+ * V5 superblock specific feature checks
+ */
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
 {
-       return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
+{
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+               xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
+              (xfs_sb_version_hasmorebits(sbp) &&
+                (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
 }
 
 /*
  * end of superblock version macros
  */
 
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+       return (ino == sbp->sb_uquotino ||
+               ino == sbp->sb_gquotino ||
+               ino == sbp->sb_pquotino);
+}
+
 #define XFS_SB_DADDR           ((xfs_daddr_t)0) /* daddr in filesystem/ag */
 #define        XFS_SB_BLOCK(mp)        XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
 #define XFS_BUF_TO_SBP(bp)     ((xfs_dsb_t *)((bp)->b_addr))
@@ -660,4 +683,23 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
 #define XFS_B_TO_FSBT(mp,b)    (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
 #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
 
+/*
+ * perag get/put wrappers for ref counting
+ */
+extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
+extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
+                                          int tag);
+extern void    xfs_perag_put(struct xfs_perag *pag);
+extern int     xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
+
+extern void    xfs_sb_calc_crc(struct xfs_buf  *);
+extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void    xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void    xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+
 #endif /* __XFS_SB_H__ */
index 1d68ffcdeaa7f555ab77ee05f5c13571c6ee3b4d..979a77d4b87d1142c12b4b1cabcf41f75777041a 100644 (file)
  */
 
 #include "xfs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_fsops.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
 #include "xfs_filestream.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
@@ -421,12 +421,6 @@ xfs_parseargs(
        }
 #endif
 
-       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
-           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
-               xfs_warn(mp, "cannot mount with both project and group quota");
-               return EINVAL;
-       }
-
        if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
                xfs_warn(mp, "sunit and swidth must be specified together");
                return EINVAL;
@@ -556,14 +550,13 @@ xfs_showargs(
        else if (mp->m_qflags & XFS_UQUOTA_ACCT)
                seq_puts(m, "," MNTOPT_UQUOTANOENF);
 
-       /* Either project or group quotas can be active, not both */
-
        if (mp->m_qflags & XFS_PQUOTA_ACCT) {
                if (mp->m_qflags & XFS_PQUOTA_ENFD)
                        seq_puts(m, "," MNTOPT_PRJQUOTA);
                else
                        seq_puts(m, "," MNTOPT_PQUOTANOENF);
-       } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+       }
+       if (mp->m_qflags & XFS_GQUOTA_ACCT) {
                if (mp->m_qflags & XFS_GQUOTA_ENFD)
                        seq_puts(m, "," MNTOPT_GRPQUOTA);
                else
@@ -870,17 +863,17 @@ xfs_init_mount_workqueues(
                goto out_destroy_unwritten;
 
        mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
        if (!mp->m_reclaim_workqueue)
                goto out_destroy_cil;
 
        mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
        if (!mp->m_log_workqueue)
                goto out_destroy_reclaim;
 
        mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
-                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+                       0, 0, mp->m_fsname);
        if (!mp->m_eofblocks_workqueue)
                goto out_destroy_log;
 
@@ -1396,6 +1389,14 @@ xfs_finish_flags(
                return XFS_ERROR(EROFS);
        }
 
+       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
+           !xfs_sb_version_has_pquotino(&mp->m_sb)) {
+               xfs_warn(mp,
+                 "Super block does not support project and group quota together");
+               return XFS_ERROR(EINVAL);
+       }
+
        return 0;
 }
 
index f4895b662fcb549706881a4dd65a8b048d23d03e..2f2a7c005be2d32219fd9c580bb2050f2f4e0050 100644 (file)
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir2.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
-#include "xfs_utils.h"
 #include "xfs_trans_space.h"
-#include "xfs_log_priv.h"
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
-
-
-/*
- * Each contiguous block has a header, so it is not just a simple pathlen
- * to FSB conversion.
- */
-int
-xfs_symlink_blocks(
-       struct xfs_mount *mp,
-       int             pathlen)
-{
-       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
-
-       return (pathlen + buflen - 1) / buflen;
-}
-
-static int
-xfs_symlink_hdr_set(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return 0;
-
-       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
-       dsl->sl_offset = cpu_to_be32(offset);
-       dsl->sl_bytes = cpu_to_be32(size);
-       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
-       dsl->sl_owner = cpu_to_be64(ino);
-       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       return sizeof(struct xfs_dsymlink_hdr);
-}
-
-/*
- * Checking of the symlink header is split into two parts. the verifier does
- * CRC, location and bounds checking, the unpacking function checks the path
- * parameters and owner.
- */
-bool
-xfs_symlink_hdr_ok(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino,
-       uint32_t                offset,
-       uint32_t                size,
-       struct xfs_buf          *bp)
-{
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (offset != be32_to_cpu(dsl->sl_offset))
-               return false;
-       if (size != be32_to_cpu(dsl->sl_bytes))
-               return false;
-       if (ino != be64_to_cpu(dsl->sl_owner))
-               return false;
-
-       /* ok */
-       return true;
-}
-
-static bool
-xfs_symlink_verify(
-       struct xfs_buf          *bp)
-{
-       struct xfs_mount        *mp = bp->b_target->bt_mount;
-       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return false;
-       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
-               return false;
-       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
-               return false;
-       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
-               return false;
-       if (be32_to_cpu(dsl->sl_offset) +
-                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
-               return false;
-       if (dsl->sl_owner == 0)
-               return false;
-
-       return true;
-}
-
-static void
-xfs_symlink_read_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-                                 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
-           !xfs_symlink_verify(bp)) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-       }
-}
-
-static void
-xfs_symlink_write_verify(
-       struct xfs_buf  *bp)
-{
-       struct xfs_mount *mp = bp->b_target->bt_mount;
-       struct xfs_buf_log_item *bip = bp->b_fspriv;
-
-       /* no verification of non-crc buffers */
-       if (!xfs_sb_version_hascrc(&mp->m_sb))
-               return;
-
-       if (!xfs_symlink_verify(bp)) {
-               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-               xfs_buf_ioerror(bp, EFSCORRUPTED);
-               return;
-       }
-
-       if (bip) {
-               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
-               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
-       }
-       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-                        offsetof(struct xfs_dsymlink_hdr, sl_crc));
-}
-
-const struct xfs_buf_ops xfs_symlink_buf_ops = {
-       .verify_read = xfs_symlink_read_verify,
-       .verify_write = xfs_symlink_write_verify,
-};
-
-void
-xfs_symlink_local_to_remote(
-       struct xfs_trans        *tp,
-       struct xfs_buf          *bp,
-       struct xfs_inode        *ip,
-       struct xfs_ifork        *ifp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       char                    *buf;
-
-       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
-               bp->b_ops = NULL;
-               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-               return;
-       }
-
-       /*
-        * As this symlink fits in an inode literal area, it must also fit in
-        * the smallest buffer the filesystem supports.
-        */
-       ASSERT(BBTOB(bp->b_length) >=
-                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
-
-       bp->b_ops = &xfs_symlink_buf_ops;
-
-       buf = bp->b_addr;
-       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
-       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
-}
 
 /* ----- Kernel only functions below ----- */
 STATIC int
@@ -386,8 +215,11 @@ xfs_symlink(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-               XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
+       error = xfs_qm_vop_dqalloc(dp,
+                       xfs_kuid_to_uid(current_fsuid()),
+                       xfs_kgid_to_gid(current_fsgid()), prid,
+                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                       &udqp, &gdqp, &pdqp);
        if (error)
                goto std_return;
 
@@ -402,12 +234,10 @@ xfs_symlink(
        else
                fs_blocks = xfs_symlink_blocks(mp, pathlen);
        resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-       error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
        if (error == ENOSPC && fs_blocks == 0) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
        }
        if (error) {
                cancel_flags = 0;
@@ -710,8 +540,8 @@ xfs_inactive_symlink_rmt(
         * Put an itruncate log reservation in the new transaction
         * for our caller.
         */
-       if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+       if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
index 374394880c01e4d8db8dc36d6468e748295c6f29..99338ba666ac68c11350fb1b24906364c7a6de01 100644 (file)
 #ifndef __XFS_SYMLINK_H
 #define __XFS_SYMLINK_H 1
 
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_inode;
-struct xfs_buf;
-struct xfs_ifork;
-struct xfs_name;
-
-#define XFS_SYMLINK_MAGIC      0x58534c4d      /* XSLM */
-
-struct xfs_dsymlink_hdr {
-       __be32  sl_magic;
-       __be32  sl_offset;
-       __be32  sl_bytes;
-       __be32  sl_crc;
-       uuid_t  sl_uuid;
-       __be64  sl_owner;
-       __be64  sl_blkno;
-       __be64  sl_lsn;
-};
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 3 extents back from
- * bmapi when crc headers are taken into account.
- */
-#define XFS_SYMLINK_MAPS 3
-
-#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)     \
-       ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
-                       sizeof(struct xfs_dsymlink_hdr) : 0))
-
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-                                struct xfs_inode *ip, struct xfs_ifork *ifp);
-
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
-
-#ifdef __KERNEL__
+/* Kernel only symlink definitions */
 
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
                const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
 
-#endif /* __KERNEL__ */
 #endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
new file mode 100644 (file)
index 0000000..01c85e3
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+       struct xfs_mount *mp,
+       int             pathlen)
+{
+       int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+       return (pathlen + buflen - 1) / buflen;
+}
+
+int
+xfs_symlink_hdr_set(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return 0;
+
+       dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+       dsl->sl_offset = cpu_to_be32(offset);
+       dsl->sl_bytes = cpu_to_be32(size);
+       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+       dsl->sl_owner = cpu_to_be64(ino);
+       dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. The verifier does
+ * CRC, location and bounds checking; the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino,
+       uint32_t                offset,
+       uint32_t                size,
+       struct xfs_buf          *bp)
+{
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (offset != be32_to_cpu(dsl->sl_offset))
+               return false;
+       if (size != be32_to_cpu(dsl->sl_bytes))
+               return false;
+       if (ino != be64_to_cpu(dsl->sl_owner))
+               return false;
+
+       /* ok */
+       return true;
+}
+
+static bool
+xfs_symlink_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return false;
+       if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+               return false;
+       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+               return false;
+       if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+               return false;
+       if (be32_to_cpu(dsl->sl_offset) +
+                               be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+               return false;
+       if (dsl->sl_owner == 0)
+               return false;
+
+       return true;
+}
+
+static void
+xfs_symlink_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+                                 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
+           !xfs_symlink_verify(bp)) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_symlink_write_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+       /* no verification of non-crc buffers */
+       if (!xfs_sb_version_hascrc(&mp->m_sb))
+               return;
+
+       if (!xfs_symlink_verify(bp)) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+               return;
+       }
+
+       if (bip) {
+               struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+               dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+       }
+       xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+                        offsetof(struct xfs_dsymlink_hdr, sl_crc));
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+       .verify_read = xfs_symlink_read_verify,
+       .verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp,
+       struct xfs_inode        *ip,
+       struct xfs_ifork        *ifp)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       char                    *buf;
+
+       if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+               bp->b_ops = NULL;
+               memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+               return;
+       }
+
+       /*
+        * As this symlink fits in an inode literal area, it must also fit in
+        * the smallest buffer the filesystem supports.
+        */
+       ASSERT(BBTOB(bp->b_length) >=
+                       ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+       bp->b_ops = &xfs_symlink_buf_ops;
+
+       buf = bp->b_addr;
+       buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+       memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
index b6e3897c1d9f0bbeb34f3b1e610e58217ee005f4..5d7b3e40705ffe4a96c75493d09ac74d9ae265cd 100644 (file)
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
index 35a229981354159add4b143aea86682c22966a7f..5411e01ab4527318b187846fa3e7a53ad6300eb3 100644 (file)
@@ -18,7 +18,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 kmem_zone_t    *xfs_trans_zone;
 kmem_zone_t    *xfs_log_item_desc_zone;
 
-/*
- * A buffer has a format structure overhead in the log in addition
- * to the data, so we need to take this into account when reserving
- * space in a transaction for a buffer.  Round the space required up
- * to a multiple of 128 bytes so that we don't change the historical
- * reservation that has been used for this overhead.
- */
-STATIC uint
-xfs_buf_log_overhead(void)
-{
-       return round_up(sizeof(struct xlog_op_header) +
-                       sizeof(struct xfs_buf_log_format), 128);
-}
-
-/*
- * Calculate out transaction log reservation per item in bytes.
- *
- * The nbufs argument is used to indicate the number of items that
- * will be changed in a transaction.  size is used to tell how many
- * bytes should be reserved per item.
- */
-STATIC uint
-xfs_calc_buf_res(
-       uint            nbufs,
-       uint            size)
-{
-       return nbufs * (size + xfs_buf_log_overhead());
-}
-
-/*
- * Various log reservation values.
- *
- * These are based on the size of the file system block because that is what
- * most transactions manipulate.  Each adds in an additional 128 bytes per
- * item logged to try to account for the overhead of the transaction mechanism.
- *
- * Note:  Most of the reservations underestimate the number of allocation
- * groups into which they could free extents in the xfs_bmap_finish() call.
- * This is because the number in the worst case is quite high and quite
- * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
- * extents in only a single AG at a time.  This will require changes to the
- * EFI code as well, however, so that the EFI for the extents not freed is
- * logged again in each transaction.  See SGI PV #261917.
- *
- * Reservation functions here avoid a huge stack in xfs_trans_init due to
- * register overflow from temporaries in the calculations.
- */
-
-
-/*
- * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
- *    the inode getting the new extents: inode size
- *    the inode's bmap btree: max depth * block size
- *    the agfs of the ags from which the extents are allocated: 2 * sector
- *    the superblock free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
- *    the agfs of the ags containing the blocks: 2 * sector size
- *    the agfls of the ags containing the blocks: 2 * sector size
- *    the super block free block counter: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_write_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * In truncating a file we free up to two extents at once.  We can modify:
- *    the inode being truncated: inode size
- *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_itruncate_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                     XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(5, 0) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                    XFS_FSB_TO_B(mp, 1)) +
-                   xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
-                                    mp->m_in_maxlevels, 0)));
-}
-
-/*
- * In renaming a files we can modify:
- *    the four inodes involved: 4 * inode size
- *    the two directory btrees: 2 * (max depth + v2) * dir block size
- *    the two directory bmap btrees: 2 * max depth * block size
- * And the bmap_finish transaction can free dir and bmap blocks (two sets
- *     of bmap blocks) giving:
- *    the agf for the ags in which the blocks live: 3 * sector size
- *    the agfl for the ags in which the blocks live: 3 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_rename_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For creating a link to an inode:
- *    the parent directory inode: inode size
- *    the linked inode: inode size
- *    the directory btree could split: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free some bmap blocks giving:
- *    the agf for the ag in which the blocks live: sector size
- *    the agfl for the ag in which the blocks live: sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_link_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For removing a directory entry we can modify:
- *    the parent directory inode: inode size
- *    the removed inode: inode size
- *    the directory btree could join: (max depth + v2) * dir block size
- *    the directory bmap btree could join or split: (max depth + v2) * blocksize
- * And the bmap_finish transaction can free the dir and bmap blocks giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_remove_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
-                                     XFS_FSB_TO_B(mp, 1))),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * For create, break it in to the two cases that the transaction
- * covers. We start with the modify case - allocation done by modification
- * of the state of existing inodes - and the allocation case.
- */
-
-/*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- */
-STATIC uint
-xfs_calc_create_resv_modify(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               (uint)XFS_FSB_TO_B(mp, 1) +
-               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * For create we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_create_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-__xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_create_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-/*
- * For icreate we can allocate some inodes giving:
- *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
- *    the superblock for the nlink flag: sector size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_icreate_resv_alloc(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               mp->m_sb.sb_sectsize +
-               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-STATIC uint
-xfs_calc_icreate_reservation(xfs_mount_t *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX(xfs_calc_icreate_resv_alloc(mp),
-                   xfs_calc_create_resv_modify(mp));
-}
-
-STATIC uint
-xfs_calc_create_reservation(
-       struct xfs_mount        *mp)
-{
-       if (xfs_sb_version_hascrc(&mp->m_sb))
-               return xfs_calc_icreate_reservation(mp);
-       return __xfs_calc_create_reservation(mp);
-
-}
-
-/*
- * Making a new directory is the same as creating a new file.
- */
-STATIC uint
-xfs_calc_mkdir_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp);
-}
-
-
-/*
- * Making a new symplink is the same as creating a new file, but
- * with the added blocks for remote symlink data which can be up to 1kB in
- * length (MAXPATHLEN).
- */
-STATIC uint
-xfs_calc_symlink_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_create_reservation(mp) +
-              xfs_calc_buf_res(1, MAXPATHLEN);
-}
-
-/*
- * In freeing an inode we can modify:
- *    the inode being freed: inode size
- *    the super block free inode counter: sector size
- *    the agi hash list and counters: sector size
- *    the inode btree entry: block size
- *    the on disk inode before ours in the agi hash list: inode cluster size
- *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_ifree_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-               MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
-                   XFS_INODE_CLUSTER_SIZE(mp)) +
-               xfs_calc_buf_res(1, 0) +
-               xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
-                                mp->m_in_maxlevels, 0) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * When only changing the inode we log the inode and possibly the superblock
- * We also add a bit of slop for the transaction stuff.
- */
-STATIC uint
-xfs_calc_ichange_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               mp->m_sb.sb_inodesize +
-               mp->m_sb.sb_sectsize +
-               512;
-
-}
-
-/*
- * Growing the data section of the filesystem.
- *     superblock
- *     agi and agf
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_growdata_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the first set of transactions (ALLOC) we allocate space to the
- * bitmap or summary files.
- *     superblock: sector size
- *     agf of the ag from which the extent is allocated: sector size
- *     bmap btree for bitmap/summary inode: max depth * blocksize
- *     bitmap/summary inode: inode size
- *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
- */
-STATIC uint
-xfs_calc_growrtalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the second set of transactions (ZERO) we zero the new metadata blocks.
- *     one bitmap/summary block: blocksize
- */
-STATIC uint
-xfs_calc_growrtzero_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
-}
-
-/*
- * Growing the rt section of the filesystem.
- * In the third set of transactions (FREE) we update metadata without
- * allocating any new blocks.
- *     superblock: sector size
- *     bitmap inode: inode size
- *     summary inode: inode size
- *     one bitmap block: blocksize
- *     summary blocks: new summary size
- */
-STATIC uint
-xfs_calc_growrtfree_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
-               xfs_calc_buf_res(1, mp->m_rsumsize);
-}
-
-/*
- * Logging the inode modification timestamp on a synchronous write.
- *     inode
- */
-STATIC uint
-xfs_calc_swrite_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
-}
-
-/*
- * Logging the inode mode bits when writing a setuid/setgid file
- *     inode
- */
-STATIC uint
-xfs_calc_writeid_reservation(xfs_mount_t *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
-}
-
-/*
- * Converting the inode from non-attributed to attributed.
- *     the inode being converted: inode size
- *     agf block and superblock (for block allocation)
- *     the new block (directory sized)
- *     bmap blocks for the new directory block
- *     allocation btrees
- */
-STATIC uint
-xfs_calc_addafork_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(1, mp->m_dirblksize) +
-               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
-                                XFS_FSB_TO_B(mp, 1)) +
-               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing the attribute fork of a file
- *    the inode being truncated: inode size
- *    the inode's bmap btree: max depth * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
- *    the agf for each of the ags: 4 * sector size
- *    the agfl for each of the ags: 4 * sector size
- *    the super block to reflect the freed blocks: sector size
- *    worst case split in allocation btrees per extent assuming 4 extents:
- *             4 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrinval_reservation(
-       struct xfs_mount        *mp)
-{
-       return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                    XFS_FSB_TO_B(mp, 1))),
-                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
-                                    XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Setting an attribute at mount time.
- *     the inode getting the attribute
- *     the superblock for allocations
- *     the agfs extents are allocated from
- *     the attribute btree * max depth
- *     the inode allocation btree
- * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime(see
- * below).
- */
-STATIC uint
-xfs_calc_attrsetm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Setting an attribute at runtime, transaction space unit per block.
- *     the superblock for allocations: sector size
- *     the inode bmap btree could join or split: max depth * block size
- * Since the runtime attribute transaction space is dependent on the total
- * blocks needed for the 1st bmap, here we calculate out the space unit for
- * one block so that the caller could figure out the total space according
- * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
- */
-STATIC uint
-xfs_calc_attrsetrt_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
-               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
-                                XFS_FSB_TO_B(mp, 1));
-}
-
-/*
- * Removing an attribute.
- *    the inode: inode size
- *    the attribute btree could join: max depth * block size
- *    the inode bmap btree could join or split: max depth * block size
- * And the bmap_finish transaction can free the attr blocks freed giving:
- *    the agf for the ag in which the blocks live: 2 * sector size
- *    the agfl for the ag in which the blocks live: 2 * sector size
- *    the superblock for the free block count: sector size
- *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- */
-STATIC uint
-xfs_calc_attrrm_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_DQUOT_LOGRES(mp) +
-               MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
-                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
-                                     XFS_FSB_TO_B(mp, 1)) +
-                    (uint)XFS_FSB_TO_B(mp,
-                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
-                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
-                                     XFS_FSB_TO_B(mp, 1))));
-}
-
-/*
- * Clearing a bad agino number in an agi hash bucket.
- */
-STATIC uint
-xfs_calc_clear_agi_bucket_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Clearing the quotaflags in the superblock.
- *     the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * Adjusting quota limits.
- *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
- */
-STATIC uint
-xfs_calc_qm_setqlim_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
-}
-
-/*
- * Allocating quota on disk if needed.
- *     the write transaction log space: XFS_WRITE_LOG_RES(mp)
- *     the unit of quota allocation: one system block size
- */
-STATIC uint
-xfs_calc_qm_dqalloc_reservation(
-       struct xfs_mount        *mp)
-{
-       return XFS_WRITE_LOG_RES(mp) +
-               xfs_calc_buf_res(1,
-                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
-}
-
-/*
- * Turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- *    the superblock for the quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_quotaoff_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2 +
-               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
- * End of turning off quotas.
- *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
- */
-STATIC uint
-xfs_calc_qm_quotaoff_end_reservation(
-       struct xfs_mount        *mp)
-{
-       return sizeof(struct xfs_qoff_logitem) * 2;
-}
-
-/*
- * Syncing the incore super block changes to disk.
- *     the super block to reflect the changes: sector size
- */
-STATIC uint
-xfs_calc_sb_reservation(
-       struct xfs_mount        *mp)
-{
-       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
 /*
  * Initialize the precomputed transaction reservation values
  * in the mount structure.
@@ -679,36 +56,7 @@ void
 xfs_trans_init(
        struct xfs_mount        *mp)
 {
-       struct xfs_trans_reservations *resp = &mp->m_reservations;
-
-       resp->tr_write = xfs_calc_write_reservation(mp);
-       resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
-       resp->tr_rename = xfs_calc_rename_reservation(mp);
-       resp->tr_link = xfs_calc_link_reservation(mp);
-       resp->tr_remove = xfs_calc_remove_reservation(mp);
-       resp->tr_symlink = xfs_calc_symlink_reservation(mp);
-       resp->tr_create = xfs_calc_create_reservation(mp);
-       resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
-       resp->tr_ifree = xfs_calc_ifree_reservation(mp);
-       resp->tr_ichange = xfs_calc_ichange_reservation(mp);
-       resp->tr_growdata = xfs_calc_growdata_reservation(mp);
-       resp->tr_swrite = xfs_calc_swrite_reservation(mp);
-       resp->tr_writeid = xfs_calc_writeid_reservation(mp);
-       resp->tr_addafork = xfs_calc_addafork_reservation(mp);
-       resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
-       resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
-       resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
-       resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
-       resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
-       resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
-       resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
-       resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
-       resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
-       resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
-       resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
-       resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
-       resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
-       resp->tr_sb = xfs_calc_sb_reservation(mp);
+       xfs_trans_resv_calc(mp, M_RES(mp));
 }
 
 /*
@@ -744,7 +92,7 @@ _xfs_trans_alloc(
        atomic_inc(&mp->m_active_trans);
 
        tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
-       tp->t_magic = XFS_TRANS_MAGIC;
+       tp->t_magic = XFS_TRANS_HEADER_MAGIC;
        tp->t_type = type;
        tp->t_mountp = mp;
        INIT_LIST_HEAD(&tp->t_items);
@@ -789,7 +137,7 @@ xfs_trans_dup(
        /*
         * Initialize the new transaction structure.
         */
-       ntp->t_magic = XFS_TRANS_MAGIC;
+       ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
        ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
        INIT_LIST_HEAD(&ntp->t_items);
@@ -832,12 +180,10 @@ xfs_trans_dup(
  */
 int
 xfs_trans_reserve(
-       xfs_trans_t     *tp,
-       uint            blocks,
-       uint            logspace,
-       uint            rtextents,
-       uint            flags,
-       uint            logcount)
+       struct xfs_trans        *tp,
+       struct xfs_trans_res    *resp,
+       uint                    blocks,
+       uint                    rtextents)
 {
        int             error = 0;
        int             rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -863,13 +209,15 @@ xfs_trans_reserve(
        /*
         * Reserve the log space needed for this transaction.
         */
-       if (logspace > 0) {
+       if (resp->tr_logres > 0) {
                bool    permanent = false;
 
-               ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
-               ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+               ASSERT(tp->t_log_res == 0 ||
+                      tp->t_log_res == resp->tr_logres);
+               ASSERT(tp->t_log_count == 0 ||
+                      tp->t_log_count == resp->tr_logcount);
 
-               if (flags & XFS_TRANS_PERM_LOG_RES) {
+               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
                        tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
                        permanent = true;
                } else {
@@ -878,20 +226,21 @@ xfs_trans_reserve(
                }
 
                if (tp->t_ticket != NULL) {
-                       ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+                       ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
                        error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
                } else {
-                       error = xfs_log_reserve(tp->t_mountp, logspace,
-                                               logcount, &tp->t_ticket,
-                                               XFS_TRANSACTION, permanent,
-                                               tp->t_type);
+                       error = xfs_log_reserve(tp->t_mountp,
+                                               resp->tr_logres,
+                                               resp->tr_logcount,
+                                               &tp->t_ticket, XFS_TRANSACTION,
+                                               permanent, tp->t_type);
                }
 
                if (error)
                        goto undo_blocks;
 
-               tp->t_log_res = logspace;
-               tp->t_log_count = logcount;
+               tp->t_log_res = resp->tr_logres;
+               tp->t_log_count = resp->tr_logcount;
        }
 
        /*
@@ -916,10 +265,10 @@ xfs_trans_reserve(
         * reservations which have already been performed.
         */
 undo_log:
-       if (logspace > 0) {
+       if (resp->tr_logres > 0) {
                int             log_flags;
 
-               if (flags & XFS_TRANS_PERM_LOG_RES) {
+               if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
                        log_flags = XFS_LOG_REL_PERM_RESERV;
                } else {
                        log_flags = 0;
@@ -1367,10 +716,10 @@ xfs_trans_free_items(
                lip->li_desc = NULL;
 
                if (commit_lsn != NULLCOMMITLSN)
-                       IOP_COMMITTING(lip, commit_lsn);
+                       lip->li_ops->iop_committing(lip, commit_lsn);
                if (flags & XFS_TRANS_ABORT)
                        lip->li_flags |= XFS_LI_ABORTED;
-               IOP_UNLOCK(lip);
+               lip->li_ops->iop_unlock(lip);
 
                xfs_trans_free_item_desc(lidp);
        }
@@ -1390,8 +739,11 @@ xfs_log_item_batch_insert(
        /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
        xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
 
-       for (i = 0; i < nr_items; i++)
-               IOP_UNPIN(log_items[i], 0);
+       for (i = 0; i < nr_items; i++) {
+               struct xfs_log_item *lip = log_items[i];
+
+               lip->li_ops->iop_unpin(lip, 0);
+       }
 }
 
 /*
@@ -1401,11 +753,11 @@ xfs_log_item_batch_insert(
  *
  * If we are called with the aborted flag set, it is because a log write during
  * a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which
+ * checkpoint have already gone through iop_committed and iop_unlock, which
  * means that checkpoint commit abort handling is treated exactly the same
  * as an iclog write error even though we haven't started any IO yet. Hence in
- * this case all we need to do is IOP_COMMITTED processing, followed by an
- * IOP_UNPIN(aborted) call.
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
  *
  * The AIL cursor is used to optimise the insert process. If commit_lsn is not
  * at the end of the AIL, the insert cursor avoids the need to walk
@@ -1438,7 +790,7 @@ xfs_trans_committed_bulk(
 
                if (aborted)
                        lip->li_flags |= XFS_LI_ABORTED;
-               item_lsn = IOP_COMMITTED(lip, commit_lsn);
+               item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
 
                /* item_lsn of -1 means the item needs no further processing */
                if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -1450,7 +802,7 @@ xfs_trans_committed_bulk(
                 */
                if (aborted) {
                        ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
-                       IOP_UNPIN(lip, 1);
+                       lip->li_ops->iop_unpin(lip, 1);
                        continue;
                }
 
@@ -1468,7 +820,7 @@ xfs_trans_committed_bulk(
                                xfs_trans_ail_update(ailp, lip, item_lsn);
                        else
                                spin_unlock(&ailp->xa_lock);
-                       IOP_UNPIN(lip, 0);
+                       lip->li_ops->iop_unpin(lip, 0);
                        continue;
                }
 
@@ -1666,7 +1018,7 @@ xfs_trans_roll(
        struct xfs_inode        *dp)
 {
        struct xfs_trans        *trans;
-       unsigned int            logres, count;
+       struct xfs_trans_res    tres;
        int                     error;
 
        /*
@@ -1678,8 +1030,8 @@ xfs_trans_roll(
        /*
         * Copy the critical parameters from one trans to the next.
         */
-       logres = trans->t_log_res;
-       count = trans->t_log_count;
+       tres.tr_logres = trans->t_log_res;
+       tres.tr_logcount = trans->t_log_count;
        *tpp = xfs_trans_dup(trans);
 
        /*
@@ -1710,8 +1062,8 @@ xfs_trans_roll(
         * across this call, or that anything that is locked be logged in
         * the prior and the next transactions.
         */
-       error = xfs_trans_reserve(trans, 0, logres, 0,
-                                 XFS_TRANS_PERM_LOG_RES, count);
+       tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+       error = xfs_trans_reserve(trans, &tres, 0, 0);
        /*
         *  Ensure that the inode is in the new transaction and locked.
         */
index 2b4946393e30f56655e55c782813778760846f79..09cf40b89e8c1d85817cc649b22a12c3ba97aa78 100644 (file)
 
 struct xfs_log_item;
 
-/*
- * This is the structure written in the log at the head of
- * every transaction. It identifies the type and id of the
- * transaction, and contains the number of items logged by
- * the transaction so we know how many to expect during recovery.
- *
- * Do not change the below structure without redoing the code in
- * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
- */
-typedef struct xfs_trans_header {
-       uint            th_magic;               /* magic number */
-       uint            th_type;                /* transaction type */
-       __int32_t       th_tid;                 /* transaction id (unused) */
-       uint            th_num_items;           /* num items logged by trans */
-} xfs_trans_header_t;
-
-#define        XFS_TRANS_HEADER_MAGIC  0x5452414e      /* TRAN */
-
-/*
- * Log item types.
- */
-#define        XFS_LI_EFI              0x1236
-#define        XFS_LI_EFD              0x1237
-#define        XFS_LI_IUNLINK          0x1238
-#define        XFS_LI_INODE            0x123b  /* aligned ino chunks, var-size ibufs */
-#define        XFS_LI_BUF              0x123c  /* v2 bufs, variable sized inode bufs */
-#define        XFS_LI_DQUOT            0x123d
-#define        XFS_LI_QUOTAOFF         0x123e
-#define        XFS_LI_ICREATE          0x123f
-
-#define XFS_LI_TYPE_DESC \
-       { XFS_LI_EFI,           "XFS_LI_EFI" }, \
-       { XFS_LI_EFD,           "XFS_LI_EFD" }, \
-       { XFS_LI_IUNLINK,       "XFS_LI_IUNLINK" }, \
-       { XFS_LI_INODE,         "XFS_LI_INODE" }, \
-       { XFS_LI_BUF,           "XFS_LI_BUF" }, \
-       { XFS_LI_DQUOT,         "XFS_LI_DQUOT" }, \
-       { XFS_LI_QUOTAOFF,      "XFS_LI_QUOTAOFF" }
-
-/*
- * Transaction types.  Used to distinguish types of buffers.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE     1
-#define XFS_TRANS_SETATTR_SIZE         2
-#define XFS_TRANS_INACTIVE             3
-#define XFS_TRANS_CREATE               4
-#define XFS_TRANS_CREATE_TRUNC         5
-#define XFS_TRANS_TRUNCATE_FILE                6
-#define XFS_TRANS_REMOVE               7
-#define XFS_TRANS_LINK                 8
-#define XFS_TRANS_RENAME               9
-#define XFS_TRANS_MKDIR                        10
-#define XFS_TRANS_RMDIR                        11
-#define XFS_TRANS_SYMLINK              12
-#define XFS_TRANS_SET_DMATTRS          13
-#define XFS_TRANS_GROWFS               14
-#define XFS_TRANS_STRAT_WRITE          15
-#define XFS_TRANS_DIOSTRAT             16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define        XFS_TRANS_WRITEID               18
-#define        XFS_TRANS_ADDAFORK              19
-#define        XFS_TRANS_ATTRINVAL             20
-#define        XFS_TRANS_ATRUNCATE             21
-#define        XFS_TRANS_ATTR_SET              22
-#define        XFS_TRANS_ATTR_RM               23
-#define        XFS_TRANS_ATTR_FLAG             24
-#define        XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_QM_SBCHANGE          26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1               27
-#define XFS_TRANS_DUMMY2               28
-#define XFS_TRANS_QM_QUOTAOFF          29
-#define XFS_TRANS_QM_DQALLOC           30
-#define XFS_TRANS_QM_SETQLIM           31
-#define XFS_TRANS_QM_DQCLUSTER         32
-#define XFS_TRANS_QM_QINOCREATE                33
-#define XFS_TRANS_QM_QUOTAOFF_END      34
-#define XFS_TRANS_SB_UNIT              35
-#define XFS_TRANS_FSYNC_TS             36
-#define        XFS_TRANS_GROWFSRT_ALLOC        37
-#define        XFS_TRANS_GROWFSRT_ZERO         38
-#define        XFS_TRANS_GROWFSRT_FREE         39
-#define        XFS_TRANS_SWAPEXT               40
-#define        XFS_TRANS_SB_COUNT              41
-#define        XFS_TRANS_CHECKPOINT            42
-#define        XFS_TRANS_ICREATE               43
-#define        XFS_TRANS_TYPE_MAX              43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
-       { XFS_TRANS_SETATTR_NOT_SIZE,   "SETATTR_NOT_SIZE" }, \
-       { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
-       { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
-       { XFS_TRANS_CREATE,             "CREATE" }, \
-       { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
-       { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
-       { XFS_TRANS_REMOVE,             "REMOVE" }, \
-       { XFS_TRANS_LINK,               "LINK" }, \
-       { XFS_TRANS_RENAME,             "RENAME" }, \
-       { XFS_TRANS_MKDIR,              "MKDIR" }, \
-       { XFS_TRANS_RMDIR,              "RMDIR" }, \
-       { XFS_TRANS_SYMLINK,            "SYMLINK" }, \
-       { XFS_TRANS_SET_DMATTRS,        "SET_DMATTRS" }, \
-       { XFS_TRANS_GROWFS,             "GROWFS" }, \
-       { XFS_TRANS_STRAT_WRITE,        "STRAT_WRITE" }, \
-       { XFS_TRANS_DIOSTRAT,           "DIOSTRAT" }, \
-       { XFS_TRANS_WRITEID,            "WRITEID" }, \
-       { XFS_TRANS_ADDAFORK,           "ADDAFORK" }, \
-       { XFS_TRANS_ATTRINVAL,          "ATTRINVAL" }, \
-       { XFS_TRANS_ATRUNCATE,          "ATRUNCATE" }, \
-       { XFS_TRANS_ATTR_SET,           "ATTR_SET" }, \
-       { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
-       { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
-       { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-       { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
-       { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
-       { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
-       { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
-       { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
-       { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
-       { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-       { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
-       { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
-       { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
-       { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
-       { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
-       { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-       { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
-       { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-       { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
-       { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
-       { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
-
-/*
- * This structure is used to track log items associated with
- * a transaction.  It points to the log item and keeps some
- * flags to track the state of the log item.  It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
-       struct xfs_log_item     *lid_item;
-       struct list_head        lid_trans;
-       unsigned char           lid_flags;
-};
-
-#define XFS_LID_DIRTY          0x1
-
-#define        XFS_TRANS_MAGIC         0x5452414E      /* 'TRAN' */
-/*
- * Values for t_flags.
- */
-#define        XFS_TRANS_DIRTY         0x01    /* something needs to be logged */
-#define        XFS_TRANS_SB_DIRTY      0x02    /* superblock is modified */
-#define        XFS_TRANS_PERM_LOG_RES  0x04    /* xact took a permanent log res */
-#define        XFS_TRANS_SYNC          0x08    /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY     0x10    /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE      0x20    /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT  0x40    /* Transaction has elevated writer
-                                          count in superblock */
-
-/*
- * Values for call flags parameter.
- */
-#define        XFS_TRANS_RELEASE_LOG_RES       0x4
-#define        XFS_TRANS_ABORT                 0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define        XFS_TRANS_SB_ICOUNT             0x00000001
-#define        XFS_TRANS_SB_IFREE              0x00000002
-#define        XFS_TRANS_SB_FDBLOCKS           0x00000004
-#define        XFS_TRANS_SB_RES_FDBLOCKS       0x00000008
-#define        XFS_TRANS_SB_FREXTENTS          0x00000010
-#define        XFS_TRANS_SB_RES_FREXTENTS      0x00000020
-#define        XFS_TRANS_SB_DBLOCKS            0x00000040
-#define        XFS_TRANS_SB_AGCOUNT            0x00000080
-#define        XFS_TRANS_SB_IMAXPCT            0x00000100
-#define        XFS_TRANS_SB_REXTSIZE           0x00000200
-#define        XFS_TRANS_SB_RBMBLOCKS          0x00000400
-#define        XFS_TRANS_SB_RBLOCKS            0x00000800
-#define        XFS_TRANS_SB_REXTENTS           0x00001000
-#define        XFS_TRANS_SB_REXTSLOG           0x00002000
-
-
-/*
- * Per-extent log reservation for the allocation btree changes
- * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1)
- */
-#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
-       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
-
-/*
- * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block)
- * bmap btree: (levels + 2) * max depth
- * v2 directory blocks can be fragmented below the dirblksize down to the fsb
- * size, so account for that in the DAENTER macros.
- */
-#define        XFS_DIROP_LOG_COUNT(mp) \
-       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
-        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
-
+#include "xfs_trans_resv.h"
 
-#define        XFS_WRITE_LOG_RES(mp)   ((mp)->m_reservations.tr_write)
-#define        XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
-#define        XFS_RENAME_LOG_RES(mp)  ((mp)->m_reservations.tr_rename)
-#define        XFS_LINK_LOG_RES(mp)    ((mp)->m_reservations.tr_link)
-#define        XFS_REMOVE_LOG_RES(mp)  ((mp)->m_reservations.tr_remove)
-#define        XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
-#define        XFS_CREATE_LOG_RES(mp)  ((mp)->m_reservations.tr_create)
-#define        XFS_MKDIR_LOG_RES(mp)   ((mp)->m_reservations.tr_mkdir)
-#define        XFS_IFREE_LOG_RES(mp)   ((mp)->m_reservations.tr_ifree)
-#define        XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
-#define        XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
-#define        XFS_GROWRTALLOC_LOG_RES(mp)     ((mp)->m_reservations.tr_growrtalloc)
-#define        XFS_GROWRTZERO_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtzero)
-#define        XFS_GROWRTFREE_LOG_RES(mp)      ((mp)->m_reservations.tr_growrtfree)
-#define        XFS_SWRITE_LOG_RES(mp)  ((mp)->m_reservations.tr_swrite)
-/*
- * Logging the inode timestamps on an fsync -- same as SWRITE
- * as long as SWRITE logs the entire inode core
- */
-#define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-#define        XFS_WRITEID_LOG_RES(mp)         ((mp)->m_reservations.tr_swrite)
-#define        XFS_ADDAFORK_LOG_RES(mp)        ((mp)->m_reservations.tr_addafork)
-#define        XFS_ATTRINVAL_LOG_RES(mp)       ((mp)->m_reservations.tr_attrinval)
-#define        XFS_ATTRSETM_LOG_RES(mp)        ((mp)->m_reservations.tr_attrsetm)
-#define XFS_ATTRSETRT_LOG_RES(mp)      ((mp)->m_reservations.tr_attrsetrt)
-#define        XFS_ATTRRM_LOG_RES(mp)          ((mp)->m_reservations.tr_attrrm)
-#define        XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
-#define XFS_QM_SBCHANGE_LOG_RES(mp)    ((mp)->m_reservations.tr_qm_sbchange)
-#define XFS_QM_SETQLIM_LOG_RES(mp)     ((mp)->m_reservations.tr_qm_setqlim)
-#define XFS_QM_DQALLOC_LOG_RES(mp)     ((mp)->m_reservations.tr_qm_dqalloc)
-#define XFS_QM_QUOTAOFF_LOG_RES(mp)    ((mp)->m_reservations.tr_qm_quotaoff)
-#define XFS_QM_QUOTAOFF_END_LOG_RES(mp)        ((mp)->m_reservations.tr_qm_equotaoff)
-#define XFS_SB_LOG_RES(mp)             ((mp)->m_reservations.tr_sb)
-
-/*
- * Various log count values.
- */
-#define        XFS_DEFAULT_LOG_COUNT           1
-#define        XFS_DEFAULT_PERM_LOG_COUNT      2
-#define        XFS_ITRUNCATE_LOG_COUNT         2
-#define XFS_INACTIVE_LOG_COUNT         2
-#define        XFS_CREATE_LOG_COUNT            2
-#define        XFS_MKDIR_LOG_COUNT             3
-#define        XFS_SYMLINK_LOG_COUNT           3
-#define        XFS_REMOVE_LOG_COUNT            2
-#define        XFS_LINK_LOG_COUNT              2
-#define        XFS_RENAME_LOG_COUNT            2
-#define        XFS_WRITE_LOG_COUNT             2
-#define        XFS_ADDAFORK_LOG_COUNT          2
-#define        XFS_ATTRINVAL_LOG_COUNT         1
-#define        XFS_ATTRSET_LOG_COUNT           3
-#define        XFS_ATTRRM_LOG_COUNT            3
-
-/*
- * Here we centralize the specification of XFS meta-data buffer
- * reference count values.  This determine how hard the buffer
- * cache tries to hold onto the buffer.
- */
-#define        XFS_AGF_REF             4
-#define        XFS_AGI_REF             4
-#define        XFS_AGFL_REF            3
-#define        XFS_INO_BTREE_REF       3
-#define        XFS_ALLOC_BTREE_REF     2
-#define        XFS_BMAP_BTREE_REF      2
-#define        XFS_DIR_BTREE_REF       2
-#define        XFS_INO_REF             2
-#define        XFS_ATTR_BTREE_REF      1
-#define        XFS_DQUOT_REF           1
-
-#ifdef __KERNEL__
+/* kernel only transaction subsystem defines */
 
 struct xfs_buf;
 struct xfs_buftarg;
@@ -310,6 +34,7 @@ struct xfs_log_iovec;
 struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_trans_res;
 struct xfs_dquot_acct;
 struct xfs_busy_extent;
 
@@ -342,7 +67,7 @@ typedef struct xfs_log_item {
        { XFS_LI_ABORTED,       "ABORTED" }
 
 struct xfs_item_ops {
-       uint (*iop_size)(xfs_log_item_t *);
+       void (*iop_size)(xfs_log_item_t *, int *, int *);
        void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
        void (*iop_pin)(xfs_log_item_t *);
        void (*iop_unpin)(xfs_log_item_t *, int remove);
@@ -352,17 +77,8 @@ struct xfs_item_ops {
        void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 };
 
-#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, remove)  (*(ip)->li_ops->iop_unpin)(ip, remove)
-#define IOP_PUSH(ip, list)     (*(ip)->li_ops->iop_push)(ip, list)
-#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
-
 /*
- * Return values for the IOP_PUSH() routines.
+ * Return values for the iop_push() routines.
  */
 #define XFS_ITEM_SUCCESS       0
 #define XFS_ITEM_PINNED                1
@@ -446,7 +162,7 @@ typedef struct xfs_trans {
 xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
 xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
-int            xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+int            xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
                                  uint, uint);
 void           xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 
@@ -528,9 +244,4 @@ void                xfs_trans_ail_destroy(struct xfs_mount *);
 extern kmem_zone_t     *xfs_trans_zone;
 extern kmem_zone_t     *xfs_log_item_desc_zone;
 
-#endif /* __KERNEL__ */
-
-void           xfs_trans_init(struct xfs_mount *);
-int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-
 #endif /* __XFS_TRANS_H__ */
index 0eda7254305f3c5a6524a378596215c8cf861140..21c6d7ddbc06b474e102b30cb6a13c7690d1a2a5 100644 (file)
@@ -61,20 +61,6 @@ xfs_ail_check(
 #endif /* DEBUG */
 
 /*
- * Return a pointer to the first item in the AIL.  If the AIL is empty, then
- * return NULL.
- */
-xfs_log_item_t *
-xfs_ail_min(
-       struct xfs_ail  *ailp)
-{
-       if (list_empty(&ailp->xa_ail))
-               return NULL;
-
-       return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
-}
-
- /*
  * Return a pointer to the last item in the AIL.  If the AIL is empty, then
  * return NULL.
  */
@@ -393,11 +379,11 @@ xfsaild_push(
                int     lock_result;
 
                /*
-                * Note that IOP_PUSH may unlock and reacquire the AIL lock.  We
+                * Note that iop_push may unlock and reacquire the AIL lock.  We
                 * rely on the AIL cursor implementation to be able to deal with
                 * the dropped lock.
                 */
-               lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
+               lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
                switch (lock_result) {
                case XFS_ITEM_SUCCESS:
                        XFS_STATS_INC(xs_push_ail_success);
index aa5a04b844d6d530b12edd39b3f99240aff625bc..8c75b8f672702419beede8e54363ec259616039a 100644 (file)
@@ -505,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t        *tp,
 
 /*
  * Mark the buffer as not needing to be unlocked when the buf item's
- * IOP_UNLOCK() routine is called.  The buffer must already be locked
+ * iop_unlock() routine is called.  The buffer must already be locked
  * and associated with the given transaction.
  */
 /* ARGSUSED */
index 61407a847b869a6bb0faec7bf2c3279d66aaeb83..54ee3c5dee76093b6a6136a5fde759a8be309ccd 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_format.h"
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
index 53b7c9b0f8f7a6fa3c96a3f73298c2862eb3d107..c52def0b441cd89f2cb207e1b7ba5a9f22cc915c 100644 (file)
@@ -25,6 +25,9 @@ struct xfs_trans;
 struct xfs_ail;
 struct xfs_log_vec;
 
+
+void   xfs_trans_init(struct xfs_mount *);
+int    xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void   xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void   xfs_trans_del_item(struct xfs_log_item *);
 void   xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -83,6 +86,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
                                struct xfs_ail_cursor *cur,
                                struct xfs_log_item **log_items, int nr_items,
                                xfs_lsn_t lsn) __releases(ailp->xa_lock);
+/*
+ * Return a pointer to the first item in the AIL.  If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+       struct xfs_ail  *ailp)
+{
+       return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+                                       li_ail);
+}
+
 static inline void
 xfs_trans_ail_update(
        struct xfs_ail          *ailp,
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
new file mode 100644 (file)
index 0000000..a65a3cc
--- /dev/null
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log.h"
+#include "xfs_trans_resv.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+       return round_up(sizeof(struct xlog_op_header) +
+                       sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate the transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+       uint            nbufs,
+       uint            size)
+{
+       return nbufs * (size + xfs_buf_log_overhead());
+}
+
+/*
+ * Logging inodes is really tricksy. They are logged in memory format,
+ * which means that what we write into the log doesn't directly translate into
+ * the amount of space they use on disk.
+ *
+ * Case in point - btree format forks in memory format use more space than the
+ * on-disk format. In memory, the buffer contains a normal btree block header so
+ * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this
+ * header as it isn't needed. e.g. the root is only ever in the inode, so
+ * there's no need for sibling pointers which would waste 16 bytes of space.
+ *
+ * Hence when we have an inode with a maximally sized btree format fork, then
+ * the amount of information we actually log is greater than the size of the inode
+ * on disk. Hence we need an inode reservation function that calculates all this
+ * correctly. So, we log:
+ *
+ * - log op headers for object
+ * - inode log format object
+ * - the entire inode contents (core + 2 forks)
+ * - two bmap btree block headers
+ */
+STATIC uint
+xfs_calc_inode_res(
+       struct xfs_mount        *mp,
+       uint                    ninodes)
+{
+       return ninodes * (sizeof(struct xlog_op_header) +
+                         sizeof(struct xfs_inode_log_format) +
+                         mp->m_sb.sb_inodesize +
+                         2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * Various log reservation values.
+ *
+ * These are based on the size of the file system block because that is what
+ * most transactions manipulate.  Each adds in an additional 128 bytes per
+ * item logged to try to account for the overhead of the transaction mechanism.
+ *
+ * Note:  Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish() call.
+ * This is because the number in the worst case is quite high and quite
+ * unusual.  In order to fix this we need to change xfs_bmap_finish() to free
+ * extents in only a single AG at a time.  This will require changes to the
+ * EFI code as well, however, so that the EFI for the extents not freed is
+ * logged again in each transaction.  See SGI PV #261917.
+ *
+ * Reservation functions here avoid a huge stack in xfs_trans_init due to
+ * register overflow from temporaries in the calculations.
+ */
+
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_write_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_itruncate_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                     XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(5, 0) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                    XFS_FSB_TO_B(mp, 1)) +
+                   xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+                                    mp->m_in_maxlevels, 0)));
+}
+
+/*
+ * In renaming a file we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *     of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_rename_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 4) +
+                    xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_link_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 2) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_remove_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 2) +
+                    xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+                                     XFS_FSB_TO_B(mp, 1))),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * For create, break it into the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               (uint)XFS_FSB_TO_B(mp, 1) +
+               xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_create_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_create_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * For icreate we can allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_icreate_resv_alloc(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               mp->m_sb.sb_sectsize +
+               xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Create reservation selected by xfs_calc_create_reservation() when
+ * xfs_sb_version_hascrc() is true: the dquot reservation plus the larger
+ * of the icreate allocation case and the common create modify case.
+ */
+STATIC uint
+xfs_calc_icreate_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX(xfs_calc_icreate_resv_alloc(mp),
+                   xfs_calc_create_resv_modify(mp));
+}
+
+/*
+ * Pick the create reservation that matches the superblock: the icreate
+ * variant when xfs_sb_version_hascrc() is true, the original one otherwise.
+ */
+STATIC uint
+xfs_calc_create_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_sb_version_hascrc(&mp->m_sb) ?
+               xfs_calc_icreate_reservation(mp) :
+               __xfs_calc_create_reservation(mp);
+}
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+STATIC uint
+xfs_calc_mkdir_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp);
+}
+
+
+/*
+ * Making a new symlink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_create_reservation(mp) +
+              xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ *
+ * The block size and the inode cluster size are compared as uint.  A 16-bit
+ * cast would wrap a 64kB block size to zero, so the smaller cluster size
+ * would be reserved instead of the block size.
+ */
+STATIC uint
+xfs_calc_ifree_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
+               MAX((uint)XFS_FSB_TO_B(mp, 1),
+                   (uint)XFS_INODE_CLUSTER_SIZE(mp)) +
+               xfs_calc_buf_res(1, 0) +
+               xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+                                mp->m_in_maxlevels, 0) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock.
+ * We also add a bit of slop for the transaction stuff.
+ */
+STATIC uint
+xfs_calc_ichange_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+
+}
+
+/*
+ * Growing the data section of the filesystem.
+ *     superblock
+ *     agi and agf
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_growdata_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *     superblock: sector size
+ *     agf of the ag from which the extent is allocated: sector size
+ *     bmap btree for bitmap/summary inode: max depth * blocksize
+ *     bitmap/summary inode: inode size
+ *     allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+STATIC uint
+xfs_calc_growrtalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *     one bitmap/summary block: blocksize
+ */
+STATIC uint
+xfs_calc_growrtzero_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *     superblock: sector size
+ *     bitmap inode: inode size
+ *     summary inode: inode size
+ *     one bitmap block: blocksize
+ *     summary blocks: new summary size
+ */
+STATIC uint
+xfs_calc_growrtfree_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_inode_res(mp, 2) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+               xfs_calc_buf_res(1, mp->m_rsumsize);
+}
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *     inode
+ */
+STATIC uint
+xfs_calc_swrite_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *     inode
+ */
+STATIC uint
+xfs_calc_writeid_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_inode_res(mp, 1);
+}
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *     the inode being converted: inode size
+ *     agf block and superblock (for block allocation)
+ *     the new block (directory sized)
+ *     bmap blocks for the new directory block
+ *     allocation btrees
+ */
+STATIC uint
+xfs_calc_addafork_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(1, mp->m_dirblksize) +
+               xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+                                XFS_FSB_TO_B(mp, 1)) +
+               xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode's bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *             4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrinval_reservation(
+       struct xfs_mount        *mp)
+{
+       return MAX((xfs_calc_inode_res(mp, 1) +
+                   xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                    XFS_FSB_TO_B(mp, 1))),
+                  (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+                   xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+                                    XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Setting an attribute at mount time.
+ *     the inode getting the attribute
+ *     the superblock for allocations
+ *     the agfs extents are allocated from
+ *     the attribute btree * max depth
+ *     the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime (see
+ * below).
+ */
+STATIC uint
+xfs_calc_attrsetm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               xfs_calc_inode_res(mp, 1) +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ *     the superblock for allocations: sector size
+ *     the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attribute extent length in blocks by:
+ *     ext * M_RES(mp)->tr_attrsetrt.tr_logres
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+               xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+                                XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+STATIC uint
+xfs_calc_attrrm_reservation(
+       struct xfs_mount        *mp)
+{
+       return XFS_DQUOT_LOGRES(mp) +
+               MAX((xfs_calc_inode_res(mp, 1) +
+                    xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+                                     XFS_FSB_TO_B(mp, 1)) +
+                    (uint)XFS_FSB_TO_B(mp,
+                                       XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+                    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+                   (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+                    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+                                     XFS_FSB_TO_B(mp, 1))));
+}
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+STATIC uint
+xfs_calc_clear_agi_bucket_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ *     the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *     the write transaction log space: M_RES(mp)->tr_write.tr_logres
+ *     the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+       struct xfs_mount        *mp)
+{
+       ASSERT(M_RES(mp)->tr_write.tr_logres);
+       return M_RES(mp)->tr_write.tr_logres +
+               xfs_calc_buf_res(1,
+                       XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2 +
+               xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+       struct xfs_mount        *mp)
+{
+       return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+       struct xfs_mount        *mp)
+{
+       return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Fill in the pre-calculated transaction reservations in @resp for @mp.
+ *
+ * Three groups are set up:
+ *  - permanent reservations: tr_logres, tr_logcount and the
+ *    XFS_TRANS_PERM_LOG_RES flag;
+ *  - reservations with tr_logres and XFS_DEFAULT_LOG_COUNT only;
+ *  - reservations where only tr_logres is written.
+ *
+ * tr_logflags is OR-ed in rather than assigned, and tr_logcount/tr_logflags
+ * are not written for every entry, so *resp is presumably expected to be
+ * zeroed by the caller -- TODO confirm.
+ *
+ * NOTE(review): xfs_calc_qm_dqalloc_reservation() reads
+ * M_RES(mp)->tr_write.tr_logres (and asserts it is non-zero), not
+ * resp->tr_write, so this looks like it relies on resp being M_RES(mp) and
+ * on tr_write being calculated before tr_qm_dqalloc -- verify against callers.
+ */
+void
+xfs_trans_resv_calc(
+       struct xfs_mount        *mp,
+       struct xfs_trans_resv   *resp)
+{
+       /*
+        * The following transactions are logged in physical format and
+        * require a permanent reservation on space.
+        */
+       resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
+       resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
+       resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+       resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
+       resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
+       resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
+       resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
+       resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
+       resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
+       resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
+       resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
+       resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
+       resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
+       resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
+       resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
+       resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
+       resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
+       resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
+       resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
+       resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
+       resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
+       resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
+       resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+       resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
+       resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
+       resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
+       resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+       resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
+       resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+       resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
+       /*
+        * The following transactions are logged in logical format with
+        * a default log count.
+        */
+       resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
+       resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
+       resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
+       resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_qm_equotaoff.tr_logres =
+               xfs_calc_qm_quotaoff_end_reservation(mp);
+       resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
+       resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+
+       /*
+        * The following transactions are logged in logical format; only
+        * tr_logres is written for them here.
+        */
+       resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
+       resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+       resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
+       /* tr_fsyncts reuses the swrite calculation */
+       resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
+       resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
+       resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
+       resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
+       resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
+       resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+}
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
new file mode 100644 (file)
index 0000000..de7de9a
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef        __XFS_TRANS_RESV_H__
+#define        __XFS_TRANS_RESV_H__
+
+struct xfs_mount;
+
+/*
+ * structure for maintaining pre-calculated transaction reservations.
+ */
+struct xfs_trans_res {
+       uint    tr_logres;      /* log space unit in bytes per log ticket */
+       int     tr_logcount;    /* number of log operations per log ticket */
+       int     tr_logflags;    /* log flags, currently only used for indicating
+                                * a reservation request is permanent or not */
+};
+
+struct xfs_trans_resv {
+       struct xfs_trans_res    tr_write;       /* extent alloc trans */
+       struct xfs_trans_res    tr_itruncate;   /* truncate trans */
+       struct xfs_trans_res    tr_rename;      /* rename trans */
+       struct xfs_trans_res    tr_link;        /* link trans */
+       struct xfs_trans_res    tr_remove;      /* unlink trans */
+       struct xfs_trans_res    tr_symlink;     /* symlink trans */
+       struct xfs_trans_res    tr_create;      /* create trans */
+       struct xfs_trans_res    tr_mkdir;       /* mkdir trans */
+       struct xfs_trans_res    tr_ifree;       /* inode free trans */
+       struct xfs_trans_res    tr_ichange;     /* inode update trans */
+       struct xfs_trans_res    tr_growdata;    /* fs data section grow trans */
+       struct xfs_trans_res    tr_swrite;      /* sync write inode trans */
+       struct xfs_trans_res    tr_addafork;    /* add inode attr fork trans */
+       struct xfs_trans_res    tr_writeid;     /* write setuid/setgid file */
+       struct xfs_trans_res    tr_attrinval;   /* attr fork buffer
+                                                * invalidation */
+       struct xfs_trans_res    tr_attrsetm;    /* set/create an attribute at
+                                                * mount time */
+       struct xfs_trans_res    tr_attrsetrt;   /* set/create an attribute at
+                                                * runtime */
+       struct xfs_trans_res    tr_attrrm;      /* remove an attribute */
+       struct xfs_trans_res    tr_clearagi;    /* clear agi unlinked bucket */
+       struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
+       struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
+       struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
+       struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
+       struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
+       struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
+       struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
+       struct xfs_trans_res    tr_qm_equotaoff;/* end of turn quota off */
+       struct xfs_trans_res    tr_sb;          /* modify superblock */
+       struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
+};
+
+/* shorthand way of accessing reservation structure */
+#define M_RES(mp)      (&(mp)->m_resv)
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define        XFS_ALLOCFREE_LOG_RES(mp,nx) \
+       ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define        XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+       ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define        XFS_DIROP_LOG_RES(mp)   \
+       (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+        (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define        XFS_DIROP_LOG_COUNT(mp) \
+       (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+        XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * Various log count values.
+ */
+#define        XFS_DEFAULT_LOG_COUNT           1
+#define        XFS_DEFAULT_PERM_LOG_COUNT      2
+#define        XFS_ITRUNCATE_LOG_COUNT         2
+#define XFS_INACTIVE_LOG_COUNT         2
+#define        XFS_CREATE_LOG_COUNT            2
+#define        XFS_MKDIR_LOG_COUNT             3
+#define        XFS_SYMLINK_LOG_COUNT           3
+#define        XFS_REMOVE_LOG_COUNT            2
+#define        XFS_LINK_LOG_COUNT              2
+#define        XFS_RENAME_LOG_COUNT            2
+#define        XFS_WRITE_LOG_COUNT             2
+#define        XFS_ADDAFORK_LOG_COUNT          2
+#define        XFS_ATTRINVAL_LOG_COUNT         1
+#define        XFS_ATTRSET_LOG_COUNT           3
+#define        XFS_ATTRRM_LOG_COUNT            3
+
+void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+
+#endif /* __XFS_TRANS_RESV_H__ */
index 61ba1cfa974c7e3e32c493c1317236e2cb397730..82bbc34d54a3b344559379a665365bb2bab9b903 100644 (file)
 #ifndef __XFS_TYPES_H__
 #define        __XFS_TYPES_H__
 
-#ifdef __KERNEL__
-
-/*
- * Additional type declarations for XFS
- */
-typedef signed char            __int8_t;
-typedef unsigned char          __uint8_t;
-typedef signed short int       __int16_t;
-typedef unsigned short int     __uint16_t;
-typedef signed int             __int32_t;
-typedef unsigned int           __uint32_t;
-typedef signed long long int   __int64_t;
-typedef unsigned long long int __uint64_t;
-
-typedef __uint32_t             prid_t;         /* project ID */
-typedef __uint32_t             inst_t;         /* an instruction */
-
-typedef __s64                  xfs_off_t;      /* <file offset> type */
-typedef unsigned long long     xfs_ino_t;      /* <inode> type */
-typedef __s64                  xfs_daddr_t;    /* <disk address> type */
-typedef char *                 xfs_caddr_t;    /* <core address> type */
-typedef __u32                  xfs_dev_t;
-typedef __u32                  xfs_nlink_t;
-
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
-#endif /* __KERNEL__ */
+typedef __uint32_t     prid_t;         /* project ID */
 
 typedef __uint32_t     xfs_agblock_t;  /* blockno in alloc. group */
 typedef        __uint32_t      xfs_agino_t;    /* inode # within allocation grp */
@@ -145,6 +110,12 @@ typedef __uint64_t xfs_filblks_t;  /* number of blocks in a file */
 #define XFS_MIN_SECTORSIZE     (1 << XFS_MIN_SECTORSIZE_LOG)
 #define XFS_MAX_SECTORSIZE     (1 << XFS_MAX_SECTORSIZE_LOG)
 
+/*
+ * Inode fork identifiers.
+ */
+#define        XFS_DATA_FORK   0
+#define        XFS_ATTR_FORK   1
+
 /*
  * Min numbers of data/attr fork btree root pointers.
  */
@@ -169,6 +140,23 @@ typedef enum {
 struct xfs_name {
        const unsigned char     *name;
        int                     len;
+       int                     type;
 };
 
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits.
+ */
+typedef __uint32_t     xfs_dqid_t;
+
+/*
+ * Constants for bit manipulations.
+ */
+#define        XFS_NBBYLOG     3               /* log2(NBBY) */
+#define        XFS_WORDLOG     2               /* log2(sizeof(xfs_rtword_t)) */
+#define        XFS_NBWORDLOG   (XFS_NBBYLOG + XFS_WORDLOG)
+#define        XFS_NBWORD      (1 << XFS_NBWORDLOG)
+#define        XFS_WORDMASK    ((1 << XFS_WORDLOG) - 1)
+
+
 #endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
deleted file mode 100644 (file)
index 0025c78..0000000
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_itable.h"
-#include "xfs_utils.h"
-
-
-/*
- * Allocates a new inode from disk and return a pointer to the
- * incore copy. This routine will internally commit the current
- * transaction and allocate a new one if the Space Manager needed
- * to do an allocation to replenish the inode free-list.
- *
- * This routine is designed to be called from xfs_create and
- * xfs_create_dir.
- *
- */
-int
-xfs_dir_ialloc(
-       xfs_trans_t     **tpp,          /* input: current transaction;
-                                          output: may be a new transaction. */
-       xfs_inode_t     *dp,            /* directory within whose allocate
-                                          the inode. */
-       umode_t         mode,
-       xfs_nlink_t     nlink,
-       xfs_dev_t       rdev,
-       prid_t          prid,           /* project id */
-       int             okalloc,        /* ok to allocate new space */
-       xfs_inode_t     **ipp,          /* pointer to inode; it will be
-                                          locked. */
-       int             *committed)
-
-{
-       xfs_trans_t     *tp;
-       xfs_trans_t     *ntp;
-       xfs_inode_t     *ip;
-       xfs_buf_t       *ialloc_context = NULL;
-       int             code;
-       uint            log_res;
-       uint            log_count;
-       void            *dqinfo;
-       uint            tflags;
-
-       tp = *tpp;
-       ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
-
-       /*
-        * xfs_ialloc will return a pointer to an incore inode if
-        * the Space Manager has an available inode on the free
-        * list. Otherwise, it will do an allocation and replenish
-        * the freelist.  Since we can only do one allocation per
-        * transaction without deadlocks, we will need to commit the
-        * current transaction and start a new one.  We will then
-        * need to call xfs_ialloc again to get the inode.
-        *
-        * If xfs_ialloc did an allocation to replenish the freelist,
-        * it returns the bp containing the head of the freelist as
-        * ialloc_context. We will hold a lock on it across the
-        * transaction commit so that no other process can steal
-        * the inode(s) that we've just allocated.
-        */
-       code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
-                         &ialloc_context, &ip);
-
-       /*
-        * Return an error if we were unable to allocate a new inode.
-        * This should only happen if we run out of space on disk or
-        * encounter a disk error.
-        */
-       if (code) {
-               *ipp = NULL;
-               return code;
-       }
-       if (!ialloc_context && !ip) {
-               *ipp = NULL;
-               return XFS_ERROR(ENOSPC);
-       }
-
-       /*
-        * If the AGI buffer is non-NULL, then we were unable to get an
-        * inode in one operation.  We need to commit the current
-        * transaction and call xfs_ialloc() again.  It is guaranteed
-        * to succeed the second time.
-        */
-       if (ialloc_context) {
-               /*
-                * Normally, xfs_trans_commit releases all the locks.
-                * We call bhold to hang on to the ialloc_context across
-                * the commit.  Holding this buffer prevents any other
-                * processes from doing any allocations in this
-                * allocation group.
-                */
-               xfs_trans_bhold(tp, ialloc_context);
-               /*
-                * Save the log reservation so we can use
-                * them in the next transaction.
-                */
-               log_res = xfs_trans_get_log_res(tp);
-               log_count = xfs_trans_get_log_count(tp);
-
-               /*
-                * We want the quota changes to be associated with the next
-                * transaction, NOT this one. So, detach the dqinfo from this
-                * and attach it to the next transaction.
-                */
-               dqinfo = NULL;
-               tflags = 0;
-               if (tp->t_dqinfo) {
-                       dqinfo = (void *)tp->t_dqinfo;
-                       tp->t_dqinfo = NULL;
-                       tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
-                       tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
-               }
-
-               ntp = xfs_trans_dup(tp);
-               code = xfs_trans_commit(tp, 0);
-               tp = ntp;
-               if (committed != NULL) {
-                       *committed = 1;
-               }
-               /*
-                * If we get an error during the commit processing,
-                * release the buffer that is still held and return
-                * to the caller.
-                */
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       if (dqinfo) {
-                               tp->t_dqinfo = dqinfo;
-                               xfs_trans_free_dqinfo(tp);
-                       }
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-
-               /*
-                * transaction commit worked ok so we can drop the extra ticket
-                * reference that we gained in xfs_trans_dup()
-                */
-               xfs_log_ticket_put(tp->t_ticket);
-               code = xfs_trans_reserve(tp, 0, log_res, 0,
-                                        XFS_TRANS_PERM_LOG_RES, log_count);
-               /*
-                * Re-attach the quota info that we detached from prev trx.
-                */
-               if (dqinfo) {
-                       tp->t_dqinfo = dqinfo;
-                       tp->t_flags |= tflags;
-               }
-
-               if (code) {
-                       xfs_buf_relse(ialloc_context);
-                       *tpp = ntp;
-                       *ipp = NULL;
-                       return code;
-               }
-               xfs_trans_bjoin(tp, ialloc_context);
-
-               /*
-                * Call ialloc again. Since we've locked out all
-                * other allocations in this allocation group,
-                * this call should always succeed.
-                */
-               code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
-                                 okalloc, &ialloc_context, &ip);
-
-               /*
-                * If we get an error at this point, return to the caller
-                * so that the current transaction can be aborted.
-                */
-               if (code) {
-                       *tpp = tp;
-                       *ipp = NULL;
-                       return code;
-               }
-               ASSERT(!ialloc_context && ip);
-
-       } else {
-               if (committed != NULL)
-                       *committed = 0;
-       }
-
-       *ipp = ip;
-       *tpp = tp;
-
-       return 0;
-}
-
-/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
- */
-int                            /* error */
-xfs_droplink(
-       xfs_trans_t *tp,
-       xfs_inode_t *ip)
-{
-       int     error;
-
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-       ASSERT (ip->i_d.di_nlink > 0);
-       ip->i_d.di_nlink--;
-       drop_nlink(VFS_I(ip));
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       error = 0;
-       if (ip->i_d.di_nlink == 0) {
-               /*
-                * We're dropping the last link to this file.
-                * Move the on-disk inode to the AGI unlinked list.
-                * From xfs_inactive() we will pull the inode from
-                * the list and free it.
-                */
-               error = xfs_iunlink(tp, ip);
-       }
-       return error;
-}
-
-/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
- */
-void
-xfs_bump_ino_vers2(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip)
-{
-       xfs_mount_t     *mp;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(ip->i_d.di_version == 1);
-
-       ip->i_d.di_version = 2;
-       ip->i_d.di_onlink = 0;
-       memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
-       mp = tp->t_mountp;
-       if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
-               spin_lock(&mp->m_sb_lock);
-               if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
-                       xfs_sb_version_addnlink(&mp->m_sb);
-                       spin_unlock(&mp->m_sb_lock);
-                       xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
-               } else {
-                       spin_unlock(&mp->m_sb_lock);
-               }
-       }
-       /* Caller must log the inode */
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-int
-xfs_bumplink(
-       xfs_trans_t *tp,
-       xfs_inode_t *ip)
-{
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
-       ASSERT(ip->i_d.di_nlink > 0);
-       ip->i_d.di_nlink++;
-       inc_nlink(VFS_I(ip));
-       if ((ip->i_d.di_version == 1) &&
-           (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
-               /*
-                * The inode has increased its number of links beyond
-                * what can fit in an old format inode.  It now needs
-                * to be converted to a version 2 inode with a 32 bit
-                * link count.  If this is the first inode in the file
-                * system to do this, then we need to bump the superblock
-                * version number as well.
-                */
-               xfs_bump_ino_vers2(tp, ip);
-       }
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       return 0;
-}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
deleted file mode 100644 (file)
index 5eeab46..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_UTILS_H__
-#define __XFS_UTILS_H__
-
-extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
-                               xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
-extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
-extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
-extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
-
-#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
deleted file mode 100644 (file)
index dc730ac..0000000
+++ /dev/null
@@ -1,1870 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * Copyright (c) 2012 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trans_space.h"
-#include "xfs_log_priv.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-#include "xfs_symlink.h"
-
-
-/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
- */
-int
-xfs_free_eofblocks(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *ip,
-       bool            need_iolock)
-{
-       xfs_trans_t     *tp;
-       int             error;
-       xfs_fileoff_t   end_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_filblks_t   map_len;
-       int             nimaps;
-       xfs_bmbt_irec_t imap;
-
-       /*
-        * Figure out if there are any blocks beyond the end
-        * of the file.  If not, then there is nothing to do.
-        */
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
-       last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-       if (last_fsb <= end_fsb)
-               return 0;
-       map_len = last_fsb - end_fsb;
-
-       nimaps = 1;
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (!error && (nimaps != 0) &&
-           (imap.br_startblock != HOLESTARTBLOCK ||
-            ip->i_delayed_blks)) {
-               /*
-                * Attach the dquots to the inode up front.
-                */
-               error = xfs_qm_dqattach(ip, 0);
-               if (error)
-                       return error;
-
-               /*
-                * There are blocks after the end of file.
-                * Free them up now by truncating the file to
-                * its current size.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-
-               if (need_iolock) {
-                       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
-                               xfs_trans_cancel(tp, 0);
-                               return EAGAIN;
-                       }
-               }
-
-               error = xfs_trans_reserve(tp, 0,
-                                         XFS_ITRUNCATE_LOG_RES(mp),
-                                         0, XFS_TRANS_PERM_LOG_RES,
-                                         XFS_ITRUNCATE_LOG_COUNT);
-               if (error) {
-                       ASSERT(XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       if (need_iolock)
-                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                       return error;
-               }
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * Do not update the on-disk file size.  If we update the
-                * on-disk file size and then the system crashes before the
-                * contents of the file are flushed to disk then the files
-                * may be full of holes (ie NULL files bug).
-                */
-               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
-                                             XFS_ISIZE(ip));
-               if (error) {
-                       /*
-                        * If we get an error at this point we simply don't
-                        * bother truncating the file.
-                        */
-                       xfs_trans_cancel(tp,
-                                        (XFS_TRANS_RELEASE_LOG_RES |
-                                         XFS_TRANS_ABORT));
-               } else {
-                       error = xfs_trans_commit(tp,
-                                               XFS_TRANS_RELEASE_LOG_RES);
-                       if (!error)
-                               xfs_inode_clear_eofblocks_tag(ip);
-               }
-
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (need_iolock)
-                       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       }
-       return error;
-}
-
-int
-xfs_release(
-       xfs_inode_t     *ip)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       int             error;
-
-       if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
-               return 0;
-
-       /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return 0;
-
-       if (!XFS_FORCED_SHUTDOWN(mp)) {
-               int truncated;
-
-               /*
-                * If we are using filestreams, and we have an unlinked
-                * file that we are processing the last close on, then nothing
-                * will be able to reopen and write to this file. Purge this
-                * inode from the filestreams cache so that it doesn't delay
-                * teardown of the inode.
-                */
-               if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
-                       xfs_filestream_deassociate(ip);
-
-               /*
-                * If we previously truncated this file and removed old data
-                * in the process, we want to initiate "early" writeout on
-                * the last close.  This is an attempt to combat the notorious
-                * NULL files problem which is particularly noticeable from a
-                * truncate down, buffered (re-)write (delalloc), followed by
-                * a crash.  What we are effectively doing here is
-                * significantly reducing the time window where we'd otherwise
-                * be exposed to that problem.
-                */
-               truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
-               if (truncated) {
-                       xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
-                               error = -filemap_flush(VFS_I(ip)->i_mapping);
-                               if (error)
-                                       return error;
-                       }
-               }
-       }
-
-       if (ip->i_d.di_nlink == 0)
-               return 0;
-
-       if (xfs_can_free_eofblocks(ip, false)) {
-
-               /*
-                * If we can't get the iolock just skip truncating the blocks
-                * past EOF because we could deadlock with the mmap_sem
-                * otherwise.  We'll get another chance to drop them once the
-                * last reference to the inode is dropped, so we'll never leak
-                * blocks permanently.
-                *
-                * Further, check if the inode is being opened, written and
-                * closed frequently and we have delayed allocation blocks
-                * outstanding (e.g. streaming writes from the NFS server),
-                * truncating the blocks past EOF will cause fragmentation to
-                * occur.
-                *
-                * In this case don't do the truncation, either, but we have to
-                * be careful how we detect this case. Blocks beyond EOF show
-                * up as i_delayed_blks even when the inode is clean, so we
-                * need to truncate them away first before checking for a dirty
-                * release. Hence on the first dirty close we will still remove
-                * the speculative allocation, but after that we will leave it
-                * in place.
-                */
-               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
-                       return 0;
-
-               error = xfs_free_eofblocks(mp, ip, true);
-               if (error && error != EAGAIN)
-                       return error;
-
-               /* delalloc blocks after truncation means it really is dirty */
-               if (ip->i_delayed_blks)
-                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
-       }
-       return 0;
-}
-
-/*
- * xfs_inactive
- *
- * This is called when the vnode reference count for the vnode
- * goes to zero.  If the file has been unlinked, then it must
- * now be truncated.  Also, we clear all of the read-ahead state
- * kept for the inode here since the file is now closed.
- */
-int
-xfs_inactive(
-       xfs_inode_t     *ip)
-{
-       xfs_bmap_free_t free_list;
-       xfs_fsblock_t   first_block;
-       int             committed;
-       xfs_trans_t     *tp;
-       xfs_mount_t     *mp;
-       int             error;
-       int             truncate = 0;
-
-       /*
-        * If the inode is already free, then there can be nothing
-        * to clean up here.
-        */
-       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
-               ASSERT(ip->i_df.if_real_bytes == 0);
-               ASSERT(ip->i_df.if_broot_bytes == 0);
-               return VN_INACTIVE_CACHE;
-       }
-
-       mp = ip->i_mount;
-
-       error = 0;
-
-       /* If this is a read-only mount, don't do this (would generate I/O) */
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               goto out;
-
-       if (ip->i_d.di_nlink != 0) {
-               /*
-                * force is true because we are evicting an inode from the
-                * cache. Post-eof blocks must be freed, lest we end up with
-                * broken free space accounting.
-                */
-               if (xfs_can_free_eofblocks(ip, true)) {
-                       error = xfs_free_eofblocks(mp, ip, false);
-                       if (error)
-                               return VN_INACTIVE_CACHE;
-               }
-               goto out;
-       }
-
-       if (S_ISREG(ip->i_d.di_mode) &&
-           (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
-            ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
-               truncate = 1;
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return VN_INACTIVE_CACHE;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-       error = xfs_trans_reserve(tp, 0,
-                       (truncate || S_ISLNK(ip->i_d.di_mode)) ?
-                               XFS_ITRUNCATE_LOG_RES(mp) :
-                               XFS_IFREE_LOG_RES(mp),
-                       0,
-                       XFS_TRANS_PERM_LOG_RES,
-                       XFS_ITRUNCATE_LOG_COUNT);
-       if (error) {
-               ASSERT(XFS_FORCED_SHUTDOWN(mp));
-               xfs_trans_cancel(tp, 0);
-               return VN_INACTIVE_CACHE;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, 0);
-
-       if (S_ISLNK(ip->i_d.di_mode)) {
-               error = xfs_inactive_symlink(ip, &tp);
-               if (error)
-                       goto out_cancel;
-       } else if (truncate) {
-               ip->i_d.di_size = 0;
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
-               if (error)
-                       goto out_cancel;
-
-               ASSERT(ip->i_d.di_nextents == 0);
-       }
-
-       /*
-        * If there are attributes associated with the file then blow them away
-        * now.  The code calls a routine that recursively deconstructs the
-        * attribute fork.  We need to just commit the current transaction
-        * because we can't use it for xfs_attr_inactive().
-        */
-       if (ip->i_d.di_anextents > 0) {
-               ASSERT(ip->i_d.di_forkoff != 0);
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               if (error)
-                       goto out_unlock;
-
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-               error = xfs_attr_inactive(ip);
-               if (error)
-                       goto out;
-
-               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
-               error = xfs_trans_reserve(tp, 0,
-                                         XFS_IFREE_LOG_RES(mp),
-                                         0, XFS_TRANS_PERM_LOG_RES,
-                                         XFS_INACTIVE_LOG_COUNT);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       goto out;
-               }
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_trans_ijoin(tp, ip, 0);
-       }
-
-       if (ip->i_afp)
-               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-       ASSERT(ip->i_d.di_anextents == 0);
-
-       /*
-        * Free the inode.
-        */
-       xfs_bmap_init(&free_list, &first_block);
-       error = xfs_ifree(tp, ip, &free_list);
-       if (error) {
-               /*
-                * If we fail to free the inode, shut down.  The cancel
-                * might do that, we need to make sure.  Otherwise the
-                * inode might be lost for a long time or forever.
-                */
-               if (!XFS_FORCED_SHUTDOWN(mp)) {
-                       xfs_notice(mp, "%s: xfs_ifree returned error %d",
-                               __func__, error);
-                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-               }
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       } else {
-               /*
-                * Credit the quota account(s). The inode is gone.
-                */
-               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
-               /*
-                * Just ignore errors at this point.  There is nothing we can
-                * do except to try to keep going. Make sure it's not a silent
-                * error.
-                */
-               error = xfs_bmap_finish(&tp,  &free_list, &committed);
-               if (error)
-                       xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
-                               __func__, error);
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               if (error)
-                       xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
-                               __func__, error);
-       }
-
-       /*
-        * Release the dquots held by inode, if any.
-        */
-       xfs_qm_dqdetach(ip);
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
-       return VN_INACTIVE_CACHE;
-out_cancel:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       goto out_unlock;
-}
-
-/*
- * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
- * is allowed, otherwise it has to be an exact match. If a CI match is found,
- * ci_name->name will point to a the actual name (caller must free) or
- * will be set to NULL if an exact match is found.
- */
-int
-xfs_lookup(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       xfs_inode_t             **ipp,
-       struct xfs_name         *ci_name)
-{
-       xfs_ino_t               inum;
-       int                     error;
-       uint                    lock_mode;
-
-       trace_xfs_lookup(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-               return XFS_ERROR(EIO);
-
-       lock_mode = xfs_ilock_map_shared(dp);
-       error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
-       xfs_iunlock_map_shared(dp, lock_mode);
-
-       if (error)
-               goto out;
-
-       error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
-       if (error)
-               goto out_free_name;
-
-       return 0;
-
-out_free_name:
-       if (ci_name)
-               kmem_free(ci_name->name);
-out:
-       *ipp = NULL;
-       return error;
-}
-
-int
-xfs_create(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       umode_t                 mode,
-       xfs_dev_t               rdev,
-       xfs_inode_t             **ipp)
-{
-       int                     is_dir = S_ISDIR(mode);
-       struct xfs_mount        *mp = dp->i_mount;
-       struct xfs_inode        *ip = NULL;
-       struct xfs_trans        *tp = NULL;
-       int                     error;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       bool                    unlock_dp_on_error = false;
-       uint                    cancel_flags;
-       int                     committed;
-       prid_t                  prid;
-       struct xfs_dquot        *udqp = NULL;
-       struct xfs_dquot        *gdqp = NULL;
-       struct xfs_dquot        *pdqp = NULL;
-       uint                    resblks;
-       uint                    log_res;
-       uint                    log_count;
-
-       trace_xfs_create(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-               prid = xfs_get_projid(dp);
-       else
-               prid = XFS_PROJID_DEFAULT;
-
-       /*
-        * Make sure that we have allocated dquot(s) on disk.
-        */
-       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
-                                       &udqp, &gdqp, &pdqp);
-       if (error)
-               return error;
-
-       if (is_dir) {
-               rdev = 0;
-               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
-               log_res = XFS_MKDIR_LOG_RES(mp);
-               log_count = XFS_MKDIR_LOG_COUNT;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-       } else {
-               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
-               log_res = XFS_CREATE_LOG_RES(mp);
-               log_count = XFS_CREATE_LOG_COUNT;
-               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
-       }
-
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-       /*
-        * Initially assume that the file does not exist and
-        * reserve the resources for that case.  If that is not
-        * the case we'll drop the one we have and get a more
-        * appropriate transaction later.
-        */
-       error = xfs_trans_reserve(tp, resblks, log_res, 0,
-                       XFS_TRANS_PERM_LOG_RES, log_count);
-       if (error == ENOSPC) {
-               /* flush outstanding delalloc blocks and retry */
-               xfs_flush_inodes(mp);
-               error = xfs_trans_reserve(tp, resblks, log_res, 0,
-                               XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error == ENOSPC) {
-               /* No space at all so try a "no-allocation" reservation */
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, log_res, 0,
-                               XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error) {
-               cancel_flags = 0;
-               goto out_trans_cancel;
-       }
-
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-       unlock_dp_on_error = true;
-
-       xfs_bmap_init(&free_list, &first_block);
-
-       /*
-        * Reserve disk quota and the inode.
-        */
-       error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
-                                               pdqp, resblks, 1, 0);
-       if (error)
-               goto out_trans_cancel;
-
-       error = xfs_dir_canenter(tp, dp, name, resblks);
-       if (error)
-               goto out_trans_cancel;
-
-       /*
-        * A newly created regular or special file just has one directory
-        * entry pointing to them, but a directory also the "." entry
-        * pointing to itself.
-        */
-       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
-                              prid, resblks > 0, &ip, &committed);
-       if (error) {
-               if (error == ENOSPC)
-                       goto out_trans_cancel;
-               goto out_trans_abort;
-       }
-
-       /*
-        * Now we join the directory inode to the transaction.  We do not do it
-        * earlier because xfs_dir_ialloc might commit the previous transaction
-        * (and release all the locks).  An error from here on will result in
-        * the transaction cancel unlocking dp so don't do it explicitly in the
-        * error path.
-        */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-       unlock_dp_on_error = false;
-
-       error = xfs_dir_createname(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks ?
-                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-       if (error) {
-               ASSERT(error != ENOSPC);
-               goto out_trans_abort;
-       }
-       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
-       if (is_dir) {
-               error = xfs_dir_init(tp, ip, dp);
-               if (error)
-                       goto out_bmap_cancel;
-
-               error = xfs_bumplink(tp, dp);
-               if (error)
-                       goto out_bmap_cancel;
-       }
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * create transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-               xfs_trans_set_sync(tp);
-
-       /*
-        * Attach the dquot(s) to the inodes and modify them incore.
-        * These ids of the inode couldn't have changed since the new
-        * inode has been locked ever since it was created.
-        */
-       xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error)
-               goto out_bmap_cancel;
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       if (error)
-               goto out_release_inode;
-
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       xfs_qm_dqrele(pdqp);
-
-       *ipp = ip;
-       return 0;
-
- out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
- out_trans_abort:
-       cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
- out_release_inode:
-       /*
-        * Wait until after the current transaction is aborted to
-        * release the inode.  This prevents recursive transactions
-        * and deadlocks from xfs_inactive.
-        */
-       if (ip)
-               IRELE(ip);
-
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       xfs_qm_dqrele(pdqp);
-
-       if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return error;
-}
-
-#ifdef DEBUG
-int xfs_locked_n;
-int xfs_small_retries;
-int xfs_middle_retries;
-int xfs_lots_retries;
-int xfs_lock_delays;
-#endif
-
-/*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
- */
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
-{
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
-       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
-
-       return lock_mode;
-}
-
-/*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
- *
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
- */
-void
-xfs_lock_inodes(
-       xfs_inode_t     **ips,
-       int             inodes,
-       uint            lock_mode)
-{
-       int             attempts = 0, i, j, try_lock;
-       xfs_log_item_t  *lp;
-
-       ASSERT(ips && (inodes >= 2)); /* we need at least two */
-
-       try_lock = 0;
-       i = 0;
-
-again:
-       for (; i < inodes; i++) {
-               ASSERT(ips[i]);
-
-               if (i && (ips[i] == ips[i-1]))  /* Already locked */
-                       continue;
-
-               /*
-                * If try_lock is not set yet, make sure all locked inodes
-                * are not in the AIL.
-                * If any are, set try_lock to be used later.
-                */
-
-               if (!try_lock) {
-                       for (j = (i - 1); j >= 0 && !try_lock; j--) {
-                               lp = (xfs_log_item_t *)ips[j]->i_itemp;
-                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-                                       try_lock++;
-                               }
-                       }
-               }
-
-               /*
-                * If any of the previous locks we have locked is in the AIL,
-                * we must TRY to get the second and subsequent locks. If
-                * we can't get any, we must release all we have
-                * and try again.
-                */
-
-               if (try_lock) {
-                       /* try_lock must be 0 if i is 0. */
-                       /*
-                        * try_lock means we have an inode locked
-                        * that is in the AIL.
-                        */
-                       ASSERT(i != 0);
-                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
-                               attempts++;
-
-                               /*
-                                * Unlock all previous guys and try again.
-                                * xfs_iunlock will try to push the tail
-                                * if the inode is in the AIL.
-                                */
-
-                               for(j = i - 1; j >= 0; j--) {
-
-                                       /*
-                                        * Check to see if we've already
-                                        * unlocked this one.
-                                        * Not the first one going back,
-                                        * and the inode ptr is the same.
-                                        */
-                                       if ((j != (i - 1)) && ips[j] ==
-                                                               ips[j+1])
-                                               continue;
-
-                                       xfs_iunlock(ips[j], lock_mode);
-                               }
-
-                               if ((attempts % 5) == 0) {
-                                       delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
-                                       xfs_lock_delays++;
-#endif
-                               }
-                               i = 0;
-                               try_lock = 0;
-                               goto again;
-                       }
-               } else {
-                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
-               }
-       }
-
-#ifdef DEBUG
-       if (attempts) {
-               if (attempts < 5) xfs_small_retries++;
-               else if (attempts < 100) xfs_middle_retries++;
-               else xfs_lots_retries++;
-       } else {
-               xfs_locked_n++;
-       }
-#endif
-}
-
-/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
- */
-void
-xfs_lock_two_inodes(
-       xfs_inode_t             *ip0,
-       xfs_inode_t             *ip1,
-       uint                    lock_mode)
-{
-       xfs_inode_t             *temp;
-       int                     attempts = 0;
-       xfs_log_item_t          *lp;
-
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
-       ASSERT(ip0->i_ino != ip1->i_ino);
-
-       if (ip0->i_ino > ip1->i_ino) {
-               temp = ip0;
-               ip0 = ip1;
-               ip1 = temp;
-       }
-
- again:
-       xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
-
-       /*
-        * If the first lock we have locked is in the AIL, we must TRY to get
-        * the second lock. If we can't get it, we must release the first one
-        * and try again.
-        */
-       lp = (xfs_log_item_t *)ip0->i_itemp;
-       if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-               if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
-                       xfs_iunlock(ip0, lock_mode);
-                       if ((++attempts % 5) == 0)
-                               delay(1); /* Don't just spin the CPU */
-                       goto again;
-               }
-       } else {
-               xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
-       }
-}
-
-int
-xfs_remove(
-       xfs_inode_t             *dp,
-       struct xfs_name         *name,
-       xfs_inode_t             *ip)
-{
-       xfs_mount_t             *mp = dp->i_mount;
-       xfs_trans_t             *tp = NULL;
-       int                     is_dir = S_ISDIR(ip->i_d.di_mode);
-       int                     error = 0;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       int                     cancel_flags;
-       int                     committed;
-       int                     link_zero;
-       uint                    resblks;
-       uint                    log_count;
-
-       trace_xfs_remove(dp, name);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(dp, 0);
-       if (error)
-               goto std_return;
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               goto std_return;
-
-       if (is_dir) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-               log_count = XFS_DEFAULT_LOG_COUNT;
-       } else {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
-               log_count = XFS_REMOVE_LOG_COUNT;
-       }
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
-       /*
-        * We try to get the real space reservation first,
-        * allowing for directory btree deletion(s) implying
-        * possible bmap insert(s).  If we can't get the space
-        * reservation then we use 0 instead, and avoid the bmap
-        * btree insert(s) in the directory code by, if the bmap
-        * insert tries to happen, instead trimming the LAST
-        * block from the directory.
-        */
-       resblks = XFS_REMOVE_SPACE_RES(mp);
-       error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES, log_count);
-       if (error == ENOSPC) {
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
-                                         XFS_TRANS_PERM_LOG_RES, log_count);
-       }
-       if (error) {
-               ASSERT(error != ENOSPC);
-               cancel_flags = 0;
-               goto out_trans_cancel;
-       }
-
-       xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
-
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       /*
-        * If we're removing a directory perform some additional validation.
-        */
-       if (is_dir) {
-               ASSERT(ip->i_d.di_nlink >= 2);
-               if (ip->i_d.di_nlink != 2) {
-                       error = XFS_ERROR(ENOTEMPTY);
-                       goto out_trans_cancel;
-               }
-               if (!xfs_dir_isempty(ip)) {
-                       error = XFS_ERROR(ENOTEMPTY);
-                       goto out_trans_cancel;
-               }
-       }
-
-       xfs_bmap_init(&free_list, &first_block);
-       error = xfs_dir_removename(tp, dp, name, ip->i_ino,
-                                       &first_block, &free_list, resblks);
-       if (error) {
-               ASSERT(error != ENOENT);
-               goto out_bmap_cancel;
-       }
-       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-       if (is_dir) {
-               /*
-                * Drop the link from ip's "..".
-                */
-               error = xfs_droplink(tp, dp);
-               if (error)
-                       goto out_bmap_cancel;
-
-               /*
-                * Drop the "." link from ip to self.
-                */
-               error = xfs_droplink(tp, ip);
-               if (error)
-                       goto out_bmap_cancel;
-       } else {
-               /*
-                * When removing a non-directory we need to log the parent
-                * inode here.  For a directory this is done implicitly
-                * by the xfs_droplink call for the ".." entry.
-                */
-               xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-       }
-
-       /*
-        * Drop the link from dp to ip.
-        */
-       error = xfs_droplink(tp, ip);
-       if (error)
-               goto out_bmap_cancel;
-
-       /*
-        * Determine if this is the last link while
-        * we are in the transaction.
-        */
-       link_zero = (ip->i_d.di_nlink == 0);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * remove transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
-               xfs_trans_set_sync(tp);
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error)
-               goto out_bmap_cancel;
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       if (error)
-               goto std_return;
-
-       /*
-        * If we are using filestreams, kill the stream association.
-        * If the file is still open it may get a new one but that
-        * will get killed on last close in xfs_close() so we don't
-        * have to worry about that.
-        */
-       if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
-               xfs_filestream_deassociate(ip);
-
-       return 0;
-
- out_bmap_cancel:
-       xfs_bmap_cancel(&free_list);
-       cancel_flags |= XFS_TRANS_ABORT;
- out_trans_cancel:
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
-
-int
-xfs_link(
-       xfs_inode_t             *tdp,
-       xfs_inode_t             *sip,
-       struct xfs_name         *target_name)
-{
-       xfs_mount_t             *mp = tdp->i_mount;
-       xfs_trans_t             *tp;
-       int                     error;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       int                     cancel_flags;
-       int                     committed;
-       int                     resblks;
-
-       trace_xfs_link(tdp, target_name);
-
-       ASSERT(!S_ISDIR(sip->i_d.di_mode));
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(sip, 0);
-       if (error)
-               goto std_return;
-
-       error = xfs_qm_dqattach(tdp, 0);
-       if (error)
-               goto std_return;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
-       error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-       if (error == ENOSPC) {
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
-       }
-       if (error) {
-               cancel_flags = 0;
-               goto error_return;
-       }
-
-       xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
-
-       xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
-       /*
-        * If we are using project inheritance, we only allow hard link
-        * creation in our tree when the project IDs are the same; else
-        * the tree quota mechanism could be circumvented.
-        */
-       if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                    (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
-               error = XFS_ERROR(EXDEV);
-               goto error_return;
-       }
-
-       error = xfs_dir_canenter(tp, tdp, target_name, resblks);
-       if (error)
-               goto error_return;
-
-       xfs_bmap_init(&free_list, &first_block);
-
-       error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
-                                       &first_block, &free_list, resblks);
-       if (error)
-               goto abort_return;
-       xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
-       error = xfs_bumplink(tp, sip);
-       if (error)
-               goto abort_return;
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * link transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
-
-       error = xfs_bmap_finish (&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               goto abort_return;
-       }
-
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-       xfs_trans_cancel(tp, cancel_flags);
- std_return:
-       return error;
-}
-
-int
-xfs_set_dmattrs(
-       xfs_inode_t     *ip,
-       u_int           evmask,
-       u_int16_t       state)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_trans_t     *tp;
-       int             error;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
-       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       ip->i_d.di_dmevmask = evmask;
-       ip->i_d.di_dmstate  = state;
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       error = xfs_trans_commit(tp, 0);
-
-       return error;
-}
-
-/*
- * xfs_alloc_file_space()
- *      This routine allocates disk space for the given file.
- *
- *     If alloc_type == 0, this request is for an ALLOCSP type
- *     request which will change the file size.  In this case, no
- *     DMAPI event will be generated by the call.  A TRUNCATE event
- *     will be generated later by xfs_setattr.
- *
- *     If alloc_type != 0, this request is for a RESVSP type
- *     request, and a DMAPI DM_EVENT_WRITE will be generated if the
- *     lower block boundary byte address is less than the file's
- *     length.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
-xfs_alloc_file_space(
-       xfs_inode_t             *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     alloc_type,
-       int                     attr_flags)
-{
-       xfs_mount_t             *mp = ip->i_mount;
-       xfs_off_t               count;
-       xfs_filblks_t           allocated_fsb;
-       xfs_filblks_t           allocatesize_fsb;
-       xfs_extlen_t            extsz, temp;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_fsblock_t           firstfsb;
-       int                     nimaps;
-       int                     quota_flag;
-       int                     rt;
-       xfs_trans_t             *tp;
-       xfs_bmbt_irec_t         imaps[1], *imapp;
-       xfs_bmap_free_t         free_list;
-       uint                    qblocks, resblks, resrtextents;
-       int                     committed;
-       int                     error;
-
-       trace_xfs_alloc_file_space(ip);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return error;
-
-       if (len <= 0)
-               return XFS_ERROR(EINVAL);
-
-       rt = XFS_IS_REALTIME_INODE(ip);
-       extsz = xfs_get_extsz_hint(ip);
-
-       count = len;
-       imapp = &imaps[0];
-       nimaps = 1;
-       startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
-       allocatesize_fsb = XFS_B_TO_FSB(mp, count);
-
-       /*
-        * Allocate file space until done or until there is an error
-        */
-       while (allocatesize_fsb && !error) {
-               xfs_fileoff_t   s, e;
-
-               /*
-                * Determine space reservations for data/realtime.
-                */
-               if (unlikely(extsz)) {
-                       s = startoffset_fsb;
-                       do_div(s, extsz);
-                       s *= extsz;
-                       e = startoffset_fsb + allocatesize_fsb;
-                       if ((temp = do_mod(startoffset_fsb, extsz)))
-                               e += temp;
-                       if ((temp = do_mod(e, extsz)))
-                               e += extsz - temp;
-               } else {
-                       s = 0;
-                       e = allocatesize_fsb;
-               }
-
-               /*
-                * The transaction reservation is limited to a 32-bit block
-                * count, hence we need to limit the number of blocks we are
-                * trying to reserve to avoid an overflow. We can't allocate
-                * more than @nimaps extents, and an extent is limited on disk
-                * to MAXEXTLEN (21 bits), so use that to enforce the limit.
-                */
-               resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
-               if (unlikely(rt)) {
-                       resrtextents = qblocks = resblks;
-                       resrtextents /= mp->m_sb.sb_rextsize;
-                       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-                       quota_flag = XFS_QMOPT_RES_RTBLKS;
-               } else {
-                       resrtextents = 0;
-                       resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
-                       quota_flag = XFS_QMOPT_RES_REGBLKS;
-               }
-
-               /*
-                * Allocate and setup the transaction.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               error = xfs_trans_reserve(tp, resblks,
-                                         XFS_WRITE_LOG_RES(mp), resrtextents,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
-               /*
-                * Check for running out of space
-                */
-               if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       break;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
-                                                     0, quota_flag);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-                                       allocatesize_fsb, alloc_type, &firstfsb,
-                                       0, imapp, &nimaps, &free_list);
-               if (error) {
-                       goto error0;
-               }
-
-               /*
-                * Complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, &committed);
-               if (error) {
-                       goto error0;
-               }
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               if (error) {
-                       break;
-               }
-
-               allocated_fsb = imapp->br_blockcount;
-
-               if (nimaps == 0) {
-                       error = XFS_ERROR(ENOSPC);
-                       break;
-               }
-
-               startoffset_fsb += allocated_fsb;
-               allocatesize_fsb -= allocated_fsb;
-       }
-
-       return error;
-
-error0:        /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
-       xfs_bmap_cancel(&free_list);
-       xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1:        /* Just cancel transaction */
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-       xfs_inode_t             *ip,
-       xfs_off_t               startoff,
-       xfs_off_t               endoff)
-{
-       xfs_bmbt_irec_t         imap;
-       xfs_fileoff_t           offset_fsb;
-       xfs_off_t               lastoffset;
-       xfs_off_t               offset;
-       xfs_buf_t               *bp;
-       xfs_mount_t             *mp = ip->i_mount;
-       int                     nimap;
-       int                     error = 0;
-
-       /*
-        * Avoid doing I/O beyond eof - it's not necessary
-        * since nothing can read beyond eof.  The space will
-        * be zeroed when the file is extended anyway.
-        */
-       if (startoff >= XFS_ISIZE(ip))
-               return 0;
-
-       if (endoff > XFS_ISIZE(ip))
-               endoff = XFS_ISIZE(ip);
-
-       bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
-                                       mp->m_rtdev_targp : mp->m_ddev_targp,
-                                 BTOBB(mp->m_sb.sb_blocksize), 0);
-       if (!bp)
-               return XFS_ERROR(ENOMEM);
-
-       xfs_buf_unlock(bp);
-
-       for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-               offset_fsb = XFS_B_TO_FSBT(mp, offset);
-               nimap = 1;
-               error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-               if (error || nimap < 1)
-                       break;
-               ASSERT(imap.br_blockcount >= 1);
-               ASSERT(imap.br_startoff == offset_fsb);
-               lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-               if (lastoffset > endoff)
-                       lastoffset = endoff;
-               if (imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
-                       continue;
-               XFS_BUF_UNDONE(bp);
-               XFS_BUF_UNWRITE(bp);
-               XFS_BUF_READ(bp);
-               XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-               xfsbdstrat(mp, bp);
-               error = xfs_buf_iowait(bp);
-               if (error) {
-                       xfs_buf_ioerror_alert(bp,
-                                       "xfs_zero_remaining_bytes(read)");
-                       break;
-               }
-               memset(bp->b_addr +
-                       (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-                     0, lastoffset - offset + 1);
-               XFS_BUF_UNDONE(bp);
-               XFS_BUF_UNREAD(bp);
-               XFS_BUF_WRITE(bp);
-               xfsbdstrat(mp, bp);
-               error = xfs_buf_iowait(bp);
-               if (error) {
-                       xfs_buf_ioerror_alert(bp,
-                                       "xfs_zero_remaining_bytes(write)");
-                       break;
-               }
-       }
-       xfs_buf_free(bp);
-       return error;
-}
-
-/*
- * xfs_free_file_space()
- *      This routine frees disk space for the given file.
- *
- *     This routine is only called by xfs_change_file_space
- *     for an UNRESVSP type call.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-STATIC int
-xfs_free_file_space(
-       xfs_inode_t             *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     attr_flags)
-{
-       int                     committed;
-       int                     done;
-       xfs_fileoff_t           endoffset_fsb;
-       int                     error;
-       xfs_fsblock_t           firstfsb;
-       xfs_bmap_free_t         free_list;
-       xfs_bmbt_irec_t         imap;
-       xfs_off_t               ioffset;
-       xfs_extlen_t            mod=0;
-       xfs_mount_t             *mp;
-       int                     nimap;
-       uint                    resblks;
-       xfs_off_t               rounding;
-       int                     rt;
-       xfs_fileoff_t           startoffset_fsb;
-       xfs_trans_t             *tp;
-       int                     need_iolock = 1;
-
-       mp = ip->i_mount;
-
-       trace_xfs_free_file_space(ip);
-
-       error = xfs_qm_dqattach(ip, 0);
-       if (error)
-               return error;
-
-       error = 0;
-       if (len <= 0)   /* if nothing being freed */
-               return error;
-       rt = XFS_IS_REALTIME_INODE(ip);
-       startoffset_fsb = XFS_B_TO_FSB(mp, offset);
-       endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
-
-       if (attr_flags & XFS_ATTR_NOLOCK)
-               need_iolock = 0;
-       if (need_iolock) {
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-               /* wait for the completion of any pending DIOs */
-               inode_dio_wait(VFS_I(ip));
-       }
-
-       rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-       ioffset = offset & ~(rounding - 1);
-       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                             ioffset, -1);
-       if (error)
-               goto out_unlock_iolock;
-       truncate_pagecache_range(VFS_I(ip), ioffset, -1);
-
-       /*
-        * Need to zero the stuff we're not freeing, on disk.
-        * If it's a realtime file & can't use unwritten extents then we
-        * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-        * will take care of it for us.
-        */
-       if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-               nimap = 1;
-               error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out_unlock_iolock;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       xfs_daddr_t     block;
-
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       block = imap.br_startblock;
-                       mod = do_div(block, mp->m_sb.sb_rextsize);
-                       if (mod)
-                               startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-               }
-               nimap = 1;
-               error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-                                       &imap, &nimap, 0);
-               if (error)
-                       goto out_unlock_iolock;
-               ASSERT(nimap == 0 || nimap == 1);
-               if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-                       ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-                       mod++;
-                       if (mod && (mod != mp->m_sb.sb_rextsize))
-                               endoffset_fsb -= mod;
-               }
-       }
-       if ((done = (endoffset_fsb <= startoffset_fsb)))
-               /*
-                * One contiguous piece to clear
-                */
-               error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-       else {
-               /*
-                * Some full blocks, possibly two pieces to clear
-                */
-               if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-                       error = xfs_zero_remaining_bytes(ip, offset,
-                               XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-               if (!error &&
-                   XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-                       error = xfs_zero_remaining_bytes(ip,
-                               XFS_FSB_TO_B(mp, endoffset_fsb),
-                               offset + len - 1);
-       }
-
-       /*
-        * free file space until done or until there is an error
-        */
-       resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-       while (!error && !done) {
-
-               /*
-                * allocate and setup the transaction. Allow this
-                * transaction to dip into the reserve blocks to ensure
-                * the freeing of the space succeeds at ENOSPC.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-               tp->t_flags |= XFS_TRANS_RESERVE;
-               error = xfs_trans_reserve(tp,
-                                         resblks,
-                                         XFS_WRITE_LOG_RES(mp),
-                                         0,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_WRITE_LOG_COUNT);
-
-               /*
-                * check for running out of space
-                */
-               if (error) {
-                       /*
-                        * Free the transaction structure.
-                        */
-                       ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-                       xfs_trans_cancel(tp, 0);
-                       break;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_trans_reserve_quota(tp, mp,
-                               ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-                               resblks, 0, XFS_QMOPT_RES_REGBLKS);
-               if (error)
-                       goto error1;
-
-               xfs_trans_ijoin(tp, ip, 0);
-
-               /*
-                * issue the bunmapi() call to free the blocks
-                */
-               xfs_bmap_init(&free_list, &firstfsb);
-               error = xfs_bunmapi(tp, ip, startoffset_fsb,
-                                 endoffset_fsb - startoffset_fsb,
-                                 0, 2, &firstfsb, &free_list, &done);
-               if (error) {
-                       goto error0;
-               }
-
-               /*
-                * complete the transaction
-                */
-               error = xfs_bmap_finish(&tp, &free_list, &committed);
-               if (error) {
-                       goto error0;
-               }
-
-               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-
- out_unlock_iolock:
-       if (need_iolock)
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return error;
-
- error0:
-       xfs_bmap_cancel(&free_list);
- error1:
-       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
-                   XFS_ILOCK_EXCL);
-       return error;
-}
-
-
-STATIC int
-xfs_zero_file_space(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len,
-       int                     attr_flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       uint                    granularity;
-       xfs_off_t               start_boundary;
-       xfs_off_t               end_boundary;
-       int                     error;
-
-       granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
-
-       /*
-        * Round the range of extents we are going to convert inwards.  If the
-        * offset is aligned, then it doesn't get changed so we zero from the
-        * start of the block offset points to.
-        */
-       start_boundary = round_up(offset, granularity);
-       end_boundary = round_down(offset + len, granularity);
-
-       ASSERT(start_boundary >= offset);
-       ASSERT(end_boundary <= offset + len);
-
-       if (!(attr_flags & XFS_ATTR_NOLOCK))
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-       if (start_boundary < end_boundary - 1) {
-               /* punch out the page cache over the conversion range */
-               truncate_pagecache_range(VFS_I(ip), start_boundary,
-                                        end_boundary - 1);
-               /* convert the blocks */
-               error = xfs_alloc_file_space(ip, start_boundary,
-                                       end_boundary - start_boundary - 1,
-                                       XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
-                                       attr_flags);
-               if (error)
-                       goto out_unlock;
-
-               /* We've handled the interior of the range, now for the edges */
-               if (start_boundary != offset)
-                       error = xfs_iozero(ip, offset, start_boundary - offset);
-               if (error)
-                       goto out_unlock;
-
-               if (end_boundary != offset + len)
-                       error = xfs_iozero(ip, end_boundary,
-                                          offset + len - end_boundary);
-
-       } else {
-               /*
-                * It's either a sub-granularity range or the range spanned lies
-                * partially across two adjacent blocks.
-                */
-               error = xfs_iozero(ip, offset, len);
-       }
-
-out_unlock:
-       if (!(attr_flags & XFS_ATTR_NOLOCK))
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return error;
-
-}
-
-/*
- * xfs_change_file_space()
- *      This routine allocates or frees disk space for the given file.
- *      The user specified parameters are checked for alignment and size
- *      limitations.
- *
- * RETURNS:
- *       0 on success
- *      errno on error
- *
- */
-int
-xfs_change_file_space(
-       xfs_inode_t     *ip,
-       int             cmd,
-       xfs_flock64_t   *bf,
-       xfs_off_t       offset,
-       int             attr_flags)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       int             clrprealloc;
-       int             error;
-       xfs_fsize_t     fsize;
-       int             setprealloc;
-       xfs_off_t       startoffset;
-       xfs_trans_t     *tp;
-       struct iattr    iattr;
-
-       if (!S_ISREG(ip->i_d.di_mode))
-               return XFS_ERROR(EINVAL);
-
-       switch (bf->l_whence) {
-       case 0: /*SEEK_SET*/
-               break;
-       case 1: /*SEEK_CUR*/
-               bf->l_start += offset;
-               break;
-       case 2: /*SEEK_END*/
-               bf->l_start += XFS_ISIZE(ip);
-               break;
-       default:
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * length of <= 0 for resv/unresv/zero is invalid.  length for
-        * alloc/free is ignored completely and we have no idea what userspace
-        * might have set it to, so set it to zero to allow range
-        * checks to pass.
-        */
-       switch (cmd) {
-       case XFS_IOC_ZERO_RANGE:
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_RESVSP64:
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_UNRESVSP64:
-               if (bf->l_len <= 0)
-                       return XFS_ERROR(EINVAL);
-               break;
-       default:
-               bf->l_len = 0;
-               break;
-       }
-
-       if (bf->l_start < 0 ||
-           bf->l_start > mp->m_super->s_maxbytes ||
-           bf->l_start + bf->l_len < 0 ||
-           bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
-               return XFS_ERROR(EINVAL);
-
-       bf->l_whence = 0;
-
-       startoffset = bf->l_start;
-       fsize = XFS_ISIZE(ip);
-
-       setprealloc = clrprealloc = 0;
-       switch (cmd) {
-       case XFS_IOC_ZERO_RANGE:
-               error = xfs_zero_file_space(ip, startoffset, bf->l_len,
-                                               attr_flags);
-               if (error)
-                       return error;
-               setprealloc = 1;
-               break;
-
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_RESVSP64:
-               error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-                                               XFS_BMAPI_PREALLOC, attr_flags);
-               if (error)
-                       return error;
-               setprealloc = 1;
-               break;
-
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_UNRESVSP64:
-               if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
-                                                               attr_flags)))
-                       return error;
-               break;
-
-       case XFS_IOC_ALLOCSP:
-       case XFS_IOC_ALLOCSP64:
-       case XFS_IOC_FREESP:
-       case XFS_IOC_FREESP64:
-               /*
-                * These operations actually do IO when extending the file, but
-                * the allocation is done seperately to the zeroing that is
-                * done. This set of operations need to be serialised against
-                * other IO operations, such as truncate and buffered IO. We
-                * need to take the IOLOCK here to serialise the allocation and
-                * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
-                * truncate, direct IO) from racing against the transient
-                * allocated but not written state we can have here.
-                */
-               xfs_ilock(ip, XFS_IOLOCK_EXCL);
-               if (startoffset > fsize) {
-                       error = xfs_alloc_file_space(ip, fsize,
-                                       startoffset - fsize, 0,
-                                       attr_flags | XFS_ATTR_NOLOCK);
-                       if (error) {
-                               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                               break;
-                       }
-               }
-
-               iattr.ia_valid = ATTR_SIZE;
-               iattr.ia_size = startoffset;
-
-               error = xfs_setattr_size(ip, &iattr,
-                                        attr_flags | XFS_ATTR_NOLOCK);
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
-               if (error)
-                       return error;
-
-               clrprealloc = 1;
-               break;
-
-       default:
-               ASSERT(0);
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * update the inode timestamp, mode, and prealloc flag bits
-        */
-       tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-
-       if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
-                                     0, 0, 0))) {
-               /* ASSERT(0); */
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-       if ((attr_flags & XFS_ATTR_DMI) == 0) {
-               ip->i_d.di_mode &= ~S_ISUID;
-
-               /*
-                * Note that we don't have to worry about mandatory
-                * file locking being disabled here because we only
-                * clear the S_ISGID bit if the Group execute bit is
-                * on, but if it was on then mandatory locking wouldn't
-                * have been enabled.
-                */
-               if (ip->i_d.di_mode & S_IXGRP)
-                       ip->i_d.di_mode &= ~S_ISGID;
-
-               xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       }
-       if (setprealloc)
-               ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-       else if (clrprealloc)
-               ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       if (attr_flags & XFS_ATTR_SYNC)
-               xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
-}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
deleted file mode 100644 (file)
index 38c67c3..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _XFS_VNODEOPS_H
-#define _XFS_VNODEOPS_H 1
-
-struct attrlist_cursor_kern;
-struct file;
-struct iattr;
-struct inode;
-struct iovec;
-struct kiocb;
-struct pipe_inode_info;
-struct uio;
-struct xfs_inode;
-
-
-int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
-int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
-#define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
-#define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
-#define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL         0x08    /* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC          0x10    /* synchronous operation required */
-
-int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-               struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
-               xfs_dev_t rdev, struct xfs_inode **ipp);
-int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
-               struct xfs_inode *ip);
-int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
-               struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
-int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
-               const char *target_path, umode_t mode, struct xfs_inode **ipp);
-int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-               xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
-int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
-               struct xfs_inode *src_ip, struct xfs_inode *target_dp,
-               struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-               unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
-               unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
-               int flags, struct attrlist_cursor_kern *cursor);
-
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-
-#endif /* _XFS_VNODEOPS_H */
index 87d3e03878c8da30b762d19263ec3e1cbe02aa24..e01f35ea76ba436310f11d82b9fdf78322d8f6fe 100644 (file)
  */
 
 #include "xfs.h"
+#include "xfs_log_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_acl.h"
-#include "xfs_vnodeops.h"
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
index d13371134c59fbec0be79992a692257d8021a959..cc7494a3542983bbd4e73b95920eb981bb196716 100644 (file)
@@ -328,6 +328,7 @@ struct quotactl_ops {
        int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *);
        int (*get_xstate)(struct super_block *, struct fs_quota_stat *);
        int (*set_xstate)(struct super_block *, unsigned int, int);
+       int (*get_xstatev)(struct super_block *, struct fs_quota_statv *);
 };
 
 struct quota_format_type {
index 86552807aed949529fc34a3007658ede564f4e52..dcd75cc261962f65c909a6efa0defe3f1dcdc281 100644 (file)
@@ -38,6 +38,7 @@
 #define Q_XGETQSTAT    XQM_CMD(5)      /* get quota subsystem status */
 #define Q_XQUOTARM     XQM_CMD(6)      /* free disk space used by dquots */
 #define Q_XQUOTASYNC   XQM_CMD(7)      /* delalloc flush, updates dquots */
+#define Q_XGETQSTATV   XQM_CMD(8)      /* newer version of get quota */
 
 /*
  * fs_disk_quota structure:
@@ -163,4 +164,50 @@ typedef struct fs_quota_stat {
        __u16           qs_iwarnlimit;  /* limit for num warnings */
 } fs_quota_stat_t;
 
+/*
+ * fs_quota_statv is used by Q_XGETQSTATV for a given file system. It provides
+ * a centralized way to get meta information about the quota subsystem. eg.
+ * space taken up for user, group, and project quotas, number of dquots
+ * currently incore.
+ *
+ * This version has proper versioning support with appropriate padding for
+ * future expansions, and ability to expand for future without creating any
+ * backward compatibility issues.
+ *
+ * Q_XGETQSTATV uses the passed in value of the requested version via
+ * fs_quota_statv.qs_version to determine the return data layout of
+ * fs_quota_statv.  The kernel will fill the data fields relevant to that
+ * version.
+ *
+ * If kernel does not support user space caller specified version, EINVAL will
+ * be returned. User space caller can then reduce the version number and retry
+ * the same command.
+ */
+#define FS_QSTATV_VERSION1     1       /* fs_quota_statv.qs_version */
+/*
+ * Some basic information about 'quota files' for Q_XGETQSTATV command
+ */
+struct fs_qfilestatv {
+       __u64           qfs_ino;        /* inode number */
+       __u64           qfs_nblks;      /* number of BBs 512-byte-blks */
+       __u32           qfs_nextents;   /* number of extents */
+       __u32           qfs_pad;        /* pad for 8-byte alignment */
+};
+
+struct fs_quota_statv {
+       __s8                    qs_version;     /* version for future changes */
+       __u8                    qs_pad1;        /* pad for 16bit alignment */
+       __u16                   qs_flags;       /* FS_QUOTA_.* flags */
+       __u32                   qs_incoredqs;   /* number of dquots incore */
+       struct fs_qfilestatv    qs_uquota;      /* user quota information */
+       struct fs_qfilestatv    qs_gquota;      /* group quota information */
+       struct fs_qfilestatv    qs_pquota;      /* project quota information */
+       __s32                   qs_btimelimit;  /* limit for blks timer */
+       __s32                   qs_itimelimit;  /* limit for inodes timer */
+       __s32                   qs_rtbtimelimit;/* limit for rt blks timer */
+       __u16                   qs_bwarnlimit;  /* limit for num warnings */
+       __u16                   qs_iwarnlimit;  /* limit for num warnings */
+       __u64                   qs_pad2[8];     /* for future proofing */
+};
+
 #endif /* _LINUX_DQBLK_XFS_H */
index 0a2c4bcf179e21d321b25206718cb58aad722b54..bfa9e13c9a939d34045bb8a360aef1743093b800 100644 (file)
@@ -1123,7 +1123,6 @@ config IPC_NS
 
 config USER_NS
        bool "User namespace"
-       depends on UIDGID_CONVERTED
        select UIDGID_STRICT_TYPE_CHECKS
 
        default n
@@ -1157,20 +1156,8 @@ config NET_NS
 
 endif # NAMESPACES
 
-config UIDGID_CONVERTED
-       # True if all of the selected software conmponents are known
-       # to have uid_t and gid_t converted to kuid_t and kgid_t
-       # where appropriate and are otherwise safe to use with
-       # the user namespace.
-       bool
-       default y
-
-       # Filesystems
-       depends on XFS_FS = n
-
 config UIDGID_STRICT_TYPE_CHECKS
        bool "Require conversions between uid/gids and their internal representation"
-       depends on UIDGID_CONVERTED
        default n
        help
         While the nececessary conversions are being added to all subsystems this option allows
index 6fc1c8af44df745bad512e02be04645984d1b5a1..4e66bf9275b03edf3c62e0e350afc5248f2b00b8 100644 (file)
@@ -452,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
 
        return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
 }
+EXPORT_SYMBOL(inode_capable);