Pileus Git - ~andy/linux/blobdiff - fs/ext4/extents_status.c
ext4: add support for extent pre-caching
[~andy/linux] / fs / ext4 / extents_status.c
index ee018d5f397e6ba88ec0bf21196e1ad0612d4e53..0e88a367b535f0b1321e68c6a5d742d910a428e6 100644 (file)
@@ -148,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end);
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
                                       int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei);
 
 int __init ext4_init_es(void)
 {
@@ -261,7 +263,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
        if (tree->cache_es) {
                es1 = tree->cache_es;
                if (in_range(lblk, es1->es_lblk, es1->es_len)) {
-                       es_debug("%u cached by [%u/%u) %llu %llx\n",
+                       es_debug("%u cached by [%u/%u) %llu %x\n",
                                 lblk, es1->es_lblk, es1->es_len,
                                 ext4_es_pblock(es1), ext4_es_status(es1));
                        goto out;
@@ -417,7 +419,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
        unsigned short ee_len;
        int depth, ee_status, es_status;
 
-       path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+       path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return;
 
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 */
                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
                        if (in_range(es->es_lblk, ee_block, ee_len)) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%d/%d/%llu/%c], but we "
                                        "want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 */
                if (es->es_lblk < ee_block ||
                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                }
 
                if (ee_status ^ es_status) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                 * that we don't want to add an written/unwritten extent.
                 */
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %d but we want "
                                "to add an written/unwritten extent "
                                "[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                         * We want to add a delayed/hole extent but this
                         * block has been allocated.
                         */
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
                                "delayed/hole extent [%d/%d/%llu/%llx]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                        return;
                } else if (ext4_es_is_written(es)) {
                        if (retval != es->es_len) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu retval %d != es_len %d\n",
                                        inode->i_ino, retval, es->es_len);
                                return;
                        }
                        if (map.m_pblk != ext4_es_pblock(es)) {
-                               pr_warn("ES insert assertation failed for "
+                               pr_warn("ES insert assertion failed for "
                                        "inode: %lu m_pblk %llu != "
                                        "es_pblk %llu\n",
                                        inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                }
        } else if (retval == 0) {
                if (ext4_es_is_written(es)) {
-                       pr_warn("ES insert assertation failed for inode: %lu "
+                       pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
                                "an written extent [%d/%d/%llu/%llx]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
@@ -632,22 +634,20 @@ out:
 }
 
 /*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
  *
  * Return 0 on success, error code on failure.
  */
 int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len, ext4_fsblk_t pblk,
-                         unsigned long long status)
+                         unsigned int status)
 {
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        int err = 0;
 
-       es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+       es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, inode->i_ino);
 
        if (!len)
@@ -667,7 +667,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
        err = __es_remove_extent(inode, lblk, end);
        if (err != 0)
                goto error;
+retry:
        err = __es_insert_extent(inode, &newes);
+       if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+                                              EXT4_I(inode)))
+               goto retry;
+       if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+               err = 0;
 
 error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -677,6 +683,38 @@ error:
        return err;
 }
 
+/*
+ * ext4_es_cache_extent() inserts [lblk, lblk + len) into the extent
+ * status tree only if __es_tree_search() returns no entry or one that
+ * starts past the end of the range.  Insert failures are ignored.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+                         ext4_lblk_t len, ext4_fsblk_t pblk,
+                         unsigned int status)
+{
+       struct extent_status *es;
+       struct extent_status newes;
+       ext4_lblk_t end = lblk + len - 1;
+
+       newes.es_lblk = lblk;
+       newes.es_len = len;
+       ext4_es_store_pblock(&newes, pblk);
+       ext4_es_store_status(&newes, status);
+       trace_ext4_es_cache_extent(inode, &newes); /* runs even if len == 0 */
+
+       if (!len)
+               return;         /* empty range: nothing to insert */
+
+       BUG_ON(end < lblk);     /* range end must not precede lblk */
+
+       write_lock(&EXT4_I(inode)->i_es_lock);  /* held across search + insert */
+
+       es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+       if (!es || es->es_lblk > end)
+               __es_insert_extent(inode, &newes);
+       write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
 /*
  * ext4_es_lookup_extent() looks up an extent in extent status tree.
  *
@@ -746,8 +784,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
        struct extent_status orig_es;
        ext4_lblk_t len1, len2;
        ext4_fsblk_t block;
-       int err = 0;
+       int err;
 
+retry:
+       err = 0;
        es = __es_tree_search(&tree->root, lblk);
        if (!es)
                goto out;
@@ -782,6 +822,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                        if (err) {
                                es->es_lblk = orig_es.es_lblk;
                                es->es_len = orig_es.es_len;
+                               if ((err == -ENOMEM) &&
+                                   __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+                                                    EXT4_I(inode)))
+                                       goto retry;
                                goto out;
                        }
                } else {
@@ -883,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
        eia = list_entry(a, struct ext4_inode_info, i_es_lru);
        eib = list_entry(b, struct ext4_inode_info, i_es_lru);
 
+       if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+           !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+               return 1;
+       if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+           ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+               return -1;
        if (eia->i_touch_when == eib->i_touch_when)
                return 0;
        if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -891,34 +941,18 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
                return -1;
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+                           struct ext4_inode_info *locked_ei)
 {
-       struct ext4_sb_info *sbi = container_of(shrink,
-                                       struct ext4_sb_info, s_es_shrinker);
        struct ext4_inode_info *ei;
        struct list_head *cur, *tmp;
-       LIST_HEAD(skiped);
-       int nr_to_scan = sc->nr_to_scan;
+       LIST_HEAD(skipped);
        int ret, nr_shrunk = 0;
-
-       ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
-       if (!nr_to_scan)
-               return ret;
+       int retried = 0, skip_precached = 1, nr_skipped = 0;
 
        spin_lock(&sbi->s_es_lru_lock);
 
-       /*
-        * If the inode that is at the head of LRU list is newer than
-        * last_sorted time, that means that we need to sort this list.
-        */
-       ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
-       if (sbi->s_es_last_sorted < ei->i_touch_when) {
-               list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
-               sbi->s_es_last_sorted = jiffies;
-       }
-
+retry:
        list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
                /*
                 * If we have already reclaimed all extents from extent
@@ -929,13 +963,20 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
                ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-               /* Skip the inode that is newer than the last_sorted time */
-               if (sbi->s_es_last_sorted < ei->i_touch_when) {
-                       list_move_tail(cur, &skiped);
+               /*
+                * Skip the inode that is newer than the last_sorted
+                * time.  Normally we try hard to avoid shrinking
+                * precached inodes, but we will as a last resort.
+                */
+               if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+                   (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+                                               EXT4_STATE_EXT_PRECACHED))) {
+                       nr_skipped++;
+                       list_move_tail(cur, &skipped);
                        continue;
                }
 
-               if (ei->i_es_lru_nr == 0)
+               if (ei->i_es_lru_nr == 0 || ei == locked_ei)
                        continue;
 
                write_lock(&ei->i_es_lock);
@@ -951,9 +992,52 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
        }
 
        /* Move the newer inodes into the tail of the LRU list. */
-       list_splice_tail(&skiped, &sbi->s_es_lru);
+       list_splice_tail(&skipped, &sbi->s_es_lru);
+       INIT_LIST_HEAD(&skipped);
+
+       /*
+        * If we skipped any inodes, and we weren't able to make any
+        * forward progress, sort the list and try again.
+        */
+       if ((nr_shrunk == 0) && nr_skipped && !retried) {
+               retried++;
+               list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+               sbi->s_es_last_sorted = jiffies;
+               ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+                                     i_es_lru);
+               /*
+                * If there are no non-precached inodes left on the
+                * list, start releasing precached extents.
+                */
+               if (ext4_test_inode_state(&ei->vfs_inode,
+                                         EXT4_STATE_EXT_PRECACHED))
+                       skip_precached = 0;
+               goto retry;
+       }
+
        spin_unlock(&sbi->s_es_lru_lock);
 
+       if (locked_ei && nr_shrunk == 0)
+               nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
+
+       return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+       struct ext4_sb_info *sbi = container_of(shrink,
+                                       struct ext4_sb_info, s_es_shrinker);
+       int nr_to_scan = sc->nr_to_scan;
+       int ret, nr_shrunk;
+
+       ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+       trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+       if (!nr_to_scan)
+               return ret;
+
+       nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
        ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
        trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
        return ret;
@@ -1009,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
        struct rb_node *node;
        struct extent_status *es;
        int nr_shrunk = 0;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
 
        if (ei->i_es_lru_nr == 0)
                return 0;
 
+       if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+           __ratelimit(&_rs))
+               ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
        node = rb_first(&tree->root);
        while (node != NULL) {
                es = rb_entry(node, struct extent_status, rb_node);