Merge commit 'v2.6.38-rc6' into for-2.6.39/core
author    Jens Axboe <jaxboe@fusionio.com>
          Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
committer Jens Axboe <jaxboe@fusionio.com>
          Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
Conflicts:
block/cfq-iosched.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
block/blk-core.c
block/cfq-iosched.c
include/linux/blkdev.h
include/linux/elevator.h

diff --combined block/blk-core.c
index ab4a7696956d9c2ae4599e9cd4261caf958f18a0,2f4002f79a24b3cf242c870282d96859dc475dc9..3cc17e6064d68e5a3315d012a395c84f660bcb8a
@@@ -33,7 -33,7 +33,7 @@@
  
  #include "blk.h"
  
- EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  
@@@ -64,13 -64,27 +64,27 @@@ static void drive_stat_acct(struct requ
                return;
  
        cpu = part_stat_lock();
-       part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  
-       if (!new_io)
+       if (!new_io) {
+               part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
-       else {
+       } else {
+               part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+               if (!hd_struct_try_get(part)) {
+                       /*
+                        * The partition is already being removed,
+                        * the request will be accounted on the disk only
+                        *
+                        * We take a reference on disk->part0 although that
+                        * partition will never be deleted, so we can treat
+                        * it as any other partition.
+                        */
+                       part = &rq->rq_disk->part0;
+                       hd_struct_get(part);
+               }
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
+               rq->part = part;
        }
  
        part_stat_unlock();
@@@ -128,35 -142,46 +142,36 @@@ void blk_rq_init(struct request_queue *
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
+       rq->part = NULL;
  }
  EXPORT_SYMBOL(blk_rq_init);
  
  static void req_bio_endio(struct request *rq, struct bio *bio,
                          unsigned int nbytes, int error)
  {
 -      struct request_queue *q = rq->q;
 -
 -      if (&q->flush_rq != rq) {
 -              if (error)
 -                      clear_bit(BIO_UPTODATE, &bio->bi_flags);
 -              else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 -                      error = -EIO;
 -
 -              if (unlikely(nbytes > bio->bi_size)) {
 -                      printk(KERN_ERR "%s: want %u bytes done, %u left\n",
 -                             __func__, nbytes, bio->bi_size);
 -                      nbytes = bio->bi_size;
 -              }
 +      if (error)
 +              clear_bit(BIO_UPTODATE, &bio->bi_flags);
 +      else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +              error = -EIO;
 +
 +      if (unlikely(nbytes > bio->bi_size)) {
 +              printk(KERN_ERR "%s: want %u bytes done, %u left\n",
 +                     __func__, nbytes, bio->bi_size);
 +              nbytes = bio->bi_size;
 +      }
  
 -              if (unlikely(rq->cmd_flags & REQ_QUIET))
 -                      set_bit(BIO_QUIET, &bio->bi_flags);
 +      if (unlikely(rq->cmd_flags & REQ_QUIET))
 +              set_bit(BIO_QUIET, &bio->bi_flags);
  
 -              bio->bi_size -= nbytes;
 -              bio->bi_sector += (nbytes >> 9);
 +      bio->bi_size -= nbytes;
 +      bio->bi_sector += (nbytes >> 9);
  
 -              if (bio_integrity(bio))
 -                      bio_integrity_advance(bio, nbytes);
 +      if (bio_integrity(bio))
 +              bio_integrity_advance(bio, nbytes);
  
 -              if (bio->bi_size == 0)
 -                      bio_endio(bio, error);
 -      } else {
 -              /*
 -               * Okay, this is the sequenced flush request in
 -               * progress, just record the error;
 -               */
 -              if (error && !q->flush_err)
 -                      q->flush_err = error;
 -      }
 +      /* don't actually finish bio if it's part of flush sequence */
 +      if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
 +              bio_endio(bio, error);
  }
  
  void blk_dump_rq_flags(struct request *rq, char *msg)
@@@ -515,9 -540,7 +530,9 @@@ struct request_queue *blk_alloc_queue_n
        init_timer(&q->unplug_timer);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
        INIT_LIST_HEAD(&q->timeout_list);
 -      INIT_LIST_HEAD(&q->pending_flushes);
 +      INIT_LIST_HEAD(&q->flush_queue[0]);
 +      INIT_LIST_HEAD(&q->flush_queue[1]);
 +      INIT_LIST_HEAD(&q->flush_data_in_flight);
        INIT_WORK(&q->unplug_work, blk_unplug_work);
  
        kobject_init(&q->kobj, &blk_queue_ktype);
@@@ -737,25 -760,6 +752,25 @@@ static void freed_request(struct reques
                __freed_request(q, sync ^ 1);
  }
  
 +/*
 + * Determine if elevator data should be initialized when allocating the
 + * request associated with @bio.
 + */
 +static bool blk_rq_should_init_elevator(struct bio *bio)
 +{
 +      if (!bio)
 +              return true;
 +
 +      /*
 +       * Flush requests do not use the elevator so skip initialization.
 +       * This allows a request to share the flush and elevator data.
 +       */
 +      if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
 +              return false;
 +
 +      return true;
 +}
 +
  /*
   * Get a free request, queue_lock must be held.
   * Returns NULL on failure, with queue_lock held.
@@@ -768,7 -772,7 +783,7 @@@ static struct request *get_request(stru
        struct request_list *rl = &q->rq;
        struct io_context *ioc = NULL;
        const bool is_sync = rw_is_sync(rw_flags) != 0;
 -      int may_queue, priv;
 +      int may_queue, priv = 0;
  
        may_queue = elv_may_queue(q, rw_flags);
        if (may_queue == ELV_MQUEUE_NO)
        rl->count[is_sync]++;
        rl->starved[is_sync] = 0;
  
 -      priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 -      if (priv)
 -              rl->elvpriv++;
 +      if (blk_rq_should_init_elevator(bio)) {
 +              priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 +              if (priv)
 +                      rl->elvpriv++;
 +      }
  
        if (blk_queue_io_stat(q))
                rw_flags |= REQ_IO_STAT;
@@@ -1217,7 -1219,7 +1232,7 @@@ static int __make_request(struct reques
        spin_lock_irq(q->queue_lock);
  
        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
 -              where = ELEVATOR_INSERT_FRONT;
 +              where = ELEVATOR_INSERT_FLUSH;
                goto get_rq;
        }
  
@@@ -1342,9 -1344,9 +1357,9 @@@ static inline void blk_partition_remap(
                bio->bi_sector += p->start_sect;
                bio->bi_bdev = bdev->bd_contains;
  
-               trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev,
-                                   bio->bi_sector - p->start_sect);
+               trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+                                     bdev->bd_dev,
+                                     bio->bi_sector - p->start_sect);
        }
  }
  
@@@ -1513,7 -1515,7 +1528,7 @@@ static inline void __generic_make_reque
                        goto end_io;
  
                if (old_sector != -1)
-                       trace_block_remap(q, bio, old_dev, old_sector);
+                       trace_block_bio_remap(q, bio, old_dev, old_sector);
  
                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;
@@@ -1789,7 -1791,7 +1804,7 @@@ static void blk_account_io_completion(s
                int cpu;
  
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
@@@ -1802,20 -1804,21 +1817,21 @@@ static void blk_account_io_done(struct 
         * normal IO on queueing nor completion.  Accounting the
         * containing request is enough.
         */
 -      if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
 +      if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
                unsigned long duration = jiffies - req->start_time;
                const int rw = rq_data_dir(req);
                struct hd_struct *part;
                int cpu;
  
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
  
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
                part_round_stats(cpu, part);
                part_dec_in_flight(part, rw);
  
+               hd_struct_put(part);
                part_stat_unlock();
        }
  }
@@@ -2619,7 -2622,9 +2635,9 @@@ int __init blk_dev_init(void
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
                        sizeof(((struct request *)0)->cmd_flags));
  
-       kblockd_workqueue = create_workqueue("kblockd");
+       /* used for unplugging and affects IO latency/throughput - HIGHPRI */
+       kblockd_workqueue = alloc_workqueue("kblockd",
+                                           WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");
  
diff --combined block/cfq-iosched.c
index 968455c57e1a1edeacc6da28fc1fea0ad4c3343c,7be4c79596250d28cb82afa9e1aef085abc6a66b..f27ff3efe6cd2cf92af18aeb0ae063f7dc38ab3a
@@@ -54,9 -54,9 +54,9 @@@ static const int cfq_hist_divisor = 4
  #define CFQQ_SEEKY(cfqq)      (hweight32(cfqq->seek_history) > 32/8)
  
  #define RQ_CIC(rq)            \
 -      ((struct cfq_io_context *) (rq)->elevator_private)
 -#define RQ_CFQQ(rq)           (struct cfq_queue *) ((rq)->elevator_private2)
 -#define RQ_CFQG(rq)           (struct cfq_group *) ((rq)->elevator_private3)
 +      ((struct cfq_io_context *) (rq)->elevator_private[0])
 +#define RQ_CFQQ(rq)           (struct cfq_queue *) ((rq)->elevator_private[1])
 +#define RQ_CFQG(rq)           (struct cfq_group *) ((rq)->elevator_private[2])
  
  static struct kmem_cache *cfq_pool;
  static struct kmem_cache *cfq_ioc_pool;
@@@ -87,7 -87,6 +87,6 @@@ struct cfq_rb_root 
        unsigned count;
        unsigned total_weight;
        u64 min_vdisktime;
-       struct rb_node *active;
  };
  #define CFQ_RB_ROOT   (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
                        .count = 0, .min_vdisktime = 0, }
@@@ -97,7 -96,7 +96,7 @@@
   */
  struct cfq_queue {
        /* reference count */
-       atomic_t ref;
+       int ref;
        /* various state flags, see below */
        unsigned int flags;
        /* parent cfq_data */
@@@ -180,7 -179,6 +179,6 @@@ struct cfq_group 
        /* group service_tree key */
        u64 vdisktime;
        unsigned int weight;
-       bool on_st;
  
        /* number of cfqq currently on this group */
        int nr_cfqq;
        struct blkio_group blkg;
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
        struct hlist_node cfqd_node;
-       atomic_t ref;
+       int ref;
  #endif
        /* number of requests that are on the dispatch list or inside driver */
        int dispatched;
@@@ -563,11 -561,6 +561,6 @@@ static void update_min_vdisktime(struc
        u64 vdisktime = st->min_vdisktime;
        struct cfq_group *cfqg;
  
-       if (st->active) {
-               cfqg = rb_entry_cfqg(st->active);
-               vdisktime = cfqg->vdisktime;
-       }
        if (st->left) {
                cfqg = rb_entry_cfqg(st->left);
                vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@@ -605,8 -598,8 +598,8 @@@ cfq_group_slice(struct cfq_data *cfqd, 
        return cfq_target_latency * cfqg->weight / st->total_weight;
  }
  
- static inline void
- cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ static inline unsigned
+ cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  {
        unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
        if (cfqd->cfq_latency) {
                                    low_slice);
                }
        }
+       return slice;
+ }
+ static inline void
+ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
+       unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
        cfqq->slice_start = jiffies;
        cfqq->slice_end = jiffies + slice;
        cfqq->allocated_slice = slice;
  static inline bool cfq_slice_used(struct cfq_queue *cfqq)
  {
        if (cfq_cfqq_slice_new(cfqq))
-               return 0;
+               return false;
        if (time_before(jiffies, cfqq->slice_end))
-               return 0;
+               return false;
  
-       return 1;
+       return true;
  }
  
  /*
@@@ -869,7 -870,7 +870,7 @@@ cfq_group_service_tree_add(struct cfq_d
        struct rb_node *n;
  
        cfqg->nr_cfqq++;
-       if (cfqg->on_st)
+       if (!RB_EMPTY_NODE(&cfqg->rb_node))
                return;
  
        /*
                cfqg->vdisktime = st->min_vdisktime;
  
        __cfq_group_service_tree_add(st, cfqg);
-       cfqg->on_st = true;
        st->total_weight += cfqg->weight;
  }
  
@@@ -894,9 -894,6 +894,6 @@@ cfq_group_service_tree_del(struct cfq_d
  {
        struct cfq_rb_root *st = &cfqd->grp_service_tree;
  
-       if (st->active == &cfqg->rb_node)
-               st->active = NULL;
        BUG_ON(cfqg->nr_cfqq < 1);
        cfqg->nr_cfqq--;
  
                return;
  
        cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-       cfqg->on_st = false;
        st->total_weight -= cfqg->weight;
        if (!RB_EMPTY_NODE(&cfqg->rb_node))
                cfq_rb_erase(&cfqg->rb_node, st);
@@@ -1026,11 -1022,11 +1022,11 @@@ cfq_find_alloc_cfqg(struct cfq_data *cf
         * elevator which will be dropped by either elevator exit
         * or cgroup deletion path depending on who is exiting first.
         */
-       atomic_set(&cfqg->ref, 1);
+       cfqg->ref = 1;
  
        /*
         * Add group onto cgroup list. It might happen that bdi->dev is
-        * not initiliazed yet. Initialize this new group without major
+        * not initialized yet. Initialize this new group without major
         * and minor info and this info will be filled in once a new thread
         * comes for IO. See code above.
         */
@@@ -1071,7 -1067,7 +1067,7 @@@ static struct cfq_group *cfq_get_cfqg(s
  
  static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
  {
-       atomic_inc(&cfqg->ref);
+       cfqg->ref++;
        return cfqg;
  }
  
@@@ -1083,7 -1079,7 +1079,7 @@@ static void cfq_link_cfqq_cfqg(struct c
  
        cfqq->cfqg = cfqg;
        /* cfqq reference on cfqg */
-       atomic_inc(&cfqq->cfqg->ref);
+       cfqq->cfqg->ref++;
  }
  
  static void cfq_put_cfqg(struct cfq_group *cfqg)
        struct cfq_rb_root *st;
        int i, j;
  
-       BUG_ON(atomic_read(&cfqg->ref) <= 0);
-       if (!atomic_dec_and_test(&cfqg->ref))
+       BUG_ON(cfqg->ref <= 0);
+       cfqg->ref--;
+       if (cfqg->ref)
                return;
        for_each_cfqg_st(cfqg, i, j, st)
-               BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+               BUG_ON(!RB_EMPTY_ROOT(&st->rb));
        kfree(cfqg);
  }
  
@@@ -1200,7 -1197,7 +1197,7 @@@ static void cfq_service_tree_add(struc
                        cfq_group_service_tree_del(cfqd, cfqq->cfqg);
                cfqq->orig_cfqg = cfqq->cfqg;
                cfqq->cfqg = &cfqd->root_group;
-               atomic_inc(&cfqd->root_group.ref);
+               cfqd->root_group.ref++;
                group_changed = 1;
        } else if (!cfqd->cfq_group_isolation
                   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@@ -1672,8 -1669,11 +1669,11 @@@ __cfq_slice_expired(struct cfq_data *cf
        /*
         * store what was left of this slice, if the queue idled/timed out
         */
-       if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
-               cfqq->slice_resid = cfqq->slice_end - jiffies;
+       if (timed_out) {
+               if (cfq_cfqq_slice_new(cfqq))
+                       cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
+               else
+                       cfqq->slice_resid = cfqq->slice_end - jiffies;
                cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
        }
  
        if (cfqq == cfqd->active_queue)
                cfqd->active_queue = NULL;
  
-       if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-               cfqd->grp_service_tree.active = NULL;
        if (cfqd->active_cic) {
                put_io_context(cfqd->active_cic->ioc);
                cfqd->active_cic = NULL;
@@@ -1901,10 -1898,10 +1898,10 @@@ static bool cfq_should_idle(struct cfq_
         * in their service tree.
         */
        if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
-               return 1;
+               return true;
        cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
                        service_tree->count);
-       return 0;
+       return false;
  }
  
  static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@@ -2040,7 -2037,7 +2037,7 @@@ static int cfqq_process_refs(struct cfq
        int process_refs, io_refs;
  
        io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-       process_refs = atomic_read(&cfqq->ref) - io_refs;
+       process_refs = cfqq->ref - io_refs;
        BUG_ON(process_refs < 0);
        return process_refs;
  }
@@@ -2080,10 -2077,10 +2077,10 @@@ static void cfq_setup_merge(struct cfq_
         */
        if (new_process_refs >= process_refs) {
                cfqq->new_cfqq = new_cfqq;
-               atomic_add(process_refs, &new_cfqq->ref);
+               new_cfqq->ref += process_refs;
        } else {
                new_cfqq->new_cfqq = cfqq;
-               atomic_add(new_process_refs, &cfqq->ref);
+               cfqq->ref += new_process_refs;
        }
  }
  
@@@ -2116,12 -2113,7 +2113,7 @@@ static void choose_service_tree(struct 
        unsigned count;
        struct cfq_rb_root *st;
        unsigned group_slice;
-       if (!cfqg) {
-               cfqd->serving_prio = IDLE_WORKLOAD;
-               cfqd->workload_expires = jiffies + 1;
-               return;
-       }
+       enum wl_prio_t original_prio = cfqd->serving_prio;
  
        /* Choose next priority. RT > BE > IDLE */
        if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
                return;
        }
  
+       if (original_prio != cfqd->serving_prio)
+               goto new_workload;
        /*
         * For RT and BE, we have to choose also the type
         * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
        if (count && !time_after(jiffies, cfqd->workload_expires))
                return;
  
+ new_workload:
        /* otherwise select new workload type */
        cfqd->serving_type =
                cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@@ -2199,7 -2195,6 +2195,6 @@@ static struct cfq_group *cfq_get_next_c
        if (RB_EMPTY_ROOT(&st->rb))
                return NULL;
        cfqg = cfq_rb_first_group(st);
-       st->active = &cfqg->rb_node;
        update_min_vdisktime(st);
        return cfqg;
  }
@@@ -2293,6 -2288,17 +2288,17 @@@ static struct cfq_queue *cfq_select_que
                goto keep_queue;
        }
  
+       /*
+        * This is a deep seek queue, but the device is much faster than
+        * the queue can deliver, don't idle
+        **/
+       if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
+           (cfq_cfqq_slice_new(cfqq) ||
+           (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
+               cfq_clear_cfqq_deep(cfqq);
+               cfq_clear_cfqq_idle_window(cfqq);
+       }
        if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
                cfqq = NULL;
                goto keep_queue;
@@@ -2367,12 -2373,12 +2373,12 @@@ static inline bool cfq_slice_used_soon(
  {
        /* the queue hasn't finished any request, can't estimate */
        if (cfq_cfqq_slice_new(cfqq))
-               return 1;
+               return true;
        if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
                cfqq->slice_end))
-               return 1;
+               return true;
  
-       return 0;
+       return false;
  }
  
  static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@@ -2538,9 -2544,10 +2544,10 @@@ static void cfq_put_queue(struct cfq_qu
        struct cfq_data *cfqd = cfqq->cfqd;
        struct cfq_group *cfqg, *orig_cfqg;
  
-       BUG_ON(atomic_read(&cfqq->ref) <= 0);
+       BUG_ON(cfqq->ref <= 0);
  
-       if (!atomic_dec_and_test(&cfqq->ref))
+       cfqq->ref--;
+       if (cfqq->ref)
                return;
  
        cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@@ -2843,7 -2850,7 +2850,7 @@@ static void cfq_init_cfqq(struct cfq_da
        RB_CLEAR_NODE(&cfqq->p_node);
        INIT_LIST_HEAD(&cfqq->fifo);
  
-       atomic_set(&cfqq->ref, 0);
+       cfqq->ref = 0;
        cfqq->cfqd = cfqd;
  
        cfq_mark_cfqq_prio_changed(cfqq);
@@@ -2979,11 -2986,11 +2986,11 @@@ cfq_get_queue(struct cfq_data *cfqd, bo
         * pin the queue now that it's allocated, scheduler exit will prune it
         */
        if (!is_sync && !(*async_cfqq)) {
-               atomic_inc(&cfqq->ref);
+               cfqq->ref++;
                *async_cfqq = cfqq;
        }
  
-       atomic_inc(&cfqq->ref);
+       cfqq->ref++;
        return cfqq;
  }
  
@@@ -3265,6 -3272,10 +3272,10 @@@ cfq_should_preempt(struct cfq_data *cfq
        if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
                return true;
  
+       /* An idle queue should not be idle now for some reason */
+       if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
+               return true;
        if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
                return false;
  
   */
  static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
  {
+       struct cfq_queue *old_cfqq = cfqd->active_queue;
        cfq_log_cfqq(cfqd, cfqq, "preempt");
        cfq_slice_expired(cfqd, 1);
  
+       /*
+        * workload type is changed, don't save slice, otherwise preempt
+        * doesn't happen
+        */
+       if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+               cfqq->cfqg->saved_workload_slice = 0;
        /*
         * Put the new queue at the front of the of the current list,
         * so we know that it will be selected next.
@@@ -3412,6 -3432,10 +3432,10 @@@ static bool cfq_should_wait_busy(struc
  {
        struct cfq_io_context *cic = cfqd->active_cic;
  
+       /* If the queue already has requests, don't wait */
+       if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+               return false;
        /* If there are other queues in the group, don't wait */
        if (cfqq->cfqg->nr_cfqq > 1)
                return false;
@@@ -3589,12 -3613,12 +3613,12 @@@ static void cfq_put_request(struct requ
  
                put_io_context(RQ_CIC(rq)->ioc);
  
 -              rq->elevator_private = NULL;
 -              rq->elevator_private2 = NULL;
 +              rq->elevator_private[0] = NULL;
 +              rq->elevator_private[1] = NULL;
  
                /* Put down rq reference on cfqg */
                cfq_put_cfqg(RQ_CFQG(rq));
 -              rq->elevator_private3 = NULL;
 +              rq->elevator_private[2] = NULL;
  
                cfq_put_queue(cfqq);
        }
@@@ -3681,13 -3705,13 +3705,13 @@@ new_queue
        }
  
        cfqq->allocated[rw]++;
-       atomic_inc(&cfqq->ref);
 -      cfqq->ref++;
 -      rq->elevator_private = cic;
 -      rq->elevator_private2 = cfqq;
 -      rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
  
        spin_unlock_irqrestore(q->queue_lock, flags);
  
++      cfqq->ref++;
 +      rq->elevator_private[0] = cic;
 +      rq->elevator_private[1] = cfqq;
 +      rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
        return 0;
  
  queue_fail:
@@@ -3862,6 -3886,10 +3886,10 @@@ static void *cfq_init_queue(struct requ
        if (!cfqd)
                return NULL;
  
+       /*
+        * Don't need take queue_lock in the routine, since we are
+        * initializing the ioscheduler, and nobody is using cfqd
+        */
        cfqd->cic_index = i;
  
        /* Init root service tree */
         * Take a reference to root group which we never drop. This is just
         * to make sure that cfq_put_cfqg() does not try to kfree root group
         */
-       atomic_set(&cfqg->ref, 1);
+       cfqg->ref = 1;
        rcu_read_lock();
        cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
                                        (void *)cfqd, 0);
         * will not attempt to free it.
         */
        cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-       atomic_inc(&cfqd->oom_cfqq.ref);
+       cfqd->oom_cfqq.ref++;
        cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
  
        INIT_LIST_HEAD(&cfqd->cic_list);
diff --combined include/linux/blkdev.h
index 12bb426949e9f68a42c5b505db6c63650d9b6128,4d18ff34670a4a882e5d08e83b1633ecd2973610..e3ee74fc59030fe6b4c2c15bc8c4ef85a03e0096
@@@ -108,19 -108,14 +108,20 @@@ struct request 
  
        /*
         * Three pointers are available for the IO schedulers, if they need
 -       * more they have to dynamically allocate it.
 +       * more they have to dynamically allocate it.  Flush requests are
 +       * never put on the IO scheduler. So let the flush fields share
 +       * space with the three elevator_private pointers.
         */
 -      void *elevator_private;
 -      void *elevator_private2;
 -      void *elevator_private3;
 +      union {
 +              void *elevator_private[3];
 +              struct {
 +                      unsigned int            seq;
 +                      struct list_head        list;
 +              } flush;
 +      };
  
        struct gendisk *rq_disk;
+       struct hd_struct *part;
        unsigned long start_time;
  #ifdef CONFIG_BLK_CGROUP
        unsigned long long start_time_ns;
@@@ -368,12 -363,11 +369,12 @@@ struct request_queu
         * for flush operations
         */
        unsigned int            flush_flags;
 -      unsigned int            flush_seq;
 -      int                     flush_err;
 +      unsigned int            flush_pending_idx:1;
 +      unsigned int            flush_running_idx:1;
 +      unsigned long           flush_pending_since;
 +      struct list_head        flush_queue[2];
 +      struct list_head        flush_data_in_flight;
        struct request          flush_rq;
 -      struct request          *orig_flush_rq;
 -      struct list_head        pending_flushes;
  
        struct mutex            sysfs_lock;
  
@@@ -653,7 -647,6 +654,6 @@@ static inline void rq_flush_dcache_page
  
  extern int blk_register_queue(struct gendisk *disk);
  extern void blk_unregister_queue(struct gendisk *disk);
- extern void register_disk(struct gendisk *dev);
  extern void generic_make_request(struct bio *bio);
  extern void blk_rq_init(struct request_queue *q, struct request *rq);
  extern void blk_put_request(struct request *);
@@@ -1263,6 -1256,9 +1263,9 @@@ struct block_device_operations 
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*direct_access) (struct block_device *, sector_t,
                                                void **, unsigned long *);
+       unsigned int (*check_events) (struct gendisk *disk,
+                                     unsigned int clearing);
+       /* ->media_changed() is DEPRECATED, use ->check_events() instead */
        int (*media_changed) (struct gendisk *);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
diff --combined include/linux/elevator.h
index 86120c916fcc05c2190db55b0a8e0b350610e3ab,4d857973d2c94317cf11041a4a7070794fc13a99..39b68edb388d34df1d0988382cd8e825d3fc66d7
@@@ -167,7 -167,6 +167,7 @@@ extern struct request *elv_rb_find(stru
  #define ELEVATOR_INSERT_BACK  2
  #define ELEVATOR_INSERT_SORT  3
  #define ELEVATOR_INSERT_REQUEUE       4
 +#define ELEVATOR_INSERT_FLUSH 5
  
  /*
   * return values from elevator_may_queue_fn
@@@ -196,15 -195,9 +196,9 @@@ enum 
  /*
   * io context count accounting
   */
- #define elv_ioc_count_mod(name, __val)                                \
-       do {                                                    \
-               preempt_disable();                              \
-               __get_cpu_var(name) += (__val);                 \
-               preempt_enable();                               \
-       } while (0)
- #define elv_ioc_count_inc(name)       elv_ioc_count_mod(name, 1)
- #define elv_ioc_count_dec(name)       elv_ioc_count_mod(name, -1)
+ #define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val)
+ #define elv_ioc_count_inc(name)       this_cpu_inc(name)
+ #define elv_ioc_count_dec(name)       this_cpu_dec(name)
  
  #define elv_ioc_count_read(name)                              \
  ({                                                            \