Pileus Git - ~andy/linux/blob - block/blk-cgroup.c

   1 /*
   2  * Common Block IO controller cgroup interface
   3  *
   4  * Based on ideas and code from CFQ, CFS and BFQ:
   5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   6  *
   7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
   8  *                    Paolo Valente <paolo.valente@unimore.it>
   9  *
  10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  11  *                    Nauman Rafique <nauman@google.com>
  12  */
  13 #include <linux/ioprio.h>
  14 #include <linux/kdev_t.h>
  15 #include <linux/module.h>
  16 #include <linux/err.h>
  17 #include <linux/blkdev.h>
  18 #include <linux/slab.h>
  19 #include <linux/genhd.h>
  20 #include <linux/delay.h>
  21 #include <linux/atomic.h>
  22 #include "blk-cgroup.h"
  23 #include "blk.h"
  24
  25 #define MAX_KEY_LEN 100
  26
  27 static DEFINE_MUTEX(blkcg_pol_mutex);
  28
  29 struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
  30                             .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
  31 EXPORT_SYMBOL_GPL(blkcg_root);
  32
  33 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  34
  35 static bool blkcg_policy_enabled(struct request_queue *q,
  36                                  const struct blkcg_policy *pol)
  37 {
  38         return pol && test_bit(pol->plid, q->blkcg_pols);
  39 }
  40
  41 /**
  42  * blkg_free - free a blkg
  43  * @blkg: blkg to free
  44  *
  45  * Free @blkg which may be partially allocated.
  46  */
  47 static void blkg_free(struct blkcg_gq *blkg)
  48 {
  49         int i;
  50
  51         if (!blkg)
  52                 return;
  53
  54         for (i = 0; i < BLKCG_MAX_POLS; i++) {
  55                 struct blkcg_policy *pol = blkcg_policy[i];
  56                 struct blkg_policy_data *pd = blkg->pd[i];
  57
  58                 if (!pd)
  59                         continue;
  60
  61                 if (pol && pol->pd_exit_fn)
  62                         pol->pd_exit_fn(blkg);
  63
  64                 kfree(pd);
  65         }
  66
  67         blk_exit_rl(&blkg->rl);
  68         kfree(blkg);
  69 }
  70
  71 /**
  72  * blkg_alloc - allocate a blkg
  73  * @blkcg: block cgroup the new blkg is associated with
  74  * @q: request_queue the new blkg is associated with
  75  * @gfp_mask: allocation mask to use
  76  *
  77  * Allocate a new blkg assocating @blkcg and @q.
  78  */
  79 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
  80                                    gfp_t gfp_mask)
  81 {
  82         struct blkcg_gq *blkg;
  83         int i;
  84
  85         /* alloc and init base part */
  86         blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
  87         if (!blkg)
  88                 return NULL;
  89
  90         blkg->q = q;
  91         INIT_LIST_HEAD(&blkg->q_node);
  92         blkg->blkcg = blkcg;
  93         blkg->refcnt = 1;
  94
  95         /* root blkg uses @q->root_rl, init rl only for !root blkgs */
  96         if (blkcg != &blkcg_root) {
  97                 if (blk_init_rl(&blkg->rl, q, gfp_mask))
  98                         goto err_free;
  99                 blkg->rl.blkg = blkg;
 100         }
 101
 102         for (i = 0; i < BLKCG_MAX_POLS; i++) {
 103                 struct blkcg_policy *pol = blkcg_policy[i];
 104                 struct blkg_policy_data *pd;
 105
 106                 if (!blkcg_policy_enabled(q, pol))
 107                         continue;
 108
 109                 /* alloc per-policy data and attach it to blkg */
 110                 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
 111                 if (!pd)
 112                         goto err_free;
 113
 114                 blkg->pd[i] = pd;
 115                 pd->blkg = blkg;
 116
 117                 /* invoke per-policy init */
 118                 if (pol->pd_init_fn)
 119                         pol->pd_init_fn(blkg);
 120         }
 121
 122         return blkg;
 123
 124 err_free:
 125         blkg_free(blkg);
 126         return NULL;
 127 }
 128
 129 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 130                                       struct request_queue *q, bool update_hint)
 131 {
 132         struct blkcg_gq *blkg;
 133
 134         blkg = rcu_dereference(blkcg->blkg_hint);
 135         if (blkg && blkg->q == q)
 136                 return blkg;
 137
 138         /*
 139          * Hint didn't match.  Look up from the radix tree.  Note that the
 140          * hint can only be updated under queue_lock as otherwise @blkg
 141          * could have already been removed from blkg_tree.  The caller is
 142          * responsible for grabbing queue_lock if @update_hint.
 143          */
 144         blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
 145         if (blkg && blkg->q == q) {
 146                 if (update_hint) {
 147                         lockdep_assert_held(q->queue_lock);
 148                         rcu_assign_pointer(blkcg->blkg_hint, blkg);
 149                 }
 150                 return blkg;
 151         }
 152
 153         return NULL;
 154 }
 155
 156 /**
 157  * blkg_lookup - lookup blkg for the specified blkcg - q pair
 158  * @blkcg: blkcg of interest
 159  * @q: request_queue of interest
 160  *
 161  * Lookup blkg for the @blkcg - @q pair.  This function should be called
 162  * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
 163  * - see blk_queue_bypass_start() for details.
 164  */
 165 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
 166 {
 167         WARN_ON_ONCE(!rcu_read_lock_held());
 168
 169         if (unlikely(blk_queue_bypass(q)))
 170                 return NULL;
 171         return __blkg_lookup(blkcg, q, false);
 172 }
 173 EXPORT_SYMBOL_GPL(blkg_lookup);
 174
 175 /*
 176  * If @new_blkg is %NULL, this function tries to allocate a new one as
 177  * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
 178  */
 179 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 180                                     struct request_queue *q,
 181                                     struct blkcg_gq *new_blkg)
 182 {
 183         struct blkcg_gq *blkg;
 184         int ret;
 185
 186         WARN_ON_ONCE(!rcu_read_lock_held());
 187         lockdep_assert_held(q->queue_lock);
 188
 189         /* blkg holds a reference to blkcg */
 190         if (!css_tryget(&blkcg->css)) {
 191                 ret = -EINVAL;
 192                 goto err_free_blkg;
 193         }
 194
 195         /* allocate */
 196         if (!new_blkg) {
 197                 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 198                 if (unlikely(!new_blkg)) {
 199                         ret = -ENOMEM;
 200                         goto err_put_css;
 201                 }
 202         }
 203         blkg = new_blkg;
 204
 205         /* link parent and insert */
 206         if (blkcg_parent(blkcg)) {
 207                 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 208                 if (WARN_ON_ONCE(!blkg->parent)) {
 209                         blkg = ERR_PTR(-EINVAL);
 210                         goto err_put_css;
 211                 }
 212                 blkg_get(blkg->parent);
 213         }
 214
 215         spin_lock(&blkcg->lock);
 216         ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
 217         if (likely(!ret)) {
 218                 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
 219                 list_add(&blkg->q_node, &q->blkg_list);
 220         }
 221         spin_unlock(&blkcg->lock);
 222
 223         if (!ret)
 224                 return blkg;
 225
 226         /* @blkg failed fully initialized, use the usual release path */
 227         blkg_put(blkg);
 228         return ERR_PTR(ret);
 229
 230 err_put_css:
 231         css_put(&blkcg->css);
 232 err_free_blkg:
 233         blkg_free(new_blkg);
 234         return ERR_PTR(ret);
 235 }
 236
 237 /**
 238  * blkg_lookup_create - lookup blkg, try to create one if not there
 239  * @blkcg: blkcg of interest
 240  * @q: request_queue of interest
 241  *
 242  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
 243  * create one.  blkg creation is performed recursively from blkcg_root such
 244  * that all non-root blkg's have access to the parent blkg.  This function
 245  * should be called under RCU read lock and @q->queue_lock.
 246  *
 247  * Returns pointer to the looked up or created blkg on success, ERR_PTR()
 248  * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
 249  * dead and bypassing, returns ERR_PTR(-EBUSY).
 250  */
 251 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 252                                     struct request_queue *q)
 253 {
 254         struct blkcg_gq *blkg;
 255
 256         WARN_ON_ONCE(!rcu_read_lock_held());
 257         lockdep_assert_held(q->queue_lock);
 258
 259         /*
 260          * This could be the first entry point of blkcg implementation and
 261          * we shouldn't allow anything to go through for a bypassing queue.
 262          */
 263         if (unlikely(blk_queue_bypass(q)))
 264                 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
 265
 266         blkg = __blkg_lookup(blkcg, q, true);
 267         if (blkg)
 268                 return blkg;
 269
 270         /*
 271          * Create blkgs walking down from blkcg_root to @blkcg, so that all
 272          * non-root blkgs have access to their parents.
 273          */
 274         while (true) {
 275                 struct blkcg *pos = blkcg;
 276                 struct blkcg *parent = blkcg_parent(blkcg);
 277
 278                 while (parent && !__blkg_lookup(parent, q, false)) {
 279                         pos = parent;
 280                         parent = blkcg_parent(parent);
 281                 }
 282
 283                 blkg = blkg_create(pos, q, NULL);
 284                 if (pos == blkcg || IS_ERR(blkg))
 285                         return blkg;
 286         }
 287 }
 288 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 289
 290 static void blkg_destroy(struct blkcg_gq *blkg)
 291 {
 292         struct blkcg *blkcg = blkg->blkcg;
 293
 294         lockdep_assert_held(blkg->q->queue_lock);
 295         lockdep_assert_held(&blkcg->lock);
 296
 297         /* Something wrong if we are trying to remove same group twice */
 298         WARN_ON_ONCE(list_empty(&blkg->q_node));
 299         WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 300
 301         radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
 302         list_del_init(&blkg->q_node);
 303         hlist_del_init_rcu(&blkg->blkcg_node);
 304
 305         /*
 306          * Both setting lookup hint to and clearing it from @blkg are done
 307          * under queue_lock.  If it's not pointing to @blkg now, it never
 308          * will.  Hint assignment itself can race safely.
 309          */
 310         if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
 311                 rcu_assign_pointer(blkcg->blkg_hint, NULL);
 312
 313         /*
 314          * Put the reference taken at the time of creation so that when all
 315          * queues are gone, group can be destroyed.
 316          */
 317         blkg_put(blkg);
 318 }
 319
 320 /**
 321  * blkg_destroy_all - destroy all blkgs associated with a request_queue
 322  * @q: request_queue of interest
 323  *
 324  * Destroy all blkgs associated with @q.
 325  */
 326 static void blkg_destroy_all(struct request_queue *q)
 327 {
 328         struct blkcg_gq *blkg, *n;
 329
 330         lockdep_assert_held(q->queue_lock);
 331
 332         list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 333                 struct blkcg *blkcg = blkg->blkcg;
 334
 335                 spin_lock(&blkcg->lock);
 336                 blkg_destroy(blkg);
 337                 spin_unlock(&blkcg->lock);
 338         }
 339
 340         /*
 341          * root blkg is destroyed.  Just clear the pointer since
 342          * root_rl does not take reference on root blkg.
 343          */
 344         q->root_blkg = NULL;
 345         q->root_rl.blkg = NULL;
 346 }
 347
 348 static void blkg_rcu_free(struct rcu_head *rcu_head)
 349 {
 350         blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
 351 }
 352
 353 void __blkg_release(struct blkcg_gq *blkg)
 354 {
 355         /* release the blkcg and parent blkg refs this blkg has been holding */
 356         css_put(&blkg->blkcg->css);
 357         if (blkg->parent)
 358                 blkg_put(blkg->parent);
 359
 360         /*
 361          * A group is freed in rcu manner. But having an rcu lock does not
 362          * mean that one can access all the fields of blkg and assume these
 363          * are valid. For example, don't try to follow throtl_data and
 364          * request queue links.
 365          *
 366          * Having a reference to blkg under an rcu allows acess to only
 367          * values local to groups like group stats and group rate limits
 368          */
 369         call_rcu(&blkg->rcu_head, blkg_rcu_free);
 370 }
 371 EXPORT_SYMBOL_GPL(__blkg_release);
 372
 373 /*
 374  * The next function used by blk_queue_for_each_rl().  It's a bit tricky
 375  * because the root blkg uses @q->root_rl instead of its own rl.
 376  */
 377 struct request_list *__blk_queue_next_rl(struct request_list *rl,
 378                                          struct request_queue *q)
 379 {
 380         struct list_head *ent;
 381         struct blkcg_gq *blkg;
 382
 383         /*
 384          * Determine the current blkg list_head.  The first entry is
 385          * root_rl which is off @q->blkg_list and mapped to the head.
 386          */
 387         if (rl == &q->root_rl) {
 388                 ent = &q->blkg_list;
 389                 /* There are no more block groups, hence no request lists */
 390                 if (list_empty(ent))
 391                         return NULL;
 392         } else {
 393                 blkg = container_of(rl, struct blkcg_gq, rl);
 394                 ent = &blkg->q_node;
 395         }
 396
 397         /* walk to the next list_head, skip root blkcg */
 398         ent = ent->next;
 399         if (ent == &q->root_blkg->q_node)
 400                 ent = ent->next;
 401         if (ent == &q->blkg_list)
 402                 return NULL;
 403
 404         blkg = container_of(ent, struct blkcg_gq, q_node);
 405         return &blkg->rl;
 406 }
 407
 408 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
 409                              u64 val)
 410 {
 411         struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 412         struct blkcg_gq *blkg;
 413         struct hlist_node *n;
 414         int i;
 415
 416         mutex_lock(&blkcg_pol_mutex);
 417         spin_lock_irq(&blkcg->lock);
 418
 419         /*
 420          * Note that stat reset is racy - it doesn't synchronize against
 421          * stat updates.  This is a debug feature which shouldn't exist
 422          * anyway.  If you get hit by a race, retry.
 423          */
 424         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
 425                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
 426                         struct blkcg_policy *pol = blkcg_policy[i];
 427
 428                         if (blkcg_policy_enabled(blkg->q, pol) &&
 429                             pol->pd_reset_stats_fn)
 430                                 pol->pd_reset_stats_fn(blkg);
 431                 }
 432         }
 433
 434         spin_unlock_irq(&blkcg->lock);
 435         mutex_unlock(&blkcg_pol_mutex);
 436         return 0;
 437 }
 438
 439 static const char *blkg_dev_name(struct blkcg_gq *blkg)
 440 {
 441         /* some drivers (floppy) instantiate a queue w/o disk registered */
 442         if (blkg->q->backing_dev_info.dev)
 443                 return dev_name(blkg->q->backing_dev_info.dev);
 444         return NULL;
 445 }
 446
 447 /**
 448  * blkcg_print_blkgs - helper for printing per-blkg data
 449  * @sf: seq_file to print to
 450  * @blkcg: blkcg of interest
 451  * @prfill: fill function to print out a blkg
 452  * @pol: policy in question
 453  * @data: data to be passed to @prfill
 454  * @show_total: to print out sum of prfill return values or not
 455  *
 456  * This function invokes @prfill on each blkg of @blkcg if pd for the
 457  * policy specified by @pol exists.  @prfill is invoked with @sf, the
 458  * policy data and @data.  If @show_total is %true, the sum of the return
 459  * values from @prfill is printed with "Total" label at the end.
 460  *
 461  * This is to be used to construct print functions for
 462  * cftype->read_seq_string method.
 463  */
 464 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 465                        u64 (*prfill)(struct seq_file *,
 466                                      struct blkg_policy_data *, int),
 467                        const struct blkcg_policy *pol, int data,
 468                        bool show_total)
 469 {
 470         struct blkcg_gq *blkg;
 471         struct hlist_node *n;
 472         u64 total = 0;
 473
 474         spin_lock_irq(&blkcg->lock);
 475         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
 476                 if (blkcg_policy_enabled(blkg->q, pol))
 477                         total += prfill(sf, blkg->pd[pol->plid], data);
 478         spin_unlock_irq(&blkcg->lock);
 479
 480         if (show_total)
 481                 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
 482 }
 483 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
 484
 485 /**
 486  * __blkg_prfill_u64 - prfill helper for a single u64 value
 487  * @sf: seq_file to print to
 488  * @pd: policy private data of interest
 489  * @v: value to print
 490  *
 491  * Print @v to @sf for the device assocaited with @pd.
 492  */
 493 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
 494 {
 495         const char *dname = blkg_dev_name(pd->blkg);
 496
 497         if (!dname)
 498                 return 0;
 499
 500         seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
 501         return v;
 502 }
 503 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 504
 505 /**
 506  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
 507  * @sf: seq_file to print to
 508  * @pd: policy private data of interest
 509  * @rwstat: rwstat to print
 510  *
 511  * Print @rwstat to @sf for the device assocaited with @pd.
 512  */
 513 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 514                          const struct blkg_rwstat *rwstat)
 515 {
 516         static const char *rwstr[] = {
 517                 [BLKG_RWSTAT_READ]      = "Read",
 518                 [BLKG_RWSTAT_WRITE]     = "Write",
 519                 [BLKG_RWSTAT_SYNC]      = "Sync",
 520                 [BLKG_RWSTAT_ASYNC]     = "Async",
 521         };
 522         const char *dname = blkg_dev_name(pd->blkg);
 523         u64 v;
 524         int i;
 525
 526         if (!dname)
 527                 return 0;
 528
 529         for (i = 0; i < BLKG_RWSTAT_NR; i++)
 530                 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
 531                            (unsigned long long)rwstat->cnt[i]);
 532
 533         v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
 534         seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
 535         return v;
 536 }
 537
 538 /**
 539  * blkg_prfill_stat - prfill callback for blkg_stat
 540  * @sf: seq_file to print to
 541  * @pd: policy private data of interest
 542  * @off: offset to the blkg_stat in @pd
 543  *
 544  * prfill callback for printing a blkg_stat.
 545  */
 546 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
 547 {
 548         return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
 549 }
 550 EXPORT_SYMBOL_GPL(blkg_prfill_stat);
 551
 552 /**
 553  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
 554  * @sf: seq_file to print to
 555  * @pd: policy private data of interest
 556  * @off: offset to the blkg_rwstat in @pd
 557  *
 558  * prfill callback for printing a blkg_rwstat.
 559  */
 560 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 561                        int off)
 562 {
 563         struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
 564
 565         return __blkg_prfill_rwstat(sf, pd, &rwstat);
 566 }
 567 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 568
 569 /**
 570  * blkg_conf_prep - parse and prepare for per-blkg config update
 571  * @blkcg: target block cgroup
 572  * @pol: target policy
 573  * @input: input string
 574  * @ctx: blkg_conf_ctx to be filled
 575  *
 576  * Parse per-blkg config update from @input and initialize @ctx with the
 577  * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
 578  * value.  This function returns with RCU read lock and queue lock held and
 579  * must be paired with blkg_conf_finish().
 580  */
 581 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 582                    const char *input, struct blkg_conf_ctx *ctx)
 583         __acquires(rcu) __acquires(disk->queue->queue_lock)
 584 {
 585         struct gendisk *disk;
 586         struct blkcg_gq *blkg;
 587         unsigned int major, minor;
 588         unsigned long long v;
 589         int part, ret;
 590
 591         if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
 592                 return -EINVAL;
 593
 594         disk = get_gendisk(MKDEV(major, minor), &part);
 595         if (!disk || part)
 596                 return -EINVAL;
 597
 598         rcu_read_lock();
 599         spin_lock_irq(disk->queue->queue_lock);
 600
 601         if (blkcg_policy_enabled(disk->queue, pol))
 602                 blkg = blkg_lookup_create(blkcg, disk->queue);
 603         else
 604                 blkg = ERR_PTR(-EINVAL);
 605
 606         if (IS_ERR(blkg)) {
 607                 ret = PTR_ERR(blkg);
 608                 rcu_read_unlock();
 609                 spin_unlock_irq(disk->queue->queue_lock);
 610                 put_disk(disk);
 611                 /*
 612                  * If queue was bypassing, we should retry.  Do so after a
 613                  * short msleep().  It isn't strictly necessary but queue
 614                  * can be bypassing for some time and it's always nice to
 615                  * avoid busy looping.
 616                  */
 617                 if (ret == -EBUSY) {
 618                         msleep(10);
 619                         ret = restart_syscall();
 620                 }
 621                 return ret;
 622         }
 623
 624         ctx->disk = disk;
 625         ctx->blkg = blkg;
 626         ctx->v = v;
 627         return 0;
 628 }
 629 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 630
 631 /**
 632  * blkg_conf_finish - finish up per-blkg config update
 633  * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
 634  *
 635  * Finish up after per-blkg config update.  This function must be paired
 636  * with blkg_conf_prep().
 637  */
 638 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 639         __releases(ctx->disk->queue->queue_lock) __releases(rcu)
 640 {
 641         spin_unlock_irq(ctx->disk->queue->queue_lock);
 642         rcu_read_unlock();
 643         put_disk(ctx->disk);
 644 }
 645 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 646
 647 struct cftype blkcg_files[] = {
 648         {
 649                 .name = "reset_stats",
 650                 .write_u64 = blkcg_reset_stats,
 651         },
 652         { }     /* terminate */
 653 };
 654
 655 /**
 656  * blkcg_css_offline - cgroup css_offline callback
 657  * @cgroup: cgroup of interest
 658  *
 659  * This function is called when @cgroup is about to go away and responsible
 660  * for shooting down all blkgs associated with @cgroup.  blkgs should be
 661  * removed while holding both q and blkcg locks.  As blkcg lock is nested
 662  * inside q lock, this function performs reverse double lock dancing.
 663  *
 664  * This is the blkcg counterpart of ioc_release_fn().
 665  */
 666 static void blkcg_css_offline(struct cgroup *cgroup)
 667 {
 668         struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 669
 670         spin_lock_irq(&blkcg->lock);
 671
 672         while (!hlist_empty(&blkcg->blkg_list)) {
 673                 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
 674                                                 struct blkcg_gq, blkcg_node);
 675                 struct request_queue *q = blkg->q;
 676
 677                 if (spin_trylock(q->queue_lock)) {
 678                         blkg_destroy(blkg);
 679                         spin_unlock(q->queue_lock);
 680                 } else {
 681                         spin_unlock_irq(&blkcg->lock);
 682                         cpu_relax();
 683                         spin_lock_irq(&blkcg->lock);
 684                 }
 685         }
 686
 687         spin_unlock_irq(&blkcg->lock);
 688 }
 689
 690 static void blkcg_css_free(struct cgroup *cgroup)
 691 {
 692         struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
 693
 694         if (blkcg != &blkcg_root)
 695                 kfree(blkcg);
 696 }
 697
 698 static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
 699 {
 700         static atomic64_t id_seq = ATOMIC64_INIT(0);
 701         struct blkcg *blkcg;
 702         struct cgroup *parent = cgroup->parent;
 703
 704         if (!parent) {
 705                 blkcg = &blkcg_root;
 706                 goto done;
 707         }
 708
 709         blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 710         if (!blkcg)
 711                 return ERR_PTR(-ENOMEM);
 712
 713         blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
 714         blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
 715         blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
 716 done:
 717         spin_lock_init(&blkcg->lock);
 718         INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 719         INIT_HLIST_HEAD(&blkcg->blkg_list);
 720
 721         return &blkcg->css;
 722 }
 723
 724 /**
 725  * blkcg_init_queue - initialize blkcg part of request queue
 726  * @q: request_queue to initialize
 727  *
 728  * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
 729  * part of new request_queue @q.
 730  *
 731  * RETURNS:
 732  * 0 on success, -errno on failure.
 733  */
 734 int blkcg_init_queue(struct request_queue *q)
 735 {
 736         might_sleep();
 737
 738         return blk_throtl_init(q);
 739 }
 740
 741 /**
 742  * blkcg_drain_queue - drain blkcg part of request_queue
 743  * @q: request_queue to drain
 744  *
 745  * Called from blk_drain_queue().  Responsible for draining blkcg part.
 746  */
 747 void blkcg_drain_queue(struct request_queue *q)
 748 {
 749         lockdep_assert_held(q->queue_lock);
 750
 751         blk_throtl_drain(q);
 752 }
 753
 754 /**
 755  * blkcg_exit_queue - exit and release blkcg part of request_queue
 756  * @q: request_queue being released
 757  *
 758  * Called from blk_release_queue().  Responsible for exiting blkcg part.
 759  */
 760 void blkcg_exit_queue(struct request_queue *q)
 761 {
 762         spin_lock_irq(q->queue_lock);
 763         blkg_destroy_all(q);
 764         spin_unlock_irq(q->queue_lock);
 765
 766         blk_throtl_exit(q);
 767 }
 768
 769 /*
 770  * We cannot support shared io contexts, as we have no mean to support
 771  * two tasks with the same ioc in two different groups without major rework
 772  * of the main cic data structures.  For now we allow a task to change
 773  * its cgroup only if it's the only owner of its ioc.
 774  */
 775 static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 776 {
 777         struct task_struct *task;
 778         struct io_context *ioc;
 779         int ret = 0;
 780
 781         /* task_lock() is needed to avoid races with exit_io_context() */
 782         cgroup_taskset_for_each(task, cgrp, tset) {
 783                 task_lock(task);
 784                 ioc = task->io_context;
 785                 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
 786                         ret = -EINVAL;
 787                 task_unlock(task);
 788                 if (ret)
 789                         break;
 790         }
 791         return ret;
 792 }
 793
 794 struct cgroup_subsys blkio_subsys = {
 795         .name = "blkio",
 796         .css_alloc = blkcg_css_alloc,
 797         .css_offline = blkcg_css_offline,
 798         .css_free = blkcg_css_free,
 799         .can_attach = blkcg_can_attach,
 800         .subsys_id = blkio_subsys_id,
 801         .base_cftypes = blkcg_files,
 802         .module = THIS_MODULE,
 803
 804         /*
 805          * blkio subsystem is utterly broken in terms of hierarchy support.
 806          * It treats all cgroups equally regardless of where they're
 807          * located in the hierarchy - all cgroups are treated as if they're
 808          * right below the root.  Fix it and remove the following.
 809          */
 810         .broken_hierarchy = true,
 811 };
 812 EXPORT_SYMBOL_GPL(blkio_subsys);
 813
 814 /**
 815  * blkcg_activate_policy - activate a blkcg policy on a request_queue
 816  * @q: request_queue of interest
 817  * @pol: blkcg policy to activate
 818  *
 819  * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
 820  * bypass mode to populate its blkgs with policy_data for @pol.
 821  *
 822  * Activation happens with @q bypassed, so nobody would be accessing blkgs
 823  * from IO path.  Update of each blkg is protected by both queue and blkcg
 824  * locks so that holding either lock and testing blkcg_policy_enabled() is
 825  * always enough for dereferencing policy data.
 826  *
 827  * The caller is responsible for synchronizing [de]activations and policy
 828  * [un]registerations.  Returns 0 on success, -errno on failure.
 829  */
 830 int blkcg_activate_policy(struct request_queue *q,
 831                           const struct blkcg_policy *pol)
 832 {
 833         LIST_HEAD(pds);
 834         struct blkcg_gq *blkg, *new_blkg;
 835         struct blkg_policy_data *pd, *n;
 836         int cnt = 0, ret;
 837         bool preloaded;
 838
 839         if (blkcg_policy_enabled(q, pol))
 840                 return 0;
 841
 842         /* preallocations for root blkg */
 843         new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
 844         if (!new_blkg)
 845                 return -ENOMEM;
 846
 847         preloaded = !radix_tree_preload(GFP_KERNEL);
 848
 849         blk_queue_bypass_start(q);
 850
 851         /*
 852          * Make sure the root blkg exists and count the existing blkgs.  As
 853          * @q is bypassing at this point, blkg_lookup_create() can't be
 854          * used.  Open code it.
 855          */
 856         spin_lock_irq(q->queue_lock);
 857
 858         rcu_read_lock();
 859         blkg = __blkg_lookup(&blkcg_root, q, false);
 860         if (blkg)
 861                 blkg_free(new_blkg);
 862         else
 863                 blkg = blkg_create(&blkcg_root, q, new_blkg);
 864         rcu_read_unlock();
 865
 866         if (preloaded)
 867                 radix_tree_preload_end();
 868
 869         if (IS_ERR(blkg)) {
 870                 ret = PTR_ERR(blkg);
 871                 goto out_unlock;
 872         }
 873         q->root_blkg = blkg;
 874         q->root_rl.blkg = blkg;
 875
 876         list_for_each_entry(blkg, &q->blkg_list, q_node)
 877                 cnt++;
 878
 879         spin_unlock_irq(q->queue_lock);
 880
 881         /* allocate policy_data for all existing blkgs */
 882         while (cnt--) {
 883                 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 884                 if (!pd) {
 885                         ret = -ENOMEM;
 886                         goto out_free;
 887                 }
 888                 list_add_tail(&pd->alloc_node, &pds);
 889         }
 890
 891         /*
 892          * Install the allocated pds.  With @q bypassing, no new blkg
 893          * should have been created while the queue lock was dropped.
 894          */
 895         spin_lock_irq(q->queue_lock);
 896
 897         list_for_each_entry(blkg, &q->blkg_list, q_node) {
 898                 if (WARN_ON(list_empty(&pds))) {
 899                         /* umm... this shouldn't happen, just abort */
 900                         ret = -ENOMEM;
 901                         goto out_unlock;
 902                 }
 903                 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 904                 list_del_init(&pd->alloc_node);
 905
 906                 /* grab blkcg lock too while installing @pd on @blkg */
 907                 spin_lock(&blkg->blkcg->lock);
 908
 909                 blkg->pd[pol->plid] = pd;
 910                 pd->blkg = blkg;
 911                 pol->pd_init_fn(blkg);
 912
 913                 spin_unlock(&blkg->blkcg->lock);
 914         }
 915
 916         __set_bit(pol->plid, q->blkcg_pols);
 917         ret = 0;
 918 out_unlock:
 919         spin_unlock_irq(q->queue_lock);
 920 out_free:
 921         blk_queue_bypass_end(q);
 922         list_for_each_entry_safe(pd, n, &pds, alloc_node)
 923                 kfree(pd);
 924         return ret;
 925 }
 926 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
 927
 928 /**
 929  * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
 930  * @q: request_queue of interest
 931  * @pol: blkcg policy to deactivate
 932  *
 933  * Deactivate @pol on @q.  Follows the same synchronization rules as
 934  * blkcg_activate_policy().
 935  */
 936 void blkcg_deactivate_policy(struct request_queue *q,
 937                              const struct blkcg_policy *pol)
 938 {
 939         struct blkcg_gq *blkg;
 940
 941         if (!blkcg_policy_enabled(q, pol))
 942                 return;
 943
 944         blk_queue_bypass_start(q);
 945         spin_lock_irq(q->queue_lock);
 946
 947         __clear_bit(pol->plid, q->blkcg_pols);
 948
 949         /* if no policy is left, no need for blkgs - shoot them down */
 950         if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
 951                 blkg_destroy_all(q);
 952
 953         list_for_each_entry(blkg, &q->blkg_list, q_node) {
 954                 /* grab blkcg lock too while removing @pd from @blkg */
 955                 spin_lock(&blkg->blkcg->lock);
 956
 957                 if (pol->pd_exit_fn)
 958                         pol->pd_exit_fn(blkg);
 959
 960                 kfree(blkg->pd[pol->plid]);
 961                 blkg->pd[pol->plid] = NULL;
 962
 963                 spin_unlock(&blkg->blkcg->lock);
 964         }
 965
 966         spin_unlock_irq(q->queue_lock);
 967         blk_queue_bypass_end(q);
 968 }
 969 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 970
 971 /**
 972  * blkcg_policy_register - register a blkcg policy
 973  * @pol: blkcg policy to register
 974  *
 975  * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 976  * successful registration.  Returns 0 on success and -errno on failure.
 977  */
 978 int blkcg_policy_register(struct blkcg_policy *pol)
 979 {
 980         int i, ret;
 981
 982         if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
 983                 return -EINVAL;
 984
 985         mutex_lock(&blkcg_pol_mutex);
 986
 987         /* find an empty slot */
 988         ret = -ENOSPC;
 989         for (i = 0; i < BLKCG_MAX_POLS; i++)
 990                 if (!blkcg_policy[i])
 991                         break;
 992         if (i >= BLKCG_MAX_POLS)
 993                 goto out_unlock;
 994
 995         /* register and update blkgs */
 996         pol->plid = i;
 997         blkcg_policy[i] = pol;
 998
 999         /* everything is in place, add intf files for the new policy */
1000         if (pol->cftypes)
1001                 WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
1002         ret = 0;
1003 out_unlock:
1004         mutex_unlock(&blkcg_pol_mutex);
1005         return ret;
1006 }
1007 EXPORT_SYMBOL_GPL(blkcg_policy_register);
1008
1009 /**
1010  * blkcg_policy_unregister - unregister a blkcg policy
1011  * @pol: blkcg policy to unregister
1012  *
1013  * Undo blkcg_policy_register(@pol).  Might sleep.
1014  */
1015 void blkcg_policy_unregister(struct blkcg_policy *pol)
1016 {
1017         mutex_lock(&blkcg_pol_mutex);
1018
1019         if (WARN_ON(blkcg_policy[pol->plid] != pol))
1020                 goto out_unlock;
1021
1022         /* kill the intf files first */
1023         if (pol->cftypes)
1024                 cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
1025
1026         /* unregister and update blkgs */
1027         blkcg_policy[pol->plid] = NULL;
1028 out_unlock:
1029         mutex_unlock(&blkcg_pol_mutex);
1030 }
1031 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);