Pileus Git - ~andy/linux/blob - kernel/workqueue.c

   1 /*
   2  * kernel/workqueue.c - generic async execution with shared worker pool
   3  *
   4  * Copyright (C) 2002           Ingo Molnar
   5  *
   6  *   Derived from the taskqueue/keventd code by:
   7  *     David Woodhouse <dwmw2@infradead.org>
   8  *     Andrew Morton
   9  *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
  10  *     Theodore Ts'o <tytso@mit.edu>
  11  *
  12  * Made to use alloc_percpu by Christoph Lameter.
  13  *
  14  * Copyright (C) 2010           SUSE Linux Products GmbH
  15  * Copyright (C) 2010           Tejun Heo <tj@kernel.org>
  16  *
  17  * This is the generic async execution mechanism.  Work items are
  18  * executed in process context.  The worker pool is shared and
  19  * automatically managed.  There is one worker pool for each CPU and
  20  * one extra for works which are better served by workers which are
  21  * not bound to any specific CPU.
  22  *
  23  * Please read Documentation/workqueue.txt for details.
  24  */
  25
  26 #include <linux/module.h>
  27 #include <linux/kernel.h>
  28 #include <linux/sched.h>
  29 #include <linux/init.h>
  30 #include <linux/signal.h>
  31 #include <linux/completion.h>
  32 #include <linux/workqueue.h>
  33 #include <linux/slab.h>
  34 #include <linux/cpu.h>
  35 #include <linux/notifier.h>
  36 #include <linux/kthread.h>
  37 #include <linux/hardirq.h>
  38 #include <linux/mempolicy.h>
  39 #include <linux/freezer.h>
  40 #include <linux/kallsyms.h>
  41 #include <linux/debug_locks.h>
  42 #include <linux/lockdep.h>
  43 #include <linux/idr.h>
  44
  45 #include "workqueue_sched.h"
  46
  47 enum {
  48         /* global_cwq flags */
  49         GCWQ_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
  50         GCWQ_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
  51         GCWQ_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
  52         GCWQ_FREEZING           = 1 << 3,       /* freeze in progress */
  53         GCWQ_HIGHPRI_PENDING    = 1 << 4,       /* highpri works on queue */
  54
  55         /* worker flags */
  56         WORKER_STARTED          = 1 << 0,       /* started */
  57         WORKER_DIE              = 1 << 1,       /* die die die */
  58         WORKER_IDLE             = 1 << 2,       /* is idle */
  59         WORKER_PREP             = 1 << 3,       /* preparing to run works */
  60         WORKER_ROGUE            = 1 << 4,       /* not bound to any cpu */
  61         WORKER_REBIND           = 1 << 5,       /* mom is home, come back */
  62         WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
  63         WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
  64
  65         WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
  66                                   WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
  67
  68         /* gcwq->trustee_state */
  69         TRUSTEE_START           = 0,            /* start */
  70         TRUSTEE_IN_CHARGE       = 1,            /* trustee in charge of gcwq */
  71         TRUSTEE_BUTCHER         = 2,            /* butcher workers */
  72         TRUSTEE_RELEASE         = 3,            /* release workers */
  73         TRUSTEE_DONE            = 4,            /* trustee is done */
  74
  75         BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
  76         BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
  77         BUSY_WORKER_HASH_MASK   = BUSY_WORKER_HASH_SIZE - 1,
  78
  79         MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
  80         IDLE_WORKER_TIMEOUT     = 300 * HZ,     /* keep idle ones for 5 mins */
  81
  82         MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
  83                                                 /* call for help after 10ms
  84                                                    (min two ticks) */
  85         MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
  86         CREATE_COOLDOWN         = HZ,           /* time to breath after fail */
  87         TRUSTEE_COOLDOWN        = HZ / 10,      /* for trustee draining */
  88
  89         /*
  90          * Rescue workers are used only on emergencies and shared by
  91          * all cpus.  Give -20.
  92          */
  93         RESCUER_NICE_LEVEL      = -20,
  94 };
  95
  96 /*
  97  * Structure fields follow one of the following exclusion rules.
  98  *
  99  * I: Modifiable by initialization/destruction paths and read-only for
 100  *    everyone else.
 101  *
 102  * P: Preemption protected.  Disabling preemption is enough and should
 103  *    only be modified and accessed from the local cpu.
 104  *
 105  * L: gcwq->lock protected.  Access with gcwq->lock held.
 106  *
 107  * X: During normal operation, modification requires gcwq->lock and
 108  *    should be done only from local cpu.  Either disabling preemption
 109  *    on local cpu or grabbing gcwq->lock is enough for read access.
 110  *    If GCWQ_DISASSOCIATED is set, it's identical to L.
 111  *
 112  * F: wq->flush_mutex protected.
 113  *
 114  * W: workqueue_lock protected.
 115  */
 116
 117 struct global_cwq;
 118
 119 /*
 120  * The poor guys doing the actual heavy lifting.  All on-duty workers
 121  * are either serving the manager role, on idle list or on busy hash.
 122  */
 123 struct worker {
 124         /* on idle list while idle, on busy hash table while busy */
 125         union {
 126                 struct list_head        entry;  /* L: while idle */
 127                 struct hlist_node       hentry; /* L: while busy */
 128         };
 129
 130         struct work_struct      *current_work;  /* L: work being processed */
 131         struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
 132         struct list_head        scheduled;      /* L: scheduled works */
 133         struct task_struct      *task;          /* I: worker task */
 134         struct global_cwq       *gcwq;          /* I: the associated gcwq */
 135         /* 64 bytes boundary on 64bit, 32 on 32bit */
 136         unsigned long           last_active;    /* L: last active timestamp */
 137         unsigned int            flags;          /* X: flags */
 138         int                     id;             /* I: worker id */
 139         struct work_struct      rebind_work;    /* L: rebind worker to cpu */
 140 };
 141
 142 /*
 143  * Global per-cpu workqueue.  There's one and only one for each cpu
 144  * and all works are queued and processed here regardless of their
 145  * target workqueues.
 146  */
 147 struct global_cwq {
 148         spinlock_t              lock;           /* the gcwq lock */
 149         struct list_head        worklist;       /* L: list of pending works */
 150         unsigned int            cpu;            /* I: the associated cpu */
 151         unsigned int            flags;          /* L: GCWQ_* flags */
 152
 153         int                     nr_workers;     /* L: total number of workers */
 154         int                     nr_idle;        /* L: currently idle ones */
 155
 156         /* workers are chained either in the idle_list or busy_hash */
 157         struct list_head        idle_list;      /* X: list of idle workers */
 158         struct hlist_head       busy_hash[BUSY_WORKER_HASH_SIZE];
 159                                                 /* L: hash of busy workers */
 160
 161         struct timer_list       idle_timer;     /* L: worker idle timeout */
 162         struct timer_list       mayday_timer;   /* L: SOS timer for dworkers */
 163
 164         struct ida              worker_ida;     /* L: for worker IDs */
 165
 166         struct task_struct      *trustee;       /* L: for gcwq shutdown */
 167         unsigned int            trustee_state;  /* L: trustee state */
 168         wait_queue_head_t       trustee_wait;   /* trustee wait */
 169         struct worker           *first_idle;    /* L: first idle worker */
 170 } ____cacheline_aligned_in_smp;
 171
 172 /*
 173  * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
 174  * work_struct->data are used for flags and thus cwqs need to be
 175  * aligned at two's power of the number of flag bits.
 176  */
 177 struct cpu_workqueue_struct {
 178         struct global_cwq       *gcwq;          /* I: the associated gcwq */
 179         struct workqueue_struct *wq;            /* I: the owning workqueue */
 180         int                     work_color;     /* L: current color */
 181         int                     flush_color;    /* L: flushing color */
 182         int                     nr_in_flight[WORK_NR_COLORS];
 183                                                 /* L: nr of in_flight works */
 184         int                     nr_active;      /* L: nr of active works */
 185         int                     max_active;     /* L: max active works */
 186         struct list_head        delayed_works;  /* L: delayed works */
 187 };
 188
 189 /*
 190  * Structure used to wait for workqueue flush.
 191  */
 192 struct wq_flusher {
 193         struct list_head        list;           /* F: list of flushers */
 194         int                     flush_color;    /* F: flush color waiting for */
 195         struct completion       done;           /* flush completion */
 196 };
 197
 198 /*
 199  * All cpumasks are assumed to be always set on UP and thus can't be
 200  * used to determine whether there's something to be done.
 201  */
 202 #ifdef CONFIG_SMP
 203 typedef cpumask_var_t mayday_mask_t;
 204 #define mayday_test_and_set_cpu(cpu, mask)      \
 205         cpumask_test_and_set_cpu((cpu), (mask))
 206 #define mayday_clear_cpu(cpu, mask)             cpumask_clear_cpu((cpu), (mask))
 207 #define for_each_mayday_cpu(cpu, mask)          for_each_cpu((cpu), (mask))
 208 #define alloc_mayday_mask(maskp, gfp)           zalloc_cpumask_var((maskp), (gfp))
 209 #define free_mayday_mask(mask)                  free_cpumask_var((mask))
 210 #else
 211 typedef unsigned long mayday_mask_t;
 212 #define mayday_test_and_set_cpu(cpu, mask)      test_and_set_bit(0, &(mask))
 213 #define mayday_clear_cpu(cpu, mask)             clear_bit(0, &(mask))
 214 #define for_each_mayday_cpu(cpu, mask)          if ((cpu) = 0, (mask))
 215 #define alloc_mayday_mask(maskp, gfp)           true
 216 #define free_mayday_mask(mask)                  do { } while (0)
 217 #endif
 218
 219 /*
 220  * The externally visible workqueue abstraction is an array of
 221  * per-CPU workqueues:
 222  */
 223 struct workqueue_struct {
 224         unsigned int            flags;          /* I: WQ_* flags */
 225         union {
 226                 struct cpu_workqueue_struct __percpu    *pcpu;
 227                 struct cpu_workqueue_struct             *single;
 228                 unsigned long                           v;
 229         } cpu_wq;                               /* I: cwq's */
 230         struct list_head        list;           /* W: list of all workqueues */
 231
 232         struct mutex            flush_mutex;    /* protects wq flushing */
 233         int                     work_color;     /* F: current work color */
 234         int                     flush_color;    /* F: current flush color */
 235         atomic_t                nr_cwqs_to_flush; /* flush in progress */
 236         struct wq_flusher       *first_flusher; /* F: first flusher */
 237         struct list_head        flusher_queue;  /* F: flush waiters */
 238         struct list_head        flusher_overflow; /* F: flush overflow list */
 239
 240         mayday_mask_t           mayday_mask;    /* cpus requesting rescue */
 241         struct worker           *rescuer;       /* I: rescue worker */
 242
 243         int                     saved_max_active; /* W: saved cwq max_active */
 244         const char              *name;          /* I: workqueue name */
 245 #ifdef CONFIG_LOCKDEP
 246         struct lockdep_map      lockdep_map;
 247 #endif
 248 };
 249
 250 struct workqueue_struct *system_wq __read_mostly;
 251 struct workqueue_struct *system_long_wq __read_mostly;
 252 struct workqueue_struct *system_nrt_wq __read_mostly;
 253 struct workqueue_struct *system_unbound_wq __read_mostly;
 254 struct workqueue_struct *system_freezeable_wq __read_mostly;
 255 EXPORT_SYMBOL_GPL(system_wq);
 256 EXPORT_SYMBOL_GPL(system_long_wq);
 257 EXPORT_SYMBOL_GPL(system_nrt_wq);
 258 EXPORT_SYMBOL_GPL(system_unbound_wq);
 259 EXPORT_SYMBOL_GPL(system_freezeable_wq);
 260
 261 #define CREATE_TRACE_POINTS
 262 #include <trace/events/workqueue.h>
 263
 264 #define for_each_busy_worker(worker, i, pos, gcwq)                      \
 265         for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
 266                 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
 267
 268 static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
 269                                   unsigned int sw)
 270 {
 271         if (cpu < nr_cpu_ids) {
 272                 if (sw & 1) {
 273                         cpu = cpumask_next(cpu, mask);
 274                         if (cpu < nr_cpu_ids)
 275                                 return cpu;
 276                 }
 277                 if (sw & 2)
 278                         return WORK_CPU_UNBOUND;
 279         }
 280         return WORK_CPU_NONE;
 281 }
 282
 283 static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
 284                                 struct workqueue_struct *wq)
 285 {
 286         return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
 287 }
 288
 289 /*
 290  * CPU iterators
 291  *
 292  * An extra gcwq is defined for an invalid cpu number
 293  * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
 294  * specific CPU.  The following iterators are similar to
 295  * for_each_*_cpu() iterators but also considers the unbound gcwq.
 296  *
 297  * for_each_gcwq_cpu()          : possible CPUs + WORK_CPU_UNBOUND
 298  * for_each_online_gcwq_cpu()   : online CPUs + WORK_CPU_UNBOUND
 299  * for_each_cwq_cpu()           : possible CPUs for bound workqueues,
 300  *                                WORK_CPU_UNBOUND for unbound workqueues
 301  */
 302 #define for_each_gcwq_cpu(cpu)                                          \
 303         for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);         \
 304              (cpu) < WORK_CPU_NONE;                                     \
 305              (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
 306
 307 #define for_each_online_gcwq_cpu(cpu)                                   \
 308         for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);           \
 309              (cpu) < WORK_CPU_NONE;                                     \
 310              (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
 311
 312 #define for_each_cwq_cpu(cpu, wq)                                       \
 313         for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));        \
 314              (cpu) < WORK_CPU_NONE;                                     \
 315              (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
 316
 317 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 318
 319 static struct debug_obj_descr work_debug_descr;
 320
 321 /*
 322  * fixup_init is called when:
 323  * - an active object is initialized
 324  */
 325 static int work_fixup_init(void *addr, enum debug_obj_state state)
 326 {
 327         struct work_struct *work = addr;
 328
 329         switch (state) {
 330         case ODEBUG_STATE_ACTIVE:
 331                 cancel_work_sync(work);
 332                 debug_object_init(work, &work_debug_descr);
 333                 return 1;
 334         default:
 335                 return 0;
 336         }
 337 }
 338
 339 /*
 340  * fixup_activate is called when:
 341  * - an active object is activated
 342  * - an unknown object is activated (might be a statically initialized object)
 343  */
 344 static int work_fixup_activate(void *addr, enum debug_obj_state state)
 345 {
 346         struct work_struct *work = addr;
 347
 348         switch (state) {
 349
 350         case ODEBUG_STATE_NOTAVAILABLE:
 351                 /*
 352                  * This is not really a fixup. The work struct was
 353                  * statically initialized. We just make sure that it
 354                  * is tracked in the object tracker.
 355                  */
 356                 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 357                         debug_object_init(work, &work_debug_descr);
 358                         debug_object_activate(work, &work_debug_descr);
 359                         return 0;
 360                 }
 361                 WARN_ON_ONCE(1);
 362                 return 0;
 363
 364         case ODEBUG_STATE_ACTIVE:
 365                 WARN_ON(1);
 366
 367         default:
 368                 return 0;
 369         }
 370 }
 371
 372 /*
 373  * fixup_free is called when:
 374  * - an active object is freed
 375  */
 376 static int work_fixup_free(void *addr, enum debug_obj_state state)
 377 {
 378         struct work_struct *work = addr;
 379
 380         switch (state) {
 381         case ODEBUG_STATE_ACTIVE:
 382                 cancel_work_sync(work);
 383                 debug_object_free(work, &work_debug_descr);
 384                 return 1;
 385         default:
 386                 return 0;
 387         }
 388 }
 389
 390 static struct debug_obj_descr work_debug_descr = {
 391         .name           = "work_struct",
 392         .fixup_init     = work_fixup_init,
 393         .fixup_activate = work_fixup_activate,
 394         .fixup_free     = work_fixup_free,
 395 };
 396
 397 static inline void debug_work_activate(struct work_struct *work)
 398 {
 399         debug_object_activate(work, &work_debug_descr);
 400 }
 401
 402 static inline void debug_work_deactivate(struct work_struct *work)
 403 {
 404         debug_object_deactivate(work, &work_debug_descr);
 405 }
 406
 407 void __init_work(struct work_struct *work, int onstack)
 408 {
 409         if (onstack)
 410                 debug_object_init_on_stack(work, &work_debug_descr);
 411         else
 412                 debug_object_init(work, &work_debug_descr);
 413 }
 414 EXPORT_SYMBOL_GPL(__init_work);
 415
 416 void destroy_work_on_stack(struct work_struct *work)
 417 {
 418         debug_object_free(work, &work_debug_descr);
 419 }
 420 EXPORT_SYMBOL_GPL(destroy_work_on_stack);
 421
 422 #else
 423 static inline void debug_work_activate(struct work_struct *work) { }
 424 static inline void debug_work_deactivate(struct work_struct *work) { }
 425 #endif
 426
 427 /* Serializes the accesses to the list of workqueues. */
 428 static DEFINE_SPINLOCK(workqueue_lock);
 429 static LIST_HEAD(workqueues);
 430 static bool workqueue_freezing;         /* W: have wqs started freezing? */
 431
 432 /*
 433  * The almighty global cpu workqueues.  nr_running is the only field
 434  * which is expected to be used frequently by other cpus via
 435  * try_to_wake_up().  Put it in a separate cacheline.
 436  */
 437 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
 438 static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
 439
 440 /*
 441  * Global cpu workqueue and nr_running counter for unbound gcwq.  The
 442  * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
 443  * workers have WORKER_UNBOUND set.
 444  */
 445 static struct global_cwq unbound_global_cwq;
 446 static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);       /* always 0 */
 447
 448 static int worker_thread(void *__worker);
 449
 450 static struct global_cwq *get_gcwq(unsigned int cpu)
 451 {
 452         if (cpu != WORK_CPU_UNBOUND)
 453                 return &per_cpu(global_cwq, cpu);
 454         else
 455                 return &unbound_global_cwq;
 456 }
 457
 458 static atomic_t *get_gcwq_nr_running(unsigned int cpu)
 459 {
 460         if (cpu != WORK_CPU_UNBOUND)
 461                 return &per_cpu(gcwq_nr_running, cpu);
 462         else
 463                 return &unbound_gcwq_nr_running;
 464 }
 465
 466 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 467                                             struct workqueue_struct *wq)
 468 {
 469         if (!(wq->flags & WQ_UNBOUND)) {
 470                 if (likely(cpu < nr_cpu_ids)) {
 471 #ifdef CONFIG_SMP
 472                         return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
 473 #else
 474                         return wq->cpu_wq.single;
 475 #endif
 476                 }
 477         } else if (likely(cpu == WORK_CPU_UNBOUND))
 478                 return wq->cpu_wq.single;
 479         return NULL;
 480 }
 481
 482 static unsigned int work_color_to_flags(int color)
 483 {
 484         return color << WORK_STRUCT_COLOR_SHIFT;
 485 }
 486
 487 static int get_work_color(struct work_struct *work)
 488 {
 489         return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
 490                 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
 491 }
 492
 493 static int work_next_color(int color)
 494 {
 495         return (color + 1) % WORK_NR_COLORS;
 496 }
 497
 498 /*
 499  * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
 500  * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is
 501  * cleared and the work data contains the cpu number it was last on.
 502  *
 503  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
 504  * cwq, cpu or clear work->data.  These functions should only be
 505  * called while the work is owned - ie. while the PENDING bit is set.
 506  *
 507  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
 508  * corresponding to a work.  gcwq is available once the work has been
 509  * queued anywhere after initialization.  cwq is available only from
 510  * queueing until execution starts.
 511  */
 512 static inline void set_work_data(struct work_struct *work, unsigned long data,
 513                                  unsigned long flags)
 514 {
 515         BUG_ON(!work_pending(work));
 516         atomic_long_set(&work->data, data | flags | work_static(work));
 517 }
 518
 519 static void set_work_cwq(struct work_struct *work,
 520                          struct cpu_workqueue_struct *cwq,
 521                          unsigned long extra_flags)
 522 {
 523         set_work_data(work, (unsigned long)cwq,
 524                       WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
 525 }
 526
 527 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
 528 {
 529         set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
 530 }
 531
 532 static void clear_work_data(struct work_struct *work)
 533 {
 534         set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 535 }
 536
 537 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
 538 {
 539         unsigned long data = atomic_long_read(&work->data);
 540
 541         if (data & WORK_STRUCT_CWQ)
 542                 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 543         else
 544                 return NULL;
 545 }
 546
 547 static struct global_cwq *get_work_gcwq(struct work_struct *work)
 548 {
 549         unsigned long data = atomic_long_read(&work->data);
 550         unsigned int cpu;
 551
 552         if (data & WORK_STRUCT_CWQ)
 553                 return ((struct cpu_workqueue_struct *)
 554                         (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
 555
 556         cpu = data >> WORK_STRUCT_FLAG_BITS;
 557         if (cpu == WORK_CPU_NONE)
 558                 return NULL;
 559
 560         BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
 561         return get_gcwq(cpu);
 562 }
 563
 564 /*
 565  * Policy functions.  These define the policies on how the global
 566  * worker pool is managed.  Unless noted otherwise, these functions
 567  * assume that they're being called with gcwq->lock held.
 568  */
 569
 570 static bool __need_more_worker(struct global_cwq *gcwq)
 571 {
 572         return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
 573                 gcwq->flags & GCWQ_HIGHPRI_PENDING;
 574 }
 575
 576 /*
 577  * Need to wake up a worker?  Called from anything but currently
 578  * running workers.
 579  */
 580 static bool need_more_worker(struct global_cwq *gcwq)
 581 {
 582         return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
 583 }
 584
 585 /* Can I start working?  Called from busy but !running workers. */
 586 static bool may_start_working(struct global_cwq *gcwq)
 587 {
 588         return gcwq->nr_idle;
 589 }
 590
 591 /* Do I need to keep working?  Called from currently running workers. */
 592 static bool keep_working(struct global_cwq *gcwq)
 593 {
 594         atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
 595
 596         return !list_empty(&gcwq->worklist) &&
 597                 (atomic_read(nr_running) <= 1 ||
 598                  gcwq->flags & GCWQ_HIGHPRI_PENDING);
 599 }
 600
 601 /* Do we need a new worker?  Called from manager. */
 602 static bool need_to_create_worker(struct global_cwq *gcwq)
 603 {
 604         return need_more_worker(gcwq) && !may_start_working(gcwq);
 605 }
 606
 607 /* Do I need to be the manager? */
 608 static bool need_to_manage_workers(struct global_cwq *gcwq)
 609 {
 610         return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
 611 }
 612
 613 /* Do we have too many workers and should some go away? */
 614 static bool too_many_workers(struct global_cwq *gcwq)
 615 {
 616         bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
 617         int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
 618         int nr_busy = gcwq->nr_workers - nr_idle;
 619
 620         return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 621 }
 622
 623 /*
 624  * Wake up functions.
 625  */
 626
 627 /* Return the first worker.  Safe with preemption disabled */
 628 static struct worker *first_worker(struct global_cwq *gcwq)
 629 {
 630         if (unlikely(list_empty(&gcwq->idle_list)))
 631                 return NULL;
 632
 633         return list_first_entry(&gcwq->idle_list, struct worker, entry);
 634 }
 635
 636 /**
 637  * wake_up_worker - wake up an idle worker
 638  * @gcwq: gcwq to wake worker for
 639  *
 640  * Wake up the first idle worker of @gcwq.
 641  *
 642  * CONTEXT:
 643  * spin_lock_irq(gcwq->lock).
 644  */
 645 static void wake_up_worker(struct global_cwq *gcwq)
 646 {
 647         struct worker *worker = first_worker(gcwq);
 648
 649         if (likely(worker))
 650                 wake_up_process(worker->task);
 651 }
 652
 653 /**
 654  * wq_worker_waking_up - a worker is waking up
 655  * @task: task waking up
 656  * @cpu: CPU @task is waking up to
 657  *
 658  * This function is called during try_to_wake_up() when a worker is
 659  * being awoken.
 660  *
 661  * CONTEXT:
 662  * spin_lock_irq(rq->lock)
 663  */
 664 void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
 665 {
 666         struct worker *worker = kthread_data(task);
 667
 668         if (!(worker->flags & WORKER_NOT_RUNNING))
 669                 atomic_inc(get_gcwq_nr_running(cpu));
 670 }
 671
 672 /**
 673  * wq_worker_sleeping - a worker is going to sleep
 674  * @task: task going to sleep
 675  * @cpu: CPU in question, must be the current CPU number
 676  *
 677  * This function is called during schedule() when a busy worker is
 678  * going to sleep.  Worker on the same cpu can be woken up by
 679  * returning pointer to its task.
 680  *
 681  * CONTEXT:
 682  * spin_lock_irq(rq->lock)
 683  *
 684  * RETURNS:
 685  * Worker task on @cpu to wake up, %NULL if none.
 686  */
 687 struct task_struct *wq_worker_sleeping(struct task_struct *task,
 688                                        unsigned int cpu)
 689 {
 690         struct worker *worker = kthread_data(task), *to_wakeup = NULL;
 691         struct global_cwq *gcwq = get_gcwq(cpu);
 692         atomic_t *nr_running = get_gcwq_nr_running(cpu);
 693
 694         if (worker->flags & WORKER_NOT_RUNNING)
 695                 return NULL;
 696
 697         /* this can only happen on the local cpu */
 698         BUG_ON(cpu != raw_smp_processor_id());
 699
 700         /*
 701          * The counterpart of the following dec_and_test, implied mb,
 702          * worklist not empty test sequence is in insert_work().
 703          * Please read comment there.
 704          *
 705          * NOT_RUNNING is clear.  This means that trustee is not in
 706          * charge and we're running on the local cpu w/ rq lock held
 707          * and preemption disabled, which in turn means that none else
 708          * could be manipulating idle_list, so dereferencing idle_list
 709          * without gcwq lock is safe.
 710          */
 711         if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
 712                 to_wakeup = first_worker(gcwq);
 713         return to_wakeup ? to_wakeup->task : NULL;
 714 }
 715
 716 /**
 717  * worker_set_flags - set worker flags and adjust nr_running accordingly
 718  * @worker: self
 719  * @flags: flags to set
 720  * @wakeup: wakeup an idle worker if necessary
 721  *
 722  * Set @flags in @worker->flags and adjust nr_running accordingly.  If
 723  * nr_running becomes zero and @wakeup is %true, an idle worker is
 724  * woken up.
 725  *
 726  * CONTEXT:
 727  * spin_lock_irq(gcwq->lock)
 728  */
 729 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 730                                     bool wakeup)
 731 {
 732         struct global_cwq *gcwq = worker->gcwq;
 733
 734         WARN_ON_ONCE(worker->task != current);
 735
 736         /*
 737          * If transitioning into NOT_RUNNING, adjust nr_running and
 738          * wake up an idle worker as necessary if requested by
 739          * @wakeup.
 740          */
 741         if ((flags & WORKER_NOT_RUNNING) &&
 742             !(worker->flags & WORKER_NOT_RUNNING)) {
 743                 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
 744
 745                 if (wakeup) {
 746                         if (atomic_dec_and_test(nr_running) &&
 747                             !list_empty(&gcwq->worklist))
 748                                 wake_up_worker(gcwq);
 749                 } else
 750                         atomic_dec(nr_running);
 751         }
 752
 753         worker->flags |= flags;
 754 }
 755
 756 /**
 757  * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 758  * @worker: self
 759  * @flags: flags to clear
 760  *
 761  * Clear @flags in @worker->flags and adjust nr_running accordingly.
 762  *
 763  * CONTEXT:
 764  * spin_lock_irq(gcwq->lock)
 765  */
 766 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 767 {
 768         struct global_cwq *gcwq = worker->gcwq;
 769         unsigned int oflags = worker->flags;
 770
 771         WARN_ON_ONCE(worker->task != current);
 772
 773         worker->flags &= ~flags;
 774
 775         /*
 776          * If transitioning out of NOT_RUNNING, increment nr_running.  Note
 777          * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
 778          * of multiple flags, not a single flag.
 779          */
 780         if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 781                 if (!(worker->flags & WORKER_NOT_RUNNING))
 782                         atomic_inc(get_gcwq_nr_running(gcwq->cpu));
 783 }
 784
 785 /**
 786  * busy_worker_head - return the busy hash head for a work
 787  * @gcwq: gcwq of interest
 788  * @work: work to be hashed
 789  *
 790  * Return hash head of @gcwq for @work.
 791  *
 792  * CONTEXT:
 793  * spin_lock_irq(gcwq->lock).
 794  *
 795  * RETURNS:
 796  * Pointer to the hash head.
 797  */
 798 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
 799                                            struct work_struct *work)
 800 {
 801         const int base_shift = ilog2(sizeof(struct work_struct));
 802         unsigned long v = (unsigned long)work;
 803
 804         /* simple shift and fold hash, do we need something better? */
 805         v >>= base_shift;
 806         v += v >> BUSY_WORKER_HASH_ORDER;
 807         v &= BUSY_WORKER_HASH_MASK;
 808
 809         return &gcwq->busy_hash[v];
 810 }
 811
 812 /**
 813  * __find_worker_executing_work - find worker which is executing a work
 814  * @gcwq: gcwq of interest
 815  * @bwh: hash head as returned by busy_worker_head()
 816  * @work: work to find worker for
 817  *
 818  * Find a worker which is executing @work on @gcwq.  @bwh should be
 819  * the hash head obtained by calling busy_worker_head() with the same
 820  * work.
 821  *
 822  * CONTEXT:
 823  * spin_lock_irq(gcwq->lock).
 824  *
 825  * RETURNS:
 826  * Pointer to worker which is executing @work if found, NULL
 827  * otherwise.
 828  */
 829 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
 830                                                    struct hlist_head *bwh,
 831                                                    struct work_struct *work)
 832 {
 833         struct worker *worker;
 834         struct hlist_node *tmp;
 835
 836         hlist_for_each_entry(worker, tmp, bwh, hentry)
 837                 if (worker->current_work == work)
 838                         return worker;
 839         return NULL;
 840 }
 841
 842 /**
 843  * find_worker_executing_work - find worker which is executing a work
 844  * @gcwq: gcwq of interest
 845  * @work: work to find worker for
 846  *
 847  * Find a worker which is executing @work on @gcwq.  This function is
 848  * identical to __find_worker_executing_work() except that this
 849  * function calculates @bwh itself.
 850  *
 851  * CONTEXT:
 852  * spin_lock_irq(gcwq->lock).
 853  *
 854  * RETURNS:
 855  * Pointer to worker which is executing @work if found, NULL
 856  * otherwise.
 857  */
 858 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 859                                                  struct work_struct *work)
 860 {
 861         return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
 862                                             work);
 863 }
 864
 865 /**
 866  * gcwq_determine_ins_pos - find insertion position
 867  * @gcwq: gcwq of interest
 868  * @cwq: cwq a work is being queued for
 869  *
 870  * A work for @cwq is about to be queued on @gcwq, determine insertion
 871  * position for the work.  If @cwq is for HIGHPRI wq, the work is
 872  * queued at the head of the queue but in FIFO order with respect to
 873  * other HIGHPRI works; otherwise, at the end of the queue.  This
 874  * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
 875  * there are HIGHPRI works pending.
 876  *
 877  * CONTEXT:
 878  * spin_lock_irq(gcwq->lock).
 879  *
 880  * RETURNS:
 881  * Pointer to inserstion position.
 882  */
 883 static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
 884                                                struct cpu_workqueue_struct *cwq)
 885 {
 886         struct work_struct *twork;
 887
 888         if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
 889                 return &gcwq->worklist;
 890
 891         list_for_each_entry(twork, &gcwq->worklist, entry) {
 892                 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
 893
 894                 if (!(tcwq->wq->flags & WQ_HIGHPRI))
 895                         break;
 896         }
 897
 898         gcwq->flags |= GCWQ_HIGHPRI_PENDING;
 899         return &twork->entry;
 900 }
 901
 902 /**
 903  * insert_work - insert a work into gcwq
 904  * @cwq: cwq @work belongs to
 905  * @work: work to insert
 906  * @head: insertion point
 907  * @extra_flags: extra WORK_STRUCT_* flags to set
 908  *
 909  * Insert @work which belongs to @cwq into @gcwq after @head.
 910  * @extra_flags is or'd to work_struct flags.
 911  *
 912  * CONTEXT:
 913  * spin_lock_irq(gcwq->lock).
 914  */
 915 static void insert_work(struct cpu_workqueue_struct *cwq,
 916                         struct work_struct *work, struct list_head *head,
 917                         unsigned int extra_flags)
 918 {
 919         struct global_cwq *gcwq = cwq->gcwq;
 920
 921         /* we own @work, set data and link */
 922         set_work_cwq(work, cwq, extra_flags);
 923
 924         /*
 925          * Ensure that we get the right work->data if we see the
 926          * result of list_add() below, see try_to_grab_pending().
 927          */
 928         smp_wmb();
 929
 930         list_add_tail(&work->entry, head);
 931
 932         /*
 933          * Ensure either worker_sched_deactivated() sees the above
 934          * list_add_tail() or we see zero nr_running to avoid workers
 935          * lying around lazily while there are works to be processed.
 936          */
 937         smp_mb();
 938
 939         if (__need_more_worker(gcwq))
 940                 wake_up_worker(gcwq);
 941 }
 942
 943 /*
 944  * Test whether @work is being queued from another work executing on the
 945  * same workqueue.  This is rather expensive and should only be used from
 946  * cold paths.
 947  */
 948 static bool is_chained_work(struct workqueue_struct *wq)
 949 {
 950         unsigned long flags;
 951         unsigned int cpu;
 952
 953         for_each_gcwq_cpu(cpu) {
 954                 struct global_cwq *gcwq = get_gcwq(cpu);
 955                 struct worker *worker;
 956                 struct hlist_node *pos;
 957                 int i;
 958
 959                 spin_lock_irqsave(&gcwq->lock, flags);
 960                 for_each_busy_worker(worker, i, pos, gcwq) {
 961                         if (worker->task != current)
 962                                 continue;
 963                         spin_unlock_irqrestore(&gcwq->lock, flags);
 964                         /*
 965                          * I'm @worker, no locking necessary.  See if @work
 966                          * is headed to the same workqueue.
 967                          */
 968                         return worker->current_cwq->wq == wq;
 969                 }
 970                 spin_unlock_irqrestore(&gcwq->lock, flags);
 971         }
 972         return false;
 973 }
 974
 975 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 976                          struct work_struct *work)
 977 {
 978         struct global_cwq *gcwq;
 979         struct cpu_workqueue_struct *cwq;
 980         struct list_head *worklist;
 981         unsigned int work_flags;
 982         unsigned long flags;
 983
 984         debug_work_activate(work);
 985
 986         /* if dying, only works from the same workqueue are allowed */
 987         if (unlikely(wq->flags & WQ_DYING) &&
 988             WARN_ON_ONCE(!is_chained_work(wq)))
 989                 return;
 990
 991         /* determine gcwq to use */
 992         if (!(wq->flags & WQ_UNBOUND)) {
 993                 struct global_cwq *last_gcwq;
 994
 995                 if (unlikely(cpu == WORK_CPU_UNBOUND))
 996                         cpu = raw_smp_processor_id();
 997
 998                 /*
 999                  * It's multi cpu.  If @wq is non-reentrant and @work
1000                  * was previously on a different cpu, it might still
1001                  * be running there, in which case the work needs to
1002                  * be queued on that cpu to guarantee non-reentrance.
1003                  */
1004                 gcwq = get_gcwq(cpu);
1005                 if (wq->flags & WQ_NON_REENTRANT &&
1006                     (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
1007                         struct worker *worker;
1008
1009                         spin_lock_irqsave(&last_gcwq->lock, flags);
1010
1011                         worker = find_worker_executing_work(last_gcwq, work);
1012
1013                         if (worker && worker->current_cwq->wq == wq)
1014                                 gcwq = last_gcwq;
1015                         else {
1016                                 /* meh... not running there, queue here */
1017                                 spin_unlock_irqrestore(&last_gcwq->lock, flags);
1018                                 spin_lock_irqsave(&gcwq->lock, flags);
1019                         }
1020                 } else
1021                         spin_lock_irqsave(&gcwq->lock, flags);
1022         } else {
1023                 gcwq = get_gcwq(WORK_CPU_UNBOUND);
1024                 spin_lock_irqsave(&gcwq->lock, flags);
1025         }
1026
1027         /* gcwq determined, get cwq and queue */
1028         cwq = get_cwq(gcwq->cpu, wq);
1029         trace_workqueue_queue_work(cpu, cwq, work);
1030
1031         BUG_ON(!list_empty(&work->entry));
1032
1033         cwq->nr_in_flight[cwq->work_color]++;
1034         work_flags = work_color_to_flags(cwq->work_color);
1035
1036         if (likely(cwq->nr_active < cwq->max_active)) {
1037                 trace_workqueue_activate_work(work);
1038                 cwq->nr_active++;
1039                 worklist = gcwq_determine_ins_pos(gcwq, cwq);
1040         } else {
1041                 work_flags |= WORK_STRUCT_DELAYED;
1042                 worklist = &cwq->delayed_works;
1043         }
1044
1045         insert_work(cwq, work, worklist, work_flags);
1046
1047         spin_unlock_irqrestore(&gcwq->lock, flags);
1048 }
1049
/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 *
 * The work is handed to queue_work_on() for the cpu the caller is
 * running on.  We queue the work to the CPU on which it was
 * submitted, but if the CPU dies it can be processed by another CPU.
 */
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
	int cpu = get_cpu();
	int queued = queue_work_on(cpu, wq, work);

	put_cpu();

	return queued;
}
EXPORT_SYMBOL_GPL(queue_work);
1070
1071 /**
1072  * queue_work_on - queue work on specific cpu
1073  * @cpu: CPU number to execute work on
1074  * @wq: workqueue to use
1075  * @work: work to queue
1076  *
1077  * Returns 0 if @work was already on a queue, non-zero otherwise.
1078  *
1079  * We queue the work to a specific CPU, the caller must ensure it
1080  * can't go away.
1081  */
1082 int
1083 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
1084 {
1085         int ret = 0;
1086
1087         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1088                 __queue_work(cpu, wq, work);
1089                 ret = 1;
1090         }
1091         return ret;
1092 }
1093 EXPORT_SYMBOL_GPL(queue_work_on);
1094
1095 static void delayed_work_timer_fn(unsigned long __data)
1096 {
1097         struct delayed_work *dwork = (struct delayed_work *)__data;
1098         struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1099
1100         __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1101 }
1102
1103 /**
1104  * queue_delayed_work - queue work on a workqueue after delay
1105  * @wq: workqueue to use
1106  * @dwork: delayable work to queue
1107  * @delay: number of jiffies to wait before queueing
1108  *
1109  * Returns 0 if @work was already on a queue, non-zero otherwise.
1110  */
1111 int queue_delayed_work(struct workqueue_struct *wq,
1112                         struct delayed_work *dwork, unsigned long delay)
1113 {
1114         if (delay == 0)
1115                 return queue_work(wq, &dwork->work);
1116
1117         return queue_delayed_work_on(-1, wq, dwork, delay);
1118 }
1119 EXPORT_SYMBOL_GPL(queue_delayed_work);
1120
1121 /**
1122  * queue_delayed_work_on - queue work on specific CPU after delay
1123  * @cpu: CPU number to execute work on
1124  * @wq: workqueue to use
1125  * @dwork: work to queue
1126  * @delay: number of jiffies to wait before queueing
1127  *
1128  * Returns 0 if @work was already on a queue, non-zero otherwise.
1129  */
1130 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1131                         struct delayed_work *dwork, unsigned long delay)
1132 {
1133         int ret = 0;
1134         struct timer_list *timer = &dwork->timer;
1135         struct work_struct *work = &dwork->work;
1136
1137         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1138                 unsigned int lcpu;
1139
1140                 BUG_ON(timer_pending(timer));
1141                 BUG_ON(!list_empty(&work->entry));
1142
1143                 timer_stats_timer_set_start_info(&dwork->timer);
1144
1145                 /*
1146                  * This stores cwq for the moment, for the timer_fn.
1147                  * Note that the work's gcwq is preserved to allow
1148                  * reentrance detection for delayed works.
1149                  */
1150                 if (!(wq->flags & WQ_UNBOUND)) {
1151                         struct global_cwq *gcwq = get_work_gcwq(work);
1152
1153                         if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
1154                                 lcpu = gcwq->cpu;
1155                         else
1156                                 lcpu = raw_smp_processor_id();
1157                 } else
1158                         lcpu = WORK_CPU_UNBOUND;
1159
1160                 set_work_cwq(work, get_cwq(lcpu, wq), 0);
1161
1162                 timer->expires = jiffies + delay;
1163                 timer->data = (unsigned long)dwork;
1164                 timer->function = delayed_work_timer_fn;
1165
1166                 if (unlikely(cpu >= 0))
1167                         add_timer_on(timer, cpu);
1168                 else
1169                         add_timer(timer);
1170                 ret = 1;
1171         }
1172         return ret;
1173 }
1174 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1175
1176 /**
1177  * worker_enter_idle - enter idle state
1178  * @worker: worker which is entering idle state
1179  *
1180  * @worker is entering idle state.  Update stats and idle timer if
1181  * necessary.
1182  *
1183  * LOCKING:
1184  * spin_lock_irq(gcwq->lock).
1185  */
1186 static void worker_enter_idle(struct worker *worker)
1187 {
1188         struct global_cwq *gcwq = worker->gcwq;
1189
1190         BUG_ON(worker->flags & WORKER_IDLE);
1191         BUG_ON(!list_empty(&worker->entry) &&
1192                (worker->hentry.next || worker->hentry.pprev));
1193
1194         /* can't use worker_set_flags(), also called from start_worker() */
1195         worker->flags |= WORKER_IDLE;
1196         gcwq->nr_idle++;
1197         worker->last_active = jiffies;
1198
1199         /* idle_list is LIFO */
1200         list_add(&worker->entry, &gcwq->idle_list);
1201
1202         if (likely(!(worker->flags & WORKER_ROGUE))) {
1203                 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
1204                         mod_timer(&gcwq->idle_timer,
1205                                   jiffies + IDLE_WORKER_TIMEOUT);
1206         } else
1207                 wake_up_all(&gcwq->trustee_wait);
1208
1209         /* sanity check nr_running */
1210         WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
1211                      atomic_read(get_gcwq_nr_running(gcwq->cpu)));
1212 }
1213
1214 /**
1215  * worker_leave_idle - leave idle state
1216  * @worker: worker which is leaving idle state
1217  *
1218  * @worker is leaving idle state.  Update stats.
1219  *
1220  * LOCKING:
1221  * spin_lock_irq(gcwq->lock).
1222  */
1223 static void worker_leave_idle(struct worker *worker)
1224 {
1225         struct global_cwq *gcwq = worker->gcwq;
1226
1227         BUG_ON(!(worker->flags & WORKER_IDLE));
1228         worker_clr_flags(worker, WORKER_IDLE);
1229         gcwq->nr_idle--;
1230         list_del_init(&worker->entry);
1231 }
1232
1233 /**
1234  * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
1235  * @worker: self
1236  *
1237  * Works which are scheduled while the cpu is online must at least be
1238  * scheduled to a worker which is bound to the cpu so that if they are
1239  * flushed from cpu callbacks while cpu is going down, they are
1240  * guaranteed to execute on the cpu.
1241  *
1242  * This function is to be used by rogue workers and rescuers to bind
1243  * themselves to the target cpu and may race with cpu going down or
1244  * coming online.  kthread_bind() can't be used because it may put the
1245  * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1246  * verbatim as it's best effort and blocking and gcwq may be
1247  * [dis]associated in the meantime.
1248  *
1249  * This function tries set_cpus_allowed() and locks gcwq and verifies
1250  * the binding against GCWQ_DISASSOCIATED which is set during
1251  * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
1252  * idle state or fetches works without dropping lock, it can guarantee
1253  * the scheduling requirement described in the first paragraph.
1254  *
1255  * CONTEXT:
1256  * Might sleep.  Called without any lock but returns with gcwq->lock
1257  * held.
1258  *
1259  * RETURNS:
1260  * %true if the associated gcwq is online (@worker is successfully
1261  * bound), %false if offline.
1262  */
1263 static bool worker_maybe_bind_and_lock(struct worker *worker)
1264 __acquires(&gcwq->lock)
1265 {
1266         struct global_cwq *gcwq = worker->gcwq;
1267         struct task_struct *task = worker->task;
1268
1269         while (true) {
1270                 /*
1271                  * The following call may fail, succeed or succeed
1272                  * without actually migrating the task to the cpu if
1273                  * it races with cpu hotunplug operation.  Verify
1274                  * against GCWQ_DISASSOCIATED.
1275                  */
1276                 if (!(gcwq->flags & GCWQ_DISASSOCIATED))
1277                         set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
1278
1279                 spin_lock_irq(&gcwq->lock);
1280                 if (gcwq->flags & GCWQ_DISASSOCIATED)
1281                         return false;
1282                 if (task_cpu(task) == gcwq->cpu &&
1283                     cpumask_equal(&current->cpus_allowed,
1284                                   get_cpu_mask(gcwq->cpu)))
1285                         return true;
1286                 spin_unlock_irq(&gcwq->lock);
1287
1288                 /* CPU has come up inbetween, retry migration */
1289                 cpu_relax();
1290         }
1291 }
1292
1293 /*
1294  * Function for worker->rebind_work used to rebind rogue busy workers
1295  * to the associated cpu which is coming back online.  This is
1296  * scheduled by cpu up but can race with other cpu hotplug operations
1297  * and may be executed twice without intervening cpu down.
1298  */
1299 static void worker_rebind_fn(struct work_struct *work)
1300 {
1301         struct worker *worker = container_of(work, struct worker, rebind_work);
1302         struct global_cwq *gcwq = worker->gcwq;
1303
1304         if (worker_maybe_bind_and_lock(worker))
1305                 worker_clr_flags(worker, WORKER_REBIND);
1306
1307         spin_unlock_irq(&gcwq->lock);
1308 }
1309
1310 static struct worker *alloc_worker(void)
1311 {
1312         struct worker *worker;
1313
1314         worker = kzalloc(sizeof(*worker), GFP_KERNEL);
1315         if (worker) {
1316                 INIT_LIST_HEAD(&worker->entry);
1317                 INIT_LIST_HEAD(&worker->scheduled);
1318                 INIT_WORK(&worker->rebind_work, worker_rebind_fn);
1319                 /* on creation a worker is in !idle && prep state */
1320                 worker->flags = WORKER_PREP;
1321         }
1322         return worker;
1323 }
1324
1325 /**
1326  * create_worker - create a new workqueue worker
1327  * @gcwq: gcwq the new worker will belong to
1328  * @bind: whether to set affinity to @cpu or not
1329  *
1330  * Create a new worker which is bound to @gcwq.  The returned worker
1331  * can be started by calling start_worker() or destroyed using
1332  * destroy_worker().
1333  *
1334  * CONTEXT:
1335  * Might sleep.  Does GFP_KERNEL allocations.
1336  *
1337  * RETURNS:
1338  * Pointer to the newly created worker.
1339  */
1340 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1341 {
1342         bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
1343         struct worker *worker = NULL;
1344         int id = -1;
1345
1346         spin_lock_irq(&gcwq->lock);
1347         while (ida_get_new(&gcwq->worker_ida, &id)) {
1348                 spin_unlock_irq(&gcwq->lock);
1349                 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
1350                         goto fail;
1351                 spin_lock_irq(&gcwq->lock);
1352         }
1353         spin_unlock_irq(&gcwq->lock);
1354
1355         worker = alloc_worker();
1356         if (!worker)
1357                 goto fail;
1358
1359         worker->gcwq = gcwq;
1360         worker->id = id;
1361
1362         if (!on_unbound_cpu)
1363                 worker->task = kthread_create(worker_thread, worker,
1364                                               "kworker/%u:%d", gcwq->cpu, id);
1365         else
1366                 worker->task = kthread_create(worker_thread, worker,
1367                                               "kworker/u:%d", id);
1368         if (IS_ERR(worker->task))
1369                 goto fail;
1370
1371         /*
1372          * A rogue worker will become a regular one if CPU comes
1373          * online later on.  Make sure every worker has
1374          * PF_THREAD_BOUND set.
1375          */
1376         if (bind && !on_unbound_cpu)
1377                 kthread_bind(worker->task, gcwq->cpu);
1378         else {
1379                 worker->task->flags |= PF_THREAD_BOUND;
1380                 if (on_unbound_cpu)
1381                         worker->flags |= WORKER_UNBOUND;
1382         }
1383
1384         return worker;
1385 fail:
1386         if (id >= 0) {
1387                 spin_lock_irq(&gcwq->lock);
1388                 ida_remove(&gcwq->worker_ida, id);
1389                 spin_unlock_irq(&gcwq->lock);
1390         }
1391         kfree(worker);
1392         return NULL;
1393 }
1394
1395 /**
1396  * start_worker - start a newly created worker
1397  * @worker: worker to start
1398  *
1399  * Make the gcwq aware of @worker and start it.
1400  *
1401  * CONTEXT:
1402  * spin_lock_irq(gcwq->lock).
1403  */
1404 static void start_worker(struct worker *worker)
1405 {
1406         worker->flags |= WORKER_STARTED;
1407         worker->gcwq->nr_workers++;
1408         worker_enter_idle(worker);
1409         wake_up_process(worker->task);
1410 }
1411
1412 /**
1413  * destroy_worker - destroy a workqueue worker
1414  * @worker: worker to be destroyed
1415  *
1416  * Destroy @worker and adjust @gcwq stats accordingly.
1417  *
1418  * CONTEXT:
1419  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1420  */
1421 static void destroy_worker(struct worker *worker)
1422 {
1423         struct global_cwq *gcwq = worker->gcwq;
1424         int id = worker->id;
1425
1426         /* sanity check frenzy */
1427         BUG_ON(worker->current_work);
1428         BUG_ON(!list_empty(&worker->scheduled));
1429
1430         if (worker->flags & WORKER_STARTED)
1431                 gcwq->nr_workers--;
1432         if (worker->flags & WORKER_IDLE)
1433                 gcwq->nr_idle--;
1434
1435         list_del_init(&worker->entry);
1436         worker->flags |= WORKER_DIE;
1437
1438         spin_unlock_irq(&gcwq->lock);
1439
1440         kthread_stop(worker->task);
1441         kfree(worker);
1442
1443         spin_lock_irq(&gcwq->lock);
1444         ida_remove(&gcwq->worker_ida, id);
1445 }
1446
1447 static void idle_worker_timeout(unsigned long __gcwq)
1448 {
1449         struct global_cwq *gcwq = (void *)__gcwq;
1450
1451         spin_lock_irq(&gcwq->lock);
1452
1453         if (too_many_workers(gcwq)) {
1454                 struct worker *worker;
1455                 unsigned long expires;
1456
1457                 /* idle_list is kept in LIFO order, check the last one */
1458                 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1459                 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1460
1461                 if (time_before(jiffies, expires))
1462                         mod_timer(&gcwq->idle_timer, expires);
1463                 else {
1464                         /* it's been idle for too long, wake up manager */
1465                         gcwq->flags |= GCWQ_MANAGE_WORKERS;
1466                         wake_up_worker(gcwq);
1467                 }
1468         }
1469
1470         spin_unlock_irq(&gcwq->lock);
1471 }
1472
1473 static bool send_mayday(struct work_struct *work)
1474 {
1475         struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1476         struct workqueue_struct *wq = cwq->wq;
1477         unsigned int cpu;
1478
1479         if (!(wq->flags & WQ_RESCUER))
1480                 return false;
1481
1482         /* mayday mayday mayday */
1483         cpu = cwq->gcwq->cpu;
1484         /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1485         if (cpu == WORK_CPU_UNBOUND)
1486                 cpu = 0;
1487         if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1488                 wake_up_process(wq->rescuer->task);
1489         return true;
1490 }
1491
1492 static void gcwq_mayday_timeout(unsigned long __gcwq)
1493 {
1494         struct global_cwq *gcwq = (void *)__gcwq;
1495         struct work_struct *work;
1496
1497         spin_lock_irq(&gcwq->lock);
1498
1499         if (need_to_create_worker(gcwq)) {
1500                 /*
1501                  * We've been trying to create a new worker but
1502                  * haven't been successful.  We might be hitting an
1503                  * allocation deadlock.  Send distress signals to
1504                  * rescuers.
1505                  */
1506                 list_for_each_entry(work, &gcwq->worklist, entry)
1507                         send_mayday(work);
1508         }
1509
1510         spin_unlock_irq(&gcwq->lock);
1511
1512         mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
1513 }
1514
1515 /**
1516  * maybe_create_worker - create a new worker if necessary
1517  * @gcwq: gcwq to create a new worker for
1518  *
1519  * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
1520  * have at least one idle worker on return from this function.  If
1521  * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1522  * sent to all rescuers with works scheduled on @gcwq to resolve
1523  * possible allocation deadlock.
1524  *
1525  * On return, need_to_create_worker() is guaranteed to be false and
1526  * may_start_working() true.
1527  *
1528  * LOCKING:
1529  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1530  * multiple times.  Does GFP_KERNEL allocations.  Called only from
1531  * manager.
1532  *
1533  * RETURNS:
1534  * false if no action was taken and gcwq->lock stayed locked, true
1535  * otherwise.
1536  */
1537 static bool maybe_create_worker(struct global_cwq *gcwq)
1538 __releases(&gcwq->lock)
1539 __acquires(&gcwq->lock)
1540 {
1541         if (!need_to_create_worker(gcwq))
1542                 return false;
1543 restart:
1544         spin_unlock_irq(&gcwq->lock);
1545
1546         /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1547         mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1548
1549         while (true) {
1550                 struct worker *worker;
1551
1552                 worker = create_worker(gcwq, true);
1553                 if (worker) {
1554                         del_timer_sync(&gcwq->mayday_timer);
1555                         spin_lock_irq(&gcwq->lock);
1556                         start_worker(worker);
1557                         BUG_ON(need_to_create_worker(gcwq));
1558                         return true;
1559                 }
1560
1561                 if (!need_to_create_worker(gcwq))
1562                         break;
1563
1564                 __set_current_state(TASK_INTERRUPTIBLE);
1565                 schedule_timeout(CREATE_COOLDOWN);
1566
1567                 if (!need_to_create_worker(gcwq))
1568                         break;
1569         }
1570
1571         del_timer_sync(&gcwq->mayday_timer);
1572         spin_lock_irq(&gcwq->lock);
1573         if (need_to_create_worker(gcwq))
1574                 goto restart;
1575         return true;
1576 }
1577
1578 /**
1579  * maybe_destroy_worker - destroy workers which have been idle for a while
1580  * @gcwq: gcwq to destroy workers for
1581  *
1582  * Destroy @gcwq workers which have been idle for longer than
1583  * IDLE_WORKER_TIMEOUT.
1584  *
1585  * LOCKING:
1586  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1587  * multiple times.  Called only from manager.
1588  *
1589  * RETURNS:
1590  * false if no action was taken and gcwq->lock stayed locked, true
1591  * otherwise.
1592  */
1593 static bool maybe_destroy_workers(struct global_cwq *gcwq)
1594 {
1595         bool ret = false;
1596
1597         while (too_many_workers(gcwq)) {
1598                 struct worker *worker;
1599                 unsigned long expires;
1600
1601                 worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
1602                 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1603
1604                 if (time_before(jiffies, expires)) {
1605                         mod_timer(&gcwq->idle_timer, expires);
1606                         break;
1607                 }
1608
1609                 destroy_worker(worker);
1610                 ret = true;
1611         }
1612
1613         return ret;
1614 }
1615
1616 /**
1617  * manage_workers - manage worker pool
1618  * @worker: self
1619  *
1620  * Assume the manager role and manage gcwq worker pool @worker belongs
1621  * to.  At any given time, there can be only zero or one manager per
1622  * gcwq.  The exclusion is handled automatically by this function.
1623  *
1624  * The caller can safely start processing works on false return.  On
1625  * true return, it's guaranteed that need_to_create_worker() is false
1626  * and may_start_working() is true.
1627  *
1628  * CONTEXT:
1629  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1630  * multiple times.  Does GFP_KERNEL allocations.
1631  *
1632  * RETURNS:
1633  * false if no action was taken and gcwq->lock stayed locked, true if
1634  * some action was taken.
1635  */
1636 static bool manage_workers(struct worker *worker)
1637 {
1638         struct global_cwq *gcwq = worker->gcwq;
1639         bool ret = false;
1640
1641         if (gcwq->flags & GCWQ_MANAGING_WORKERS)
1642                 return ret;
1643
1644         gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
1645         gcwq->flags |= GCWQ_MANAGING_WORKERS;
1646
1647         /*
1648          * Destroy and then create so that may_start_working() is true
1649          * on return.
1650          */
1651         ret |= maybe_destroy_workers(gcwq);
1652         ret |= maybe_create_worker(gcwq);
1653
1654         gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
1655
1656         /*
1657          * The trustee might be waiting to take over the manager
1658          * position, tell it we're done.
1659          */
1660         if (unlikely(gcwq->trustee))
1661                 wake_up_all(&gcwq->trustee_wait);
1662
1663         return ret;
1664 }
1665
1666 /**
1667  * move_linked_works - move linked works to a list
1668  * @work: start of series of works to be scheduled
1669  * @head: target list to append @work to
1670  * @nextp: out paramter for nested worklist walking
1671  *
1672  * Schedule linked works starting from @work to @head.  Work series to
1673  * be scheduled starts at @work and includes any consecutive work with
1674  * WORK_STRUCT_LINKED set in its predecessor.
1675  *
1676  * If @nextp is not NULL, it's updated to point to the next work of
1677  * the last scheduled work.  This allows move_linked_works() to be
1678  * nested inside outer list_for_each_entry_safe().
1679  *
1680  * CONTEXT:
1681  * spin_lock_irq(gcwq->lock).
1682  */
1683 static void move_linked_works(struct work_struct *work, struct list_head *head,
1684                               struct work_struct **nextp)
1685 {
1686         struct work_struct *n;
1687
1688         /*
1689          * Linked worklist will always end before the end of the list,
1690          * use NULL for list head.
1691          */
1692         list_for_each_entry_safe_from(work, n, NULL, entry) {
1693                 list_move_tail(&work->entry, head);
1694                 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1695                         break;
1696         }
1697
1698         /*
1699          * If we're already inside safe list traversal and have moved
1700          * multiple works to the scheduled queue, the next position
1701          * needs to be updated.
1702          */
1703         if (nextp)
1704                 *nextp = n;
1705 }
1706
1707 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1708 {
1709         struct work_struct *work = list_first_entry(&cwq->delayed_works,
1710                                                     struct work_struct, entry);
1711         struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1712
1713         trace_workqueue_activate_work(work);
1714         move_linked_works(work, pos, NULL);
1715         __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1716         cwq->nr_active++;
1717 }
1718
1719 /**
1720  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1721  * @cwq: cwq of interest
1722  * @color: color of work which left the queue
1723  * @delayed: for a delayed work
1724  *
1725  * A work either has completed or is removed from pending queue,
1726  * decrement nr_in_flight of its cwq and handle workqueue flushing.
1727  *
1728  * CONTEXT:
1729  * spin_lock_irq(gcwq->lock).
1730  */
1731 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
1732                                  bool delayed)
1733 {
1734         /* ignore uncolored works */
1735         if (color == WORK_NO_COLOR)
1736                 return;
1737
1738         cwq->nr_in_flight[color]--;
1739
1740         if (!delayed) {
1741                 cwq->nr_active--;
1742                 if (!list_empty(&cwq->delayed_works)) {
1743                         /* one down, submit a delayed one */
1744                         if (cwq->nr_active < cwq->max_active)
1745                                 cwq_activate_first_delayed(cwq);
1746                 }
1747         }
1748
1749         /* is flush in progress and are we at the flushing tip? */
1750         if (likely(cwq->flush_color != color))
1751                 return;
1752
1753         /* are there still in-flight works? */
1754         if (cwq->nr_in_flight[color])
1755                 return;
1756
1757         /* this cwq is done, clear flush_color */
1758         cwq->flush_color = -1;
1759
1760         /*
1761          * If this was the last cwq, wake up the first flusher.  It
1762          * will handle the rest.
1763          */
1764         if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1765                 complete(&cwq->wq->first_flusher->done);
1766 }
1767
1768 /**
1769  * process_one_work - process single work
1770  * @worker: self
1771  * @work: work to process
1772  *
1773  * Process @work.  This function contains all the logics necessary to
1774  * process a single work including synchronization against and
1775  * interaction with other workers on the same cpu, queueing and
1776  * flushing.  As long as context requirement is met, any worker can
1777  * call this function to process a work.
1778  *
1779  * CONTEXT:
1780  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1781  */
1782 static void process_one_work(struct worker *worker, struct work_struct *work)
1783 __releases(&gcwq->lock)
1784 __acquires(&gcwq->lock)
1785 {
1786         struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1787         struct global_cwq *gcwq = cwq->gcwq;
1788         struct hlist_head *bwh = busy_worker_head(gcwq, work);
1789         bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1790         work_func_t f = work->func;
1791         int work_color;
1792         struct worker *collision;
1793 #ifdef CONFIG_LOCKDEP
1794         /*
1795          * It is permissible to free the struct work_struct from
1796          * inside the function that is called from it, this we need to
1797          * take into account for lockdep too.  To avoid bogus "held
1798          * lock freed" warnings as well as problems when looking into
1799          * work->lockdep_map, make a copy and use that here.
1800          */
1801         struct lockdep_map lockdep_map = work->lockdep_map;
1802 #endif
1803         /*
1804          * A single work shouldn't be executed concurrently by
1805          * multiple workers on a single cpu.  Check whether anyone is
1806          * already processing the work.  If so, defer the work to the
1807          * currently executing one.
1808          */
1809         collision = __find_worker_executing_work(gcwq, bwh, work);
1810         if (unlikely(collision)) {
1811                 move_linked_works(work, &collision->scheduled, NULL);
1812                 return;
1813         }
1814
1815         /* claim and process */
1816         debug_work_deactivate(work);
1817         hlist_add_head(&worker->hentry, bwh);
1818         worker->current_work = work;
1819         worker->current_cwq = cwq;
1820         work_color = get_work_color(work);
1821
1822         /* record the current cpu number in the work data and dequeue */
1823         set_work_cpu(work, gcwq->cpu);
1824         list_del_init(&work->entry);
1825
1826         /*
1827          * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1828          * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1829          */
1830         if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1831                 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1832                                                 struct work_struct, entry);
1833
1834                 if (!list_empty(&gcwq->worklist) &&
1835                     get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1836                         wake_up_worker(gcwq);
1837                 else
1838                         gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1839         }
1840
1841         /*
1842          * CPU intensive works don't participate in concurrency
1843          * management.  They're the scheduler's responsibility.
1844          */
1845         if (unlikely(cpu_intensive))
1846                 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1847
1848         spin_unlock_irq(&gcwq->lock);
1849
1850         work_clear_pending(work);
1851         lock_map_acquire_read(&cwq->wq->lockdep_map);
1852         lock_map_acquire(&lockdep_map);
1853         trace_workqueue_execute_start(work);
1854         f(work);
1855         /*
1856          * While we must be careful to not use "work" after this, the trace
1857          * point will only record its address.
1858          */
1859         trace_workqueue_execute_end(work);
1860         lock_map_release(&lockdep_map);
1861         lock_map_release(&cwq->wq->lockdep_map);
1862
1863         if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1864                 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1865                        "%s/0x%08x/%d\n",
1866                        current->comm, preempt_count(), task_pid_nr(current));
1867                 printk(KERN_ERR "    last function: ");
1868                 print_symbol("%s\n", (unsigned long)f);
1869                 debug_show_held_locks(current);
1870                 dump_stack();
1871         }
1872
1873         spin_lock_irq(&gcwq->lock);
1874
1875         /* clear cpu intensive status */
1876         if (unlikely(cpu_intensive))
1877                 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
1878
1879         /* we're done with it, release */
1880         hlist_del_init(&worker->hentry);
1881         worker->current_work = NULL;
1882         worker->current_cwq = NULL;
1883         cwq_dec_nr_in_flight(cwq, work_color, false);
1884 }
1885
1886 /**
1887  * process_scheduled_works - process scheduled works
1888  * @worker: self
1889  *
1890  * Process all scheduled works.  Please note that the scheduled list
1891  * may change while processing a work, so this function repeatedly
1892  * fetches a work from the top and executes it.
1893  *
1894  * CONTEXT:
1895  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1896  * multiple times.
1897  */
1898 static void process_scheduled_works(struct worker *worker)
1899 {
1900         while (!list_empty(&worker->scheduled)) {
1901                 struct work_struct *work = list_first_entry(&worker->scheduled,
1902                                                 struct work_struct, entry);
1903                 process_one_work(worker, work);
1904         }
1905 }
1906
1907 /**
1908  * worker_thread - the worker thread function
1909  * @__worker: self
1910  *
1911  * The gcwq worker thread function.  There's a single dynamic pool of
1912  * these per each cpu.  These workers process all works regardless of
1913  * their specific target workqueue.  The only exception is works which
1914  * belong to workqueues with a rescuer which will be explained in
1915  * rescuer_thread().
1916  */
1917 static int worker_thread(void *__worker)
1918 {
1919         struct worker *worker = __worker;
1920         struct global_cwq *gcwq = worker->gcwq;
1921
1922         /* tell the scheduler that this is a workqueue worker */
1923         worker->task->flags |= PF_WQ_WORKER;
1924 woke_up:
1925         spin_lock_irq(&gcwq->lock);
1926
1927         /* DIE can be set only while we're idle, checking here is enough */
1928         if (worker->flags & WORKER_DIE) {
1929                 spin_unlock_irq(&gcwq->lock);
1930                 worker->task->flags &= ~PF_WQ_WORKER;
1931                 return 0;
1932         }
1933
1934         worker_leave_idle(worker);
1935 recheck:
1936         /* no more worker necessary? */
1937         if (!need_more_worker(gcwq))
1938                 goto sleep;
1939
1940         /* do we need to manage? */
1941         if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
1942                 goto recheck;
1943
1944         /*
1945          * ->scheduled list can only be filled while a worker is
1946          * preparing to process a work or actually processing it.
1947          * Make sure nobody diddled with it while I was sleeping.
1948          */
1949         BUG_ON(!list_empty(&worker->scheduled));
1950
1951         /*
1952          * When control reaches this point, we're guaranteed to have
1953          * at least one idle worker or that someone else has already
1954          * assumed the manager role.
1955          */
1956         worker_clr_flags(worker, WORKER_PREP);
1957
1958         do {
1959                 struct work_struct *work =
1960                         list_first_entry(&gcwq->worklist,
1961                                          struct work_struct, entry);
1962
1963                 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1964                         /* optimization path, not strictly necessary */
1965                         process_one_work(worker, work);
1966                         if (unlikely(!list_empty(&worker->scheduled)))
1967                                 process_scheduled_works(worker);
1968                 } else {
1969                         move_linked_works(work, &worker->scheduled, NULL);
1970                         process_scheduled_works(worker);
1971                 }
1972         } while (keep_working(gcwq));
1973
1974         worker_set_flags(worker, WORKER_PREP, false);
1975 sleep:
1976         if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
1977                 goto recheck;
1978
1979         /*
1980          * gcwq->lock is held and there's no work to process and no
1981          * need to manage, sleep.  Workers are woken up only while
1982          * holding gcwq->lock or from local cpu, so setting the
1983          * current state before releasing gcwq->lock is enough to
1984          * prevent losing any event.
1985          */
1986         worker_enter_idle(worker);
1987         __set_current_state(TASK_INTERRUPTIBLE);
1988         spin_unlock_irq(&gcwq->lock);
1989         schedule();
1990         goto woke_up;
1991 }
1992
1993 /**
1994  * rescuer_thread - the rescuer thread function
1995  * @__wq: the associated workqueue
1996  *
1997  * Workqueue rescuer thread function.  There's one rescuer for each
1998  * workqueue which has WQ_RESCUER set.
1999  *
2000  * Regular work processing on a gcwq may block trying to create a new
2001  * worker which uses GFP_KERNEL allocation which has slight chance of
2002  * developing into deadlock if some works currently on the same queue
2003  * need to be processed to satisfy the GFP_KERNEL allocation.  This is
2004  * the problem rescuer solves.
2005  *
2006  * When such condition is possible, the gcwq summons rescuers of all
2007  * workqueues which have works queued on the gcwq and let them process
2008  * those works so that forward progress can be guaranteed.
2009  *
2010  * This should happen rarely.
2011  */
2012 static int rescuer_thread(void *__wq)
2013 {
2014         struct workqueue_struct *wq = __wq;
2015         struct worker *rescuer = wq->rescuer;
2016         struct list_head *scheduled = &rescuer->scheduled;
2017         bool is_unbound = wq->flags & WQ_UNBOUND;
2018         unsigned int cpu;
2019
2020         set_user_nice(current, RESCUER_NICE_LEVEL);
2021 repeat:
2022         set_current_state(TASK_INTERRUPTIBLE);
2023
2024         if (kthread_should_stop())
2025                 return 0;
2026
2027         /*
2028          * See whether any cpu is asking for help.  Unbounded
2029          * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
2030          */
2031         for_each_mayday_cpu(cpu, wq->mayday_mask) {
2032                 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2033                 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2034                 struct global_cwq *gcwq = cwq->gcwq;
2035                 struct work_struct *work, *n;
2036
2037                 __set_current_state(TASK_RUNNING);
2038                 mayday_clear_cpu(cpu, wq->mayday_mask);
2039
2040                 /* migrate to the target cpu if possible */
2041                 rescuer->gcwq = gcwq;
2042                 worker_maybe_bind_and_lock(rescuer);
2043
2044                 /*
2045                  * Slurp in all works issued via this workqueue and
2046                  * process'em.
2047                  */
2048                 BUG_ON(!list_empty(&rescuer->scheduled));
2049                 list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
2050                         if (get_work_cwq(work) == cwq)
2051                                 move_linked_works(work, scheduled, &n);
2052
2053                 process_scheduled_works(rescuer);
2054
2055                 /*
2056                  * Leave this gcwq.  If keep_working() is %true, notify a
2057                  * regular worker; otherwise, we end up with 0 concurrency
2058                  * and stalling the execution.
2059                  */
2060                 if (keep_working(gcwq))
2061                         wake_up_worker(gcwq);
2062
2063                 spin_unlock_irq(&gcwq->lock);
2064         }
2065
2066         schedule();
2067         goto repeat;
2068 }
2069
2070 struct wq_barrier {
2071         struct work_struct      work;
2072         struct completion       done;
2073 };
2074
2075 static void wq_barrier_func(struct work_struct *work)
2076 {
2077         struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2078         complete(&barr->done);
2079 }
2080
2081 /**
2082  * insert_wq_barrier - insert a barrier work
2083  * @cwq: cwq to insert barrier into
2084  * @barr: wq_barrier to insert
2085  * @target: target work to attach @barr to
2086  * @worker: worker currently executing @target, NULL if @target is not executing
2087  *
2088  * @barr is linked to @target such that @barr is completed only after
2089  * @target finishes execution.  Please note that the ordering
2090  * guarantee is observed only with respect to @target and on the local
2091  * cpu.
2092  *
2093  * Currently, a queued barrier can't be canceled.  This is because
2094  * try_to_grab_pending() can't determine whether the work to be
2095  * grabbed is at the head of the queue and thus can't clear LINKED
2096  * flag of the previous work while there must be a valid next work
2097  * after a work with LINKED flag set.
2098  *
2099  * Note that when @worker is non-NULL, @target may be modified
2100  * underneath us, so we can't reliably determine cwq from @target.
2101  *
2102  * CONTEXT:
2103  * spin_lock_irq(gcwq->lock).
2104  */
2105 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2106                               struct wq_barrier *barr,
2107                               struct work_struct *target, struct worker *worker)
2108 {
2109         struct list_head *head;
2110         unsigned int linked = 0;
2111
2112         /*
2113          * debugobject calls are safe here even with gcwq->lock locked
2114          * as we know for sure that this will not trigger any of the
2115          * checks and call back into the fixup functions where we
2116          * might deadlock.
2117          */
2118         INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
2119         __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
2120         init_completion(&barr->done);
2121
2122         /*
2123          * If @target is currently being executed, schedule the
2124          * barrier to the worker; otherwise, put it after @target.
2125          */
2126         if (worker)
2127                 head = worker->scheduled.next;
2128         else {
2129                 unsigned long *bits = work_data_bits(target);
2130
2131                 head = target->entry.next;
2132                 /* there can already be other linked works, inherit and set */
2133                 linked = *bits & WORK_STRUCT_LINKED;
2134                 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2135         }
2136
2137         debug_work_activate(&barr->work);
2138         insert_work(cwq, &barr->work, head,
2139                     work_color_to_flags(WORK_NO_COLOR) | linked);
2140 }
2141
2142 /**
2143  * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
2144  * @wq: workqueue being flushed
2145  * @flush_color: new flush color, < 0 for no-op
2146  * @work_color: new work color, < 0 for no-op
2147  *
2148  * Prepare cwqs for workqueue flushing.
2149  *
2150  * If @flush_color is non-negative, flush_color on all cwqs should be
2151  * -1.  If no cwq has in-flight commands at the specified color, all
2152  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
2153  * has in flight commands, its cwq->flush_color is set to
2154  * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
2155  * wakeup logic is armed and %true is returned.
2156  *
2157  * The caller should have initialized @wq->first_flusher prior to
2158  * calling this function with non-negative @flush_color.  If
2159  * @flush_color is negative, no flush color update is done and %false
2160  * is returned.
2161  *
2162  * If @work_color is non-negative, all cwqs should have the same
2163  * work_color which is previous to @work_color and all will be
2164  * advanced to @work_color.
2165  *
2166  * CONTEXT:
2167  * mutex_lock(wq->flush_mutex).
2168  *
2169  * RETURNS:
2170  * %true if @flush_color >= 0 and there's something to flush.  %false
2171  * otherwise.
2172  */
2173 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2174                                       int flush_color, int work_color)
2175 {
2176         bool wait = false;
2177         unsigned int cpu;
2178
2179         if (flush_color >= 0) {
2180                 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
2181                 atomic_set(&wq->nr_cwqs_to_flush, 1);
2182         }
2183
2184         for_each_cwq_cpu(cpu, wq) {
2185                 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2186                 struct global_cwq *gcwq = cwq->gcwq;
2187
2188                 spin_lock_irq(&gcwq->lock);
2189
2190                 if (flush_color >= 0) {
2191                         BUG_ON(cwq->flush_color != -1);
2192
2193                         if (cwq->nr_in_flight[flush_color]) {
2194                                 cwq->flush_color = flush_color;
2195                                 atomic_inc(&wq->nr_cwqs_to_flush);
2196                                 wait = true;
2197                         }
2198                 }
2199
2200                 if (work_color >= 0) {
2201                         BUG_ON(work_color != work_next_color(cwq->work_color));
2202                         cwq->work_color = work_color;
2203                 }
2204
2205                 spin_unlock_irq(&gcwq->lock);
2206         }
2207
2208         if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
2209                 complete(&wq->first_flusher->done);
2210
2211         return wait;
2212 }
2213
2214 /**
2215  * flush_workqueue - ensure that any scheduled work has run to completion.
2216  * @wq: workqueue to flush
2217  *
2218  * Forces execution of the workqueue and blocks until its completion.
2219  * This is typically used in driver shutdown handlers.
2220  *
2221  * We sleep until all works which were queued on entry have been handled,
2222  * but we are not livelocked by new incoming ones.
2223  */
2224 void flush_workqueue(struct workqueue_struct *wq)
2225 {
2226         struct wq_flusher this_flusher = {
2227                 .list = LIST_HEAD_INIT(this_flusher.list),
2228                 .flush_color = -1,
2229                 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2230         };
2231         int next_color;
2232
2233         lock_map_acquire(&wq->lockdep_map);
2234         lock_map_release(&wq->lockdep_map);
2235
2236         mutex_lock(&wq->flush_mutex);
2237
2238         /*
2239          * Start-to-wait phase
2240          */
2241         next_color = work_next_color(wq->work_color);
2242
2243         if (next_color != wq->flush_color) {
2244                 /*
2245                  * Color space is not full.  The current work_color
2246                  * becomes our flush_color and work_color is advanced
2247                  * by one.
2248                  */
2249                 BUG_ON(!list_empty(&wq->flusher_overflow));
2250                 this_flusher.flush_color = wq->work_color;
2251                 wq->work_color = next_color;
2252
2253                 if (!wq->first_flusher) {
2254                         /* no flush in progress, become the first flusher */
2255                         BUG_ON(wq->flush_color != this_flusher.flush_color);
2256
2257                         wq->first_flusher = &this_flusher;
2258
2259                         if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
2260                                                        wq->work_color)) {
2261                                 /* nothing to flush, done */
2262                                 wq->flush_color = next_color;
2263                                 wq->first_flusher = NULL;
2264                                 goto out_unlock;
2265                         }
2266                 } else {
2267                         /* wait in queue */
2268                         BUG_ON(wq->flush_color == this_flusher.flush_color);
2269                         list_add_tail(&this_flusher.list, &wq->flusher_queue);
2270                         flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2271                 }
2272         } else {
2273                 /*
2274                  * Oops, color space is full, wait on overflow queue.
2275                  * The next flush completion will assign us
2276                  * flush_color and transfer to flusher_queue.
2277                  */
2278                 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2279         }
2280
2281         mutex_unlock(&wq->flush_mutex);
2282
2283         wait_for_completion(&this_flusher.done);
2284
2285         /*
2286          * Wake-up-and-cascade phase
2287          *
2288          * First flushers are responsible for cascading flushes and
2289          * handling overflow.  Non-first flushers can simply return.
2290          */
2291         if (wq->first_flusher != &this_flusher)
2292                 return;
2293
2294         mutex_lock(&wq->flush_mutex);
2295
2296         /* we might have raced, check again with mutex held */
2297         if (wq->first_flusher != &this_flusher)
2298                 goto out_unlock;
2299
2300         wq->first_flusher = NULL;
2301
2302         BUG_ON(!list_empty(&this_flusher.list));
2303         BUG_ON(wq->flush_color != this_flusher.flush_color);
2304
2305         while (true) {
2306                 struct wq_flusher *next, *tmp;
2307
2308                 /* complete all the flushers sharing the current flush color */
2309                 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2310                         if (next->flush_color != wq->flush_color)
2311                                 break;
2312                         list_del_init(&next->list);
2313                         complete(&next->done);
2314                 }
2315
2316                 BUG_ON(!list_empty(&wq->flusher_overflow) &&
2317                        wq->flush_color != work_next_color(wq->work_color));
2318
2319                 /* this flush_color is finished, advance by one */
2320                 wq->flush_color = work_next_color(wq->flush_color);
2321
2322                 /* one color has been freed, handle overflow queue */
2323                 if (!list_empty(&wq->flusher_overflow)) {
2324                         /*
2325                          * Assign the same color to all overflowed
2326                          * flushers, advance work_color and append to
2327                          * flusher_queue.  This is the start-to-wait
2328                          * phase for these overflowed flushers.
2329                          */
2330                         list_for_each_entry(tmp, &wq->flusher_overflow, list)
2331                                 tmp->flush_color = wq->work_color;
2332
2333                         wq->work_color = work_next_color(wq->work_color);
2334
2335                         list_splice_tail_init(&wq->flusher_overflow,
2336                                               &wq->flusher_queue);
2337                         flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
2338                 }
2339
2340                 if (list_empty(&wq->flusher_queue)) {
2341                         BUG_ON(wq->flush_color != wq->work_color);
2342                         break;
2343                 }
2344
2345                 /*
2346                  * Need to flush more colors.  Make the next flusher
2347                  * the new first flusher and arm cwqs.
2348                  */
2349                 BUG_ON(wq->flush_color == wq->work_color);
2350                 BUG_ON(wq->flush_color != next->flush_color);
2351
2352                 list_del_init(&next->list);
2353                 wq->first_flusher = next;
2354
2355                 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
2356                         break;
2357
2358                 /*
2359                  * Meh... this color is already done, clear first
2360                  * flusher and repeat cascading.
2361                  */
2362                 wq->first_flusher = NULL;
2363         }
2364
2365 out_unlock:
2366         mutex_unlock(&wq->flush_mutex);
2367 }
2368 EXPORT_SYMBOL_GPL(flush_workqueue);
2369
2370 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2371                              bool wait_executing)
2372 {
2373         struct worker *worker = NULL;
2374         struct global_cwq *gcwq;
2375         struct cpu_workqueue_struct *cwq;
2376
2377         might_sleep();
2378         gcwq = get_work_gcwq(work);
2379         if (!gcwq)
2380                 return false;
2381
2382         spin_lock_irq(&gcwq->lock);
2383         if (!list_empty(&work->entry)) {
2384                 /*
2385                  * See the comment near try_to_grab_pending()->smp_rmb().
2386                  * If it was re-queued to a different gcwq under us, we
2387                  * are not going to wait.
2388                  */
2389                 smp_rmb();
2390                 cwq = get_work_cwq(work);
2391                 if (unlikely(!cwq || gcwq != cwq->gcwq))
2392                         goto already_gone;
2393         } else if (wait_executing) {
2394                 worker = find_worker_executing_work(gcwq, work);
2395                 if (!worker)
2396                         goto already_gone;
2397                 cwq = worker->current_cwq;
2398         } else
2399                 goto already_gone;
2400
2401         insert_wq_barrier(cwq, barr, work, worker);
2402         spin_unlock_irq(&gcwq->lock);
2403
2404         /*
2405          * If @max_active is 1 or rescuer is in use, flushing another work
2406          * item on the same workqueue may lead to deadlock.  Make sure the
2407          * flusher is not running on the same workqueue by verifying write
2408          * access.
2409          */
2410         if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
2411                 lock_map_acquire(&cwq->wq->lockdep_map);
2412         else
2413                 lock_map_acquire_read(&cwq->wq->lockdep_map);
2414         lock_map_release(&cwq->wq->lockdep_map);
2415
2416         return true;
2417 already_gone:
2418         spin_unlock_irq(&gcwq->lock);
2419         return false;
2420 }
2421
2422 /**
2423  * flush_work - wait for a work to finish executing the last queueing instance
2424  * @work: the work to flush
2425  *
2426  * Wait until @work has finished execution.  This function considers
2427  * only the last queueing instance of @work.  If @work has been
2428  * enqueued across different CPUs on a non-reentrant workqueue or on
2429  * multiple workqueues, @work might still be executing on return on
2430  * some of the CPUs from earlier queueing.
2431  *
2432  * If @work was queued only on a non-reentrant, ordered or unbound
2433  * workqueue, @work is guaranteed to be idle on return if it hasn't
2434  * been requeued since flush started.
2435  *
2436  * RETURNS:
2437  * %true if flush_work() waited for the work to finish execution,
2438  * %false if it was already idle.
2439  */
2440 bool flush_work(struct work_struct *work)
2441 {
2442         struct wq_barrier barr;
2443
2444         if (start_flush_work(work, &barr, true)) {
2445                 wait_for_completion(&barr.done);
2446                 destroy_work_on_stack(&barr.work);
2447                 return true;
2448         } else
2449                 return false;
2450 }
2451 EXPORT_SYMBOL_GPL(flush_work);
2452
2453 static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
2454 {
2455         struct wq_barrier barr;
2456         struct worker *worker;
2457
2458         spin_lock_irq(&gcwq->lock);
2459
2460         worker = find_worker_executing_work(gcwq, work);
2461         if (unlikely(worker))
2462                 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
2463
2464         spin_unlock_irq(&gcwq->lock);
2465
2466         if (unlikely(worker)) {
2467                 wait_for_completion(&barr.done);
2468                 destroy_work_on_stack(&barr.work);
2469                 return true;
2470         } else
2471                 return false;
2472 }
2473
2474 static bool wait_on_work(struct work_struct *work)
2475 {
2476         bool ret = false;
2477         int cpu;
2478
2479         might_sleep();
2480
2481         lock_map_acquire(&work->lockdep_map);
2482         lock_map_release(&work->lockdep_map);
2483
2484         for_each_gcwq_cpu(cpu)
2485                 ret |= wait_on_cpu_work(get_gcwq(cpu), work);
2486         return ret;
2487 }
2488
2489 /**
2490  * flush_work_sync - wait until a work has finished execution
2491  * @work: the work to flush
2492  *
2493  * Wait until @work has finished execution.  On return, it's
2494  * guaranteed that all queueing instances of @work which happened
2495  * before this function is called are finished.  In other words, if
2496  * @work hasn't been requeued since this function was called, @work is
2497  * guaranteed to be idle on return.
2498  *
2499  * RETURNS:
2500  * %true if flush_work_sync() waited for the work to finish execution,
2501  * %false if it was already idle.
2502  */
2503 bool flush_work_sync(struct work_struct *work)
2504 {
2505         struct wq_barrier barr;
2506         bool pending, waited;
2507
2508         /* we'll wait for executions separately, queue barr only if pending */
2509         pending = start_flush_work(work, &barr, false);
2510
2511         /* wait for executions to finish */
2512         waited = wait_on_work(work);
2513
2514         /* wait for the pending one */
2515         if (pending) {
2516                 wait_for_completion(&barr.done);
2517                 destroy_work_on_stack(&barr.work);
2518         }
2519
2520         return pending || waited;
2521 }
2522 EXPORT_SYMBOL_GPL(flush_work_sync);
2523
/*
 * Try to take ownership of @work's WORK_STRUCT_PENDING bit.
 *
 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
 * so this work can't be re-armed in any way.
 *
 * Returns:
 *   0  - PENDING was clear and has now been set by us.
 *   1  - PENDING was already set and @work was removed from the list it
 *        was queued on; PENDING stays set.
 *  -1  - PENDING was already set but @work could not be dequeued (no gcwq,
 *        not on a list, or the gcwq changed under us).  __cancel_work_timer()
 *        retries in this case.
 */
static int try_to_grab_pending(struct work_struct *work)
{
        struct global_cwq *gcwq;
        int ret = -1;

        /* bit was clear: nobody had it queued and it is ours now */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        gcwq = get_work_gcwq(work);
        if (!gcwq)
                return ret;

        spin_lock_irq(&gcwq->lock);
        if (!list_empty(&work->entry)) {
                /*
                 * This work is queued, but perhaps we locked the wrong gcwq.
                 * In that case we must see the new value after rmb(), see
                 * insert_work()->wmb().
                 */
                smp_rmb();
                /* only dequeue if the gcwq we locked is still @work's gcwq */
                if (gcwq == get_work_gcwq(work)) {
                        debug_work_deactivate(work);
                        list_del_init(&work->entry);
                        cwq_dec_nr_in_flight(get_work_cwq(work),
                                get_work_color(work),
                                *work_data_bits(work) & WORK_STRUCT_DELAYED);
                        ret = 1;
                }
        }
        spin_unlock_irq(&gcwq->lock);

        return ret;
}
2565
2566 static bool __cancel_work_timer(struct work_struct *work,
2567                                 struct timer_list* timer)
2568 {
2569         int ret;
2570
2571         do {
2572                 ret = (timer && likely(del_timer(timer)));
2573                 if (!ret)
2574                         ret = try_to_grab_pending(work);
2575                 wait_on_work(work);
2576         } while (unlikely(ret < 0));
2577
2578         clear_work_data(work);
2579         return ret;
2580 }
2581
2582 /**
2583  * cancel_work_sync - cancel a work and wait for it to finish
2584  * @work: the work to cancel
2585  *
2586  * Cancel @work and wait for its execution to finish.  This function
2587  * can be used even if the work re-queues itself or migrates to
2588  * another workqueue.  On return from this function, @work is
2589  * guaranteed to be not pending or executing on any CPU.
2590  *
2591  * cancel_work_sync(&delayed_work->work) must not be used for
2592  * delayed_work's.  Use cancel_delayed_work_sync() instead.
2593  *
2594  * The caller must ensure that the workqueue on which @work was last
2595  * queued can't be destroyed before this function returns.
2596  *
2597  * RETURNS:
2598  * %true if @work was pending, %false otherwise.
2599  */
bool cancel_work_sync(struct work_struct *work)
{
        /* no timer is passed (NULL), so only the work item itself is handled */
        return __cancel_work_timer(work, NULL);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
2605
2606 /**
2607  * flush_delayed_work - wait for a dwork to finish executing the last queueing
2608  * @dwork: the delayed work to flush
2609  *
2610  * Delayed timer is cancelled and the pending work is queued for
2611  * immediate execution.  Like flush_work(), this function only
2612  * considers the last queueing instance of @dwork.
2613  *
2614  * RETURNS:
2615  * %true if flush_work() waited for the work to finish execution,
2616  * %false if it was already idle.
2617  */
2618 bool flush_delayed_work(struct delayed_work *dwork)
2619 {
2620         if (del_timer_sync(&dwork->timer))
2621                 __queue_work(raw_smp_processor_id(),
2622                              get_work_cwq(&dwork->work)->wq, &dwork->work);
2623         return flush_work(&dwork->work);
2624 }
2625 EXPORT_SYMBOL(flush_delayed_work);
2626
2627 /**
2628  * flush_delayed_work_sync - wait for a dwork to finish
2629  * @dwork: the delayed work to flush
2630  *
2631  * Delayed timer is cancelled and the pending work is queued for
2632  * execution immediately.  Other than timer handling, its behavior
2633  * is identical to flush_work_sync().
2634  *
2635  * RETURNS:
2636  * %true if flush_work_sync() waited for the work to finish execution,
2637  * %false if it was already idle.
2638  */
2639 bool flush_delayed_work_sync(struct delayed_work *dwork)
2640 {
2641         if (del_timer_sync(&dwork->timer))
2642                 __queue_work(raw_smp_processor_id(),
2643                              get_work_cwq(&dwork->work)->wq, &dwork->work);
2644         return flush_work_sync(&dwork->work);
2645 }
2646 EXPORT_SYMBOL(flush_delayed_work_sync);
2647
/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work to cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * RETURNS:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        /* hand over the timer as well so it gets deleted along with the work */
        return __cancel_work_timer(&dwork->work, &dwork->timer);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
2662
2663 /**
2664  * schedule_work - put work task in global workqueue
2665  * @work: job to be done
2666  *
2667  * Returns zero if @work was already on the kernel-global workqueue and
2668  * non-zero otherwise.
2669  *
2670  * This puts a job in the kernel-global workqueue if it was not already
2671  * queued and leaves it in the same position on the kernel-global
2672  * workqueue otherwise.
2673  */
int schedule_work(struct work_struct *work)
{
        /* thin wrapper: queue @work on system_wq */
        return queue_work(system_wq, work);
}
EXPORT_SYMBOL(schedule_work);
2679
/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu by queueing it on system_wq with
 * queue_work_on().
 *
 * RETURNS:
 * Whatever queue_work_on() returns.
 */
int schedule_work_on(int cpu, struct work_struct *work)
{
        return queue_work_on(cpu, system_wq, work);
}
EXPORT_SYMBOL(schedule_work_on);
2692
2693 /**
2694  * schedule_delayed_work - put work task in global workqueue after delay
2695  * @dwork: job to be done
2696  * @delay: number of jiffies to wait or 0 for immediate execution
2697  *
2698  * After waiting for a given time this puts a job in the kernel-global
2699  * workqueue.
2700  */
int schedule_delayed_work(struct delayed_work *dwork,
                                        unsigned long delay)
{
        /* thin wrapper: same as queue_delayed_work() with system_wq */
        return queue_delayed_work(system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);
2707
2708 /**
2709  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2710  * @cpu: cpu to use
2711  * @dwork: job to be done
2712  * @delay: number of jiffies to wait
2713  *
2714  * After waiting for a given time this puts a job in the kernel-global
2715  * workqueue on the specified CPU.
2716  */
int schedule_delayed_work_on(int cpu,
                        struct delayed_work *dwork, unsigned long delay)
{
        /* thin wrapper: same as queue_delayed_work_on() with system_wq */
        return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
2723
2724 /**
2725  * schedule_on_each_cpu - execute a function synchronously on each online CPU
2726  * @func: the function to call
2727  *
2728  * schedule_on_each_cpu() executes @func on each online CPU using the
2729  * system workqueue and blocks until all CPUs have completed.
2730  * schedule_on_each_cpu() is very slow.
2731  *
2732  * RETURNS:
2733  * 0 on success, -errno on failure.
2734  */
2735 int schedule_on_each_cpu(work_func_t func)
2736 {
2737         int cpu;
2738         struct work_struct __percpu *works;
2739
2740         works = alloc_percpu(struct work_struct);
2741         if (!works)
2742                 return -ENOMEM;
2743
2744         get_online_cpus();
2745
2746         for_each_online_cpu(cpu) {
2747                 struct work_struct *work = per_cpu_ptr(works, cpu);
2748
2749                 INIT_WORK(work, func);
2750                 schedule_work_on(cpu, work);
2751         }
2752
2753         for_each_online_cpu(cpu)
2754                 flush_work(per_cpu_ptr(works, cpu));
2755
2756         put_online_cpus();
2757         free_percpu(works);
2758         return 0;
2759 }
2760
2761 /**
2762  * flush_scheduled_work - ensure that any scheduled work has run to completion.
2763  *
2764  * Forces execution of the kernel-global workqueue and blocks until its
2765  * completion.
2766  *
2767  * Think twice before calling this function!  It's very easy to get into
2768  * trouble if you don't take great care.  Either of the following situations
2769  * will lead to deadlock:
2770  *
2771  *      One of the work items currently on the workqueue needs to acquire
2772  *      a lock held by your code or its caller.
2773  *
2774  *      Your code is running in the context of a work routine.
2775  *
2776  * They will be detected by lockdep when they occur, but the first might not
2777  * occur very often.  It depends on what work items are on the workqueue and
2778  * what locks they need, which you have no control over.
2779  *
2780  * In most situations flushing the entire workqueue is overkill; you merely
2781  * need to know that a particular work item isn't queued and isn't running.
2782  * In such cases you should use cancel_delayed_work_sync() or
2783  * cancel_work_sync() instead.
2784  */
void flush_scheduled_work(void)
{
        /* thin wrapper: flush system_wq */
        flush_workqueue(system_wq);
}
EXPORT_SYMBOL(flush_scheduled_work);
2790
2791 /**
2792  * execute_in_process_context - reliably execute the routine with user context
2793  * @fn:         the function to execute
2794  * @ew:         guaranteed storage for the execute work structure (must
2795  *              be available when the work executes)
2796  *
2797  * Executes the function immediately if process context is available,
2798  * otherwise schedules the function for delayed execution.
2799  *
2800  * Returns:     0 - function was executed
2801  *              1 - function was scheduled for execution
2802  */
2803 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
2804 {
2805         if (!in_interrupt()) {
2806                 fn(&ew->work);
2807                 return 0;
2808         }
2809
2810         INIT_WORK(&ew->work, fn);
2811         schedule_work(&ew->work);
2812
2813         return 1;
2814 }
2815 EXPORT_SYMBOL_GPL(execute_in_process_context);
2816
2817 int keventd_up(void)
2818 {
2819         return system_wq != NULL;
2820 }
2821
2822 static int alloc_cwqs(struct workqueue_struct *wq)
2823 {
2824         /*
2825          * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
2826          * Make sure that the alignment isn't lower than that of
2827          * unsigned long long.
2828          */
2829         const size_t size = sizeof(struct cpu_workqueue_struct);
2830         const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2831                                    __alignof__(unsigned long long));
2832 #ifdef CONFIG_SMP
2833         bool percpu = !(wq->flags & WQ_UNBOUND);
2834 #else
2835         bool percpu = false;
2836 #endif
2837
2838         if (percpu)
2839                 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2840         else {
2841                 void *ptr;
2842
2843                 /*
2844                  * Allocate enough room to align cwq and put an extra
2845                  * pointer at the end pointing back to the originally
2846                  * allocated pointer which will be used for free.
2847                  */
2848                 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
2849                 if (ptr) {
2850                         wq->cpu_wq.single = PTR_ALIGN(ptr, align);
2851                         *(void **)(wq->cpu_wq.single + 1) = ptr;
2852                 }
2853         }
2854
2855         /* just in case, make sure it's actually aligned
2856          * - this is affected by PERCPU() alignment in vmlinux.lds.S
2857          */
2858         BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
2859         return wq->cpu_wq.v ? 0 : -ENOMEM;
2860 }
2861
2862 static void free_cwqs(struct workqueue_struct *wq)
2863 {
2864 #ifdef CONFIG_SMP
2865         bool percpu = !(wq->flags & WQ_UNBOUND);
2866 #else
2867         bool percpu = false;
2868 #endif
2869
2870         if (percpu)
2871                 free_percpu(wq->cpu_wq.pcpu);
2872         else if (wq->cpu_wq.single) {
2873                 /* the pointer to free is stored right after the cwq */
2874                 kfree(*(void **)(wq->cpu_wq.single + 1));
2875         }
2876 }
2877
2878 static int wq_clamp_max_active(int max_active, unsigned int flags,
2879                                const char *name)
2880 {
2881         int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
2882
2883         if (max_active < 1 || max_active > lim)
2884                 printk(KERN_WARNING "workqueue: max_active %d requested for %s "
2885                        "is out of range, clamping between %d and %d\n",
2886                        max_active, name, 1, lim);
2887
2888         return clamp_val(max_active, 1, lim);
2889 }
2890
/**
 * __alloc_workqueue_key - allocate and register a workqueue
 * @name: name of the workqueue; the pointer is stored in wq->name and
 *        also used as the rescuer thread name
 * @flags: WQ_* flags; WQ_MEM_RECLAIM implies WQ_RESCUER and WQ_UNBOUND
 *         implies WQ_HIGHPRI
 * @max_active: max_active to use, 0 selects WQ_DFL_ACTIVE; the value is
 *              clamped by wq_clamp_max_active()
 * @key: lock class key handed to lockdep_init_map()
 * @lock_name: lock name handed to lockdep_init_map()
 *
 * Allocates the workqueue and its cwqs, starts a rescuer thread when
 * WQ_RESCUER is set and finally adds the workqueue to the global
 * workqueues list under workqueue_lock.
 *
 * RETURNS:
 * Pointer to the new workqueue on success, %NULL on failure.
 */
struct workqueue_struct *__alloc_workqueue_key(const char *name,
                                               unsigned int flags,
                                               int max_active,
                                               struct lock_class_key *key,
                                               const char *lock_name)
{
        struct workqueue_struct *wq;
        unsigned int cpu;

        /*
         * Workqueues which may be used during memory reclaim should
         * have a rescuer to guarantee forward progress.
         */
        if (flags & WQ_MEM_RECLAIM)
                flags |= WQ_RESCUER;

        /*
         * Unbound workqueues aren't concurrency managed and should be
         * dispatched to workers immediately.
         */
        if (flags & WQ_UNBOUND)
                flags |= WQ_HIGHPRI;

        /* 0 means "use the default"; then force the value into range */
        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, name);

        wq = kzalloc(sizeof(*wq), GFP_KERNEL);
        if (!wq)
                goto err;

        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->flush_mutex);
        atomic_set(&wq->nr_cwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);

        wq->name = name;
        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
        INIT_LIST_HEAD(&wq->list);

        if (alloc_cwqs(wq) < 0)
                goto err;

        /* tie every cwq to its gcwq and to this workqueue */
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                struct global_cwq *gcwq = get_gcwq(cpu);

                /* the low WORK_STRUCT_FLAG_MASK bits of a cwq address must be 0 */
                BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
                cwq->gcwq = gcwq;
                cwq->wq = wq;
                cwq->flush_color = -1;
                cwq->max_active = max_active;
                INIT_LIST_HEAD(&cwq->delayed_works);
        }

        if (flags & WQ_RESCUER) {
                struct worker *rescuer;

                if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
                        goto err;

                wq->rescuer = rescuer = alloc_worker();
                if (!rescuer)
                        goto err;

                rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
                if (IS_ERR(rescuer->task))
                        goto err;

                rescuer->task->flags |= PF_THREAD_BOUND;
                wake_up_process(rescuer->task);
        }

        /*
         * workqueue_lock protects global freeze state and workqueues
         * list.  Grab it, set max_active accordingly and add the new
         * workqueue to workqueues list.
         */
        spin_lock(&workqueue_lock);

        /* a freezable wq created while freezing starts out with max_active 0 */
        if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
                for_each_cwq_cpu(cpu, wq)
                        get_cwq(cpu, wq)->max_active = 0;

        list_add(&wq->list, &workqueues);

        spin_unlock(&workqueue_lock);

        return wq;
err:
        /*
         * Shared error exit.  @wq came from kzalloc(), so members that
         * were never set up are still zero/NULL when they get here.
         */
        if (wq) {
                free_cwqs(wq);
                free_mayday_mask(wq->mayday_mask);
                kfree(wq->rescuer);
                kfree(wq);
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
2991
2992 /**
2993  * destroy_workqueue - safely terminate a workqueue
2994  * @wq: target workqueue
2995  *
2996  * Safely destroy a workqueue. All work currently pending will be done first.
2997  */
void destroy_workqueue(struct workqueue_struct *wq)
{
        unsigned int flush_cnt = 0;
        unsigned int cpu;

        /*
         * Mark @wq dying and drain all pending works.  Once WQ_DYING is
         * set, only chain queueing is allowed.  IOW, only currently
         * pending or running work items on @wq can queue further work
         * items on it.  @wq is flushed repeatedly until it becomes empty.
         * The number of flushing is determined by the depth of chaining and
         * should be relatively short.  Whine if it takes too long.
         */
        wq->flags |= WQ_DYING;
reflush:
        flush_workqueue(wq);

        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);

                /* this cwq is drained, check the next one */
                if (!cwq->nr_active && list_empty(&cwq->delayed_works))
                        continue;

                /* warn on the 10th try and on every 100th up to 1000 */
                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        printk(KERN_WARNING "workqueue %s: flush on "
                               "destruction isn't complete after %u tries\n",
                               wq->name, flush_cnt);
                goto reflush;
        }

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        spin_lock(&workqueue_lock);
        list_del(&wq->list);
        spin_unlock(&workqueue_lock);

        /* sanity check */
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                int i;

                for (i = 0; i < WORK_NR_COLORS; i++)
                        BUG_ON(cwq->nr_in_flight[i]);
                BUG_ON(cwq->nr_active);
                BUG_ON(!list_empty(&cwq->delayed_works));
        }

        /* stop the rescuer thread before freeing what it was set up with */
        if (wq->flags & WQ_RESCUER) {
                kthread_stop(wq->rescuer->task);
                free_mayday_mask(wq->mayday_mask);
                kfree(wq->rescuer);
        }

        free_cwqs(wq);
        kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
3058
3059 /**
3060  * workqueue_set_max_active - adjust max_active of a workqueue
3061  * @wq: target workqueue
3062  * @max_active: new max_active value.
3063  *
3064  * Set max_active of @wq to @max_active.
3065  *
3066  * CONTEXT:
3067  * Don't call from IRQ context.
3068  */
3069 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3070 {
3071         unsigned int cpu;
3072
3073         max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3074
3075         spin_lock(&workqueue_lock);
3076
3077         wq->saved_max_active = max_active;
3078
3079         for_each_cwq_cpu(cpu, wq) {
3080                 struct global_cwq *gcwq = get_gcwq(cpu);
3081
3082                 spin_lock_irq(&gcwq->lock);
3083
3084                 if (!(wq->flags & WQ_FREEZABLE) ||
3085                     !(gcwq->flags & GCWQ_FREEZING))
3086                         get_cwq(gcwq->cpu, wq)->max_active = max_active;
3087
3088                 spin_unlock_irq(&gcwq->lock);
3089         }
3090
3091         spin_unlock(&workqueue_lock);
3092 }
3093 EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3094
3095 /**
3096  * workqueue_congested - test whether a workqueue is congested
3097  * @cpu: CPU in question
3098  * @wq: target workqueue
3099  *
3100  * Test whether @wq's cpu workqueue for @cpu is congested.  There is
3101  * no synchronization around this function and the test result is
3102  * unreliable and only useful as advisory hints or for debugging.
3103  *
3104  * RETURNS:
3105  * %true if congested, %false otherwise.
3106  */
3107 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3108 {
3109         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3110
3111         return !list_empty(&cwq->delayed_works);
3112 }
3113 EXPORT_SYMBOL_GPL(workqueue_congested);
3114
3115 /**
3116  * work_cpu - return the last known associated cpu for @work
3117  * @work: the work of interest
3118  *
3119  * RETURNS:
3120  * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
3121  */
3122 unsigned int work_cpu(struct work_struct *work)
3123 {
3124         struct global_cwq *gcwq = get_work_gcwq(work);
3125
3126         return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3127 }
3128 EXPORT_SYMBOL_GPL(work_cpu);
3129
3130 /**
3131  * work_busy - test whether a work is currently pending or running
3132  * @work: the work to be tested
3133  *
3134  * Test whether @work is currently pending or running.  There is no
3135  * synchronization around this function and the test result is
3136  * unreliable and only useful as advisory hints or for debugging.
3137  * Especially for reentrant wqs, the pending state might hide the
3138  * running state.
3139  *
3140  * RETURNS:
3141  * OR'd bitmask of WORK_BUSY_* bits.
3142  */
3143 unsigned int work_busy(struct work_struct *work)
3144 {
3145         struct global_cwq *gcwq = get_work_gcwq(work);
3146         unsigned long flags;
3147         unsigned int ret = 0;
3148
3149         if (!gcwq)
3150                 return false;
3151
3152         spin_lock_irqsave(&gcwq->lock, flags);
3153
3154         if (work_pending(work))
3155                 ret |= WORK_BUSY_PENDING;
3156         if (find_worker_executing_work(gcwq, work))
3157                 ret |= WORK_BUSY_RUNNING;
3158
3159         spin_unlock_irqrestore(&gcwq->lock, flags);
3160
3161         return ret;
3162 }
3163 EXPORT_SYMBOL_GPL(work_busy);
3164
3165 /*
3166  * CPU hotplug.
3167  *
3168  * There are two challenges in supporting CPU hotplug.  Firstly, there
3169  * are a lot of assumptions on strong associations among work, cwq and
3170  * gcwq which make migrating pending and scheduled works very
3171  * difficult to implement without impacting hot paths.  Secondly,
3172  * gcwqs serve mix of short, long and very long running works making
3173  * blocked draining impractical.
3174  *
3175  * This is solved by allowing a gcwq to be detached from CPU, running
3176  * it with unbound (rogue) workers and allowing it to be reattached
3177  * later if the cpu comes back online.  A separate thread is created
3178  * to govern a gcwq in such state and is called the trustee of the
3179  * gcwq.
3180  *
3181  * Trustee states and their descriptions.
3182  *
3183  * START        Command state used on startup.  On CPU_DOWN_PREPARE, a
3184  *              new trustee is started with this state.
3185  *
3186  * IN_CHARGE    Once started, trustee will enter this state after
3187  *              assuming the manager role and making all existing
3188  *              workers rogue.  DOWN_PREPARE waits for trustee to
3189  *              enter this state.  After reaching IN_CHARGE, trustee
3190  *              tries to execute the pending worklist until it's empty
3191  *              and the state is set to BUTCHER, or the state is set
3192  *              to RELEASE.
3193  *
3194  * BUTCHER      Command state which is set by the cpu callback after
3195  *              the cpu has gone down.  Once this state is set trustee
3196  *              knows that there will be no new works on the worklist
3197  *              and once the worklist is empty it can proceed to
3198  *              killing idle workers.
3199  *
3200  * RELEASE      Command state which is set by the cpu callback if the
3201  *              cpu down has been canceled or it has come online
3202  *              again.  After recognizing this state, trustee stops
3203  *              trying to drain or butcher and clears ROGUE, rebinds
3204  *              all remaining workers back to the cpu and releases
3205  *              manager role.
3206  *
3207  * DONE         Trustee will enter this state after BUTCHER or RELEASE
3208  *              is complete.
3209  *
3210  *          trustee                 CPU                draining
3211  *         took over                down               complete
3212  * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3213  *                        |                     |                  ^
3214  *                        | CPU is back online  v   return workers |
3215  *                         ----------------> RELEASE --------------
3216  */
3217
/**
 * trustee_wait_event_timeout - timed event wait for trustee
 * @cond: condition to wait for
 * @timeout: timeout in jiffies
 *
 * wait_event_timeout() for trustee to use.  Handles locking and
 * checks for RELEASE request.
 *
 * The macro refers to a variable named gcwq which must be in scope at
 * the point of use.  @cond is evaluated multiple times, both with and
 * without gcwq->lock held.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * Positive indicating left time if @cond is satisfied, 0 if timed
 * out, -1 if canceled.
 */
#define trustee_wait_event_timeout(cond, timeout) ({                    \
        long __ret = (timeout);                                         \
        while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
               __ret) {                                                 \
                spin_unlock_irq(&gcwq->lock);                           \
                __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
                        (gcwq->trustee_state == TRUSTEE_RELEASE),       \
                        __ret);                                         \
                spin_lock_irq(&gcwq->lock);                             \
        }                                                               \
        gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
})
3246
/**
 * trustee_wait_event - event wait for trustee
 * @cond: condition to wait for
 *
 * wait_event() for trustee to use.  Automatically handles locking and
 * checks for RELEASE request.  Implemented as
 * trustee_wait_event_timeout() with MAX_SCHEDULE_TIMEOUT.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * 0 if @cond is satisfied, -1 if canceled.
 */
#define trustee_wait_event(cond) ({                                     \
        long __ret1;                                                    \
        __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
        __ret1 < 0 ? -1 : 0;                                            \
})
3266
/**
 * trustee_thread - take over a gcwq while its cpu is going down
 * @__gcwq: the struct global_cwq to take charge of
 *
 * Thread function of the trustee kthread which workqueue_cpu_callback()
 * creates on CPU_DOWN_PREPARE.  In order, it:
 *
 * 1. claims the manager role and marks every worker WORKER_ROGUE,
 * 2. zeroes nr_running and stops the idle timer,
 * 3. announces TRUSTEE_IN_CHARGE and keeps draining the worklist,
 * 4. destroys idle workers as they show up,
 * 5. schedules rebind_work on the workers which are still busy, and
 * 6. drops the manager role and announces TRUSTEE_DONE.
 *
 * CONTEXT:
 * Must start running on gcwq->cpu (enforced by BUG_ON()).  Grabs
 * gcwq->lock itself and releases/regrabs it multiple times.
 *
 * RETURNS:
 * Always 0.
 */
3267 static int __cpuinit trustee_thread(void *__gcwq)
3268 {
3269         struct global_cwq *gcwq = __gcwq;
3270         struct worker *worker;
3271         struct work_struct *work;
3272         struct hlist_node *pos;
3273         long rc;
3274         int i;
3275
3276         BUG_ON(gcwq->cpu != smp_processor_id());
3277
3278         spin_lock_irq(&gcwq->lock);
3279         /*
3280          * Claim the manager position and make all workers rogue.
3281          * Trustee must be bound to the target cpu and can't be
3282          * cancelled.
3283          */
3284         BUG_ON(gcwq->cpu != smp_processor_id());
3285         rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3286         BUG_ON(rc < 0);
3287
3288         gcwq->flags |= GCWQ_MANAGING_WORKERS;
3289
3290         list_for_each_entry(worker, &gcwq->idle_list, entry)
3291                 worker->flags |= WORKER_ROGUE;
3292
3293         for_each_busy_worker(worker, i, pos, gcwq)
3294                 worker->flags |= WORKER_ROGUE;
3295
3296         /*
3297          * Call schedule() so that we cross rq->lock and thus can
3298          * guarantee sched callbacks see the rogue flag.  This is
3299          * necessary as scheduler callbacks may be invoked from other
3300          * cpus.
3301          */
3302         spin_unlock_irq(&gcwq->lock);
3303         schedule();
3304         spin_lock_irq(&gcwq->lock);
3305
3306         /*
3307          * Sched callbacks are disabled now.  Zap nr_running.  After
3308          * this, nr_running stays zero and need_more_worker() and
3309          * keep_working() are always true as long as the worklist is
3310          * not empty.
3311          */
3312         atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
3313
             /* del_timer_sync() is called with gcwq->lock dropped */
3314         spin_unlock_irq(&gcwq->lock);
3315         del_timer_sync(&gcwq->idle_timer);
3316         spin_lock_irq(&gcwq->lock);
3317
3318         /*
3319          * We're now in charge.  Notify and proceed to drain.  We need
3320          * to keep the gcwq running during the whole CPU down
3321          * procedure as other cpu hotunplug callbacks may need to
3322          * flush currently running tasks.
3323          */
3324         gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3325         wake_up_all(&gcwq->trustee_wait);
3326
3327         /*
3328          * The original cpu is in the process of dying and may go away
3329          * anytime now.  When that happens, we and all workers would
3330          * be migrated to other cpus.  Try draining any left work.  We
3331          * want to get it over with ASAP - spam rescuers, wake up as
3332          * many idlers as necessary and create new ones till the
3333          * worklist is empty.  Note that if the gcwq is frozen, there
3334          * may be frozen works in freezable cwqs.  Don't declare
3335          * completion while frozen.
3336          */
3337         while (gcwq->nr_workers != gcwq->nr_idle ||
3338                gcwq->flags & GCWQ_FREEZING ||
3339                gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
3340                 int nr_works = 0;
3341
                     /* call for rescuers and count the pending works */
3342                 list_for_each_entry(work, &gcwq->worklist, entry) {
3343                         send_mayday(work);
3344                         nr_works++;
3345                 }
3346
                     /* wake at most one idle worker per pending work */
3347                 list_for_each_entry(worker, &gcwq->idle_list, entry) {
3348                         if (!nr_works--)
3349                                 break;
3350                         wake_up_process(worker->task);
3351                 }
3352
                     /* create_worker() is called with gcwq->lock dropped */
3353                 if (need_to_create_worker(gcwq)) {
3354                         spin_unlock_irq(&gcwq->lock);
3355                         worker = create_worker(gcwq, false);
3356                         spin_lock_irq(&gcwq->lock);
3357                         if (worker) {
3358                                 worker->flags |= WORKER_ROGUE;
3359                                 start_worker(worker);
3360                         }
3361                 }
3362
                     /*
                      * The wait condition is constant false; a negative
                      * return (presumably cancellation of the wait, see
                      * trustee_wait_event_timeout()) ends the draining.
                      */
3363                 /* give a breather */
3364                 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3365                         break;
3366         }
3367
3368         /*
3369          * Either all works have been scheduled and cpu is down, or
3370          * cpu down has already been canceled.  Wait for and butcher
3371          * all workers till we're canceled.
3372          */
3373         do {
3374                 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3375                 while (!list_empty(&gcwq->idle_list))
3376                         destroy_worker(list_first_entry(&gcwq->idle_list,
3377                                                         struct worker, entry));
3378         } while (gcwq->nr_workers && rc >= 0);
3379
3380         /*
3381          * At this point, either draining has completed and no worker
3382          * is left, or cpu down has been canceled or the cpu is being
3383          * brought back up.  There shouldn't be any idle one left.
3384          * Tell the remaining busy ones to rebind once they finish the
3385          * currently scheduled works by scheduling the rebind_work.
3386          */
3387         WARN_ON(!list_empty(&gcwq->idle_list));
3388
3389         for_each_busy_worker(worker, i, pos, gcwq) {
3390                 struct work_struct *rebind_work = &worker->rebind_work;
3391
3392                 /*
3393                  * Rebind_work may race with future cpu hotplug
3394                  * operations.  Use a separate flag to mark that
3395                  * rebinding is scheduled.
3396                  */
3397                 worker->flags |= WORKER_REBIND;
3398                 worker->flags &= ~WORKER_ROGUE;
3399
3400                 /* queue rebind_work, wq doesn't matter, use the default one */
                     /* already pending means it is already queued, skip it */
3401                 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3402                                      work_data_bits(rebind_work)))
3403                         continue;
3404
3405                 debug_work_activate(rebind_work);
3406                 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3407                             worker->scheduled.next,
3408                             work_color_to_flags(WORK_NO_COLOR));
3409         }
3410
3411         /* relinquish manager role */
3412         gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3413
3414         /* notify completion */
3415         gcwq->trustee = NULL;
3416         gcwq->trustee_state = TRUSTEE_DONE;
3417         wake_up_all(&gcwq->trustee_wait);
3418         spin_unlock_irq(&gcwq->lock);
3419         return 0;
3420 }
3421
3422 /**
3423  * wait_trustee_state - wait for trustee to enter the specified state
3424  * @gcwq: gcwq the trustee of interest belongs to
3425  * @state: target state to wait for
3426  *
3427  * Wait for the trustee to reach @state.  DONE is already matched.
3428  *
3429  * CONTEXT:
3430  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3431  * multiple times.  To be used by cpu_callback.
3432  */
3433 static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
3434 __releases(&gcwq->lock)
3435 __acquires(&gcwq->lock)
3436 {
3437         if (!(gcwq->trustee_state == state ||
3438               gcwq->trustee_state == TRUSTEE_DONE)) {
3439                 spin_unlock_irq(&gcwq->lock);
3440                 __wait_event(gcwq->trustee_wait,
3441                              gcwq->trustee_state == state ||
3442                              gcwq->trustee_state == TRUSTEE_DONE);
3443                 spin_lock_irq(&gcwq->lock);
3444         }
3445 }
3446
3447 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3448                                                 unsigned long action,
3449                                                 void *hcpu)
3450 {
3451         unsigned int cpu = (unsigned long)hcpu;
3452         struct global_cwq *gcwq = get_gcwq(cpu);
3453         struct task_struct *new_trustee = NULL;
3454         struct worker *uninitialized_var(new_worker);
3455         unsigned long flags;
3456
3457         action &= ~CPU_TASKS_FROZEN;
3458
3459         switch (action) {
3460         case CPU_DOWN_PREPARE:
3461                 new_trustee = kthread_create(trustee_thread, gcwq,
3462                                              "workqueue_trustee/%d\n", cpu);
3463                 if (IS_ERR(new_trustee))
3464                         return notifier_from_errno(PTR_ERR(new_trustee));
3465                 kthread_bind(new_trustee, cpu);
3466                 /* fall through */
3467         case CPU_UP_PREPARE:
3468                 BUG_ON(gcwq->first_idle);
3469                 new_worker = create_worker(gcwq, false);
3470                 if (!new_worker) {
3471                         if (new_trustee)
3472                                 kthread_stop(new_trustee);
3473                         return NOTIFY_BAD;
3474                 }
3475         }
3476
3477         /* some are called w/ irq disabled, don't disturb irq status */
3478         spin_lock_irqsave(&gcwq->lock, flags);
3479
3480         switch (action) {
3481         case CPU_DOWN_PREPARE:
3482                 /* initialize trustee and tell it to acquire the gcwq */
3483                 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3484                 gcwq->trustee = new_trustee;
3485                 gcwq->trustee_state = TRUSTEE_START;
3486                 wake_up_process(gcwq->trustee);
3487                 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3488                 /* fall through */
3489         case CPU_UP_PREPARE:
3490                 BUG_ON(gcwq->first_idle);
3491                 gcwq->first_idle = new_worker;
3492                 break;
3493
3494         case CPU_DYING:
3495                 /*
3496                  * Before this, the trustee and all workers except for
3497                  * the ones which are still executing works from
3498                  * before the last CPU down must be on the cpu.  After
3499                  * this, they'll all be diasporas.
3500                  */
3501                 gcwq->flags |= GCWQ_DISASSOCIATED;
3502                 break;
3503
3504         case CPU_POST_DEAD:
3505                 gcwq->trustee_state = TRUSTEE_BUTCHER;
3506                 /* fall through */
3507         case CPU_UP_CANCELED:
3508                 destroy_worker(gcwq->first_idle);
3509                 gcwq->first_idle = NULL;
3510                 break;
3511
3512         case CPU_DOWN_FAILED:
3513         case CPU_ONLINE:
3514                 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3515                 if (gcwq->trustee_state != TRUSTEE_DONE) {
3516                         gcwq->trustee_state = TRUSTEE_RELEASE;
3517                         wake_up_process(gcwq->trustee);
3518                         wait_trustee_state(gcwq, TRUSTEE_DONE);
3519                 }
3520
3521                 /*
3522                  * Trustee is done and there might be no worker left.
3523                  * Put the first_idle in and request a real manager to
3524                  * take a look.
3525                  */
3526                 spin_unlock_irq(&gcwq->lock);
3527                 kthread_bind(gcwq->first_idle->task, cpu);
3528                 spin_lock_irq(&gcwq->lock);
3529                 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3530                 start_worker(gcwq->first_idle);
3531                 gcwq->first_idle = NULL;
3532                 break;
3533         }
3534
3535         spin_unlock_irqrestore(&gcwq->lock, flags);
3536
3537         return notifier_from_errno(0);
3538 }
3539
3540 #ifdef CONFIG_SMP
3541
3542 struct work_for_cpu {
3543         struct completion completion;
3544         long (*fn)(void *);
3545         void *arg;
3546         long ret;
3547 };
3548
3549 static int do_work_for_cpu(void *_wfc)
3550 {
3551         struct work_for_cpu *wfc = _wfc;
3552         wfc->ret = wfc->fn(wfc->arg);
3553         complete(&wfc->completion);
3554         return 0;
3555 }
3556
3557 /**
3558  * work_on_cpu - run a function in user context on a particular cpu
3559  * @cpu: the cpu to run on
3560  * @fn: the function to run
3561  * @arg: the function arg
3562  *
3563  * This will return the value @fn returns.
3564  * It is up to the caller to ensure that the cpu doesn't go offline.
3565  * The caller must not hold any locks which would prevent @fn from completing.
3566  */
3567 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
3568 {
3569         struct task_struct *sub_thread;
3570         struct work_for_cpu wfc = {
3571                 .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
3572                 .fn = fn,
3573                 .arg = arg,
3574         };
3575
3576         sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
3577         if (IS_ERR(sub_thread))
3578                 return PTR_ERR(sub_thread);
3579         kthread_bind(sub_thread, cpu);
3580         wake_up_process(sub_thread);
3581         wait_for_completion(&wfc.completion);
3582         return wfc.ret;
3583 }
3584 EXPORT_SYMBOL_GPL(work_on_cpu);
3585 #endif /* CONFIG_SMP */
3586
3587 #ifdef CONFIG_FREEZER
3588
3589 /**
3590  * freeze_workqueues_begin - begin freezing workqueues
3591  *
3592  * Start freezing workqueues.  After this function returns, all freezable
3593  * workqueues will queue new works to their frozen_works list instead of
3594  * gcwq->worklist.
3595  *
3596  * CONTEXT:
3597  * Grabs and releases workqueue_lock and gcwq->lock's.
3598  */
3599 void freeze_workqueues_begin(void)
3600 {
3601         unsigned int cpu;
3602
3603         spin_lock(&workqueue_lock);
3604
3605         BUG_ON(workqueue_freezing);
3606         workqueue_freezing = true;
3607
3608         for_each_gcwq_cpu(cpu) {
3609                 struct global_cwq *gcwq = get_gcwq(cpu);
3610                 struct workqueue_struct *wq;
3611
3612                 spin_lock_irq(&gcwq->lock);
3613
3614                 BUG_ON(gcwq->flags & GCWQ_FREEZING);
3615                 gcwq->flags |= GCWQ_FREEZING;
3616
3617                 list_for_each_entry(wq, &workqueues, list) {
3618                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3619
3620                         if (cwq && wq->flags & WQ_FREEZABLE)
3621                                 cwq->max_active = 0;
3622                 }
3623
3624                 spin_unlock_irq(&gcwq->lock);
3625         }
3626
3627         spin_unlock(&workqueue_lock);
3628 }
3629
3630 /**
3631  * freeze_workqueues_busy - are freezable workqueues still busy?
3632  *
3633  * Check whether freezing is complete.  This function must be called
3634  * between freeze_workqueues_begin() and thaw_workqueues().
3635  *
3636  * CONTEXT:
3637  * Grabs and releases workqueue_lock.
3638  *
3639  * RETURNS:
3640  * %true if some freezable workqueues are still busy.  %false if freezing
3641  * is complete.
3642  */
3643 bool freeze_workqueues_busy(void)
3644 {
3645         unsigned int cpu;
3646         bool busy = false;
3647
3648         spin_lock(&workqueue_lock);
3649
3650         BUG_ON(!workqueue_freezing);
3651
3652         for_each_gcwq_cpu(cpu) {
3653                 struct workqueue_struct *wq;
3654                 /*
3655                  * nr_active is monotonically decreasing.  It's safe
3656                  * to peek without lock.
3657                  */
3658                 list_for_each_entry(wq, &workqueues, list) {
3659                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3660
3661                         if (!cwq || !(wq->flags & WQ_FREEZABLE))
3662                                 continue;
3663
3664                         BUG_ON(cwq->nr_active < 0);
3665                         if (cwq->nr_active) {
3666                                 busy = true;
3667                                 goto out_unlock;
3668                         }
3669                 }
3670         }
3671 out_unlock:
3672         spin_unlock(&workqueue_lock);
3673         return busy;
3674 }
3675
3676 /**
3677  * thaw_workqueues - thaw workqueues
3678  *
3679  * Thaw workqueues.  Normal queueing is restored and all collected
3680  * frozen works are transferred to their respective gcwq worklists.
3681  *
3682  * CONTEXT:
3683  * Grabs and releases workqueue_lock and gcwq->lock's.
3684  */
3685 void thaw_workqueues(void)
3686 {
3687         unsigned int cpu;
3688
3689         spin_lock(&workqueue_lock);
3690
3691         if (!workqueue_freezing)
3692                 goto out_unlock;
3693
3694         for_each_gcwq_cpu(cpu) {
3695                 struct global_cwq *gcwq = get_gcwq(cpu);
3696                 struct workqueue_struct *wq;
3697
3698                 spin_lock_irq(&gcwq->lock);
3699
3700                 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
3701                 gcwq->flags &= ~GCWQ_FREEZING;
3702
3703                 list_for_each_entry(wq, &workqueues, list) {
3704                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3705
3706                         if (!cwq || !(wq->flags & WQ_FREEZABLE))
3707                                 continue;
3708
3709                         /* restore max_active and repopulate worklist */
3710                         cwq->max_active = wq->saved_max_active;
3711
3712                         while (!list_empty(&cwq->delayed_works) &&
3713                                cwq->nr_active < cwq->max_active)
3714                                 cwq_activate_first_delayed(cwq);
3715                 }
3716
3717                 wake_up_worker(gcwq);
3718
3719                 spin_unlock_irq(&gcwq->lock);
3720         }
3721
3722         workqueue_freezing = false;
3723 out_unlock:
3724         spin_unlock(&workqueue_lock);
3725 }
3726 #endif /* CONFIG_FREEZER */
3727
3728 static int __init init_workqueues(void)
3729 {
3730         unsigned int cpu;
3731         int i;
3732
3733         cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
3734
3735         /* initialize gcwqs */
3736         for_each_gcwq_cpu(cpu) {
3737                 struct global_cwq *gcwq = get_gcwq(cpu);
3738
3739                 spin_lock_init(&gcwq->lock);
3740                 INIT_LIST_HEAD(&gcwq->worklist);
3741                 gcwq->cpu = cpu;
3742                 gcwq->flags |= GCWQ_DISASSOCIATED;
3743
3744                 INIT_LIST_HEAD(&gcwq->idle_list);
3745                 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3746                         INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3747
3748                 init_timer_deferrable(&gcwq->idle_timer);
3749                 gcwq->idle_timer.function = idle_worker_timeout;
3750                 gcwq->idle_timer.data = (unsigned long)gcwq;
3751
3752                 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
3753                             (unsigned long)gcwq);
3754
3755                 ida_init(&gcwq->worker_ida);
3756
3757                 gcwq->trustee_state = TRUSTEE_DONE;
3758                 init_waitqueue_head(&gcwq->trustee_wait);
3759         }
3760
3761         /* create the initial worker */
3762         for_each_online_gcwq_cpu(cpu) {
3763                 struct global_cwq *gcwq = get_gcwq(cpu);
3764                 struct worker *worker;
3765
3766                 if (cpu != WORK_CPU_UNBOUND)
3767                         gcwq->flags &= ~GCWQ_DISASSOCIATED;
3768                 worker = create_worker(gcwq, true);
3769                 BUG_ON(!worker);
3770                 spin_lock_irq(&gcwq->lock);
3771                 start_worker(worker);
3772                 spin_unlock_irq(&gcwq->lock);
3773         }
3774
3775         system_wq = alloc_workqueue("events", 0, 0);
3776         system_long_wq = alloc_workqueue("events_long", 0, 0);
3777         system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3778         system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3779                                             WQ_UNBOUND_MAX_ACTIVE);
3780         system_freezeable_wq = alloc_workqueue("events_freezeable",
3781                                                WQ_FREEZEABLE, 0);
3782         BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3783                !system_unbound_wq || !system_freezeable_wq);
3784         return 0;
3785 }
3786 early_initcall(init_workqueues);