make sure that kernel_thread() callbacks call do_exit() themselves
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a106360683dfef46797952cdf1861e1..3c5a79e2134cac6f404673093a84df5c8b371d80 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
 
        /* pool flags */
        POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
+       POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
 
        /* worker flags */
        WORKER_STARTED          = 1 << 0,       /* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
 /* Do we have too many workers and should some go away? */
 static bool too_many_workers(struct worker_pool *pool)
 {
-       bool managing = mutex_is_locked(&pool->manager_mutex);
+       bool managing = pool->flags & POOL_MANAGING_WORKERS;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;
 
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)
 
        /* we did our part, wait for rebind_workers() to finish up */
        wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+       /*
+        * rebind_workers() shouldn't finish until all workers have passed
+        * the above WORKER_REBIND wait.  Tell it when done.
+        */
+       spin_lock_irq(&worker->pool->gcwq->lock);
+       if (!--worker->idle_rebind->cnt)
+               complete(&worker->idle_rebind->done);
+       spin_unlock_irq(&worker->pool->gcwq->lock);
 }
 
 /*
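The new decrement pairs with an on-stack countdown primed by rebind_workers(): cnt starts at one (dropped by rebind_workers() itself after the wake-up, see the later hunk) plus one per idle worker, and whoever brings it to zero fires the completion.  The shared bookkeeping looks roughly like the sketch below (field comments paraphrased, not a verbatim copy of the struct in workqueue.c); holding gcwq->lock around the --cnt/complete() pair keeps the decrement and the completion atomic with respect to the other idle workers on the same gcwq.

        #include <linux/completion.h>

        struct idle_rebind {
                struct completion       done;   /* fired when cnt reaches 0 */
                int                     cnt;    /* 1 + number of idle workers */
        };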
@@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work)
        struct worker *worker = container_of(work, struct worker, rebind_work);
        struct global_cwq *gcwq = worker->pool->gcwq;
 
-       if (worker_maybe_bind_and_lock(worker))
-               worker_clr_flags(worker, WORKER_REBIND);
+       worker_maybe_bind_and_lock(worker);
+
+       /*
+        * %WORKER_REBIND must be cleared even if the above binding failed;
+        * otherwise, we may confuse the next CPU_UP cycle, or oops or get
+        * stuck by calling idle_worker_rebind() prematurely.  If the CPU went
+        * down again in between, %WORKER_UNBOUND would be set, so clearing
+        * %WORKER_REBIND is always safe.
+        */
+       worker_clr_flags(worker, WORKER_REBIND);
 
        spin_unlock_irq(&gcwq->lock);
 }
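Previously the flag was cleared only when binding succeeded, so a failed rebind could leave %WORKER_REBIND set forever and strand the worker at idle_worker_rebind()'s wait_event() on the next hotplug cycle.  The safety argument in the new comment can be checked in a few lines; the flag values below are illustrative stand-ins, not copied from the worker flags enum.

        #include <stdio.h>

        #define WORKER_REBIND   (1 << 5)
        #define WORKER_UNBOUND  (1 << 7)

        int main(void)
        {
                /* bind failed and the CPU went down again in between */
                unsigned int flags = WORKER_REBIND | WORKER_UNBOUND;

                flags &= ~WORKER_REBIND;        /* the unconditional clear above */
                printf("still unbound: %s\n", flags & WORKER_UNBOUND ? "yes" : "no");
                return 0;
        }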
@@ -1396,12 +1414,15 @@ retry:
        /* set REBIND and kick idle ones, we'll wait for these later */
        for_each_worker_pool(pool, gcwq) {
                list_for_each_entry(worker, &pool->idle_list, entry) {
+                       unsigned long worker_flags = worker->flags;
+
                        if (worker->flags & WORKER_REBIND)
                                continue;
 
-                       /* morph UNBOUND to REBIND */
-                       worker->flags &= ~WORKER_UNBOUND;
-                       worker->flags |= WORKER_REBIND;
+                       /* morph UNBOUND to REBIND atomically */
+                       worker_flags &= ~WORKER_UNBOUND;
+                       worker_flags |= WORKER_REBIND;
+                       ACCESS_ONCE(worker->flags) = worker_flags;
 
                        idle_rebind.cnt++;
                        worker->idle_rebind = &idle_rebind;
@@ -1419,25 +1440,15 @@ retry:
                goto retry;
        }
 
-       /*
-        * All idle workers are rebound and waiting for %WORKER_REBIND to
-        * be cleared inside idle_worker_rebind().  Clear and release.
-        * Clearing %WORKER_REBIND from this foreign context is safe
-        * because these workers are still guaranteed to be idle.
-        */
-       for_each_worker_pool(pool, gcwq)
-               list_for_each_entry(worker, &pool->idle_list, entry)
-                       worker->flags &= ~WORKER_REBIND;
-
-       wake_up_all(&gcwq->rebind_hold);
-
-       /* rebind busy workers */
+       /* all idle workers are rebound, rebind busy workers */
        for_each_busy_worker(worker, i, pos, gcwq) {
                struct work_struct *rebind_work = &worker->rebind_work;
+               unsigned long worker_flags = worker->flags;
 
-               /* morph UNBOUND to REBIND */
-               worker->flags &= ~WORKER_UNBOUND;
-               worker->flags |= WORKER_REBIND;
+               /* morph UNBOUND to REBIND atomically */
+               worker_flags &= ~WORKER_UNBOUND;
+               worker_flags |= WORKER_REBIND;
+               ACCESS_ONCE(worker->flags) = worker_flags;
 
                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                     work_data_bits(rebind_work)))
@@ -1449,6 +1460,34 @@ retry:
                            worker->scheduled.next,
                            work_color_to_flags(WORK_NO_COLOR));
        }
+
+       /*
+        * All idle workers are rebound and waiting for %WORKER_REBIND to
+        * be cleared inside idle_worker_rebind().  Clear and release.
+        * Clearing %WORKER_REBIND from this foreign context is safe
+        * because these workers are still guaranteed to be idle.
+        *
+        * We also need to make sure all idle workers have passed the
+        * WORKER_REBIND wait in idle_worker_rebind() before returning;
+        * otherwise, workers can get stuck there if the hotplug cycle repeats.
+        */
+       idle_rebind.cnt = 1;
+       INIT_COMPLETION(idle_rebind.done);
+
+       for_each_worker_pool(pool, gcwq) {
+               list_for_each_entry(worker, &pool->idle_list, entry) {
+                       worker->flags &= ~WORKER_REBIND;
+                       idle_rebind.cnt++;
+               }
+       }
+
+       wake_up_all(&gcwq->rebind_hold);
+
+       if (--idle_rebind.cnt) {
+               spin_unlock_irq(&gcwq->lock);
+               wait_for_completion(&idle_rebind.done);
+               spin_lock_irq(&gcwq->lock);
+       }
 }
 
 static struct worker *alloc_worker(void)
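Taken together, the reordered tail of rebind_workers() is a two-phase handshake: release first (clear %WORKER_REBIND under gcwq->lock, then wake_up_all()), then block until every idle worker confirms it is past the wait, so a back-to-back hotplug cycle cannot re-enter this path while stragglers are still inside idle_worker_rebind().  Below is a compilable userspace analogue, assuming POSIX threads; all names are illustrative, and where the kernel primes cnt to 1 and drops it itself, this sketch simply counts the workers.

        #include <pthread.h>
        #include <stdio.h>

        #define NR_WORKERS 4

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t release_cv = PTHREAD_COND_INITIALIZER;   /* rebind_hold */
        static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;      /* idle_rebind.done */
        static int released;    /* stands in for clearing WORKER_REBIND */
        static int cnt;         /* stands in for idle_rebind.cnt */

        static void *worker_fn(void *arg)
        {
                pthread_mutex_lock(&lock);
                while (!released)                       /* wait_event(rebind_hold, ...) */
                        pthread_cond_wait(&release_cv, &lock);
                if (--cnt == 0)                         /* complete(&idle_rebind->done) */
                        pthread_cond_signal(&done_cv);
                pthread_mutex_unlock(&lock);
                return NULL;
        }

        int main(void)
        {
                pthread_t tid[NR_WORKERS];
                int i;

                cnt = NR_WORKERS;
                for (i = 0; i < NR_WORKERS; i++)
                        pthread_create(&tid[i], NULL, worker_fn, NULL);

                pthread_mutex_lock(&lock);
                released = 1;                           /* worker->flags &= ~WORKER_REBIND */
                pthread_cond_broadcast(&release_cv);    /* wake_up_all(&gcwq->rebind_hold) */
                while (cnt)                             /* wait_for_completion(...) */
                        pthread_cond_wait(&done_cv, &lock);
                pthread_mutex_unlock(&lock);

                for (i = 0; i < NR_WORKERS; i++)
                        pthread_join(tid[i], NULL);
                puts("all workers passed the rebind wait");
                return 0;
        }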
@@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker)
        struct worker_pool *pool = worker->pool;
        bool ret = false;
 
-       if (!mutex_trylock(&pool->manager_mutex))
+       if (pool->flags & POOL_MANAGING_WORKERS)
                return ret;
 
+       pool->flags |= POOL_MANAGING_WORKERS;
+
+       /*
+        * To simplify both worker management and CPU hotplug, hold off
+        * management while hotplug is in progress.  CPU hotplug path can't
+        * grab %POOL_MANAGING_WORKERS to achieve this because that can
+        * lead to idle worker depletion (all become busy thinking someone
+        * else is managing) which in turn can result in deadlock under
+        * extreme circumstances.  Use @pool->manager_mutex to synchronize
+        * manager against CPU hotplug.
+        *
+        * manager_mutex would always be free unless CPU hotplug is in
+        * progress.  trylock first without dropping @gcwq->lock.
+        */
+       if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+               spin_unlock_irq(&pool->gcwq->lock);
+               mutex_lock(&pool->manager_mutex);
+               /*
+                * CPU hotplug could have happened while we were waiting
+                * for manager_mutex.  Hotplug itself can't handle us
+                * because manager isn't either on idle or busy list, and
+                * @gcwq's state and ours could have deviated.
+                *
+                * As hotplug is now excluded via manager_mutex, we can
+                * simply try to bind.  It will succeed or fail depending
+                * on @gcwq's current state.  Try it and adjust
+                * %WORKER_UNBOUND accordingly.
+                */
+               if (worker_maybe_bind_and_lock(worker))
+                       worker->flags &= ~WORKER_UNBOUND;
+               else
+                       worker->flags |= WORKER_UNBOUND;
+
+               ret = true;
+       }
+
        pool->flags &= ~POOL_MANAGE_WORKERS;
 
        /*
@@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker)
        ret |= maybe_destroy_workers(pool);
        ret |= maybe_create_worker(pool);
 
+       pool->flags &= ~POOL_MANAGING_WORKERS;
        mutex_unlock(&pool->manager_mutex);
        return ret;
 }
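The slow path above is a standard lock-ordering dance: manager_mutex must be taken before gcwq->lock, but manage_workers() arrives already holding gcwq->lock.  Reduced to its shape (a sketch under those assumptions, not the kernel code):

        #include <linux/mutex.h>
        #include <linux/spinlock.h>

        static void lock_outer_with_inner_held(struct mutex *outer, spinlock_t *inner)
        {
                if (likely(mutex_trylock(outer)))
                        return;                 /* fast path: @inner never dropped */

                spin_unlock_irq(inner);
                mutex_lock(outer);              /* may sleep */
                spin_lock_irq(inner);           /* guarded state may have changed */
        }

Because gcwq->lock was dropped, everything it guards may have deviated, which is why the hunk above rechecks the binding with worker_maybe_bind_and_lock() and reports ret = true so the caller knows to re-examine its state.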
@@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-       struct completion completion;
+       struct work_struct work;
        long (*fn)(void *);
        void *arg;
        long ret;
 };
 
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
 {
-       struct work_for_cpu *wfc = _wfc;
+       struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
        wfc->ret = wfc->fn(wfc->arg);
-       complete(&wfc->completion);
-       return 0;
 }
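The conversion works because a fixed-signature callback can recover everything it needs from the address of the embedded work item: arguments and the return slot travel on the caller's stack inside struct work_for_cpu.  The same pattern in a freestanding userspace form (the classic offsetof() formulation, not copied from the kernel headers):

        #include <stddef.h>
        #include <stdio.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct wrapper {
                int cookie;
                long member;    /* stands in for the embedded work_struct */
        };

        int main(void)
        {
                struct wrapper w = { .cookie = 42 };
                long *inner = &w.member;        /* all a callback would receive */
                struct wrapper *back = container_of(inner, struct wrapper, member);

                printf("recovered cookie: %d\n", back->cookie);
                return 0;
        }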
 
 /**
@@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-       struct task_struct *sub_thread;
-       struct work_for_cpu wfc = {
-               .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
-               .fn = fn,
-               .arg = arg,
-       };
+       struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-       sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
-       if (IS_ERR(sub_thread))
-               return PTR_ERR(sub_thread);
-       kthread_bind(sub_thread, cpu);
-       wake_up_process(sub_thread);
-       wait_for_completion(&wfc.completion);
+       INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+       schedule_work_on(cpu, &wfc.work);
+       flush_work(&wfc.work);
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
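For completeness, a hedged usage sketch of the reworked work_on_cpu(); probe_fn and probe_cpu are illustrative, not part of this patch, and callers are still expected to keep @cpu online across the call (for instance under get_online_cpus()).

        #include <linux/smp.h>
        #include <linux/workqueue.h>

        /* illustrative callback: report which CPU actually ran us */
        static long probe_fn(void *arg)
        {
                return (long)smp_processor_id();        /* stable: the worker is bound to @cpu */
        }

        static long probe_cpu(unsigned int cpu)
        {
                /* queues on @cpu's system workqueue and waits via flush_work() */
                return work_on_cpu(cpu, probe_fn, NULL);
        }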