2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
19 #include <linux/stackprotector.h>
20 #include <linux/cpu.h>
21 #include <linux/errno.h>
22 #include <linux/sched.h>
24 #include <linux/kernel.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 #include <linux/prctl.h>
42 #include <asm/uaccess.h>
43 #include <asm/pgtable.h>
44 #include <asm/system.h>
46 #include <asm/processor.h>
48 #include <asm/mmu_context.h>
50 #include <asm/prctl.h>
52 #include <asm/proto.h>
56 asmlinkage extern void ret_from_fork(void);
/*
 * NOTE(review): this excerpt is missing many intervening source lines; the
 * stray leading numbers on each line are artifacts of the extraction.
 */
/* Default clone flags applied when creating kernel threads. */
58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
/* Presumably set non-zero by "idle=" boot-option handling (elided) -- TODO confirm. */
60 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override);
64 * Powermanagement idle function, if any..
/* Optional power-management idle hook; callers elsewhere select it over default_idle. */
66 void (*pm_idle)(void);
67 EXPORT_SYMBOL(pm_idle);
/* Notifier chain fired on idle entry/exit (IDLE_START / IDLE_END below). */
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
/* Register a callback on the idle entry/exit notifier chain. */
71 void idle_notifier_register(struct notifier_block *n)
73 atomic_notifier_chain_register(&idle_notifier, n);
/*
 * NOTE(review): the next line belongs to an idle-entry helper whose
 * signature is elided from this excerpt; it fires IDLE_START notifications.
 */
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
/* Leave idle: fire IDLE_END exactly once, only if the per-CPU idle bit was set. */
82 static void __exit_idle(void)
/* Bit 0 of the PDA "isidle" field tracks whether this CPU is in idle. */
84 if (test_and_clear_bit_pda(0, isidle) == 0)
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
89 /* Called from interrupts to signify idle end */
/* NOTE(review): the exit_idle() wrapper these comments document is elided here. */
92 /* idle loop has pid 0 */
99 * We use this if we don't have any better
/*
 * Default idle routine: clear TS_POLLING so wakers know they must send an
 * IPI to rouse us, then halt until the next interrupt.
 */
102 void default_idle(void)
104 current_thread_info()->status &= ~TS_POLLING;
106 * TS_POLLING-cleared state must be visible before we
/* NOTE(review): the memory barrier this comment refers to is elided here. */
111 safe_halt(); /* enables interrupts racelessly */
/* Back from halt: advertise polling again so a reschedule flag suffices to wake us. */
114 current_thread_info()->status |= TS_POLLING;
117 #ifdef CONFIG_HOTPLUG_CPU
118 DECLARE_PER_CPU(int, cpu_state);
121 /* We halt the CPU with physical CPU hotplug */
/* Offline path: mark this CPU dead; the actual halt loop is elided from this excerpt. */
122 static inline void play_dead(void)
128 __get_cpu_var(cpu_state) = CPU_DEAD;
/* !CONFIG_HOTPLUG_CPU stub; body elided (presumably a BUG()) -- TODO confirm. */
135 static inline void play_dead(void)
139 #endif /* CONFIG_HOTPLUG_CPU */
142 * The idle thread. There's no useful work to be
143 * done, so just try to conserve power and have a
144 * low exit latency (ie sit in a loop waiting for
145 * somebody to say that they'd like to reschedule)
/* NOTE(review): the function signature (presumably cpu_idle) is elided here. */
149 current_thread_info()->status |= TS_POLLING;
152 * If we're the non-boot CPU, nothing set the PDA stack
153 * canary up for us - and if we are the boot CPU we have
154 * a 0 stack canary. This is a good place for updating
155 * it, as we wont ever return from this function (so the
156 * invalid canaries already on the stack wont ever
159 boot_init_stack_canary();
161 /* endless idle loop with no priority at all */
163 tick_nohz_stop_sched_tick();
164 while (!need_resched()) {
/* An offlined CPU parks in play_dead() instead of the normal idle routine. */
171 if (cpu_is_offline(smp_processor_id()))
174 * Idle routines should keep interrupts disabled
175 * from here on, until they go to idle.
176 * Otherwise, idle callbacks can misfire.
181 /* In many cases the interrupt that ended idle
182 has already called exit_idle. But some idle
183 loops can be woken up without interrupt. */
/* Leaving the inner loop: restart the tick; the schedule() call is elided here. */
187 tick_nohz_restart_sched_tick();
188 preempt_enable_no_resched();
194 /* Prints also some state that isn't saved in the pt_regs */
/*
 * Dump a full register snapshot: the saved pt_regs plus live segment,
 * control and debug registers read directly from the CPU.
 */
195 void __show_regs(struct pt_regs * regs)
197 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
198 unsigned long d0, d1, d2, d3, d6, d7;
199 unsigned int fsindex, gsindex;
200 unsigned int ds, cs, es;
/* Identify the task and kernel version before the register dump proper. */
204 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
205 current->pid, current->comm, print_tainted(),
206 init_utsname()->release,
207 (int)strcspn(init_utsname()->version, " "),
208 init_utsname()->version);
209 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
210 printk_address(regs->ip, 1);
211 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
213 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
214 regs->ax, regs->bx, regs->cx);
215 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
216 regs->dx, regs->si, regs->di);
217 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
218 regs->bp, regs->r8, regs->r9);
219 printk("R10: %016lx R11: %016lx R12: %016lx\n",
220 regs->r10, regs->r11, regs->r12);
221 printk("R13: %016lx R14: %016lx R15: %016lx\n",
222 regs->r13, regs->r14, regs->r15);
/* Read the live segment selectors straight off the CPU. */
224 asm("movl %%ds,%0" : "=r" (ds));
225 asm("movl %%cs,%0" : "=r" (cs));
226 asm("movl %%es,%0" : "=r" (es));
227 asm("movl %%fs,%0" : "=r" (fsindex));
228 asm("movl %%gs,%0" : "=r" (gsindex));
/* On x86-64 the FS/GS bases (and the inactive kernel GS base) live in MSRs. */
230 rdmsrl(MSR_FS_BASE, fs);
231 rdmsrl(MSR_GS_BASE, gs);
232 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
/* NOTE(review): the reads of cr0..cr4 and d0..d7 are elided from this excerpt,
 * so the values printed below come from those missing lines. */
239 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
240 fs,fsindex,gs,gsindex,shadowgs);
241 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
242 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
247 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
251 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/* Public wrapper: print the CPU id, the registers, and a stack backtrace. */
254 void show_regs(struct pt_regs *regs)
256 printk("CPU %d:", smp_processor_id());
/* regs + 1 points just past the saved frame, i.e. the stack at trap time. */
258 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
262 * Free current thread data structures etc..
/* Tear down per-thread state at task exit; here, the I/O permission bitmap. */
264 void exit_thread(void)
266 struct task_struct *me = current;
267 struct thread_struct *t = &me->thread;
269 if (me->thread.io_bitmap_ptr) {
/* get_cpu() pins us while we poke this CPU's TSS (matching put_cpu is elided). */
270 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
272 kfree(t->io_bitmap_ptr);
/* Null the pointer so a stale free/use cannot follow. */
273 t->io_bitmap_ptr = NULL;
274 clear_thread_flag(TIF_IO_BITMAP);
276 * Careful, clear this in the TSS too:
/* 0xff = all ports denied; only the previously-used prefix needs resetting. */
278 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
279 t->io_bitmap_max = 0;
/* Reset thread state on exec: ABI selection, debug registers, TLS, FPU counter. */
284 void flush_thread(void)
286 struct task_struct *tsk = current;
/* Apply a pending 32/64-bit ABI switch requested by the new binary's loader. */
288 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
289 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
290 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
/* Was 32-bit: drop the ia32 flag. */
291 clear_tsk_thread_flag(tsk, TIF_IA32);
/* NOTE(review): the else-branch brace lines are elided; these two lines switch
 * the task INTO 32-bit compat mode. */
293 set_tsk_thread_flag(tsk, TIF_IA32);
294 current_thread_info()->status |= TS_COMPAT;
297 clear_tsk_thread_flag(tsk, TIF_DEBUG);
/* Hardware breakpoint state must not leak into the freshly exec'd program. */
299 tsk->thread.debugreg0 = 0;
300 tsk->thread.debugreg1 = 0;
301 tsk->thread.debugreg2 = 0;
302 tsk->thread.debugreg3 = 0;
303 tsk->thread.debugreg6 = 0;
304 tsk->thread.debugreg7 = 0;
/* Clear all GDT TLS slots inherited from the pre-exec image. */
305 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
307 * Forget coprocessor state..
309 tsk->fpu_counter = 0;
/* Last-reference teardown: by now exec/exit should have freed any LDT,
 * so warn loudly if one is still attached to the dead task's mm. */
314 void release_thread(struct task_struct *dead_task)
317 if (dead_task->mm->context.size) {
/* NOTE(review): the comm argument line of this printk is elided here. */
318 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
320 dead_task->mm->context.ldt,
321 dead_task->mm->context.size);
/* Install a 32-bit TLS descriptor with base 'addr' into slot 'tls' of the
 * task's TLS array (most user_desc initializer fields are elided here). */
327 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
329 struct user_desc ud = {
336 struct desc_struct *desc = t->thread.tls_array;
/* Read back the base address stored in TLS slot 'tls'. */
341 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
343 return get_desc_base(&t->thread.tls_array[tls]);
347 * This gets called before we allocate a new thread and copy
348 * the current task into it.
/* Body elided in this excerpt; presumably flushes lazy FPU state -- TODO confirm. */
350 void prepare_to_copy(struct task_struct *tsk)
/*
 * Set up the kernel stack, saved registers and per-thread state of a
 * freshly forked child.  The child starts life with a copy of the
 * parent's pt_regs placed at the top of its own kernel stack.
 */
355 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
356 unsigned long unused,
357 struct task_struct * p, struct pt_regs * regs)
360 struct pt_regs * childregs;
361 struct task_struct *me = current;
/* Child's saved register frame lives just below the top of its kernel stack. */
363 childregs = ((struct pt_regs *)
364 (THREAD_SIZE + task_stack_page(p))) - 1;
370 childregs->sp = (unsigned long)childregs;
372 p->thread.sp = (unsigned long) childregs;
/* sp0: kernel stack top used on ring transitions (loaded into the TSS). */
373 p->thread.sp0 = (unsigned long) (childregs+1);
374 p->thread.usersp = me->thread.usersp;
/* TIF_FORK makes ret_from_fork take the child path on first schedule. */
376 set_tsk_thread_flag(p, TIF_FORK);
/* Inherit the parent's segment bases and snapshot its live selectors. */
378 p->thread.fs = me->thread.fs;
379 p->thread.gs = me->thread.gs;
381 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
382 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
383 asm("mov %%es,%0" : "=m" (p->thread.es));
384 asm("mov %%ds,%0" : "=m" (p->thread.ds));
/* Give the child its own copy of the parent's I/O permission bitmap. */
386 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
387 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
388 if (!p->thread.io_bitmap_ptr) {
389 p->thread.io_bitmap_max = 0;
/* NOTE(review): the error return for this allocation failure is elided here. */
392 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
394 set_tsk_thread_flag(p, TIF_IO_BITMAP);
398 * Set a new TLS for the child thread?
400 if (clone_flags & CLONE_SETTLS) {
401 #ifdef CONFIG_IA32_EMULATION
/* 32-bit children pass a user_desc pointer in the register saved as ->si. */
402 if (test_thread_flag(TIF_IA32))
403 err = do_set_thread_area(p, -1,
404 (struct user_desc __user *)childregs->si, 0);
/* 64-bit children pass the new FS base in the register saved as ->r8. */
407 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* On failure, undo the bitmap allocation made above before returning. */
413 if (err && p->thread.io_bitmap_ptr) {
414 kfree(p->thread.io_bitmap_ptr);
415 p->thread.io_bitmap_max = 0;
/* Prepare pt_regs so the return to user mode enters the new program at
 * new_ip with stack new_sp (exec path).  Return-type line is elided here. */
421 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
/* Reset the data-segment selectors to the null selector for the new image. */
423 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
/* PDA oldrsp holds the user stack pointer restored on the sysret path. */
427 write_pda(oldrsp, new_sp);
428 regs->cs = __USER_CS;
429 regs->ss = __USER_DS;
433 * Free the old FP and other extended state
435 free_thread_xstate(current);
437 EXPORT_SYMBOL_GPL(start_thread);
/* Set CR4.TSD so a user-mode RDTSC faults on this CPU. */
439 static void hard_disable_TSC(void)
441 write_cr4(read_cr4() | X86_CR4_TSD);
/* Disable user RDTSC for the current task (prctl PR_SET_TSC support). */
444 void disable_TSC(void)
/* Only flip hardware state on the 0 -> 1 transition of TIF_NOTSC. */
447 if (!test_and_set_thread_flag(TIF_NOTSC))
449 * Must flip the CPU state synchronously with
450 * TIF_NOTSC in the current running context.
/* Clear CR4.TSD so user-mode RDTSC works again. */
456 static void hard_enable_TSC(void)
458 write_cr4(read_cr4() & ~X86_CR4_TSD);
461 static void enable_TSC(void)
464 if (test_and_clear_thread_flag(TIF_NOTSC))
466 * Must flip the CPU state synchronously with
467 * TIF_NOTSC in the current running context.
/* PR_GET_TSC: store this task's TSC mode to the user pointer 'adr'. */
473 int get_tsc_mode(unsigned long adr)
477 if (test_thread_flag(TIF_NOTSC))
478 val = PR_TSC_SIGSEGV;
/* NOTE(review): the PR_TSC_ENABLE default assignment is elided here. */
482 return put_user(val, (unsigned int __user *)adr);
/* PR_SET_TSC: switch between PR_TSC_SIGSEGV and PR_TSC_ENABLE
 * (the disable_TSC()/enable_TSC() calls are elided in this excerpt). */
485 int set_tsc_mode(unsigned int val)
487 if (val == PR_TSC_SIGSEGV)
489 else if (val == PR_TSC_ENABLE)
498 * This special macro can be used to load a debugging register
500 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
/*
 * Slow path of the context switch: only taken when either task carries
 * "extra" state (debugctl/DS area, hardware breakpoints, TSC restriction,
 * I/O bitmap, BTS tracing).
 */
502 static inline void __switch_to_xtra(struct task_struct *prev_p,
503 struct task_struct *next_p,
504 struct tss_struct *tss)
506 struct thread_struct *prev, *next;
507 unsigned long debugctl;
/* NOTE(review): trailing comma (comma operator, not ';') is in the original. */
509 prev = &prev_p->thread,
510 next = &next_p->thread;
512 debugctl = prev->debugctlmsr;
513 if (next->ds_area_msr != prev->ds_area_msr) {
514 /* we clear debugctl to make sure DS
515 * is not in use when we change it */
517 update_debugctlmsr(0);
518 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
/* Restore next's debugctl only if it differs from what is live now. */
521 if (next->debugctlmsr != debugctl)
522 update_debugctlmsr(next->debugctlmsr);
/* Reload hardware breakpoints (the loaddebug() calls are elided here). */
524 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
/* Tasks differ in TSC restriction: flip CR4.TSD to match 'next'. */
534 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
535 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
536 /* prev and next are different */
537 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
543 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
545 * Copy the relevant range of the IO bitmap.
546 * Normally this is 128 bytes or less:
/* Copy enough to also overwrite any longer bitmap left by 'prev'. */
548 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
549 max(prev->io_bitmap_max, next->io_bitmap_max));
550 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
552 * Clear any possible leftover bits:
554 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
/* Timestamp branch-trace-store streams on task departure/arrival. */
558 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
559 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
561 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
562 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
567 * switch_to(x,y) should switch tasks from x to y.
569 * This could still be optimized:
570 * - fold all the options into a flag word and test it with a single test.
571 * - could test fs/gs bitsliced
573 * Kprobes not supported here. Set the probe on schedule instead.
/* NOTE(review): the return-type line (struct task_struct *) is elided here. */
576 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
578 struct thread_struct *prev = &prev_p->thread,
579 *next = &next_p->thread;
580 int cpu = smp_processor_id();
581 struct tss_struct *tss = &per_cpu(init_tss, cpu);
583 /* we're going to use this soon, after a few expensive things */
584 if (next_p->fpu_counter>5)
585 prefetch(next->xstate);
588 * Reload esp0, LDT and the page table pointer:
/* NOTE(review): the load_sp0/load_TLS calls this comment refers to are elided. */
594 * This won't pick up thread selector changes, but I guess that is ok.
/* Save prev's live ES/DS; reload only when either old or new value is nonzero. */
596 asm volatile("mov %%es,%0" : "=m" (prev->es));
597 if (unlikely(next->es | prev->es))
598 loadsegment(es, next->es);
600 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
601 if (unlikely(next->ds | prev->ds))
602 loadsegment(ds, next->ds);
/* FS: the 16-bit selector and the 64-bit MSR base are switched separately. */
611 asm volatile("movl %%fs,%0" : "=r" (fsindex));
612 /* segment register != 0 always requires a reload.
613 also reload when it has changed.
614 when prev process used 64bit base always reload
615 to avoid an information leak. */
616 if (unlikely(fsindex | next->fsindex | prev->fs)) {
617 loadsegment(fs, next->fsindex);
618 /* check if the user used a selector != 0
619 * if yes clear 64bit base, since overloaded base
620 * is always mapped to the Null selector
625 /* when next process has a 64bit base use it */
627 wrmsrl(MSR_FS_BASE, next->fs);
628 prev->fsindex = fsindex;
/* GS: same scheme as FS, but via load_gs_index and the kernel GS base MSR. */
632 asm volatile("movl %%gs,%0" : "=r" (gsindex));
633 if (unlikely(gsindex | next->gsindex | prev->gs)) {
634 load_gs_index(next->gsindex);
639 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
640 prev->gsindex = gsindex;
643 /* Must be after DS reload */
647 * Switch the PDA and FPU contexts.
/* Hand over the per-CPU PDA: user stack pointer, current task, kernel stack top. */
649 prev->usersp = read_pda(oldrsp);
650 write_pda(oldrsp, next->usersp);
651 write_pda(pcurrent, next_p);
653 write_pda(kernelstack,
654 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
655 #ifdef CONFIG_CC_STACKPROTECTOR
657 * Build time only check to make sure the stack_canary is at
658 * offset 40 in the pda; this is a gcc ABI requirement
660 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
664 * Now maybe reload the debug registers and handle I/O bitmaps
/* Take the __switch_to_xtra slow path only when either task needs it. */
666 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
667 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
668 __switch_to_xtra(prev_p, next_p, tss);
670 /* If the task has used fpu the last 5 timeslices, just do a full
671 * restore of the math state immediately to avoid the trap; the
672 * chances of needing FPU soon are obviously high now
674 * tsk_used_math() checks prevent calling math_state_restore(),
675 * which can sleep in the case of !tsk_used_math()
677 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
678 math_state_restore();
683 * sys_execve() executes a new program.
/* Copy the filename in from user space, run do_execve on it, and return its
 * result (the putname release and return are elided from this excerpt). */
686 long sys_execve(char __user *name, char __user * __user *argv,
687 char __user * __user *envp, struct pt_regs *regs)
692 filename = getname(name);
/* getname returns an ERR_PTR on failure; propagate that error code. */
693 error = PTR_ERR(filename);
694 if (IS_ERR(filename))
696 error = do_execve(filename, argv, envp, regs);
/* Switch the current (exec'ing) task to the native 64-bit ABI. */
701 void set_personality_64bit(void)
703 /* inherit personality from parent */
705 /* Make sure to be in 64bit mode */
706 clear_thread_flag(TIF_IA32)
708 /* TBD: overwrites user setup. Should have two bits.
709 But 64bit processes have always behaved this way,
710 so it's not too bad. The main problem is just that
711 32bit childs are affected again. */
712 current->personality &= ~READ_IMPLIES_EXEC;
/* fork(2): plain do_fork; the child reuses the parent's stack pointer. */
715 asmlinkage long sys_fork(struct pt_regs *regs)
717 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
/* clone(2): caller-specified flags, tid pointers and (optionally) a new stack.
 * NOTE(review): the return type and the newsp==0 fallback are elided here. */
721 sys_clone(unsigned long clone_flags, unsigned long newsp,
722 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
726 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
730 * This is trivial, and on the face of it looks like it
731 * could equally well be done in user mode.
733 * Not so, for quite unobvious reasons - register pressure.
734 * In user mode vfork() cannot have a stack frame, and if
735 * done by calling the "clone()" system call directly, you
736 * do not have enough call-clobbered registers to hold all
737 * the information you need.
/* vfork(2): CLONE_VM + CLONE_VFORK; the trailing NULL arguments are elided. */
739 asmlinkage long sys_vfork(struct pt_regs *regs)
741 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
/*
 * Best-effort walk of a sleeping task's frame-pointer chain to find the
 * address it is blocked at (the first IP outside scheduler functions).
 */
745 unsigned long get_wchan(struct task_struct *p)
/* Only meaningful for a task that exists, is not us, and is not running. */
751 if (!p || p == current || p->state==TASK_RUNNING)
753 stack = (unsigned long)task_stack_page(p);
/* The saved sp must lie within the task's own kernel stack. */
754 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
756 fp = *(u64 *)(p->thread.sp);
/* Each frame pointer is validated against the stack bounds before deref. */
758 if (fp < (unsigned long)stack ||
759 fp > (unsigned long)stack+THREAD_SIZE)
762 if (!in_sched_functions(ip))
/* Bound the walk in case the frame chain is corrupt. */
765 } while (count++ < 16);
/*
 * Implementation of arch_prctl(2): get/set the FS and GS segment bases of
 * 'task'.  Small (<4GB) bases are installed via a GDT TLS slot; larger
 * bases go through the FS/GS base MSRs.  The switch statement's case
 * labels are elided from this excerpt; see the notes below.
 */
769 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
/* Hardware state is touched only when operating on the current task. */
772 int doit = task == current;
/* ARCH_SET_GS (case label elided): */
777 if (addr >= TASK_SIZE_OF(task))
780 /* handle small bases via the GDT because that's faster to
782 if (addr <= 0xffffffff) {
783 set_32bit_tls(task, GS_TLS, addr);
785 load_TLS(&task->thread, cpu);
786 load_gs_index(GS_TLS_SEL);
788 task->thread.gsindex = GS_TLS_SEL;
/* Large base: null selector, base kept in thread struct and the MSR. */
791 task->thread.gsindex = 0;
792 task->thread.gs = addr;
795 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
/* ARCH_SET_FS (case label elided): same scheme as GS above. */
801 /* Not strictly needed for fs, but do it for symmetry
803 if (addr >= TASK_SIZE_OF(task))
806 /* handle small bases via the GDT because that's faster to
808 if (addr <= 0xffffffff) {
809 set_32bit_tls(task, FS_TLS, addr);
811 load_TLS(&task->thread, cpu);
812 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
814 task->thread.fsindex = FS_TLS_SEL;
817 task->thread.fsindex = 0;
818 task->thread.fs = addr;
820 /* set the selector to 0 to not confuse
822 asm volatile("movl %0,%%fs" :: "r" (0));
823 ret = checking_wrmsrl(MSR_FS_BASE, addr);
/* ARCH_GET_FS (case label elided): read back the effective FS base. */
830 if (task->thread.fsindex == FS_TLS_SEL)
831 base = read_32bit_tls(task, FS_TLS);
833 rdmsrl(MSR_FS_BASE, base);
835 base = task->thread.fs;
836 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS (case label elided): read back the effective GS base. */
842 if (task->thread.gsindex == GS_TLS_SEL)
843 base = read_32bit_tls(task, GS_TLS);
845 asm("movl %%gs,%0" : "=r" (gsindex));
847 rdmsrl(MSR_KERNEL_GS_BASE, base);
849 base = task->thread.gs;
852 base = task->thread.gs;
853 ret = put_user(base, (unsigned long __user *)addr);
/* arch_prctl(2) system-call entry point: operate on the calling task. */
865 long sys_arch_prctl(int code, unsigned long addr)
867 return do_arch_prctl(current, code, addr);
/* Randomize the initial stack top by up to 8KB on exec (ASLR),
 * unless the task's personality or sysctl disables randomization. */
870 unsigned long arch_align_stack(unsigned long sp)
872 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
873 sp -= get_random_int() % 8192;
/* NOTE(review): the final alignment/return of sp is elided from this excerpt. */
/* Pick a randomized brk base within 32MB (0x02000000) above the current brk. */
877 unsigned long arch_randomize_brk(struct mm_struct *mm)
879 unsigned long range_end = mm->brk + 0x02000000;
/* GNU "elvis" ?: falls back to the existing brk if randomize_range returns 0. */
880 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;