/*
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
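/*
 * Illustrative usage (my_idle_notify and my_idle_nb are hypothetical
 * names, not part of this file): callers get IDLE_START and IDLE_END
 * as the notifier action.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */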
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
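/*
 * play_dead() parks a CPU that has been taken offline. On !CONFIG_SMP
 * kernels a CPU can never go offline, so the stub below simply BUGs;
 * the real implementation is provided by the SMP code.
 */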
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the PDA stack
	 * canary up for us - and if we are the boot CPU we have
	 * a 0 stack canary. This is a good place for updating
	 * it, as we won't ever return from this function (so the
	 * invalid canaries already on the stack won't ever
	 * trigger):
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
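/*
 * Note: pm_idle above is a function pointer, not a fixed routine; boot
 * code points it at default_idle, an MWAIT-based idle loop or
 * poll_idle depending on the CPU and the idle= command line option.
 */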
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}

	ds_exit_thread(current);
}
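/*
 * flush_thread() is called on exec: it completes a pending 32/64-bit
 * ABI switch and clears the debug registers, TLS slots and FPU state
 * inherited from the old image.
 */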
void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state.
	 */
	tsk->fpu_counter = 0;
	clear_fpu(tsk);
	clear_used_math();
}
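/*
 * release_thread() runs when the task is finally reaped; a leftover
 * LDT at this point means something failed to clean up, so warn.
 */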
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr, .limit = 0xfffff,
		.seg_32bit = 1, .limit_in_pages = 1, .useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;

	fill_ldt(desc + tls, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}
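/*
 * copy_thread() builds the child's kernel stack: its user pt_regs are
 * placed at the top of the stack, segment state and (if present) the
 * I/O permission bitmap are inherited from the parent, and
 * CLONE_SETTLS installs the TLS base requested by the caller.
 */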
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
	struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs+1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	ds_copy_thread(p, me);

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	write_pda(oldrsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);
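/*
 * TSC access control: PR_SET_TSC flips TIF_NOTSC and mirrors it into
 * CR4.TSD, which makes user-mode rdtsc privileged, so it faults and
 * the task gets SIGSEGV while access is disabled.
 */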
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
	unsigned int val = PR_TSC_ENABLE;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
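/*
 * Illustrative user-space usage: prctl(PR_SET_TSC, PR_TSC_SIGSEGV)
 * makes a subsequent rdtsc deliver SIGSEGV until the task restores
 * prctl(PR_SET_TSC, PR_TSC_ENABLE).
 */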
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
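/*
 * For example, loaddebug(next, 7) token-pastes to
 * set_debugreg(next->debugreg7, 7), loading the thread's saved DR7
 * value into the hardware debug register.
 */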
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
	else if (next->debugctlmsr != prev->debugctlmsr)
		update_debugctlmsr(next->debugctlmsr);

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_leave_lazy_cpu_mode();

	/*
	 * Switch FS and GS.
	 *
	 * Segment register != 0 always requires a reload. Also
	 * reload when it has changed. When prev process used 64bit
	 * base always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear 64bit base, since overloaded base is always
		 * mapped to the Null selector
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* when next process has a 64bit base use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = read_pda(oldrsp);
	write_pda(oldrsp, next->usersp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * If the task has used the FPU in the last 5 timeslices, just do
	 * a full restore of the math state immediately to avoid the trap;
	 * the chances of needing the FPU soon are obviously high now.
	 *
	 * tsk_used_math() checks prevent calling math_state_restore(),
	 * which can sleep in the case of !tsk_used_math()
	 */
	if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: overwrites user setup. Should have two bits.
	 * But 64-bit processes have always behaved this way, so it's
	 * not too bad. The main problem is just that 32-bit children
	 * are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
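/*
 * Illustrative flag combinations: fork() is do_fork(SIGCHLD, ...) as
 * above, while a pthread-style thread is a clone() with roughly
 * CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD |
 * CLONE_SETTLS and a caller-supplied stack in newsp.
 */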
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		    NULL, NULL);
}
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		/* the saved return address sits just above the frame pointer */
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
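/*
 * do_arch_prctl() implements ARCH_SET_FS/GS and ARCH_GET_FS/GS. Bases
 * below 4GB are installed via a GDT descriptor (cheaper to context
 * switch); larger bases use a null selector plus the FS/GS base MSRs.
 */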
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
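/*
 * Illustrative user-space usage: a threading library sets up its TLS
 * block with arch_prctl(ARCH_SET_FS, (unsigned long)tls_block) and can
 * read the base back via arch_prctl(ARCH_GET_FS, (unsigned long)&base);
 * tls_block and base here are hypothetical caller variables.
 */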
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
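/*
 * Randomize the heap start: choose a brk within 32MB (0x02000000)
 * above the unrandomized brk, falling back to mm->brk itself if
 * randomize_range() finds no suitable spot.
 */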
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;

	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}