Linux Core Kernel Commentary

       

Kernel/fork.c


23435 /*
23436  *  linux/kernel/fork.c
23437  *
23438  *  Copyright (C) 1991, 1992  Linus Torvalds
23439  */
23440
23441 /* 'fork.c' contains the help-routines for the 'fork'
23442  * system call (see also system_call.s). Fork is rather
23443  * simple, once you get the hang of it, but the memory
23444  * management can be a bitch. See 'mm/mm.c':
23445  * 'copy_page_tables()' */
23446
23447 #include <linux/malloc.h>
23448 #include <linux/init.h>
23449 #include <linux/unistd.h>
23450 #include <linux/smp_lock.h>
23451 #include <linux/module.h>
23452 #include <linux/vmalloc.h>
23453
23454 #include <asm/pgtable.h>
23455 #include <asm/mmu_context.h>
23456 #include <asm/uaccess.h>
23457
23458 /* The idle tasks do not count.. */
23459 int nr_tasks=0;
23460 int nr_running=0;
23461
23462 /* Handle normal Linux uptimes. */
23463 unsigned long int total_forks=0;
23464 int last_pid=0;
23465
23466 /* SLAB cache for mm_struct's. */
23467 kmem_cache_t *mm_cachep;
23468
23469 /* SLAB cache for files structs */
23470 kmem_cache_t *files_cachep;
23471
23472 struct task_struct *pidhash[PIDHASH_SZ];
23473
23474 struct task_struct **tarray_freelist = NULL;
23475 spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
23476
23477 /* UID task count cache, to prevent walking entire
23478  * process list every single fork() operation. */
23479 #define UIDHASH_SZ (PIDHASH_SZ >> 2)
23480
23481 static struct user_struct {
23482   atomic_t count;
23483   struct user_struct *next, **pprev;
23484   unsigned int uid;
23485 } *uidhash[UIDHASH_SZ];
23486
23487 spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
23488
23489 kmem_cache_t *uid_cachep;
23490
23491 #define uidhashfn(uid) \
23492   (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
23493
23494 /* These routines must be called with the uidhash
23495  * spinlock held! */
23496 static inline void uid_hash_insert(
23497   struct user_struct *up, unsigned int hashent)
23498 {
23499   if ((up->next = uidhash[hashent]) != NULL)
23500     uidhash[hashent]->pprev = &up->next;
23501   up->pprev = &uidhash[hashent];
23502   uidhash[hashent] = up;
23503 }
23504
23505 static inline void uid_hash_remove(
23506   struct user_struct *up)
23507 {
23508   if (up->next)
23509     up->next->pprev = up->pprev;
23510   *up->pprev = up->next;
23511 }
23512
23513 static inline struct user_struct *uid_hash_find(
23514   unsigned short uid, unsigned int hashent)
23515 {
23516   struct user_struct *up, *next;
23517
23518   next = uidhash[hashent];
23519   for (;;) {
23520     up = next;
23521     if (next) {
23522       next = up->next;
23523       if (up->uid != uid)
23524         continue;
23525       atomic_inc(&up->count);
23526     }
23527     break;
23528   }
23529   return up;
23530 }
23531
23532 void free_uid(struct task_struct *p)
23533 {
23534   struct user_struct *up = p->user;
23535
23536   if (up) {
23537     p->user = NULL;
23538     if (atomic_dec_and_test(&up->count)) {
23539       spin_lock(&uidhash_lock);
23540       uid_hash_remove(up);
23541       spin_unlock(&uidhash_lock);
23542       kmem_cache_free(uid_cachep, up);
23543     }
23544   }
23545 }
23546
23547 int alloc_uid(struct task_struct *p)
23548 {
23549   unsigned int hashent = uidhashfn(p->uid);
23550   struct user_struct *up;
23551
23552   spin_lock(&uidhash_lock);
23553   up = uid_hash_find(p->uid, hashent);
23554   spin_unlock(&uidhash_lock);
23555
23556   if (!up) {
23557     struct user_struct *new;
23558
23559     new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
23560     if (!new)
23561       return -EAGAIN;
23562     new->uid = p->uid;
23563     atomic_set(&new->count, 1);
23564
23565     /* Before adding this, check whether we raced on
23566      * adding the same user already.. */
23567     spin_lock(&uidhash_lock);
23568     up = uid_hash_find(p->uid, hashent);
23569     if (up) {
23570       kmem_cache_free(uid_cachep, new);
23571     } else {
23572       uid_hash_insert(new, hashent);
23573       up = new;
23574     }
23575     spin_unlock(&uidhash_lock);
23576
23577   }
23578   p->user = up;
23579   return 0;
23580 }
23581
23582 void __init uidcache_init(void)
23583 {
23584   int i;
23585
23586   uid_cachep =
23587     kmem_cache_create("uid_cache",
23588                       sizeof(struct user_struct),
23589                       0, SLAB_HWCACHE_ALIGN, NULL, NULL);
23590   if (!uid_cachep)
23591     panic("Cannot create uid taskcount SLAB cache\n");
23592
23593   for (i = 0; i < UIDHASH_SZ; i++)
23594     uidhash[i] = 0;
23595 }
23596
23597 static inline struct task_struct **
23598 find_empty_process(void)
23599 {
23600   struct task_struct **tslot = NULL;
23601
23602   if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT)
23603       || !current->uid)
23604     tslot = get_free_taskslot();
23605   return tslot;
23606 }
23607
23608 /* Protects next_safe and last_pid. */
23609 spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
23610
23611 static int get_pid(unsigned long flags)
23612 {
23613   static int next_safe = PID_MAX;
23614   struct task_struct *p;
23615
23616   if (flags & CLONE_PID)
23617     return current->pid;
23618
23619   spin_lock(&lastpid_lock);
23620   if((++last_pid) & 0xffff8000) {
23621     last_pid = 300;  /* Skip daemons etc. */
23622     goto inside;
23623   }
23624   if(last_pid >= next_safe) {
23625 inside:
23626     next_safe = PID_MAX;
23627     read_lock(&tasklist_lock);
23628 repeat:
23629     for_each_task(p) {
23630       if(p->pid == last_pid ||
23631          p->pgrp == last_pid ||
23632          p->session == last_pid) {
23633         if(++last_pid >= next_safe) {
23634           if(last_pid & 0xffff8000)
23635             last_pid = 300;
23636           next_safe = PID_MAX;
23637         }
23638         goto repeat;
23639       }
23640       if(p->pid > last_pid && next_safe > p->pid)
23641         next_safe = p->pid;
23642       if(p->pgrp > last_pid && next_safe > p->pgrp)
23643         next_safe = p->pgrp;
23644       if(p->session > last_pid && next_safe > p->session)
23645         next_safe = p->session;
23646     }
23647     read_unlock(&tasklist_lock);
23648   }
23649   spin_unlock(&lastpid_lock);
23650
23651   return last_pid;
23652 }
23653
23654 static inline int dup_mmap(struct mm_struct * mm)
23655 {
23656   struct vm_area_struct * mpnt, *tmp, **pprev;
23657   int retval;
23658
23659   flush_cache_mm(current->mm);
23660   pprev = &mm->mmap;
23661   for (mpnt = current->mm->mmap; mpnt;
23662        mpnt = mpnt->vm_next) {
23663     struct file *file;
23664
23665     retval = -ENOMEM;
23666     tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
23667     if (!tmp)
23668       goto fail_nomem;
23669     *tmp = *mpnt;
23670     tmp->vm_flags &= ~VM_LOCKED;
23671     tmp->vm_mm = mm;
23672     mm->map_count++;
23673     tmp->vm_next = NULL;
23674     file = tmp->vm_file;
23675     if (file) {
23676       file->f_count++;
23677       if (tmp->vm_flags & VM_DENYWRITE)
23678         file->f_dentry->d_inode->i_writecount--;
23679
23680       /* insert tmp into the share list, just after mpnt */
23681       if ((tmp->vm_next_share = mpnt->vm_next_share) !=
23682           NULL)
23683         mpnt->vm_next_share->vm_pprev_share =
23684           &tmp->vm_next_share;
23685       mpnt->vm_next_share = tmp;
23686       tmp->vm_pprev_share = &mpnt->vm_next_share;
23687     }
23688
23689     /* Copy the pages, but defer checking for errors */
23690     retval = copy_page_range(mm, current->mm, tmp);
23691     if (!retval && tmp->vm_ops && tmp->vm_ops->open)
23692       tmp->vm_ops->open(tmp);
23693
23694     /* Link in the new vma even if an error occurred, so
23695      * that exit_mmap() can clean up the mess. */
23696     tmp->vm_next = *pprev;
23697     *pprev = tmp;
23698
23699     pprev = &tmp->vm_next;
23700     if (retval)
23701       goto fail_nomem;
23702   }
23703   retval = 0;
23704   if (mm->map_count >= AVL_MIN_MAP_COUNT)
23705     build_mmap_avl(mm);
23706
23707 fail_nomem:
23708   flush_tlb_mm(current->mm);
23709   return retval;
23710 }
23711
23712 /* Allocate and initialize an mm_struct.
23713  *
23714  * NOTE! The mm mutex will be locked until the caller
23715  * decides that all systems are go.. */
23716 struct mm_struct * mm_alloc(void)
23717 {
23718   struct mm_struct * mm;
23719
23720   mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
23721   if (mm) {
23722     *mm = *current->mm;
23723     init_new_context(mm);
23724     atomic_set(&mm->count, 1);
23725     mm->map_count = 0;
23726     mm->def_flags = 0;
23727     mm->mmap_sem = MUTEX_LOCKED;
23728     /* Leave mm->pgd set to the parent's pgd so that
23729      * pgd_offset() is always valid. */
23730     mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
23731
23732     /* It has not run yet, so cannot be present in
23733      * anyone's cache or tlb. */
23734     mm->cpu_vm_mask = 0;
23735   }
23736   return mm;
23737 }
23738
23739 /* Please note the differences between mmput and
23740  * mm_release.  mmput is called whenever we stop holding
23741  * onto a mm_struct, error success whatever.
23742  *
23743  * mm_release is called after a mm_struct has been
23744  * removed from the current process.
23745  *
23746  * This difference is important for error handling, when
23747  * we only half set up a mm_struct for a new process and
23748  * need to restore the old one. Because we mmput the new
23749  * mm_struct before restoring the old one. . .  Eric
23750  * Biederman 10 January 1998 */
23751 void mm_release(void)
23752 {
23753   struct task_struct *tsk = current;
23754   forget_segments();
23755   /* notify parent sleeping on vfork() */
23756   if (tsk->flags & PF_VFORK) {
23757     tsk->flags &= ~PF_VFORK;
23758     up(tsk->p_opptr->vfork_sem);
23759   }
23760 }
23761
23762 /* Decrement the use count and release all resources for
23763  * an mm. */
23764 void mmput(struct mm_struct *mm)
23765 {
23766   if (atomic_dec_and_test(&mm->count)) {
23767     release_segments(mm);
23768     exit_mmap(mm);
23769     free_page_tables(mm);
23770     kmem_cache_free(mm_cachep, mm);
23771   }
23772 }
23773
23774 static inline int copy_mm(int nr,
23775   unsigned long clone_flags, struct task_struct * tsk)
23776 {
23777   struct mm_struct * mm;
23778   int retval;
23779
23780   if (clone_flags & CLONE_VM) {
23781     mmget(current->mm);
23782     /* Set up the LDT descriptor for the clone task. */
23783     copy_segments(nr, tsk, NULL);
23784     SET_PAGE_DIR(tsk, current->mm->pgd);
23785     return 0;
23786   }
23787
23788   retval = -ENOMEM;
23789   mm = mm_alloc();
23790   if (!mm)
23791     goto fail_nomem;
23792
23793   tsk->mm = mm;
23794   tsk->min_flt = tsk->maj_flt = 0;
23795   tsk->cmin_flt = tsk->cmaj_flt = 0;
23796   tsk->nswap = tsk->cnswap = 0;
23797   copy_segments(nr, tsk, mm);
23798   retval = new_page_tables(tsk);
23799   if (retval)
23800     goto free_mm;
23801   retval = dup_mmap(mm);
23802   if (retval)
23803     goto free_pt;
23804   up(&mm->mmap_sem);
23805   return 0;
23806
23807 free_mm:
23808   mm->pgd = NULL;
23809 free_pt:
23810   tsk->mm = NULL;
23811   mmput(mm);
23812 fail_nomem:
23813   return retval;
23814 }
23815
23816 static inline int copy_fs(unsigned long clone_flags,
23817   struct task_struct * tsk)
23818 {
23819   if (clone_flags & CLONE_FS) {
23820     atomic_inc(&current->fs->count);
23821     return 0;
23822   }
23823   tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
23824   if (!tsk->fs)
23825     return -1;
23826   atomic_set(&tsk->fs->count, 1);
23827   tsk->fs->umask = current->fs->umask;
23828   tsk->fs->root = dget(current->fs->root);
23829   tsk->fs->pwd = dget(current->fs->pwd);
23830   return 0;
23831 }
23832
23833 /* Copy a fd_set and compute the maximum fd it contains.
23834  */
23835 static inline int __copy_fdset(unsigned long *d,
23836   unsigned long *src)
23837 {
23838   int i;
23839   unsigned long *p = src;
23840   unsigned long *max = src;
23841
23842   for (i = __FDSET_LONGS; i; --i) {
23843     if ((*d++ = *p++) != 0)
23844       max = p;
23845   }
23846   return (max - src)*sizeof(long)*8;
23847 }
23848
23849 static inline int copy_fdset(fd_set *dst, fd_set *src)
23850 {
23851   return __copy_fdset(dst->fds_bits, src->fds_bits);
23852 }
23853
23854 static int copy_files(unsigned long clone_flags,
23855   struct task_struct * tsk)
23856 {
23857   struct files_struct *oldf, *newf;
23858   struct file **old_fds, **new_fds;
23859   int size, i, error = 0;
23860
23861   /* A background process may not have any files ... */
23862   oldf = current->files;
23863   if (!oldf)
23864     goto out;
23865
23866   if (clone_flags & CLONE_FILES) {
23867     atomic_inc(&oldf->count);
23868     goto out;
23869   }
23870
23871   tsk->files = NULL;
23872   error = -ENOMEM;
23873   newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
23874   if (!newf)
23875     goto out;
23876
23877   /* Allocate the fd array, using get_free_page() if
23878    * possible. Eventually we want to make the array size
23879    * variable ... */
23880   size = NR_OPEN * sizeof(struct file *);
23881   if (size == PAGE_SIZE)
23882     new_fds =
23883       (struct file **) __get_free_page(GFP_KERNEL);
23884   else
23885     new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
23886   if (!new_fds)
23887     goto out_release;
23888
23889   atomic_set(&newf->count, 1);
23890   newf->max_fds = NR_OPEN;
23891   newf->fd = new_fds;
23892   newf->close_on_exec = oldf->close_on_exec;
23893   i = copy_fdset(&newf->open_fds, &oldf->open_fds);
23894
23895   old_fds = oldf->fd;
23896   for (; i != 0; i--) {
23897     struct file *f = *old_fds++;
23898     *new_fds = f;
23899     if (f)
23900       f->f_count++;
23901     new_fds++;
23902   }
23903   /* This is long word aligned thus could use a optimized
23904    * version */
23905   memset(new_fds, 0,
23906          (char *)newf->fd + size - (char *)new_fds);
23907
23908   tsk->files = newf;
23909   error = 0;
23910 out:
23911   return error;
23912
23913 out_release:
23914   kmem_cache_free(files_cachep, newf);
23915   goto out;
23916 }
23917
23918 static inline int copy_sighand(unsigned long clone_flags,
23919   struct task_struct * tsk)
23920 {
23921   if (clone_flags & CLONE_SIGHAND) {
23922     atomic_inc(&current->sig->count);
23923     return 0;
23924   }
23925   tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
23926   if (!tsk->sig)
23927     return -1;
23928   spin_lock_init(&tsk->sig->siglock);
23929   atomic_set(&tsk->sig->count, 1);
23930   memcpy(tsk->sig->action, current->sig->action,
23931          sizeof(tsk->sig->action));
23932   return 0;
23933 }
23934
23935 static inline void copy_flags(unsigned long clone_flags,
23936   struct task_struct *p)
23937 {
23938   unsigned long new_flags = p->flags;
23939
23940   new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
23941   new_flags |= PF_FORKNOEXEC;
23942   if (!(clone_flags & CLONE_PTRACE))
23943     new_flags &= ~(PF_PTRACED|PF_TRACESYS);
23944   if (clone_flags & CLONE_VFORK)
23945     new_flags |= PF_VFORK;
23946   p->flags = new_flags;
23947 }
23948
23949 /* Ok, this is the main fork-routine. It copies the
23950  * system process information (task[nr]) and sets up the
23951  * necessary registers. It also copies the data segment
23952  * in its entirety. */
23953 int do_fork(unsigned long clone_flags, unsigned long usp,
23954   struct pt_regs *regs)
23955 {
23956   int nr;
23957   int retval = -ENOMEM;
23958   struct task_struct *p;
23959   struct semaphore sem = MUTEX_LOCKED;
23960
23961   current->vfork_sem = &sem;
23962
23963   p = alloc_task_struct();
23964   if (!p)
23965     goto fork_out;
23966
23967   *p = *current;
23968
23969   down(&current->mm->mmap_sem);
23970   lock_kernel();
23971
23972   retval = -EAGAIN;
23973   if (p->user) {
23974     if (atomic_read(&p->user->count) >=
23975         p->rlim[RLIMIT_NPROC].rlim_cur)
23976       goto bad_fork_free;
23977   }
23978
23979   {
23980     struct task_struct **tslot;
23981     tslot = find_empty_process();
23982     if (!tslot)
23983       goto bad_fork_free;
23984     p->tarray_ptr = tslot;
23985     *tslot = p;
23986     nr = tslot - &task[0];
23987   }
23988
23989   if (p->exec_domain && p->exec_domain->module)
23990     __MOD_INC_USE_COUNT(p->exec_domain->module);
23991   if (p->binfmt && p->binfmt->module)
23992     __MOD_INC_USE_COUNT(p->binfmt->module);
23993
23994   p->did_exec = 0;
23995   p->swappable = 0;
23996   p->state = TASK_UNINTERRUPTIBLE;
23997
23998   copy_flags(clone_flags, p);
23999   p->pid = get_pid(clone_flags);
24000
24001   /* This is a "shadow run" state. The process is marked
24002    * runnable, but isn't actually on any run queue
24003    * yet.. (that happens at the very end). */
24004   p->state = TASK_RUNNING;
24005   p->next_run = p;
24006   p->prev_run = p;
24007
24008   p->p_pptr = p->p_opptr = current;
24009   p->p_cptr = NULL;
24010   init_waitqueue(&p->wait_chldexit);
24011   p->vfork_sem = NULL;
24012
24013   p->sigpending = 0;
24014   sigemptyset(&p->signal);
24015   p->sigqueue = NULL;
24016   p->sigqueue_tail = &p->sigqueue;
24017
24018   p->it_real_value = p->it_virt_value = p->it_prof_value
24019     = 0;
24020   p->it_real_incr = p->it_virt_incr = p->it_prof_incr
24021     = 0;
24022   init_timer(&p->real_timer);
24023   p->real_timer.data = (unsigned long) p;
24024
24025   p->leader = 0;  /* session leadership doesn't inherit */
24026   p->tty_old_pgrp = 0;
24027   p->times.tms_utime = p->times.tms_stime = 0;
24028   p->times.tms_cutime = p->times.tms_cstime = 0;
24029 #ifdef __SMP__
24030   {
24031     int i;
24032     p->has_cpu = 0;
24033     p->processor = NO_PROC_ID;
24034     /* ?? should we just memset this ?? */
24035     for(i = 0; i < smp_num_cpus; i++)
24036       p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
24037     spin_lock_init(&p->sigmask_lock);
24038   }
24039 #endif
24040   p->lock_depth = -1;  /* -1 = no lock */
24041   p->start_time = jiffies;
24042
24043   retval = -ENOMEM;
24044   /* copy all the process information */
24045   if (copy_files(clone_flags, p))
24046     goto bad_fork_cleanup;
24047   if (copy_fs(clone_flags, p))
24048     goto bad_fork_cleanup_files;
24049   if (copy_sighand(clone_flags, p))
24050     goto bad_fork_cleanup_fs;
24051   if (copy_mm(nr, clone_flags, p))
24052     goto bad_fork_cleanup_sighand;
24053   retval = copy_thread(nr, clone_flags, usp, p, regs);
24054   if (retval)
24055     goto bad_fork_cleanup_sighand;
24056   p->semundo = NULL;
24057
24058   /* ok, now we should be set up.. */
24059   p->swappable = 1;
24060   p->exit_signal = clone_flags & CSIGNAL;
24061   p->pdeath_signal = 0;
24062
24063   /* "share" dynamic priority between parent and child,
24064    * thus the total amount of dynamic priorities in the
24065    * system doesnt change, more scheduling fairness. This
24066    * is only important in the first timeslice, on the
24067    * long run the scheduling behaviour is unchanged. */
24068   current->counter >>= 1;
24069   p->counter = current->counter;
24070
24071   /* OK, add it to the run-queues and make it visible to
24072    * the rest of the system.
24073    *
24074    * Let it rip! */
24075   retval = p->pid;
24076   if (retval) {
24077     write_lock_irq(&tasklist_lock);
24078     SET_LINKS(p);
24079     hash_pid(p);
24080     write_unlock_irq(&tasklist_lock);
24081
24082     nr_tasks++;
24083     if (p->user)
24084       atomic_inc(&p->user->count);
24085
24086     p->next_run = NULL;
24087     p->prev_run = NULL;
24088     wake_up_process(p);  /* do this last */
24089   }
24090   ++total_forks;
24091 bad_fork:
24092   unlock_kernel();
24093   up(&current->mm->mmap_sem);
24094 fork_out:
24095   if ((clone_flags & CLONE_VFORK) && (retval > 0))
24096     down(&sem);
24097   return retval;
24098
24099 bad_fork_cleanup_sighand:
24100   exit_sighand(p);
24101 bad_fork_cleanup_fs:
24102   exit_fs(p);  /* blocking */
24103 bad_fork_cleanup_files:
24104   exit_files(p);  /* blocking */
24105 bad_fork_cleanup:
24106   if (p->exec_domain && p->exec_domain->module)
24107     __MOD_DEC_USE_COUNT(p->exec_domain->module);
24108   if (p->binfmt && p->binfmt->module)
24109     __MOD_DEC_USE_COUNT(p->binfmt->module);
24110
24111   add_free_taskslot(p->tarray_ptr);
24112 bad_fork_free:
24113   free_task_struct(p);
24114   goto bad_fork;
24115 }
24116
24117 void __init filescache_init(void)
24118 {
24119   files_cachep = kmem_cache_create("files_cache",
24120                                    sizeof(struct files_struct),
24121                                    0,
24122                                    SLAB_HWCACHE_ALIGN,
24123                                    NULL, NULL);
24124   if (!files_cachep)
24125     panic("Cannot create files cache");
24126 }
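
The copy_mm() and copy_files() paths in the listing are driven entirely by the clone flags handed to do_fork(): with CLONE_VM the child shares the parent's mm_struct, without it the address space is duplicated (copy-on-write) by dup_mmap(). The effect is observable from user space. The following is a minimal sketch, not part of the kernel listing, assuming glibc's fork() and clone() wrappers on Linux:

/* Demonstrates the CLONE_VM decision made in copy_mm():
 * a fork() child writes to its own copy of the page, a
 * clone(CLONE_VM) child writes to the shared one. */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (64 * 1024)

static int shared_value = 0;

static int child_fn(void *arg)
{
  shared_value = 42;  /* visible to the parent only under CLONE_VM */
  return 0;
}

int main(void)
{
  char *stack = malloc(STACK_SIZE);
  pid_t pid;

  if (!stack)
    return 1;

  /* fork(): the write lands in the child's private copy. */
  pid = fork();
  if (pid == 0) {
    shared_value = 42;
    _exit(0);
  }
  waitpid(pid, NULL, 0);
  printf("after fork():  shared_value = %d\n", shared_value);  /* 0 */

  /* clone(CLONE_VM): parent and child run on the same mm. */
  shared_value = 0;
  pid = clone(child_fn, stack + STACK_SIZE, CLONE_VM | SIGCHLD, NULL);
  if (pid < 0)
    return 1;
  waitpid(pid, NULL, 0);
  printf("after clone(): shared_value = %d\n", shared_value);  /* 42 */

  free(stack);
  return 0;
}

CLONE_FILES and CLONE_SIGHAND behave analogously for the descriptor table and signal handlers handled by copy_files() and copy_sighand(): either the reference count of the existing structure is bumped, or a private copy is made for the child.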

