Linux Core Kernel Commentary

       

mm/vmscan.c


/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages
 *  as needed to bring the system back to freepages.high:
 *  2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct
 *  Exp $
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>

#include <asm/pgtable.h>

/* The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page.  It
 * returns zero if it couldn't do anything, and any other
 * value indicates it decreased rss, but the page was
 * shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out.  Otherwise we may be
 * using a process that no longer actually exists (it
 * might have died while we slept). */

static int try_to_swap_out(struct task_struct * tsk,
    struct vm_area_struct* vma, unsigned long address,
    pte_t * page_table, int gfp_mask)
{
  pte_t pte;
  unsigned long entry;
  unsigned long page;
  struct page * page_map;

  pte = *page_table;
  if (!pte_present(pte))
    return 0;
  page = pte_page(pte);
  if (MAP_NR(page) >= max_mapnr)
    return 0;

  page_map = mem_map + MAP_NR(page);
  if (PageReserved(page_map)
      || PageLocked(page_map)
      || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
    return 0;

  if (pte_young(pte)) {
    /* Transfer the "accessed" bit from the page tables
     * to the global page map. */
    set_pte(page_table, pte_mkold(pte));
    set_bit(PG_referenced, &page_map->flags);
    return 0;
  }

  /* Is the page already in the swap cache? If so, then
   * we can just drop our reference to it without doing
   * any IO - it's already up-to-date on disk.
   *
   * Return 0, as we didn't actually free any real
   * memory, and we should just continue our scan. */
  if (PageSwapCache(page_map)) {
    entry = page_map->offset;
    swap_duplicate(entry);
    set_pte(page_table, __pte(entry));
  drop_pte:
    vma->vm_mm->rss--;
    flush_tlb_page(vma, address);
    __free_page(page_map);
    return 0;
  }

  /* Is it a clean page? Then it must be recoverable by
   * just paging it in again, and we can just drop it..
   *
   * However, this won't actually free any real memory,
   * as the page will just be in the page cache
   * somewhere, and as such we should just continue our
   * scan.
   *
   * Basically, this just makes it possible for us to do
   * some real work in the future in "shrink_mmap()". */
  if (!pte_dirty(pte)) {
    pte_clear(page_table);
    goto drop_pte;
  }

  /* Don't go down into the swap-out stuff if we cannot
   * do I/O! Avoid recursing on FS locks etc. */
  if (!(gfp_mask & __GFP_IO))
    return 0;

  /* Ok, it's really dirty. That means that we should
   * either create a new swap cache entry for it, or we
   * should write it back to its own backing store.
   *
   * Note that in neither case do we actually know that
   * we make a page available, but as we potentially
   * sleep we can no longer continue scanning, so we
   * might as well assume we free'd something.
   *
   * NOTE NOTE NOTE! This should just set a dirty bit in
   * page_map, and just drop the pte. All the hard work
   * would be done by shrink_mmap().
   *
   * That would get rid of a lot of problems. */
  flush_cache_page(vma, address);
  if (vma->vm_ops && vma->vm_ops->swapout) {
    pid_t pid = tsk->pid;
    pte_clear(page_table);
    flush_tlb_page(vma, address);
    vma->vm_mm->rss--;

    if (vma->vm_ops->swapout(vma, page_map))
      kill_proc(pid, SIGBUS, 1);
    __free_page(page_map);
    return 1;
  }

  /* This is a dirty, swappable page. First of all, get
   * a suitable swap entry for it, and make sure we have
   * the swap cache set up to associate the page with
   * that swap entry. */
  entry = get_swap_page();
  if (!entry)
    return 0;  /* No swap space left */

  vma->vm_mm->rss--;
  tsk->nswap++;
  set_pte(page_table, __pte(entry));
  flush_tlb_page(vma, address);
  /* One for the process, one for the swap cache */
  swap_duplicate(entry);
  add_to_swap_cache(page_map, entry);
  /* We checked we were unlocked way up above, and we
   * have been careful not to stall until here */
  set_bit(PG_locked, &page_map->flags);

  /* OK, do a physical asynchronous write to swap. */
  rw_swap_page(WRITE, entry, (char *) page, 0);

  __free_page(page_map);
  return 1;
}
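The ladder of tests in try_to_swap_out() decides the fate of a single page: a recently referenced page is only aged, a page whose contents already exist elsewhere (the swap cache, or a clean file-backed page) is dropped without any I/O, and only a genuinely dirty page costs a swap allocation and a write. The stand-alone user-space sketch below restates just that decision order; struct toy_pte, classify() and the action names are invented for the illustration and are not part of the kernel source.

#include <stdio.h>

enum action { SKIP, AGE_ONLY, DROP_NO_IO, WRITE_TO_SWAP };

/* A toy stand-in for the handful of pte/page bits the kernel tests. */
struct toy_pte { int present, young, dirty, in_swap_cache; };

static enum action classify(struct toy_pte p, int can_do_io)
{
    if (!p.present)
        return SKIP;                 /* nothing mapped at this address */
    if (p.young)
        return AGE_ONLY;             /* clear the accessed bit, revisit later */
    if (p.in_swap_cache || !p.dirty)
        return DROP_NO_IO;           /* a copy already exists on disk: just drop the pte */
    return can_do_io ? WRITE_TO_SWAP /* really dirty: allocate swap and write it out */
                     : SKIP;         /* no __GFP_IO: leave the page alone */
}

int main(void)
{
    struct toy_pte dirty_anon = { 1, 0, 1, 0 };
    printf("dirty anonymous page -> action %d\n",
           classify(dirty_anon, 1)); /* WRITE_TO_SWAP */
    return 0;
}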
/* A new implementation of swap_out().  We do not swap
 * complete processes, but only a small number of blocks,
 * before we continue with the next process.  The number
 * of blocks actually swapped is determined on the number
 * of page faults, that this process actually had in the
 * last time, so we won't swap heavily used processes all
 * the time ...
 *
 * Note: the priority argument is a hint on much CPU to
 * waste with the swap block search, not a hint, of how
 * much blocks to swap with each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */
static inline int swap_out_pmd(struct task_struct * tsk,
    struct vm_area_struct * vma, pmd_t * dir,
    unsigned long address, unsigned long end, int gfp_mask)
{
  pte_t * pte;
  unsigned long pmd_end;

  if (pmd_none(*dir))
    return 0;
  if (pmd_bad(*dir)) {
    printk("swap_out_pmd: bad pmd (%08lx)\n",
           pmd_val(*dir));
    pmd_clear(dir);
    return 0;
  }

  pte = pte_offset(dir, address);

  pmd_end = (address + PMD_SIZE) & PMD_MASK;
  if (end > pmd_end)
    end = pmd_end;

  do {
    int result;
    tsk->mm->swap_address = address + PAGE_SIZE;
    result = try_to_swap_out(tsk, vma, address, pte,
                             gfp_mask);
    if (result)
      return result;
    address += PAGE_SIZE;
    pte++;
  } while (address < end);
  return 0;
}

static inline int swap_out_pgd(struct task_struct * tsk,
    struct vm_area_struct * vma, pgd_t * dir,
    unsigned long address, unsigned long end, int gfp_mask)
{
  pmd_t * pmd;
  unsigned long pgd_end;

  if (pgd_none(*dir))
    return 0;
  if (pgd_bad(*dir)) {
    printk("swap_out_pgd: bad pgd (%08lx)\n",
           pgd_val(*dir));
    pgd_clear(dir);
    return 0;
  }

  pmd = pmd_offset(dir, address);

  pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
  if (end > pgd_end)
    end = pgd_end;

  do {
    int result = swap_out_pmd(tsk, vma, pmd, address,
                              end, gfp_mask);
    if (result)
      return result;
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return 0;
}

static int swap_out_vma(struct task_struct * tsk,
    struct vm_area_struct * vma, unsigned long address,
    int gfp_mask)
{
  pgd_t *pgdir;
  unsigned long end;

  /* Don't swap out areas like shared memory which have
   * their own separate swapping mechanism or areas which
   * are locked down */
  if (vma->vm_flags & (VM_SHM | VM_LOCKED))
    return 0;

  pgdir = pgd_offset(tsk->mm, address);

  end = vma->vm_end;
  while (address < end) {
    int result = swap_out_pgd(tsk, vma, pgdir, address,
                              end, gfp_mask);
    if (result)
      return result;
    address = (address + PGDIR_SIZE) & PGDIR_MASK;
    pgdir++;
  }
  return 0;
}
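swap_out_vma(), swap_out_pgd() and swap_out_pmd() walk the page-table tree one level at a time, and each outer loop advances the address to the start of the next table slot rather than by a fixed increment. The small user-space program below shows only that address arithmetic; the sizes are illustrative i386-style constants chosen for the sketch, not values read from any particular kernel configuration.

#include <stdio.h>

#define PAGE_SIZE  0x1000UL          /* 4 KB page, for illustration */
#define PMD_SIZE   0x400000UL        /* 4 MB covered by one directory slot */
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
    unsigned long address = 0x08049abcUL;  /* an arbitrary user address */

    /* Inner loop of swap_out_pmd(): one pte, i.e. one page, per step. */
    printf("next page:          %#lx\n", address + PAGE_SIZE);

    /* Outer loops: jump to the start of the *next* table, so a vma that
     * straddles a boundary is resumed at an aligned address. */
    printf("next directory slot: %#lx\n", (address + PMD_SIZE) & PMD_MASK);
    return 0;
}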
static int swap_out_process(struct task_struct * p,
    int gfp_mask)
{
  unsigned long address;
  struct vm_area_struct* vma;

  /* Go through process' page directory. */
  address = p->mm->swap_address;

  /* Find the proper vm-area */
  vma = find_vma(p->mm, address);
  if (vma) {
    if (address < vma->vm_start)
      address = vma->vm_start;

    for (;;) {
      int result =
          swap_out_vma(p, vma, address, gfp_mask);
      if (result)
        return result;
      vma = vma->vm_next;
      if (!vma)
        break;
      address = vma->vm_start;
    }
  }

  /* We didn't find anything for the process */
  p->mm->swap_cnt = 0;
  p->mm->swap_address = 0;
  return 0;
}

/* Select the task with maximal swap_cnt and try to swap
 * out a page.  N.B. This function returns only 0 or 1.
 * Return values != 1 from the lower-level routines
 * result in continued processing. */
static int swap_out(unsigned int priority, int gfp_mask)
{
  struct task_struct * p, * pbest;
  int counter, assign, max_cnt;

  /* We make one or two passes through the task list,
   * indexed by assign = {0, 1}:
   *   Pass 1: select the swappable task with maximal
   *           RSS that has not yet been swapped out.
   *   Pass 2: re-assign rss swap_cnt values, then
   *           select as above.
   *
   * With this approach, there's no need to remember the
   * last task swapped out.  If the swap-out fails, we
   * clear swap_cnt so the task won't be selected again
   * until all others have been tried.
   *
   * Think of swap_cnt as a "shadow rss" - it tells us
   * which process we want to page out (always try
   * largest first). */
  counter = nr_tasks / (priority+1);
  if (counter < 1)
    counter = 1;
  if (counter > nr_tasks)
    counter = nr_tasks;

  for (; counter >= 0; counter--) {
    assign = 0;
    max_cnt = 0;
    pbest = NULL;
  select:
    read_lock(&tasklist_lock);
    p = init_task.next_task;
    for (; p != &init_task; p = p->next_task) {
      if (!p->swappable)
        continue;
      if (p->mm->rss <= 0)
        continue;
      /* Refresh swap_cnt? */
      if (assign)
        p->mm->swap_cnt = p->mm->rss;
      if (p->mm->swap_cnt > max_cnt) {
        max_cnt = p->mm->swap_cnt;
        pbest = p;
      }
    }
    read_unlock(&tasklist_lock);
    if (!pbest) {
      if (!assign) {
        assign = 1;
        goto select;
      }
      goto out;
    }

    if (swap_out_process(pbest, gfp_mask))
      return 1;
  }
out:
  return 0;
}

/* We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out
 * behaviour.  See the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
  int priority;
  int count = SWAP_CLUSTER_MAX;

  lock_kernel();

  /* Always trim SLAB caches when memory gets low. */
  kmem_cache_reap(gfp_mask);

  priority = 6;
  do {
    while (shrink_mmap(priority, gfp_mask)) {
      if (!--count)
        goto done;
    }

    /* Try to get rid of some shared memory pages.. */
    if (gfp_mask & __GFP_IO) {
      while (shm_swap(priority, gfp_mask)) {
        if (!--count)
          goto done;
      }
    }

    /* Then, try to page stuff out.. */
    while (swap_out(priority, gfp_mask)) {
      if (!--count)
        goto done;
    }

    shrink_dcache_memory(priority, gfp_mask);
  } while (--priority >= 0);
done:
  unlock_kernel();

  return priority >= 0;
}
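swap_out() chooses its victim in at most two passes: the first pass picks the swappable task with the largest remaining swap_cnt, and if every counter has already been consumed, the second pass (assign = 1) reloads swap_cnt from rss and selects again. The following user-space sketch mirrors only that selection logic; struct toy_task, select_victim() and the sample data are made up for the example and are not kernel interfaces.

#include <stdio.h>

struct toy_task { const char *comm; int rss; int swap_cnt; int swappable; };

static struct toy_task *select_victim(struct toy_task *tasks, int n)
{
    struct toy_task *pbest;
    int assign = 0, max_cnt, i;

select:
    max_cnt = 0;
    pbest = NULL;
    for (i = 0; i < n; i++) {
        if (!tasks[i].swappable || tasks[i].rss <= 0)
            continue;
        if (assign)                       /* pass 2: refresh the "shadow rss" */
            tasks[i].swap_cnt = tasks[i].rss;
        if (tasks[i].swap_cnt > max_cnt) {
            max_cnt = tasks[i].swap_cnt;
            pbest = &tasks[i];
        }
    }
    if (!pbest && !assign) {              /* everything exhausted: second pass */
        assign = 1;
        goto select;
    }
    return pbest;                         /* may be NULL: nothing swappable at all */
}

int main(void)
{
    struct toy_task tasks[] = {
        { "editor",  300, 0, 1 },         /* swap_cnt already used up */
        { "browser", 900, 0, 1 },
        { "locked",  500, 0, 0 },         /* not swappable */
    };
    struct toy_task *victim = select_victim(tasks, 3);
    if (victim)
        printf("victim: %s (swap_cnt %d)\n", victim->comm, victim->swap_cnt);
    return 0;
}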
/* Before we start the kernel thread, print out the
 * kswapd initialization message (otherwise the init
 * message may be printed in the middle of another
 * driver's init message).  It looks very bad when that
 * happens. */
void __init kswapd_setup(void)
{
  int i;
  char *revision="$Revision: 1.5 $", *s, *e;

  swap_setup();

  if ((s = strchr(revision, ':')) &&
      (e = strchr(s, '$')))
    s++, i = e - s;
  else
    s = revision, i = -1;
  printk ("Starting kswapd v%.*s\n", i, s);
}

static struct task_struct *kswapd_process;

/* The background pageout daemon, started as a kernel
 * thread from the init process.
 *
 * This basically executes once a second, trickling out
 * pages so that we have _some_ free memory available
 * even if there is no other activity that frees anything
 * up.  This is needed for things like routing etc, where
 * we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active
 * memory-allocators (most normal use), this basically
 * shouldn't matter. */
int kswapd(void *unused)
{
  struct task_struct *tsk = current;

  kswapd_process = tsk;
  tsk->session = 1;
  tsk->pgrp = 1;
  strcpy(tsk->comm, "kswapd");
  sigfillset(&tsk->blocked);

  /* Tell the memory management that we're a "memory
   * allocator", and that if we need more memory we
   * should get access to it regardless (see
   * "__get_free_pages()"). "kswapd" should never get
   * caught in the normal page freeing logic.
   *
   * (Kswapd normally doesn't need memory anyway, but
   * sometimes you need a small amount of memory in order
   * to be able to page out something else, and this flag
   * essentially protects us from recursively trying to
   * free more memory as we're trying to free the first
   * piece of memory in the first place). */
  tsk->flags |= PF_MEMALLOC;

  while (1) {
    /* Wake up once a second to see if we need to make
     * more memory available.
     *
     * If we actually get into a low-memory situation,
     * the processes needing more memory will wake us up
     * on a more timely basis. */
    do {
      if (nr_free_pages >= freepages.high)
        break;

      if (!do_try_to_free_pages(GFP_KSWAPD))
        break;
    } while (!tsk->need_resched);
    run_task_queue(&tq_disk);
    tsk->state = TASK_INTERRUPTIBLE;
    schedule_timeout(HZ);
  }
}

/* Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return.  We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM
 * locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and
 * this process thread could do just the mmap shrink
 * stage that can be done by just dropping cached pages
 * without having any deadlock issues. */
int try_to_free_pages(unsigned int gfp_mask)
{
  int retval = 1;

  wake_up_process(kswapd_process);
  if (gfp_mask & __GFP_WAIT)
    retval = do_try_to_free_pages(gfp_mask);
  return retval;
}
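kswapd's main loop wakes roughly once a second and, on each pass, keeps calling do_try_to_free_pages() until either nr_free_pages reaches the freepages.high watermark or nothing more can be reclaimed. The toy program below compresses that rhythm into ordinary user-space C; the globals and reclaim_one_cluster() are stand-ins invented for this sketch, not kernel interfaces.

#include <stdio.h>
#include <unistd.h>

static int nr_free_pages = 12;           /* pretend we start below the watermark */
static const int freepages_high = 64;

/* Pretend reclaim: frees a small cluster, returns 0 when nothing was freed. */
static int reclaim_one_cluster(void)
{
    if (nr_free_pages >= 100)
        return 0;
    nr_free_pages += 8;
    return 1;
}

int main(void)
{
    int second;

    for (second = 0; second < 3; second++) {
        while (nr_free_pages < freepages_high &&  /* below freepages.high? */
               reclaim_one_cluster())             /* keep trickling pages out */
            ;
        printf("after pass %d: %d free pages\n", second, nr_free_pages);
        sleep(1);                                 /* schedule_timeout(HZ) analogue */
    }
    return 0;
}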


