Linux Core Kernel Commentary


mm/memory.c


/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/* demand-loading started 01.12.91 - seems it is high on
 * the list of things wanted, and it should be easy to
 * implement. - Linus */

/* Ok, demand-loading was easy, shared pages a little bit
 * tricker. Shared pages started 02.12.91, seems to
 * work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under
 * the old kernel it would have taken more than the 6M I
 * have free, but it worked well as far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing
 * enough of them. */

/* Real VM (paging to/from disk) started 18.12.91. Much
 * more work and thought has to go into this. Oh, well..
 * 19.12.91 - works, somewhat. Sometimes I get faults,
 * don't know why. Found it. Everything seems to work
 * now.
 * 20.12.91 - Ok, making the swap-device changeable like
 * the root. */
/* 05.04.94 - Multi-page memory management added for v1.1.
 * Idea by Alex Bligh (alex@cconcepts.co.uk)
 */

#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

unsigned long max_mapnr = 0;
unsigned long num_physpages = 0;
void * high_memory = NULL;

/* We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory
 * subsystem). */
static inline void copy_cow_page(unsigned long from,
  unsigned long to)
{
  if (from == ZERO_PAGE) {
    clear_page(to);
    return;
  }
  copy_page(to, from);
}

mem_map_t * mem_map = NULL;

/* oom() prints a message (so that the user knows why the
 * process died), and gives the process an untrappable
 * SIGKILL. */
void oom(struct task_struct * task)
{
  printk("\nOut of memory for %s.\n", task->comm);
  force_sig(SIGKILL, task);
}

/* Note: this doesn't free the actual pages
 * themselves. That has been handled earlier when
 * unmapping all the memory regions. */
static inline void free_one_pmd(pmd_t * dir)
{
  pte_t * pte;

  if (pmd_none(*dir))
    return;
  if (pmd_bad(*dir)) {
    printk("free_one_pmd: bad directory entry %08lx\n",
           pmd_val(*dir));
    pmd_clear(dir);
    return;
  }
  pte = pte_offset(dir, 0);
  pmd_clear(dir);
  pte_free(pte);
}

static inline void free_one_pgd(pgd_t * dir)
{
  int j;
  pmd_t * pmd;

  if (pgd_none(*dir))
    return;
  if (pgd_bad(*dir)) {
    printk("free_one_pgd: bad directory entry %08lx\n",
           pgd_val(*dir));
    pgd_clear(dir);
    return;
  }
  pmd = pmd_offset(dir, 0);
  pgd_clear(dir);
  for (j = 0; j < PTRS_PER_PMD ; j++)
    free_one_pmd(pmd+j);
  pmd_free(pmd);
}

/* Low and high watermarks for page table cache. The
 * system should try to have pgt_water[0] <= cache
 * elements <= pgt_water[1] */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
  return do_check_pgt_cache(pgt_cache_water[0],
                            pgt_cache_water[1]);
}

/* This function clears all user-level page tables of a
 * process - this is needed by execve(), so that old
 * pages aren't in the way. */
void clear_page_tables(struct mm_struct *mm,
  unsigned long first, int nr)
{
  pgd_t * page_dir = mm->pgd;

  if (page_dir && page_dir != swapper_pg_dir) {
    page_dir += first;
    do {
      free_one_pgd(page_dir);
      page_dir++;
    } while (--nr);

    /* keep the page table cache within bounds */
    check_pgt_cache();
  }
}

/* This function just free's the page directory - the
 * page tables themselves have been freed earlier by
 * clear_page_tables(). */
void free_page_tables(struct mm_struct * mm)
{
  pgd_t * page_dir = mm->pgd;

  if (page_dir) {
    if (page_dir == swapper_pg_dir)
      goto out_bad;
    pgd_free(page_dir);
  }
  return;

out_bad:
  printk(KERN_ERR
    "free_page_tables: Trying to free kernel pgd\n");
  return;
}

int new_page_tables(struct task_struct * tsk)
{
  pgd_t * new_pg;

  if (!(new_pg = pgd_alloc()))
    return -ENOMEM;
  SET_PAGE_DIR(tsk, new_pg);
  tsk->mm->pgd = new_pg;
  return 0;
}

#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))

/* copy one vm_area from one task to the other. Assumes
 * the page tables already present in the new task to be
 * cleared in the whole range covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline
 * routines to reduce variable count and make things
 * faster. -jj */
int copy_page_range(struct mm_struct *dst,
  struct mm_struct *src,
  struct vm_area_struct *vma)
{
  pgd_t * src_pgd, * dst_pgd;
  unsigned long address = vma->vm_start;
  unsigned long end = vma->vm_end;
  unsigned long cow =
    (vma->vm_flags & (VM_SHARED | VM_MAYWRITE))
    == VM_MAYWRITE;

  src_pgd = pgd_offset(src, address)-1;
  dst_pgd = pgd_offset(dst, address)-1;

  for (;;) {
    pmd_t * src_pmd, * dst_pmd;

    src_pgd++; dst_pgd++;

    /* copy_pmd_range */

    if (pgd_none(*src_pgd))
      goto skip_copy_pmd_range;
    if (pgd_bad(*src_pgd)) {
      printk("copy_pmd_range: bad pgd (%08lx)\n",
             pgd_val(*src_pgd));
      pgd_clear(src_pgd);
skip_copy_pmd_range:
      address = (address + PGDIR_SIZE) & PGDIR_MASK;
      if (address >= end)
        goto out;
      continue;
    }
    if (pgd_none(*dst_pgd)) {
      if (!pmd_alloc(dst_pgd, 0))
        goto nomem;
    }

    src_pmd = pmd_offset(src_pgd, address);
    dst_pmd = pmd_offset(dst_pgd, address);

    do {
      pte_t * src_pte, * dst_pte;

      /* copy_pte_range */

      if (pmd_none(*src_pmd))
        goto skip_copy_pte_range;
      if (pmd_bad(*src_pmd)) {
        printk("copy_pte_range: bad pmd (%08lx)\n",
               pmd_val(*src_pmd));
        pmd_clear(src_pmd);
skip_copy_pte_range:
        address = (address + PMD_SIZE) & PMD_MASK;
        if (address >= end)
          goto out;
        goto cont_copy_pmd_range;
      }
      if (pmd_none(*dst_pmd)) {
        if (!pte_alloc(dst_pmd, 0))
          goto nomem;
      }

      src_pte = pte_offset(src_pmd, address);
      dst_pte = pte_offset(dst_pmd, address);

      do {
        pte_t pte = *src_pte;
        unsigned long page_nr;

        /* copy_one_pte */

        if (pte_none(pte))
          goto cont_copy_pte_range;
        if (!pte_present(pte)) {
          swap_duplicate(pte_val(pte));
          set_pte(dst_pte, pte);
          goto cont_copy_pte_range;
        }
        page_nr = MAP_NR(pte_page(pte));
        if (page_nr >= max_mapnr ||
            PageReserved(mem_map+page_nr)) {
          set_pte(dst_pte, pte);
          goto cont_copy_pte_range;
        }
        /* If it's a COW mapping, write protect it both
         * in the parent and the child */
        if (cow) {
          pte = pte_wrprotect(pte);
          set_pte(src_pte, pte);
        }
        /* If it's a shared mapping, mark it clean in the
         * child */
        if (vma->vm_flags & VM_SHARED)
          pte = pte_mkclean(pte);
        set_pte(dst_pte, pte_mkold(pte));
        atomic_inc(&mem_map[page_nr].count);

cont_copy_pte_range:
        address += PAGE_SIZE;
        if (address >= end)
          goto out;
        src_pte++;
        dst_pte++;
      } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:
      src_pmd++;
      dst_pmd++;
    } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
  }
out:
  return 0;

nomem:
  return -ENOMEM;
}

/* Return indicates whether a page was freed so caller
 * can adjust rss */
static inline int free_pte(pte_t page)
{
  if (pte_present(page)) {
    unsigned long addr = pte_page(page);
    if (MAP_NR(addr) >= max_mapnr ||
        PageReserved(mem_map+MAP_NR(addr)))
      return 0;
    /* free_page() used to be able to clear swap cache
     * entries. We may now have to do it manually. */
    free_page_and_swap_cache(addr);
    return 1;
  }
  swap_free(pte_val(page));
  return 0;
}

static inline void forget_pte(pte_t page)
{
  if (!pte_none(page)) {
    printk("forget_pte: old mapping existed!\n");
    free_pte(page);
  }
}

static inline int zap_pte_range(pmd_t * pmd,
  unsigned long address, unsigned long size)
{
  pte_t * pte;
  int freed;

  if (pmd_none(*pmd))
    return 0;
  if (pmd_bad(*pmd)) {
    printk("zap_pte_range: bad pmd (%08lx)\n",
           pmd_val(*pmd));
    pmd_clear(pmd);
    return 0;
  }
  pte = pte_offset(pmd, address);
  address &= ~PMD_MASK;
  if (address + size > PMD_SIZE)
    size = PMD_SIZE - address;
  size >>= PAGE_SHIFT;
  freed = 0;
  for (;;) {
    pte_t page;
    if (!size)
      break;
    page = *pte;
    pte++;
    size--;
    if (pte_none(page))
      continue;
    pte_clear(pte-1);
    freed += free_pte(page);
  }
  return freed;
}

static inline int zap_pmd_range(pgd_t * dir,
  unsigned long address, unsigned long size)
{
  pmd_t * pmd;
  unsigned long end;
  int freed;

  if (pgd_none(*dir))
    return 0;
  if (pgd_bad(*dir)) {
    printk("zap_pmd_range: bad pgd (%08lx)\n",
           pgd_val(*dir));
    pgd_clear(dir);
    return 0;
  }
  pmd = pmd_offset(dir, address);
  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  freed = 0;
  do {
    freed += zap_pte_range(pmd, address, end - address);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return freed;
}

/* remove user pages in a given range. */
void zap_page_range(struct mm_struct *mm,
  unsigned long address, unsigned long size)
{
  pgd_t * dir;
  unsigned long end = address + size;
  int freed = 0;

  dir = pgd_offset(mm, address);
  while (address < end) {
    freed += zap_pmd_range(dir, address, end - address);
    address = (address + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  /* Update rss for the mm_struct (not necessarily
   * current->mm) */
  if (mm->rss > 0) {
    mm->rss -= freed;
    if (mm->rss < 0)
      mm->rss = 0;
  }
}

static inline void zeromap_pte_range(pte_t * pte,
  unsigned long address, unsigned long size,
  pte_t zero_pte)
{
  unsigned long end;

  address &= ~PMD_MASK;
  end = address + size;
  if (end > PMD_SIZE)
    end = PMD_SIZE;
  do {
    pte_t oldpage = *pte;
    set_pte(pte, zero_pte);
    forget_pte(oldpage);
    address += PAGE_SIZE;
    pte++;
  } while (address < end);
}

static inline int zeromap_pmd_range(pmd_t * pmd,
  unsigned long address, unsigned long size,
  pte_t zero_pte)
{
  unsigned long end;

  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  do {
    pte_t * pte = pte_alloc(pmd, address);
    if (!pte)
      return -ENOMEM;
    zeromap_pte_range(pte, address, end - address,
                      zero_pte);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return 0;
}

int zeromap_page_range(unsigned long address,
  unsigned long size, pgprot_t prot)
{
  int error = 0;
  pgd_t * dir;
  unsigned long beg = address;
  unsigned long end = address + size;
  pte_t zero_pte;

  zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
  dir = pgd_offset(current->mm, address);
  flush_cache_range(current->mm, beg, end);
  while (address < end) {
    pmd_t *pmd = pmd_alloc(dir, address);
    error = -ENOMEM;
    if (!pmd)
      break;
    error = zeromap_pmd_range(pmd, address,
                              end - address, zero_pte);
    if (error)
      break;
    address = (address + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  flush_tlb_range(current->mm, beg, end);
  return error;
}

/* maps a range of physical memory into the requested
 * pages. the old mappings are removed. any references to
 * nonexistent pages results in null mappings (currently
 * treated as "copy-on-access") */
static inline void remap_pte_range(pte_t * pte,
  unsigned long address, unsigned long size,
  unsigned long phys_addr, pgprot_t prot)
{
  unsigned long end;

  address &= ~PMD_MASK;
  end = address + size;
  if (end > PMD_SIZE)
    end = PMD_SIZE;
  do {
    unsigned long mapnr;
    pte_t oldpage = *pte;
    pte_clear(pte);

    mapnr = MAP_NR(__va(phys_addr));
    if (mapnr >= max_mapnr ||
        PageReserved(mem_map+mapnr))
      set_pte(pte, mk_pte_phys(phys_addr, prot));
    forget_pte(oldpage);
    address += PAGE_SIZE;
    phys_addr += PAGE_SIZE;
    pte++;
  } while (address < end);
}

static inline int remap_pmd_range(pmd_t * pmd,
  unsigned long address, unsigned long size,
  unsigned long phys_addr, pgprot_t prot)
{
  unsigned long end;

  address &= ~PGDIR_MASK;
  end = address + size;
  if (end > PGDIR_SIZE)
    end = PGDIR_SIZE;
  phys_addr -= address;
  do {
    pte_t * pte = pte_alloc(pmd, address);
    if (!pte)
      return -ENOMEM;
    remap_pte_range(pte, address, end - address,
                    address + phys_addr, prot);
    address = (address + PMD_SIZE) & PMD_MASK;
    pmd++;
  } while (address < end);
  return 0;
}

int remap_page_range(unsigned long from,
  unsigned long phys_addr, unsigned long size,
  pgprot_t prot)
{
  int error = 0;
  pgd_t * dir;
  unsigned long beg = from;
  unsigned long end = from + size;

  phys_addr -= from;
  dir = pgd_offset(current->mm, from);
  flush_cache_range(current->mm, beg, end);
  while (from < end) {
    pmd_t *pmd = pmd_alloc(dir, from);
    error = -ENOMEM;
    if (!pmd)
      break;
    error = remap_pmd_range(pmd, from, end - from,
                            phys_addr + from, prot);
    if (error)
      break;
    from = (from + PGDIR_SIZE) & PGDIR_MASK;
    dir++;
  }
  flush_tlb_range(current->mm, beg, end);
  return error;
}

/* sanity-check function.. */
static void put_page(pte_t * page_table, pte_t pte)
{
  if (!pte_none(*page_table)) {
    free_page_and_swap_cache(pte_page(pte));
    return;
  }
  /* no need for flush_tlb */
  set_pte(page_table, pte);
}

/* This routine is used to map in a page into an address
 * space: needed by execve() for the initial stack and
 * environment pages. */
unsigned long put_dirty_page(struct task_struct * tsk,
  unsigned long page, unsigned long address)
{
  pgd_t * pgd;
  pmd_t * pmd;
  pte_t * pte;

  if (MAP_NR(page) >= max_mapnr)
    printk("put_dirty_page: trying to put page %08lx at "
           "%08lx\n", page, address);
  if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
    printk("mem_map disagrees with %08lx at %08lx\n",
           page, address);
  pgd = pgd_offset(tsk->mm, address);
  pmd = pmd_alloc(pgd, address);
  if (!pmd) {
    free_page(page);
    oom(tsk);
    return 0;
  }
  pte = pte_alloc(pmd, address);
  if (!pte) {
    free_page(page);
    oom(tsk);
    return 0;
  }
  if (!pte_none(*pte)) {
    printk("put_dirty_page: page already exists\n");
    free_page(page);
    return 0;
  }
  flush_page_to_ram(page);
  set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page,
    PAGE_COPY))));
  /* no need for flush_tlb */
  return page;
}

/* This routine handles present pages, when users try to
 * write to a shared page. It is done by copying the page
 * to a new address and decrementing the shared-page
 * counter for the old page.
 *
 * Goto-purists beware: the only reason for goto's here
 * is that it results in better assembly code.. The
 * "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection
 * checks have been done by the caller (the low-level
 * page fault routine in most cases). Thus we can safely
 * just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though
 * the page will change only once the write actually
 * happens. This avoids a few races, and potentially
 * makes it more efficient. */
static int do_wp_page(struct task_struct * tsk,
  struct vm_area_struct * vma, unsigned long address,
  pte_t *page_table)
{
  pte_t pte;
  unsigned long old_page, new_page;
  struct page * page_map;

  pte = *page_table;
  new_page = __get_free_page(GFP_USER);
  /* Did someone else copy this page for us while we
   * slept? */
  if (pte_val(*page_table) != pte_val(pte))
    goto end_wp_page;
  if (!pte_present(pte))
    goto end_wp_page;
  if (pte_write(pte))
    goto end_wp_page;
  old_page = pte_page(pte);
  if (MAP_NR(old_page) >= max_mapnr)
    goto bad_wp_page;
  tsk->min_flt++;
  page_map = mem_map + MAP_NR(old_page);

  /* We can avoid the copy if:
   * - we're the only user (count == 1)
   * - the only other user is the swap cache,
   *   and the only swap cache user is itself,
   *   in which case we can remove the page
   *   from the swap cache.
   */
  switch (atomic_read(&page_map->count)) {
  case 2:
    if (!PageSwapCache(page_map))
      break;
    if (swap_count(page_map->offset) != 1)
      break;
    delete_from_swap_cache(page_map);
    /* FallThrough */
  case 1:
    /* We can release the kernel lock now.. */
    unlock_kernel();

    flush_cache_page(vma, address);
    set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
    flush_tlb_page(vma, address);
end_wp_page:
    if (new_page)
      free_page(new_page);
    return 1;
  }

  unlock_kernel();
  if (!new_page)
    return 0;

  if (PageReserved(mem_map + MAP_NR(old_page)))
    ++vma->vm_mm->rss;
  copy_cow_page(old_page, new_page);
  flush_page_to_ram(old_page);
  flush_page_to_ram(new_page);
  flush_cache_page(vma, address);
  set_pte(page_table,
    pte_mkwrite(pte_mkdirty(mk_pte(new_page,
      vma->vm_page_prot))));
  free_page(old_page);
  flush_tlb_page(vma, address);
  return 1;

bad_wp_page:
  printk("do_wp_page: bogus page at address "
         "%08lx (%08lx)\n", address, old_page);
  send_sig(SIGKILL, tsk, 1);
  if (new_page)
    free_page(new_page);
  return 0;
}

/* This function zeroes out partial mmap'ed pages at
 * truncation time.. */
static void partial_clear(struct vm_area_struct *vma,
  unsigned long address)
{
  pgd_t *page_dir;
  pmd_t *page_middle;
  pte_t *page_table, pte;

  page_dir = pgd_offset(vma->vm_mm, address);
  if (pgd_none(*page_dir))
    return;
  if (pgd_bad(*page_dir)) {
    printk("bad page table directory entry %p:[%lx]\n",
           page_dir, pgd_val(*page_dir));
    pgd_clear(page_dir);
    return;
  }
  page_middle = pmd_offset(page_dir, address);
  if (pmd_none(*page_middle))
    return;
  if (pmd_bad(*page_middle)) {
    printk("bad page table directory entry %p:[%lx]\n",
           page_dir, pgd_val(*page_dir));
    pmd_clear(page_middle);
    return;
  }
  page_table = pte_offset(page_middle, address);
  pte = *page_table;
  if (!pte_present(pte))
    return;
  flush_cache_page(vma, address);
  address &= ~PAGE_MASK;
  address += pte_page(pte);
  if (MAP_NR(address) >= max_mapnr)
    return;
  memset((void *) address, 0,
         PAGE_SIZE - (address & ~PAGE_MASK));
  flush_page_to_ram(pte_page(pte));
}

/* Handle all mappings that got truncated by a
 * "truncate()" system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential
 * last incomplete page. Ugly, but necessary. */
void vmtruncate(struct inode * inode,
  unsigned long offset)
{
  struct vm_area_struct * mpnt;

  truncate_inode_pages(inode, offset);
  if (!inode->i_mmap)
    return;
  mpnt = inode->i_mmap;
  do {
    struct mm_struct *mm = mpnt->vm_mm;
    unsigned long start = mpnt->vm_start;
    unsigned long end = mpnt->vm_end;
    unsigned long len = end - start;
    unsigned long diff;

    /* mapping wholly truncated? */
    if (mpnt->vm_offset >= offset) {
      flush_cache_range(mm, start, end);
      zap_page_range(mm, start, len);
      flush_tlb_range(mm, start, end);
      continue;
    }
    /* mapping wholly unaffected? */
    diff = offset - mpnt->vm_offset;
    if (diff >= len)
      continue;
    /* Ok, partially affected.. */
    start += diff;
    len = (len - diff) & PAGE_MASK;
    if (start & ~PAGE_MASK) {
      partial_clear(mpnt, start);
      start = (start + ~PAGE_MASK) & PAGE_MASK;
    }
    flush_cache_range(mm, start, end);
    zap_page_range(mm, start, len);
    flush_tlb_range(mm, start, end);
  } while ((mpnt = mpnt->vm_next_share) != NULL);
}


/* This is called with the kernel lock held, we need to
 * return without it. */
static int do_swap_page(struct task_struct * tsk,
  struct vm_area_struct * vma, unsigned long address,
  pte_t * page_table, pte_t entry, int write_access)
{
  if (!vma->vm_ops || !vma->vm_ops->swapin) {
    swap_in(tsk, vma, page_table, pte_val(entry),
            write_access);
    flush_page_to_ram(pte_page(*page_table));
  } else {
    pte_t page =
      vma->vm_ops->swapin(vma,
        address - vma->vm_start + vma->vm_offset,
        pte_val(entry));
    if (pte_val(*page_table) != pte_val(entry)) {
      free_page(pte_page(page));
    } else {
      if (atomic_read(&mem_map[MAP_NR(pte_page(page))].count) > 1 &&
          !(vma->vm_flags & VM_SHARED))
        page = pte_wrprotect(page);
      ++vma->vm_mm->rss;
      ++tsk->maj_flt;
      flush_page_to_ram(pte_page(page));
      set_pte(page_table, page);
    }
  }
  unlock_kernel();
  return 1;
}

/* This only needs the MM semaphore */
static int do_anonymous_page(struct task_struct * tsk,
  struct vm_area_struct * vma, pte_t *page_table,
  int write_access)
{
  pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE,
    vma->vm_page_prot));
  if (write_access) {
    unsigned long page = __get_free_page(GFP_USER);
    if (!page)
      return 0;
    clear_page(page);
    entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
      vma->vm_page_prot)));
    vma->vm_mm->rss++;
    tsk->min_flt++;
    flush_page_to_ram(page);
  }
  put_page(page_table, entry);
  return 1;
}

/* do_no_page() tries to create a new page mapping. It
 * aggressively tries to share with existing pages, but
 * makes a separate copy if the "write_access" parameter
 * is true in order to avoid the next page fault.
 *
 * As this is called only for pages that do not currently
 * exist, we do not need to flush old virtual caches or
 * the TLB.
 *
 * This is called with the MM semaphore and the kernel
 * lock held. We need to release the kernel lock as soon
 * as possible.. */
static int do_no_page(struct task_struct * tsk,
  struct vm_area_struct * vma, unsigned long address,
  int write_access, pte_t *page_table)
{
  unsigned long page;
  pte_t entry;

  if (!vma->vm_ops || !vma->vm_ops->nopage) {
    unlock_kernel();
    return do_anonymous_page(tsk, vma, page_table,
                             write_access);
  }

  /* The third argument is "no_share", which tells the
   * low-level code to copy, not share the page even if
   * sharing is possible. It's essentially an early COW
   * detection. */
  page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
    (vma->vm_flags & VM_SHARED) ? 0 : write_access);

  unlock_kernel();
  if (!page)
    return 0;

  ++tsk->maj_flt;
  ++vma->vm_mm->rss;
  /* This silly early PAGE_DIRTY setting removes a race
   * due to the bad i386 page protection. But it's valid
   * for other architectures too.
   *
   * Note that if write_access is true, we either now
   * have an exclusive copy of the page, or this is a
   * shared mapping, so we can make it writable and dirty
   * to avoid having to handle that later. */
  flush_page_to_ram(page);
  entry = mk_pte(page, vma->vm_page_prot);
  if (write_access) {
    entry = pte_mkwrite(pte_mkdirty(entry));
  } else if (atomic_read(&mem_map[MAP_NR(page)].count) > 1 &&
             !(vma->vm_flags & VM_SHARED))
    entry = pte_wrprotect(entry);
  put_page(page_table, entry);
  /* no need to invalidate: a not-present page shouldn't
   * be cached */
  return 1;
}

/* These routines also need to handle stuff like marking
 * pages dirty and/or accessed for architectures that
 * don't do it in hardware (most RISC architectures).
 * The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that
 * architectures with external mmu caches can use to
 * update those (ie the Sparc or PowerPC hashed page
 * tables that act as extended TLBs). */
static inline int handle_pte_fault(
  struct task_struct *tsk,
  struct vm_area_struct * vma, unsigned long address,
  int write_access, pte_t * pte)
{
  pte_t entry;

  lock_kernel();
  entry = *pte;

  if (!pte_present(entry)) {
    if (pte_none(entry))
      return do_no_page(tsk, vma, address, write_access,
                        pte);
    return do_swap_page(tsk, vma, address, pte, entry,
                        write_access);
  }

  entry = pte_mkyoung(entry);
  set_pte(pte, entry);
  flush_tlb_page(vma, address);
  if (write_access) {
    if (!pte_write(entry))
      return do_wp_page(tsk, vma, address, pte);

    entry = pte_mkdirty(entry);
    set_pte(pte, entry);
    flush_tlb_page(vma, address);
  }
  unlock_kernel();
  return 1;
}

/* By the time we get here, we already hold the mm
 * semaphore */
int handle_mm_fault(struct task_struct *tsk,
  struct vm_area_struct * vma, unsigned long address,
  int write_access)
{
  pgd_t *pgd;
  pmd_t *pmd;

  pgd = pgd_offset(vma->vm_mm, address);
  pmd = pmd_alloc(pgd, address);
  if (pmd) {
    pte_t * pte = pte_alloc(pmd, address);
    if (pte) {
      if (handle_pte_fault(tsk, vma, address,
                           write_access, pte)) {
        update_mmu_cache(vma, address, *pte);
        return 1;
      }
    }
  }
  return 0;
}

/* Simplistic page force-in.. */
void make_pages_present(unsigned long addr,
  unsigned long end)
{
  int write;
  struct vm_area_struct * vma;

  vma = find_vma(current->mm, addr);
  write = (vma->vm_flags & VM_WRITE) != 0;
  while (addr < end) {
    handle_mm_fault(current, vma, addr, write);
    addr += PAGE_SIZE;
  }
}
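Editor's note: the recurring pattern in this listing is the lazy page-table walk that handle_mm_fault() performs — look up the top-level entry, allocate the missing lower-level table on demand (pmd_alloc()/pte_alloc()), then operate on the final entry. The stand-alone sketch below models only that pattern in user space; the toy_* names, the two-level layout, and the 10-bit/12-bit index split are illustrative assumptions for this sketch, not kernel definitions or kernel API.

/* Toy model of the lazy table walk used by handle_mm_fault():
 * find the top-level slot, allocate the lower table on first use,
 * then install a value in the final entry.  Two levels are used
 * here for brevity; the real code walks pgd -> pmd -> pte. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define TOY_PTRS        1024      /* entries per table level (assumed) */
#define TOY_PAGE_SHIFT  12        /* 4 KB toy "pages" (assumed)        */

typedef uintptr_t toy_pte;        /* bottom-level entry                */
typedef toy_pte  *toy_pmd;        /* lower table: array of toy_pte     */
typedef toy_pmd  *toy_pgd;        /* top table: array of toy_pmd       */

/* Analogue of pmd_alloc(): return the lower table for addr,
 * allocating it lazily if the top-level slot is still empty. */
static toy_pmd toy_pmd_alloc(toy_pgd pgd, unsigned long addr)
{
  unsigned long i = (addr >> (TOY_PAGE_SHIFT + 10)) & (TOY_PTRS - 1);
  if (!pgd[i])
    pgd[i] = calloc(TOY_PTRS, sizeof(toy_pte));
  return pgd[i];
}

/* Analogue of pte_offset()/pte_alloc(): pointer to the final entry. */
static toy_pte *toy_pte_alloc(toy_pmd pmd, unsigned long addr)
{
  unsigned long i = (addr >> TOY_PAGE_SHIFT) & (TOY_PTRS - 1);
  return pmd ? &pmd[i] : NULL;
}

/* Analogue of handle_mm_fault(): walk, allocating as needed,
 * and "install" a value; returns 0 on the out-of-memory path. */
static int toy_handle_fault(toy_pgd pgd, unsigned long addr, toy_pte val)
{
  toy_pmd pmd = toy_pmd_alloc(pgd, addr);
  toy_pte *pte = toy_pte_alloc(pmd, addr);
  if (!pte)
    return 0;
  *pte = val;
  return 1;
}

int main(void)
{
  toy_pgd pgd = calloc(TOY_PTRS, sizeof(toy_pmd));
  unsigned long addr = 0x08049000UL;

  if (pgd && toy_handle_fault(pgd, addr, 0xabcd))
    printf("entry for %#lx installed\n", addr);
  return 0;
}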

