Linux Core Kernel Commentary

       

mm/page_alloc.c


/*
 * linux/mm/page_alloc.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>

int nr_swap_pages = 0;
int nr_free_pages = 0;

/* Free area management
 *
 * The free_area_list arrays point to the queue heads of
 * the free areas of different sizes */

#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned
 * chunks of ram for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 6
#endif

/* The start of this MUST match the start of "struct
 * page" */
struct free_area_struct {
  struct page *next;
  struct page *prev;
  unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];

static inline void init_mem_queue(
  struct free_area_struct * head)
{
  head->next = memory_head(head);
  head->prev = memory_head(head);
}

static inline void add_mem_queue(
  struct free_area_struct * head, struct page * entry)
{
  struct page * next = head->next;

  entry->prev = memory_head(head);
  entry->next = next;
  next->prev = entry;
  head->next = entry;
}

static inline void remove_mem_queue(struct page * entry)
{
  struct page * next = entry->next;
  struct page * prev = entry->prev;
  next->prev = prev;
  prev->next = next;
}

/* Free_page() adds the page to the free lists. This is
 * optimized for fast normal cases (no error jumps taken
 * normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the
 *    if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line
 * execution path for the normal case, giving better
 * asm-code. */
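Note how free_area_struct deliberately begins with the same two pointer fields as struct page, so memory_head() can cast the list head itself to a struct page * and the queue routines never have to special-case an empty list. The following user-space sketch (hypothetical names, not kernel code) illustrates the same prefix-compatibility trick with a circular doubly-linked list.

#include <stdio.h>

/* Stand-ins for struct page and struct free_area_struct:
 * the head starts with the same two pointers as a node,
 * so it can be treated as a node of the circular list. */
struct node { struct node *next, *prev; int id; };
struct head { struct node *next, *prev; };

#define list_head(h) ((struct node *)(h))

static void init_queue(struct head *h)
{
  h->next = list_head(h);
  h->prev = list_head(h);
}

static void add_queue(struct head *h, struct node *e)
{
  struct node *next = h->next;
  e->prev = list_head(h);
  e->next = next;
  next->prev = e;
  h->next = e;
}

int main(void)
{
  struct head h;
  struct node a = { .id = 1 }, b = { .id = 2 };
  struct node *p;

  init_queue(&h);
  add_queue(&h, &a);
  add_queue(&h, &b);

  /* Walk until we come back around to the head. */
  for (p = h.next; p != list_head(&h); p = p->next)
    printf("node %d\n", p->id);
  return 0;
}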
/* Buddy system. Hairy. You really aren't expected to
 * understand this
 *
 * Hint: -mask = 1+~mask */
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;

static inline void free_pages_ok(unsigned long map_nr,
  unsigned long order)
{
  struct free_area_struct *area = free_area + order;
  unsigned long index = map_nr >> (1 + order);
  unsigned long mask = (~0UL) << order;
  unsigned long flags;

  spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

  map_nr &= mask;
  nr_free_pages -= mask;
  while (mask + (1 << (NR_MEM_LISTS-1))) {
    if (!test_and_change_bit(index, area->map))
      break;
    remove_mem_queue(list(map_nr ^ -mask));
    mask <<= 1;
    area++;
    index >>= 1;
    map_nr &= mask;
  }
  add_mem_queue(area, list(map_nr));

#undef list

  spin_unlock_irqrestore(&page_alloc_lock, flags);
}

void __free_page(struct page *page)
{
  if (!PageReserved(page) &&
      atomic_dec_and_test(&page->count)) {
    if (PageSwapCache(page))
      panic ("Freeing swap cache page");
    page->flags &= ~(1 << PG_referenced);
    free_pages_ok(page - mem_map, 0);
    return;
  }
}

void free_pages(unsigned long addr, unsigned long order)
{
  unsigned long map_nr = MAP_NR(addr);

  if (map_nr < max_mapnr) {
    mem_map_t * map = mem_map + map_nr;
    if (PageReserved(map))
      return;
    if (atomic_dec_and_test(&map->count)) {
      if (PageSwapCache(map))
        panic ("Freeing swap cache pages");
      map->flags &= ~(1 << PG_referenced);
      free_pages_ok(map_nr, order);
      return;
    }
  }
}
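The "Hint: -mask = 1+~mask" comment is the key to free_pages_ok(): with mask = (~0UL) << order, the two's-complement negation -mask equals 1UL << order, so map_nr ^ -mask flips exactly the order bit of the block index and yields the buddy block, while nr_free_pages -= mask in fact adds 1 << order. A minimal user-space check of these identities (a hypothetical demo, not part of the kernel source):

#include <assert.h>
#include <stdio.h>

int main(void)
{
  unsigned long order;
  unsigned long map_nr = 40; /* a sample page index; the kernel masks it
                              * to the block start before using it */

  for (order = 0; order < 6; order++) {
    unsigned long mask = (~0UL) << order;

    /* The hint: two's-complement negation, so -mask == 1 << order.
     * Hence nr_free_pages -= mask adds the block size. */
    assert(-mask == 1 + ~mask);
    assert(-mask == (1UL << order));

    /* XOR with the block size toggles the order bit: the buddy. */
    printf("order %lu: buddy of %lu is %lu\n",
           order, map_nr & mask, (map_nr & mask) ^ -mask);
  }
  return 0;
}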
/* Some ugly macros to speed up __get_free_pages().. */
#define MARK_USED(index, order, area) \
  change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
#define RMQUEUE(order, gfp_mask) \
do { \
  struct free_area_struct * area = free_area+order; \
  unsigned long new_order = order; \
  do { \
    struct page *prev = memory_head(area), \
                *ret = prev->next; \
    while (memory_head(area) != ret) { \
      if (!(gfp_mask & __GFP_DMA) || CAN_DMA(ret)) { \
        unsigned long map_nr; \
        (prev->next = ret->next)->prev = prev; \
        map_nr = ret - mem_map; \
        MARK_USED(map_nr, new_order, area); \
        nr_free_pages -= 1 << order; \
        EXPAND(ret, map_nr, order, new_order, area); \
        spin_unlock_irqrestore(&page_alloc_lock, flags);\
        return ADDRESS(map_nr); \
      } \
      prev = ret; \
      ret = ret->next; \
    } \
    new_order++; area++; \
  } while (new_order < NR_MEM_LISTS); \
} while (0)

#define EXPAND(map,index,low,high,area) \
do { \
  unsigned long size = 1 << high; \
  while (high > low) { \
    area--; high--; size >>= 1; \
    add_mem_queue(area, map); \
    MARK_USED(index, high, area); \
    index += size; \
    map += size; \
  } \
  atomic_set(&map->count, 1); \
} while (0)

int low_on_memory = 0;

unsigned long __get_free_pages(int gfp_mask,
  unsigned long order)
{
  unsigned long flags;

  if (order >= NR_MEM_LISTS)
    goto nopage;

#ifdef ATOMIC_MEMORY_DEBUGGING
  if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
    static int count = 0;
    if (++count < 5) {
      printk("gfp called nonatomically from interrupt "
             "%p\n", __builtin_return_address(0));
    }
    goto nopage;
  }
#endif

  /* If this is a recursive call, we'd better do our best
   * to just allocate things without further thought. */
  if (!(current->flags & PF_MEMALLOC)) {
    int freed;

    if (nr_free_pages > freepages.min) {
      if (!low_on_memory)
        goto ok_to_allocate;
      if (nr_free_pages >= freepages.high) {
        low_on_memory = 0;
        goto ok_to_allocate;
      }
    }

    low_on_memory = 1;
    current->flags |= PF_MEMALLOC;
    freed = try_to_free_pages(gfp_mask);
    current->flags &= ~PF_MEMALLOC;

    if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
      goto nopage;
  }
ok_to_allocate:
  spin_lock_irqsave(&page_alloc_lock, flags);
  RMQUEUE(order, gfp_mask);
  spin_unlock_irqrestore(&page_alloc_lock, flags);

  /* If we can schedule, do so, and make sure to yield.
   * We may be a real-time process, and if kswapd is
   * waiting for us we need to allow it to run a bit. */
  if (gfp_mask & __GFP_WAIT) {
    current->policy |= SCHED_YIELD;
    schedule();
  }

nopage:
  return 0;
}
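EXPAND() is where a block found by RMQUEUE() at a higher order is split down to the requested size: on each pass the lower half goes back onto the next-smaller free list and splitting continues in the upper half, until the remaining piece matches the requested order. A standalone sketch of just that splitting arithmetic (illustrative only; the real macro also updates the bitmaps and list heads):

#include <stdio.h>

/* Split a free block of order 'high' starting at page index
 * 'index' to satisfy a request of order 'low', printing the
 * buddy that is returned to each lower free list. */
static unsigned long expand(unsigned long index,
                            unsigned long low, unsigned long high)
{
  unsigned long size = 1UL << high;

  while (high > low) {
    high--; size >>= 1;
    printf("freed buddy: order %lu, pages %lu-%lu\n",
           high, index, index + size - 1);
    index += size;   /* keep splitting the upper half */
  }
  return index;      /* pages [index, index + (1<<low)) are allocated */
}

int main(void)
{
  /* An order-3 block at page 0 serving an order-0 request
   * leaves free buddies of order 2, 1 and 0 behind. */
  unsigned long got = expand(0, 0, 3);
  printf("allocated page %lu\n", got);
  return 0;
}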
/* Show free area list (used inside shift_scroll-lock
 * stuff) We also calculate the percentage
 * fragmentation. We do this by counting the memory on
 * each free list with the exception of the first item on
 * the list. */
void show_free_areas(void)
{
  unsigned long order, flags;
  unsigned long total = 0;

  printk("Free pages: %6dkB\n ( ",
         nr_free_pages<<(PAGE_SHIFT-10));
  printk("Free: %d (%d %d %d)\n",
         nr_free_pages,
         freepages.min,
         freepages.low,
         freepages.high);
  spin_lock_irqsave(&page_alloc_lock, flags);
  for (order=0 ; order < NR_MEM_LISTS; order++) {
    struct page * tmp;
    unsigned long nr = 0;
    for (tmp = free_area[order].next ;
         tmp != memory_head(free_area+order) ;
         tmp = tmp->next) {
      nr ++;
    }
    total += nr * ((PAGE_SIZE>>10) << order);
    printk("%lu*%lukB ", nr,
           (unsigned long)((PAGE_SIZE>>10) << order));
  }
  spin_unlock_irqrestore(&page_alloc_lock, flags);
  printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
  show_swap_cache_info();
#endif
}

#define LONG_ALIGN(x) \
  (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/* set up the free-area data structures:
 *  - mark all pages reserved
 *  - mark all memory queues empty
 *  - clear the memory bitmaps */
unsigned long __init free_area_init(
  unsigned long start_mem, unsigned long end_mem)
{
  mem_map_t * p;
  unsigned long mask = PAGE_MASK;
  unsigned long i;

  /* Select nr of pages we try to keep free for important
   * stuff with a minimum of 10 pages and a maximum of
   * 256 pages, so that we don't waste too much memory on
   * large systems. This is fairly arbitrary, but based
   * on some behaviour analysis. */
  i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
  if (i < 10)
    i = 10;
  if (i > 256)
    i = 256;
  freepages.min = i;
  freepages.low = i * 2;
  freepages.high = i * 3;
  mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
  p = mem_map + MAP_NR(end_mem);
  start_mem = LONG_ALIGN((unsigned long) p);
  memset(mem_map, 0,
         start_mem - (unsigned long) mem_map);
  do {
    --p;
    atomic_set(&p->count, 0);
    p->flags = (1 << PG_DMA) | (1 << PG_reserved);
  } while (p > mem_map);

  for (i = 0 ; i < NR_MEM_LISTS ; i++) {
    unsigned long bitmap_size;
    init_mem_queue(free_area+i);
    mask += mask;
    end_mem = (end_mem + ~mask) & mask;
    bitmap_size =
      (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
    bitmap_size = (bitmap_size + 7) >> 3;
    bitmap_size = LONG_ALIGN(bitmap_size);
    free_area[i].map = (unsigned int *) start_mem;
    memset((void *) start_mem, 0, bitmap_size);
    start_mem += bitmap_size;
  }
  return start_mem;
}
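free_area_init() sizes the freepages watermarks from the amount of managed memory: roughly one page kept free per 128 pages (the >> (PAGE_SHIFT+7)), clamped to the range [10, 256], with low and high set to two and three times that minimum. A quick arithmetic check of those thresholds for a few memory sizes (assuming 4 KB pages, as on i386; standalone demo, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12  /* assume 4 KB pages */

int main(void)
{
  unsigned long mb;

  for (mb = 4; mb <= 512; mb *= 4) {
    unsigned long bytes = mb << 20;
    /* Same formula as free_area_init(): pages / 128, clamped. */
    unsigned long i = bytes >> (PAGE_SHIFT + 7);
    if (i < 10) i = 10;
    if (i > 256) i = 256;
    printf("%4lu MB: min=%3lu low=%3lu high=%3lu pages\n",
           mb, i, i * 2, i * 3);
  }
  return 0;
}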
/* Primitive swap readahead code. We simply read an
 * aligned block of (1 << page_cluster) entries in the
 * swap area. This method is chosen because it doesn't
 * cost us any seek time. We also make sure to queue the
 * 'original' request together with the readahead ones...
 */
void swapin_readahead(unsigned long entry)
{
  int i;
  struct page *new_page;
  unsigned long offset = SWP_OFFSET(entry);
  struct swap_info_struct *swapdev =
    SWP_TYPE(entry) + swap_info;

  offset = (offset >> page_cluster) << page_cluster;

  i = 1 << page_cluster;
  do {
    /* Don't read-ahead past the end of the swap area */
    if (offset >= swapdev->max)
      break;
    /* Don't block on I/O for read-ahead */
    if (atomic_read(&nr_async_pages) >=
        pager_daemon.swap_cluster)
      break;
    /* Don't read in bad or busy pages */
    if (!swapdev->swap_map[offset])
      break;
    if (swapdev->swap_map[offset] == SWAP_MAP_BAD)
      break;
    if (test_bit(offset, swapdev->swap_lockmap))
      break;

    /* Ok, do the async read-ahead now */
    new_page =
      read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry),
                                      offset), 0);
    if (new_page != NULL)
      __free_page(new_page);
    offset++;
  } while (--i);
  return;
}

/* The tests may look silly, but it essentially makes
 * sure that no other process did a swap-in on us just as
 * we were waiting.
 *
 * Also, don't bother to add to the swap cache if this
 * page-in was due to a write access. */
void swap_in(struct task_struct * tsk,
  struct vm_area_struct * vma, pte_t * page_table,
  unsigned long entry, int write_access)
{
  unsigned long page;
  struct page *page_map = lookup_swap_cache(entry);

  if (!page_map) {
    swapin_readahead(entry);
    page_map = read_swap_cache(entry);
  }
  if (pte_val(*page_table) != entry) {
    if (page_map)
      free_page_and_swap_cache(page_address(page_map));
    return;
  }
  if (!page_map) {
    set_pte(page_table, BAD_PAGE);
    swap_free(entry);
    oom(tsk);
    return;
  }

  page = page_address(page_map);
  vma->vm_mm->rss++;
  tsk->min_flt++;
  swap_free(entry);

  if (!write_access || is_page_shared(page_map)) {
    set_pte(page_table, mk_pte(page, vma->vm_page_prot));
    return;
  }

  /* The page is unshared, and we want write access. In
   * this case, it is safe to tear down the swap cache
   * and give the page over entirely to this process. */
  if (PageSwapCache(page_map))
    delete_from_swap_cache(page_map);
  set_pte(page_table,
          pte_mkwrite(pte_mkdirty(mk_pte(page,
                      vma->vm_page_prot))));
  return;
}
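swapin_readahead() rounds the faulting swap offset down to a multiple of 1 << page_cluster and then tries to read that whole aligned window, so the original entry is always inside the block it queues. A small demonstration of that alignment arithmetic (the page_cluster value here is assumed purely for illustration):

#include <stdio.h>

int main(void)
{
  unsigned long page_cluster = 4;   /* assume a 16-entry cluster */
  unsigned long faults[] = { 3, 16, 29, 100 };
  unsigned long n;

  for (n = 0; n < sizeof(faults)/sizeof(faults[0]); n++) {
    unsigned long offset = faults[n];

    /* Same rounding as swapin_readahead(): clear the low bits. */
    unsigned long start = (offset >> page_cluster) << page_cluster;
    unsigned long end = start + (1UL << page_cluster) - 1;

    printf("fault at offset %3lu -> read-ahead window %3lu..%3lu\n",
           offset, start, end);
  }
  return 0;
}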


