From c0a499c2c42992cff097b38be29d2ba60d2fd99a Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 6 Dec 2006 20:31:39 -0800 Subject: [PATCH] __unmap_hugepage_range(): add comment Signed-off-by: Ken Chen Cc: David Gibson Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a088f593a807..f7355bf2f285 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -365,6 +365,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + /* + * A page gathering list, protected by per file i_mmap_lock. The + * lock is used to avoid list corruption from multiple unmapping + * of the same page since we are using page->lru. + */ LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); -- cgit v1.2.3 From 39dde65c9940c97fcd178a3d2b1c57ed8b7b68aa Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 6 Dec 2006 20:32:03 -0800 Subject: [PATCH] shared page table for hugetlb page Following up with the work on shared page table done by Dave McCracken. This set of patches targets shared page tables for hugetlb memory only. The shared page table is particularly useful in the situation of a large number of independent processes sharing large shared memory segments. In the normal page case, the amount of memory saved from each process's page table is quite significant. For hugetlb, the saving on page table memory is not the primary objective (as hugetlb itself already cuts down page table overhead significantly); instead, the purpose of using shared page table on hugetlb is to allow faster TLB refill and smaller cache pollution upon TLB miss. With PT sharing, pte entries are shared among hundreds of processes, so the cache consumption used by all the page tables is smaller and, in return, the application gets a much higher cache hit ratio. 
One other effect is that cache hit ratio with hardware page walker hitting on pte in cache will be higher and this helps to reduce tlb miss latency. These two effects contribute to higher application performance. Signed-off-by: Ken Chen Acked-by: Hugh Dickins Cc: Dave McCracken Cc: William Lee Irwin III Cc: "Luck, Tony" Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Adam Litke Cc: Paul Mundt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f7355bf2f285..9244971b6791 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -386,6 +386,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (!ptep) continue; + if (huge_pmd_unshare(mm, &address, ptep)) + continue; + pte = huge_ptep_get_and_clear(mm, address, ptep); if (pte_none(pte)) continue; @@ -658,11 +661,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); for (; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + if (huge_pmd_unshare(mm, &address, ptep)) + continue; if (!pte_none(*ptep)) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); @@ -671,6 +677,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); flush_tlb_range(vma, start, end); } -- cgit v1.2.3 From cace673d376d97b0c66ffa0a49b8d588a696d5d2 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 6 Dec 2006 20:32:07 -0800 Subject: [PATCH] htlb forget rss with pt sharing Imprecise RSS accounting is an irritating ill effect with pt sharing. 
After consulting with several VM experts, I have tried various methods to solve that problem: (1) iterate through all mm_structs that share the PT and increment count; (2) keep RSS count in page table structure and then sum them up at reporting time. None of the above methods yields a satisfactory implementation. Since process RSS accounting is purely informational, I propose we don't count them at all for hugetlb pages. rlimit has such a field, though there is absolutely no enforcement on limiting that resource. One other method is to account all RSS at hugetlb mmap time regardless of whether they are faulted or not. I opt for the simplicity of no accounting at all. Hugetlb pages are special: they are reserved up front in a global reservation pool and are not reclaimable. From a physical memory resource point of view, they are already consumed regardless of whether there are users using them. If the concern is that RSS can be used to control resource allocation, we can already specify a hugetlb fs size limit, and the sysadmin can enforce that at mount time. Combined with the two points mentioned above, I fail to see anything that would be affected by this patch. Signed-off-by: Ken Chen Acked-by: Hugh Dickins Cc: Dave McCracken Cc: William Lee Irwin III Cc: "Luck, Tony" Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Adam Litke Cc: Paul Mundt Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9244971b6791..2911a364481e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); - add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -377,10 +376,6 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(end & ~HPAGE_MASK); spin_lock(&mm->page_table_lock); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (!ptep) @@ -395,9 +390,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, page = pte_page(pte); list_add(&page->lru, &page_list); - add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { @@ -523,7 +516,6 @@ retry: if (!pte_none(*ptep)) goto backout; - add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); -- cgit v1.2.3 From 33f2ef89f8e181486b63fdbdc97c6afa6ca9f34b Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 6 Dec 2006 20:33:32 -0800 Subject: [PATCH] mm: make compound page destructor handling explicit Currently we use the lru head link of the second page of a compound page to hold its destructor. This was ok when it was purely an internal implementation detail. However, hugetlbfs overrides this destructor, violating the layering. 
Abstract this out as explicit calls, also introduce a type for the callback function allowing them to be type checked. For each callback we pre-declare the function, causing a type error on definition rather than on use elsewhere. [akpm@osdl.org: cleanups] Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2911a364481e..0ccc7f230252 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -109,7 +109,7 @@ static int alloc_fresh_huge_page(void) if (nid == MAX_NUMNODES) nid = first_node(node_online_map); if (page) { - page[1].lru.next = (void *)free_huge_page; /* dtor */ + set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; -- cgit v1.2.3