summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/bounce.c 4
-rw-r--r-- mm/filemap_xip.c 6
-rw-r--r-- mm/fremap.c 2
-rw-r--r-- mm/memory.c 17
-rw-r--r-- mm/memory_hotplug.c 6
-rw-r--r-- mm/mempolicy.c 4
-rw-r--r-- mm/mincore.c 183
-rw-r--r-- mm/mmap.c 7
-rw-r--r-- mm/mremap.c 1
-rw-r--r-- mm/oom_kill.c 19
-rw-r--r-- mm/page-writeback.c 118
-rw-r--r-- mm/page_alloc.c 35
-rw-r--r-- mm/rmap.c 36
-rw-r--r-- mm/shmem.c 7
-rw-r--r-- mm/slab.c 7
-rw-r--r-- mm/slob.c 11
-rw-r--r-- mm/swapfile.c 8
-rw-r--r-- mm/truncate.c 49
-rw-r--r-- mm/vmscan.c 39
19 files changed, 311 insertions, 248 deletions
diff --git a/mm/bounce.c b/mm/bounce.c
index e4b62d2a4024..643efbe82402 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -237,6 +237,8 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
if (!bio)
return;
+ blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
/*
* at least one page was bounced, fill in possible non-highmem
* pages
@@ -291,8 +293,6 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
pool = isa_page_pool;
}
- blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
/*
* slow path
*/
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8d667617f558..9dd9fbb75139 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,13 +183,13 @@ __xip_unmap (struct address_space * mapping,
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- page = ZERO_PAGE(address);
+ page = ZERO_PAGE(0);
pte = page_check_address(page, mm, address, &ptl);
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
@@ -246,7 +246,7 @@ xip_file_nopage(struct vm_area_struct * area,
__xip_unmap(mapping, pgoff);
} else {
/* not shared and writable, use ZERO_PAGE() */
- page = ZERO_PAGE(address);
+ page = ZERO_PAGE(0);
}
out:
diff --git a/mm/fremap.c b/mm/fremap.c
index b77a002c3352..4e3f53dd5fd4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
}
} else {
diff --git a/mm/memory.c b/mm/memory.c
index c00bac66ce9f..ef09f0acb1d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
mark_page_accessed(page);
file_rss--;
}
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
tlb_remove_page(tlb, page);
continue;
}
@@ -1091,7 +1091,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages) {
pages[i] = page;
- flush_anon_page(page, start);
+ flush_anon_page(vma, page, start);
flush_dcache_page(page);
}
if (vmas)
@@ -1586,7 +1586,7 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, vma);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
@@ -2606,8 +2606,15 @@ static int __init gate_vma_init(void)
gate_vma.vm_mm = NULL;
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
- gate_vma.vm_page_prot = PAGE_READONLY;
- gate_vma.vm_flags = 0;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+ /*
+ * Make sure the vDSO gets into every core dump.
+ * Dumping its contents makes post-mortem fully interpretable later
+ * without matching up the same kernel and hardware config to see
+ * what PC values meant.
+ */
+ gate_vma.vm_flags |= VM_ALWAYSDUMP;
return 0;
}
__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c055a090f4d..84279127fcd3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,11 +67,13 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
zone_type = zone - pgdat->node_zones;
if (!populated_zone(zone)) {
int ret = 0;
- ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
+ ret = init_currently_empty_zone(zone, phys_start_pfn,
+ nr_pages, MEMMAP_HOTPLUG);
if (ret < 0)
return ret;
}
- memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+ memmap_init_zone(nr_pages, nid, zone_type,
+ phys_start_pfn, MEMMAP_HOTPLUG);
return 0;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index da9463946556..c2aec0e1090d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -884,6 +884,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
+#ifdef CONFIG_CPUSETS
+ /* Restrict the nodes to the allowed nodes in the cpuset */
+ nodes_and(nodes, nodes, current->mems_allowed);
+#endif
return do_mbind(start, len, mode, &nodes, flags);
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c9..8aca6f7167bb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,7 +1,7 @@
/*
* linux/mm/mincore.c
*
- * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 1994-2006 Linus Torvalds
*/
/*
@@ -38,46 +38,51 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
return present;
}
-static long mincore_vma(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, unsigned char __user * vec)
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ */
+static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
{
- long error, i, remaining;
- unsigned char * tmp;
-
- error = -ENOMEM;
- if (!vma->vm_file)
- return error;
-
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ unsigned long i, nr, pgoff;
+ struct vm_area_struct *vma = find_vma(current->mm, addr);
- error = -EAGAIN;
- tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
- if (!tmp)
- return error;
+ /*
+ * find_vma() didn't find anything above us, or we're
+ * in an unmapped hole in the address space: ENOMEM.
+ */
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
- /* (end - start) is # of pages, and also # of bytes in "vec */
- remaining = (end - start),
+ /*
+ * Ok, got it. But check whether it's a segment we support
+ * mincore() on. Right now, we don't do any anonymous mappings.
+ *
+ * FIXME: This is just stupid. And returning ENOMEM is
+ * stupid too. We should just look at the page tables. But
+ * this is what we've traditionally done, so we'll just
+ * continue doing it.
+ */
+ if (!vma->vm_file)
+ return -ENOMEM;
- error = 0;
- for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
- int j = 0;
- long thispiece = (remaining < PAGE_SIZE) ?
- remaining : PAGE_SIZE;
+ /*
+ * Calculate how many pages there are left in the vma, and
+ * what the pgoff is for our address.
+ */
+ nr = (vma->vm_end - addr) >> PAGE_SHIFT;
+ if (nr > pages)
+ nr = pages;
- while (j < thispiece)
- tmp[j++] = mincore_page(vma, start++);
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
- if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
- error = -EFAULT;
- break;
- }
- }
+ /* And then we just fill the sucker in.. */
+ for (i = 0 ; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma, pgoff);
- free_page((unsigned long) tmp);
- return error;
+ return nr;
}
/*
@@ -107,82 +112,50 @@ static long mincore_vma(struct vm_area_struct * vma,
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec)
{
- int index = 0;
- unsigned long end, limit;
- struct vm_area_struct * vma;
- size_t max;
- int unmapped_error = 0;
- long error;
-
- /* check the arguments */
- if (start & ~PAGE_CACHE_MASK)
- goto einval;
-
- limit = TASK_SIZE;
- if (start >= limit)
- goto enomem;
-
- if (!len)
- return 0;
-
- max = limit - start;
- len = PAGE_CACHE_ALIGN(len);
- if (len > max || !len)
- goto enomem;
+ long retval;
+ unsigned long pages;
+ unsigned char *tmp;
- end = start + len;
+ /* Check the start address: needs to be page-aligned.. */
+ if (start & ~PAGE_CACHE_MASK)
+ return -EINVAL;
- /* check the output buffer whilst holding the lock */
- error = -EFAULT;
- down_read(&current->mm->mmap_sem);
+ /* ..and we need to be passed a valid user-space range */
+ if (!access_ok(VERIFY_READ, (void __user *) start, len))
+ return -ENOMEM;
- if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
- goto out;
+ /* This also avoids any overflows on PAGE_CACHE_ALIGN */
+ pages = len >> PAGE_SHIFT;
+ pages += (len & ~PAGE_MASK) != 0;
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- */
- error = 0;
-
- vma = find_vma(current->mm, start);
- while (vma) {
- /* Here start < vma->vm_end. */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- }
+ if (!access_ok(VERIFY_WRITE, vec, pages))
+ return -EFAULT;
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = mincore_vma(vma, start, end,
- &vec[index]);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
+ tmp = (void *) __get_free_page(GFP_USER);
+ if (!tmp)
+ return -EAGAIN;
+
+ retval = 0;
+ while (pages) {
+ /*
+ * Do at most PAGE_SIZE entries per iteration, due to
+ * the temporary buffer size.
+ */
+ down_read(&current->mm->mmap_sem);
+ retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+ up_read(&current->mm->mmap_sem);
+
+ if (retval <= 0)
+ break;
+ if (copy_to_user(vec, tmp, retval)) {
+ retval = -EFAULT;
+ break;
}
-
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
- if (error)
- goto out;
- index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
- start = vma->vm_end;
- vma = vma->vm_next;
+ pages -= retval;
+ vec += retval;
+ start += retval << PAGE_SHIFT;
+ retval = 0;
}
-
- /* we found a hole in the area queried if we arrive here */
- error = -ENOMEM;
-
-out:
- up_read(&current->mm->mmap_sem);
- return error;
-
-einval:
- return -EINVAL;
-enomem:
- return -ENOMEM;
+ free_page((unsigned long) tmp);
+ return retval;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 9717337293c3..cc3a20819457 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1477,6 +1477,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
@@ -1496,6 +1497,12 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
return -ENOMEM;
}
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
diff --git a/mm/mremap.c b/mm/mremap.c
index 9c769fa29f32..5d4bd4f95b8e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -105,7 +105,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (pte_none(*old_pte))
continue;
pte = ptep_clear_flush(vma, old_addr, old_pte);
- /* ZERO_PAGE can be dependant on virtual addr */
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
set_pte_at(mm, new_addr, new_pte, pte);
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 64cf3c214634..b278b8d60eee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -61,12 +61,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_SWAPOFF)
- return ULONG_MAX;
-
- /*
* The memory size of the process is the basis for the badness.
*/
points = mm->total_vm;
@@ -77,6 +71,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
task_unlock(p);
/*
+ * swapoff can easily use up all memory, so kill those first.
+ */
+ if (p->flags & PF_SWAPOFF)
+ return ULONG_MAX;
+
+ /*
* Processes which fork a lot of child processes are likely
* a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
@@ -174,7 +174,12 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
struct zone **z;
- nodemask_t nodes = node_online_map;
+ nodemask_t nodes;
+ int node;
+ /* node has memory ? */
+ for_each_online_node(node)
+ if (NODE_DATA(node)->node_present_pages)
+ node_set(node, nodes);
for (z = zonelist->zones; *z; z++)
if (cpuset_zone_allowed_softwall(*z, gfp_mask))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 237107c1b084..be0efbde4994 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -133,11 +133,9 @@ get_dirty_limits(long *pbackground, long *pdirty,
#ifdef CONFIG_HIGHMEM
/*
- * If this mapping can only allocate from low memory,
- * we exclude high memory from our count.
+ * We always exclude high memory from our count.
*/
- if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
- available_memory -= totalhigh_pages;
+ available_memory -= totalhigh_pages;
#endif
@@ -526,28 +524,25 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
};
/*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages).
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
*/
void __init page_writeback_init(void)
{
- long buffer_pages = nr_free_buffer_pages();
- long correction;
-
- correction = (100 * 4 * buffer_pages) / vm_total_pages;
-
- if (correction < 100) {
- dirty_background_ratio *= correction;
- dirty_background_ratio /= 100;
- vm_dirty_ratio *= correction;
- vm_dirty_ratio /= 100;
-
- if (dirty_background_ratio <= 0)
- dirty_background_ratio = 1;
- if (vm_dirty_ratio <= 0)
- vm_dirty_ratio = 1;
- }
mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
@@ -845,38 +840,6 @@ int set_page_dirty_lock(struct page *page)
EXPORT_SYMBOL(set_page_dirty_lock);
/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
- unsigned long flags;
-
- if (!mapping)
- return TestClearPageDirty(page);
-
- write_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- /*
- * We can continue to use `mapping' here because the
- * page is locked, which pins the address_space
- */
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
- return 1;
- }
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
@@ -894,17 +857,46 @@ int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (!mapping)
- return TestClearPageDirty(page);
-
- if (TestClearPageDirty(page)) {
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ /*
+ * Yes, Virginia, this is indeed insane.
+ *
+ * We use this sequence to make sure that
+ * (a) we account for dirty stats properly
+ * (b) we tell the low-level filesystem to
+ * mark the whole page dirty if it was
+ * dirty in a pagetable. Only to then
+ * (c) clean the page again and return 1 to
+ * cause the writeback.
+ *
+ * This way we avoid all nasty races with the
+ * dirty bit in multiple places and clearing
+ * them concurrently from different threads.
+ *
+ * Note! Normally the "set_page_dirty(page)"
+ * has no effect on the actual dirty bit - since
+ * that will already usually be set. But we
+ * need the side effects, and it can help us
+ * avoid races.
+ *
+ * We basically use the page "master dirty bit"
+ * as a serialization point for all the different
+ * threads doing their things.
+ *
+ * FIXME! We still have a race here: if somebody
+ * adds the page back to the page tables in
+ * between the "page_mkclean()" and the "TestClearPageDirty()",
+ * we might have it mapped without the dirty bit set.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
+ return 1;
}
- return 1;
+ return 0;
}
- return 0;
+ return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8c1a116875bc..2c606cc922a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -711,6 +711,9 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;
+ if (!populated_zone(zone))
+ continue;
+
pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -986,8 +989,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
/* free_pages my go negative - that's OK */
- unsigned long min = mark;
- long free_pages = z->free_pages - (1 << order) + 1;
+ long min = mark, free_pages = z->free_pages - (1 << order) + 1;
int o;
if (alloc_flags & ALLOC_HIGH)
@@ -1953,17 +1955,24 @@ static inline unsigned long wait_table_bits(unsigned long size)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
+ unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
+ /*
+ * There can be holes in boot-time mem_map[]s
+ * handed to this function. They do not
+ * exist on hotplugged memory.
+ */
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ }
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
@@ -1990,7 +1999,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn))
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
static int __cpuinit zone_batchsize(struct zone *zone)
@@ -2236,7 +2245,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
__meminit int init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size)
+ unsigned long size,
+ enum memmap_context context)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
@@ -2680,7 +2690,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
if (!size)
continue;
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn,
+ size, MEMMAP_EARLY);
BUG_ON(ret);
zone_start_pfn += size;
}
@@ -3321,6 +3332,10 @@ void *__init alloc_large_system_hash(const char *tablename,
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
diff --git a/mm/rmap.c b/mm/rmap.c
index d8a842a586db..669acb22b572 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/module.h>
+#include <linux/kallsyms.h>
#include <asm/tlbflush.h>
@@ -432,7 +433,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
- pte_t *pte, entry;
+ pte_t *pte;
spinlock_t *ptl;
int ret = 0;
@@ -444,17 +445,18 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
if (!pte)
goto out;
- if (!pte_dirty(*pte) && !pte_write(*pte))
- goto unlock;
+ if (pte_dirty(*pte) || pte_write(*pte)) {
+ pte_t entry;
- entry = ptep_get_and_clear(mm, address, pte);
- entry = pte_mkclean(entry);
- entry = pte_wrprotect(entry);
- ptep_establish(vma, address, pte, entry);
- lazy_mmu_prot_update(entry);
- ret = 1;
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, address, pte);
+ entry = pte_wrprotect(entry);
+ entry = pte_mkclean(entry);
+ set_pte_at(mm, address, pte, entry);
+ lazy_mmu_prot_update(entry);
+ ret = 1;
+ }
-unlock:
pte_unmap_unlock(pte, ptl);
out:
return ret;
@@ -489,6 +491,8 @@ int page_mkclean(struct page *page)
if (mapping)
ret = page_mkclean_file(mapping, page);
}
+ if (page_test_and_clear_dirty(page))
+ ret = 1;
return ret;
}
@@ -567,14 +571,20 @@ void page_add_file_rmap(struct page *page)
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
if (unlikely(page_mapcount(page) < 0)) {
printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
+ printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
printk (KERN_EMERG " page->flags = %lx\n", page->flags);
printk (KERN_EMERG " page->count = %x\n", page_count(page));
printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
+ print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
+ if (vma->vm_ops)
+ print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+ if (vma->vm_file && vma->vm_file->f_op)
+ print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
BUG();
}
@@ -679,7 +689,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
dec_mm_counter(mm, file_rss);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
out_unmap:
@@ -769,7 +779,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (pte_dirty(pteval))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, vma);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4bb28d218eb5..70da7a0981bf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -515,7 +515,12 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
size = SHMEM_NR_DIRECT;
nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
}
- if (!topdir)
+
+ /*
+ * If there are no indirect blocks or we are punching a hole
+ * below indirect blocks, nothing to be done.
+ */
+ if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
goto done2;
BUG_ON(limit <= SHMEM_NR_DIRECT);
diff --git a/mm/slab.c b/mm/slab.c
index 909975f6e090..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3281,7 +3281,7 @@ retry:
flags | GFP_THISNODE, nid);
}
- if (!obj) {
+ if (!obj && !(flags & __GFP_NO_GROW)) {
/*
* This allocation will be performed within the constraints
* of the current cpuset / memory policy requirements.
@@ -3310,7 +3310,7 @@ retry:
*/
goto retry;
} else {
- kmem_freepages(cache, obj);
+ /* cache_grow already freed obj */
obj = NULL;
}
}
@@ -3553,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
*
* Currently only used for dentry validation.
*/
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
{
unsigned long addr = (unsigned long)ptr;
unsigned long min_addr = PAGE_OFFSET;
@@ -3587,6 +3587,7 @@ out:
* @cachep: The cache to allocate from.
* @flags: See kmalloc().
* @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
*
* Identical to kmem_cache_alloc but it will allocate memory on the given
* node, which can improve the performance for cpu bound structures.
diff --git a/mm/slob.c b/mm/slob.c
index 2e9236e10ed1..5adc29cb58dd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -60,6 +60,8 @@ static DEFINE_SPINLOCK(slob_lock);
static DEFINE_SPINLOCK(block_lock);
static void slob_free(void *b, int size);
+static void slob_timer_cbk(void);
+
static void *slob_alloc(size_t size, gfp_t gfp, int align)
{
@@ -326,7 +328,7 @@ const char *kmem_cache_name(struct kmem_cache *c)
EXPORT_SYMBOL(kmem_cache_name);
static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))kmem_cache_init, 0, 0);
+ (void (*)(unsigned long))slob_timer_cbk, 0, 0);
int kmem_cache_shrink(struct kmem_cache *d)
{
@@ -339,7 +341,12 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
return 0;
}
-void kmem_cache_init(void)
+void __init kmem_cache_init(void)
+{
+ slob_timer_cbk();
+}
+
+static void slob_timer_cbk(void)
{
void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b9fc0e5de6d5..a2d9bb4e80df 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -434,7 +434,7 @@ void free_swap_and_cache(swp_entry_t entry)
*
* This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device, sector_t offset)
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
struct block_device *bdev = NULL;
int i;
@@ -450,6 +450,9 @@ int swap_type_of(dev_t device, sector_t offset)
continue;
if (!bdev) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
spin_unlock(&swap_lock);
return i;
}
@@ -459,6 +462,9 @@ int swap_type_of(dev_t device, sector_t offset)
se = list_entry(sis->extent_list.next,
struct swap_extent, list);
if (se->start_block == offset) {
+ if (bdev_p)
+ *bdev_p = sis->bdev;
+
spin_unlock(&swap_lock);
bdput(bdev);
return i;
diff --git a/mm/truncate.c b/mm/truncate.c
index 9bfb8e853860..5df947de7654 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -52,6 +52,33 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
}
/*
+ * This cancels just the dirty bit on the kernel page itself, it
+ * does NOT actually remove dirty bits on any mmap's that may be
+ * around. It also leaves the page tagged dirty, so any sync
+ * activity will still find it on the dirty lists, and in particular,
+ * clear_page_dirty_for_io() will still look at the dirty bits in
+ * the VM.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page, unsigned int account_size)
+{
+ if (TestClearPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ if (account_size)
+ task_io_account_cancelled_write(account_size);
+ }
+ }
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_nopage().
@@ -67,11 +94,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return;
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
+
if (PagePrivate(page))
do_invalidatepage(page, 0);
- if (test_clear_page_dirty(page))
- task_io_account_cancelled_write(PAGE_CACHE_SIZE);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
@@ -321,6 +348,15 @@ failed:
return 0;
}
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{
+ if (!PageDirty(page))
+ return 0;
+ if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ return 0;
+ return mapping->a_ops->launder_page(page);
+}
+
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
* @mapping: the address_space
@@ -350,7 +386,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index;
- int was_dirty;
lock_page(page);
if (page->mapping != mapping) {
@@ -386,18 +421,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
PAGE_CACHE_SIZE, 0);
}
}
- was_dirty = test_clear_page_dirty(page);
- if (!invalidate_complete_page2(mapping, page)) {
- if (was_dirty)
- set_page_dirty(page);
+ ret = do_launder_page(mapping, page);
+ if (ret == 0 && !invalidate_complete_page2(mapping, page))
ret = -EIO;
- }
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
- WARN_ON_ONCE(ret);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e9813b06c7a3..7430df68cb64 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -692,7 +692,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
__count_vm_events(KSWAPD_STEAL, nr_freed);
} else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
- __count_vm_events(PGACTIVATE, nr_freed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
goto done;
@@ -1369,8 +1369,8 @@ void wakeup_kswapd(struct zone *zone, int order)
*
* For pass > 3 we also try to shrink the LRU lists that contain a few pages
*/
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
- int prio, struct scan_control *sc)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+ int pass, struct scan_control *sc)
{
struct zone *zone;
unsigned long nr_to_scan, ret = 0;
@@ -1406,6 +1406,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
return ret;
}
+static unsigned long count_lru_pages(void)
+{
+ struct zone *zone;
+ unsigned long ret = 0;
+
+ for_each_zone(zone)
+ ret += zone->nr_active + zone->nr_inactive;
+ return ret;
+}
+
/*
* Try to free `nr_pages' of memory, system-wide, and return the number of
* freed pages.
@@ -1420,7 +1430,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
unsigned long ret = 0;
int pass;
struct reclaim_state reclaim_state;
- struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_swap = 0,
@@ -1431,10 +1440,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
current->reclaim_state = &reclaim_state;
- lru_pages = 0;
- for_each_zone(zone)
- lru_pages += zone->nr_active + zone->nr_inactive;
-
+ lru_pages = count_lru_pages();
nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
@@ -1461,13 +1467,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
for (pass = 0; pass < 5; pass++) {
int prio;
- /* Needed for shrinking slab caches later on */
- if (!lru_pages)
- for_each_zone(zone) {
- lru_pages += zone->nr_active;
- lru_pages += zone->nr_inactive;
- }
-
/* Force reclaiming mapped pages in the passes #3 and #4 */
if (pass > 2) {
sc.may_swap = 1;
@@ -1483,7 +1482,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
goto out;
reclaim_state.reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+ shrink_slab(sc.nr_scanned, sc.gfp_mask,
+ count_lru_pages());
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -1491,20 +1491,19 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
congestion_wait(WRITE, HZ / 10);
}
-
- lru_pages = 0;
}
/*
* If ret = 0, we could not shrink LRUs, but there may be something
* in slab caches
*/
- if (!ret)
+ if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ }
out:
current->reclaim_state = NULL;