summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig91
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c23
-rw-r--r--mm/fadvise.c4
-rw-r--r--mm/filemap.c99
-rw-r--r--mm/filemap.h94
-rw-r--r--mm/filemap_xip.c447
-rw-r--r--mm/hugetlb.c177
-rw-r--r--mm/madvise.c109
-rw-r--r--mm/memory.c63
-rw-r--r--mm/mempolicy.c110
-rw-r--r--mm/mempool.c20
-rw-r--r--mm/mmap.c57
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c7
-rw-r--r--mm/page_alloc.c481
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/pdflush.c2
-rw-r--r--mm/rmap.c25
-rw-r--r--mm/shmem.c143
-rw-r--r--mm/slab.c31
-rw-r--r--mm/sparse.c137
-rw-r--r--mm/swapfile.c55
-rw-r--r--mm/vmscan.c107
25 files changed, 1832 insertions, 458 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
new file mode 100644
index 000000000000..cd379936cac6
--- /dev/null
+++ b/mm/Kconfig
@@ -0,0 +1,91 @@
+config SELECT_MEMORY_MODEL
+ def_bool y
+ depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+
+choice
+ prompt "Memory model"
+ depends on SELECT_MEMORY_MODEL
+ default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+ default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
+ default FLATMEM_MANUAL
+
+config FLATMEM_MANUAL
+ bool "Flat Memory"
+ depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here: FLATMEM. This is normal
+ and a correct option.
+
+ Some users of more advanced features like NUMA and
+ memory hotplug may have different options here.
+ DISCONTIGMEM is an more mature, better tested system,
+ but is incompatible with memory hotplug and may suffer
+ decreased performance over SPARSEMEM. If unsure between
+ "Sparse Memory" and "Discontiguous Memory", choose
+ "Discontiguous Memory".
+
+ If unsure, choose this option (Flat Memory) over any other.
+
+config DISCONTIGMEM_MANUAL
+ bool "Discontigious Memory"
+ depends on ARCH_DISCONTIGMEM_ENABLE
+ help
+ This option provides enhanced support for discontiguous
+ memory systems, over FLATMEM. These systems have holes
+ in their physical address spaces, and this option provides
+ more efficient handling of these holes. However, the vast
+ majority of hardware has quite flat address spaces, and
+ can have degraded performance from extra overhead that
+ this option imposes.
+
+ Many NUMA configurations will have this as the only option.
+
+ If unsure, choose "Flat Memory" over this option.
+
+config SPARSEMEM_MANUAL
+ bool "Sparse Memory"
+ depends on ARCH_SPARSEMEM_ENABLE
+ help
+ This will be the only option for some systems, including
+ memory hotplug systems. This is normal.
+
+ For many other systems, this will be an alternative to
+ "Discontigious Memory". This option provides some potential
+ performance benefits, along with decreased code complexity,
+ but it is newer, and more experimental.
+
+ If unsure, choose "Discontiguous Memory" or "Flat Memory"
+ over this option.
+
+endchoice
+
+config DISCONTIGMEM
+ def_bool y
+ depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
+
+config SPARSEMEM
+ def_bool y
+ depends on SPARSEMEM_MANUAL
+
+config FLATMEM
+ def_bool y
+ depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+ def_bool y
+ depends on !SPARSEMEM
+
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory. This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+ def_bool y
+ depends on DISCONTIGMEM || NUMA
+
+config HAVE_MEMORY_PRESENT
+ def_bool y
+ depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
* dma_get_required_mask(), which uses
* it, can be an inline function */
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+#endif
+
/* return the number of _pages_ that will be allocated for the boot bitmap */
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
{
@@ -57,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
pgdat->pgdat_next = pgdat_list;
pgdat_list = pgdat;
- mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+ mapsize = ALIGN(mapsize, sizeof(long));
bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
bdata->node_boot_start = (start << PAGE_SHIFT);
bdata->node_low_pfn = end;
@@ -178,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
} else
preferred = 0;
- preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+ preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
preferred += offset;
areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
incr = align >> PAGE_SHIFT ? : 1;
@@ -219,7 +227,7 @@ found:
*/
if (align < PAGE_SIZE &&
bdata->last_offset && bdata->last_pos+1 == start) {
- offset = (bdata->last_offset+align-1) & ~(align-1);
+ offset = ALIGN(bdata->last_offset, align);
BUG_ON(offset > PAGE_SIZE);
remaining_size = PAGE_SIZE-offset;
if (size < remaining_size) {
@@ -256,6 +264,7 @@ found:
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
struct page *page;
+ unsigned long pfn;
bootmem_data_t *bdata = pgdat->bdata;
unsigned long i, count, total = 0;
unsigned long idx;
@@ -266,7 +275,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
count = 0;
/* first extant page of the node */
- page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+ pfn = bdata->node_boot_start >> PAGE_SHIFT;
idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
map = bdata->node_bootmem_map;
/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +284,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
gofast = 1;
for (i = 0; i < idx; ) {
unsigned long v = ~map[i / BITS_PER_LONG];
+
if (gofast && v == ~0UL) {
int j, order;
+ page = pfn_to_page(pfn);
count += BITS_PER_LONG;
__ClearPageReserved(page);
order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +303,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
page += BITS_PER_LONG;
} else if (v) {
unsigned long m;
+
+ page = pfn_to_page(pfn);
for (m = 1; m && i < idx; m<<=1, page++, i++) {
if (v & m) {
count++;
@@ -302,8 +315,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
}
} else {
i+=BITS_PER_LONG;
- page += BITS_PER_LONG;
}
+ pfn += BITS_PER_LONG;
}
total += count;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 57264d74b8bf..5f19e87bc5af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -43,6 +43,10 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
goto out;
}
+ if (mapping->a_ops->get_xip_page)
+ /* no bad return value, but ignore advice */
+ goto out;
+
/* Careful about overflows. Len == 0 means "as much as possible" */
endbyte = offset + len;
if (!len || endbyte < len)
diff --git a/mm/filemap.c b/mm/filemap.c
index 1d33fec7bac6..c11418dd94e8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include "filemap.h"
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
}
EXPORT_SYMBOL(remove_suid);
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied. If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
- const char __user *buf, unsigned bytes)
-{
- char *kaddr;
- int left;
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
-
- if (left != 0) {
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
- }
- return bytes - left;
-}
-
-static size_t
+size_t
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -1767,52 +1743,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
}
/*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page, KM_USER0);
- copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
- base, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- if (copied != bytes) {
- kaddr = kmap(page);
- copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
- base, bytes);
- kunmap(page);
- }
- return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t base = *basep;
-
- while (bytes) {
- int copy = min(bytes, iov->iov_len - base);
-
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- base = 0;
- }
- }
- *iovp = iov;
- *basep = base;
-}
-
-/*
* Performs necessary checks before doing a write
*
* Can adjust writing position aor amount of bytes to write.
@@ -1827,12 +1757,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos < 0))
return -EINVAL;
- if (unlikely(file->f_error)) {
- int err = file->f_error;
- file->f_error = 0;
- return err;
- }
-
if (!isblk) {
/* FIXME: this is for backwards compatibility with 2.4 */
if (file->f_flags & O_APPEND)
@@ -1927,8 +1851,11 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
* i_sem is held, which protects generic_osync_inode() from
* livelocking.
*/
- if (written >= 0 && file->f_flags & O_SYNC)
- generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+ if (err < 0)
+ written = err;
+ }
if (written == count && !is_sync_kiocb(iocb))
written = -EIOCBQUEUED;
return written;
@@ -1968,6 +1895,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
do {
unsigned long index;
unsigned long offset;
+ unsigned long maxlen;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -1982,7 +1910,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ maxlen = cur_iov->iov_len - iov_base;
+ if (maxlen > bytes)
+ maxlen = bytes;
+ fault_in_pages_readable(buf, maxlen);
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
@@ -2023,7 +1954,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
if (unlikely(nr_segs > 1)) {
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
- buf = cur_iov->iov_base + iov_base;
+ if (count)
+ buf = cur_iov->iov_base +
+ iov_base;
+ } else {
+ iov_base += status;
}
}
}
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..13793ba0ce17
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
+/*
+ * linux/mm/filemap.h
+ *
+ * Copyright (C) 1994-1999 Linus Torvalds
+ */
+
+#ifndef __FILEMAP_H
+#define __FILEMAP_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uio.h>
+#include <linux/config.h>
+#include <asm/uaccess.h>
+
+size_t
+__filemap_copy_from_user_iovec(char *vaddr,
+ const struct iovec *iov,
+ size_t base,
+ size_t bytes);
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were sucessfully copied. If a fault is encountered then clear the page
+ * out to (offset+bytes) and return the number of bytes which were copied.
+ */
+static inline size_t
+filemap_copy_from_user(struct page *page, unsigned long offset,
+ const char __user *buf, unsigned bytes)
+{
+ char *kaddr;
+ int left;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ if (left != 0) {
+ /* Do it the slow way */
+ kaddr = kmap(page);
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ kunmap(page);
+ }
+ return bytes - left;
+}
+
+/*
+ * This has the same sideeffects and return value as filemap_copy_from_user().
+ * The difference is that on a fault we need to memset the remainder of the
+ * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
+ * single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+ base, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (copied != bytes) {
+ kaddr = kmap(page);
+ copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+ base, bytes);
+ kunmap(page);
+ }
+ return copied;
+}
+
+static inline void
+filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+ const struct iovec *iov = *iovp;
+ size_t base = *basep;
+
+ while (bytes) {
+ int copy = min(bytes, iov->iov_len - base);
+
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ base = 0;
+ }
+ }
+ *iovp = iov;
+ *basep = base;
+}
+#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..3b6e384b98a6
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,447 @@
+/*
+ * linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <asm/tlbflush.h>
+#include "filemap.h"
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_page() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is not used at all. It may be NULL.
+ */
+static void
+do_xip_mapping_read(struct address_space *mapping,
+ struct file_ra_state *_ra,
+ struct file *filp,
+ loff_t *ppos,
+ read_descriptor_t *desc,
+ read_actor_t actor)
+{
+ struct inode *inode = mapping->host;
+ unsigned long index, end_index, offset;
+ loff_t isize;
+
+ BUG_ON(!mapping->a_ops->get_xip_page);
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+
+ isize = i_size_read(inode);
+ if (!isize)
+ goto out;
+
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ for (;;) {
+ struct page *page;
+ unsigned long nr, ret;
+
+ /* nr is the maximum number of bytes to copy from this page */
+ nr = PAGE_CACHE_SIZE;
+ if (index >= end_index) {
+ if (index > end_index)
+ goto out;
+ nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (nr <= offset) {
+ goto out;
+ }
+ }
+ nr = nr - offset;
+
+ page = mapping->a_ops->get_xip_page(mapping,
+ index*(PAGE_SIZE/512), 0);
+ if (!page)
+ goto no_xip_page;
+ if (unlikely(IS_ERR(page))) {
+ if (PTR_ERR(page) == -ENODATA) {
+ /* sparse */
+ page = virt_to_page(empty_zero_page);
+ } else {
+ desc->error = PTR_ERR(page);
+ goto out;
+ }
+ } else
+ BUG_ON(!PageUptodate(page));
+
+ /* If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ ret = actor(desc, page, offset, nr);
+ offset += ret;
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+
+ if (ret == nr && desc->count)
+ continue;
+ goto out;
+
+no_xip_page:
+ /* Did not get the page. Report it */
+ desc->error = -EIO;
+ goto out;
+ }
+
+out:
+ *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ if (filp)
+ file_accessed(filp);
+}
+
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+ read_descriptor_t desc;
+
+ if (!access_ok(VERIFY_WRITE, buf, len))
+ return -EFAULT;
+
+ desc.written = 0;
+ desc.arg.buf = buf;
+ desc.count = len;
+ desc.error = 0;
+
+ do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+ ppos, &desc, file_read_actor);
+
+ if (desc.written)
+ return desc.written;
+ else
+ return desc.error;
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+
+ssize_t
+xip_file_sendfile(struct file *in_file, loff_t *ppos,
+ size_t count, read_actor_t actor, void *target)
+{
+ read_descriptor_t desc;
+
+ if (!count)
+ return 0;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.arg.data = target;
+ desc.error = 0;
+
+ do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
+ ppos, &desc, actor);
+ if (desc.written)
+ return desc.written;
+ return desc.error;
+}
+EXPORT_SYMBOL_GPL(xip_file_sendfile);
+
+/*
+ * __xip_unmap is invoked from xip_unmap and
+ * xip_write
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * empty_zero_page when found at pgoff. Should it go in rmap.c?
+ */
+static void
+__xip_unmap (struct address_space * mapping,
+ unsigned long pgoff)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct prio_tree_iter iter;
+ unsigned long address;
+ pte_t *pte;
+ pte_t pteval;
+
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ mm = vma->vm_mm;
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /*
+ * We need the page_table_lock to protect us from page faults,
+ * munmap, fork, etc...
+ */
+ pte = page_check_address(virt_to_page(empty_zero_page), mm,
+ address);
+ if (!IS_ERR(pte)) {
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(pte));
+ pteval = ptep_clear_flush(vma, address, pte);
+ BUG_ON(pte_dirty(pteval));
+ pte_unmap(pte);
+ spin_unlock(&mm->page_table_lock);
+ }
+ }
+ spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * xip_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_nopage, but used for execute in place
+ */
+static struct page *
+xip_file_nopage(struct vm_area_struct * area,
+ unsigned long address,
+ int *type)
+{
+ struct file *file = area->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct page *page;
+ unsigned long size, pgoff, endoff;
+
+ pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+ + area->vm_pgoff;
+ endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
+ + area->vm_pgoff;
+
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (pgoff >= size) {
+ return NULL;
+ }
+
+ page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+ if (!IS_ERR(page)) {
+ BUG_ON(!PageUptodate(page));
+ return page;
+ }
+ if (PTR_ERR(page) != -ENODATA)
+ return NULL;
+
+ /* sparse block */
+ if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+ (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
+ (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+ /* maybe shared writable, allocate new block */
+ page = mapping->a_ops->get_xip_page (mapping,
+ pgoff*(PAGE_SIZE/512), 1);
+ if (IS_ERR(page))
+ return NULL;
+ BUG_ON(!PageUptodate(page));
+ /* unmap page at pgoff from all other vmas */
+ __xip_unmap(mapping, pgoff);
+ } else {
+ /* not shared and writable, use empty_zero_page */
+ page = virt_to_page(empty_zero_page);
+ }
+
+ return page;
+}
+
+static struct vm_operations_struct xip_file_vm_ops = {
+ .nopage = xip_file_nopage,
+};
+
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+ BUG_ON(!file->f_mapping->a_ops->get_xip_page);
+
+ file_accessed(file);
+ vma->vm_ops = &xip_file_vm_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t pos, loff_t *ppos)
+{
+ struct address_space * mapping = filp->f_mapping;
+ struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status = 0;
+ struct page *page;
+ size_t bytes;
+ ssize_t written = 0;
+
+ BUG_ON(!mapping->a_ops->get_xip_page);
+
+ do {
+ unsigned long index;
+ unsigned long offset;
+ size_t copied;
+
+ offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ index = pos >> PAGE_CACHE_SHIFT;
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > count)
+ bytes = count;
+
+ /*
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the
+ * same page as we're writing to, without it being marked
+ * up-to-date.
+ */
+ fault_in_pages_readable(buf, bytes);
+
+ page = a_ops->get_xip_page(mapping,
+ index*(PAGE_SIZE/512), 0);
+ if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
+ /* we allocate a new page unmap it */
+ page = a_ops->get_xip_page(mapping,
+ index*(PAGE_SIZE/512), 1);
+ if (!IS_ERR(page))
+ /* unmap page at pgoff from all other vmas */
+ __xip_unmap(mapping, index);
+ }
+
+ if (IS_ERR(page)) {
+ status = PTR_ERR(page);
+ break;
+ }
+
+ BUG_ON(!PageUptodate(page));
+
+ copied = filemap_copy_from_user(page, offset, buf, bytes);
+ flush_dcache_page(page);
+ if (likely(copied > 0)) {
+ status = copied;
+
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ }
+ }
+ if (unlikely(copied != bytes))
+ if (status >= 0)
+ status = -EFAULT;
+ if (status < 0)
+ break;
+ } while (count);
+ *ppos = pos;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_sem.
+ */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+
+ return written ? written : status;
+}
+
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count;
+ loff_t pos;
+ ssize_t ret;
+
+ down(&inode->i_sem);
+
+ if (!access_ok(VERIFY_READ, buf, len)) {
+ ret=-EFAULT;
+ goto out_up;
+ }
+
+ pos = *ppos;
+ count = len;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_backing;
+ if (count == 0)
+ goto out_backing;
+
+ ret = remove_suid(filp->f_dentry);
+ if (ret)
+ goto out_backing;
+
+ inode_update_time(inode, 1);
+
+ ret = __xip_file_write (filp, buf, count, pos, ppos);
+
+ out_backing:
+ current->backing_dev_info = NULL;
+ out_up:
+ up(&inode->i_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analog to block_truncate_page but does use get_xip_page
+ * to get the page instead of page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize;
+ unsigned length;
+ struct page *page;
+ void *kaddr;
+
+ BUG_ON(!mapping->a_ops->get_xip_page);
+
+ blocksize = 1 << mapping->host->i_blkbits;
+ length = offset & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!length)
+ return 0;
+
+ length = blocksize - length;
+
+ page = mapping->a_ops->get_xip_page(mapping,
+ index*(PAGE_SIZE/512), 0);
+ if (!page)
+ return -ENOMEM;
+ if (unlikely(IS_ERR(page))) {
+ if (PTR_ERR(page) == -ENODATA)
+ /* Hole? No need to truncate */
+ return 0;
+ else
+ return PTR_ERR(page);
+ } else
+ BUG_ON(!PageUptodate(page));
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ flush_dcache_page(page);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <linux/hugetlb.h>
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+{
+ pte_t entry;
+
+ if (vma->vm_flags & VM_WRITE) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else {
+ entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+
+ return entry;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ dst_pte = huge_pte_alloc(dst, addr);
+ if (!dst_pte)
+ goto nomem;
+ src_pte = huge_pte_offset(src, addr);
+ BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ addr += HPAGE_SIZE;
+ }
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t pte;
+ struct page *page;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~HPAGE_MASK);
+ BUG_ON(end & ~HPAGE_MASK);
+
+ for (address = start; address < end; address += HPAGE_SIZE) {
+ pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+ if (pte_none(pte))
+ continue;
+ page = pte_page(pte);
+ put_page(page);
+ }
+ add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+ flush_tlb_range(vma, start, end);
+}
+
void zap_hugepage_range(struct vm_area_struct *vma,
unsigned long start, unsigned long length)
{
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
unmap_hugepage_range(vma, start, start + length);
spin_unlock(&mm->page_table_lock);
}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ hugetlb_prefault_arch_hook(mm);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (! pte_none(*pte))
+ hugetlb_clean_stale_pgtable(pte);
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (! ret) {
+ unlock_page(page);
+ } else {
+ hugetlb_put_quota(mapping);
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ BUG_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+
+ if (pages) {
+ pte_t *pte;
+ struct page *page;
+
+ /* Some archs (sparc64, sh*) have multiple
+ * pte_ts to each hugepage. We have to make
+ * sure we get the first, for the page
+ * indexing below to work. */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..73180a22877e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
+#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
-static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_behavior(struct vm_area_struct * vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
struct mm_struct * mm = vma->vm_mm;
int error = 0;
+ pgoff_t pgoff;
+ int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+
+ switch (behavior) {
+ case MADV_SEQUENTIAL:
+ new_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+
+ if (new_flags == vma->vm_flags) {
+ *prev = vma;
+ goto success;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma));
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
@@ -35,22 +65,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
- VM_ClearReadHint(vma);
-
- switch (behavior) {
- case MADV_SEQUENTIAL:
- vma->vm_flags |= VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- vma->vm_flags |= VM_RAND_READ;
- break;
- default:
- break;
- }
+ vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
+success:
return error;
}
@@ -58,6 +78,7 @@ out:
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
@@ -65,6 +86,12 @@ static long madvise_willneed(struct vm_area_struct * vma,
if (!file)
return -EBADF;
+ if (file->f_mapping->a_ops->get_xip_page) {
+ /* no bad return value, but ignore advice */
+ return 0;
+ }
+
+ *prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
@@ -95,8 +122,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
+ *prev = vma;
if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
return -EINVAL;
@@ -111,8 +140,8 @@ static long madvise_dontneed(struct vm_area_struct * vma,
return 0;
}
-static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
long error = -EBADF;
@@ -120,15 +149,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
- error = madvise_behavior(vma, start, end, behavior);
+ error = madvise_behavior(vma, prev, start, end, behavior);
break;
case MADV_WILLNEED:
- error = madvise_willneed(vma, start, end);
+ error = madvise_willneed(vma, prev, start, end);
break;
case MADV_DONTNEED:
- error = madvise_dontneed(vma, start, end);
+ error = madvise_dontneed(vma, prev, start, end);
break;
default:
@@ -175,8 +204,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
*/
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
- struct vm_area_struct * vma;
+ unsigned long end, tmp;
+ struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
size_t len;
@@ -202,40 +231,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
*/
- vma = find_vma(current->mm, start);
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (!vma && prev)
+ vma = prev->vm_next;
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
- /* Here start < vma->vm_end. */
+ /* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
+ if (start >= end)
+ goto out;
}
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = madvise_vma(vma, start, end,
- behavior);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
- }
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = madvise_vma(vma, start, vma->vm_end, behavior);
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
- start = vma->vm_end;
- vma = vma->vm_next;
+ start = tmp;
+ if (start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ goto out;
+ vma = prev->vm_next;
}
-
out:
up_write(&current->mm->mmap_sem);
return error;
diff --git a/mm/memory.c b/mm/memory.c
index d209f745db7f..beabdefa6254 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address)
{
return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
}
-
EXPORT_SYMBOL(check_user_page_readable);
-/*
- * Given a physical address, is there a useful struct page pointing to
- * it? This may become more complex in the future if we start dealing
- * with IO-aperture pages for direct-IO.
- */
-
-static inline struct page *get_page_map(struct page *page)
-{
- if (!pfn_valid(page_to_pfn(page)))
- return NULL;
- return page;
-}
-
-
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
return 0;
}
-
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
spin_lock(&mm->page_table_lock);
do {
- struct page *map;
+ struct page *page;
int lookup_write = write;
cond_resched_lock(&mm->page_table_lock);
- while (!(map = follow_page(mm, start, lookup_write))) {
+ while (!(page = follow_page(mm, start, lookup_write))) {
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
- * insanly big anonymously mapped areas that
+ * insanely big anonymously mapped areas that
* nobody touched so far. This is important
* for doing a core dump for these mappings.
*/
if (!lookup_write &&
untouched_anonymous_page(mm,vma,start)) {
- map = ZERO_PAGE(start);
+ page = ZERO_PAGE(start);
break;
}
spin_unlock(&mm->page_table_lock);
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
spin_lock(&mm->page_table_lock);
}
if (pages) {
- pages[i] = get_page_map(map);
- if (!pages[i]) {
- spin_unlock(&mm->page_table_lock);
- while (i--)
- page_cache_release(pages[i]);
- i = -EFAULT;
- goto out;
- }
- flush_dcache_page(pages[i]);
- if (!PageReserved(pages[i]))
- page_cache_get(pages[i]);
+ pages[i] = page;
+ flush_dcache_page(page);
+ if (!PageReserved(page))
+ page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
i++;
start += PAGE_SIZE;
len--;
- } while(len && start < vma->vm_end);
+ } while (len && start < vma->vm_end);
spin_unlock(&mm->page_table_lock);
- } while(len);
-out:
+ } while (len);
return i;
}
-
EXPORT_SYMBOL(get_user_pages);
static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1164,7 +1139,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
}
old_page = pfn_to_page(pfn);
- if (!TestSetPageLocked(old_page)) {
+ if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
@@ -1483,7 +1458,7 @@ restart:
* unmap_mapping_range - unmap the portion of all mmaps
* in the specified address_space corresponding to the specified
* page range in the underlying file.
- * @address_space: the address space containing mmaps to be unmapped.
+ * @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
* boundary. Note that this is different from vmtruncate(), which
@@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
}
/* The page isn't present yet, go ahead with the fault. */
-
- swap_free(entry);
- if (vm_swap_full())
- remove_exclusive_swap_page(page);
inc_mm_counter(mm, rss);
pte = mk_pte(page, vma->vm_page_prot);
@@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
- unlock_page(page);
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ swap_free(entry);
+ if (vm_swap_full())
+ remove_exclusive_swap_page(page);
+ unlock_page(page);
+
if (write_access) {
if (do_wp_page(mm, vma, address,
page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
}
/* Ensure all existing pages follow the policy. */
-static int
-verify_pages(struct mm_struct *mm,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
{
- while (addr < end) {
- struct page *p;
- pte_t *pte;
- pmd_t *pmd;
- pud_t *pud;
- pgd_t *pgd;
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd)) {
- unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
- if (next > addr)
- break;
- addr = next;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+ spin_lock(&mm->page_table_lock);
+ orig_pte = pte = pte_offset_map(pmd, addr);
+ do {
+ unsigned long pfn;
+ unsigned int nid;
+
+ if (!pte_present(*pte))
continue;
- }
- pud = pud_offset(pgd, addr);
- if (pud_none(*pud)) {
- addr = (addr + PUD_SIZE) & PUD_MASK;
+ pfn = pte_pfn(*pte);
+ if (!pfn_valid(pfn))
continue;
- }
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- addr = (addr + PMD_SIZE) & PMD_MASK;
+ nid = pfn_to_nid(pfn);
+ if (!test_bit(nid, nodes))
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(orig_pte);
+ spin_unlock(&mm->page_table_lock);
+ return addr != end;
+}
+
+static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
continue;
- }
- p = NULL;
- pte = pte_offset_map(pmd, addr);
- if (pte_present(*pte))
- p = pte_page(*pte);
- pte_unmap(pte);
- if (p) {
- unsigned nid = page_to_nid(p);
- if (!test_bit(nid, nodes))
- return -EIO;
- }
- addr += PAGE_SIZE;
- }
+ if (check_pte_range(mm, pmd, addr, next, nodes))
+ return -EIO;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (check_pmd_range(mm, pud, addr, next, nodes))
+ return -EIO;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pgd_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ if (check_pud_range(mm, pgd, addr, next, nodes))
+ return -EIO;
+ } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
- err = verify_pages(vma->vm_mm,
+ err = check_pgd_range(vma->vm_mm,
vma->vm_start, vma->vm_end, nodes);
if (err) {
first = ERR_PTR(err);
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
- zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+ zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
put_cpu();
}
return page;
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..9a72f7d918fa 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
* functions might sleep - as long as the mempool_alloc function is not called
* from IRQ contexts.
*/
-mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
- mempool_t *pool;
+ return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+}
+EXPORT_SYMBOL(mempool_create);
- pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+ mempool_free_t *free_fn, void *pool_data, int node_id)
+{
+ mempool_t *pool;
+ pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
if (!pool)
return NULL;
memset(pool, 0, sizeof(*pool));
- pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
+ pool->elements = kmalloc_node(min_nr * sizeof(void *),
+ GFP_KERNEL, node_id);
if (!pool->elements) {
kfree(pool);
return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
}
return pool;
}
-EXPORT_SYMBOL(mempool_create);
+EXPORT_SYMBOL(mempool_create_node);
/**
* mempool_resize - resize an existing memory pool
@@ -197,7 +204,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
{
void *element;
unsigned long flags;
- DEFINE_WAIT(wait);
+ wait_queue_t wait;
int gfp_temp;
might_sleep_if(gfp_mask & __GFP_WAIT);
@@ -228,6 +235,7 @@ repeat_alloc:
/* Now start performing page reclaim */
gfp_temp = gfp_mask;
+ init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
smp_mb();
if (!pool->curr_nr)
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..da3fa90a0aae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- start_addr = addr = mm->free_area_cache;
+ if (len > mm->cached_hole_size) {
+ start_addr = addr = mm->free_area_cache;
+ } else {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
full_search:
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
+ addr = TASK_UNMAPPED_BASE;
+ start_addr = addr;
+ mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
mm->free_area_cache = addr + len;
return addr;
}
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
}
#endif
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the lowest possible address?
*/
- if (area->vm_start >= TASK_UNMAPPED_BASE &&
- area->vm_start < area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_start;
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ mm->free_area_cache = addr;
+ mm->cached_hole_size = ~0UL;
+ }
}
/*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
+ /* check if free_area_cache is useful for us */
+ if (len <= mm->cached_hole_size) {
+ mm->cached_hole_size = 0;
+ mm->free_area_cache = mm->mmap_base;
+ }
+
/* either no address requested or can't fit in requested address hole */
addr = mm->free_area_cache;
@@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return (mm->free_area_cache = addr-len);
}
+ if (mm->mmap_base < len)
+ goto bottomup;
+
addr = mm->mmap_base-len;
do {
@@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr);
+ /* remember the largest hole we saw so far */
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
+
/* try just below the current vma->vm_start */
addr = vma->vm_start-len;
} while (len < vma->vm_start);
+bottomup:
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = ~0UL;
return addr;
}
#endif
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the highest possible address?
*/
- if (area->vm_end > area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_end;
+ if (addr > mm->free_area_cache)
+ mm->free_area_cache = addr;
/* dont allow allocations above current base */
- if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
- area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+ if (mm->free_area_cache > mm->mmap_base)
+ mm->free_area_cache = mm->mmap_base;
}
unsigned long
@@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
if (area->vm_flags & VM_LOCKED)
area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
vm_stat_unaccount(area);
- area->vm_mm->unmap_area(area);
remove_vm_struct(area);
}
@@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
+ unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
@@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} while (vma && vma->vm_start < end);
*insertion_point = vma;
tail_vma->vm_next = NULL;
+ if (mm->unmap_area == arch_unmap_area)
+ addr = prev ? prev->vm_end : mm->mmap_base;
+ else
+ addr = vma ? vma->vm_start : mm->mmap_base;
+ mm->unmap_area(mm, addr);
mm->mmap_cache = NULL; /* Kill the cache. */
}
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!pte_present(*pte))
continue;
+ if (!pte_maybe_dirty(*pte))
+ continue;
pfn = pte_pfn(*pte);
if (!pfn_valid(pfn))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask)
struct mm_struct *mm = NULL;
task_t * p;
+ printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+ /* print memory stats */
+ show_mem();
+
read_lock(&tasklist_lock);
retry:
p = select_bad_process();
@@ -268,12 +272,9 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
- show_free_areas();
panic("Out of memory and no killable processes...\n");
}
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- show_free_areas();
mm = oom_kill_process(p);
if (!mm)
goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
*/
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
- page->flags &= ~(1 << PG_private |
+ page->flags &= ~(1 << PG_lru |
+ 1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
*/
static void prep_new_page(struct page *page, int order)
{
- if (page->mapping || page_mapcount(page) ||
- (page->flags & (
+ if ( page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return allocated;
}
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+ struct zone *zone;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+
+ /* Do not drain local pagesets */
+ if (zone->zone_pgdat->node_id == numa_node_id())
+ continue;
+
+ pset = zone->pageset[smp_processor_id()];
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[i];
+ if (pcp->count)
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
+ }
+ }
+ local_irq_restore(flags);
+}
+#endif
+
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;
- pset = &zone->pageset[cpu];
+ pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ p = zone_pcp(z,cpu);
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ p->numa_hit++;
} else {
p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
local_irq_restore(flags);
put_cpu();
}
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+ if (!z->reclaim_pages)
+ return 0;
+ if (gfp_mask & __GFP_NORECLAIM)
+ return 0;
+ return 1;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
classzone_idx = zone_idx(zones[0]);
- restart:
+restart:
/* Go through the zonelist once, looking for a zone with enough free */
for (i = 0; (z = zones[i]) != NULL; i++) {
-
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0))
- continue;
+ int do_reclaim = should_reclaim_zone(z, gfp_mask);
if (!cpuset_zone_allowed(z))
continue;
+ /*
+ * If the zone is to attempt early page reclaim then this loop
+ * will try to reclaim pages and check the watermark a second
+ * time before giving up and falling back to the next zone.
+ */
+zone_reclaim_retry:
+ if (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, 0, 0)) {
+ if (!do_reclaim)
+ continue;
+ else {
+ zone_reclaim(z, gfp_mask, order);
+ /* Only try reclaim once */
+ do_reclaim = 0;
+ goto zone_reclaim_retry;
+ }
+ }
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
dump_stack();
+ show_mem();
}
return NULL;
got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
return ret;
}
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
{
unsigned long flags;
void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d\n",
+ printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch);
+ pageset->pcp[temperature].batch,
+ pageset->pcp[temperature].count);
}
}
get_page_state(&ps);
get_zone_counts(&active, &inactive, &free);
- printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+ printk("Free pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));
@@ -1587,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
- struct page *start = pfn_to_page(start_pfn);
struct page *page;
+ unsigned long end_pfn = start_pfn + size;
+ unsigned long pfn;
- for (page = start; page < (start + size); page++) {
- set_page_zone(page, NODEZONE(nid, zone));
+ for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ page = pfn_to_page(pfn);
+ set_page_links(page, zone, nid, pfn);
set_page_count(page, 0);
reset_page_mapcount(page);
SetPageReserved(page);
@@ -1615,11 +1683,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
}
}
+#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size)
+{
+ unsigned long snum = pfn_to_section_nr(pfn);
+ unsigned long end = pfn_to_section_nr(pfn + size);
+
+ if (FLAGS_HAS_NODE)
+ zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+ else
+ for (; snum <= end; snum++)
+ zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+ return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &p->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+ boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+
+ for_each_zone(zone) {
+
+ zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!zone->pageset[cpu])
+ goto bad;
+
+ setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(dzone->pageset[cpu]);
+ dzone->pageset[cpu] = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1643,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
unsigned long size, realsize;
unsigned long batch;
- zone_table[NODEZONE(nid, j)] = zone;
realsize = size = zones_size[j];
if (zholes_size)
realsize -= zholes_size[j];
@@ -1662,48 +1899,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1713,6 +1918,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ atomic_set(&zone->reclaim_in_progress, -1);
if (!size)
continue;
@@ -1740,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
memmap_init(size, nid, j, zone_start_pfn);
+ zonetable_add(zone, nid, j, zone_start_pfn, size);
+
zone_start_pfn += size;
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1748,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
- unsigned long size;
-
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
+ unsigned long size;
+ struct page *map;
+
size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
- pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+ map = alloc_remap(pgdat->node_id, size);
+ if (!map)
+ map = alloc_bootmem_node(pgdat, size);
+ pgdat->node_mem_map = map;
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0))
mem_map = NODE_DATA(0)->node_mem_map;
#endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1781,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
free_area_init_core(pgdat, zones_size, zholes_size);
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
EXPORT_SYMBOL(contig_page_data);
+#endif
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_node(0, &contig_page_data, zones_size,
+ free_area_init_node(0, NODE_DATA(0), zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#endif
#ifdef CONFIG_PROC_FS
@@ -1853,6 +2067,115 @@ struct seq_operations fragmentation_op = {
.show = frag_show,
};
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = arg;
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+ int i;
+
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n active %lu"
+ "\n inactive %lu"
+ "\n scanned %lu (a: %lu i: %lu)"
+ "\n spanned %lu"
+ "\n present %lu",
+ zone->free_pages,
+ zone->pages_min,
+ zone->pages_low,
+ zone->pages_high,
+ zone->nr_active,
+ zone->nr_inactive,
+ zone->pages_scanned,
+ zone->nr_scan_active, zone->nr_scan_inactive,
+ zone->spanned_pages,
+ zone->present_pages);
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ struct per_cpu_pageset *pageset;
+ int j;
+
+ pageset = zone_pcp(zone, i);
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ if (pageset->pcp[j].count)
+ break;
+ }
+ if (j == ARRAY_SIZE(pageset->pcp))
+ continue;
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ seq_printf(m,
+ "\n cpu: %i pcp: %i"
+ "\n count: %i"
+ "\n low: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i, j,
+ pageset->pcp[j].count,
+ pageset->pcp[j].low,
+ pageset->pcp[j].high,
+ pageset->pcp[j].batch);
+ }
+#ifdef CONFIG_NUMA
+ seq_printf(m,
+ "\n numa_hit: %lu"
+ "\n numa_miss: %lu"
+ "\n numa_foreign: %lu"
+ "\n interleave_hit: %lu"
+ "\n local_node: %lu"
+ "\n other_node: %lu",
+ pageset->numa_hit,
+ pageset->numa_miss,
+ pageset->numa_foreign,
+ pageset->interleave_hit,
+ pageset->local_node,
+ pageset->other_node);
+#endif
+ }
+ seq_printf(m,
+ "\n all_unreclaimable: %u"
+ "\n prev_priority: %i"
+ "\n temp_priority: %i"
+ "\n start_pfn: %lu",
+ zone->all_unreclaimable,
+ zone->prev_priority,
+ zone->temp_priority,
+ zone->zone_start_pfn);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+ return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+ .start = frag_start, /* iterate over all zones. The same as in
+ * fragmentation. */
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = zoneinfo_show,
+};
+
static char *vmstat_text[] = {
"nr_dirty",
"nr_writeback",
@@ -2058,10 +2381,10 @@ static void setup_per_zone_pages_min(void)
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /* if it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
+ zone->pages_min = (pages_min * zone->present_pages) /
lowmem_pages;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 667c76df1ec2..2e605a19ce57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -127,7 +127,7 @@ out:
return ret;
}
-#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
+#ifdef CONFIG_SOFTWARE_SUSPEND
/*
* A scruffy utility function to read or write an arbitrary swap page
* and wait on the I/O. The caller must have a ref on the page.
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
spin_unlock_irq(&pdflush_lock);
schedule();
- if (try_to_freeze(PF_FREEZE)) {
+ if (try_to_freeze()) {
spin_lock_irq(&pdflush_lock);
continue;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
*
* On success returns with mapped pte and locked mm->page_table_lock.
*/
-static pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address)
+pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+ unsigned long address)
{
pgd_t *pgd;
pud_t *pud;
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
goto out_unmap;
}
- /*
- * Don't pull an anonymous page out from under get_user_pages.
- * GUP carefully breaks COW and raises page count (while holding
- * page_table_lock, as we have here) to make sure that the page
- * cannot be freed. If we unmap that page here, a user write
- * access to the virtual address will bring back the page, but
- * its raised count will (ironically) be taken to mean it's not
- * an exclusive swap page, do_wp_page will replace it by a copy
- * page, and the user never get to see the data GUP was holding
- * the original page for.
- *
- * This test is also useful for when swapoff (unuse_process) has
- * to drop page lock: its reference to the page stops existing
- * ptes from being unmapped, so swapoff can make progress.
- */
- if (PageSwapCache(page) &&
- page_count(page) != page_mapcount(page) + 2) {
- ret = SWAP_FAIL;
- goto out_unmap;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 61574b81d979..e64fa726a790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,8 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2004 Hugh Dickins.
- * Copyright (C) 2002-2004 VERITAS Software Corporation.
+ * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
* Extended attribute support for tmpfs:
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock);
static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_blocks += pages;
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode)
spin_unlock(&shmem_swaplist_lock);
}
}
- if (sbinfo) {
- BUG_ON(inode->i_blocks);
+ BUG_ON(inode->i_blocks);
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1080,7 +1080,7 @@ repeat:
} else {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks == 0 ||
shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
mpol_shared_policy_init(&info->policy);
break;
}
- } else if (sbinfo) {
+ } else if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
}
#ifdef CONFIG_TMPFS
-
-static int shmem_set_size(struct shmem_sb_info *sbinfo,
- unsigned long max_blocks, unsigned long max_inodes)
-{
- int error;
- unsigned long blocks, inodes;
-
- spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- error = -EINVAL;
- if (max_blocks < blocks)
- goto out;
- if (max_inodes < inodes)
- goto out;
- error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
-out:
- spin_unlock(&sbinfo->stat_lock);
- return error;
-}
-
static struct inode_operations shmem_symlink_inode_operations;
static struct inode_operations shmem_symlink_inline_operations;
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- if (sbinfo) {
- spin_lock(&sbinfo->stat_lock);
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ }
+ if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
- spin_unlock(&sbinfo->stat_lock);
}
/* else leave those fields 0 like simple_statfs */
+ spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1921,22 +1898,42 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = 0;
- unsigned long max_inodes = 0;
+ unsigned long max_blocks = sbinfo->max_blocks;
+ unsigned long max_inodes = sbinfo->max_inodes;
+ unsigned long blocks;
+ unsigned long inodes;
+ int error = -EINVAL;
+
+ if (shmem_parse_options(data, NULL, NULL, NULL,
+ &max_blocks, &max_inodes))
+ return error;
- if (sbinfo) {
- max_blocks = sbinfo->max_blocks;
- max_inodes = sbinfo->max_inodes;
- }
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
- return -EINVAL;
- /* Keep it simple: disallow limited <-> unlimited remount */
- if ((max_blocks || max_inodes) == !sbinfo)
- return -EINVAL;
- /* But allow the pointless unlimited -> unlimited remount */
- if (!sbinfo)
- return 0;
- return shmem_set_size(sbinfo, max_blocks, max_inodes);
+ spin_lock(&sbinfo->stat_lock);
+ blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+ inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if (max_blocks < blocks)
+ goto out;
+ if (max_inodes < inodes)
+ goto out;
+ /*
+ * Those tests also disallow limited->unlimited while any are in
+ * use, so i_blocks will always be zero when max_blocks is zero;
+ * but we must separately disallow unlimited->limited, because
+ * in that case we have no record of how much is already in use.
+ */
+ if (max_blocks && !sbinfo->max_blocks)
+ goto out;
+ if (max_inodes && !sbinfo->max_inodes)
+ goto out;
+
+ error = 0;
+ sbinfo->max_blocks = max_blocks;
+ sbinfo->free_blocks = max_blocks - blocks;
+ sbinfo->max_inodes = max_inodes;
+ sbinfo->free_inodes = max_inodes - inodes;
+out:
+ spin_unlock(&sbinfo->stat_lock);
+ return error;
}
#endif
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb,
uid_t uid = current->fsuid;
gid_t gid = current->fsgid;
int err = -ENOMEM;
-
-#ifdef CONFIG_TMPFS
+ struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;
+#ifdef CONFIG_TMPFS
/*
* Per default we only allow half of the physical ram per
* tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb,
inodes = totalram_pages - totalhigh_pages;
if (inodes > blocks)
inodes = blocks;
-
- if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid,
+ &blocks, &inodes))
return -EINVAL;
}
-
- if (blocks || inodes) {
- struct shmem_sb_info *sbinfo;
- sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
- sb->s_fs_info = sbinfo;
- spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- }
- sb->s_xattr = shmem_xattr_handlers;
#else
sb->s_flags |= MS_NOUSER;
#endif
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ spin_lock_init(&sbinfo->stat_lock);
+ sbinfo->max_blocks = blocks;
+ sbinfo->free_blocks = blocks;
+ sbinfo->max_inodes = inodes;
+ sbinfo->free_inodes = inodes;
+
+ sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
+ sb->s_xattr = shmem_xattr_handlers;
+
inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
if (!inode)
goto failed;
diff --git a/mm/slab.c b/mm/slab.c
index 840742641152..122d031baab2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
+#include <linux/string.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2620,6 +2621,12 @@ unsigned int kmem_cache_size(kmem_cache_t *cachep)
}
EXPORT_SYMBOL(kmem_cache_size);
+const char *kmem_cache_name(kmem_cache_t *cachep)
+{
+ return cachep->name;
+}
+EXPORT_SYMBOL_GPL(kmem_cache_name);
+
struct ccupdate_struct {
kmem_cache_t *cachep;
struct array_cache *new[NR_CPUS];
@@ -2845,6 +2852,7 @@ next:
}
check_irq_on();
up(&cache_chain_sem);
+ drain_remote_pages();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
}
@@ -3075,3 +3083,26 @@ unsigned int ksize(const void *objp)
return size;
}
+
+
+/*
+ * kstrdup - allocate space for and copy an existing string
+ *
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrdup(const char *s, int gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strlen(s) + 1;
+ buf = kmalloc(len, gfp);
+ if (buf)
+ memcpy(buf, s, len);
+ return buf;
+}
+EXPORT_SYMBOL(kstrdup);
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..b54e304df4a7
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,137 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section - memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+
+/* Record a memory area against a node. */
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+ start &= PAGE_SECTION_MASK;
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ unsigned long section = pfn_to_section_nr(pfn);
+ if (!mem_section[section].section_mem_map)
+ mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
+ }
+}
+
+/*
+ * Only used by the i386 NUMA architecures, but relatively
+ * generic code.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ unsigned long nr_pages = 0;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ if (nid != early_pfn_to_nid(pfn))
+ continue;
+
+ if (pfn_valid(pfn))
+ nr_pages += PAGES_PER_SECTION;
+ }
+
+ return nr_pages * sizeof(struct page);
+}
+
+/*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
+{
+ return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+}
+
+/*
+ * We need this if we ever free the mem_maps. While not implemented yet,
+ * this function is included for parity with its sibling.
+ */
+static __attribute((unused))
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
+{
+ return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
+}
+
+static int sparse_init_one_section(struct mem_section *ms,
+ unsigned long pnum, struct page *mem_map)
+{
+ if (!valid_section(ms))
+ return -EINVAL;
+
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+
+ return 1;
+}
+
+static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
+{
+ struct page *map;
+ int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+
+ map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
+ map = alloc_bootmem_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
+ printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+ mem_section[pnum].section_mem_map = 0;
+ return NULL;
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void sparse_init(void)
+{
+ unsigned long pnum;
+ struct page *map;
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ if (!valid_section_nr(pnum))
+ continue;
+
+ map = sparse_early_mem_map_alloc(pnum);
+ if (map)
+ sparse_init_one_section(&mem_section[pnum], pnum, map);
+ }
+}
+
+/*
+ * returns the number of sections whose mem_maps were properly
+ * set. If this is <=0, then that means that the passed-in
+ * map was not consumed and must be freed.
+ */
+int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+{
+ struct mem_section *ms = __pfn_to_section(start_pfn);
+
+ if (ms->section_mem_map & SECTION_MARKED_PRESENT)
+ return -EEXIST;
+
+ ms->section_mem_map |= SECTION_MARKED_PRESENT;
+
+ return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
}
/*
- * Check if we're the only user of a swap page,
- * when the page is locked.
+ * How many references to page are currently swapped out?
*/
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
{
- int retval = 0;
- struct swap_info_struct * p;
+ int count = 0;
+ struct swap_info_struct *p;
swp_entry_t entry;
entry.val = page->private;
p = swap_info_get(entry);
if (p) {
- /* Is the only swap cache user the cache itself? */
- if (p->swap_map[swp_offset(entry)] == 1) {
- /* Recheck the page count with the swapcache lock held.. */
- write_lock_irq(&swapper_space.tree_lock);
- if (page_count(page) == 2)
- retval = 1;
- write_unlock_irq(&swapper_space.tree_lock);
- }
+ /* Subtract the 1 for the swap cache itself */
+ count = p->swap_map[swp_offset(entry)] - 1;
swap_info_put(p);
}
- return retval;
+ return count;
}
/*
* We can use this swap cache entry directly
* if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
*/
int can_share_swap_page(struct page *page)
{
- int retval = 0;
+ int count;
- if (!PageLocked(page))
- BUG();
- switch (page_count(page)) {
- case 3:
- if (!PagePrivate(page))
- break;
- /* Fallthrough */
- case 2:
- if (!PageSwapCache(page))
- break;
- retval = exclusive_swap_page(page);
- break;
- case 1:
- if (PageReserved(page))
- break;
- retval = 1;
- }
- return retval;
+ BUG_ON(!PageLocked(page));
+ count = page_mapcount(page);
+ if (count <= 1 && PageSwapCache(page))
+ count += page_swapcount(page);
+ return count == 1;
}
/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
if (!down_read_trylock(&mm->mmap_sem)) {
/*
- * Our reference to the page stops try_to_unmap_one from
- * unmapping its ptes, so swapoff can make progress.
+ * Activate page so shrink_cache is unlikely to unmap its
+ * ptes while lock is dropped, so swapoff can make progress.
*/
+ activate_page(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 269eded9b459..1fa312a8db77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,9 @@ struct scan_control {
int may_writepage;
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker);
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
*/
static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
+ int ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
- return 0;
+ return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
+ int nr_before;
+ nr_before = (*shrinker->shrinker)(0, gfp_mask);
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return 0;
+ return ret;
}
/* Called without lock on whether page is mapped, so answer is unstable */
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
if (!add_to_swap(page))
goto activate_locked;
}
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, sc);
+ atomic_dec(&zone->reclaim_in_progress);
}
}
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones,
- unsigned int gfp_mask, unsigned int order)
+int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
{
int priority;
int ret = 0;
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones,
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
+ sc.may_swap = 1;
inc_page_state(allocstall);
@@ -1020,6 +1032,7 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
+ sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1079,6 +1092,7 @@ scan:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
if (zone->present_pages == 0)
continue;
@@ -1098,16 +1112,19 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
+ atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -1199,8 +1216,8 @@ static int kswapd(void *p)
order = 0;
for ( ; ; ) {
unsigned long new_order;
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+
+ try_to_freeze();
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+ struct scan_control sc;
+ int nr_pages = 1 << order;
+ int total_reclaimed = 0;
+
+ /* The reclaim may sleep, so don't do it if sleep isn't allowed */
+ if (!(gfp_mask & __GFP_WAIT))
+ return 0;
+ if (zone->all_unreclaimable)
+ return 0;
+
+ sc.gfp_mask = gfp_mask;
+ sc.may_writepage = 0;
+ sc.may_swap = 0;
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ /* scan at the highest priority */
+ sc.priority = 0;
+
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ /* Don't reclaim the zone if there are other reclaimers active */
+ if (!atomic_inc_and_test(&zone->reclaim_in_progress))
+ goto out;
+
+ shrink_zone(zone, &sc);
+ total_reclaimed = sc.nr_reclaimed;
+
+ out:
+ atomic_dec(&zone->reclaim_in_progress);
+ return total_reclaimed;
+}
+
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+ unsigned int state)
+{
+ struct zone *z;
+ int i;
+
+ if (node >= MAX_NUMNODES || !node_online(node))
+ return -EINVAL;
+
+ /* This will break if we ever add more zones */
+ if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+ return -EINVAL;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (!(zone & 1<<i))
+ continue;
+
+ z = &NODE_DATA(node)->node_zones[i];
+
+ if (state)
+ z->reclaim_pages = 1;
+ else
+ z->reclaim_pages = 0;
+ }
+
+ return 0;
+}