From e286781d5f2e9c846e012a39653a166e9d31777d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:30 -0700 Subject: mm: speculative page references If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation, and subsequently check to see if we hit the right page rather than relying on holding a lock or otherwise pinning a reference to the page. This can be done if get_page/put_page behaves consistently throughout the whole tree (ie. if we "get" the page after it has been used for something else, we must be able to free it with a put_page). Actually, there is a period where the count behaves differently: when the page is free or if it is a constituent page of a compound page. We need an atomic_inc_not_zero operation to ensure we don't try to grab the page in either case. This patch introduces the core locking protocol to the pagecache (ie. adds page_cache_get_speculative, and tweaks some update-side code to make it work). Thanks to Hugh for pointing out an improvement to the algorithm setting page_count to zero when we have control of all references, in order to hold off speculative getters. [kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()] [hugh@veritas.com: fix add_to_page_cache] [akpm@linux-foundation.org: repair a comment] Signed-off-by: Nick Piggin Cc: Jeff Garzik Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2d3ec1ffc66e..4e182a9a14c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + write_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else + } else { + page->mapping = NULL; mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } write_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); @@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) -- cgit v1.2.3 From a60637c85893e7191faaafa6a72e197c24386727 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:31 -0700 Subject: mm: lockless pagecache Combine page_cache_get_speculative with lockless radix tree lookups to introduce lockless page cache lookups (ie. no mapping->tree_lock on the read-side). The only atomicity changes this introduces is that the gang pagecache lookup functions now behave as if they are implemented with multiple find_get_page calls, rather than operating on a snapshot of the pages. In practice, this atomicity guarantee is not used anyway, and it is to replace individual lookups, so these semantics are natural. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 179 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 45 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 4e182a9a14c0..feb8448d8618 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -637,15 +637,35 @@ void __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -660,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -751,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -778,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -810,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); -- cgit v1.2.3 From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:32 -0700 Subject: mm: spinlock tree_lock mapping->tree_lock has no read lockers. convert the lock from an rwlock to a spinlock. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index feb8448d8618..2ed8b0389c51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -109,7 +109,7 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->mapping = mapping; page->index = offset; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; @@ -480,7 +480,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page_cache_release(page); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page); -- cgit v1.2.3 From 2f1936b87783a3a56c9441b27b9ba7a747f11e8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:14 +0200 Subject: [patch 3/5] vfs: change remove_suid() to file_remove_suid() All calls to remove_suid() are made with a file pointer, because (similarly to file_update_time) it is called when the file is written. Clean up callers by passing in a file instead of a dentry. Signed-off-by: Miklos Szeredi --- mm/filemap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 2ed8b0389c51..5de7633e1dbe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1758,8 +1758,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1773,7 +1774,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -2529,7 +2530,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; -- cgit v1.2.3 From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include #include #include #include #include #include #include #include #include #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm/filemap.c') diff --git a/mm/filemap.c b/mm/filemap.c index 5de7633e1dbe..42bbc6909ba4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1023,8 +1023,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (TestSetPageLocked(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1094,6 +1103,7 @@ page_not_up_to_date: if (lock_page_killable(page)) goto readpage_eio; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); -- cgit v1.2.3