From 34d0640e268923bb1fb9d244a047cdfcd3f77909 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Apr 2015 11:31:35 -0400 Subject: switch hugetlbfs to ->read_iter() ... and fix the case when the area we are asked to read crosses a hugepage boundary Signed-off-by: Al Viro --- fs/hugetlbfs/inode.c | 92 +++++++++++++++++++--------------------------------- 1 file changed, 34 insertions(+), 58 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..5291c14ee6b8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -179,42 +180,33 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } #endif -static int +static size_t hugetlbfs_read_actor(struct page *page, unsigned long offset, - char __user *buf, unsigned long count, - unsigned long size) + struct iov_iter *to, unsigned long size) { - char *kaddr; - unsigned long left, copied = 0; + size_t copied = 0; int i, chunksize; - if (size > count) - size = count; - /* Find which 4k chunk and offset with in that chunk */ i = offset >> PAGE_CACHE_SHIFT; offset = offset & ~PAGE_CACHE_MASK; while (size) { + size_t n; chunksize = PAGE_CACHE_SIZE; if (offset) chunksize -= offset; if (chunksize > size) chunksize = size; - kaddr = kmap(&page[i]); - left = __copy_to_user(buf, kaddr + offset, chunksize); - kunmap(&page[i]); - if (left) { - copied += (chunksize - left); - break; - } + n = copy_page_to_iter(&page[i], offset, chunksize, to); + copied += n; + if (n != chunksize) + return copied; offset = 0; size -= chunksize; - buf += chunksize; - copied += chunksize; i++; } - return copied ? copied : -EFAULT; + return copied; } /* @@ -222,39 +214,34 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, * data. Its *very* similar to do_generic_mapping_read(), we can't use that * since it has PAGE_CACHE_SIZE assumptions. 
*/ -static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, - size_t len, loff_t *ppos) +static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct hstate *h = hstate_file(filp); - struct address_space *mapping = filp->f_mapping; + struct file *file = iocb->ki_filp; + struct hstate *h = hstate_file(file); + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> huge_page_shift(h); - unsigned long offset = *ppos & ~huge_page_mask(h); + unsigned long index = iocb->ki_pos >> huge_page_shift(h); + unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; - /* validate length */ - if (len == 0) - goto out; - - for (;;) { + while (iov_iter_count(to)) { struct page *page; - unsigned long nr, ret; - int ra; + size_t nr, copied; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) - goto out; + break; end_index = (isize - 1) >> huge_page_shift(h); - if (index >= end_index) { - if (index > end_index) - goto out; + if (index > end_index) + break; + if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) - goto out; + break; } nr = nr - offset; @@ -265,39 +252,27 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ - ret = len < nr ? len : nr; - if (clear_user(buf, ret)) - ra = -EFAULT; - else - ra = 0; + copied = iov_iter_zero(nr, to); } else { unlock_page(page); /* * We have the page, copy it to user space buffer. 
*/ - ra = hugetlbfs_read_actor(page, offset, buf, len, nr); - ret = ra; + copied = hugetlbfs_read_actor(page, offset, to, nr); page_cache_release(page); } - if (ra < 0) { - if (retval == 0) - retval = ra; - goto out; + offset += copied; + retval += copied; + if (copied != nr && iov_iter_count(to)) { + if (!retval) + retval = -EFAULT; + break; } - - offset += ret; - retval += ret; - len -= ret; index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); - - /* short read or no more work */ - if ((ret != nr) || (len == 0)) - break; } -out: - *ppos = ((loff_t)index << huge_page_shift(h)) + offset; + iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } @@ -721,7 +696,8 @@ static void init_once(void *foo) } const struct file_operations hugetlbfs_file_operations = { - .read = hugetlbfs_read, + .read = new_sync_read, + .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, -- cgit v1.2.3 From 5d5d568975307877e9195f5305f4240e506a2807 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Apr 2015 15:41:18 -0400 Subject: make new_sync_{read,write}() static All places outside of core VFS that checked ->read and ->write for being NULL or called the methods directly are gone now, so NULL {read,write} with non-NULL {read,write}_iter will do the right thing in all cases. 
Signed-off-by: Al Viro --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5291c14ee6b8..8c2dad629e7c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -696,7 +696,6 @@ static void init_once(void *foo) } const struct file_operations hugetlbfs_file_operations = { - .read = new_sync_read, .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, -- cgit v1.2.3 From b9ea25152e56365ce149b9a39637cd7a16eec556 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 14 Apr 2015 15:45:27 -0700 Subject: page_writeback: clean up mess around cancel_dirty_page() This patch replaces cancel_dirty_page() with a helper function account_page_cleaned() which only updates counters. It's called from truncate_complete_page() and from try_to_free_buffers() (hack for ext3). Page is locked in both cases, page-lock protects against concurrent dirtiers: see commit 2d6d7f982846 ("mm: protect set_page_dirty() from ongoing truncation"). delete_from_page_cache() shouldn't be called for dirty pages, they must be handled by caller (either written or truncated). This patch treats final dirty accounting fixup at the end of __delete_from_page_cache() as a debug check and adds WARN_ON_ONCE() around it. If something removes dirty pages without proper handling that might be a bug and unwritten data might be lost. Hugetlbfs has no dirty pages accounting, ClearPageDirty() is enough here. cancel_dirty_page() in nfs_wb_page_cancel() is redundant. This is a helper for nfs_invalidate_page() and it's called only in case of complete invalidation. 
The mess was started in v2.6.20 after commits 46d2277c796f ("Clean up and make try_to_free_buffers() not race with dirty pages") and 3e67c0987d75 ("truncate: clear page dirtiness before running try_to_free_buffers()"). The first was reverted right in v2.6.20 in commit ecdfc9787fe5 ("Resurrect 'try_to_free_buffers()' VM hackery"), the second in v2.6.25 in commit a2b345642f53 ("Fix dirty page accounting leak with ext3 data=journal"). Custom fixes were introduced between these points. NFS in v2.6.23, commit 1b3b4a1a2deb ("NFS: Fix a write request leak in nfs_invalidate_page()"). Kludge in __delete_from_page_cache() in v2.6.24, commit 3a6927906f1b ("Do dirty page accounting when removing a page from the page cache"). Since v2.6.25 all of them are redundant. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Konstantin Khlebnikov Cc: Tejun Heo Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..db76cec3ce21 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, static void truncate_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? */0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } -- cgit v1.2.3 From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:42 -0700 Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly Make 'min_size=' be an option when mounting a hugetlbfs. This option takes the same value as the 'size' option. min_size can be specified without specifying size. If both are specified, min_size must be less than or equal to size else the mount will fail. 
If min_size is specified, then at mount time an attempt is made to reserve min_size pages. If the reservation fails, the mount fails. At umount time, the reserved pages are released. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 90 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 19 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index db76cec3ce21..3a8f12762821 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -47,9 +47,10 @@ struct hugetlbfs_config { kuid_t uid; kgid_t gid; umode_t mode; - long nr_blocks; + long max_hpages; long nr_inodes; struct hstate *hstate; + long min_hpages; }; struct hugetlbfs_inode_info { @@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, + Opt_pagesize, Opt_min_size, Opt_err, }; @@ -78,6 +79,7 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_pagesize, "pagesize=%s"}, + {Opt_min_size, "min_size=%s"}, {Opt_err, NULL}, }; @@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = { .show_options = generic_show_options, }; +enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +/* + * Convert size option passed from command line to number of huge pages + * in the pool specified by hstate. Size option could be in bytes + * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). 
+ */ +static long long +hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, + int val_type) +{ + if (val_type == NO_SIZE) + return -1; + + if (val_type == SIZE_PERCENT) { + size_opt <<= huge_page_shift(h); + size_opt *= h->max_huge_pages; + do_div(size_opt, 100); + } + + size_opt >>= huge_page_shift(h); + return size_opt; +} + static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; - unsigned long long size = 0; - enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + unsigned long long max_size_opt = 0, min_size_opt = 0; + int max_val_type = NO_SIZE, min_val_type = NO_SIZE; if (!options) return 0; @@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; - size = memparse(args[0].from, &rest); - setsize = SIZE_STD; + max_size_opt = memparse(args[0].from, &rest); + max_val_type = SIZE_STD; if (*rest == '%') - setsize = SIZE_PERCENT; + max_val_type = SIZE_PERCENT; break; } @@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } + case Opt_min_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + min_size_opt = memparse(args[0].from, &rest); + min_val_type = SIZE_STD; + if (*rest == '%') + min_val_type = SIZE_PERCENT; + break; + } + default: pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; @@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } } - /* Do size after hstate is set up */ - if (setsize > NO_SIZE) { - struct hstate *h = pconfig->hstate; - if (setsize == SIZE_PERCENT) { - size <<= huge_page_shift(h); - size *= h->max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> huge_page_shift(h)); + /* + * Use huge page pool size (in hstate) to convert 
the size + * options to number of huge pages. If NO_SIZE, -1 is returned. + */ + pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + max_size_opt, max_val_type); + pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + min_size_opt, min_val_type); + + /* + * If max_size was specified, then min_size must be smaller + */ + if (max_val_type > NO_SIZE && + pconfig->min_hpages > pconfig->max_hpages) { + pr_err("minimum size can not be greater than maximum size\n"); + return -EINVAL; } return 0; @@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - config.nr_blocks = -1; /* No limit on size by default */ + config.max_hpages = -1; /* No limit on size by default */ config.nr_inodes = -1; /* No limit on number of inodes by default */ config.uid = current_fsuid(); config.gid = current_fsgid(); config.mode = 0755; config.hstate = &default_hstate; + config.min_hpages = -1; /* No default minimum size */ ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sbinfo->spool = NULL; - if (config.nr_blocks != -1) { - sbinfo->spool = hugepage_new_subpool(config.nr_blocks); + /* + * Allocate and initialize subpool if maximum or minimum size is + * specified. Any needed reservations (for minimim size) are taken + * taken when the subpool is created. + */ + if (config.max_hpages != -1 || config.min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(config.hstate, + config.max_hpages, + config.min_hpages); if (!sbinfo->spool) goto out_free; } -- cgit v1.2.3