From 34d0640e268923bb1fb9d244a047cdfcd3f77909 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 3 Apr 2015 11:31:35 -0400
Subject: switch hugetlbfs to ->read_iter()

... and fix the case when the area we are asked to read crosses
a hugepage boundary

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c | 92 +++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 58 deletions(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..5291c14ee6b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/security.h>
 #include <linux/magic.h>
 #include <linux/migrate.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -179,42 +180,33 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 }
 #endif
 
-static int
+static size_t
 hugetlbfs_read_actor(struct page *page, unsigned long offset,
-			char __user *buf, unsigned long count,
-			unsigned long size)
+			struct iov_iter *to, unsigned long size)
 {
-	char *kaddr;
-	unsigned long left, copied = 0;
+	size_t copied = 0;
 	int i, chunksize;
 
-	if (size > count)
-		size = count;
-
 	/* Find which 4k chunk and offset with in that chunk */
 	i = offset >> PAGE_CACHE_SHIFT;
 	offset = offset & ~PAGE_CACHE_MASK;
 
 	while (size) {
+		size_t n;
 		chunksize = PAGE_CACHE_SIZE;
 		if (offset)
 			chunksize -= offset;
 		if (chunksize > size)
 			chunksize = size;
-		kaddr = kmap(&page[i]);
-		left = __copy_to_user(buf, kaddr + offset, chunksize);
-		kunmap(&page[i]);
-		if (left) {
-			copied += (chunksize - left);
-			break;
-		}
+		n = copy_page_to_iter(&page[i], offset, chunksize, to);
+		copied += n;
+		if (n != chunksize)
+			return copied;
 		offset = 0;
 		size -= chunksize;
-		buf += chunksize;
-		copied += chunksize;
 		i++;
 	}
-	return copied ? copied : -EFAULT;
+	return copied;
 }
 
 /*
@@ -222,39 +214,34 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
  * data. Its *very* similar to do_generic_mapping_read(), we can't use that
  * since it has PAGE_CACHE_SIZE assumptions.
  */
-static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
-			      size_t len, loff_t *ppos)
+static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-	struct hstate *h = hstate_file(filp);
-	struct address_space *mapping = filp->f_mapping;
+	struct file *file = iocb->ki_filp;
+	struct hstate *h = hstate_file(file);
+	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned long index = *ppos >> huge_page_shift(h);
-	unsigned long offset = *ppos & ~huge_page_mask(h);
+	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
+	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
 	unsigned long end_index;
 	loff_t isize;
 	ssize_t retval = 0;
 
-	/* validate length */
-	if (len == 0)
-		goto out;
-
-	for (;;) {
+	while (iov_iter_count(to)) {
 		struct page *page;
-		unsigned long nr, ret;
-		int ra;
+		size_t nr, copied;
 
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = huge_page_size(h);
 		isize = i_size_read(inode);
 		if (!isize)
-			goto out;
+			break;
 		end_index = (isize - 1) >> huge_page_shift(h);
-		if (index >= end_index) {
-			if (index > end_index)
-				goto out;
+		if (index > end_index)
+			break;
+		if (index == end_index) {
 			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
 			if (nr <= offset)
-				goto out;
+				break;
 		}
 		nr = nr - offset;
 
@@ -265,39 +252,27 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			 * We have a HOLE, zero out the user-buffer for the
 			 * length of the hole or request.
 			 */
-			ret = len < nr ? len : nr;
-			if (clear_user(buf, ret))
-				ra = -EFAULT;
-			else
-				ra = 0;
+			copied = iov_iter_zero(nr, to);
 		} else {
 			unlock_page(page);
 
 			/*
 			 * We have the page, copy it to user space buffer.
 			 */
-			ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
-			ret = ra;
+			copied = hugetlbfs_read_actor(page, offset, to, nr);
 			page_cache_release(page);
 		}
-		if (ra < 0) {
-			if (retval == 0)
-				retval = ra;
-			goto out;
+		offset += copied;
+		retval += copied;
+		if (copied != nr && iov_iter_count(to)) {
+			if (!retval)
+				retval = -EFAULT;
+			break;
 		}
-
-		offset += ret;
-		retval += ret;
-		len -= ret;
 		index += offset >> huge_page_shift(h);
 		offset &= ~huge_page_mask(h);
-
-		/* short read or no more work */
-		if ((ret != nr) || (len == 0))
-			break;
 	}
-out:
-	*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
+	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
 	return retval;
 }
 
@@ -721,7 +696,8 @@ static void init_once(void *foo)
 }
 
 const struct file_operations hugetlbfs_file_operations = {
-	.read			= hugetlbfs_read,
+	.read			= new_sync_read,
+	.read_iter		= hugetlbfs_read_iter,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
-- 
cgit v1.2.3


From 5d5d568975307877e9195f5305f4240e506a2807 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 3 Apr 2015 15:41:18 -0400
Subject: make new_sync_{read,write}() static

All places outside of core VFS that checked ->read and ->write for being NULL or
called the methods directly are gone now, so NULL {read,write} with non-NULL
{read,write}_iter will do the right thing in all cases.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hugetlbfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5291c14ee6b8..8c2dad629e7c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -696,7 +696,6 @@ static void init_once(void *foo)
 }
 
 const struct file_operations hugetlbfs_file_operations = {
-	.read			= new_sync_read,
 	.read_iter		= hugetlbfs_read_iter,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
-- 
cgit v1.2.3


From b9ea25152e56365ce149b9a39637cd7a16eec556 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Tue, 14 Apr 2015 15:45:27 -0700
Subject: page_writeback: clean up mess around cancel_dirty_page()

This patch replaces cancel_dirty_page() with a helper function
account_page_cleaned() which only updates counters.  It's called from
truncate_complete_page() and from try_to_free_buffers() (hack for ext3).
Page is locked in both cases, page-lock protects against concurrent
dirtiers: see commit 2d6d7f982846 ("mm: protect set_page_dirty() from
ongoing truncation").

Delete_from_page_cache() shouldn't be called for dirty pages, they must
be handled by caller (either written or truncated).  This patch treats
final dirty accounting fixup at the end of __delete_from_page_cache() as
a debug check and adds WARN_ON_ONCE() around it.  If something removes
dirty pages without proper handling that might be a bug and unwritten
data might be lost.

Hugetlbfs has no dirty pages accounting, ClearPageDirty() is enough
here.

cancel_dirty_page() in nfs_wb_page_cancel() is redundant.  This is
helper for nfs_invalidate_page() and it's called only in case complete
invalidation.

The mess was started in v2.6.20 after commits 46d2277c796f ("Clean up
and make try_to_free_buffers() not race with dirty pages") and
3e67c0987d75 ("truncate: clear page dirtiness before running
try_to_free_buffers()") first was reverted right in v2.6.20 in commit
ecdfc9787fe5 ("Resurrect 'try_to_free_buffers()' VM hackery"), second in
v2.6.25 commit a2b345642f53 ("Fix dirty page accounting leak with ext3
data=journal").

Custom fixes were introduced between these points.  NFS in v2.6.23, commit
1b3b4a1a2deb ("NFS: Fix a write request leak in nfs_invalidate_page()").
Kludge in __delete_from_page_cache() in v2.6.24, commit 3a6927906f1b ("Do
dirty page accounting when removing a page from the page cache").  Since
v2.6.25 all of them are redundant.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..db76cec3ce21 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -319,7 +319,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
 
 static void truncate_huge_page(struct page *page)
 {
-	cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
+	ClearPageDirty(page);
 	ClearPageUptodate(page);
 	delete_from_page_cache(page);
 }
-- 
cgit v1.2.3


From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 15 Apr 2015 16:13:42 -0700
Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly

Make 'min_size=<value>' be an option when mounting a hugetlbfs.  This
option takes the same value as the 'size' option.  min_size can be
specified without specifying size.  If both are specified, min_size must
be less that or equal to size else the mount will fail.  If min_size is
specified, then at mount time an attempt is made to reserve min_size
pages.  If the reservation fails, the mount fails.  At umount time, the
reserved pages are released.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 90 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 71 insertions(+), 19 deletions(-)

(limited to 'fs/hugetlbfs/inode.c')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index db76cec3ce21..3a8f12762821 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -47,9 +47,10 @@ struct hugetlbfs_config {
 	kuid_t   uid;
 	kgid_t   gid;
 	umode_t mode;
-	long	nr_blocks;
+	long	max_hpages;
 	long	nr_inodes;
 	struct hstate *hstate;
+	long    min_hpages;
 };
 
 struct hugetlbfs_inode_info {
@@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group;
 enum {
 	Opt_size, Opt_nr_inodes,
 	Opt_mode, Opt_uid, Opt_gid,
-	Opt_pagesize,
+	Opt_pagesize, Opt_min_size,
 	Opt_err,
 };
 
@@ -78,6 +79,7 @@ static const match_table_t tokens = {
 	{Opt_uid,	"uid=%u"},
 	{Opt_gid,	"gid=%u"},
 	{Opt_pagesize,	"pagesize=%s"},
+	{Opt_min_size,	"min_size=%s"},
 	{Opt_err,	NULL},
 };
 
@@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = {
 	.show_options	= generic_show_options,
 };
 
+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate.  Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+								int val_type)
+{
+	if (val_type == NO_SIZE)
+		return -1;
+
+	if (val_type == SIZE_PERCENT) {
+		size_opt <<= huge_page_shift(h);
+		size_opt *= h->max_huge_pages;
+		do_div(size_opt, 100);
+	}
+
+	size_opt >>= huge_page_shift(h);
+	return size_opt;
+}
+
 static int
 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 {
 	char *p, *rest;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
-	unsigned long long size = 0;
-	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
+	unsigned long long max_size_opt = 0, min_size_opt = 0;
+	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
 
 	if (!options)
 		return 0;
@@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			/* memparse() will accept a K/M/G without a digit */
 			if (!isdigit(*args[0].from))
 				goto bad_val;
-			size = memparse(args[0].from, &rest);
-			setsize = SIZE_STD;
+			max_size_opt = memparse(args[0].from, &rest);
+			max_val_type = SIZE_STD;
 			if (*rest == '%')
-				setsize = SIZE_PERCENT;
+				max_val_type = SIZE_PERCENT;
 			break;
 		}
 
@@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			break;
 		}
 
+		case Opt_min_size: {
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			min_size_opt = memparse(args[0].from, &rest);
+			min_val_type = SIZE_STD;
+			if (*rest == '%')
+				min_val_type = SIZE_PERCENT;
+			break;
+		}
+
 		default:
 			pr_err("Bad mount option: \"%s\"\n", p);
 			return -EINVAL;
@@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 		}
 	}
 
-	/* Do size after hstate is set up */
-	if (setsize > NO_SIZE) {
-		struct hstate *h = pconfig->hstate;
-		if (setsize == SIZE_PERCENT) {
-			size <<= huge_page_shift(h);
-			size *= h->max_huge_pages;
-			do_div(size, 100);
-		}
-		pconfig->nr_blocks = (size >> huge_page_shift(h));
+	/*
+	 * Use huge page pool size (in hstate) to convert the size
+	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
+	 */
+	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						max_size_opt, max_val_type);
+	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						min_size_opt, min_val_type);
+
+	/*
+	 * If max_size was specified, then min_size must be smaller
+	 */
+	if (max_val_type > NO_SIZE &&
+	    pconfig->min_hpages > pconfig->max_hpages) {
+		pr_err("minimum size can not be greater than maximum size\n");
+		return -EINVAL;
 	}
 
 	return 0;
@@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	save_mount_options(sb, data);
 
-	config.nr_blocks = -1; /* No limit on size by default */
+	config.max_hpages = -1; /* No limit on size by default */
 	config.nr_inodes = -1; /* No limit on number of inodes by default */
 	config.uid = current_fsuid();
 	config.gid = current_fsgid();
 	config.mode = 0755;
 	config.hstate = &default_hstate;
+	config.min_hpages = -1; /* No default minimum size */
 	ret = hugetlbfs_parse_options(data, &config);
 	if (ret)
 		return ret;
@@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
 	sbinfo->spool = NULL;
-	if (config.nr_blocks != -1) {
-		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+	/*
+	 * Allocate and initialize subpool if maximum or minimum size is
+	 * specified.  Any needed reservations (for minimim size) are taken
+	 * taken when the subpool is created.
+	 */
+	if (config.max_hpages != -1 || config.min_hpages != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.hstate,
+							config.max_hpages,
+							config.min_hpages);
 		if (!sbinfo->spool)
 			goto out_free;
 	}
-- 
cgit v1.2.3