From 5c7d0f49cf1492866fa619af4538f56938abe07d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2016 15:16:07 -0700 Subject: devpts: clean up interface to pty drivers commit 67245ff332064c01b760afa7a384ccda024bfd24 upstream. This gets rid of the horrible notion of having that struct inode *ptmx_inode be the linchpin of the interface between the pty code and devpts. By de-emphasizing the ptmx inode, a lot of things actually get cleaner, and we will have a much saner way forward. In particular, this will allow us to associate with any particular devpts instance at open-time, and not be artificially tied to one particular ptmx inode. The patch itself is actually fairly straightforward, and apart from some locking and return path cleanups it's pretty mechanical: - the interfaces that devpts exposes all take "struct pts_fs_info *" instead of "struct inode *ptmx_inode" now. NOTE! The "struct pts_fs_info" thing is a completely opaque structure as far as the pty driver is concerned: it's still declared entirely internally to devpts. So the pty code can't actually access it in any way, just pass it as a "cookie" to the devpts code. - the "look up the pts fs info" is now a single clear operation, that also does the reference count increment on the pts superblock. So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get ref" operation (devpts_get_ref(inode)), along with a "put ref" op (devpts_put_ref()). - the pty master "tty->driver_data" field now contains the pts_fs_info, not the ptmx inode. - because we don't care about the ptmx inode any more as some kind of base index, the ref counting can now drop the inode games - it just gets the ref on the superblock. - the pts_fs_info now has a back-pointer to the super_block. That's so that we can easily look up the information we actually need. Although quite often, the pts fs info was actually all we wanted, and not having to look it up based on some magical inode makes things more straightforward. In particular, now that "devpts_get_ref(inode)" operation should really be the *only* place we need to look up what devpts instance we're associated with, and we do it exactly once, at ptmx_open() time. The other side of this is that one ptmx node could now be associated with multiple different devpts instances - you could have a single /dev/ptmx node, and then have multiple mount namespaces with their own instances of devpts mounted on /dev/pts/. And that's all perfectly sane in a model where we just look up the pts instance at open time. This will eventually allow us to get rid of our odd single-vs-multiple pts instance model, but this patch in itself changes no semantics, only an internal binding model. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds Cc: Francesco Ruggeri Cc: "Herton R. Krzesinski" Signed-off-by: Greg Kroah-Hartman --- include/linux/devpts_fs.h | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index e0ee0b3000b2..358a4db72a27 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,38 +15,24 @@ #include +struct pts_fs_info; + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); -void devpts_kill_index(struct inode *ptmx_inode, int idx); -void devpts_add_ref(struct inode *ptmx_inode); -void devpts_del_ref(struct inode *ptmx_inode); +/* Look up a pts fs info and get a ref to it */ +struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); +void devpts_put_ref(struct pts_fs_info *); + +int devpts_new_index(struct pts_fs_info *); +void devpts_kill_index(struct pts_fs_info *, int); + /* mknod in devpts */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv); +struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); /* get private structure */ void *devpts_get_priv(struct inode *pts_inode); /* unlink */ void devpts_pty_kill(struct inode *inode); -#else - -/* Dummy stubs in the no-pty case */ -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } -static inline void devpts_add_ref(struct inode *ptmx_inode) { } -static inline void devpts_del_ref(struct inode *ptmx_inode) { } -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, - dev_t device, int index, void *priv) -{ - return ERR_PTR(-EINVAL); -} -static inline void *devpts_get_priv(struct inode *pts_inode) -{ - return NULL; -} -static inline void devpts_pty_kill(struct inode *inode) { } - #endif -- cgit v1.2.3 From 8627c7750a66a46d56d3564e1e881aa53764497c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jul 2016 15:44:57 -0700 Subject: mm: memcontrol: fix cgroup creation failure after many small jobs commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream. The memory controller has quite a bit of state that usually outlives the cgroup and pins its CSS until said state disappears. At the same time it imposes a 16-bit limit on the CSS ID space to economically store IDs in the wild. Consequently, when we use cgroups to contain frequent but small and short-lived jobs that leave behind some page cache, we quickly run into the 64k limitations of outstanding CSSs. Creating a new cgroup fails with -ENOSPC while there are only a few, or even no user-visible cgroups in existence. Although pinning CSSs past cgroup removal is common, there are only two instances that actually need an ID after a cgroup is deleted: cache shadow entries and swapout records. Cache shadow entries reference the ID weakly and can deal with the CSS having disappeared when it's looked up later. They pose no hurdle. Swap-out records do need to pin the css to hierarchically attribute swapins after the cgroup has been deleted; though the only pages that remain swapped out after offlining are tmpfs/shmem pages. And those references are under the user's control, so they are manageable. This patch introduces a private 16-bit memcg ID and switches swap and cache shadow entries over to using that. This ID can then be recycled after offlining when the CSS remains pinned only by objects that don't specifically need it. This script demonstrates the problem by faulting one cache page in a new cgroup and deleting it again: set -e mkdir -p pages for x in `seq 128000`; do [ $((x % 1000)) -eq 0 ] && echo $x mkdir /cgroup/foo echo $$ >/cgroup/foo/cgroup.procs echo trex >pages/$x echo $$ >/cgroup/cgroup.procs rmdir /cgroup/foo done When run on an unpatched kernel, we eventually run out of possible IDs even though there are no visible cgroups: [root@ham ~]# ./cssidstress.sh [...] 65000 mkdir: cannot create directory '/cgroup/foo': No space left on device After this patch, the IDs get released upon cgroup destruction and the cache and css objects get released once memory reclaim kicks in. [hannes@cmpxchg.org: init the IDR] Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: John Garcia Reviewed-by: Vladimir Davydov Acked-by: Tejun Heo Cc: Nikolay Borisov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- include/linux/memcontrol.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cd0e2413c358..435fd8426b8a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -174,6 +174,11 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -183,6 +188,9 @@ struct mem_cgroup_thresholds { struct mem_cgroup { struct cgroup_subsys_state css; + /* Private memcg ID. Used to ID objects that outlive the cgroup */ + struct mem_cgroup_id id; + /* Accounted resources */ struct page_counter memory; struct page_counter memsw; -- cgit v1.2.3