From 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 3 Feb 2014 19:13:49 -0800
Subject: [PATCH 01/11] namespaces: Use task_lock and not rcu to protect
 nsproxy

The synchronous syncrhonize_rcu in switch_task_namespaces makes setns
a sufficiently expensive system call that people have complained.

Upon inspect nsproxy no longer needs rcu protection for remote reads.
remote reads are rare.  So optimize for same process reads and write
by switching using rask_lock instead.

This yields a simpler to understand lock, and a faster setns system call.

In particular this fixes a performance regression observed
by Rafael David Tinoco <rafael.tinoco@canonical.com>.

This is effectively a revert of Pavel Emelyanov's commit
cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter
from 2007.  The race this originialy fixed no longer exists as
do_notify_parent uses task_active_pid_ns(parent) instead of
parent->nsproxy.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c           |  6 +++---
 fs/proc/proc_net.c       |  4 +++-
 fs/proc_namespace.c      |  8 +++-----
 include/linux/nsproxy.h  | 16 ++++++----------
 ipc/namespace.c          |  6 +++---
 kernel/nsproxy.c         | 15 ++++-----------
 kernel/utsname.c         |  6 +++---
 net/core/net_namespace.c | 10 ++++++----
 8 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41cd887..7187d01329c3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task)
 	struct mnt_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy) {
 		ns = nsproxy->mnt_ns;
 		get_mnt_ns(ns);
 	}
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4677bb7dc7c2..a63af3e0a612 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
 	rcu_read_lock();
 	task = pid_task(proc_pid(dir), PIDTYPE_PID);
 	if (task != NULL) {
-		ns = task_nsproxy(task);
+		task_lock(task);
+		ns = task->nsproxy;
 		if (ns != NULL)
 			net = get_net(ns->net_ns);
+		task_unlock(task);
 	}
 	rcu_read_unlock();
 
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 1a81373947f3..73ca1740d839 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	if (!task)
 		goto err;
 
-	rcu_read_lock();
-	nsp = task_nsproxy(task);
+	task_lock(task);
+	nsp = task->nsproxy;
 	if (!nsp || !nsp->mnt_ns) {
-		rcu_read_unlock();
+		task_unlock(task);
 		put_task_struct(task);
 		goto err;
 	}
 	ns = nsp->mnt_ns;
 	get_mnt_ns(ns);
-	rcu_read_unlock();
-	task_lock(task);
 	if (!task->fs) {
 		task_unlock(task);
 		put_task_struct(task);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index b4ec59d159ac..35fa08fd7739 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy;
  * the namespaces access rules are:
  *
  *  1. only current task is allowed to change tsk->nsproxy pointer or
- *     any pointer on the nsproxy itself
+ *     any pointer on the nsproxy itself.  Current must hold the task_lock
+ *     when changing tsk->nsproxy.
  *
  *  2. when accessing (i.e. reading) current task's namespaces - no
  *     precautions should be taken - just dereference the pointers
  *
  *  3. the access to other task namespaces is performed like this
- *     rcu_read_lock();
- *     nsproxy = task_nsproxy(tsk);
+ *     task_lock(task);
+ *     nsproxy = task->nsproxy;
  *     if (nsproxy != NULL) {
  *             / *
  *               * work with the namespaces here
  *               * e.g. get the reference on one of them
  *               * /
  *     } / *
- *         * NULL task_nsproxy() means that this task is
+ *         * NULL task->nsproxy means that this task is
  *         * almost dead (zombie)
  *         * /
- *     rcu_read_unlock();
+ *     task_unlock(task);
  *
  */
 
-static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
-{
-	return rcu_dereference(tsk->nsproxy);
-}
-
 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59451c1e214d..b54468e48e32 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task)
 	struct ipc_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy)
 		ns = get_ipc_ns(nsproxy->ipc_ns);
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 
 	might_sleep();
 
+	task_lock(p);
 	ns = p->nsproxy;
+	p->nsproxy = new;
+	task_unlock(p);
 
-	rcu_assign_pointer(p->nsproxy, new);
-
-	if (ns && atomic_dec_and_test(&ns->count)) {
-		/*
-		 * wait for others to get what they want from this nsproxy.
-		 *
-		 * cannot release this nsproxy via the call_rcu() since
-		 * put_mnt_ns() will want to sleep
-		 */
-		synchronize_rcu();
+	if (ns && atomic_dec_and_test(&ns->count))
 		free_nsproxy(ns);
-	}
 }
 
 void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
 	struct uts_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy) {
 		ns = nsproxy->uts_ns;
 		get_uts_ns(ns);
 	}
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 85b62691f4f2..7c6b51a58968 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
 	tsk = find_task_by_vpid(pid);
 	if (tsk) {
 		struct nsproxy *nsproxy;
-		nsproxy = task_nsproxy(tsk);
+		task_lock(tsk);
+		nsproxy = tsk->nsproxy;
 		if (nsproxy)
 			net = get_net(nsproxy->net_ns);
+		task_unlock(tsk);
 	}
 	rcu_read_unlock();
 	return net;
@@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
 	struct net *net = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy)
 		net = get_net(nsproxy->net_ns);
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return net;
 }

From a6138db815df5ee542d848318e5dae681590fccd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jul 2014 16:26:53 -0700
Subject: [PATCH 02/11] mnt: Only change user settable mount flags in remount

Kenton Varda <kenton@sandstorm.io> discovered that by remounting a
read-only bind mount read-only in a user namespace the
MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user
to the remount a read-only mount read-write.

Correct this by replacing the mask of mount flags to preserve
with a mask of mount flags that may be changed, and preserve
all others.   This ensures that any future bugs with this mask and
remount will fail in an easy to detect way where new mount flags
simply won't change.

Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c        | 2 +-
 include/linux/mount.h | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7187d01329c3..cb40449ea0df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1937,7 +1937,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
 		lock_mount_hash();
-		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
 		touch_mnt_namespace(mnt->mnt_ns);
 		unlock_mount_hash();
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 839bac270904..b637a89e1fae 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -42,7 +42,9 @@ struct mnt_namespace;
  * flag, consider how it interacts with shared mounts.
  */
 #define MNT_SHARED_MASK	(MNT_UNBINDABLE)
-#define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
+#define MNT_USER_SETTABLE_MASK  (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
+				 | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
+				 | MNT_READONLY)
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
 			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)

From 07b645589dcda8b7a5249e096fece2a67556f0f4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jul 2014 17:10:56 -0700
Subject: [PATCH 03/11] mnt: Move the test for MNT_LOCK_READONLY from
 change_mount_flags into do_remount

There are no races as locked mount flags are guaranteed to never change.

Moving the test into do_remount makes it more visible, and ensures all
filesystem remounts pass the MNT_LOCK_READONLY permission check.  This
second case is not an issue today as filesystem remounts are guarded
by capable(CAP_DAC_ADMIN) and thus will always fail in less privileged
mount namespaces, but it could become an issue in the future.

Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index cb40449ea0df..1105a577a14f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1896,9 +1896,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	if (readonly_request == __mnt_is_readonly(mnt))
 		return 0;
 
-	if (mnt->mnt_flags & MNT_LOCK_READONLY)
-		return -EPERM;
-
 	if (readonly_request)
 		error = mnt_make_readonly(real_mount(mnt));
 	else
@@ -1924,6 +1921,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	/* Don't allow changing of locked mnt flags.
+	 *
+	 * No locks need to be held here while testing the various
+	 * MNT_LOCK flags because those flags can never be cleared
+	 * once they are set.
+	 */
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+	    !(mnt_flags & MNT_READONLY)) {
+		return -EPERM;
+	}
 	err = security_sb_remount(sb, data);
 	if (err)
 		return err;

From 9566d6742852c527bf5af38af5cbb878dad75705 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jul 2014 17:26:07 -0700
Subject: [PATCH 04/11] mnt: Correct permission checks in do_remount

While invesgiating the issue where in "mount --bind -oremount,ro ..."
would result in later "mount --bind -oremount,rw" succeeding even if
the mount started off locked I realized that there are several
additional mount flags that should be locked and are not.

In particular MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, and the atime
flags in addition to MNT_READONLY should all be locked.  These
flags are all per superblock, can all be changed with MS_BIND,
and should not be changable if set by a more privileged user.

The following additions to the current logic are added in this patch.
- nosuid may not be clearable by a less privileged user.
- nodev  may not be clearable by a less privielged user.
- noexec may not be clearable by a less privileged user.
- atime flags may not be changeable by a less privileged user.

The logic with atime is that always setting atime on access is a
global policy and backup software and auditing software could break if
atime bits are not updated (when they are configured to be updated),
and serious performance degradation could result (DOS attack) if atime
updates happen when they have been explicitly disabled.  Therefore an
unprivileged user should not be able to mess with the atime bits set
by a more privileged user.

The additional restrictions are implemented with the addition of
MNT_LOCK_NOSUID, MNT_LOCK_NODEV, MNT_LOCK_NOEXEC, and MNT_LOCK_ATIME
mnt flags.

Taken together these changes and the fixes for MNT_LOCK_READONLY
should make it safe for an unprivileged user to create a user
namespace and to call "mount --bind -o remount,... ..." without
the danger of mount flags being changed maliciously.

Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c        | 36 +++++++++++++++++++++++++++++++++---
 include/linux/mount.h |  5 +++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 1105a577a14f..dd9c93b5a9d5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -890,8 +890,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
 	/* Don't allow unprivileged users to change mount flags */
-	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
-		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+	if (flag & CL_UNPRIVILEGED) {
+		mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
+
+		if (mnt->mnt.mnt_flags & MNT_READONLY)
+			mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
+		if (mnt->mnt.mnt_flags & MNT_NODEV)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
+
+		if (mnt->mnt.mnt_flags & MNT_NOSUID)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
+
+		if (mnt->mnt.mnt_flags & MNT_NOEXEC)
+			mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
+	}
 
 	/* Don't allow unprivileged users to reveal what is under a mount */
 	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
@@ -1931,6 +1944,23 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	    !(mnt_flags & MNT_READONLY)) {
 		return -EPERM;
 	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+	    !(mnt_flags & MNT_NODEV)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
+	    !(mnt_flags & MNT_NOSUID)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
+	    !(mnt_flags & MNT_NOEXEC)) {
+		return -EPERM;
+	}
+	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+	    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
+		return -EPERM;
+	}
+
 	err = security_sb_remount(sb, data);
 	if (err)
 		return err;
@@ -2129,7 +2159,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 		 */
 		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
 			flags |= MS_NODEV;
-			mnt_flags |= MNT_NODEV;
+			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
 		}
 	}
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index b637a89e1fae..b0c1e6574e7f 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -45,12 +45,17 @@ struct mnt_namespace;
 #define MNT_USER_SETTABLE_MASK  (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
 				 | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
 				 | MNT_READONLY)
+#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
 			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
 
 #define MNT_INTERNAL	0x4000
 
+#define MNT_LOCK_ATIME		0x040000
+#define MNT_LOCK_NOEXEC		0x080000
+#define MNT_LOCK_NOSUID		0x100000
+#define MNT_LOCK_NODEV		0x200000
 #define MNT_LOCK_READONLY	0x400000
 #define MNT_LOCKED		0x800000
 #define MNT_DOOMED		0x1000000

From ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 28 Jul 2014 17:36:04 -0700
Subject: [PATCH 05/11] mnt: Change the default remount atime from relatime to
 the existing value

Since March 2009 the kernel has treated the state that if no
MS_..ATIME flags are passed then the kernel defaults to relatime.

Defaulting to relatime instead of the existing atime state during a
remount is silly, and causes problems in practice for people who don't
specify any MS_...ATIME flags and to get the default filesystem atime
setting.  Those users may encounter a permission error because the
default atime setting does not work.

A default that does not work and causes permission problems is
ridiculous, so preserve the existing value to have a default
atime setting that is always guaranteed to work.

Using the default atime setting in this way is particularly
interesting for applications built to run in restricted userspace
environments without /proc mounted, as the existing atime mount
options of a filesystem can not be read from /proc/mounts.

In practice this fixes user space that uses the default atime
setting on remount that are broken by the permission checks
keeping less privileged users from changing more privileged users
atime settings.

Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index dd9c93b5a9d5..7886176232c1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2473,6 +2473,14 @@ long do_mount(const char *dev_name, const char *dir_name,
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
+	/* The default atime for remount is preservation */
+	if ((flags & MS_REMOUNT) &&
+	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
+		       MS_STRICTATIME)) == 0)) {
+		mnt_flags &= ~MNT_ATIME_MASK;
+		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
+	}
+
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME);

From db181ce011e3c033328608299cd6fac06ea50130 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 29 Jul 2014 15:50:44 -0700
Subject: [PATCH 06/11] mnt: Add tests for unprivileged remount cases that have
 found to be faulty

Kenton Varda <kenton@sandstorm.io> discovered that by remounting a
read-only bind mount read-only in a user namespace the
MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user
to the remount a read-only mount read-write.

Upon review of the code in remount it was discovered that the code allowed
nosuid, noexec, and nodev to be cleared.  It was also discovered that
the code was allowing the per mount atime flags to be changed.

The first naive patch to fix these issues contained the flaw that using
default atime settings when remounting a filesystem could be disallowed.

To avoid this problems in the future add tests to ensure unprivileged
remounts are succeeding and failing at the appropriate times.

Cc: stable@vger.kernel.org
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/mount/Makefile        |  17 ++
 .../mount/unprivileged-remount-test.c         | 242 ++++++++++++++++++
 3 files changed, 260 insertions(+)
 create mode 100644 tools/testing/selftests/mount/Makefile
 create mode 100644 tools/testing/selftests/mount/unprivileged-remount-test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index e66e710cc595..0a8a9db43d34 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += efivarfs
 TARGETS += kcmp
 TARGETS += memory-hotplug
 TARGETS += mqueue
+TARGETS += mount
 TARGETS += net
 TARGETS += ptrace
 TARGETS += timers
diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile
new file mode 100644
index 000000000000..337d853c2b72
--- /dev/null
+++ b/tools/testing/selftests/mount/Makefile
@@ -0,0 +1,17 @@
+# Makefile for mount selftests.
+
+all: unprivileged-remount-test
+
+unprivileged-remount-test: unprivileged-remount-test.c
+	gcc -Wall -O2 unprivileged-remount-test.c -o unprivileged-remount-test
+
+# Allow specific tests to be selected.
+test_unprivileged_remount: unprivileged-remount-test
+	@if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi
+
+run_tests: all test_unprivileged_remount
+
+clean:
+	rm -f unprivileged-remount-test
+
+.PHONY: all test_unprivileged_remount
diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
new file mode 100644
index 000000000000..1b3ff2fda4d0
--- /dev/null
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -0,0 +1,242 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifndef CLONE_NEWNS
+# define CLONE_NEWNS 0x00020000
+#endif
+#ifndef CLONE_NEWUTS
+# define CLONE_NEWUTS 0x04000000
+#endif
+#ifndef CLONE_NEWIPC
+# define CLONE_NEWIPC 0x08000000
+#endif
+#ifndef CLONE_NEWNET
+# define CLONE_NEWNET 0x40000000
+#endif
+#ifndef CLONE_NEWUSER
+# define CLONE_NEWUSER 0x10000000
+#endif
+#ifndef CLONE_NEWPID
+# define CLONE_NEWPID 0x20000000
+#endif
+
+#ifndef MS_RELATIME
+#define MS_RELATIME (1 << 21)
+#endif
+#ifndef MS_STRICTATIME
+#define MS_STRICTATIME (1 << 24)
+#endif
+
+static void die(char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	exit(EXIT_FAILURE);
+}
+
+static void write_file(char *filename, char *fmt, ...)
+{
+	char buf[4096];
+	int fd;
+	ssize_t written;
+	int buf_len;
+	va_list ap;
+
+	va_start(ap, fmt);
+	buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	if (buf_len < 0) {
+		die("vsnprintf failed: %s\n",
+		    strerror(errno));
+	}
+	if (buf_len >= sizeof(buf)) {
+		die("vsnprintf output truncated\n");
+	}
+
+	fd = open(filename, O_WRONLY);
+	if (fd < 0) {
+		die("open of %s failed: %s\n",
+		    filename, strerror(errno));
+	}
+	written = write(fd, buf, buf_len);
+	if (written != buf_len) {
+		if (written >= 0) {
+			die("short write to %s\n", filename);
+		} else {
+			die("write to %s failed: %s\n",
+				filename, strerror(errno));
+		}
+	}
+	if (close(fd) != 0) {
+		die("close of %s failed: %s\n",
+			filename, strerror(errno));
+	}
+}
+
+static void create_and_enter_userns(void)
+{
+	uid_t uid;
+	gid_t gid;
+
+	uid = getuid();
+	gid = getgid();
+
+	if (unshare(CLONE_NEWUSER) !=0) {
+		die("unshare(CLONE_NEWUSER) failed: %s\n",
+			strerror(errno));
+	}
+
+	write_file("/proc/self/uid_map", "0 %d 1", uid);
+	write_file("/proc/self/gid_map", "0 %d 1", gid);
+
+	if (setgroups(0, NULL) != 0) {
+		die("setgroups failed: %s\n",
+			strerror(errno));
+	}
+	if (setgid(0) != 0) {
+		die ("setgid(0) failed %s\n",
+			strerror(errno));
+	}
+	if (setuid(0) != 0) {
+		die("setuid(0) failed %s\n",
+			strerror(errno));
+	}
+}
+
+static
+bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
+{
+	pid_t child;
+
+	child = fork();
+	if (child == -1) {
+		die("fork failed: %s\n",
+			strerror(errno));
+	}
+	if (child != 0) { /* parent */
+		pid_t pid;
+		int status;
+		pid = waitpid(child, &status, 0);
+		if (pid == -1) {
+			die("waitpid failed: %s\n",
+				strerror(errno));
+		}
+		if (pid != child) {
+			die("waited for %d got %d\n",
+				child, pid);
+		}
+		if (!WIFEXITED(status)) {
+			die("child did not terminate cleanly\n");
+		}
+		return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+	}
+
+	create_and_enter_userns();
+	if (unshare(CLONE_NEWNS) != 0) {
+		die("unshare(CLONE_NEWNS) failed: %s\n",
+			strerror(errno));
+	}
+
+	if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) {
+		die("mount of /tmp failed: %s\n",
+			strerror(errno));
+	}
+
+	create_and_enter_userns();
+
+	if (unshare(CLONE_NEWNS) != 0) {
+		die("unshare(CLONE_NEWNS) failed: %s\n",
+			strerror(errno));
+	}
+
+	if (mount("/tmp", "/tmp", "none",
+		  MS_REMOUNT | MS_BIND | remount_flags, NULL) != 0) {
+		/* system("cat /proc/self/mounts"); */
+		die("remount of /tmp failed: %s\n",
+		    strerror(errno));
+	}
+
+	if (mount("/tmp", "/tmp", "none",
+		  MS_REMOUNT | MS_BIND | invalid_flags, NULL) == 0) {
+		/* system("cat /proc/self/mounts"); */
+		die("remount of /tmp with invalid flags "
+		    "succeeded unexpectedly\n");
+	}
+	exit(EXIT_SUCCESS);
+}
+
+static bool test_unpriv_remount_simple(int mount_flags)
+{
+	return test_unpriv_remount(mount_flags, mount_flags, 0);
+}
+
+static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags)
+{
+	return test_unpriv_remount(mount_flags, mount_flags, invalid_flags);
+}
+
+int main(int argc, char **argv)
+{
+	if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) {
+		die("MS_RDONLY malfunctions\n");
+	}
+	if (!test_unpriv_remount_simple(MS_NODEV)) {
+		die("MS_NODEV malfunctions\n");
+	}
+	if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) {
+		die("MS_NOSUID malfunctions\n");
+	}
+	if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) {
+		die("MS_NOEXEC malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV,
+				       MS_NOATIME|MS_NODEV))
+	{
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV,
+				       MS_NOATIME|MS_NODEV))
+	{
+		die("MS_STRICTATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV,
+				       MS_STRICTATIME|MS_NODEV))
+	{
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV,
+				       MS_NOATIME|MS_NODEV))
+	{
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV,
+				       MS_NOATIME|MS_NODEV))
+	{
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV,
+				       MS_STRICTATIME|MS_NODEV))
+	{
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV,
+				 MS_NOATIME|MS_NODEV))
+	{
+		die("Default atime malfunctions\n");
+	}
+	return EXIT_SUCCESS;
+}

From 65b38851a17472d31fec9019fc3a55b0802dab88 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 31 Jul 2014 04:35:20 -0700
Subject: [PATCH 07/11] NFS: Fix /proc/fs/nfsfs/servers and
 /proc/fs/nfsfs/volumes

The usage of pid_ns->child_reaper->nsproxy->net_ns in
nfs_server_list_open and nfs_client_list_open is not safe.

/proc for a pid namespace can remain mounted after the all of the
process in that pid namespace have exited.  There are also times
before the initial process in a pid namespace has started or after the
initial process in a pid namespace has exited where
pid_ns->child_reaper can be NULL or stale.  Making the idiom
pid_ns->child_reaper->nsproxy a double whammy of problems.

Luckily all that needs to happen is to move /proc/fs/nfsfs/servers and
/proc/fs/nfsfs/volumes under /proc/net to /proc/net/nfsfs/servers and
/proc/net/nfsfs/volumes and add a symlink from the original location,
and to use seq_open_net as it has been designed.

Cc: stable@vger.kernel.org
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Stanislav Kinsbursky <skinsbursky@parallels.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/nfs/client.c   | 95 +++++++++++++++++++++++++++--------------------
 fs/nfs/inode.c    |  3 +-
 fs/nfs/internal.h |  9 +++++
 fs/nfs/netns.h    |  3 ++
 4 files changed, 69 insertions(+), 41 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1d09289c8f0e..180d1ec9c32e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1205,7 +1205,7 @@ static const struct file_operations nfs_server_list_fops = {
 	.open		= nfs_server_list_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_net,
 	.owner		= THIS_MODULE,
 };
 
@@ -1226,7 +1226,7 @@ static const struct file_operations nfs_volume_list_fops = {
 	.open		= nfs_volume_list_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_net,
 	.owner		= THIS_MODULE,
 };
 
@@ -1236,19 +1236,8 @@ static const struct file_operations nfs_volume_list_fops = {
  */
 static int nfs_server_list_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
-	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
-	ret = seq_open(file, &nfs_server_list_ops);
-	if (ret < 0)
-		return ret;
-
-	m = file->private_data;
-	m->private = net;
-
-	return 0;
+	return seq_open_net(inode, file, &nfs_server_list_ops,
+			   sizeof(struct seq_net_private));
 }
 
 /*
@@ -1256,7 +1245,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* lock the list against modification */
 	spin_lock(&nn->nfs_client_lock);
@@ -1268,7 +1257,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
 	return seq_list_next(v, &nn->nfs_client_list, pos);
 }
@@ -1278,7 +1267,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
 {
-	struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
 	spin_unlock(&nn->nfs_client_lock);
 }
@@ -1289,7 +1278,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
 static int nfs_server_list_show(struct seq_file *m, void *v)
 {
 	struct nfs_client *clp;
-	struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* display header on line 1 */
 	if (v == &nn->nfs_client_list) {
@@ -1321,19 +1310,8 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-	struct seq_file *m;
-	int ret;
-	struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
-	struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
-	ret = seq_open(file, &nfs_volume_list_ops);
-	if (ret < 0)
-		return ret;
-
-	m = file->private_data;
-	m->private = net;
-
-	return 0;
+	return seq_open_net(inode, file, &nfs_server_list_ops,
+			   sizeof(struct seq_net_private));
 }
 
 /*
@@ -1341,7 +1319,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* lock the list against modification */
 	spin_lock(&nn->nfs_client_lock);
@@ -1353,7 +1331,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-	struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
 	return seq_list_next(v, &nn->nfs_volume_list, pos);
 }
@@ -1363,7 +1341,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
 {
-	struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
 	spin_unlock(&nn->nfs_client_lock);
 }
@@ -1376,7 +1354,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 	struct nfs_server *server;
 	struct nfs_client *clp;
 	char dev[8], fsid[17];
-	struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
 	/* display header on line 1 */
 	if (v == &nn->nfs_volume_list) {
@@ -1407,6 +1385,45 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+int nfs_fs_proc_net_init(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+	struct proc_dir_entry *p;
+
+	nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+	if (!nn->proc_nfsfs)
+		goto error_0;
+
+	/* a file of servers with which we're dealing */
+	p = proc_create("servers", S_IFREG|S_IRUGO,
+			nn->proc_nfsfs, &nfs_server_list_fops);
+	if (!p)
+		goto error_1;
+
+	/* a file of volumes that we have mounted */
+	p = proc_create("volumes", S_IFREG|S_IRUGO,
+			nn->proc_nfsfs, &nfs_volume_list_fops);
+	if (!p)
+		goto error_2;
+	return 0;
+
+error_2:
+	remove_proc_entry("servers", nn->proc_nfsfs);
+error_1:
+	remove_proc_entry("fs/nfsfs", NULL);
+error_0:
+	return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+	remove_proc_entry("volumes", nn->proc_nfsfs);
+	remove_proc_entry("servers", nn->proc_nfsfs);
+	remove_proc_entry("fs/nfsfs", NULL);
+}
+
 /*
  * initialise the /proc/fs/nfsfs/ directory
  */
@@ -1419,14 +1436,12 @@ int __init nfs_fs_proc_init(void)
 		goto error_0;
 
 	/* a file of servers with which we're dealing */
-	p = proc_create("servers", S_IFREG|S_IRUGO,
-			proc_fs_nfs, &nfs_server_list_fops);
+	p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
 	if (!p)
 		goto error_1;
 
 	/* a file of volumes that we have mounted */
-	p = proc_create("volumes", S_IFREG|S_IRUGO,
-			proc_fs_nfs, &nfs_volume_list_fops);
+	p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
 	if (!p)
 		goto error_2;
 	return 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9927913c97c2..a732cbbd4e80 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1840,11 +1840,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
 static int nfs_net_init(struct net *net)
 {
 	nfs_clients_init(net);
-	return 0;
+	return nfs_fs_proc_net_init(net);
 }
 
 static void nfs_net_exit(struct net *net)
 {
+	nfs_fs_proc_net_exit(net);
 	nfs_cleanup_cb_ident_idr(net);
 }
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f415cbf9f6c3..2d2ebdfb0723 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
+extern int nfs_fs_proc_net_init(struct net *net);
+extern void nfs_fs_proc_net_exit(struct net *net);
 #else
+static inline int nfs_fs_proc_net_init(struct net *net)
+{
+	return 0;
+}
+static inline void nfs_fs_proc_net_exit(struct net *net)
+{
+}
 static inline int nfs_fs_proc_init(void)
 {
 	return 0;
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 8ee1fab83268..ef221fb8a183 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,6 +29,9 @@ struct nfs_net {
 #endif
 	spinlock_t nfs_client_lock;
 	struct timespec boot_time;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nfsfs;
+#endif
 };
 
 extern int nfs_net_id;

From 6ba8ed79a3cce14bd5597218fd46408b8376f8f3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 31 Jul 2014 16:27:08 -0700
Subject: [PATCH 08/11] proc: Have net show up under /proc/<tgid>/task/<tid>

Network namespaces are per task so it make sense for them to show up
in the task directory.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/base.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2d696b0c93bf..ed34e405c6b9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2895,6 +2895,9 @@ static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
 	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+#ifdef CONFIG_NET
+	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+#endif
 	REG("environ",   S_IRUSR, proc_environ_operations),
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),

From 0097875bd41528922fb3bb5f348c53f17e00e2fd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 31 Jul 2014 03:10:50 -0700
Subject: [PATCH 09/11] proc: Implement /proc/thread-self to point at the
 directory of the current thread

/proc/thread-self is derived from /proc/self.  /proc/thread-self
points to the directory in proc containing information about the
current thread.

This funtionality has been missing for a long time, and is tricky to
implement in userspace as gettid() is not exported by glibc.  More
importantly this allows fixing defects in /proc/mounts and /proc/net
where in a threaded application today they wind up being empty files
when only the initial pthread has exited, causing problems for other
threads.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/Makefile              |  1 +
 fs/proc/base.c                | 15 ++++---
 fs/proc/inode.c               |  7 ++-
 fs/proc/internal.h            |  6 +++
 fs/proc/root.c                |  3 ++
 fs/proc/thread_self.c         | 85 +++++++++++++++++++++++++++++++++++
 include/linux/pid_namespace.h |  1 +
 7 files changed, 112 insertions(+), 6 deletions(-)
 create mode 100644 fs/proc/thread_self.c

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 239493ec718e..7151ea428041 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -23,6 +23,7 @@ proc-y	+= version.o
 proc-y	+= softirqs.o
 proc-y	+= namespaces.o
 proc-y	+= self.o
+proc-y	+= thread_self.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ed34e405c6b9..0131156ce7c9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2847,7 +2847,7 @@ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter ite
 	return iter;
 }
 
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
@@ -2859,14 +2859,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
 		return 0;
 
-	if (pos == TGID_OFFSET - 1) {
+	if (pos == TGID_OFFSET - 2) {
 		struct inode *inode = ns->proc_self->d_inode;
 		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
 			return 0;
-		iter.tgid = 0;
-	} else {
-		iter.tgid = pos - TGID_OFFSET;
+		ctx->pos = pos = pos + 1;
 	}
+	if (pos == TGID_OFFSET - 1) {
+		struct inode *inode = ns->proc_thread_self->d_inode;
+		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+			return 0;
+		ctx->pos = pos = pos + 1;
+	}
+	iter.tgid = pos - TGID_OFFSET;
 	iter.task = NULL;
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 0adbc02d60e3..333080d7a671 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 int proc_fill_super(struct super_block *s)
 {
 	struct inode *root_inode;
+	int ret;
 
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
@@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s)
 		return -ENOMEM;
 	}
 
-	return proc_setup_self(s);
+	ret = proc_setup_self(s);
+	if (ret) {
+		return ret;
+	}
+	return proc_setup_thread_self(s);
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3ab6d14e71c5..ee04619173b2 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -233,6 +233,12 @@ static inline int proc_net_init(void) { return 0; }
  */
 extern int proc_setup_self(struct super_block *);
 
+/*
+ * proc_thread_self.c
+ */
+extern int proc_setup_thread_self(struct super_block *);
+extern void proc_thread_self_init(void);
+
 /*
  * proc_sysctl.c
  */
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5dbadecb234d..48f1c03bc7ed 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb)
 	ns = (struct pid_namespace *)sb->s_fs_info;
 	if (ns->proc_self)
 		dput(ns->proc_self);
+	if (ns->proc_thread_self)
+		dput(ns->proc_thread_self);
 	kill_anon_super(sb);
 	put_pid_ns(ns);
 }
@@ -170,6 +172,7 @@ void __init proc_root_init(void)
 		return;
 
 	proc_self_init();
+	proc_thread_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
new file mode 100644
index 000000000000..59075b509df3
--- /dev/null
+++ b/fs/proc/thread_self.c
@@ -0,0 +1,85 @@
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+/*
+ * /proc/thread_self:
+ */
+static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	pid_t pid = task_pid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
+	if (!pid)
+		return -ENOENT;
+	sprintf(tmp, "%d/task/%d", tgid, pid);
+	return readlink_copy(buffer, buflen, tmp);
+}
+
+static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	pid_t pid = task_pid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (pid) {
+		name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d/task/%d", tgid, pid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static const struct inode_operations proc_thread_self_inode_operations = {
+	.readlink	= proc_thread_self_readlink,
+	.follow_link	= proc_thread_self_follow_link,
+	.put_link	= kfree_put_link,
+};
+
+static unsigned thread_self_inum;
+
+int proc_setup_thread_self(struct super_block *s)
+{
+	struct inode *root_inode = s->s_root->d_inode;
+	struct pid_namespace *ns = s->s_fs_info;
+	struct dentry *thread_self;
+
+	mutex_lock(&root_inode->i_mutex);
+	thread_self = d_alloc_name(s->s_root, "thread-self");
+	if (thread_self) {
+		struct inode *inode = new_inode_pseudo(s);
+		if (inode) {
+			inode->i_ino = thread_self_inum;
+			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+			inode->i_op = &proc_thread_self_inode_operations;
+			d_add(thread_self, inode);
+		} else {
+			dput(thread_self);
+			thread_self = ERR_PTR(-ENOMEM);
+		}
+	} else {
+		thread_self = ERR_PTR(-ENOMEM);
+	}
+	mutex_unlock(&root_inode->i_mutex);
+	if (IS_ERR(thread_self)) {
+		pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
+		return PTR_ERR(thread_self);
+	}
+	ns->proc_thread_self = thread_self;
+	return 0;
+}
+
+void __init proc_thread_self_init(void)
+{
+	proc_alloc_inum(&thread_self_inum);
+}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7246ef3d4455..1997ffc295a7 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -33,6 +33,7 @@ struct pid_namespace {
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
 	struct dentry *proc_self;
+	struct dentry *proc_thread_self;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;

From e81324407269b7708a0678a5c40710d2a75e9bf0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 31 Jul 2014 03:22:30 -0700
Subject: [PATCH 10/11] proc: Point /proc/net at /proc/thread-self/net instead
 of /proc/self/net

In oddball cases where the thread has a different network namespace
than the primary thread group leader or more likely in cases where
the thread remains and the thread group leader has exited this
ensures that /proc/net continues to work.

This should not cause any problems but if it does this patch can just
be reverted.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/proc_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a63af3e0a612..39481028ec08 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -226,7 +226,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
 
 int __init proc_net_init(void)
 {
-	proc_symlink("net", NULL, "self/net");
+	proc_symlink("net", NULL, "thread-self/net");
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }

From 344470cac42e887e68cfb5bdfa6171baf27f1eb5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 31 Jul 2014 15:41:15 -0700
Subject: [PATCH 11/11] proc: Point /proc/mounts at /proc/thread-self/mounts
 instead of /proc/self/mounts

In oddball cases where the thread has a different mount namespace than
the thread group leader or more likely in cases where the thread
remains and the thread group leader has exited this ensures that
/proc/mounts continues to work.

This should not cause any problems but if it does this patch can just
be reverted.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/root.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 48f1c03bc7ed..92c12c243ce3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -173,7 +173,7 @@ void __init proc_root_init(void)
 
 	proc_self_init();
 	proc_thread_self_init();
-	proc_symlink("mounts", NULL, "self/mounts");
+	proc_symlink("mounts", NULL, "thread-self/mounts");
 
 	proc_net_init();