[pve-devel] [PATCH container] bindmount: catch rw/ro race and add tests

Wolfgang Bumiller w.bumiller at proxmox.com
Thu Jun 2 16:23:09 CEST 2016


rw/ro race occurs when a container contains the same bind
mount twice and another container contains a bind mount
containing the first container's destination. If the double
bind mounts are both meant to be read-only then the second
container could theoretically swap them out between the
mount and the read-only remount call, then swap them back
for the test. So to verify this we use the same file
descriptor we use for the dev/inode check and perform an
faccessat() call and expect it to return EROFS and nothing
else.

Also include O_NOFOLLOW in the checks' openat() calls.
---
 src/PVE/LXC.pm             | 90 ++++++++++++++++++++++++++++++++++------------
 src/test/Makefile          |  5 ++-
 src/test/bindmount_test.pl | 89 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+), 24 deletions(-)
 create mode 100755 src/test/bindmount_test.pl

diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm
index 7c5a5e6..3a7d2a2 100644
--- a/src/PVE/LXC.pm
+++ b/src/PVE/LXC.pm
@@ -11,7 +11,7 @@ use File::Path;
 use File::Spec;
 use Cwd qw();
 use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY);
-use Errno qw(ELOOP);
+use Errno qw(ELOOP EROFS);
 
 use PVE::Exception qw(raise_perm_exc);
 use PVE::Storage;
@@ -1044,49 +1044,95 @@ sub walk_tree_nofollow($$$) {
     return $fd;
 }
 
-sub bindmount {
-    my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_;
-
-    my $srcdh = walk_tree_nofollow('/', $dir, 0);
-
-    PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]);
-    if ($ro) {
-	eval { PVE::Tools::run_command(['mount', '-o', 'bind,remount,ro', $dest]); };
-	if (my $err = $@) {
-	    warn "bindmount error\n";
-	    # don't leave writable bind-mounts behind...
-	    PVE::Tools::run_command(['umount', $dest]);
-	    die $err;
-	}
-    }
+# To guard against symlink attack races against other currently running
+# containers with shared recursive bind mount hierarchies we prepare a
+# directory handle for the directory we're mounting over to verify the
+# mountpoint afterwards.
+sub __bindmount_prepare {
+    my ($hostroot, $dir) = @_;
+    my $srcdh = walk_tree_nofollow($hostroot, $dir, 0);
+    return $srcdh;
+}
 
+# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b'
+# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory
+# we intended to bind mount.
+sub __bindmount_verify {
+    my ($srcdh, $parentfd, $last_dir, $ro) = @_;
     my $destdh;
     if ($parentfd) {
 	# Open the mount point path coming from the parent directory since the
 	# filehandle we would have gotten as first result of walk_tree_nofollow
 	# earlier is still a handle to the underlying directory instead of the
 	# mounted path.
-	$destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH)
+	$destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY);
+	die "failed to open mount point: $!\n" if !$destdh;
+	if ($ro) {
+	    my $dot = '.';
+	    # 269: faccessat()
+	    # no separate function because 99% of the time it's the wrong thing to use.
+	    if (syscall(269, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) {
+		die "failed to mark bind mount read only\n";
+	    }
+	    die "read-only check failed: $!\n" if $! != EROFS;
+	}
     } else {
 	# For the rootfs we don't have a parentfd so we open the path directly.
 	# Note that this means bindmounting any prefix of the host's
 	# /var/lib/lxc/$vmid path into another container is considered a grave
 	# security error.
 	sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY;
+	die "failed to open mount point: $!\n" if !$destdh;
     }
-    die "failed to open directory $dir: $!\n" if !$destdh;
 
     my ($srcdev, $srcinode) = stat($srcdh);
     my ($dstdev, $dstinode) = stat($destdh);
     close $srcdh;
     close $destdh;
 
-    if ($srcdev != $dstdev || $srcinode != $dstinode) {
+    return ($srcdev == $dstdev && $srcinode == $dstinode);
+}
+
+# Perform the actual bind mounting:
+sub __bindmount_do {
+    my ($dir, $dest, $ro, @extra_opts) = @_;
+    PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]);
+    if ($ro) {
+	eval { PVE::Tools::run_command(['mount', '-o', 'bind,remount,ro', $dest]); };
+	if (my $err = $@) {
+	    warn "bindmount error\n";
+	    # don't leave writable bind-mounts behind...
+	    PVE::Tools::run_command(['umount', $dest]);
+	    die $err;
+	}
+    }
+}
+
+sub bindmount {
+    my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_;
+
+    my $srcdh = __bindmount_prepare('/', $dir);
+
+    __bindmount_do($dir, $dest, $ro, @extra_opts);
+
+    if (!__bindmount_verify($srcdh, $parentfd, $last_dir, $ro)) {
 	PVE::Tools::run_command(['umount', $dest]);
 	die "detected mount path change at: $dir\n";
     }
 }
 
+# Cleanup $rootdir a bit (double and trailing slashes), build the mount path
+# from $rootdir and $mount and walk the path from $rootdir to the final
+# directory to check for symlinks.
+sub __mount_prepare_rootdir {
+    my ($rootdir, $mount) = @_;
+    $rootdir =~ s!/+!/!g;
+    $rootdir =~ s!/+$!!;
+    my $mount_path = "$rootdir/$mount";
+    my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1);
+    return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir);
+}
+
 # use $rootdir = undef to just return the corresponding mount path
 sub mountpoint_mount {
     my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_;
@@ -1105,10 +1151,8 @@ sub mountpoint_mount {
     my ($mpfd, $parentfd, $last_dir);
     
     if (defined($rootdir)) {
-	$rootdir =~ s!/+!/!g;
-	$rootdir =~ s!/+$!!;
-	$mount_path = "$rootdir/$mount";
-	($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1);
+	($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) =
+	    __mount_prepare_rootdir($rootdir, $mount);
     }
     
     my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
diff --git a/src/test/Makefile b/src/test/Makefile
index f3e260d..497a179 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -1,7 +1,7 @@
 all:
 
 
-test: test_setup test_snapshot
+test: test_setup test_snapshot test_bindmount
 
 test_setup: run_setup_tests.pl
 	./run_setup_tests.pl
@@ -9,5 +9,8 @@ test_setup: run_setup_tests.pl
 test_snapshot: run_snapshot_tests.pl
 	./run_snapshot_tests.pl
 
+test_bindmount: bindmount_test.pl
+	./bindmount_test.pl
+
 clean:
 	rm -rf tmprootfs
diff --git a/src/test/bindmount_test.pl b/src/test/bindmount_test.pl
new file mode 100755
index 0000000..a6052db
--- /dev/null
+++ b/src/test/bindmount_test.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use Cwd;
+use File::Path;
+
+use lib qw(..);
+
+use PVE::LXC;
+
+my $pwd = getcwd();
+
+my $rootdir = "./tmproot";
+my $destdir = "/mnt";
+
+my $sharedir = "/tmpshare";
+my $a        = "/tmpshare/a";
+my $ab       = "/tmpshare/a/b";
+my $abc      = "/tmpshare/a/b/c";
+my $sym      = "/tmpshare/sym";
+
+my $secret = "/secret";
+
+END {
+    my $ignore_error;
+    File::Path::rmtree("$pwd/$rootdir", {error => \$ignore_error});
+    File::Path::rmtree("$pwd/$sharedir", {error => \$ignore_error});
+    File::Path::rmtree("$pwd/$secret", {error => \$ignore_error});
+};
+
+# Create all the test paths...
+PVE::LXC::walk_tree_nofollow('.', $rootdir, 1);
+PVE::LXC::walk_tree_nofollow($rootdir, $destdir, 1);
+PVE::LXC::walk_tree_nofollow('.', $abc, 1);
+PVE::LXC::walk_tree_nofollow('.', $secret, 1);
+# Create one evil symlink
+symlink('a/b', "$pwd/$sym") or die "failed to prepare test folders: $!\n";
+
+# Test walk_tree_nofollow:
+eval { PVE::LXC::walk_tree_nofollow('.', $rootdir, 0) };
+die "unexpected error: $@" if $@;
+eval { PVE::LXC::walk_tree_nofollow('.', "$sym/c", 0) };
+die "failed to catch symlink at $sym/c\n" if !$@;
+die "unexpected test error: '$@'\n" if $@ ne "symlink encountered at: .$sym\n";
+
+# Bindmount testing:
+sub bindmount {
+    my ($from, $rootdir, $destdir, $inject, $ro, $inject_write) = @_;
+
+    my ($mpath, $mpfd, $parentfd, $last);
+    ($rootdir, $mpath, $mpfd, $parentfd, $last) =
+        PVE::LXC::__mount_prepare_rootdir($rootdir, $destdir);
+
+    my $srcdh = PVE::LXC::__bindmount_prepare('.', $from);
+
+    if ($inject) {
+        File::Path::rmtree(".$inject");
+        symlink("$pwd/$secret", ".$inject")
+            or die "failed to create symlink\n";
+        PVE::LXC::walk_tree_nofollow('.', $from, 1);
+    }
+    PVE::LXC::__bindmount_do(".$from", $mpath, $inject_write ? 0 : $ro);
+
+    my $res = PVE::LXC::__bindmount_verify($srcdh, $parentfd, $last, $ro);
+
+    system('umount', $mpath);
+}
+
+bindmount($a, $rootdir, $destdir, undef, 0, 0);
+eval { bindmount($sym, $rootdir, $destdir, undef, 0, 0) };
+die "illegal symlink bindmount went through\n" if !$@;
+bindmount($abc, $rootdir, $destdir, undef, 0, 0);
+# Race test: Assume someone exchanged 2 equivalent bind mounts between and
+# after the bindmount_do()'s mount and remount calls.
+# First: non-ro mount, should pass
+bindmount($abc, $rootdir, $destdir, undef, 0, 1);
+# Second: regular read-only mount, should pass.
+bindmount($abc, $rootdir, $destdir, undef, 1, 0);
+# Third: read-only requested with read-write injected
+eval { bindmount($abc, $rootdir, $destdir, undef, 1, 1) };
+die "read-write mount possible\n" if !$@;
+die "unexpected test error: $@\n" if $@ ne "failed to mark bind mount read only\n";
+# Race test: Replace /tmpshare/a/b with a symlink to /secret just before
+# __bindmount_do().
+eval { bindmount($abc, $rootdir, $destdir, $ab, 0, 0) };
+die "injected symlink bindmount went through\n" if !$@;
+die "unexpected test error: $@\n" if $@ ne "symlink encountered at: .$ab\n";
-- 
2.1.4





More information about the pve-devel mailing list