[pve-devel] [RFC v1/2 manager] pvestatd: add simple container cpuset balancing

Wolfgang Bumiller <w.bumiller@proxmox.com>
Thu Oct 20 13:43:52 CEST 2016


---
 PVE/Service/pvestatd.pm | 171 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)

diff --git a/PVE/Service/pvestatd.pm b/PVE/Service/pvestatd.pm
index 98e5844..0d51574 100755
--- a/PVE/Service/pvestatd.pm
+++ b/PVE/Service/pvestatd.pm
@@ -15,6 +15,7 @@ use PVE::Cluster qw(cfs_read_file);
 use PVE::Storage;
 use PVE::QemuServer;
 use PVE::LXC;
+use PVE::LXC::Config;
 use PVE::RPCEnvironment;
 use PVE::API2::Subscription;
 use PVE::AutoBalloon;
@@ -253,6 +254,8 @@ sub update_lxc_status {
 	    $plugin->update_lxc_status($plugin_config, $vmid, $d, $ctime);
 	}
     }
+
+    rebalance($vmstatus);
 }
 
 sub update_storage_status {
@@ -282,6 +285,174 @@ sub update_storage_status {
     }
 }
 
+# FIXME: already in QemuServer (but for semicolon-separated sets), move to Tools
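+# e.g. parse_number_sets("0-2,7", qr/,/) returns [[0, 2], [7, undef]];
+# the separator defaults to ';'.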
+sub parse_number_sets {
+    my ($set, $re) = @_;
+    my $res = [];
+    $re = qr/;/ if !defined($re);
+    foreach my $part (split($re, $set)) {
+	if ($part =~ /^\s*(\d+)(?:-(\d+))?\s*$/) {
+	    die "invalid range: $part ($2 < $1)\n" if defined($2) && $2 < $1;
+	    push @$res, [$1, $2];
+	} else {
+	    die "invalid range: $part\n";
+	}
+    }
+    return $res;
+}
+
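+# Expand a parsed set list into a flat list of ids, e.g.
+# number_setlist_to_list([[0, 2], [7, undef]]) returns (0, 1, 2, 7).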
+sub number_setlist_to_list {
+    my ($setlist) = @_;
+    return map { $_->[0] .. ($_->[1]//$_->[0]) } @$setlist;
+}
+
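+# Read and parse a cpuset file of the given cgroup, e.g. get_cpusets('lxc/100')
+# parses /sys/fs/cgroup/cpuset/lxc/100/cpuset.cpus, which contains a
+# comma-separated set list like "0-3,8".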
+sub get_cpusets {
+    my ($cgroup, $kind) = @_;
+    $kind = 'cpus' if !defined($kind);
+    my $set_text = PVE::Tools::file_read_firstline(
+	"/sys/fs/cgroup/cpuset/$cgroup/cpuset.$kind");
+    return parse_number_sets($set_text, qr/,/);
+}
+
+# FIXME: Candidate for PVE/LXC.pm?
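+# Check whether a raw lxc.* key (e.g. lxc.cgroup.cpuset.cpus) is present in
+# the container config.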
+sub has_lxc_entry {
+    my ($conf, $keyname) = @_;
+    foreach my $entry (@{$conf->{lxc}}) {
+	my ($key, undef) = @$entry;
+	return 1 if $key eq $keyname;
+    }
+    return 0;
+}
+
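+# Write a new cpuset.cpus value for a container based on a cpu bit-mask
+# (array indexed by cpu id), but only if it differs from the current mask.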
+sub apply_cpumask {
+    my ($vmid, $mask, $curmask) = @_;
+    my $value = '';
+    my $changed = !$curmask;
+
+    for (my $id = 0; $id != @$mask; ++$id) {
+	if (!$mask->[$id]) {
+	    $changed = 1 if !$changed && ($id < @$curmask && $curmask->[$id]);
+	    next;
+	}
+	$changed = 1 if !$changed && ($id >= @$curmask || !$curmask->[$id]);
+	$value .= ',' if length($value);
+	$value .= $id;
+    }
+    if (!$changed && $curmask) {
+	for (my $id = @$mask; $id < @$curmask; ++$id) {
+	    if ($curmask->[$id]) {
+		$changed = 1;
+		last;
+	    }
+	}
+    }
+    return if !$changed;
+    open(my $fh, '>', "/sys/fs/cgroup/cpuset/lxc/$vmid/cpuset.cpus")
+	or die "failed to open cpuset for $vmid: $!\n";
+    print {$fh} "$value\n";
+    close($fh);
+}
+
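+# Spread containers without an explicitly configured cpuset over the
+# least-used cpus: containers with a fixed lxc.cgroup.cpuset.cpus entry only
+# contribute to the per-cpu usage count, all others get the least-used cpus
+# assigned, up to their cpulimit.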
+sub rebalance {
+    my ($vmstatus) = @_;
+
+    return if !-d '/sys/fs/cgroup/cpuset/lxc'; # nothing to do...
+
+    my $cpu_setlist = get_cpusets('lxc', 'effective_cpus');
+    my @allowed_cpus = number_setlist_to_list($cpu_setlist);
+    my $cpucount = scalar(@allowed_cpus);
+    my $highest_cpuid = $allowed_cpus[-1];
+
+    my @cpu_ctcount = (0) x ($highest_cpuid + 1);
+    my @balanced_cts;
+
+    foreach my $vmid (sort keys %$vmstatus) {
+	my $d = $vmstatus->{$vmid};
+	next if !$d->{pid};
+
+	my $conf = eval { PVE::LXC::Config->load_config($vmid) };
+	if ($@) {
+	    warn $@;
+	    next;
+	}
+
+	# get the current cpuset:
+	my $cpu_setlist = get_cpusets("lxc/$vmid");
+	my $cpu_list = [number_setlist_to_list($cpu_setlist)];
+	$highest_cpuid = $cpu_list->[-1] if $highest_cpuid < $cpu_list->[-1];
+
+	# container has a fixed set, count it
+	if (has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')) {
+	    foreach my $cpu (@$cpu_list) {
+		$cpu_ctcount[$cpu]++ if $cpu < @cpu_ctcount;
+	    }
+	} else {
+	    my $cpulimit = $conf->{cpulimit};
+	    $cpulimit = $cpucount if !$cpulimit || $cpulimit > $cpucount;
+	    push @balanced_cts, [$vmid, $cpulimit, $cpu_list];
+	}
+    }
+
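+    # Sort the allowed cpus by the number of containers already pinned to
+    # them, least-used first; balanced containers are assigned from the front.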
+    my @cpus_by_count = sort { $cpu_ctcount[$a] <=> $cpu_ctcount[$b] }
+	@allowed_cpus;
+
+    foreach my $bct (@balanced_cts) {
+	my ($vmid, $cpulimit, $cpu_list) = @$bct;
+
+	# Get the currently active cpu mask:
+	my $curmask = [(0) x ($highest_cpuid + 1)];
+	$curmask->[$_] = 1 foreach @$cpu_list;
+
+	# Get the desired new cpu mask:
+	my $mask = [(0) x ($highest_cpuid + 1)];
+	my $i;
+	for ($i = 0; $i < $cpulimit && $i < @cpus_by_count; ++$i) {
+	    my $cpu = $cpus_by_count[$i];
+	    $mask->[$cpu] = 1;
+	    $cpu_ctcount[$cpu]++;
+	}
+
+	apply_cpumask($vmid, $mask, $curmask);
+
+	# We need to keep cpus_by_count sorted:
+	# 1) Since cpus can only be used once, the order does not need to be
+	# changed if we walked up to the last cpu in the sorted list:
+	next if $i >= @cpus_by_count;
+
+	my $lastcpu = $cpus_by_count[$i-1];
+	my $nextcpu = $cpus_by_count[$i];
+	my $count = $cpu_ctcount[$nextcpu];
+	# 2) If the next count is at least the bumped-up count of the last cpu
+	# we assigned the container to, the order is still fine, too.
+	next if $count >= $cpu_ctcount[$lastcpu];
+
+	# 3) Find the range of cpus we need to sort forward. Under our
+	# conditions this translates to finding the next cpu with a different
+	# count (since they're sorted and adding even just 1 means we're equal
+	# to the last assigned cpu).
+	# (This should be a stable sort with respect to equally-utilized cpus)
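+	# Example: if the counts along @cpus_by_count are now [2, 1, 1, 3],
+	# the two cpus with count 1 form the range and get moved in front of
+	# the cpu with count 2.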
+	my $from = $i;
+	++$i;
+	while ($i < @cpus_by_count &&
+	       $cpu_ctcount[$cpus_by_count[$i]] == $count) {
+	    ++$i;
+	}
+	my $to = $i;
+
+	# 4) Find the last cpu with a count lower than or equal to the first
+	# one we want to move:
+	$i = $from-1;
+	while ($i >= 0 && $cpu_ctcount[$cpus_by_count[$i]] > $count) {
+	    --$i;
+	}
+
+	# 5) Move the range to directly after that position:
+	my @range = splice(@cpus_by_count, $from, $to-$from);
+	splice(@cpus_by_count, $i+1, 0, @range);
+    }
+}
+
 sub update_status {
 
     # update worker list. This is not really required and
-- 
2.1.4
