[pve-devel] [PATCH] add hugepages option v5

Wolfgang Bumiller w.bumiller at proxmox.com
Tue Jun 7 12:40:29 CEST 2016


General question:
Can you give an example of the kind of situation where this boosts
performance?

Code comments inline:

On Sat, Jun 04, 2016 at 10:19:56AM +0200, Alexandre Derumier wrote:
> changelog : rebase on last git
> 
> vm configuration
> ----------------
> hugepages: (any|2|1024)
> 
> any: we'll try to allocate 1GB hugepages if possible, if not we use 2MB hugepages
> 2: we want to use 2MB hugepages
> 1024: we want to use 1GB hugepages. (memory needs to be a multiple of 1GB in this case)
> 
> optional host configuration for 1GB hugepages
> ----------------------------------------------
> 1GB hugepages can be allocated at boot if the user wants.
> hugepages need to be contiguous, so sometimes it's not possible to reserve them on the fly
> 
> /etc/default/grub : GRUB_CMDLINE_LINUX_DEFAULT="quiet hugepagesz=1G hugepages=x"
> 
> Signed-off-by: Alexandre Derumier <aderumier at odiso.com>
> ---
>  PVE/QemuServer.pm        |  66 +++++++++--
>  PVE/QemuServer/Memory.pm | 287 +++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 333 insertions(+), 20 deletions(-)
> 
> diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
> index 9b8110e..4ccca19 100644
> --- a/PVE/QemuServer.pm
> +++ b/PVE/QemuServer.pm
> @@ -321,6 +321,12 @@ EODESC
>  	description => "Enable/disable NUMA.",
>  	default => 0,
>      },
> +    hugepages => {
> +	optional => 1,
> +	type => 'string',
> +	description => "Enable/disable hugepages memory.",
> +	enum => [qw(any 2 1024)],
> +    },
>      vcpus => {
>  	optional => 1,
>  	type => 'integer',
> @@ -4348,19 +4354,46 @@ sub vm_start {
>  	my $cpuunits = defined($conf->{cpuunits}) ? $conf->{cpuunits}
>  	                                          : $defaults->{cpuunits};
>  
> -	eval  {
> -	    my %properties = (
> -		Slice => 'qemu.slice',
> -		KillMode => 'none',
> -		CPUShares => $cpuunits
> -	    );
> -	    if (my $cpulimit = $conf->{cpulimit}) {
> +	my %run_params = (timeout => $statefile ? undef : 30, umask => 0077);
> +
> +	my %properties = (
> +	    Slice => 'qemu.slice',
> +	    KillMode => 'none',
> +	    CPUShares => $cpuunits
> +	);
> +
> +	if (my $cpulimit = $conf->{cpulimit}) {
>  		$properties{CPUQuota} = int($cpulimit * 100);

Indentation nit:
-  		$properties{CPUQuota} = int($cpulimit * 100);
+  	    $properties{CPUQuota} = int($cpulimit * 100);

> -	    }
> -	    $properties{timeout} = 10 if $statefile; # setting up the scope shoul be quick
> -	    PVE::Tools::enter_systemd_scope($vmid, "Proxmox VE VM $vmid", %properties);
> -	    run_command($cmd, timeout => $statefile ? undef : 30, umask => 0077);
> -	};
> +	}
> +	$properties{timeout} = 10 if $statefile; # setting up the scope shoul be quick
> +
> +	if ($conf->{hugepages}) {
> +
> +	    my $code = sub {
> +		my $hugepages_topology = PVE::QemuServer::Memory::hugepages_topology($conf);
> +		my $hugepages_host_topology = PVE::QemuServer::Memory::hugepages_host_topology();
> +
> +		PVE::QemuServer::Memory::hugepages_mount();
> +		PVE::QemuServer::Memory::hugepages_allocate($hugepages_topology, $hugepages_host_topology);
> +
> +		eval  {
> +		    PVE::Tools::enter_systemd_scope($vmid, "Proxmox VE VM $vmid", %properties);
> +		    run_command($cmd, %run_params);
> +		};
> +
> +		if (my $err = $@) {
> +		    PVE::QemuServer::Memory::hugepages_reset($hugepages_host_topology);
> +		    die $err;
> +		}
> +	    };
> +	    eval { PVE::QemuServer::Memory::hugepages_update_locked($code); };
> +
> +	} else {
> +	    eval  {
> +		PVE::Tools::enter_systemd_scope($vmid, "Proxmox VE VM $vmid", %properties);
> +		run_command($cmd, %run_params);
> +	    };
> +	}
>  
>  	if (my $err = $@) {
>  	    # deactivate volumes if start fails
> @@ -4530,6 +4563,15 @@ sub vm_stop_cleanup {
>  	    unlink "/var/run/qemu-server/${vmid}.$ext";
>  	}
>  
> +	if($conf->{hugepages}) {
> +

Note that vm_stop_cleanup() literally only happens when you stop the VM
via CLI or GUI, not when the VM powers down by itself.

> +	    my $code = sub {
> +		my $hugepages_topology = PVE::QemuServer::Memory::hugepages_topology($conf);
> +		PVE::QemuServer::Memory::hugepages_deallocate($hugepages_topology);
> +	    };
> +	    eval { PVE::QemuServer::Memory::hugepages_update_locked($code); };
> +	}
> +	
>  	vmconfig_apply_pending($vmid, $conf, $storecfg) if $apply_pending_changes;
>      };
>      warn $@ if $@; # avoid errors - just warn
> diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm
> index 3c9659c..3fd7bf8 100644
> --- a/PVE/QemuServer/Memory.pm
> +++ b/PVE/QemuServer/Memory.pm
> @@ -3,6 +3,7 @@ package PVE::QemuServer::Memory;
>  use strict;
>  use warnings;
>  use PVE::QemuServer;
> +use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach);
>  
>  my $MAX_NUMA = 8;
>  my $MAX_MEM = 4194304;
> @@ -76,7 +77,29 @@ sub qemu_memory_hotplug {
>  
>  		return if $current_size <= $conf->{memory};
>  
> -		eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
> +		if ($conf->{hugepages}) {
> +
> +		    my $hugepages_size = hugepages_size($conf, $dimm_size);

When no numaX entries are specified we run into this code with the
default $dimm_size of 512, which won't work with 1G pages. In other words,
this config without any `numa0`, `numa1`, ... entries:
  hugepages: 1024
  numa: 1
will error at this point about wrong memory sizes. We could probably
catch this in foreach_dimm()?
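
Something along these lines (just a rough sketch, untested) before the
hotplug/monitor call would at least fail early with a readable message
instead of dying deep inside qemu_memory_hotplug():

    if ($conf->{hugepages}) {
	# hypothetical pre-check: let hugepages_size() reject dimm sizes
	# that don't fit the configured hugepage size before we touch the
	# monitor (512M default dimms vs. hugepages=1024)
	foreach_dimm($conf, $vmid, $memory, $sockets, sub {
	    my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
	    eval { hugepages_size($conf, $dimm_size); };
	    die "memory hotplug: dimm '$name': $@" if $@;
	});
    }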

> +		    my $path = hugepages_mount_path($hugepages_size);
> +		    my $hugepages_topology->{$hugepages_size}->{$numanode} = hugepages_nr($dimm_size, $hugepages_size);
> +
> +		    my $code = sub {
> +			my $hugepages_host_topology = hugepages_host_topology();
> +			hugepages_allocate($hugepages_topology, $hugepages_host_topology);
> +
> +			eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => { 
> +					     size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true } ); };
> +			if (my $err = $@) {
> +			    hugepages_reset($hugepages_host_topology);
> +			    die $err;
> +			}
> +		    };
> +		    eval { hugepages_update_locked($code); };
> +
> +		} else {
> +		    eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
> +		}
> +
>  		if (my $err = $@) {
>  		    eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
>  		    die $err;
> @@ -157,18 +180,21 @@ sub config {
>  	push @$cmd, '-m', $static_memory;
>      }
>  
> +    die "numa need to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};
> +
>      if ($conf->{numa}) {
>  
>  	my $numa_totalmemory = undef;
>  	for (my $i = 0; $i < $MAX_NUMA; $i++) {
>  	    next if !$conf->{"numa$i"};
> -	    my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
> +	    my $numa = parse_numa($conf->{"numa$i"});

But parse_numa is still in QemuServer, did you mean to include another
hunk in this patch moving this function?

>  	    next if !$numa;
>  	    # memory
>  	    die "missing NUMA node$i memory value\n" if !$numa->{memory};
>  	    my $numa_memory = $numa->{memory};
>  	    $numa_totalmemory += $numa_memory;
> -	    my $numa_object = "memory-backend-ram,id=ram-node$i,size=${numa_memory}M";
> +
> +	    my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
>  
>  	    # cpus
>  	    my $cpulists = $numa->{cpus};
> @@ -196,10 +222,10 @@ sub config {
>  		# policy
>  		my $policy = $numa->{policy};
>  		die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
> -		$numa_object .= ",host-nodes=$hostnodes,policy=$policy";
> +		$mem_object .= ",host-nodes=$hostnodes,policy=$policy";
>  	    }
>  
> -	    push @$cmd, '-object', $numa_object;
> +	    push @$cmd, '-object', $mem_object;
>  	    push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
>  	}
>  
> @@ -209,16 +235,19 @@ sub config {
>  	#if no custom tology, we split memory and cores across numa nodes
>  	if(!$numa_totalmemory) {
>  
> -	    my $numa_memory = ($static_memory / $sockets) . "M";
> +	    my $numa_memory = ($static_memory / $sockets);
>  
>  	    for (my $i = 0; $i < $sockets; $i++)  {
> +		die "host NUMA node$i don't exist\n" if ! -d "/sys/devices/system/node/node$i/";
>  
>  		my $cpustart = ($cores * $i);
>  		my $cpuend = ($cpustart + $cores - 1) if $cores && $cores > 1;
>  		my $cpus = $cpustart;
>  		$cpus .= "-$cpuend" if $cpuend;
>  
> -		push @$cmd, '-object', "memory-backend-ram,size=$numa_memory,id=ram-node$i";
> +		my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
> +
> +		push @$cmd, '-object', $mem_object;
>  		push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
>  	    }
>  	}
> @@ -227,7 +256,10 @@ sub config {
>      if ($hotplug_features->{memory}) {
>  	foreach_dimm($conf, $vmid, $memory, $sockets, sub {
>  	    my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
> -	    push @$cmd, "-object" , "memory-backend-ram,id=mem-$name,size=${dimm_size}M";
> +
> +	    my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
> +
> +	    push @$cmd, "-object" , $mem_object;
>  	    push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
>  
>  	    #if dimm_memory is not aligned to dimm map
> @@ -239,6 +271,245 @@ sub config {
>      }
>  }
>  
> +sub print_mem_object {
> +    my ($conf, $id, $size) = @_;
> +
> +    if ($conf->{hugepages}) {
> +
> +	my $hugepages_size = hugepages_size($conf, $size);
> +	my $path = hugepages_mount_path($hugepages_size);
> +
> +	return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
> +    } else {
> +	return "memory-backend-ram,id=$id,size=${size}M";
> +    }
> +
> +}
> +
> +sub hugepages_mount {
> +
> +   my $mountdata = PVE::ProcFSTools::parse_proc_mounts();
> +
> +   foreach my $size (qw(2048 1048576)) {
> +	return if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB");
> +
> +	my $path = "/run/hugepages/kvm/${size}kB";
> +
> +	my $found = grep {
> +	    $_->[2] =~ /^hugetlbfs/ &&
> +	    $_->[1] eq $path
> +	} @$mountdata;
> +
> +	if (!$found) {
> +
> +	    File::Path::make_path($path) if (!-d $path);
> +	    my $cmd = ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size}k", 'hugetlbfs', $path];
> +	    run_command($cmd, errmsg => "hugepage mount error");
> +	}
> +   }
> +}
> +
> +sub hugepages_mount_path {
> +   my ($size) = @_;
> +
> +   $size = $size * 1024;
> +   return "/run/hugepages/kvm/${size}kB";
> +
> +}
> +
> +sub hugepages_nr {
> +  my ($size, $hugepages_size) = @_;
> +
> +  return $size / $hugepages_size;
> +}
> +
> +sub hugepages_size {
> +   my ($conf, $size) = @_;
> +
> +   die "hugepages option is not enabled" if !$conf->{hugepages};
> +
> +   if ($conf->{hugepages} eq 'any') {
> +
> +	#try to use 1GB if available && memory size is matching
> +	if (-d "/sys/kernel/mm/hugepages/hugepages-1048576kB" && ($size % 1024 == 0)) {
> +	    return 1024;
> +	} else {
> +	    return 2;
> +	}
> +
> +   } else {
> +
> +	my $hugepagesize = $conf->{hugepages} * 1024 . "kB";
> +
> +	if (! -d "/sys/kernel/mm/hugepages/hugepages-$hugepagesize") {
> +		die "your system don't support hugepages of $hugepagesize";
> +	}
> +	die "the $size memory is not a multiple of $hugepagesize hugepages size" if ($size % $conf->{hugepages}) != 0;	
> +	return $conf->{hugepages};
> +   }
> +
> +}
> +
> +sub hugepages_topology {
> +    my ($conf) = @_;
> +
> +    my $hugepages_topology = {};
> +
> +    return if !$conf->{numa};
> +
> +    my $defaults = PVE::QemuServer::load_defaults();
> +    my $memory = $conf->{memory} || $defaults->{memory};
> +    my $static_memory = 0;
> +    my $sockets = 1;
> +    $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer iused
> +    $sockets = $conf->{sockets} if $conf->{sockets};
> +    my $numa_custom_topology = undef;
> +    my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? $conf->{hotplug} : '1');
> +
> +    if ($hotplug_features->{memory}) {
> +        $static_memory = $STATICMEM;
> +    } else {
> +        $static_memory = $memory;
> +    }
> +
> +    #custom numa topology
> +    for (my $i = 0; $i < $MAX_NUMA; $i++) {
> +	next if !$conf->{"numa$i"};
> +	my $numa = parse_numa($conf->{"numa$i"});
> +	next if !$numa;
> +
> +	$numa_custom_topology = 1;
> +	my $numa_memory = $numa->{memory};
>  
> +        my $hugepages_size = hugepages_size($conf, $numa_memory);
> +        $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
> +
> +    }
> +
> +    #if no custom numa tology, we split memory and cores across numa nodes
> +    if(!$numa_custom_topology) {
> +
> +	my $numa_memory = ($static_memory / $sockets);
> +
> +	for (my $i = 0; $i < $sockets; $i++)  {
> +
> +	    my $hugepages_size = hugepages_size($conf, $numa_memory);
> +	    $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
> +	}
> +    }
> +
> +    if ($hotplug_features->{memory}) {
> +	foreach_dimm($conf, undef, $memory, $sockets, sub {
> +	    my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
> +
> +	    my $hugepages_size = hugepages_size($conf, $dimm_size);
> +	    $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
> +	});
> +    }
> +
> +    return $hugepages_topology;
> +}
> +
> +sub hugepages_host_topology {
> +
> +    #read host hugepages
> +    my $hugepages_host_topology = {};
> +
> +    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', sub {
> +	my ($nodepath, $numanode) = @_;
> +
> +	dir_glob_foreach("/sys/devices/system/node/$nodepath/hugepages/", 'hugepages\-(\d+)kB', sub {
> +	    my ($hugepages_path, $hugepages_size) = @_;
> +
> +	    $hugepages_size = $hugepages_size / 1024;
> +	    my $hugepages_nr = PVE::Tools::file_read_firstline("/sys/devices/system/node/$nodepath/hugepages/$hugepages_path/nr_hugepages");
> +	    $hugepages_host_topology->{$hugepages_size}->{$numanode} = $hugepages_nr;
> +        });
> +    });
> +
> +    return $hugepages_host_topology;
> +}
> +
> +sub hugepages_allocate {
> +    my ($hugepages_topology, $hugepages_host_topology) = @_;
> +
> +    #allocate new hupages if needed
> +    foreach my $size (sort keys %$hugepages_topology) {
> +
> +	my $nodes = $hugepages_topology->{$size};
> +
> +	foreach my $numanode (keys %$nodes) {
> +
> +	    my $hugepages_size = $size * 1024;
> +	    my $hugepages_requested = $hugepages_topology->{$size}->{$numanode};
> +	    my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
> +	    my $hugepages_free = PVE::Tools::file_read_firstline($path."free_hugepages");
> +	    my $hugepages_nr = PVE::Tools::file_read_firstline($path."nr_hugepages");

When adding more numaX entries to the VM's config than the host has NUMA
nodes, this now produces a 'Use of uninitialized value' error.
Better to check whether /sys/devices/system/node/node$numanode exists
and throw a useful error.
But should this even be tied to host nodes? Without hugepages I was
able to provide more, smaller NUMA nodes to the guest (iow. split one big
host NUMA node into multiple smaller virtual ones), should this not work
with hugepages, too?
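
If we do want to require matching host nodes, something like this at the
top of the per-node loop (again just a sketch) would turn the
uninitialized-value warning into a useful error:

    # sketch: complain if the guest topology references a NUMA node (or a
    # hugepage size) the host doesn't actually provide
    my $sysfs_node = "/sys/devices/system/node/node${numanode}";
    die "host NUMA node$numanode doesn't exist\n" if ! -d $sysfs_node;
    die "host NUMA node$numanode has no ${hugepages_size}kB hugepages\n"
	if ! -d "$sysfs_node/hugepages/hugepages-${hugepages_size}kB";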

> +
> +	    if ($hugepages_requested > $hugepages_free) {
> +		my $hugepages_needed = $hugepages_requested - $hugepages_free;
> +		PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr + $hugepages_needed);
> +		#verify that is correctly allocated
> +		$hugepages_free = PVE::Tools::file_read_firstline($path."free_hugepages");
> +		if ($hugepages_free < $hugepages_requested) {
> +		    #rollback to initial host config
> +		    hugepages_reset($hugepages_host_topology);
> +		    die "hugepage allocation fail";
> +		}
> +	    }
> +
> +	}
> +    }
> +
> +}
> +
> +sub hugepages_deallocate {
> +    my ($hugepages_topology) = @_;

I wonder how important this part really is: as far as I understand it,
increasing nr_hugepages shouldn't *reserve* them, but simply allow
allocating them, so once qemu exits they're free for normal use, too.
This number AFAIK is just an upper limit for how many pages can be
allocated in total. At least I can write 100 into the 1G nr_hugepages,
which then ends up somewhere around 8 if I only have 8G of free RAM, and
I can still use the rest of the memory with other processes.
The main issues with cleanups are that we a) don't do this when qemu
powers down by itself and b) might be racing against other applications
(which can also happen at startup, but seems unavoidable there...).

We really need a stop hook... but the question there is *how*...
(Even if we added a hook to kvm, it would be nice to have something for
the case where it gets SIGKILLed).
One possibility would be adding an inotify handler onto the qemu.slice
to pvestatd... I've also looked quickly over the systemd dbus API but
there was no obvious stop signal one could listen to...

> +
> +    foreach my $size (sort keys %$hugepages_topology) {
> +
> +	my $nodes = $hugepages_topology->{$size};
> +
> +	foreach my $numanode (keys %$nodes) {
> +
> +	    my $hugepages_size = $size * 1024;
> +	    my $hugepages_used = $hugepages_topology->{$size}->{$numanode};
> +	    my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
> +	    my $hugepages_nr = PVE::Tools::file_read_firstline($path."nr_hugepages");
> +
> +	    PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", ($hugepages_nr - $hugepages_used));
> +	}
> +    }
> +}
> +
> +sub hugepages_reset {
> +    my ($hugepages_topology) = @_;
> +
> +    foreach my $size (sort keys %$hugepages_topology) {
> +
> +	my $nodes = $hugepages_topology->{$size};
> +	foreach my $numanode (keys %$nodes) {
> +
> +	    my $hugepages_nr = $hugepages_topology->{$size}->{$numanode};
> +	    my $hugepages_size = $size * 1024;
> +	    my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
> +
> +	    PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr);
> +	}
> +    }
> +}
> +
> +sub hugepages_update_locked {
> +    my ($code, @param) = @_;
> +
> +    my $timeout = 60; #could be long if a lot of hugepages need to be alocated
> +
> +    my $lock_filename = "/var/lock/hugepages.lck";
> +
> +    my $res = lock_file($lock_filename, $timeout, $code, @param);
> +    die $@ if $@;
> +
> +    return $res;
> +}
>  1;
>  
> -- 
> 2.1.4



