[pve-devel] [PATCH ha-manager 1/5] Fence: cleanup class

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Apr 8 17:07:23 CEST 2016



On 04/08/2016 04:51 PM, Thomas Lamprecht wrote:
> A rather big cleanup I already had in a separate branch.
>
> Changes:
> * make process fencing private, it will be called by run_fence_jobs
>   (was start_fencing) if the node has already a fence job deployed.
> * fix bugs in process_fencing where if multiple parallel device
>   succeeded at once only one was counted, the same for failed devices
> * restrict waitpid to fence_job worker pids (instead of -1) else
>   we could get some pretty nasty side effects
> * remove 'resed_hard', 'reset' and 'bailout', they are replaced by
>   kill_and_cleanup_jobs
> * some comment and formatting changes
>
> Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
> ---
>  src/PVE/HA/Fence.pm | 303 +++++++++++++++++++++++++++-------------------------
>  1 file changed, 158 insertions(+), 145 deletions(-)
>
> diff --git a/src/PVE/HA/Fence.pm b/src/PVE/HA/Fence.pm
> index 9a6c1fb..3376cb1 100644
> --- a/src/PVE/HA/Fence.pm
> +++ b/src/PVE/HA/Fence.pm
> @@ -6,83 +6,13 @@ use POSIX qw( WNOHANG );
>  use PVE::HA::FenceConfig;
>  use Data::Dumper;
>  
> -
> - # pid's and additional info of fence processes
> +# pid's and additional info of fence processes
>  my $fence_jobs = {};
>  
>  # fence state of a node
>  my $fenced_nodes = {};
>  
> -sub has_fencing_job { # update for parallel fencing
> -    my ($node) = @_;
> -
> -    foreach my $job (values %$fence_jobs) {
> -	return 1 if ($job->{node} eq $node);
> -    }
> -    return undef;
> -}
> -
> -my $virtual_pid = 0; # hack for test framework
> -
> -sub start_fencing {
> -    my ($haenv, $node, $try) = @_;
> -
> -    $try = 0 if !defined($try) || $try<0;
> -
> -    my $fence_cfg = $haenv->read_fence_config();
> -    my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
> -
> -    if (!$commands) {
> -	$haenv->log('err', "no commands for node '$node'");
> -	$fenced_nodes->{$node}->{failure} = 1;
> -	return 0;
> -    }
> -
> -    $haenv->log('notice', "Start fencing of node '$node'");
> -
> -    my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
> -
> -    $fenced_nodes->{$node}->{needed} = scalar @$commands;
> -    $fenced_nodes->{$node}->{triggered} = 0;
> -
> -    for my $cmd (@$commands)
> -    {
> -	my $cmd_str = "$cmd->{agent} " .
> -	    PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
> -	$haenv->log('notice', "[fence '$node'] execute fence command: $cmd_str");
> -
> -	if ($can_fork) {
> -	    my $pid = fork();
> -	    if (!defined($pid)) {
> -		$haenv->log('err', "forking fence job failed");
> -		return 0;
> -	    } elsif ($pid==0) { # child
> -		$haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> -		exit(-1);
> -	    } else {
> -		$fence_jobs->{$pid} = {cmd=>$cmd_str, node=>$node, try=>$try};
> -	    }
> -	} else {
> -	    my $res = -1;
> -	    eval {
> -		$res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> -		$res = $res << 8 if $res > 0;
> -	    };
> -	    if (my $err = $@) {
> -		$haenv->log('err', $err);
> -	    }
> -
> -	    $virtual_pid++;
> -	    $fence_jobs->{$virtual_pid} = {cmd => $cmd_str, node => $node,
> -					   try => $try, ec => $res};
> -	}
> -    }
> -
> -    return 1;
> -}
> -
> -
> -# check childs and process exit status
> +# picks up/checks children and processes exit status
>  my $check_jobs = sub {
>      my ($haenv) = @_;
>  
> @@ -91,65 +21,53 @@ my $check_jobs = sub {
>  
>      my @finished = ();
>  
> -    # pick up all finsihed childs if we can fork
>      if ($haenv->get_max_workers() > 0) {
> -	while((my $res = waitpid(-1, WNOHANG))>0) {
> -	    $fence_jobs->{$res}->{ec} = $? if $fence_jobs->{$res};
> -	    push @finished, $res;
> +	# pick up all finished children if we can fork
> +	foreach my $pid (keys %$fence_jobs) {
> +
> +	    my $waitpid = waitpid($pid, WNOHANG);
> +	    if (defined($waitpid) && ($waitpid == $pid)) {
> +		$fence_jobs->{$waitpid}->{ec} = $? if $fence_jobs->{$waitpid};
> +		push @finished, $waitpid;
> +	    }
> +
>  	}
>      } else {
> -	@finished = keys %{$fence_jobs};
> +	# all jobs are already finished when not forking
> +	@finished = keys %$fence_jobs;
>      }
>  
> -    #    while((my $res = waitpid(-1, WNOHANG))>0) {
>      foreach my $res (@finished) {
> -	if (my $job = $fence_jobs->{$res}) {
> -	    my $ec = $job->{ec};
> -
> -	    my $status = {
> -		exit_code => $ec,
> -		cmd => $job->{cmd},
> -		try => $job->{try}
> -	    };
> -
> -	    if ($ec == 0) {
> -		$succeeded->{$job->{node}} = $status;
> +	my $job = $fence_jobs->{$res};
> +	my $node = $job->{node};
> +
> +	my $status = {
> +	    exit_code => $job->{ec},
> +	    cmd => $job->{cmd},
> +	};
> +
> +	if ($job->{ec} == 0) {
> +	    # succeeded jobs doesn't need the status for now
> +	    $succeeded->{$node} = $succeeded->{$node} || 0;
> +	    $succeeded->{$node} ++;
> +	} else {
> +	    # failed jobs per node have the same try value, store only one
> +	    if (defined($failed->{$node})) {
> +		push @{$failed->{$node}->{jobs}}, $status;
>  	    } else {
> -		$failed->{$job->{node}} = $status;
> +		$failed->{$node}->{try} = $job->{try};
> +		$failed->{$node}->{jobs} = [ $status ];
>  	    }
> -
> -	    delete $fence_jobs->{$res};
> -
> -	} else {
> -	    warn "exit from unknown child (PID=$res)";
>  	}
>  
> +	delete $fence_jobs->{$res};
>      }
>  
>      return ($succeeded, $failed);
>  };
>  
> -
> -my $reset_hard = sub {
> -    my ($haenv, $node) = @_;
> -
> -    while (my ($pid, $job) = each %$fence_jobs) {
> -	next if $job->{node} ne $node;
> -
> -	if ($haenv->max_workers() > 0) {
> -	    kill KILL => $pid;
> -	    # fixme maybe use an timeout even if kill should not hang?
> -	    waitpid($pid, 0); # pick it up directly
> -	}
> -	delete $fence_jobs->{$pid};
> -    }
> -
> -    delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
> -};
> -
> -
>  # pick up jobs and process them
> -sub process_fencing {
> +my $process_fencing = sub {
>      my ($haenv) = @_;
>  
>      my $fence_cfg = $haenv->read_fence_config();
> @@ -158,62 +76,157 @@ sub process_fencing {
>  
>      foreach my $node (keys %$succeeded) {
>  	# count how many fence devices succeeded
> -	# this is needed for parallel devices
> -	$fenced_nodes->{$node}->{triggered}++;
> +	$fenced_nodes->{$node}->{triggered} += $succeeded->{$node};
>      }
>  
>      # try next device for failed jobs
> -    while(my ($node, $job) = each %$failed) {
> -	$haenv->log('err', "fence job failed: '$job->{cmd}' returned '$job->{exit_code}'");
> +    foreach my $node (keys %$failed) {
> +	my @failed_jobs = @{$failed->{$node}->{jobs}};
> +	my $try = $failed->{$node}->{try};
>  
> -	while($job->{try} < PVE::HA::FenceConfig::count_devices($node, $fence_cfg) )
> -	{
> -	    &$reset_hard($haenv, $node);
> -	    $job->{try}++;
> +	foreach my $job (@failed_jobs) {
> +	    $haenv->log('err', "fence job failed: '$job->{cmd}' returned " .
> +			"'$job->{exit_code}'");
> +	}
>  
> -	    return if start_fencing($node, $job->{try});
> +	# check if any devices are left to try
> +	while ($try < PVE::HA::FenceConfig::count_devices($node, $fence_cfg)) {
> +	    # clean up the other parallel jobs, if any, as at least one failed
> +	    kill_and_cleanup_jobs($haenv, $node);
>  
> -	    $haenv->log('warn', "Couldn't start fence try '$job->{try}'");
> +	    $try++; # try next available device
> +	    return if start_fencing($node, $try);
> +
> +	    $haenv->log('warn', "couldn't start fence try '$try'");
>  	}
>  
> -	    $haenv->log('err', "Tried all fence devices\n");
> -	    # fixme: returnproper exit code so CRM waits for the agent lock
> +	$fenced_nodes->{$node}->{failure} = 1;
> +	$haenv->log('err', "tried all fence devices for node '$node'");
> +    }
> +};
> +
> +sub has_fencing_job {
> +    my ($node) = @_;
> +
> +    foreach my $job (values %$fence_jobs) {
> +	return 1 if ($job->{node} eq $node);
>      }
> +    return undef;
>  }
>  
> +my $virtual_pid = 0; # hack for test framework
>  
> -sub is_node_fenced {
> -    my ($node) = @_;
> +sub run_fence_jobs {
> +    my ($haenv, $node, $try) = @_;
>  
> -    my $state = $fenced_nodes->{$node};
> -    return 0 if !$state;
> +    if (defined($node) && !has_fencing_job($node)) {

Ugh, the defined part is wrong and not needed, I swear I corrected that
on the rebase.
I can resend it or a cleanup afterwards, sorry for any inconvenience...

> +	# start new fencing job(s)
> +	$try = 0 if !defined($try) || ($try < 0);
>  
> -    return -1 if $state->{failure} && $state->{failure} == 1;
> +	my $fence_cfg = $haenv->read_fence_config();
> +	my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
>  
> -    return ($state->{needed} && $state->{triggered} &&
> -	   $state->{triggered} >= $state->{needed}) ? 1 : 0;
> -}
> +	if (!$commands) {
> +	    $haenv->log('err', "no fence commands for node '$node'");
> +	    $fenced_nodes->{$node}->{failure} = 1;
> +	    return 0;
> +	}
>  
> +	$haenv->log('notice', "Start fencing node '$node'");
> +
> +	my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
> +
> +	# when parallel devices are configured all must succeed
> +	$fenced_nodes->{$node}->{needed} = scalar(@$commands);
> +	$fenced_nodes->{$node}->{triggered} = 0;
> +
> +	for my $cmd (@$commands) {
> +	    my $cmd_str = "$cmd->{agent} " .
> +		PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
> +	    $haenv->log('notice', "[fence '$node'] execute cmd: $cmd_str");
> +
> +	    if ($can_fork) {
> +		my $pid = fork();
> +		if (!defined($pid)) {
> +		    $haenv->log('err', "forking fence job failed");
> +		    return 0;
> +		} elsif ($pid == 0) {
> +		    $haenv->after_fork(); # cleanup child
> +
> +		    $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> +		    exit(-1);
> +		} else {
> +
> +		    $fence_jobs->{$pid} = {
> +			cmd => $cmd_str,
> +			node => $node,
> +			try => $try
> +		    };
> +
> +		}
> +	    } else { # for test framework
> +		my $res = -1;
> +		eval {
> +		    $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> +		    $res = $res << 8 if $res > 0;
> +		};
> +		if (my $err = $@) {
> +		    $haenv->log('err', $err);
> +		}
> +
> +		$virtual_pid++;
> +		$fence_jobs->{$virtual_pid} = {
> +		    cmd => $cmd_str,
> +		    node => $node,
> +		    try => $try,
> +		    ec => $res,
> +		};
> +	    }
> +	}
>  
> -sub reset {
> -    my ($node, $noerr) = @_;
> +	return 1;
>  
> -    delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
> +    } else {
> +	# node has already fence jobs deployed, collect finished jobs
> +	# and check their result
> +	&$process_fencing($haenv);
> +
> +    }
>  }
>  
> +# if $node is undef we kill and cleanup *all* jobs from all nodes
> +sub kill_and_cleanup_jobs {
> +    my ($haenv, $node) = @_;
>  
> -sub bail_out {
> -    my ($haenv) = @_;
> +    while (my ($pid, $job) = each %$fence_jobs) {
> +	next if defined($node) && $job->{node} ne $node;
>  
> -    if ($haenv->max_workers() > 0) {
> -	foreach my $pid (keys %$fence_jobs) {
> +	if ($haenv->max_workers() > 0) {
>  	    kill KILL => $pid;
> -	    waitpid($pid, 0); # has to come back directly
> +	    # fixme maybe use an timeout even if kill should not hang?
> +	    waitpid($pid, 0);
>  	}
> +	delete $fence_jobs->{$pid};
>      }
>  
> -    $fenced_nodes = {};
> -    $fence_jobs = {};
> +    if (defined($node) && $fenced_nodes->{$node}) {
> +	delete $fenced_nodes->{$node};
> +    } else {
> +	$fenced_nodes = {};
> +	$fence_jobs = {};
> +    }
> +};
> +
> +sub is_node_fenced {
> +    my ($node) = @_;
> +
> +    my $state = $fenced_nodes->{$node};
> +    return 0 if !$state;
> +
> +    return -1 if $state->{failure} && $state->{failure} == 1;
> +
> +    return ($state->{needed} && $state->{triggered} &&
> +	    $state->{triggered} >= $state->{needed}) ? 1 : 0;
>  }
>  
>  1;





More information about the pve-devel mailing list