[pve-devel] [PATCH ha-manager 1/5] Fence: cleanup class

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Apr 8 16:51:17 CEST 2016


A rather big cleanup I already had in a separate branch.

Changes:
* make process fencing private, it will be called by run_fence_jobs
  (was start_fencing) if the node has already a fence job deployed.
* fix bugs in process_fencing where if multiple parallel device
  succeeded at once only one was counted, the same for failed devices
* restrict waitpid to fence_job worker pids (instead of -1) else
  we could get some pretty nasty side effects
* remove 'resed_hard', 'reset' and 'bailout', they are replaced by
  kill_and_cleanup_jobs
* some comment and formatting changes

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Fence.pm | 303 +++++++++++++++++++++++++++-------------------------
 1 file changed, 158 insertions(+), 145 deletions(-)

diff --git a/src/PVE/HA/Fence.pm b/src/PVE/HA/Fence.pm
index 9a6c1fb..3376cb1 100644
--- a/src/PVE/HA/Fence.pm
+++ b/src/PVE/HA/Fence.pm
@@ -6,83 +6,13 @@ use POSIX qw( WNOHANG );
 use PVE::HA::FenceConfig;
 use Data::Dumper;
 
-
- # pid's and additional info of fence processes
+# pid's and additional info of fence processes
 my $fence_jobs = {};
 
 # fence state of a node
 my $fenced_nodes = {};
 
-sub has_fencing_job { # update for parallel fencing
-    my ($node) = @_;
-
-    foreach my $job (values %$fence_jobs) {
-	return 1 if ($job->{node} eq $node);
-    }
-    return undef;
-}
-
-my $virtual_pid = 0; # hack for test framework
-
-sub start_fencing {
-    my ($haenv, $node, $try) = @_;
-
-    $try = 0 if !defined($try) || $try<0;
-
-    my $fence_cfg = $haenv->read_fence_config();
-    my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
-
-    if (!$commands) {
-	$haenv->log('err', "no commands for node '$node'");
-	$fenced_nodes->{$node}->{failure} = 1;
-	return 0;
-    }
-
-    $haenv->log('notice', "Start fencing of node '$node'");
-
-    my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
-
-    $fenced_nodes->{$node}->{needed} = scalar @$commands;
-    $fenced_nodes->{$node}->{triggered} = 0;
-
-    for my $cmd (@$commands)
-    {
-	my $cmd_str = "$cmd->{agent} " .
-	    PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
-	$haenv->log('notice', "[fence '$node'] execute fence command: $cmd_str");
-
-	if ($can_fork) {
-	    my $pid = fork();
-	    if (!defined($pid)) {
-		$haenv->log('err', "forking fence job failed");
-		return 0;
-	    } elsif ($pid==0) { # child
-		$haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
-		exit(-1);
-	    } else {
-		$fence_jobs->{$pid} = {cmd=>$cmd_str, node=>$node, try=>$try};
-	    }
-	} else {
-	    my $res = -1;
-	    eval {
-		$res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
-		$res = $res << 8 if $res > 0;
-	    };
-	    if (my $err = $@) {
-		$haenv->log('err', $err);
-	    }
-
-	    $virtual_pid++;
-	    $fence_jobs->{$virtual_pid} = {cmd => $cmd_str, node => $node,
-					   try => $try, ec => $res};
-	}
-    }
-
-    return 1;
-}
-
-
-# check childs and process exit status
+# picks up/checks children and processes exit status
 my $check_jobs = sub {
     my ($haenv) = @_;
 
@@ -91,65 +21,53 @@ my $check_jobs = sub {
 
     my @finished = ();
 
-    # pick up all finsihed childs if we can fork
     if ($haenv->get_max_workers() > 0) {
-	while((my $res = waitpid(-1, WNOHANG))>0) {
-	    $fence_jobs->{$res}->{ec} = $? if $fence_jobs->{$res};
-	    push @finished, $res;
+	# pick up all finished children if we can fork
+	foreach my $pid (keys %$fence_jobs) {
+
+	    my $waitpid = waitpid($pid, WNOHANG);
+	    if (defined($waitpid) && ($waitpid == $pid)) {
+		$fence_jobs->{$waitpid}->{ec} = $? if $fence_jobs->{$waitpid};
+		push @finished, $waitpid;
+	    }
+
 	}
     } else {
-	@finished = keys %{$fence_jobs};
+	# all jobs are already finished when not forking
+	@finished = keys %$fence_jobs;
     }
 
-    #    while((my $res = waitpid(-1, WNOHANG))>0) {
     foreach my $res (@finished) {
-	if (my $job = $fence_jobs->{$res}) {
-	    my $ec = $job->{ec};
-
-	    my $status = {
-		exit_code => $ec,
-		cmd => $job->{cmd},
-		try => $job->{try}
-	    };
-
-	    if ($ec == 0) {
-		$succeeded->{$job->{node}} = $status;
+	my $job = $fence_jobs->{$res};
+	my $node = $job->{node};
+
+	my $status = {
+	    exit_code => $job->{ec},
+	    cmd => $job->{cmd},
+	};
+
+	if ($job->{ec} == 0) {
+	    # succeeded jobs doesn't need the status for now
+	    $succeeded->{$node} = $succeeded->{$node} || 0;
+	    $succeeded->{$node} ++;
+	} else {
+	    # failed jobs per node have the same try value, store only one
+	    if (defined($failed->{$node})) {
+		push @{$failed->{$node}->{jobs}}, $status;
 	    } else {
-		$failed->{$job->{node}} = $status;
+		$failed->{$node}->{try} = $job->{try};
+		$failed->{$node}->{jobs} = [ $status ];
 	    }
-
-	    delete $fence_jobs->{$res};
-
-	} else {
-	    warn "exit from unknown child (PID=$res)";
 	}
 
+	delete $fence_jobs->{$res};
     }
 
     return ($succeeded, $failed);
 };
 
-
-my $reset_hard = sub {
-    my ($haenv, $node) = @_;
-
-    while (my ($pid, $job) = each %$fence_jobs) {
-	next if $job->{node} ne $node;
-
-	if ($haenv->max_workers() > 0) {
-	    kill KILL => $pid;
-	    # fixme maybe use an timeout even if kill should not hang?
-	    waitpid($pid, 0); # pick it up directly
-	}
-	delete $fence_jobs->{$pid};
-    }
-
-    delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
-};
-
-
 # pick up jobs and process them
-sub process_fencing {
+my $process_fencing = sub {
     my ($haenv) = @_;
 
     my $fence_cfg = $haenv->read_fence_config();
@@ -158,62 +76,157 @@ sub process_fencing {
 
     foreach my $node (keys %$succeeded) {
 	# count how many fence devices succeeded
-	# this is needed for parallel devices
-	$fenced_nodes->{$node}->{triggered}++;
+	$fenced_nodes->{$node}->{triggered} += $succeeded->{$node};
     }
 
     # try next device for failed jobs
-    while(my ($node, $job) = each %$failed) {
-	$haenv->log('err', "fence job failed: '$job->{cmd}' returned '$job->{exit_code}'");
+    foreach my $node (keys %$failed) {
+	my @failed_jobs = @{$failed->{$node}->{jobs}};
+	my $try = $failed->{$node}->{try};
 
-	while($job->{try} < PVE::HA::FenceConfig::count_devices($node, $fence_cfg) )
-	{
-	    &$reset_hard($haenv, $node);
-	    $job->{try}++;
+	foreach my $job (@failed_jobs) {
+	    $haenv->log('err', "fence job failed: '$job->{cmd}' returned " .
+			"'$job->{exit_code}'");
+	}
 
-	    return if start_fencing($node, $job->{try});
+	# check if any devices are left to try
+	while ($try < PVE::HA::FenceConfig::count_devices($node, $fence_cfg)) {
+	    # clean up the other parallel jobs, if any, as at least one failed
+	    kill_and_cleanup_jobs($haenv, $node);
 
-	    $haenv->log('warn', "Couldn't start fence try '$job->{try}'");
+	    $try++; # try next available device
+	    return if start_fencing($node, $try);
+
+	    $haenv->log('warn', "couldn't start fence try '$try'");
 	}
 
-	    $haenv->log('err', "Tried all fence devices\n");
-	    # fixme: returnproper exit code so CRM waits for the agent lock
+	$fenced_nodes->{$node}->{failure} = 1;
+	$haenv->log('err', "tried all fence devices for node '$node'");
+    }
+};
+
+sub has_fencing_job {
+    my ($node) = @_;
+
+    foreach my $job (values %$fence_jobs) {
+	return 1 if ($job->{node} eq $node);
     }
+    return undef;
 }
 
+my $virtual_pid = 0; # hack for test framework
 
-sub is_node_fenced {
-    my ($node) = @_;
+sub run_fence_jobs {
+    my ($haenv, $node, $try) = @_;
 
-    my $state = $fenced_nodes->{$node};
-    return 0 if !$state;
+    if (defined($node) && !has_fencing_job($node)) {
+	# start new fencing job(s)
+	$try = 0 if !defined($try) || ($try < 0);
 
-    return -1 if $state->{failure} && $state->{failure} == 1;
+	my $fence_cfg = $haenv->read_fence_config();
+	my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
 
-    return ($state->{needed} && $state->{triggered} &&
-	   $state->{triggered} >= $state->{needed}) ? 1 : 0;
-}
+	if (!$commands) {
+	    $haenv->log('err', "no fence commands for node '$node'");
+	    $fenced_nodes->{$node}->{failure} = 1;
+	    return 0;
+	}
 
+	$haenv->log('notice', "Start fencing node '$node'");
+
+	my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
+
+	# when parallel devices are configured all must succeed
+	$fenced_nodes->{$node}->{needed} = scalar(@$commands);
+	$fenced_nodes->{$node}->{triggered} = 0;
+
+	for my $cmd (@$commands) {
+	    my $cmd_str = "$cmd->{agent} " .
+		PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
+	    $haenv->log('notice', "[fence '$node'] execute cmd: $cmd_str");
+
+	    if ($can_fork) {
+		my $pid = fork();
+		if (!defined($pid)) {
+		    $haenv->log('err', "forking fence job failed");
+		    return 0;
+		} elsif ($pid == 0) {
+		    $haenv->after_fork(); # cleanup child
+
+		    $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
+		    exit(-1);
+		} else {
+
+		    $fence_jobs->{$pid} = {
+			cmd => $cmd_str,
+			node => $node,
+			try => $try
+		    };
+
+		}
+	    } else { # for test framework
+		my $res = -1;
+		eval {
+		    $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
+		    $res = $res << 8 if $res > 0;
+		};
+		if (my $err = $@) {
+		    $haenv->log('err', $err);
+		}
+
+		$virtual_pid++;
+		$fence_jobs->{$virtual_pid} = {
+		    cmd => $cmd_str,
+		    node => $node,
+		    try => $try,
+		    ec => $res,
+		};
+	    }
+	}
 
-sub reset {
-    my ($node, $noerr) = @_;
+	return 1;
 
-    delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
+    } else {
+	# node has already fence jobs deployed, collect finished jobs
+	# and check their result
+	&$process_fencing($haenv);
+
+    }
 }
 
+# if $node is undef we kill and cleanup *all* jobs from all nodes
+sub kill_and_cleanup_jobs {
+    my ($haenv, $node) = @_;
 
-sub bail_out {
-    my ($haenv) = @_;
+    while (my ($pid, $job) = each %$fence_jobs) {
+	next if defined($node) && $job->{node} ne $node;
 
-    if ($haenv->max_workers() > 0) {
-	foreach my $pid (keys %$fence_jobs) {
+	if ($haenv->max_workers() > 0) {
 	    kill KILL => $pid;
-	    waitpid($pid, 0); # has to come back directly
+	    # fixme maybe use an timeout even if kill should not hang?
+	    waitpid($pid, 0);
 	}
+	delete $fence_jobs->{$pid};
     }
 
-    $fenced_nodes = {};
-    $fence_jobs = {};
+    if (defined($node) && $fenced_nodes->{$node}) {
+	delete $fenced_nodes->{$node};
+    } else {
+	$fenced_nodes = {};
+	$fence_jobs = {};
+    }
+};
+
+sub is_node_fenced {
+    my ($node) = @_;
+
+    my $state = $fenced_nodes->{$node};
+    return 0 if !$state;
+
+    return -1 if $state->{failure} && $state->{failure} == 1;
+
+    return ($state->{needed} && $state->{triggered} &&
+	    $state->{triggered} >= $state->{needed}) ? 1 : 0;
 }
 
 1;
-- 
2.1.4





More information about the pve-devel mailing list