[pve-devel] [PATCH ha-manager] TestHardware: correct shutdown/reboot behaviour of CRM and LRM

Thomas Lamprecht t.lamprecht at proxmox.com
Mon Jan 18 10:26:45 CET 2016


Instead of shutting down the LRM and then killing the CRM, we now
also make a shutdown request to the CRM. This mirrors the real-world
behaviour much better and also lets us test the lock release from
the CRM.

To accomplish this we add new sim_hardware commands for stopping and
starting the CRM.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Sim/TestHardware.pm          | 41 ++++++++++++++++++++++++-----
 src/test/test-reboot1/log.expect        | 13 +++++-----
 src/test/test-shutdown1/log.expect      | 29 +++++++++++----------
 src/test/test-shutdown2/log.expect      | 29 +++++++++++----------
 src/test/test-shutdown3/log.expect      | 29 +++++++++++----------
 src/test/test-shutdown4/README          |  5 ++++
 src/test/test-shutdown4/cmdlist         |  4 +++
 src/test/test-shutdown4/hardware_status |  5 ++++
 src/test/test-shutdown4/log.expect      | 46 +++++++++++++++++++++++++++++++++
 src/test/test-shutdown4/manager_status  |  1 +
 src/test/test-shutdown4/service_config  |  3 +++
 11 files changed, 151 insertions(+), 54 deletions(-)
 create mode 100644 src/test/test-shutdown4/README
 create mode 100644 src/test/test-shutdown4/cmdlist
 create mode 100644 src/test/test-shutdown4/hardware_status
 create mode 100644 src/test/test-shutdown4/log.expect
 create mode 100644 src/test/test-shutdown4/manager_status
 create mode 100644 src/test/test-shutdown4/service_config

diff --git a/src/PVE/HA/Sim/TestHardware.pm b/src/PVE/HA/Sim/TestHardware.pm
index d7f4efb..cfd48e7 100644
--- a/src/PVE/HA/Sim/TestHardware.pm
+++ b/src/PVE/HA/Sim/TestHardware.pm
@@ -160,6 +160,19 @@ sub sim_hardware_cmd {
 		$d->{lrm_restart} = 1;
 		$d->{lrm}->shutdown_request();
 	    }
+	} elsif ($cmd eq 'crm') {
+
+	    if ($action eq 'stop') {
+		if ($d->{crm}) {
+		    $d->{crm_stop} = 1;
+		    $d->{crm}->shutdown_request();
+		}
+	    } elsif ($action eq 'start') {
+		$d->{crm} = PVE::HA::CRM->new($d->{crm_env}) if !$d->{crm};
+	    } else {
+		die "sim_hardware_cmd: unknown action '$action'";
+	    }
+
 	} elsif ($cmd eq 'service') {
 	    if ($action eq 'enabled' || $action eq 'disabled') {
 
@@ -221,12 +234,30 @@ sub run {
 
 		$d->{crm_env}->loop_start_hook($self->get_time());
 
-		die "implement me (CRM exit)" if !$crm->do_one_iteration();
+		my $exit_crm = !$crm->do_one_iteration();
 
 		$d->{crm_env}->loop_end_hook();
 
 		my $nodetime = $d->{crm_env}->get_time();
 		$self->{cur_time} = $nodetime if $nodetime > $self->{cur_time};
+
+		if ($exit_crm) {
+		    $d->{crm_env}->log('info', "exit (loop end)");
+		    $d->{crm} = undef;
+
+		    my $cstatus = $self->read_hardware_status_nolock();
+		    my $nstatus = $cstatus->{$node} || die "no node status for node '$node'";
+		    my $shutdown = $nstatus->{shutdown} || '';
+		    if ($shutdown eq 'reboot') {
+			$self->sim_hardware_cmd("power $node off", 'reboot');
+			$self->sim_hardware_cmd("power $node on", 'reboot');
+		    } elsif ($shutdown eq 'shutdown') {
+			$self->sim_hardware_cmd("power $node off", 'shutdown');
+		    } elsif (!$d->{crm_stop}) {
+			die "unexpected CRM exit - not implemented"
+		    }
+		    $d->{crm_stop} = undef;
+		}
 	    }
 
 	    if (my $lrm = $d->{lrm}) {
@@ -250,11 +281,9 @@ sub run {
 			die "lrm restart during shutdown - not implemented" if $shutdown;
 			$d->{lrm_restart} = undef;
 			$d->{lrm} = PVE::HA::LRM->new($d->{lrm_env});
-		    } elsif ($shutdown eq 'reboot') {
-			$self->sim_hardware_cmd("power $node off", 'reboot');
-			$self->sim_hardware_cmd("power $node on", 'reboot');
-		    } elsif ($shutdown eq 'shutdown') {
-			$self->sim_hardware_cmd("power $node off", 'shutdown');
+		    } elsif ($shutdown eq 'reboot' || $shutdown eq 'shutdown') {
+			# exit the LRM before the CRM to reflect real world behaviour
+			$self->sim_hardware_cmd("crm $node stop", $shutdown);
 		    } else {
 			die "unexpected LRM exit - not implemented"
 		    }
diff --git a/src/test/test-reboot1/log.expect b/src/test/test-reboot1/log.expect
index 12c3fe5..840f56d 100644
--- a/src/test/test-reboot1/log.expect
+++ b/src/test/test-reboot1/log.expect
@@ -25,14 +25,15 @@ info    120    node3/lrm: shutdown LRM, stop all services
 info    125    node3/lrm: stopping service vm:103
 info    125    node3/lrm: service status vm:103 stopped
 info    126    node3/lrm: exit (loop end)
-info    126       reboot: execute power node3 off
-info    125    node3/crm: killed by poweroff
-info    126       reboot: execute power node3 on
-info    125    node3/crm: status change startup => wait_for_quorum
-info    126    node3/lrm: status change startup => wait_for_agent_lock
-info    144    node3/crm: status change wait_for_quorum => slave
+info    126       reboot: execute crm node3 stop
+info    145    node3/crm: exit (loop end)
+info    145       reboot: execute power node3 off
+info    145       reboot: execute power node3 on
+info    145    node3/crm: status change startup => wait_for_quorum
+info    140    node3/lrm: status change startup => wait_for_agent_lock
 info    145    node3/lrm: got lock 'ha_agent_node3_lock'
 info    145    node3/lrm: status change wait_for_agent_lock => active
 info    145    node3/lrm: starting service vm:103
 info    145    node3/lrm: service status vm:103 started
+info    164    node3/crm: status change wait_for_quorum => slave
 info    720     hardware: exit simulation - done
diff --git a/src/test/test-shutdown1/log.expect b/src/test/test-shutdown1/log.expect
index 5c063ab..76f5133 100644
--- a/src/test/test-shutdown1/log.expect
+++ b/src/test/test-shutdown1/log.expect
@@ -25,18 +25,19 @@ info    120    node3/lrm: shutdown LRM, stop all services
 info    125    node3/lrm: stopping service vm:103
 info    125    node3/lrm: service status vm:103 stopped
 info    126    node3/lrm: exit (loop end)
-info    126     shutdown: execute power node3 off
-info    125    node3/crm: killed by poweroff
-info    140    node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info    180    node1/crm: service 'vm:103': state changed from 'started' to 'fence' 
-info    180    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info    180    node1/crm: got lock 'ha_agent_node3_lock'
-info    180    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info    180    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info    180    node1/crm: service 'vm:103': state changed from 'fence' to 'stopped' 
-info    180    node1/crm: service 'vm:103': state changed from 'stopped' to 'started'  (node = node1)
-info    181    node1/lrm: got lock 'ha_agent_node1_lock'
-info    181    node1/lrm: status change wait_for_agent_lock => active
-info    181    node1/lrm: starting service vm:103
-info    181    node1/lrm: service status vm:103 started
+info    126     shutdown: execute crm node3 stop
+info    145    node3/crm: exit (loop end)
+info    145     shutdown: execute power node3 off
+info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    200    node1/crm: service 'vm:103': state changed from 'started' to 'fence' 
+info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    200    node1/crm: got lock 'ha_agent_node3_lock'
+info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    200    node1/crm: service 'vm:103': state changed from 'fence' to 'stopped' 
+info    200    node1/crm: service 'vm:103': state changed from 'stopped' to 'started'  (node = node1)
+info    201    node1/lrm: got lock 'ha_agent_node1_lock'
+info    201    node1/lrm: status change wait_for_agent_lock => active
+info    201    node1/lrm: starting service vm:103
+info    201    node1/lrm: service status vm:103 started
 info    720     hardware: exit simulation - done
diff --git a/src/test/test-shutdown2/log.expect b/src/test/test-shutdown2/log.expect
index b367b64..4b90294 100644
--- a/src/test/test-shutdown2/log.expect
+++ b/src/test/test-shutdown2/log.expect
@@ -25,20 +25,21 @@ info    120    node3/lrm: shutdown LRM, stop all services
 info    125    node3/lrm: stopping service vm:103
 info    125    node3/lrm: service status vm:103 stopped
 info    126    node3/lrm: exit (loop end)
-info    126     shutdown: execute power node3 off
-info    125    node3/crm: killed by poweroff
-info    140    node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info    180    node1/crm: service 'vm:103': state changed from 'started' to 'fence' 
-info    180    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info    180    node1/crm: got lock 'ha_agent_node3_lock'
-info    180    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info    180    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info    180    node1/crm: service 'vm:103': state changed from 'fence' to 'stopped' 
-info    180    node1/crm: service 'vm:103': state changed from 'stopped' to 'started'  (node = node1)
-info    181    node1/lrm: got lock 'ha_agent_node1_lock'
-info    181    node1/lrm: status change wait_for_agent_lock => active
-info    181    node1/lrm: starting service vm:103
-info    181    node1/lrm: service status vm:103 started
+info    126     shutdown: execute crm node3 stop
+info    145    node3/crm: exit (loop end)
+info    145     shutdown: execute power node3 off
+info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    200    node1/crm: service 'vm:103': state changed from 'started' to 'fence' 
+info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    200    node1/crm: got lock 'ha_agent_node3_lock'
+info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    200    node1/crm: service 'vm:103': state changed from 'fence' to 'stopped' 
+info    200    node1/crm: service 'vm:103': state changed from 'stopped' to 'started'  (node = node1)
+info    201    node1/lrm: got lock 'ha_agent_node1_lock'
+info    201    node1/lrm: status change wait_for_agent_lock => active
+info    201    node1/lrm: starting service vm:103
+info    201    node1/lrm: service status vm:103 started
 info    500      cmdlist: execute power node3 on
 info    500    node3/crm: status change startup => wait_for_quorum
 info    500    node3/lrm: status change startup => wait_for_agent_lock
diff --git a/src/test/test-shutdown3/log.expect b/src/test/test-shutdown3/log.expect
index 559cb4f..8ceb042 100644
--- a/src/test/test-shutdown3/log.expect
+++ b/src/test/test-shutdown3/log.expect
@@ -25,20 +25,21 @@ info    120    node3/lrm: shutdown LRM, stop all services
 info    125    node3/lrm: stopping service ct:103
 info    125    node3/lrm: service status ct:103 stopped
 info    126    node3/lrm: exit (loop end)
-info    126     shutdown: execute power node3 off
-info    125    node3/crm: killed by poweroff
-info    140    node1/crm: node 'node3': state changed from 'online' => 'unknown'
-info    180    node1/crm: service 'ct:103': state changed from 'started' to 'fence' 
-info    180    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
-info    180    node1/crm: got lock 'ha_agent_node3_lock'
-info    180    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
-info    180    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info    180    node1/crm: service 'ct:103': state changed from 'fence' to 'stopped' 
-info    180    node1/crm: service 'ct:103': state changed from 'stopped' to 'started'  (node = node1)
-info    181    node1/lrm: got lock 'ha_agent_node1_lock'
-info    181    node1/lrm: status change wait_for_agent_lock => active
-info    181    node1/lrm: starting service ct:103
-info    181    node1/lrm: service status ct:103 started
+info    126     shutdown: execute crm node3 stop
+info    145    node3/crm: exit (loop end)
+info    145     shutdown: execute power node3 off
+info    160    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    200    node1/crm: service 'ct:103': state changed from 'started' to 'fence' 
+info    200    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    200    node1/crm: got lock 'ha_agent_node3_lock'
+info    200    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info    200    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    200    node1/crm: service 'ct:103': state changed from 'fence' to 'stopped' 
+info    200    node1/crm: service 'ct:103': state changed from 'stopped' to 'started'  (node = node1)
+info    201    node1/lrm: got lock 'ha_agent_node1_lock'
+info    201    node1/lrm: status change wait_for_agent_lock => active
+info    201    node1/lrm: starting service ct:103
+info    201    node1/lrm: service status ct:103 started
 info    500      cmdlist: execute power node3 on
 info    500    node3/crm: status change startup => wait_for_quorum
 info    500    node3/lrm: status change startup => wait_for_agent_lock
diff --git a/src/test/test-shutdown4/README b/src/test/test-shutdown4/README
new file mode 100644
index 0000000..0c5fe02
--- /dev/null
+++ b/src/test/test-shutdown4/README
@@ -0,0 +1,5 @@
+This tests if the manager lock gets released AND the services from the node with
+the manager lock get cleanly shut down without changing the state of the service
+in the cluster.
+That means that the powered off node gets fenced by the new master and the
+service will be relocated and started again.
diff --git a/src/test/test-shutdown4/cmdlist b/src/test/test-shutdown4/cmdlist
new file mode 100644
index 0000000..e84297f
--- /dev/null
+++ b/src/test/test-shutdown4/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "shutdown node1" ]
+]
diff --git a/src/test/test-shutdown4/hardware_status b/src/test/test-shutdown4/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-shutdown4/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-shutdown4/log.expect b/src/test/test-shutdown4/log.expect
new file mode 100644
index 0000000..c5564cc
--- /dev/null
+++ b/src/test/test-shutdown4/log.expect
@@ -0,0 +1,46 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:100' on node 'node1'
+info     21    node1/lrm: got lock 'ha_agent_node1_lock'
+info     21    node1/lrm: status change wait_for_agent_lock => active
+info     21    node1/lrm: starting service vm:100
+info     21    node1/lrm: service status vm:100 started
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info    120      cmdlist: execute shutdown node1
+info    120    node1/lrm: shutdown LRM, stop all services
+info    121    node1/lrm: stopping service vm:100
+info    121    node1/lrm: service status vm:100 stopped
+info    122    node1/lrm: exit (loop end)
+info    122     shutdown: execute crm node1 stop
+info    140    node1/crm: voluntary release CRM lock
+info    141    node1/crm: exit (loop end)
+info    141     shutdown: execute power node1 off
+info    141    node2/crm: got lock 'ha_manager_lock'
+info    141    node2/crm: status change slave => master
+info    141    node2/crm: node 'node1': state changed from 'online' => 'unknown'
+info    220    node2/crm: service 'vm:100': state changed from 'started' to 'fence' 
+info    220    node2/crm: node 'node1': state changed from 'unknown' => 'fence'
+info    220    node2/crm: got lock 'ha_agent_node1_lock'
+info    220    node2/crm: fencing: acknowleged - got agent lock for node 'node1'
+info    220    node2/crm: node 'node1': state changed from 'fence' => 'unknown'
+info    220    node2/crm: service 'vm:100': state changed from 'fence' to 'stopped' 
+info    220    node2/crm: service 'vm:100': state changed from 'stopped' to 'started'  (node = node2)
+info    221    node2/lrm: got lock 'ha_agent_node2_lock'
+info    221    node2/lrm: status change wait_for_agent_lock => active
+info    221    node2/lrm: starting service vm:100
+info    221    node2/lrm: service status vm:100 started
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-shutdown4/manager_status b/src/test/test-shutdown4/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-shutdown4/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-shutdown4/service_config b/src/test/test-shutdown4/service_config
new file mode 100644
index 0000000..01d6242
--- /dev/null
+++ b/src/test/test-shutdown4/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:100": { "node": "node1", "state": "enabled" }
+}
-- 
2.1.4





More information about the pve-devel mailing list