[pve-devel] [PATCH ha-manager 1/3] add ignore state for resources

Thomas Lamprecht t.lamprecht at proxmox.com
Thu Nov 24 16:58:46 CET 2016


In this state the resource will not get touched by us; all commands
(like start/stop/migrate) go directly to the VM/CT itself and not
through the HA stack.
The resource will not get recovered if its node fails.

Achieve that by simply removing the respective service from the
manager_status service status hash if it is in ignored state.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/Config.pm                          |  5 ++-
 src/PVE/HA/Manager.pm                         | 14 ++++++--
 src/PVE/HA/Resources.pm                       | 10 +++++-
 src/PVE/HA/Sim/TestHardware.pm                |  5 +--
 src/test/test-service-ignore1/README          |  2 ++
 src/test/test-service-ignore1/cmdlist         |  4 +++
 src/test/test-service-ignore1/hardware_status |  5 +++
 src/test/test-service-ignore1/log.expect      | 25 +++++++++++++++
 src/test/test-service-ignore1/manager_status  |  1 +
 src/test/test-service-ignore1/service_config  |  3 ++
 src/test/test-service-ignore2/README          |  4 +++
 src/test/test-service-ignore2/cmdlist         |  6 ++++
 src/test/test-service-ignore2/hardware_status |  5 +++
 src/test/test-service-ignore2/log.expect      | 46 +++++++++++++++++++++++++++
 src/test/test-service-ignore2/manager_status  |  1 +
 src/test/test-service-ignore2/service_config  |  3 ++
 16 files changed, 133 insertions(+), 6 deletions(-)
 create mode 100644 src/test/test-service-ignore1/README
 create mode 100644 src/test/test-service-ignore1/cmdlist
 create mode 100644 src/test/test-service-ignore1/hardware_status
 create mode 100644 src/test/test-service-ignore1/log.expect
 create mode 100644 src/test/test-service-ignore1/manager_status
 create mode 100644 src/test/test-service-ignore1/service_config
 create mode 100644 src/test/test-service-ignore2/README
 create mode 100644 src/test/test-service-ignore2/cmdlist
 create mode 100644 src/test/test-service-ignore2/hardware_status
 create mode 100644 src/test/test-service-ignore2/log.expect
 create mode 100644 src/test/test-service-ignore2/manager_status
 create mode 100644 src/test/test-service-ignore2/service_config

diff --git a/src/PVE/HA/Config.pm b/src/PVE/HA/Config.pm
index a7a7e30..8999495 100644
--- a/src/PVE/HA/Config.pm
+++ b/src/PVE/HA/Config.pm
@@ -201,7 +201,10 @@ my $service_check_ha_state = sub {
     my ($conf, $sid, $has_state) = @_;
 
     if (my $d = $conf->{ids}->{$sid}) {
-	return 1 if !defined($has_state);
+	if (!defined($has_state)) {
+	    return 0 if defined($d->{state}) && $d->{state} eq 'ignored';
+	    return 1;
+	}
 
 	# backward compatibility
 	$has_state = 'started' if $has_state eq 'enabled';
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 7b11be4..eeda5d6 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -381,6 +381,8 @@ sub manage {
     foreach my $sid (sort keys %$sc) {
 	next if $ss->{$sid}; # already there
 	my $cd = $sc->{$sid};
+	next if defined($cd->{state}) && $cd->{state} eq 'ignored';
+
 	$haenv->log('info', "adding new service '$sid' on node '$cd->{node}'");
 	# assume we are running to avoid relocate running service at add
 	my $state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
@@ -390,8 +392,16 @@ sub manage {
 
     # remove stale service from manager state
     foreach my $sid (keys %$ss) {
-	next if $sc->{$sid};
-	$haenv->log('info', "removing stale service '$sid' (no config)");
+	my $removal_reason = 'no config';
+	if (my $cd = $sc->{$sid}) {
+	    if (defined($cd->{state}) && $cd->{state} eq 'ignored') {
+		$removal_reason = 'in ignored state';
+	    } else {
+		next; # still ha managed
+	    }
+	}
+
+	$haenv->log('info', "removing stale service '$sid' ($removal_reason)");
 	# remove all service related state information
 	delete $ss->{$sid};
     }
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 3a3746e..3334210 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -17,7 +17,7 @@ my $defaultData = {
 				   { completion => \&PVE::HA::Tools::complete_sid }),
 	state => {
 	    type => 'string',
-	    enum => ['started', 'stopped', 'enabled', 'disabled'],
+	    enum => ['started', 'stopped', 'enabled', 'disabled', 'ignored'],
 	    optional => 1,
 	    default => 'started',
 	    description => "Requested resource state.",
@@ -44,6 +44,14 @@ to relocate the resources on node failures. The main purpose of this
 state is error recovery, because it is the only way to move a resource out
 of the `error` state.
 
+`ignored`;;
+
+The resource gets removed from the manager status and so the CRM and the LRM
+do not touch the resource anymore. All {pve} API calls affecting this
+resource will be executed directly on it. CRM commands will be thrown away
+while the resource is in this state. The resource will not get relocated on
+node failures.
+
 EODESC
 	},
 	group => get_standard_option('pve-ha-group-id',
diff --git a/src/PVE/HA/Sim/TestHardware.pm b/src/PVE/HA/Sim/TestHardware.pm
index 50aef0c..11eca17 100644
--- a/src/PVE/HA/Sim/TestHardware.pm
+++ b/src/PVE/HA/Sim/TestHardware.pm
@@ -87,7 +87,7 @@ sub log {
 # reboot <node>
 # shutdown <node>
 # restart-lrm <node>
-# service <sid> <started|disabled|stopped>
+# service <sid> <started|disabled|stopped|ignored>
 # service <sid> <migrate|relocate> <target>
 # service <sid> lock/unlock [lockname]
 
@@ -175,7 +175,8 @@ sub sim_hardware_cmd {
 	    }
 
 	} elsif ($cmd eq 'service') {
-	    if ($action eq 'started' || $action eq 'disabled' || $action eq 'stopped') {
+	    if ($action eq 'started' || $action eq 'disabled' ||
+		$action eq 'stopped' || $action eq 'ignored') {
 
 		$self->set_service_state($sid, $action);
 
diff --git a/src/test/test-service-ignore1/README b/src/test/test-service-ignore1/README
new file mode 100644
index 0000000..2a01fce
--- /dev/null
+++ b/src/test/test-service-ignore1/README
@@ -0,0 +1,2 @@
+Test a user-triggered service ignore, the service should get removed from the
+manager status with a meaningful log message
diff --git a/src/test/test-service-ignore1/cmdlist b/src/test/test-service-ignore1/cmdlist
new file mode 100644
index 0000000..597e469
--- /dev/null
+++ b/src/test/test-service-ignore1/cmdlist
@@ -0,0 +1,4 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service vm:103 ignored" ]
+]
diff --git a/src/test/test-service-ignore1/hardware_status b/src/test/test-service-ignore1/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-service-ignore1/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-service-ignore1/log.expect b/src/test/test-service-ignore1/log.expect
new file mode 100644
index 0000000..da05ee1
--- /dev/null
+++ b/src/test/test-service-ignore1/log.expect
@@ -0,0 +1,25 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 ignored
+info    120    node1/crm: removing stale service 'vm:103' (in ignored state)
+info    720     hardware: exit simulation - done
diff --git a/src/test/test-service-ignore1/manager_status b/src/test/test-service-ignore1/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-service-ignore1/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-service-ignore1/service_config b/src/test/test-service-ignore1/service_config
new file mode 100644
index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-service-ignore1/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
diff --git a/src/test/test-service-ignore2/README b/src/test/test-service-ignore2/README
new file mode 100644
index 0000000..c605145
--- /dev/null
+++ b/src/test/test-service-ignore2/README
@@ -0,0 +1,4 @@
+Set the request state of a service to ignored. Then simulate a node failure
+through network outage. The HA stack should not touch the 'ignored' service.
+
+Set the service to 'started' again; now its node should be fenced and the service recovered.
diff --git a/src/test/test-service-ignore2/cmdlist b/src/test/test-service-ignore2/cmdlist
new file mode 100644
index 0000000..4cf29f4
--- /dev/null
+++ b/src/test/test-service-ignore2/cmdlist
@@ -0,0 +1,6 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service vm:103 ignored" ],
+    [ "network node3 off" ],
+    [ "service vm:103 started" ]
+]
diff --git a/src/test/test-service-ignore2/hardware_status b/src/test/test-service-ignore2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-service-ignore2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-service-ignore2/log.expect b/src/test/test-service-ignore2/log.expect
new file mode 100644
index 0000000..0bebba5
--- /dev/null
+++ b/src/test/test-service-ignore2/log.expect
@@ -0,0 +1,46 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 ignored
+info    120    node1/crm: removing stale service 'vm:103' (in ignored state)
+info    220      cmdlist: execute network node3 off
+info    220    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    224    node3/crm: status change slave => wait_for_quorum
+info    225    node3/lrm: status change active => lost_agent_lock
+info    266     watchdog: execute power node3 off
+info    265    node3/crm: killed by poweroff
+info    266    node3/lrm: killed by poweroff
+info    266     hardware: server 'node3' stopped by poweroff (watchdog)
+info    320      cmdlist: execute service vm:103 started
+info    320    node1/crm: adding new service 'vm:103' on node 'node3'
+info    320    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info    320    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    340    node1/crm: got lock 'ha_agent_node3_lock'
+info    340    node1/crm: fencing: acknowledged - got agent lock for node 'node3'
+info    340    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    340    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+info    340    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
+info    341    node1/lrm: got lock 'ha_agent_node1_lock'
+info    341    node1/lrm: status change wait_for_agent_lock => active
+info    341    node1/lrm: starting service vm:103
+info    341    node1/lrm: service status vm:103 started
+info    920     hardware: exit simulation - done
diff --git a/src/test/test-service-ignore2/manager_status b/src/test/test-service-ignore2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-service-ignore2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-service-ignore2/service_config b/src/test/test-service-ignore2/service_config
new file mode 100644
index 0000000..c6860e7
--- /dev/null
+++ b/src/test/test-service-ignore2/service_config
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
-- 
2.1.4





More information about the pve-devel mailing list