[pve-devel] [PATCH ha-manager 6/7] move cfs update to common code

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Nov 22 11:53:11 CET 2017


We updated the CRM and LRM view of the cluster state only in the PVE2
environment, outside of all regression testing and simulation scope.

Further, we ignored a failure of this update and happily worked with an
empty state, resulting in strange actions, e.g., the removal of all
(not so) "stale" services, or changing the state of all nodes but the
master's to unknown.

This patch tries to improve this by moving the update out into its own
environment method, cluster_state_update, calling it in the LRM and
CRM and saving its result.
With our introduced functionality to simulate cfs rw or update
errors, we can also simulate failures of this state update with the RT
system.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
 src/PVE/HA/CRM.pm                              |  4 ++++
 src/PVE/HA/Env.pm                              |  7 +++++++
 src/PVE/HA/Env/PVE2.pm                         | 15 +++++++++++++--
 src/PVE/HA/LRM.pm                              |  3 +++
 src/PVE/HA/Sim/Env.pm                          |  7 +++++++
 src/test/test-cfs-unavailable2/README          |  1 +
 src/test/test-cfs-unavailable2/cmdlist         |  5 +++++
 src/test/test-cfs-unavailable2/hardware_status |  5 +++++
 src/test/test-cfs-unavailable2/manager_status  |  1 +
 src/test/test-cfs-unavailable2/service_config  |  5 +++++
 10 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 src/test/test-cfs-unavailable2/README
 create mode 100644 src/test/test-cfs-unavailable2/cmdlist
 create mode 100644 src/test/test-cfs-unavailable2/hardware_status
 create mode 100644 src/test/test-cfs-unavailable2/manager_status
 create mode 100644 src/test/test-cfs-unavailable2/service_config

diff --git a/src/PVE/HA/CRM.pm b/src/PVE/HA/CRM.pm
index 21a0acc..d149c58 100644
--- a/src/PVE/HA/CRM.pm
+++ b/src/PVE/HA/CRM.pm
@@ -28,6 +28,7 @@ sub new {
 	haenv => $haenv,
 	manager => undef,
 	status => { state => 'startup' },
+	cluster_state_update => 0,
     }, $class;
 
     $self->set_local_status({ state => 'wait_for_quorum' });
@@ -146,6 +147,8 @@ sub do_one_iteration {
 
     $haenv->loop_start_hook();
 
+    $self->{cluster_state_update} = $haenv->cluster_state_update();
+
     my $res = $self->work();
 
     $haenv->loop_end_hook();
@@ -243,6 +246,7 @@ sub work {
 		$shutdown = 1;
 
 	    } else {
+
 		$manager->manage();
 	    }
 	};
diff --git a/src/PVE/HA/Env.pm b/src/PVE/HA/Env.pm
index 55f6684..5c20037 100644
--- a/src/PVE/HA/Env.pm
+++ b/src/PVE/HA/Env.pm
@@ -209,6 +209,13 @@ sub loop_end_hook {
     return $self->{plug}->loop_end_hook(@args);
 }
 
+sub cluster_state_update {
+    my ($self) = @_;
+
+    return $self->{plug}->cluster_state_update();
+}
+
+
 sub watchdog_open {
     my ($self) = @_;
 
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index 8baf2d0..9d198b9 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -348,9 +348,8 @@ sub sleep_until {
 sub loop_start_hook {
     my ($self) = @_;
 
-    PVE::Cluster::cfs_update();
-
     $self->{loop_start} = $self->get_time();
+
 }
 
 sub loop_end_hook {
@@ -361,6 +360,18 @@ sub loop_end_hook {
     warn "loop take too long ($delay seconds)\n" if $delay > 30;
 }
 
+sub cluster_state_update {
+    my ($self) = @_;
+
+    eval { PVE::Cluster::cfs_update(1) };
+    if (my $err = $@) {
+	$self->log('warn', "cluster file system update failed - $err");
+	return 0;
+    }
+
+    return 1;
+}
+
 my $watchdog_fh;
 
 sub watchdog_open {
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index 0301fce..3d09f0b 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -34,6 +34,7 @@ sub new {
 	shutdown_errors => 0,
 	# mode can be: active, reboot, shutdown, restart
 	mode => 'active',
+	cluster_state_update => 0,
     }, $class;
 
     $self->set_local_status({ state => 	'wait_for_agent_lock' });   
@@ -219,6 +220,8 @@ sub do_one_iteration {
 
     $haenv->loop_start_hook();
 
+    $self->{cluster_state_update} = $haenv->cluster_state_update();
+
     my $res = $self->work();
 
     $haenv->loop_end_hook();
diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index 34848b1..7344b04 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -366,6 +366,13 @@ sub loop_end_hook {
     # do nothing, overwrite in subclass
 }
 
+
+sub cluster_state_update {
+    my ($self) = @_;
+
+    return $self->{hardware}->get_cfs_state($self->{nodename}, 'update');
+}
+
 sub watchdog_open {
     my ($self) = @_;
 
diff --git a/src/test/test-cfs-unavailable2/README b/src/test/test-cfs-unavailable2/README
new file mode 100644
index 0000000..6fe7fc6
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/README
@@ -0,0 +1 @@
+Test a cfs update behavior, e.g., cfs_update fails (temporarily)
diff --git a/src/test/test-cfs-unavailable2/cmdlist b/src/test/test-cfs-unavailable2/cmdlist
new file mode 100644
index 0000000..590215d
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/cmdlist
@@ -0,0 +1,5 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "cfs node1 update fail", "service vm:101 stopped" ],
+    [ "cfs node1 update work" ]
+]
diff --git a/src/test/test-cfs-unavailable2/hardware_status b/src/test/test-cfs-unavailable2/hardware_status
new file mode 100644
index 0000000..451beb1
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/hardware_status
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-cfs-unavailable2/manager_status b/src/test/test-cfs-unavailable2/manager_status
new file mode 100644
index 0000000..0967ef4
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/manager_status
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-cfs-unavailable2/service_config b/src/test/test-cfs-unavailable2/service_config
new file mode 100644
index 0000000..70f11d6
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/service_config
@@ -0,0 +1,5 @@
+{
+    "vm:101": { "node": "node1", "state": "enabled" },
+    "vm:102": { "node": "node2" },
+    "vm:103": { "node": "node3", "state": "enabled" }
+}
-- 
2.11.0





More information about the pve-devel mailing list