diff --git a/actions.yaml b/actions.yaml index c7a00cb151..b2e9e5c009 100644 --- a/actions.yaml +++ b/actions.yaml @@ -45,6 +45,13 @@ set-tls-private-key: create-backup: description: Create a database backup using xtrabackup. S3 credentials are retrieved from a relation with the S3 integrator charm. + params: + force: + type: boolean + default: False + description: | + Whether to ignore cluster health concerns and create the backup regardless. + Use it with caution, as it can potentially create a backup from stale data. list-backups: description: List available backup_ids in the S3 bucket and path provided by the S3 integrator charm. diff --git a/docs/how-to/back-up-and-restore/create-a-backup.md b/docs/how-to/back-up-and-restore/create-a-backup.md index 4dacdb2497..f6829af1cd 100644 --- a/docs/how-to/back-up-and-restore/create-a-backup.md +++ b/docs/how-to/back-up-and-restore/create-a-backup.md @@ -15,9 +15,18 @@ Once `juju status` shows Charmed MySQL as `active` and `idle`, you can create yo juju run mysql/leader create-backup ``` -If you have a cluster of one unit, you can run the `create-backup` action on `mysql-k8s/leader` (which will also be the primary unit). +If you have a cluster of one unit, you can run the `create-backup` action on the leader (which will also be the primary unit). +Otherwise, you must run the `create-backup` action on a non-primary unit. To find the primary, check `juju status` or +run `juju run mysql/leader get-cluster-status`. -Otherwise, you must run the `create-backup` action on a non-primary unit. To find the primary, see `juju status` or run `juju run mysql-k8s/leader get-cluster-status` to find the primary unit). 
+The `create-backup` action validates that the unit in charge of the backup is healthy, by: +- Checking that the MySQL cluster is in a valid state (`OK` or `OK_PARTIAL` from the InnoDB [cluster status](https://dev.mysql.com/doc/mysql-shell/8.0/en/monitoring-innodb-cluster.html)) +- Checking that the MySQL instance is in a valid state (`ONLINE` from Replication [member states](https://dev.mysql.com/doc/refman/8.0/en/group-replication-server-states.html)). + +In order to override these precautions, use the `force` flag: +```shell +juju run mysql/leader create-backup force=True +``` ## List backups You can list your available, failed, and in progress backups by running the `list-backups` command: diff --git a/lib/charms/mysql/v0/backups.py b/lib/charms/mysql/v0/backups.py index b4bfdd15f7..281fa204f3 100644 --- a/lib/charms/mysql/v0/backups.py +++ b/lib/charms/mysql/v0/backups.py @@ -57,6 +57,7 @@ def is_unit_blocked(self) -> bool: S3Requirer, ) from charms.mysql.v0.mysql import ( + MySQLClusterState, MySQLConfigureInstanceError, MySQLCreateClusterError, MySQLCreateClusterSetError, @@ -66,6 +67,7 @@ def is_unit_blocked(self) -> bool: MySQLExecuteBackupCommandsError, MySQLInitializeJujuOperationsTableError, MySQLKillSessionError, + MySQLMemberState, MySQLNoMemberStateError, MySQLOfflineModeAndHiddenInstanceExistsError, MySQLPrepareBackupForRestoreError, @@ -111,7 +113,7 @@ def is_unit_blocked(self) -> bool: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 16 +LIBPATCH = 17 ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE = "S3 repository claimed by another cluster" MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR = ( @@ -280,6 +282,7 @@ def _pre_create_backup_checks(self, event: ActionEvent) -> bool: def _on_create_backup(self, event: ActionEvent) -> None: """Handle the create backup action.""" logger.info("A backup has been requested on unit") + force = event.params.get("force", 
False) if not self._pre_create_backup_checks(event): return @@ -295,9 +298,16 @@ def _on_create_backup(self, event: ActionEvent) -> None: backup_path = str(pathlib.Path(s3_parameters["path"]) / datetime_backup_requested) + # Check if this cluster can perform backup + can_cluster_perform_backup, validation_message = self._can_cluster_perform_backup() + if not (can_cluster_perform_backup or force): + logger.error(f"Backup failed: {validation_message}") + event.fail(validation_message or "") + return + # Check if this unit can perform backup can_unit_perform_backup, validation_message = self._can_unit_perform_backup() - if not can_unit_perform_backup: + if not (can_unit_perform_backup or force): logger.error(f"Backup failed: {validation_message}") event.fail(validation_message or "") return @@ -355,6 +365,21 @@ def _on_create_backup(self, event: ActionEvent) -> None: }) self.charm._on_update_status(None) + def _can_cluster_perform_backup(self) -> tuple[bool, str | None]: + """Validates whether this cluster can perform a backup. + + Returns: tuple of (success, error_message) + """ + cluster_status = self.charm._mysql.get_cluster_status() + if not cluster_status: + return False, "Cluster status unknown" + + cluster_status = cluster_status["defaultreplicaset"]["status"] + if cluster_status not in [MySQLClusterState.OK, MySQLClusterState.OK_PARTIAL]: + return False, "Cluster is not in a healthy state" + + return True, None + def _can_unit_perform_backup(self) -> tuple[bool, str | None]: """Validates whether this unit can perform a backup. 
@@ -381,7 +406,7 @@ def _can_unit_perform_backup(self) -> tuple[bool, str | None]: if role == "primary" and self.charm.app.planned_units() > 1: return False, "Unit cannot perform backups as it is the cluster primary" - if state in ["recovering", "offline", "error"]: + if state not in [MySQLMemberState.ONLINE]: return False, f"Unit cannot perform backups as its state is {state}" return True, None diff --git a/lib/charms/mysql/v0/mysql.py b/lib/charms/mysql/v0/mysql.py index d80d0b23dc..cd08144ba4 100644 --- a/lib/charms/mysql/v0/mysql.py +++ b/lib/charms/mysql/v0/mysql.py @@ -127,7 +127,7 @@ def wait_until_mysql_connection(self) -> None: # Increment this major API version when introducing breaking changes LIBAPI = 0 -LIBPATCH = 93 +LIBPATCH = 94 UNIT_TEARDOWN_LOCKNAME = "unit-teardown" UNIT_ADD_LOCKNAME = "unit-add" @@ -916,11 +916,7 @@ def get_cluster_endpoints(self, relation_name: str) -> tuple[str, str, str]: return ",".join(rw_endpoints), ",".join(ro_endpoints), ",".join(no_endpoints) - def get_secret( - self, - scope: Scopes, - key: str, - ) -> str | None: + def get_secret(self, scope: Scopes, key: str) -> str | None: """Get secret from the secret storage. Retrieve secret from juju secrets backend if secret exists there. @@ -1012,7 +1008,18 @@ class MySQLMemberState(str, enum.Enum): class MySQLClusterState(str, enum.Enum): """MySQL Cluster state.""" + # TODO: python 3.11 has new enum.StrEnum + # that can remove str inheritance + OK = "ok" + OK_PARTIAL = "ok_partial" + OK_NO_TOLERANCE = "ok_no_tolerance" + OK_NO_TOLERANCE_PARTIAL = "ok_no_tolerance_partial" + NO_QUORUM = "no_quorum" + OFFLINE = "offline" + ERROR = "error" + UNREACHABLE = "unreachable" + UNKNOWN = "unknown" FENCED = "fenced_writes" diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 1a2b919b6a..6252fdd715 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -2,7 +2,7 @@ # See LICENSE file for licensing details. 
import unittest -from unittest.mock import MagicMock, PropertyMock, patch +from unittest.mock import MagicMock, patch from charms.mysql.v0.mysql import ( MySQLConfigureInstanceError, @@ -155,7 +155,12 @@ def test_on_list_backups_failure(self, _list_backups_in_s3_path, _retrieve_s3_pa return_value=({"path": "/path"}, []), ) @patch( - "charms.mysql.v0.backups.MySQLBackups._can_unit_perform_backup", return_value=(True, None) + "charms.mysql.v0.backups.MySQLBackups._can_cluster_perform_backup", + return_value=(True, None), + ) + @patch( + "charms.mysql.v0.backups.MySQLBackups._can_unit_perform_backup", + return_value=(True, None), ) @patch("charms.mysql.v0.backups.upload_content_to_s3") @patch("charms.mysql.v0.backups.MySQLBackups._pre_backup", return_value=(True, None)) @@ -170,6 +175,7 @@ def test_on_create_backup( _pre_backup, _upload_content_to_s3, _can_unit_perform_backup, + _can_cluster_perform_backup, _retrieve_s3_parameters, _datetime, _update_status, @@ -191,6 +197,7 @@ def test_on_create_backup( self.mysql_backups._on_create_backup(event) _retrieve_s3_parameters.assert_called_once() + _can_cluster_perform_backup.assert_called_once() _can_unit_perform_backup.assert_called_once() _upload_content_to_s3.assert_called_once_with( expected_metadata, f"{expected_backup_path}.metadata", expected_s3_params @@ -208,7 +215,12 @@ def test_on_create_backup( return_value=({"path": "/path"}, []), ) @patch( - "charms.mysql.v0.backups.MySQLBackups._can_unit_perform_backup", return_value=(True, None) + "charms.mysql.v0.backups.MySQLBackups._can_cluster_perform_backup", + return_value=(True, None), + ) + @patch( + "charms.mysql.v0.backups.MySQLBackups._can_unit_perform_backup", + return_value=(True, None), ) @patch("charms.mysql.v0.backups.upload_content_to_s3") @patch("charms.mysql.v0.backups.MySQLBackups._pre_backup", return_value=(True, None)) @@ -223,6 +235,7 @@ def test_on_create_backup_failure( _pre_backup, _upload_content_to_s3, _can_unit_perform_backup, + 
_can_cluster_perform_backup, _retrieve_s3_parameters, _datetime, ): @@ -272,6 +285,7 @@ def test_on_create_backup_failure( # test failure with _can_unit_perform_backup _can_unit_perform_backup.return_value = False, "can unit perform backup failure" event = MagicMock() + event.params = {"force": False} self.charm.unit.status = ActiveStatus() self.mysql_backups._on_create_backup(event) @@ -279,6 +293,17 @@ def test_on_create_backup_failure( event.fail.assert_called_once_with("can unit perform backup failure") self.assertTrue(isinstance(self.harness.model.unit.status, ActiveStatus)) + # test failure with _can_cluster_perform_backup + _can_cluster_perform_backup.return_value = False, "can cluster perform backup failure" + event = MagicMock() + event.params = {"force": False} + self.charm.unit.status = ActiveStatus() + + self.mysql_backups._on_create_backup(event) + event.set_results.assert_not_called() + event.fail.assert_called_once_with("can cluster perform backup failure") + self.assertTrue(isinstance(self.harness.model.unit.status, ActiveStatus)) + # test failure with _retrieve_s3_parameters _retrieve_s3_parameters.return_value = False, ["bucket"] event = MagicMock() @@ -309,6 +334,32 @@ def test_on_create_backup_failure( event.fail.assert_called_once_with("Missing relation with S3 integrator charm") self.assertTrue(isinstance(self.harness.model.unit.status, ActiveStatus)) + @patch("mysql_vm_helpers.MySQL.get_cluster_status") + def test_can_cluster_perform_backup(self, _get_cluster_status): + """Test _can_cluster_perform_backup().""" + _get_cluster_status.return_value = {"defaultreplicaset": {"status": "ok"}} + + success, error_message = self.mysql_backups._can_cluster_perform_backup() + self.assertTrue(success) + self.assertIsNone(error_message) + + @patch("mysql_vm_helpers.MySQL.get_cluster_status") + def test_can_cluster_perform_backup_failure(self, _get_cluster_status): + """Test failure of _can_cluster_perform_backup().""" + # test unknown state + 
_get_cluster_status.return_value = None + + success, error_message = self.mysql_backups._can_cluster_perform_backup() + self.assertFalse(success) + self.assertEqual(error_message, "Cluster status unknown") + + # test error state + _get_cluster_status.return_value = {"defaultreplicaset": {"status": "error"}} + + success, error_message = self.mysql_backups._can_cluster_perform_backup() + self.assertFalse(success) + self.assertEqual(error_message, "Cluster is not in a healthy state") + @patch("mysql_vm_helpers.MySQL.offline_mode_and_hidden_instance_exists", return_value=False) @patch("mysql_vm_helpers.MySQL.get_member_state", return_value=("online", "replica")) def test_can_unit_perform_backup( @@ -538,10 +589,7 @@ def test_pre_restore_checks( ): """Test _pre_restore_checks().""" event_mock = MagicMock() - type(event_mock).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - + event_mock.params = {"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} self.assertTrue(self.mysql_backups._pre_restore_checks(event_mock)) @patch("mysql_vm_helpers.MySQL.is_server_connectable", return_value=True) @@ -559,10 +607,7 @@ def test_pre_restore_checks_failure( # test more than one planned units self.harness.add_relation_unit(self.peer_relation_id, "mysql/1") event = MagicMock() - type(event).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - + event.params = {"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} self.assertFalse(self.mysql_backups._pre_restore_checks(event)) self.harness.remove_relation_unit(self.peer_relation_id, "mysql/1") @@ -570,40 +615,24 @@ def test_pre_restore_checks_failure( # test unit in blocked state _is_unit_busy.return_value = True event = MagicMock() - type(event).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - + event.params = {"restore-to-time": 
"2025-02-10 12:30:30", "backup-id": "test-id"} self.assertFalse(self.mysql_backups._pre_restore_checks(event)) # test mysqld not running _is_server_connectable.return_value = False event = MagicMock() - type(event).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - + event.params = {"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} self.assertFalse(self.mysql_backups._pre_restore_checks(event)) # test missing backup-id event = MagicMock() - type(event).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - - params_mock = {} - with patch.dict(params_mock, {}): - type(event).params = PropertyMock(return_value=params_mock) - - self.assertFalse(self.mysql_backups._pre_restore_checks(event)) + event.params = {} + self.assertFalse(self.mysql_backups._pre_restore_checks(event)) # test missing s3-integrator relation self.harness.remove_relation(self.s3_integrator_id) event = MagicMock() - type(event).params = PropertyMock( - return_value={"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} - ) - + event.params = {"restore-to-time": "2025-02-10 12:30:30", "backup-id": "test-id"} self.assertFalse(self.mysql_backups._pre_restore_checks(event)) @patch("charm.MySQLOperatorCharm._on_update_status") @@ -630,12 +659,8 @@ def test_on_restore( ): """Test _on_restore().""" event = MagicMock() - params_mock = {} - - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) expected_s3_parameters = {"path": "/path"} @@ -678,11 +703,8 @@ def test_on_restore_failure( _post_restore.return_value = (False, "post restore error") event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = 
PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("post restore error") @@ -691,11 +713,8 @@ def test_on_restore_failure( _restore.return_value = (False, True, "restore error") event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("restore error") @@ -707,11 +726,8 @@ def test_on_restore_failure( _restore.return_value = (False, False, "restore error") event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("restore error") @@ -721,11 +737,8 @@ def test_on_restore_failure( # test failure of _pre_restore() _pre_restore.return_value = (False, "pre restore error") event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("pre restore error") @@ -734,11 +747,8 @@ def test_on_restore_failure( _fetch_and_check_existence_of_s3_path.return_value = False event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - 
type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("Invalid backup-id: test-backup-id") @@ -747,11 +757,8 @@ def test_on_restore_failure( _retrieve_s3_parameters.return_value = ({}, ["bucket"]) event = MagicMock() - params_mock = {} - with patch.dict(params_mock, {"backup-id": "test-backup-id"}): - type(event).params = PropertyMock(return_value=params_mock) - - self.mysql_backups._on_restore(event) + event.params = {"backup-id": "test-backup-id"} + self.mysql_backups._on_restore(event) event.set_results.assert_not_called() event.fail.assert_called_once_with("Missing S3 parameters: ['bucket']")