diff --git a/app.py b/app.py index 45e3767..e9ef03f 100644 --- a/app.py +++ b/app.py @@ -5,10 +5,21 @@ from datetime import datetime, timedelta import json import os -import csv import queue import threading from config_manager import ConfigManager +from drive_health import ( + SMART_FIELDS, + append_telemetry, + collect_hdsentinel_snapshot, + get_hdsentinel_display_snapshot, + get_hdsentinel_settings, + get_optimal_threshold, + get_smart_attributes, + predict_failure_probability, + run_scheduled_drive_health_check, + save_hdsentinel_settings, +) from setup_wizard import setup, install_systemd_tasks import logging from user_manager import UserManager, login_required, admin_required @@ -21,15 +32,6 @@ from tempfile import NamedTemporaryFile from runtime import get_runtime, get_fake_state -try: - import pandas as pd - from xgboost import XGBClassifier - import joblib -except (ImportError, OSError): - pd = None - XGBClassifier = None - joblib = None - app = Flask(__name__) app.secret_key = os.urandom(24) # Required for session management socketio = SocketIO(app) @@ -60,22 +62,7 @@ # Register setup blueprint app.register_blueprint(setup) -# Paths for the XGBoost model and threshold -MODEL_PATH = str(runtime.model_dir / "xgb_model.json") -THRESHOLD_PATH = str(runtime.model_dir / "optimal_threshold_xgb.pkl") - -# Load model and threshold once on startup -model = None -optimal_threshold = 0.5 -if XGBClassifier is not None and joblib is not None: - model = XGBClassifier() - try: - model.load_model(MODEL_PATH) - optimal_threshold = joblib.load(THRESHOLD_PATH) - except Exception as exc: - print(f"Failed to load model: {exc}") - model = None - optimal_threshold = 0.5 +optimal_threshold = get_optimal_threshold(runtime) def read_msmtp_config(): @@ -103,186 +90,6 @@ def read_msmtp_config(): return msmtp_config -def predict_failure_probability(smart): - if model is not None and pd is not None: - df = pd.DataFrame([smart]) - probabilities = model.predict_proba(df) - return float(probabilities[0, 1]) - - if runtime.is_fake: - temperature = float(smart.get('smart_194_raw', 30.0) or 30.0) - reallocated = float(smart.get('smart_5_raw', 0.0) or 0.0) - pending = float(smart.get('smart_197_raw', 0.0) or 0.0) - probability = 0.03 + min(0.25, reallocated * 0.02) + min(0.25, pending * 0.05) - if temperature > 40: - probability += min(0.15, (temperature - 40) * 0.01) - return min(0.95, probability) - - return None - -# SMART attributes used for telemetry/model with their default values and descriptions -SMART_FIELDS = { - "smart_1_raw": { - "default": 0.0, - "name": "Read Error Rate", - "description": "The rate of hardware read errors that occurred when reading data from the disk surface. A non-zero value may indicate problems with the disk surface or read/write heads.", - "short_desc": "Rate of hardware read errors" - }, - "smart_3_raw": { - "default": 0.0, - "name": "Spin-Up Time", - "description": "Average time (in milliseconds) for the disk to spin up from a stopped state to full speed. Higher values may indicate mechanical problems.", - "short_desc": "Time to reach full speed" - }, - "smart_4_raw": { - "default": 0.0, - "name": "Start/Stop Count", - "description": "The number of times the disk has been powered on and off. This is a lifetime counter that increases with each power cycle.", - "short_desc": "Number of power cycles" - }, - "smart_5_raw": { - "default": 0.0, - "name": "Reallocated Sectors Count", - "description": "The number of bad sectors that have been found and remapped. A non-zero value indicates the disk has had some problems, and the value should not increase over time.", - "short_desc": "Number of remapped sectors" - }, - "smart_7_raw": { - "default": 0.0, - "name": "Seek Error Rate", - "description": "The rate of seek errors that occur when the drive's heads try to position themselves over a track. Higher values may indicate mechanical problems.", - "short_desc": "Rate of positioning errors" - }, - "smart_10_raw": { - "default": 0.0, - "name": "Spin Retry Count", - "description": "The number of times the drive had to retry spinning up. A non-zero value indicates problems with the drive's motor or power supply.", - "short_desc": "Number of spin-up retries" - }, - "smart_192_raw": { - "default": 0.0, - "name": "Emergency Retract Count", - "description": "The number of times the drive's heads were retracted due to power loss or other emergency conditions. High values may indicate power problems.", - "short_desc": "Number of emergency head retractions" - }, - "smart_193_raw": { - "default": 0.0, - "name": "Load Cycle Count", - "description": "The number of times the drive's heads have been loaded and unloaded. This is a lifetime counter that increases with each load/unload cycle.", - "short_desc": "Number of head load/unload cycles" - }, - "smart_194_raw": { - "default": 25.0, - "name": "Temperature", - "description": "The current temperature of the drive in Celsius. Normal operating temperature is typically between 30-50°C. Higher temperatures may indicate cooling problems.", - "short_desc": "Current drive temperature" - }, - "smart_197_raw": { - "default": 0.0, - "name": "Current Pending Sectors", - "description": "The number of sectors that are waiting to be remapped. A non-zero value indicates the drive has found bad sectors that it hasn't been able to remap yet.", - "short_desc": "Number of sectors waiting to be remapped" - }, - "smart_198_raw": { - "default": 0.0, - "name": "Offline Uncorrectable", - "description": "The number of sectors that could not be corrected during offline testing. A non-zero value indicates the drive has sectors that are permanently damaged.", - "short_desc": "Number of uncorrectable sectors" - } -} - - -def get_fake_smart_attributes(): - attrs = {field: info["default"] for field, info in SMART_FIELDS.items()} - attrs.update({ - "smart_1_raw": 0.0, - "smart_3_raw": 1420.0, - "smart_4_raw": 321.0, - "smart_5_raw": 0.0, - "smart_7_raw": 0.0, - "smart_10_raw": 0.0, - "smart_192_raw": 2.0, - "smart_193_raw": 145.0, - "smart_194_raw": 31.0, - "smart_197_raw": 0.0, - "smart_198_raw": 0.0, - }) - return attrs, [] - - -def get_smart_attributes(device=None): - """Return SMART attributes as a dictionary using smartctl JSON output.""" - if runtime.is_fake: - return get_fake_smart_attributes() - try: - # If device is not specified, get UUID from config and resolve to device - if device is None: - uuid = config_manager.get_value('backup', 'uuid', None) - if not uuid: - print("No UUID found in config for backup partition.") - return None, None - # Find the partition device with this UUID - blkid_out = subprocess.run(['blkid', '-t', f'UUID={uuid}', '-o', 'device'], capture_output=True, text=True) - partition_device = blkid_out.stdout.strip() - if not partition_device: - print(f"No partition found with UUID {uuid}") - return None, None - # Get the parent drive (e.g. /dev/sda) - device = system_utils.get_parent_device(partition_device) - if not device: - print(f"Could not determine parent device for partition {partition_device}") - return None, None - result = subprocess.run( - ["sudo", "smartctl", "-A", "-j", device], capture_output=True, text=True, check=True - ) - data = json.loads(result.stdout) - - # Initialize dictionary with default values for all required SMART fields - attrs = {field: info["default"] for field, info in SMART_FIELDS.items()} - missing_attrs = set(SMART_FIELDS.keys()) - - # Update with actual values from smartctl output - for item in data.get("ata_smart_attributes", {}).get("table", []): - field_name = f"smart_{item['id']}_raw" - if field_name in SMART_FIELDS: - try: - # Temperature is a special case, it is a 16 bit value, but we only want the lowest byte - # Extract the lowest byte for smart_194_raw (Temperature) - if field_name == "smart_194_raw": - attrs[field_name] = float(int(item["raw"]["value"]) & 0xFF) - else: - attrs[field_name] = float(item["raw"]["value"]) - missing_attrs.remove(field_name) - except (ValueError, KeyError, TypeError): - print(f"Warning: Could not parse value for {field_name}") - continue - - return attrs, list(missing_attrs) - except subprocess.CalledProcessError as e: - print(f"Failed to execute smartctl: {e}") - return None, None - except json.JSONDecodeError as e: - print(f"Failed to parse smartctl JSON output: {e}") - return None, None - except Exception as e: - print(f"Unexpected error reading SMART data: {e}") - return None, None - - -def append_telemetry(data_dict, prediction): - """Append SMART data and prediction to telemetry.csv.""" - if not data_dict: - return - telemetry_path = runtime.telemetry_path - telemetry_path.parent.mkdir(parents=True, exist_ok=True) - file_exists = telemetry_path.exists() - with open(telemetry_path, "a", newline="") as csvfile: - writer = csv.writer(csvfile) - if not file_exists: - writer.writerow(list(SMART_FIELDS.keys()) + ["failure"]) - row = [data_dict.get(field, "") for field in SMART_FIELDS.keys()] - row.append(prediction) - writer.writerow(row) - # a service can either be running right now, or could have succeeded or failed last time it ran class Status: RUNNING = "Running" @@ -834,20 +641,61 @@ def drives(): probability = None error = None smart = None - missing_attrs = None + missing_attrs = [] + settings_message = None + settings_error = None + hdsentinel_settings = get_hdsentinel_settings(config_manager) + hdsentinel_snapshot = get_hdsentinel_display_snapshot( + config_manager, + system_utils, + runtime=runtime, + ) + if request.method == "POST": - result = get_smart_attributes() - if result is None: - error = "Could not retrieve SMART data" + form_action = request.form.get("form_action", "run_health_check") + if form_action == "save_hdsentinel_settings": + try: + save_hdsentinel_settings( + config_manager, + enabled=request.form.get("hdsentinel_enabled") == "on", + health_change_alert=request.form.get("hdsentinel_health_change_alert") == "on", + ) + hdsentinel_settings = get_hdsentinel_settings(config_manager) + # Refresh the snapshot after saving settings so the UI immediately + # reflects the current monitoring state (including when monitoring + # has just been enabled). + hdsentinel_snapshot = collect_hdsentinel_snapshot( + config_manager, + system_utils, + runtime=runtime, + ) + settings_message = "HDSentinel settings saved successfully." + except Exception as exc: + settings_error = f"Failed to save HDSentinel settings: {exc}" else: - smart, missing_attrs = result - prob = predict_failure_probability(smart) - if prob is not None: - prediction = int(prob >= optimal_threshold) - probability = prob - append_telemetry(smart, prediction) + smart, missing_attrs = get_smart_attributes( + config_manager, + system_utils, + runtime=runtime, + ) + if smart is None: + error = "Could not retrieve SMART data" else: - error = "Model not loaded" + prob = predict_failure_probability(smart, runtime=runtime) + if prob is not None: + prediction = int(prob >= optimal_threshold) + probability = prob + append_telemetry(smart, prediction, runtime=runtime) + else: + error = "Model not loaded" + + if hdsentinel_settings["enabled"]: + hdsentinel_snapshot = collect_hdsentinel_snapshot( + config_manager, + system_utils, + runtime=runtime, + ) + return render_template( "drive_health.html", smart=smart, @@ -856,6 +704,10 @@ def drives(): error=error, missing_attrs=missing_attrs, smart_fields=SMART_FIELDS, + hdsentinel_settings=hdsentinel_settings, + hdsentinel_snapshot=hdsentinel_snapshot, + settings_message=settings_message, + settings_error=settings_error, ) @@ -1006,14 +858,28 @@ def run_fake_task(task_name: str, cancel_event: threading.Event): fake_state.append_task_log(task_name, f'Found local backup source at {mount_point}; marking it as connected.') fake_state.append_task_log(task_name, f'Backup source available at {mount_point}.') elif task_name == 'Drive Health Check': - smart, _ = get_smart_attributes() if cancel_event.is_set(): raise RuntimeError('Task was cancelled.') - probability = predict_failure_probability(smart) + result = run_scheduled_drive_health_check(config_manager, system_utils, runtime=runtime) + probability = result.get('probability') if probability is not None: fake_state.append_task_log(task_name, f'Drive health probability: {probability:.4f}') else: fake_state.append_task_log(task_name, 'Model unavailable; using sample SMART data only.') + + hdsentinel_snapshot = result.get('hdsentinel', {}).get('snapshot') + if hdsentinel_snapshot and hdsentinel_snapshot.get('available'): + fake_state.append_task_log( + task_name, + ( + 'HDSentinel status: ' + f"health {hdsentinel_snapshot.get('health_pct')}%, " + f"performance {hdsentinel_snapshot.get('performance_pct')}%, " + f"temperature {hdsentinel_snapshot.get('temperature_c')}C" + ), + ) + elif hdsentinel_snapshot and hdsentinel_snapshot.get('error'): + fake_state.append_task_log(task_name, f"HDSentinel unavailable: {hdsentinel_snapshot['error']}") elif task_name == 'Cloud Backup': run_fake_cloud_backup(cancel_event) @@ -1500,18 +1366,29 @@ def api_storage_status(): @login_required def api_drive_health_summary(): try: - smart, missing_attrs = get_smart_attributes() + smart, missing_attrs = get_smart_attributes(config_manager, system_utils, runtime=runtime) if smart is None: return jsonify({'status': 'unknown', 'probability': None, 'temperature': None}) - prob = predict_failure_probability(smart) + prob = predict_failure_probability(smart, runtime=runtime) if prob is not None: temperature = smart.get('smart_194_raw', None) status = 'good' if prob < optimal_threshold else 'warning' - return jsonify({ + response = { 'status': status, 'probability': prob, 'temperature': temperature - }) + } + hdsentinel_snapshot = get_hdsentinel_display_snapshot(config_manager, system_utils, runtime=runtime) + hdsentinel_settings = get_hdsentinel_settings(config_manager) + hdsentinel_enabled = False + if isinstance(hdsentinel_settings, dict): + hdsentinel_enabled = bool(hdsentinel_settings.get('enabled')) + else: + hdsentinel_enabled = bool(getattr(hdsentinel_settings, 'enabled', False)) + if hdsentinel_enabled and hdsentinel_snapshot and hdsentinel_snapshot.get('available'): + response['hdsentinel_health'] = hdsentinel_snapshot.get('health_pct') + response['hdsentinel_performance'] = hdsentinel_snapshot.get('performance_pct') + return jsonify(response) else: return jsonify({'status': 'unknown', 'probability': None, 'temperature': None}) except Exception as e: diff --git a/config_manager.py b/config_manager.py index 47f4153..985ab1d 100644 --- a/config_manager.py +++ b/config_manager.py @@ -74,7 +74,12 @@ def create_default_config(self): self.config['schedule'] = { 'backup_cloud_time': '3:00' } - + + self.config['hdsentinel'] = { + 'enabled': 'true', + 'health_change_alert': 'true' + } + self.save_config() def save_config(self): diff --git a/docs/drive_health.md b/docs/drive_health.md index c86d7bf..1c76d5b 100644 --- a/docs/drive_health.md +++ b/docs/drive_health.md @@ -1,22 +1,23 @@ # Drive Health -The Drive Health page allows you to check the health of your storage drive using SMART data and a machine learning model. +The Drive Health page combines two views of the configured backup drive: + +- SMART data plus the local machine-learning failure prediction +- HDSentinel health and performance reporting ## Features -- **Error/Warning Alerts**: Shown if SMART data is missing or the model is not loaded. -- **Missing Attributes**: Lists any SMART attributes not available for the drive, with a warning that accuracy may be affected. -- **Prediction Result**: Shows if a failure is predicted, with probability percentage. -- **Run Health Check**: Button to run a new health check (submits the form). -- **SMART Data Table**: Lists all SMART attributes, descriptions, raw values, and status (available/default). -- **Tooltips**: Hover over info icons for detailed attribute descriptions. -- **Download Telemetry**: Button to download SMART telemetry data. -- **Send Telemetry**: Button to email telemetry data. +- **Run Health Check**: Runs a manual SMART and HDSentinel refresh from the page. +- **Prediction Result**: Shows whether the SMART model predicts failure, with probability percentage. +- **Missing Attributes**: Lists SMART attributes that fell back to defaults. +- **HDSentinel Status**: Shows install state, device, model, serial, health, performance, temperature, size, and last checked time. +- **HDSentinel Settings**: Lets users enable or disable HDSentinel monitoring and toggle health-change alerts. +- **SMART Data Table**: Lists SMART attributes, descriptions, raw values, and status. +- **Download Telemetry**: Downloads the SMART telemetry CSV. -## UI Details -- Inline feedback for errors and missing data. -- Table with attribute, description, value, and status. -- Spinners and tooltips for user feedback. +## Alerting +- HDSentinel alerts only trigger on health changes between scheduled checks. +- Temperature is displayed but does not currently trigger alerts. --- -This page helps monitor drive health and predict failures using SMART data. \ No newline at end of file +This page is the main place to inspect the backup drive's current SMART and HDSentinel health data. diff --git a/docs/manual_install.md b/docs/manual_install.md index 5bd4bdd..0654913 100644 --- a/docs/manual_install.md +++ b/docs/manual_install.md @@ -10,7 +10,7 @@ Open a terminal and run: ```bash sudo apt-get update -sudo apt-get install -y python3 python3-pip python3-flask python3-flask-socketio python3-psutil python3-xgboost python3-joblib python3-pandas python3-sklearn python3-cryptography smartmontools samba msmtp rsync curl +sudo apt-get install -y python3 python3-pip python3-flask python3-flask-socketio python3-psutil python3-xgboost python3-joblib python3-pandas python3-sklearn python3-cryptography smartmontools samba msmtp rsync curl unzip ``` ## 2. Install rclone (Official Script) @@ -25,7 +25,24 @@ sudo sh -c "bash $TMPFILE || true" rm -f "$TMPFILE" ``` -## 3. Download and Copy Application Files +## 3. Install HDSentinel + +Install the vendor binary to `/usr/local/bin/hdsentinel`. + +- `amd64`: `https://www.hdsentinel.com/hdslin/hdsentinel-020c-x64.zip` +- `arm64`: `https://www.hdsentinel.com/hdslin/hdsentinel-armv8.zip` + +Example for `amd64`: + +```bash +TMPDIR=$(mktemp -d) +curl -L --fail -o "$TMPDIR/hdsentinel.zip" "https://www.hdsentinel.com/hdslin/hdsentinel-020c-x64.zip" +unzip -o "$TMPDIR/hdsentinel.zip" -d "$TMPDIR" +sudo install -m 755 "$TMPDIR/HDSentinel" /usr/local/bin/hdsentinel +rm -rf "$TMPDIR" +``` + +## 4. Download and Copy Application Files Clone the repository and copy files to `/opt/SimpleSaferServer`: @@ -38,7 +55,7 @@ sudo rsync -a static /opt/SimpleSaferServer/ sudo rsync -a templates /opt/SimpleSaferServer/ ``` -## 4. Install Scripts and Model Files +## 5. Install Scripts and Model Files ```bash sudo mkdir -p /usr/local/bin @@ -50,7 +67,7 @@ sudo mkdir -p /opt/SimpleSaferServer/harddrive_model sudo cp harddrive_model/* /opt/SimpleSaferServer/harddrive_model/ ``` -## 5. Set Up the Systemd Service +## 6. Set Up the Systemd Service ## Email Setup - **Email Address**: Enter the address for alerts. @@ -67,7 +84,7 @@ sudo systemctl enable simple_safer_server_web.service sudo systemctl restart simple_safer_server_web.service ``` -## 6. (Optional) Open Firewall Port 5000 +## 7. (Optional) Open Firewall Port 5000 If you use a firewall, open port 5000: @@ -85,7 +102,7 @@ If you use a firewall, open port 5000: sudo iptables -C INPUT -p tcp --dport 5000 -j ACCEPT 2>/dev/null || sudo iptables -A INPUT -p tcp --dport 5000 -j ACCEPT ``` -## 7. Access the Web UI +## 8. Access the Web UI After installation, open a browser on any device in your network and go to: @@ -109,4 +126,4 @@ Follow the setup wizard to complete configuration. --- -For more help, see the [GitHub repository](https://github.com/chrismin13/SimpleSaferServer) or [landing page](https://sss.chrismin13.com). \ No newline at end of file +For more help, see the [GitHub repository](https://github.com/chrismin13/SimpleSaferServer) or [landing page](https://sss.chrismin13.com). diff --git a/drive_health.py b/drive_health.py new file mode 100644 index 0000000..ad9c3a3 --- /dev/null +++ b/drive_health.py @@ -0,0 +1,745 @@ +import csv +import json +import logging +import os +import re +import shutil +import subprocess +from datetime import datetime +from functools import lru_cache +from pathlib import Path +from tempfile import NamedTemporaryFile + +from runtime import get_runtime + +try: + import pandas as pd + from xgboost import XGBClassifier + import joblib +except (ImportError, OSError): + pd = None + XGBClassifier = None + joblib = None + + +LOGGER = logging.getLogger(__name__) + +SMART_FIELDS = { + "smart_1_raw": { + "default": 0.0, + "name": "Read Error Rate", + "description": "The rate of hardware read errors that occurred when reading data from the disk surface. A non-zero value may indicate problems with the disk surface or read/write heads.", + "short_desc": "Rate of hardware read errors", + }, + "smart_3_raw": { + "default": 0.0, + "name": "Spin-Up Time", + "description": "Average time (in milliseconds) for the disk to spin up from a stopped state to full speed. Higher values may indicate mechanical problems.", + "short_desc": "Time to reach full speed", + }, + "smart_4_raw": { + "default": 0.0, + "name": "Start/Stop Count", + "description": "The number of times the disk has been powered on and off. This is a lifetime counter that increases with each power cycle.", + "short_desc": "Number of power cycles", + }, + "smart_5_raw": { + "default": 0.0, + "name": "Reallocated Sectors Count", + "description": "The number of bad sectors that have been found and remapped. A non-zero value indicates the disk has had some problems, and the value should not increase over time.", + "short_desc": "Number of remapped sectors", + }, + "smart_7_raw": { + "default": 0.0, + "name": "Seek Error Rate", + "description": "The rate of seek errors that occur when the drive's heads try to position themselves over a track. Higher values may indicate mechanical problems.", + "short_desc": "Rate of positioning errors", + }, + "smart_10_raw": { + "default": 0.0, + "name": "Spin Retry Count", + "description": "The number of times the drive had to retry spinning up. A non-zero value indicates problems with the drive's motor or power supply.", + "short_desc": "Number of spin-up retries", + }, + "smart_192_raw": { + "default": 0.0, + "name": "Emergency Retract Count", + "description": "The number of times the drive's heads were retracted due to power loss or other emergency conditions. High values may indicate power problems.", + "short_desc": "Number of emergency head retractions", + }, + "smart_193_raw": { + "default": 0.0, + "name": "Load Cycle Count", + "description": "The number of times the drive's heads have been loaded and unloaded. This is a lifetime counter that increases with each load/unload cycle.", + "short_desc": "Number of head load/unload cycles", + }, + "smart_194_raw": { + "default": 25.0, + "name": "Temperature", + "description": "The current temperature of the drive in Celsius. Normal operating temperature is typically between 30-50°C. Higher temperatures may indicate cooling problems.", + "short_desc": "Current drive temperature", + }, + "smart_197_raw": { + "default": 0.0, + "name": "Current Pending Sectors", + "description": "The number of sectors that are waiting to be remapped. A non-zero value indicates the drive has found bad sectors that it hasn't been able to remap yet.", + "short_desc": "Number of sectors waiting to be remapped", + }, + "smart_198_raw": { + "default": 0.0, + "name": "Offline Uncorrectable", + "description": "The number of sectors that could not be corrected during offline testing. A non-zero value indicates the drive has sectors that are permanently damaged.", + "short_desc": "Number of uncorrectable sectors", + }, +} + +HDSENTINEL_SECTION = "hdsentinel" +HDSENTINEL_DEFAULTS = { + "enabled": True, + "health_change_alert": True, +} + + +def get_fake_smart_attributes(): + attrs = {field: info["default"] for field, info in SMART_FIELDS.items()} + attrs.update( + { + "smart_1_raw": 0.0, + "smart_3_raw": 1420.0, + "smart_4_raw": 321.0, + "smart_5_raw": 0.0, + "smart_7_raw": 0.0, + "smart_10_raw": 0.0, + "smart_192_raw": 2.0, + "smart_193_raw": 145.0, + "smart_194_raw": 31.0, + "smart_197_raw": 0.0, + "smart_198_raw": 0.0, + } + ) + return attrs, [] + + +@lru_cache(maxsize=1) +def _load_model_and_threshold(model_path: str, threshold_path: str): + if XGBClassifier is None or joblib is None: + return None, 0.5 + + model = XGBClassifier() + try: + model.load_model(model_path) + threshold = float(joblib.load(threshold_path)) + return model, threshold + except Exception as exc: + LOGGER.warning("Failed to load drive health model: %s", exc) + return None, 0.5 + + +def get_optimal_threshold(runtime=None): + runtime = runtime or get_runtime() + _, threshold = _load_model_and_threshold( + str(runtime.model_dir / "xgb_model.json"), + str(runtime.model_dir / "optimal_threshold_xgb.pkl"), + ) + return threshold + + +def predict_failure_probability(smart, runtime=None): + runtime = runtime or get_runtime() + model, _ = _load_model_and_threshold( + str(runtime.model_dir / "xgb_model.json"), + str(runtime.model_dir / "optimal_threshold_xgb.pkl"), + ) + + if model is not None and pd is not None: + df = pd.DataFrame([smart]) + probabilities = model.predict_proba(df) + return float(probabilities[0, 1]) + + if runtime.is_fake: + temperature = float(smart.get("smart_194_raw", 30.0) or 30.0) + reallocated = float(smart.get("smart_5_raw", 0.0) or 0.0) + pending = float(smart.get("smart_197_raw", 0.0) or 0.0) + probability = 0.03 + min(0.25, reallocated * 0.02) + min(0.25, pending * 0.05) + if temperature > 40: + probability += min(0.15, (temperature - 40) * 0.01) + return min(0.95, probability) + + return None + + +def resolve_backup_partition_device(config_manager, runtime=None): + runtime = runtime or get_runtime() + if runtime.is_fake: + return "/dev/fakebackup1", None + + uuid = config_manager.get_value("backup", "uuid", None) + if not uuid: + return None, "No backup drive UUID configured." + + blkid_out = subprocess.run( + ["blkid", "-t", f"UUID={uuid}", "-o", "device"], + capture_output=True, + text=True, + ) + partition_device = blkid_out.stdout.strip() + if not partition_device: + return None, f"Backup drive with UUID {uuid} was not found." + + return partition_device, None + + +def resolve_backup_parent_device(config_manager, system_utils, runtime=None): + runtime = runtime or get_runtime() + if runtime.is_fake: + return "/dev/fakebackup", "/dev/fakebackup1", None + + partition_device, error = resolve_backup_partition_device(config_manager, runtime=runtime) + if error: + return None, None, error + + parent_device = system_utils.get_parent_device(partition_device) + if not parent_device: + return None, partition_device, f"Could not determine parent device for {partition_device}." + + return parent_device, partition_device, None + + +def get_smart_attributes(config_manager, system_utils, device=None, runtime=None): + runtime = runtime or get_runtime() + if runtime.is_fake: + return get_fake_smart_attributes() + + try: + if device is None: + device, _, error = resolve_backup_parent_device(config_manager, system_utils, runtime=runtime) + if error: + LOGGER.warning(error) + return None, None + + command = ["smartctl", "-A", "-j", device] + if os.geteuid() != 0 and shutil.which("sudo"): + command.insert(0, "sudo") + result = subprocess.run(command, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + + attrs = {field: info["default"] for field, info in SMART_FIELDS.items()} + missing_attrs = set(SMART_FIELDS.keys()) + + for item in data.get("ata_smart_attributes", {}).get("table", []): + field_name = f"smart_{item['id']}_raw" + if field_name not in SMART_FIELDS: + continue + try: + if field_name == "smart_194_raw": + attrs[field_name] = float(int(item["raw"]["value"]) & 0xFF) + else: + attrs[field_name] = float(item["raw"]["value"]) + missing_attrs.remove(field_name) + except (ValueError, KeyError, TypeError): + LOGGER.warning("Could not parse SMART value for %s", field_name) + + return attrs, list(missing_attrs) + except subprocess.CalledProcessError as exc: + LOGGER.warning("Failed to execute smartctl: %s", exc) + return None, None + except json.JSONDecodeError as exc: + LOGGER.warning("Failed to parse smartctl JSON output: %s", exc) + return None, None + except Exception as exc: + LOGGER.warning("Unexpected SMART read error: %s", exc) + return None, None + + +def append_telemetry(data_dict, prediction, runtime=None): + if not data_dict: + return + + runtime = runtime or get_runtime() + telemetry_path = runtime.telemetry_path + telemetry_path.parent.mkdir(parents=True, exist_ok=True) + file_exists = telemetry_path.exists() + + with open(telemetry_path, "a", newline="") as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(list(SMART_FIELDS.keys()) + ["failure"]) + row = [data_dict.get(field, "") for field in SMART_FIELDS.keys()] + row.append(prediction) + writer.writerow(row) + + +def _parse_bool(value, default): + if value is None: + return default + return str(value).strip().lower() in {"1", "true", "yes", "on"} + + +def get_hdsentinel_settings(config_manager): + return { + "enabled": _parse_bool( + config_manager.get_value(HDSENTINEL_SECTION, "enabled", None), + HDSENTINEL_DEFAULTS["enabled"], + ), + "health_change_alert": _parse_bool( + config_manager.get_value(HDSENTINEL_SECTION, "health_change_alert", None), + HDSENTINEL_DEFAULTS["health_change_alert"], + ), + } + + +def save_hdsentinel_settings(config_manager, *, enabled, health_change_alert): + config_manager.set_value(HDSENTINEL_SECTION, "enabled", str(bool(enabled)).lower()) + config_manager.set_value( + HDSENTINEL_SECTION, + "health_change_alert", + str(bool(health_change_alert)).lower(), + ) + + +def get_hdsentinel_binary_path(runtime=None): + runtime = runtime or get_runtime() + return runtime.bin_dir / "hdsentinel" + + +def get_hdsentinel_state_path(runtime=None): + runtime = runtime or get_runtime() + return runtime.data_dir / "hdsentinel_state.json" + + +def _write_json_atomically(path: Path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + with NamedTemporaryFile( + "w", + encoding="utf-8", + dir=path.parent, + prefix=f"{path.name}.", + suffix=".tmp", + delete=False, + ) as tmp_file: + json.dump(payload, tmp_file, indent=2) + tmp_path = Path(tmp_file.name) + tmp_path.replace(path) + + +def load_hdsentinel_state(runtime=None): + runtime = runtime or get_runtime() + path = get_hdsentinel_state_path(runtime) + if not path.exists(): + return None + + try: + state = json.loads(path.read_text()) + return state.get("last_snapshot") + except Exception as exc: + LOGGER.warning("Failed to load HDSentinel state: %s", exc) + return None + + +def save_hdsentinel_state(snapshot, runtime=None): + runtime = runtime or get_runtime() + path = get_hdsentinel_state_path(runtime) + _write_json_atomically(path, {"last_snapshot": snapshot}) + + +def _format_size_mb(size_mb): + if size_mb is None: + return None + if size_mb >= 1024 * 1024: + return f"{size_mb / (1024 * 1024):.2f} TB" + if size_mb >= 1024: + return f"{size_mb / 1024:.1f} GB" + return f"{size_mb} MB" + + +def _parse_optional_int(value): + if value in {None, "", "?", "-"}: + return None + try: + return int(str(value).strip()) + except (TypeError, ValueError): + return None + + +def _format_power_on_time_from_hours(hours): + if hours is None: + return None + days, remainder = divmod(hours, 24) + if days: + return f"{days} days, {remainder} hours" + return f"{hours} hours" + + +def _extract_first_match(patterns, text): + for pattern in patterns: + match = re.search(pattern, text, flags=re.IGNORECASE | re.MULTILINE) + if match: + return match.group(1).strip() + return None + + +def _parse_power_on_hours_from_text(text): + if not text: + return None + days_match = re.search(r"(\d+)\s+days?", text, flags=re.IGNORECASE) + hours_match = re.search(r"(\d+)\s+hours?", text, flags=re.IGNORECASE) + minutes_match = re.search(r"(\d+)\s+minutes?", text, flags=re.IGNORECASE) + + if days_match or hours_match or minutes_match: + total_hours = 0 + if days_match: + total_hours += int(days_match.group(1)) * 24 + if hours_match: + total_hours += int(hours_match.group(1)) + if minutes_match and int(minutes_match.group(1)) >= 30: + total_hours += 1 + return total_hours + + compact_match = re.search(r"(\d+)\s*hours?", text, flags=re.IGNORECASE) + if compact_match: + return int(compact_match.group(1)) + + return None + + +def get_fake_hdsentinel_snapshot(config_manager=None, runtime=None): + runtime = runtime or get_runtime() + settings = get_hdsentinel_settings(config_manager) if config_manager is not None else HDSENTINEL_DEFAULTS + return { + "installed": True, + "enabled": settings["enabled"], + "health_change_alert": settings["health_change_alert"], + "available": settings["enabled"], + "device": "/dev/fakebackup", + "model": "Fake Developer Backup Drive", + "serial": "FAKE-BACKUP-0001", + "size_mb": 953869, + "size_text": "931.5 GB", + "temperature_c": 31, + "health_pct": 100, + "performance_pct": 100, + "power_on_hours": 612, + "power_on_time_text": "25 days, 12 hours", + "last_checked": datetime.now().isoformat(timespec="seconds"), + "error": None if settings["enabled"] else "HDSentinel monitoring is disabled.", + "binary_path": str(get_hdsentinel_binary_path(runtime)), + } + + +def parse_hdsentinel_solid_output(output, device=None): + for raw_line in output.splitlines(): + line = raw_line.strip() + if not line.startswith("/dev/"): + continue + + parts = line.split() + if len(parts) < 7: + continue + + if device and parts[0] != device: + continue + + size_mb = _parse_optional_int(parts[6]) + power_on_hours = _parse_optional_int(parts[3]) + return { + "device": parts[0], + "temperature_c": _parse_optional_int(parts[1]), + "health_pct": _parse_optional_int(parts[2]), + "power_on_hours": power_on_hours, + "power_on_time_text": _format_power_on_time_from_hours(power_on_hours), + "model": parts[4].replace("_", " "), + "serial": parts[5].replace("_", " "), + "size_mb": size_mb, + "size_text": _format_size_mb(size_mb), + } + + return None + + +def parse_hdsentinel_report(report_text): + health_pct = _parse_optional_int( + _extract_first_match([r"Health\s*:\s*(\d+)%"], report_text) + ) + performance_pct = _parse_optional_int( + _extract_first_match([r"Performance\s*:\s*(\d+)%"], report_text) + ) + temperature_c = _parse_optional_int( + _extract_first_match([r"Temperature\s*:\s*(-?\d+)\s*(?:°|deg)?\s*C"], report_text) + ) + power_on_time_text = _extract_first_match( + [r"Power on time\s*:\s*(.+)$", r"Power-on time\s*:\s*(.+)$"], + report_text, + ) + power_on_hours = _parse_power_on_hours_from_text(power_on_time_text) + + return { + "health_pct": health_pct, + "performance_pct": performance_pct, + "temperature_c": temperature_c, + "power_on_hours": power_on_hours, + "power_on_time_text": power_on_time_text, + "model": _extract_first_match([r"Model ID\s*:\s*(.+)$", r"Model\s*:\s*(.+)$"], report_text), + "serial": _extract_first_match( + [r"Serial Number\s*:\s*(.+)$", r"Serial No\.?\s*:\s*(.+)$"], + report_text, + ), + "size_text": _extract_first_match([r"Size\s*:\s*(.+)$", r"Capacity\s*:\s*(.+)$"], report_text), + "interface": _extract_first_match([r"Interface\s*:\s*(.+)$"], report_text), + "firmware": _extract_first_match([r"Revision\s*:\s*(.+)$", r"Firmware Revision\s*:\s*(.+)$"], report_text), + } + + +def _run_hdsentinel_command(binary_path: Path, args): + command = [str(binary_path), *args] + if os.geteuid() != 0 and shutil.which("sudo"): + command.insert(0, "sudo") + return subprocess.run(command, capture_output=True, text=True) + + +def collect_hdsentinel_snapshot(config_manager, system_utils, runtime=None, device=None): + runtime = runtime or get_runtime() + settings = get_hdsentinel_settings(config_manager) + binary_path = get_hdsentinel_binary_path(runtime) + snapshot = { + "installed": binary_path.exists() or runtime.is_fake, + "enabled": settings["enabled"], + "health_change_alert": settings["health_change_alert"], + "available": False, + "device": None, + "model": None, + "serial": None, + "size_mb": None, + "size_text": None, + "temperature_c": None, + "health_pct": None, + "performance_pct": None, + "power_on_hours": None, + "power_on_time_text": None, + "interface": None, + "firmware": None, + "last_checked": datetime.now().isoformat(timespec="seconds"), + "error": None, + "binary_path": str(binary_path), + } + + if runtime.is_fake: + return get_fake_hdsentinel_snapshot(config_manager=config_manager, runtime=runtime) + + if not settings["enabled"]: + snapshot["error"] = "HDSentinel monitoring is disabled." + return snapshot + + if not binary_path.exists(): + snapshot["error"] = f"HDSentinel binary is not installed at {binary_path}." + return snapshot + + if device is None: + device, _, error = resolve_backup_parent_device(config_manager, system_utils, runtime=runtime) + if error: + snapshot["error"] = error + return snapshot + + solid_result = _run_hdsentinel_command(binary_path, ["-solid", "-dev", device]) + if solid_result.returncode != 0: + stderr = (solid_result.stderr or solid_result.stdout or "").strip() + snapshot["device"] = device + snapshot["error"] = stderr or "HDSentinel did not return drive data." + return snapshot + + solid_data = parse_hdsentinel_solid_output(solid_result.stdout, device=device) + if not solid_data: + snapshot["device"] = device + snapshot["error"] = "HDSentinel returned output that could not be parsed." + return snapshot + + report_text = None + report_data = {} + with NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file: + report_path = Path(tmp_file.name) + + try: + report_result = _run_hdsentinel_command(binary_path, ["-dev", device, "-r", str(report_path)]) + if report_result.returncode == 0 and report_path.exists(): + report_text = report_path.read_text(errors="replace") + report_data = parse_hdsentinel_report(report_text) + finally: + report_path.unlink(missing_ok=True) + + snapshot.update(solid_data) + for key, value in report_data.items(): + if value not in {None, ""}: + snapshot[key] = value + + if snapshot["power_on_time_text"] is None and snapshot["power_on_hours"] is not None: + snapshot["power_on_time_text"] = _format_power_on_time_from_hours(snapshot["power_on_hours"]) + + snapshot["available"] = True + return snapshot + + +def get_hdsentinel_display_snapshot(config_manager, system_utils, runtime=None): + runtime = runtime or get_runtime() + snapshot = load_hdsentinel_state(runtime) + if snapshot is not None: + return snapshot + if runtime.is_fake: + return get_fake_hdsentinel_snapshot(config_manager=config_manager, runtime=runtime) + return None + + +def _log_and_email_alert(config_manager, runtime, title, message, *, alert_type, source): + config_manager.log_alert(title, message, alert_type=alert_type, source=source) + + if runtime.is_fake: + LOGGER.info("Fake mode: suppressing email for alert '%s'", title) + return + + email_address = (config_manager.get_value("backup", "email_address", "") or "").strip() + from_address = (config_manager.get_value("backup", "from_address", "") or "").strip() + if not email_address or not from_address: + LOGGER.warning("Skipping email alert because email settings are incomplete.") + return + + email_body = f"Subject: {title}\nFrom: {from_address}\n\n{message}" + try: + subprocess.run( + ["msmtp", f"--from={from_address}", email_address], + input=email_body, + text=True, + check=True, + ) + except subprocess.CalledProcessError as exc: + LOGGER.warning("Failed to send alert email '%s': %s", title, exc) + + +def run_hdsentinel_health_monitor(config_manager, system_utils, runtime=None): + runtime = runtime or get_runtime() + previous_snapshot = load_hdsentinel_state(runtime) + current_snapshot = collect_hdsentinel_snapshot(config_manager, system_utils, runtime=runtime) + settings = get_hdsentinel_settings(config_manager) + alert_sent = False + + previous_health = None if not previous_snapshot else previous_snapshot.get("health_pct") + current_health = current_snapshot.get("health_pct") + + if ( + settings["enabled"] + and settings["health_change_alert"] + and current_snapshot.get("available") + and previous_snapshot + and previous_snapshot.get("available") + and previous_health is not None + and current_health is not None + and current_health != previous_health + ): + drive_label = current_snapshot.get("model") or current_snapshot.get("device") or "backup drive" + title = "HDSentinel Drive Health Changed" + message = ( + f"HDSentinel reported a health change for {drive_label}. " + f"Previous health: {previous_health}%. Current health: {current_health}%." + ) + if current_snapshot.get("performance_pct") is not None: + message += f" Current performance: {current_snapshot['performance_pct']}%." + if current_snapshot.get("serial"): + message += f" Serial: {current_snapshot['serial']}." + _log_and_email_alert( + config_manager, + runtime, + title, + message, + alert_type="warning", + source="hdsentinel", + ) + alert_sent = True + + save_hdsentinel_state(current_snapshot, runtime=runtime) + return { + "previous_snapshot": previous_snapshot, + "snapshot": current_snapshot, + "alert_sent": alert_sent, + } + + +def run_scheduled_drive_health_check(config_manager, system_utils, runtime=None): + runtime = runtime or get_runtime() + mount_point = config_manager.get_value("backup", "mount_point", runtime.default_mount_point) + + if not system_utils.is_mounted(mount_point): + title = "Drive Health Check Failed - Drive Not Mounted" + message = f"The backup drive is not mounted at {mount_point}." + _log_and_email_alert( + config_manager, + runtime, + title, + message, + alert_type="error", + source="check_health", + ) + raise RuntimeError(message) + + device, _, error = resolve_backup_parent_device(config_manager, system_utils, runtime=runtime) + if error: + title = "Drive Health Check Failed - Drive Not Found" + _log_and_email_alert( + config_manager, + runtime, + title, + error, + alert_type="error", + source="check_health", + ) + raise RuntimeError(error) + + smart, missing_attrs = get_smart_attributes( + config_manager, + system_utils, + device=device, + runtime=runtime, + ) + if smart is None: + message = f"Could not retrieve SMART data from {device}." + _log_and_email_alert( + config_manager, + runtime, + "Drive Health Check Failed - No SMART Data", + message, + alert_type="error", + source="check_health", + ) + raise RuntimeError(message) + + probability = predict_failure_probability(smart, runtime=runtime) + if probability is None: + message = "Drive health model is unavailable." + _log_and_email_alert( + config_manager, + runtime, + "Drive Health Check Failed - Prediction Error", + message, + alert_type="error", + source="check_health", + ) + raise RuntimeError(message) + + threshold = get_optimal_threshold(runtime) + prediction = int(probability >= threshold) + if prediction == 1: + _log_and_email_alert( + config_manager, + runtime, + "Drive Health Warning", + f"Drive health check predicted failure with probability {probability:.4f}. Drive: {device}.", + alert_type="warning", + source="check_health", + ) + + hdsentinel_result = run_hdsentinel_health_monitor(config_manager, system_utils, runtime=runtime) + return { + "device": device, + "smart": smart, + "missing_attrs": missing_attrs, + "probability": probability, + "prediction": prediction, + "threshold": threshold, + "hdsentinel": hdsentinel_result, + } diff --git a/install.sh b/install.sh index 65f4b74..f6ad96a 100755 --- a/install.sh +++ b/install.sh @@ -46,6 +46,66 @@ APP_DIR="/opt/SimpleSaferServer" SCRIPTS_DIR="$APP_DIR/scripts" MODEL_DIR="/opt/SimpleSaferServer/harddrive_model" SERVICE_FILE="/etc/systemd/system/simple_safer_server_web.service" +HDSENTINEL_BIN="/usr/local/bin/hdsentinel" + +install_hdsentinel() { + local arch="" + local machine="" + local url="" + local tmpdir="" + + if command -v dpkg >/dev/null 2>&1; then + arch=$(dpkg --print-architecture 2>/dev/null || true) + fi + + if [ -z "$arch" ]; then + machine=$(uname -m) + case "$machine" in + x86_64|amd64) + arch="amd64" + ;; + aarch64|arm64) + arch="arm64" + ;; + esac + fi + + case "$arch" in + amd64) + url="https://www.hdsentinel.com/hdslin/hdsentinel-020c-x64.zip" + ;; + arm64) + url="https://www.hdsentinel.com/hdslin/hdsentinel-armv8.zip" + ;; + *) + echo -e "${YELLOW}HDSentinel auto-install skipped: unsupported architecture '${arch:-unknown}'.${NC}" + return 0 + ;; + esac + + tmpdir=$(mktemp -d) + if ! curl -L --fail --output "$tmpdir/hdsentinel.zip" "$url"; then + echo -e "${YELLOW}HDSentinel download failed. Continuing without it.${NC}" + rm -rf "$tmpdir" + return 0 + fi + + if ! unzip -o "$tmpdir/hdsentinel.zip" -d "$tmpdir" >/dev/null; then + echo -e "${YELLOW}HDSentinel extraction failed. Continuing without it.${NC}" + rm -rf "$tmpdir" + return 0 + fi + + if [ ! -f "$tmpdir/HDSentinel" ]; then + echo -e "${YELLOW}HDSentinel binary not found in downloaded archive. Continuing without it.${NC}" + rm -rf "$tmpdir" + return 0 + fi + + install -m 755 "$tmpdir/HDSentinel" "$HDSENTINEL_BIN" + rm -rf "$tmpdir" + echo -e "${GREEN}✔ HDSentinel installed to $HDSENTINEL_BIN.${NC}\n" +} # 1. Install system dependencies and Python packages using apt # We use apt for Python packages to avoid conflicts with Debian's externally managed Python environment. @@ -54,7 +114,7 @@ echo -e "${YELLOW}Step 1: Installing system and Python dependencies...${NC}" apt-get update # Preseed AppArmor prompt for msmtp only to ensure non-interactive install echo "msmtp msmtp/apply_apparmor boolean true" | debconf-set-selections -DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip python3-flask python3-flask-socketio python3-psutil python3-xgboost python3-joblib python3-pandas python3-sklearn python3-cryptography smartmontools samba msmtp +DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip python3-flask python3-flask-socketio python3-psutil python3-xgboost python3-joblib python3-pandas python3-sklearn python3-cryptography smartmontools samba msmtp curl unzip rsync echo -e "${GREEN}✔ System and Python dependencies installed.${NC}\n" @@ -70,20 +130,24 @@ sudo sh -c "bash $TMPFILE || true" rm -f "$TMPFILE" echo -e "${GREEN}✔ rclone installed.${NC}\n" -# 3. Copy/update application files (excluding /etc/SimpleSaferServer/) -echo -e "${YELLOW}Step 3: Copying application files...${NC}" +# 3. Install HDSentinel for supported architectures +echo -e "${YELLOW}Step 3: Installing HDSentinel...${NC}" +install_hdsentinel + +# 4. Copy/update application files (excluding /etc/SimpleSaferServer/) +echo -e "${YELLOW}Step 4: Copying application files...${NC}" mkdir -p "$APP_DIR" rsync -a --exclude='venv' --exclude='__pycache__' --exclude='*.pyc' --exclude='*.pyo' --exclude='*.log' --exclude='telemetry.csv' --exclude='harddrive_model' --exclude='static' --exclude='templates' ./ "$APP_DIR/" echo -e "${GREEN}✔ Application files copied.${NC}\n" -# 4. Copy static and templates directories -echo -e "${YELLOW}Step 4: Copying static assets and templates...${NC}" +# 5. Copy static and templates directories +echo -e "${YELLOW}Step 5: Copying static assets and templates...${NC}" rsync -a static "$APP_DIR/" rsync -a templates "$APP_DIR/" echo -e "${GREEN}✔ Static assets and templates copied.${NC}\n" -# 5. Copy scripts to /opt/SimpleSaferServer/scripts and set permissions -echo -e "${YELLOW}Step 5: Installing scripts...${NC}" +# 6. Copy scripts to /opt/SimpleSaferServer/scripts and set permissions +echo -e "${YELLOW}Step 6: Installing scripts...${NC}" mkdir -p "$SCRIPTS_DIR" for script in scripts/*.sh scripts/*.py; do cp "$script" "$SCRIPTS_DIR/" @@ -91,22 +155,22 @@ for script in scripts/*.sh scripts/*.py; do done echo -e "${GREEN}✔ Scripts installed to $SCRIPTS_DIR.${NC}\n" -# 6. Copy model files -echo -e "${YELLOW}Step 6: Copying model files...${NC}" +# 7. Copy model files +echo -e "${YELLOW}Step 7: Copying model files...${NC}" mkdir -p "$MODEL_DIR" cp harddrive_model/* "$MODEL_DIR/" echo -e "${GREEN}✔ Model files copied.${NC}\n" -# 7. Install/refresh systemd service for Flask app -echo -e "${YELLOW}Step 7: Setting up systemd service...${NC}" +# 8. Install/refresh systemd service for Flask app +echo -e "${YELLOW}Step 8: Setting up systemd service...${NC}" cp simple_safer_server_web.service "$SERVICE_FILE" systemctl daemon-reload systemctl enable simple_safer_server_web.service systemctl restart simple_safer_server_web.service echo -e "${GREEN}✔ Systemd service enabled and started.${NC}\n" -# 8. Open port 5000 in firewall if active -echo -e "${YELLOW}Step 8: Configuring firewall (if active)...${NC}" +# 9. Open port 5000 in firewall if active +echo -e "${YELLOW}Step 9: Configuring firewall (if active)...${NC}" if command -v ufw >/dev/null 2>&1 && ufw status | grep -q 'Status: active'; then ufw allow 5000/tcp echo -e "${GREEN}✔ Port 5000 opened in ufw.${NC}" @@ -122,7 +186,7 @@ echo -e "${YELLOW}No active firewall detected or configured. Skipping firewall s fi echo -# 9. Print all network interface IPs for user access +# 10. Print all network interface IPs for user access echo -e "${BLUE}===============================================" echo -e " SimpleSaferServer Web UI Access URLs" echo -e "===============================================${NC}" diff --git a/scripts/check_health.py b/scripts/check_health.py new file mode 100644 index 0000000..a501ac9 --- /dev/null +++ b/scripts/check_health.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import sys +from pathlib import Path + + +def _add_app_to_path(): + script_path = Path(__file__).resolve() + candidates = [ + script_path.parents[1], + Path('/opt/SimpleSaferServer'), + ] + for candidate in candidates: + if (candidate / 'drive_health.py').exists(): + sys.path.insert(0, str(candidate)) + return + + +_add_app_to_path() + +from config_manager import ConfigManager # noqa: E402 +from drive_health import run_scheduled_drive_health_check # noqa: E402 +from runtime import get_runtime # noqa: E402 +from system_utils import SystemUtils # noqa: E402 + + +def main(): + runtime = get_runtime() + config_manager = ConfigManager(runtime=runtime) + system_utils = SystemUtils(runtime=runtime) + + result = run_scheduled_drive_health_check( + config_manager, + system_utils, + runtime=runtime, + ) + + probability = result.get('probability') + device = result.get('device') + if probability is not None and device: + print(f"Drive health probability for {device}: {probability:.4f}") + + hdsentinel_snapshot = result.get('hdsentinel', {}).get('snapshot') + if hdsentinel_snapshot and hdsentinel_snapshot.get('available'): + print( + "HDSentinel: " + f"health {hdsentinel_snapshot.get('health_pct')}%, " + f"performance {hdsentinel_snapshot.get('performance_pct')}%, " + f"temperature {hdsentinel_snapshot.get('temperature_c')}C" + ) + elif hdsentinel_snapshot and hdsentinel_snapshot.get('error'): + print(f"HDSentinel unavailable: {hdsentinel_snapshot['error']}") + + +if __name__ == '__main__': + try: + main() + except Exception as exc: + print(str(exc), file=sys.stderr) + sys.exit(1) diff --git a/scripts/check_health.sh b/scripts/check_health.sh index 8b2bf40..38dfa44 100644 --- a/scripts/check_health.sh +++ b/scripts/check_health.sh @@ -1,108 +1,10 @@ #!/bin/bash -CONFIG_FILE="/etc/SimpleSaferServer/config.conf" +set -euo pipefail -get_config_value() { - section=$1 - key=$2 - awk -F '=' -v section="[$section]" -v key="$key" ' - $0 == section { in_section=1; next } - /^\[.*\]/ { in_section=0 } - in_section && $1 ~ "^[ \t]*"key"[ \t]*$" { gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2; exit } - ' "$CONFIG_FILE" | tr -d '"' -} - -MOUNT_POINT=$(get_config_value backup mount_point) -UUID=$(get_config_value backup uuid) -FROM_ADDRESS=$(get_config_value backup from_address) -EMAIL_ADDRESS=$(get_config_value backup email_address) -SERVER_NAME=$(get_config_value system server_name) - -# Function to send email and log alert -function send_email { - echo "$1 - $2" # Log the status - echo -e "Subject: $1 - $SERVER_NAME\nFrom: $FROM_ADDRESS\n\n$2" | msmtp --from=$FROM_ADDRESS $EMAIL_ADDRESS - # Log alert using the standalone script - python3 /opt/SimpleSaferServer/scripts/log_alert.py "$1" "$2" "error" "check_health" -} - -echo "Starting drive health check using XGBoost model..." - -# Check if drive is mounted -if ! grep -qs "$MOUNT_POINT" /proc/mounts; then - send_email "Drive Health Check Failed - Drive Not Mounted" "The backup drive is not mounted at $MOUNT_POINT" - exit 1 -fi - -# Get the device path from UUID -if [ -z "$UUID" ]; then - send_email "Drive Health Check Failed - No UUID Configured" "Cannot determine drive device without UUID" - exit 1 -fi - -partition_device=$(blkid -t UUID=$UUID -o device) -if [ -z "$partition_device" ]; then - send_email "Drive Health Check Failed - Drive Not Found" "Cannot find drive with UUID $UUID" - exit 1 -fi - -# Get the parent device (e.g., /dev/sda from /dev/sda1) -parent_device=$(lsblk -no PKNAME "$partition_device") -if [ -z "$parent_device" ]; then - # Fallback: strip trailing digits - parent_device=$(echo "$partition_device" | sed 's/[0-9]*$//') -fi - -# Ensure parent_device is a full device path -if [[ "$parent_device" != /dev/* ]]; then - parent_device="/dev/$parent_device" +if [ -x "/opt/SimpleSaferServer/scripts/check_health.py" ]; then + exec /usr/bin/python3 "/opt/SimpleSaferServer/scripts/check_health.py" fi -if [ -z "$parent_device" ]; then - send_email "Drive Health Check Failed - Cannot Determine Parent Device" "Cannot determine parent device for $partition_device" - exit 1 -fi - -echo "Checking health of device: $parent_device" - -# Get SMART attributes using smartctl -echo "Getting SMART attributes..." -smart_attrs=$(smartctl -A "$parent_device" | grep -E "^( [0-9]+| [0-9]+)" | grep -E "^( 5| 9| 12|177|194|197|198|199|200|201|202|203|204|205|206|207|208|209|211|212|220|221|222|223|224|225|226|227|228|230|231|232|233|234|235|240|241|242|250|251|252|254|255)" | awk '{print $2 " " $10}') - -if [ -z "$smart_attrs" ]; then - send_email "Drive Health Check Failed - No SMART Data" "Could not retrieve SMART data from $parent_device" - exit 1 -fi - -# Convert to JSON format for Python script -json_data="{\"smart_attrs\":{" -while read -r line; do - if [ ! -z "$line" ]; then - attr_id=$(echo $line | awk '{print $1}') - value=$(echo $line | awk '{print $2}') - json_data="${json_data}\"$attr_id\":$value," - fi -done <<< "$smart_attrs" -json_data="${json_data%,}}}" - -echo "SMART data collected, making health prediction..." - -# Call Python script to make prediction -prediction=$(python3 /opt/SimpleSaferServer/scripts/predict_health.py "$json_data") - -if [ $? -ne 0 ]; then - send_email "Drive Health Check Failed - Prediction Error" "Error running health prediction script" - exit 1 -fi - -# Parse prediction result -probability=$(echo $prediction | jq -r '.probability') -prediction_result=$(echo $prediction | jq -r '.prediction') - -if [ "$prediction_result" = "1" ]; then - send_email "Drive Health Warning" "Drive health check failed with probability $probability. Drive: $parent_device" - exit 1 -else - echo "Drive health check passed with probability $probability" - exit 0 -fi \ No newline at end of file +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +exec /usr/bin/python3 "$SCRIPT_DIR/check_health.py" diff --git a/system_utils.py b/system_utils.py index fa029d5..fd3de33 100644 --- a/system_utils.py +++ b/system_utils.py @@ -169,7 +169,7 @@ def install_systemd_scripts(self, config): scripts_dest_dir.mkdir(parents=True, exist_ok=True) # Copy and configure each script - script_files = ['check_mount.sh', 'check_health.sh', 'backup_cloud.sh', 'predict_health.py', 'log_alert.py'] + script_files = ['check_mount.sh', 'check_health.sh', 'check_health.py', 'backup_cloud.sh', 'predict_health.py', 'log_alert.py'] for script_file in script_files: source_path = scripts_source_dir / script_file @@ -238,6 +238,7 @@ def create_systemd_config_file(self, config): backup_config = config.get('backup', {}) system_config = config.get('system', {}) schedule_config = config.get('schedule', {}) + hdsentinel_config = config.get('hdsentinel', {}) config_content = f"""[system] username = {system_config.get('username', '')} @@ -258,6 +259,10 @@ def create_systemd_config_file(self, config): [schedule] backup_cloud_time = {schedule_config.get('backup_cloud_time', '')} + +[hdsentinel] +enabled = {hdsentinel_config.get('enabled', 'true')} +health_change_alert = {hdsentinel_config.get('health_change_alert', 'true')} """ # Create the config directory config_dir = self.runtime.config_dir @@ -314,12 +319,12 @@ def install_systemd_services_and_timers(self, config): WantedBy=multi-user.target """, 'check_health.service': f"""[Unit] -Description=Drive Health Check using XGBoost Model +Description=Drive Health Check After=network.target [Service] Type=oneshot -ExecStart=/usr/local/bin/check_health.sh +ExecStart=/usr/local/bin/check_health.py User=root StandardOutput=journal StandardError=journal diff --git a/templates/drive_health.html b/templates/drive_health.html index a7aa8f2..cd9a385 100644 --- a/templates/drive_health.html +++ b/templates/drive_health.html @@ -6,6 +6,7 @@ {% block content %}
{% endif %} + {% if settings_message %} +| Installed | ++ {% if hdsentinel_snapshot and hdsentinel_snapshot.installed %} + Yes + {% elif hdsentinel_snapshot %} + No + {% else %} + Unknown + {% endif %} + | +
|---|---|
| Binary Path | +{{ hdsentinel_snapshot.binary_path if hdsentinel_snapshot else '/usr/local/bin/hdsentinel' }} |
+
| Health | +{{ hdsentinel_snapshot.health_pct ~ '%' if hdsentinel_snapshot and hdsentinel_snapshot.health_pct is not none else '—' }} | +
| Performance | +{{ hdsentinel_snapshot.performance_pct ~ '%' if hdsentinel_snapshot and hdsentinel_snapshot.performance_pct is not none else '—' }} | +
| Temperature | +{{ hdsentinel_snapshot.temperature_c ~ ' C' if hdsentinel_snapshot and hdsentinel_snapshot.temperature_c is not none else '—' }} | +
| Drive | +{{ hdsentinel_snapshot.model if hdsentinel_snapshot and hdsentinel_snapshot.model else '—' }} | +
| Serial | +{{ hdsentinel_snapshot.serial if hdsentinel_snapshot and hdsentinel_snapshot.serial else '—' }} | +
| Device | +{{ hdsentinel_snapshot.device if hdsentinel_snapshot and hdsentinel_snapshot.device else '—' }} |
+
| Size | +{{ hdsentinel_snapshot.size_text if hdsentinel_snapshot and hdsentinel_snapshot.size_text else '—' }} | +
| Power-On Time | +{{ hdsentinel_snapshot.power_on_time_text if hdsentinel_snapshot and hdsentinel_snapshot.power_on_time_text else '—' }} | +
| Last Checked | +{{ hdsentinel_snapshot.last_checked if hdsentinel_snapshot and hdsentinel_snapshot.last_checked else 'Never' }} | +