fix: address security vulnerabilities identified by CodeRabbitAI review

Hoper-J · Hoper-J · commit 0edcb949fe53 · 2025-09-06T21:34:04.000+08:00
- Prevent path traversal attacks with strict date format validation
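- Improve data comparison logic: compare total tokens (including cache tokens), entry count, and cost before deciding to keep existing daily data
- Read and write history files with explicit UTF-8 encoding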
diff --git a/src/claude_monitor/data/history_manager.py b/src/claude_monitor/data/history_manager.py
@@ -40,20 +40,29 @@ def _get_daily_file_path(self, date_str: str) -> Path:
 
         Returns:
             Path to the daily data file
+
+        Raises:
+            ValueError: If date_str is not in valid YYYY-MM-DD format
         """
-        # Organize by year and month for better file management
+        import re
+
+        # Strict validation to prevent path traversal attacks
+        if not re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", date_str):
+            raise ValueError(f"Invalid date format: {date_str}. Must be YYYY-MM-DD")
+
+        # Parse and validate the date
         try:
             date = datetime.strptime(date_str, "%Y-%m-%d")
-            year = date.strftime("%Y")
-            month = date.strftime("%m")
+        except ValueError as e:
+            raise ValueError(f"Invalid date: {date_str}. {e}")
 
-            month_dir = self.daily_dir / year / month
-            month_dir.mkdir(parents=True, exist_ok=True)
+        year = date.strftime("%Y")
+        month = date.strftime("%m")
 
-            return month_dir / f"{date_str}.json"
-        except ValueError:
-            # Fallback for invalid date formats
-            return self.daily_dir / f"{date_str}.json"
+        month_dir = self.daily_dir / year / month
+        month_dir.mkdir(parents=True, exist_ok=True)
+
+        return month_dir / f"{date_str}.json"
 
     def save_daily_data(
         self, daily_data: List[Dict[str, Any]], overwrite: bool = False
@@ -85,34 +94,55 @@ def save_daily_data(
             if file_path.exists() and not overwrite:
                 # Load existing data to check if it needs updating
                 try:
-                    with open(file_path, "r") as f:
+                    with open(file_path, "r", encoding="utf-8") as f:
                         existing_data = json.load(f)
 
                     # If the data is identical, skip
                     if existing_data == day_data:
                         self._saved_dates.add(date_str)
                         continue
 
-                    # If existing data has more information, keep it
-                    existing_tokens = existing_data.get(
-                        "input_tokens", 0
-                    ) + existing_data.get("output_tokens", 0)
-                    new_tokens = day_data.get("input_tokens", 0) + day_data.get(
-                        "output_tokens", 0
+                    # Compare total information to decide which data to keep
+                    # Sum all token counts for comparison
+                    existing_total_tokens = (
+                        existing_data.get("input_tokens", 0)
+                        + existing_data.get("output_tokens", 0)
+                        + existing_data.get("cache_creation_tokens", 0)
+                        + existing_data.get("cache_read_tokens", 0)
+                    )
+                    new_total_tokens = (
+                        day_data.get("input_tokens", 0)
+                        + day_data.get("output_tokens", 0)
+                        + day_data.get("cache_creation_tokens", 0)
+                        + day_data.get("cache_read_tokens", 0)
                     )
 
-                    if existing_tokens >= new_tokens:
+                    # Compare entries count and cost
+                    existing_entries = existing_data.get("entries_count", 0)
+                    new_entries = day_data.get("entries_count", 0)
+                    existing_cost = existing_data.get("total_cost", 0.0)
+                    new_cost = day_data.get("total_cost", 0.0)
+
+                    # Keep existing only if it has strictly more total tokens AND no fewer entries AND no lower cost
+                    # This ensures we don't lose any valuable information
+                    if (
+                        existing_total_tokens > new_total_tokens
+                        and existing_entries >= new_entries
+                        and existing_cost >= new_cost
+                    ):
                         self._saved_dates.add(date_str)
                         continue
 
+                    # Otherwise, fall through and overwrite the file with the new data
+
                 except Exception as e:
                     logger.warning(f"Error reading existing data for {date_str}: {e}")
 
             # Save the data
             try:
                 temp_file = file_path.with_suffix(".tmp")
-                with open(temp_file, "w") as f:
-                    json.dump(day_data, f, indent=2, default=str)
+                with open(temp_file, "w", encoding="utf-8") as f:
+                    json.dump(day_data, f, indent=2, default=str, ensure_ascii=False)
                 temp_file.replace(file_path)
 
                 self._saved_dates.add(date_str)
@@ -198,7 +228,7 @@ def load_historical_daily_data(
                                 continue
 
                             # Load the data
-                            with open(file_path, "r") as f:
+                            with open(file_path, "r", encoding="utf-8") as f:
                                 data = json.load(f)
                                 historical_data.append(data)
 
diff --git a/src/tests/test_history_manager.py b/src/tests/test_history_manager.py
@@ -46,9 +46,13 @@ def test_get_daily_file_path_invalid_date(
     ) -> None:
         """Test file path generation with invalid date format."""
         date_str = "invalid-date"
-        expected_path = history_manager.daily_dir / "invalid-date.json"
-        actual_path = history_manager._get_daily_file_path(date_str)
-        assert actual_path == expected_path
+        with pytest.raises(ValueError, match="Invalid date format"):
+            history_manager._get_daily_file_path(date_str)
+
+        # Test path traversal attempt
+        malicious_str = "../../../etc/passwd"
+        with pytest.raises(ValueError, match="Invalid date format"):
+            history_manager._get_daily_file_path(malicious_str)
 
     def test_save_daily_data(self, history_manager: HistoryManager) -> None:
         """Test saving daily data."""
@@ -349,33 +353,86 @@ def test_save_daily_data_missing_date(
     def test_save_daily_data_with_existing_better_data(
         self, history_manager: HistoryManager
     ) -> None:
-        """Test that existing data with more tokens is preserved."""
-        # Save initial data with more tokens
+        """Test that existing data with more total tokens is preserved."""
+        # Save initial data with more total tokens
         initial_data = [
             {
                 "date": "2024-12-15",
                 "input_tokens": 2000,
                 "output_tokens": 1000,
+                "cache_creation_tokens": 100,
+                "cache_read_tokens": 50,
+                "total_cost": 0.10,
+                "entries_count": 20,
             }
         ]
         history_manager.save_daily_data(initial_data)
 
         # Clear saved dates to allow checking existing file
         history_manager._saved_dates.clear()
 
-        # Try to save data with fewer tokens
+        # Try to save data with fewer total tokens
         new_data = [
             {
                 "date": "2024-12-15",
                 "input_tokens": 500,
                 "output_tokens": 250,
+                "cache_creation_tokens": 50,
+                "cache_read_tokens": 25,
+                "total_cost": 0.05,
+                "entries_count": 10,
             }
         ]
         saved_count = history_manager.save_daily_data(new_data, overwrite=False)
         assert saved_count == 0
 
         # Verify original data is preserved
         file_path = history_manager._get_daily_file_path("2024-12-15")
-        with open(file_path, "r") as f:
+        with open(file_path, "r", encoding="utf-8") as f:
             saved_data = json.load(f)
         assert saved_data["input_tokens"] == 2000
+        assert saved_data["total_cost"] == 0.10
+
+    def test_save_daily_data_updates_with_more_info(
+        self, history_manager: HistoryManager
+    ) -> None:
+        """Test that new data with more information replaces old data."""
+        # Save initial data
+        initial_data = [
+            {
+                "date": "2024-12-16",
+                "input_tokens": 1000,
+                "output_tokens": 500,
+                "cache_creation_tokens": 100,
+                "cache_read_tokens": 50,
+                "total_cost": 0.05,
+                "entries_count": 10,
+            }
+        ]
+        history_manager.save_daily_data(initial_data)
+
+        # Clear saved dates
+        history_manager._saved_dates.clear()
+
+        # Save new data with more total tokens
+        new_data = [
+            {
+                "date": "2024-12-16",
+                "input_tokens": 900,
+                "output_tokens": 600,
+                "cache_creation_tokens": 200,
+                "cache_read_tokens": 100,
+                "total_cost": 0.08,
+                "entries_count": 15,
+            }
+        ]
+        saved_count = history_manager.save_daily_data(new_data, overwrite=False)
+        assert saved_count == 1
+
+        # Verify new data was saved
+        file_path = history_manager._get_daily_file_path("2024-12-16")
+        with open(file_path, "r", encoding="utf-8") as f:
+            saved_data = json.load(f)
+        assert saved_data["input_tokens"] == 900
+        assert saved_data["cache_creation_tokens"] == 200
+        assert saved_data["total_cost"] == 0.08