Merge pull request #10 from oguarni/cloud-rules-7d-ml

oguarni · web-flow · commit 3eb6fafa1445 · 2026-03-08T23:33:32.000-03:00
feat(security): add CloudWatch logging and VPC flow log rules, expand…
diff --git a/terrasafe/application/scanner.py b/terrasafe/application/scanner.py
@@ -219,9 +219,10 @@ def _validate_features(self, features: np.ndarray) -> np.ndarray:
             Validated feature array with values clipped to acceptable bounds
         """
         # Define acceptable bounds for each feature
-        # [open_ports, hardcoded_secrets, public_access, unencrypted_storage, total_resources]
-        min_bounds = np.array([0, 0, 0, 0, 0], dtype=np.int32)
-        max_bounds = np.array([100, 100, 100, 100, 10000], dtype=np.int32)
+        # [open_ports, hardcoded_secrets, public_access, unencrypted_storage,
+        #  missing_logging, missing_flow_logs, total_resources]
+        min_bounds = np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.int32)
+        max_bounds = np.array([100, 100, 100, 100, 100, 100, 10000], dtype=np.int32)
 
         # Clip features to acceptable ranges
         validated = np.clip(features, min_bounds, max_bounds)
@@ -244,11 +245,11 @@ def _extract_features(self, vulnerabilities: List[Vulnerability]) -> np.ndarray:
             vulnerabilities: List of detected vulnerabilities
 
         Returns:
-            Numpy array of features (shape: 1x5)
+            Numpy array of features (shape: 1x7)
         """
         if not vulnerabilities:
             # Return default feature vector for empty vulnerability list
-            return np.array([[0, 0, 0, 0, 1]], dtype=np.int32)
+            return np.array([[0, 0, 0, 0, 0, 0, 1]], dtype=np.int32)
 
         # Count unique resources
         unique_resources = len(set(v.resource for v in vulnerabilities))
@@ -270,12 +271,18 @@ def _extract_features(self, vulnerabilities: List[Vulnerability]) -> np.ndarray:
 
         unencrypted_mask = np.char.find(messages, 'unencrypted') >= 0
 
+        missing_logging_mask = np.char.find(messages, 'missing logging') >= 0
+
+        missing_flow_logs_mask = np.char.find(messages, 'missing vpc flow logs') >= 0
+
         # Count matches using numpy sum (faster than Python loops)
         features = np.array([
             np.sum(open_ports_mask),
             np.sum(hardcoded_mask),
             np.sum(public_access_mask),
             np.sum(unencrypted_mask),
+            np.sum(missing_logging_mask),
+            np.sum(missing_flow_logs_mask),
             unique_resources
         ], dtype=np.int32).reshape(1, -1)
 
@@ -288,7 +295,10 @@ def _summarize_vulns(self, vulns: List[Vulnerability]) -> Dict[str, int]:
         return summary
 
     def _format_features(self, features: np.ndarray) -> Dict[str, int]:
-        feature_names = ['open_ports', 'hardcoded_secrets', 'public_access', 'unencrypted_storage', 'total_resources']
+        feature_names = [
+            'open_ports', 'hardcoded_secrets', 'public_access', 'unencrypted_storage',
+            'missing_logging', 'missing_flow_logs', 'total_resources'
+        ]
         return {name: int(val) for name, val in zip(feature_names, features[0])}
 
     def _vulnerability_to_dict(self, vuln: Vulnerability) -> Dict[str, Any]:
diff --git a/terrasafe/config/settings.py b/terrasafe/config/settings.py
@@ -67,6 +67,10 @@ class Settings(BaseSettings):
         default="models/isolation_forest.pkl",
         description="Path to ML model file"
     )
+    severity_overrides: Dict[str, str] = Field(
+        default={},
+        description="Override severity for specific rules, e.g. {'missing_logging': 'MEDIUM'}"
+    )
 
     # Security Configuration
     max_file_size_mb: int = Field(
diff --git a/terrasafe/domain/security_rules.py b/terrasafe/domain/security_rules.py
@@ -2,6 +2,7 @@
 import re
 from typing import List, Dict
 from .models import Vulnerability, Severity
+from ..config.settings import get_settings
 
 
 # Constants for severity points (Clean Code: No magic numbers)
@@ -258,6 +259,67 @@ def check_iam_policies(self, tf_content: Dict) -> List[Vulnerability]:
 
         return vulns
 
+    def check_missing_logging(self, tf_content: Dict) -> List[Vulnerability]:
+        """Check for missing CloudTrail/CloudWatch logging resources.
+
+        If infrastructure resources exist but no logging resources are present,
+        flag as HIGH severity.
+        """
+        vulns: List[Vulnerability] = []
+
+        if 'resource' not in tf_content:
+            return vulns
+
+        resources = tf_content.get('resource', [])
+        all_resource_types = set()
+        for resource_block in resources:
+            all_resource_types.update(resource_block.keys())
+
+        # Only flag if there are infrastructure resources to log
+        infra_types = all_resource_types - {'aws_cloudtrail', 'aws_cloudwatch_log_group'}
+        has_infra = bool(infra_types)
+        has_logging = 'aws_cloudtrail' in all_resource_types or 'aws_cloudwatch_log_group' in all_resource_types
+
+        if has_infra and not has_logging:
+            vulns.append(Vulnerability(
+                severity=Severity.HIGH,
+                points=POINTS_HIGH,
+                message="[HIGH] Missing logging - no CloudTrail or CloudWatch log group detected",
+                resource="Logging",
+                remediation="Add aws_cloudtrail or aws_cloudwatch_log_group to enable audit logging"
+            ))
+
+        return vulns
+
+    def check_missing_vpc_flow_logs(self, tf_content: Dict) -> List[Vulnerability]:
+        """Check for VPC resources without corresponding flow logs.
+
+        If an aws_vpc resource exists but no aws_flow_log is found, flag as MEDIUM.
+        """
+        vulns: List[Vulnerability] = []
+
+        if 'resource' not in tf_content:
+            return vulns
+
+        resources = tf_content.get('resource', [])
+        all_resource_types = set()
+        for resource_block in resources:
+            all_resource_types.update(resource_block.keys())
+
+        has_vpc = 'aws_vpc' in all_resource_types
+        has_flow_log = 'aws_flow_log' in all_resource_types
+
+        if has_vpc and not has_flow_log:
+            vulns.append(Vulnerability(
+                severity=Severity.MEDIUM,
+                points=POINTS_MEDIUM,
+                message="[MEDIUM] Missing VPC flow logs - aws_vpc present but no aws_flow_log detected",
+                resource="VPC",
+                remediation="Add an aws_flow_log resource to enable VPC traffic logging"
+            ))
+
+        return vulns
+
     def analyze(self, tf_content: Dict, raw_content: str) -> List[Vulnerability]:
         """Run all security checks"""
         all_vulns = []
@@ -268,5 +330,23 @@ def analyze(self, tf_content: Dict, raw_content: str) -> List[Vulnerability]:
         all_vulns.extend(self.check_encryption(tf_content))
         all_vulns.extend(self.check_public_s3(tf_content))
         all_vulns.extend(self.check_iam_policies(tf_content))
+        all_vulns.extend(self.check_missing_logging(tf_content))
+        all_vulns.extend(self.check_missing_vpc_flow_logs(tf_content))
+
+        # Apply severity overrides from config (replaces vuln.severity only; points and the message's severity tag are unchanged)
+        overrides = get_settings().severity_overrides
+        if overrides:
+            severity_map = {s.value: s for s in Severity}
+            rule_key_map = {
+                'missing_logging': '[HIGH] Missing logging',
+                'missing_flow_logs': '[MEDIUM] Missing VPC flow logs',
+            }
+            for vuln in all_vulns:
+                for rule_name, override_level in overrides.items():
+                    fragment = rule_key_map.get(rule_name)
+                    if fragment and fragment in vuln.message:
+                        new_severity = severity_map.get(override_level.upper())
+                        if new_severity:
+                            vuln.severity = new_severity
 
         return all_vulns
diff --git a/terrasafe/infrastructure/ml_model.py b/terrasafe/infrastructure/ml_model.py
@@ -364,33 +364,33 @@ def _train_baseline_model(self):
         rng = np.random.default_rng(42)
 
         # Enhanced baseline patterns representing secure configurations
-        # Features: [open_ports, secrets, public_access, unencrypted, resource_count]
+        # Features: [open_ports, secrets, public_access, unencrypted, missing_logging, missing_flow_logs, resource_count]
         baseline_patterns = [
             # Fully secure configurations
-            [0, 0, 0, 0, 5],   # Small secure microservice
-            [0, 0, 0, 0, 10],  # Medium secure application
-            [0, 0, 0, 0, 15],  # Large secure infrastructure
-            [0, 0, 0, 0, 25],  # Enterprise secure setup
-            [0, 0, 0, 0, 3],   # Minimal secure Lambda function
+            [0, 0, 0, 0, 0, 0, 5],    # Small secure microservice
+            [0, 0, 0, 0, 0, 0, 10],   # Medium secure application
+            [0, 0, 0, 0, 0, 0, 15],   # Large secure infrastructure
+            [0, 0, 0, 0, 0, 0, 25],   # Enterprise secure setup
+            [0, 0, 0, 0, 0, 0, 3],    # Minimal secure Lambda function
 
             # Web applications (acceptable public exposure)
-            [1, 0, 0, 0, 8],   # Simple web app with HTTP
-            [2, 0, 0, 0, 12],  # Web app with HTTP/HTTPS
-            [2, 0, 1, 0, 20],  # E-commerce with CDN (public S3)
-            [1, 0, 1, 0, 15],  # Static site with S3 hosting
-            [2, 0, 2, 0, 30],  # Multi-region web platform
+            [1, 0, 0, 0, 0, 0, 8],    # Simple web app with HTTP
+            [2, 0, 0, 0, 0, 0, 12],   # Web app with HTTP/HTTPS
+            [2, 0, 1, 0, 0, 0, 20],   # E-commerce with CDN (public S3)
+            [1, 0, 1, 0, 0, 0, 15],   # Static site with S3 hosting
+            [2, 0, 2, 0, 0, 0, 30],   # Multi-region web platform
 
             # Development environments (slightly relaxed)
-            [1, 0, 0, 1, 6],   # Dev env with one unencrypted volume
-            [2, 0, 0, 1, 10],  # Staging with test data
-            [1, 0, 1, 1, 8],   # QA environment
-            [0, 0, 0, 2, 12],  # Test cluster with temp storage
+            [1, 0, 0, 1, 0, 0, 6],    # Dev env with one unencrypted volume
+            [2, 0, 0, 1, 0, 0, 10],   # Staging with test data
+            [1, 0, 1, 1, 0, 0, 8],    # QA environment
+            [0, 0, 0, 2, 0, 0, 12],   # Test cluster with temp storage
 
             # Microservices architectures
-            [3, 0, 0, 0, 40],  # Service mesh with multiple endpoints
-            [4, 0, 1, 0, 50],  # Kubernetes cluster with ingress
-            [2, 0, 0, 0, 35],  # Docker swarm setup
-            [3, 0, 2, 0, 45],  # Multi-service with CDN
+            [3, 0, 0, 0, 0, 0, 40],   # Service mesh with multiple endpoints
+            [4, 0, 1, 0, 0, 0, 50],   # Kubernetes cluster with ingress
+            [2, 0, 0, 0, 0, 0, 35],   # Docker swarm setup
+            [3, 0, 2, 0, 0, 0, 45],   # Multi-service with CDN
         ]
 
         baseline_features = np.array(baseline_patterns)
@@ -401,7 +401,7 @@ def _train_baseline_model(self):
         # Add noise variations for each pattern
         for pattern in baseline_features:
             for _ in range(3):  # Create 3 variations per pattern
-                noise = rng.normal(0, 0.15, 5)
+                noise = rng.normal(0, 0.15, 7)
                 augmented = pattern + noise
                 augmented = np.maximum(augmented, 0)  # Ensure non-negative
                 # Round discrete features
@@ -410,11 +410,11 @@ def _train_baseline_model(self):
 
         # Add edge cases representing acceptable boundaries
         edge_cases = np.array([
-            [5, 0, 0, 0, 60],  # Large microservices
-            [0, 0, 5, 0, 40],  # Content delivery network
-            [3, 0, 3, 2, 50],  # Legacy migration
-            [0, 0, 0, 3, 25],  # Development cluster
-            [6, 0, 2, 0, 70],  # API gateway with multiple services
+            [5, 0, 0, 0, 0, 0, 60],   # Large microservices
+            [0, 0, 5, 0, 0, 0, 40],   # Content delivery network
+            [3, 0, 3, 2, 0, 0, 50],   # Legacy migration
+            [0, 0, 0, 3, 0, 0, 25],   # Development cluster
+            [6, 0, 2, 0, 0, 0, 70],   # API gateway with multiple services
         ])
 
         augmented_data = np.vstack([augmented_data, edge_cases])
@@ -446,7 +446,9 @@ def _train_baseline_model(self):
                 'hardcoded_secrets': {'min': int(augmented_data[:, 1].min()), 'max': int(augmented_data[:, 1].max())},
                 'public_access': {'min': int(augmented_data[:, 2].min()), 'max': int(augmented_data[:, 2].max())},
                 'unencrypted_storage': {'min': int(augmented_data[:, 3].min()), 'max': int(augmented_data[:, 3].max())},
-                'total_resources': {'min': int(augmented_data[:, 4].min()), 'max': int(augmented_data[:, 4].max())},
+                'missing_logging': {'min': int(augmented_data[:, 4].min()), 'max': int(augmented_data[:, 4].max())},
+                'missing_flow_logs': {'min': int(augmented_data[:, 5].min()), 'max': int(augmented_data[:, 5].max())},
+                'total_resources': {'min': int(augmented_data[:, 6].min()), 'max': int(augmented_data[:, 6].max())},
             },
             'model_parameters': {
                 'contamination': 0.1,
diff --git a/test_files/mixed.tf b/test_files/mixed.tf
@@ -63,3 +63,18 @@ variable "db_password" {
   type        = string
   sensitive   = true
 }
+
+resource "aws_vpc" "app_vpc" {
+  cidr_block = "10.0.0.0/16"  # MEDIUM: No aws_flow_log present
+  tags = {
+    Name = "mixed-vpc"
+  }
+}
+
+resource "aws_cloudtrail" "app_trail" {
+  name           = "app-trail"
+  s3_bucket_name = aws_s3_bucket.app_bucket.bucket
+  # CloudTrail present — satisfies missing_logging rule
+}
+# NOTE: aws_cloudtrail present → no missing_logging vuln
+# NOTE: no aws_flow_log → triggers missing_vpc_flow_logs rule only
diff --git a/test_files/secure.tf b/test_files/secure.tf
@@ -72,4 +72,36 @@ variable "db_password" {
   description = "Database password"
   type        = string
   sensitive   = true
+}
+
+resource "aws_vpc" "main" {
+  cidr_block = "10.0.0.0/16"
+  tags = {
+    Name = "secure-vpc"
+  }
+}
+
+resource "aws_flow_log" "main" {
+  vpc_id          = aws_vpc.main.id
+  traffic_type    = "ALL"
+  iam_role_arn    = var.flow_log_role_arn
+  log_destination = aws_cloudwatch_log_group.flow_logs.arn
+}
+
+resource "aws_cloudwatch_log_group" "flow_logs" {
+  name              = "/aws/vpc/flow-logs"
+  retention_in_days = 90
+}
+
+resource "aws_cloudtrail" "main" {
+  name                          = "secure-trail"
+  s3_bucket_name                = aws_s3_bucket.main_bucket.bucket
+  include_global_service_events = true
+  is_multi_region_trail         = true
+  enable_log_file_validation    = true
+}
+
+variable "flow_log_role_arn" {
+  description = "IAM role ARN for VPC flow logs"
+  type        = string
 }
diff --git a/test_files/vulnerable.tf b/test_files/vulnerable.tf
@@ -59,4 +59,13 @@ resource "aws_s3_bucket" "main_bucket" {
   tags = {
     Environment = "test"
   }
-}
+}
+
+resource "aws_vpc" "main" {
+  cidr_block = "10.0.0.0/16"  # MEDIUM: No aws_flow_log present — VPC traffic is unmonitored
+  tags = {
+    Name = "vulnerable-vpc"
+  }
+}
+# NOTE: no aws_cloudtrail, no aws_cloudwatch_log_group → triggers missing_logging rule
+# NOTE: no aws_flow_log → triggers missing_vpc_flow_logs rule
diff --git a/tests/test_security_rules_logging.py b/tests/test_security_rules_logging.py
diff --git a/tests/test_security_scanner.py b/tests/test_security_scanner.py