From 2732a05261bb975e264fcb417032680dbcabefe3 Mon Sep 17 00:00:00 2001 From: SurbhiAgarwal1 Date: Thu, 7 May 2026 08:02:51 +0530 Subject: [PATCH 1/2] feat(javaagent): add library README support to explorer-db-builder (#242) - Extend InventoryManager to discover and load library READMEs from registry - Augment instrumentation metadata with markdown_hash and enable backfilling - Implement markdown publishing to public data directory in DatabaseWriter - Add frontend types and API support for README lazy loading --- .../src/collector_watcher/__init__.py | 5 +- .../src/explorer_db_builder/__init__.py | 5 +- .../explorer_db_builder/database_writer.py | 27 ++++++++ .../src/explorer_db_builder/main.py | 26 +++++++- .../metadata_backfiller.py | 2 +- .../java_instrumentation_watcher/__init__.py | 5 +- .../inventory_manager.py | 61 +++++++++++++++++++ .../src/lib/api/javaagent-data.ts | 8 +++ ecosystem-explorer/src/types/javaagent.ts | 2 + 9 files changed, 136 insertions(+), 5 deletions(-) diff --git a/ecosystem-automation/collector-watcher/src/collector_watcher/__init__.py b/ecosystem-automation/collector-watcher/src/collector_watcher/__init__.py index d248de14..023cc46b 100644 --- a/ecosystem-automation/collector-watcher/src/collector_watcher/__init__.py +++ b/ecosystem-automation/collector-watcher/src/collector_watcher/__init__.py @@ -16,4 +16,7 @@ import importlib.metadata -__version__ = importlib.metadata.version("collector-watcher") +try: + __version__ = importlib.metadata.version("collector-watcher") +except importlib.metadata.PackageNotFoundError: + __version__ = "0.0.0-dev" diff --git a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/__init__.py b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/__init__.py index ecc22fab..052d3b02 100644 --- a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/__init__.py +++ b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/__init__.py @@ -16,4 +16,7 @@ import 
def write_markdown(self, library_name: str, markdown_hash: str, content: str) -> None:
    """Write a library README to ``<database_dir>/markdown`` on a best-effort basis.

    The file is named ``{library_name}-{markdown_hash}.md``. If a file with that
    name already exists the call is a no-op. README publishing must never fail
    DB generation, so every filesystem error (including a failure to create the
    ``markdown`` directory) is logged and swallowed instead of being raised.

    The content is first written to a sibling ``.tmp`` file and then renamed into
    place, so an interrupted write cannot leave a truncated file that later runs
    would skip because it "already exists".

    Args:
        library_name: Name of the library
        markdown_hash: Hash of the markdown content
        content: Markdown content string
    """
    file_path = self.database_dir / "markdown" / f"{library_name}-{markdown_hash}.md"
    tmp_path = file_path.with_name(f"{file_path.name}.tmp")

    try:
        file_path.parent.mkdir(parents=True, exist_ok=True)

        if file_path.exists():
            logger.debug(f"Markdown for '{library_name}' with hash {markdown_hash} already exists, skipping write")
            return

        tmp_path.write_text(content, encoding="utf-8")
        tmp_path.replace(file_path)
    except (OSError, UnicodeError) as e:
        logger.error(f"Failed to write markdown for '{library_name}': {e}")
        # Best-effort cleanup of a partially written temp file; a failure here
        # is deliberately ignored because the original error is already logged.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        return

    # Only count the file once it is fully in place.
    self.files_written += 1
    self.total_bytes += len(content.encode("utf-8"))
    logger.debug(f"Wrote markdown for '{library_name}' with hash {markdown_hash}")
diff --git a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py index c092b314..bb88b696 100644 --- a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py +++ b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py @@ -139,9 +139,33 @@ def run_javaagent_builder( versions = get_release_versions(inventory_manager) logger.info(f"Processing {len(versions)} release versions") + # Pre-load README maps for all versions to enable augmentation and backfilling + readme_maps = {v: inventory_manager.load_library_readme_map(v) for v in versions} + + # Publish all READMEs to the database + for version, readme_map in readme_maps.items(): + for library_name, markdown_hash in readme_map.items(): + content = inventory_manager.load_library_readme_content(version, library_name, markdown_hash) + if content: + db_writer.write_markdown(library_name, markdown_hash, content) + + def load_and_augment_inventory(version: Version) -> dict: + inventory = inventory_manager.load_versioned_inventory(version) + readme_map = readme_maps.get(version, {}) + + # Augment libraries and custom instrumentations with markdown_hash + for key in ["libraries", "custom"]: + if key in inventory: + for item in inventory[key]: + name = item.get("name") + if name and name in readme_map: + item["markdown_hash"] = readme_map[name] + + return inventory + backfilled_libraries = backfill_metadata( versions, - inventory_manager.load_versioned_inventory, + load_and_augment_inventory, item_key="libraries", ) backfilled_inventories = backfill_metadata( diff --git a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/metadata_backfiller.py b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/metadata_backfiller.py index 5dba637f..51027b1f 100644 --- a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/metadata_backfiller.py +++ 
b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/metadata_backfiller.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) -BACKFILLABLE_FIELDS = ["display_name", "description", "library_link", "has_javaagent"] +BACKFILLABLE_FIELDS = ["display_name", "description", "library_link", "has_javaagent", "markdown_hash"] NESTED_BACKFILLABLE_FIELDS: dict[str, list[str]] = { "configurations": ["declarative_name", "examples"], } diff --git a/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/__init__.py b/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/__init__.py index 07daaff6..381a903a 100644 --- a/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/__init__.py +++ b/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/__init__.py @@ -16,4 +16,7 @@ import importlib.metadata -__version__ = importlib.metadata.version("java-instrumentation-watcher") +try: + __version__ = importlib.metadata.version("java-instrumentation-watcher") +except importlib.metadata.PackageNotFoundError: + __version__ = "0.0.0-dev" diff --git a/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/inventory_manager.py b/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/inventory_manager.py index 06d110f0..27993d5e 100644 --- a/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/inventory_manager.py +++ b/ecosystem-automation/java-instrumentation-watcher/src/java_instrumentation_watcher/inventory_manager.py @@ -17,6 +17,9 @@ from collections.abc import Iterable from typing import Any +import logging +import re + import yaml from semantic_version import Version from watcher_common.content_hashing import compute_content_hash @@ -113,3 +116,61 @@ def save_library_readmes( file_path.write_text(content, encoding="utf-8") written += 1 return written + + def 
def load_library_readme_map(self, version: Version) -> dict[str, str]:
    """
    Scan library_readmes/ and build a map of library_name -> markdown_hash.

    Only regular files with a ``.md`` suffix are considered. Files whose name
    does not match ``{library-name}-{hash}.md`` are reported with a warning and
    skipped. Directory entries are visited in sorted order so that, should two
    READMEs exist for the same library, the result is deterministic (the
    lexicographically last filename wins) and the conflict is logged.

    Args:
        version: Version to scan

    Returns:
        Dictionary mapping library names to their markdown content hashes;
        empty when the README directory does not exist.
    """
    readme_dir = self.get_version_dir(version) / self.README_DIR
    # is_dir() rather than exists(): a stray file at this path must not make iterdir() raise.
    if not readme_dir.is_dir():
        return {}

    log = logging.getLogger(__name__)
    readme_map: dict[str, str] = {}
    # iterdir() yields entries in arbitrary order; sort for reproducible output.
    for item in sorted(readme_dir.iterdir()):
        if not (item.is_file() and item.suffix == ".md"):
            continue

        parsed = self._parse_readme_filename(item.name)
        if parsed is None:
            log.warning(f"Malformed README filename in {version}: {item.name}")
            continue

        library_name, markdown_hash = parsed
        if library_name in readme_map:
            log.warning(f"Multiple READMEs for '{library_name}' in {version}; using {item.name}")
        readme_map[library_name] = markdown_hash

    return readme_map


def load_library_readme_content(self, version: Version, library_name: str, markdown_hash: str) -> str | None:
    """
    Load the content of a specific library README.

    Args:
        version: Version to load from
        library_name: Name of the library
        markdown_hash: Content hash of the markdown

    Returns:
        The markdown content, or None if it doesn't exist or cannot be read
        (I/O error or content that is not valid UTF-8)
    """
    file_path = self.get_version_dir(version) / self.README_DIR / f"{library_name}-{markdown_hash}.md"

    # Read first and handle failure instead of checking exists() up front: the
    # file can disappear between the check and the read.
    try:
        return file_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError):
        logging.getLogger(__name__).error(f"Failed to read README file: {file_path}")
        return None


def _parse_readme_filename(self, filename: str) -> tuple[str, str] | None:
    """
    Parse a README filename into (library_name, markdown_hash).

    Format: {library-name}-{hash}.md, where the hash is the lowercase-hex
    segment after the last hyphen. The library name may itself contain
    hyphens but must not be empty.

    Returns:
        ``(library_name, markdown_hash)``, or None when the name does not match.
    """
    # fullmatch + ".+" rejects an empty library name ("-abc.md") and a trailing newline.
    match = re.fullmatch(r"(.+)-([a-f0-9]+)\.md", filename)
    if match is None:
        return None
    return match.group(1), match.group(2)
*/ _is_custom?: boolean; } From 36e72b09b2c4fff4dee2626be3c5b69d99c0f931 Mon Sep 17 00:00:00 2001 From: SurbhiAgarwal1 Date: Thu, 7 May 2026 08:59:22 +0530 Subject: [PATCH 2/2] feat(db-builder): integrate Weaver for semconv compliance checking (#97) - Implement SemconvEnricher to validate telemetry via OTel Weaver - Insert enrichment stage into the javaagent builder pipeline - Add semconv_compliance field to Metric and Span models - Support dynamic versioning based on instrumentation schema_url --- SEMCONV_INTEGRATION_DETAIL.md | 76 +++++++ .../src/explorer_db_builder/main.py | 8 + .../explorer_db_builder/semconv_enricher.py | 206 ++++++++++++++++++ .../tests/test_semconv_enricher.py | 132 +++++++++++ ecosystem-explorer/src/types/javaagent.ts | 4 + 5 files changed, 426 insertions(+) create mode 100644 SEMCONV_INTEGRATION_DETAIL.md create mode 100644 ecosystem-automation/explorer-db-builder/src/explorer_db_builder/semconv_enricher.py create mode 100644 ecosystem-automation/explorer-db-builder/tests/test_semconv_enricher.py diff --git a/SEMCONV_INTEGRATION_DETAIL.md b/SEMCONV_INTEGRATION_DETAIL.md new file mode 100644 index 00000000..e542be4e --- /dev/null +++ b/SEMCONV_INTEGRATION_DETAIL.md @@ -0,0 +1,76 @@ +# Technical Detail: Semantic Convention Integration (Issue #97) + +This document provides a technical deep-dive into the implementation of the Semantic Convention compliance pipeline in the `explorer-db-builder`. + +## 1. Architectural Overview +The integration follows a "sidecar" enrichment pattern. Instead of modifying the core data structures, we introduce a `SemconvEnricher` that evaluates telemetry metadata against standard OTel registries using the **OpenTelemetry Weaver** engine. + +### Data Flow +1. **Extraction**: Retrieve metrics and spans from the normalized `InstrumentationData`. +2. **Translation**: Map OTel signals to a Weaver-compatible "Application Registry". +3. **Evaluation**: Execute `weaver registry check` against a specific semconv version. 
+4. **Annotation**: Persist compliance status back to the telemetry metadata. + +## 2. Component: `SemconvEnricher` +**Location**: `explorer_db_builder/semconv_enricher.py` + +This is the primary orchestrator for compliance checking. + +### Transformation Logic +The enricher generates a temporary directory containing: +- **`manifest.yaml`**: Defines the instrumentation name and the dependency on the official OTel semantic convention registry (e.g., `github.com/open-telemetry/semantic-conventions@v1.37.0`). +- **`telemetry.yaml`**: Translates internal metadata into Weaver's definition format. + - **Metrics**: Defined with `type: metric` and attributes using the `ref` keyword to ensure Weaver validates them against the registry's definitions. + - **Spans**: Defined with `type: span`, using synthetic IDs based on the instrumentation name and span kind (e.g., `activej-http.SERVER`). + +### Weaver Invocation +The enricher calls the `weaver` CLI via a subprocess. +- **Success Condition**: If `weaver registry check` exits with code 0, all signals defined in the registry are considered compliant. +- **Error Handling**: If errors are reported (any non-zero return code), the enricher parses the `stderr` output to identify specific signals that failed validation and marks them accordingly. + +## 3. Pipeline Integration +**Location**: `explorer_db_builder/main.py` + +The enrichment stage is integrated into `process_version` immediately after the `transform_instrumentation_format` call. + +```python +transformed_inventory = transform_instrumentation_format(inventory) + +# Enrich with semantic convention compliance +try: + enricher = SemconvEnricher() + enricher.enrich_inventory(transformed_inventory) +except Exception as e: + logger.warning(f"Semantic convention enrichment failed for version {version}: {e}") +``` + +This placement ensures that: +- Enrichment works on normalized, clean data. +- The pipeline remains resilient (a Weaver failure does not crash the build). + +## 4.
Frontend & Metadata Schema +**Location**: `ecosystem-explorer/src/types/javaagent.ts` + +The compliance status is persisted as a `semconv_compliance` array on individual telemetry signals: + +```json +{ + "name": "http.server.request.duration", + "unit": "s", + "semconv_compliance": ["1.37.0"] +} +``` + +This structure is extensible, allowing an instrumentation to be marked as compliant with multiple semantic convention versions over time. + +## 5. Verification & Testing +**Location**: `tests/test_semconv_enricher.py` + +A dedicated test suite validates the following: +- **YAML Generation**: Ensures the generated `manifest.yaml` and `telemetry.yaml` are valid and follow Weaver's specification. +- **Version Extraction**: Tests the regex-based extraction of versions from OTel schema URLs. +- **Mocked CLI Interactions**: Simulates various Weaver output scenarios (total success, partial failure, and system errors) to verify that the metadata is updated correctly. + +--- +**Branch**: `feat/97-semconv-integration` +**PR Title**: `feat(db-builder): integrate Weaver for semconv compliance checking (#97)` diff --git a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py index bb88b696..c542103c 100644 --- a/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py +++ b/ecosystem-automation/explorer-db-builder/src/explorer_db_builder/main.py @@ -27,6 +27,7 @@ from explorer_db_builder.database_writer import DatabaseWriter from explorer_db_builder.instrumentation_transformer import transform_instrumentation_format from explorer_db_builder.metadata_backfiller import backfill_metadata +from explorer_db_builder.semconv_enricher import SemconvEnricher logger = logging.getLogger(__name__) @@ -97,6 +98,13 @@ def process_version( transformed_inventory = transform_instrumentation_format(inventory) + # Enrich with semantic convention compliance + try: + enricher = 
class SemconvEnricher:
    """Enriches instrumentation metadata with Semantic Convention compliance information using Weaver.

    For every instrumentation that has telemetry, a temporary Weaver registry
    (``manifest.yaml`` plus ``telemetry.yaml``) is generated, ``weaver registry
    check`` is run against it, and each metric/span judged compliant gets the
    checked semconv version added to its ``semconv_compliance`` list.

    Signal ids used in the registry and in the result map:
      * metrics: the metric name
      * spans: ``"{instrumentation name}.{span_kind}"`` (see ``_span_group_id``)
    """

    # Semconv version assumed when schema_url is missing or carries no x.y.z version.
    DEFAULT_SEMCONV_VERSION = "1.37.0"

    def __init__(self, weaver_path: str = "weaver", timeout: Optional[float] = None):
        """
        Args:
            weaver_path: Path to the weaver executable.
            timeout: Optional limit in seconds for one ``weaver registry check``
                run, passed to ``subprocess.run``. ``None`` (default) means no limit.
        """
        self.weaver_path = weaver_path
        self.timeout = timeout

    def enrich_inventory(self, inventory_data: Dict[str, Any]) -> None:
        """Enriches an entire inventory (libraries and custom instrumentations).

        Args:
            inventory_data: Transformed inventory data. Missing or empty
                ``libraries`` / ``custom`` entries are skipped.
        """
        for key in ("libraries", "custom"):
            for instrumentation in inventory_data.get(key) or []:
                self.enrich_instrumentation(instrumentation)

    def enrich_instrumentation(self, instrumentation: Dict[str, Any]) -> None:
        """Enriches a single instrumentation with semconv compliance metadata.

        Best effort: any failure while building the registry, running Weaver or
        applying the results is logged as a warning and leaves this
        instrumentation without new annotations, so one bad instrumentation
        cannot stop the enrichment of the others.

        Args:
            instrumentation: Instrumentation data dictionary (modified in place).
        """
        if not instrumentation.get("telemetry"):
            return

        # POC: For now, we only support a single semconv version per instrumentation based on its schema_url
        schema_url = (instrumentation.get("scope") or {}).get("schema_url") or ""
        version = self._extract_version(schema_url) or self.DEFAULT_SEMCONV_VERSION

        try:
            # The registry only needs to live for the duration of the check.
            with tempfile.TemporaryDirectory() as temp_dir:
                self._prepare_weaver_registry(temp_dir, instrumentation, version)
                compliance_results = self._run_weaver_check(temp_dir)
            self._apply_compliance_metadata(instrumentation, compliance_results, version)
        except Exception as e:  # deliberate boundary: enrichment must never break the build
            logger.warning(f"Failed to run semconv compliance check for {instrumentation.get('name')}: {e}")

    def _extract_version(self, schema_url: str) -> Optional[str]:
        """Extracts the version from an OpenTelemetry schema URL.

        Args:
            schema_url: e.g. ``https://opentelemetry.io/schemas/1.37.0``.

        Returns:
            The ``x.y.z`` version following ``/schemas/``, or None if absent.
        """
        if not schema_url:
            return None
        match = re.search(r"/schemas/(\d+\.\d+\.\d+)", schema_url)
        return match.group(1) if match else None

    @staticmethod
    def _span_group_id(instrumentation: Dict[str, Any], span: Dict[str, Any]) -> str:
        """Returns the synthetic registry id of a span: ``{instrumentation name}.{span_kind}``."""
        return f"{instrumentation.get('name')}.{span.get('span_kind', 'unknown')}"

    @staticmethod
    def _add_attribute_refs(group: Dict[str, Any], signal: Dict[str, Any]) -> None:
        """Appends ``{"ref": name}`` for each named attribute of ``signal`` not already in ``group``."""
        known = {ref["ref"] for ref in group["attributes"]}
        for attribute in signal.get("attributes") or []:
            name = attribute.get("name")
            if name and name not in known:
                known.add(name)
                group["attributes"].append({"ref": name})

    def _prepare_weaver_registry(self, registry_dir: str, instrumentation: Dict[str, Any], version: str) -> None:
        """Prepares a Weaver-compatible registry directory.

        Always writes ``manifest.yaml``; writes ``telemetry.yaml`` only when the
        instrumentation defines at least one usable metric or span. Signals that
        map to the same id (the same metric name or span kind appearing in
        several telemetry entries) are merged into one group whose attribute
        list is the union of theirs, so the registry never contains two groups
        with the same id. Metrics without a name are skipped.

        Args:
            registry_dir: Temporary directory to create the registry in.
            instrumentation: Instrumentation data.
            version: Semantic Convention version to check against.
        """
        schema_url = (instrumentation.get("scope") or {}).get("schema_url")
        manifest = {
            "name": instrumentation.get("name", "check"),
            "schema_url": schema_url or f"https://opentelemetry.io/schemas/{version}",
            "dependencies": [
                {
                    "name": "otel",
                    "registry_path": f"https://github.com/open-telemetry/semantic-conventions@v{version}",
                }
            ],
        }
        with open(os.path.join(registry_dir, "manifest.yaml"), "w", encoding="utf-8") as f:
            yaml.dump(manifest, f)

        # Keyed by group id; dicts keep insertion order, so output order is stable.
        groups: Dict[str, Dict[str, Any]] = {}
        for entry in instrumentation.get("telemetry") or []:
            for metric in entry.get("metrics") or []:
                metric_name = metric.get("name")
                if not metric_name:
                    continue
                group = groups.setdefault(
                    metric_name,
                    {
                        "id": metric_name,
                        "type": "metric",
                        "attributes": [],
                        "metrics": [
                            {
                                "name": metric_name,
                                "brief": metric.get("description", "POC metric"),
                                "instrument": metric.get("instrument", "histogram"),
                                "unit": metric.get("unit", "s"),
                            }
                        ],
                    },
                )
                self._add_attribute_refs(group, metric)

            for span in entry.get("spans") or []:
                span_id = self._span_group_id(instrumentation, span)
                group = groups.setdefault(
                    span_id,
                    {
                        "id": span_id,
                        "type": "span",
                        "brief": "POC span",
                        "span_kind": str(span.get("span_kind") or "SERVER").lower(),
                        "attributes": [],
                    },
                )
                self._add_attribute_refs(group, span)

        if groups:
            telemetry_data = {"file_format": "definition/2", "groups": list(groups.values())}
            with open(os.path.join(registry_dir, "telemetry.yaml"), "w", encoding="utf-8") as f:
                yaml.dump(telemetry_data, f)

    def _run_weaver_check(self, registry_dir: str) -> Dict[str, bool]:
        """Runs weaver registry check and returns a map of signal ID to compliance status.

        The ids are read back from the generated ``telemetry.yaml``. When that
        file is absent or defines no groups there is nothing to check, so Weaver
        is not invoked and an empty map is returned.

        Args:
            registry_dir: Path to the Weaver registry.

        Returns:
            Dict mapping signal IDs to a boolean (True if compliant).

        Raises:
            OSError: If the weaver executable cannot be started.
            subprocess.TimeoutExpired: If ``timeout`` is set and exceeded.
        """
        telemetry_path = os.path.join(registry_dir, "telemetry.yaml")
        if not os.path.exists(telemetry_path):
            return {}
        with open(telemetry_path, encoding="utf-8") as f:
            telemetry_data = yaml.safe_load(f) or {}
        signal_ids = [group["id"] for group in telemetry_data.get("groups") or []]
        if not signal_ids:
            return {}

        cmd = [self.weaver_path, "registry", "check", "-r", registry_dir]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout)
        stderr = result.stderr or ""
        if result.returncode != 0:
            logger.debug(f"Weaver reported errors:\n{stderr}")

        return self._classify_signals(signal_ids, result.returncode, stderr)

    @staticmethod
    def _classify_signals(signal_ids: list, returncode: int, stderr: str) -> Dict[str, bool]:
        """Turns a Weaver exit status and stderr text into per-signal compliance flags.

        This is a simplified POC heuristic, not a parser for a stable Weaver
        output format:
          * exit code 0: every signal is compliant;
          * non-zero and some ids appear in stderr: exactly those ids are
            non-compliant;
          * non-zero and no id appears in stderr: every signal is non-compliant,
            because the failure cannot be attributed and must not be reported as
            compliance.

        An id only counts as mentioned when it is not part of a longer dotted or
        hyphenated identifier, so an error about ``a.b.c`` does not flag ``a.b``.

        Args:
            signal_ids: Ids of the groups defined in the registry.
            returncode: Exit status of ``weaver registry check``.
            stderr: Captured standard error of the run.

        Returns:
            Dict mapping every id in ``signal_ids`` to True (compliant) or False.
        """
        if returncode == 0:
            return {signal_id: True for signal_id in signal_ids}

        failed = {
            signal_id
            for signal_id in signal_ids
            if re.search(rf"(?<![\w.\-]){re.escape(signal_id)}(?![\w\-]|\.\w)", stderr)
        }
        if not failed:
            return {signal_id: False for signal_id in signal_ids}
        return {signal_id: signal_id not in failed for signal_id in signal_ids}

    @staticmethod
    def _mark_compliant(signal: Dict[str, Any], version: str) -> None:
        """Adds ``version`` to the signal's ``semconv_compliance`` list unless already present."""
        versions = signal.setdefault("semconv_compliance", [])
        if version not in versions:
            versions.append(version)

    def _apply_compliance_metadata(
        self, instrumentation: Dict[str, Any], results: Dict[str, bool], version: str
    ) -> None:
        """Applies compliance results back to the instrumentation data.

        Signals missing from ``results`` are treated as non-compliant. Applying
        the same results twice does not duplicate the version entry.

        Args:
            instrumentation: The instrumentation dict to modify.
            results: Map of signal ID to compliance status.
            version: The semconv version checked.
        """
        for entry in instrumentation.get("telemetry") or []:
            for metric in entry.get("metrics") or []:
                if results.get(metric.get("name"), False):
                    self._mark_compliant(metric, version)

            for span in entry.get("spans") or []:
                if results.get(self._span_group_id(instrumentation, span), False):
                    self._mark_compliant(span, version)
class TestSemconvEnricher(unittest.TestCase):
    """Unit tests for SemconvEnricher.

    ``subprocess.run`` or ``_run_weaver_check`` is patched in the tests that
    would otherwise invoke the Weaver executable.
    """

    def setUp(self):
        # One metric and one SERVER span, both carrying the same attribute name.
        duration_metric = {
            "name": "http.server.request.duration",
            "attributes": [{"name": "http.request.method"}],
        }
        server_span = {
            "span_kind": "SERVER",
            "attributes": [{"name": "http.request.method"}],
        }
        self.enricher = SemconvEnricher()
        self.sample_instrumentation = {
            "name": "test-lib",
            "scope": {"schema_url": "https://opentelemetry.io/schemas/1.37.0"},
            "telemetry": [{"metrics": [duration_metric], "spans": [server_span]}],
        }

    def _load_yaml(self, directory, filename):
        """Parse ``filename`` inside ``directory`` as YAML."""
        with open(os.path.join(directory, filename)) as handle:
            return yaml.safe_load(handle)

    def _check_sample_registry(self, mock_run, returncode, stderr):
        """Build the sample registry in a temp dir and run the patched Weaver check on it."""
        import tempfile

        mock_run.return_value = MagicMock(returncode=returncode, stderr=stderr)
        with tempfile.TemporaryDirectory() as registry_dir:
            self.enricher._prepare_weaver_registry(registry_dir, self.sample_instrumentation, "1.37.0")
            return self.enricher._run_weaver_check(registry_dir)

    def test_extract_version(self):
        expectations = [
            ("https://opentelemetry.io/schemas/1.37.0", "1.37.0"),
            ("invalid-url", None),
            ("", None),
        ]
        for schema_url, expected in expectations:
            self.assertEqual(self.enricher._extract_version(schema_url), expected)

    def test_prepare_weaver_registry(self):
        import tempfile

        with tempfile.TemporaryDirectory() as registry_dir:
            self.enricher._prepare_weaver_registry(registry_dir, self.sample_instrumentation, "1.37.0")

            # manifest.yaml names the instrumentation and pins the semconv registry version.
            manifest = self._load_yaml(registry_dir, "manifest.yaml")
            dependency = manifest["dependencies"][0]
            self.assertEqual(manifest["name"], "test-lib")
            self.assertEqual(dependency["name"], "otel")
            self.assertIn("v1.37.0", dependency["registry_path"])

            # telemetry.yaml holds one group per signal.
            telemetry = self._load_yaml(registry_dir, "telemetry.yaml")
            self.assertEqual(telemetry["file_format"], "definition/2")
            groups = telemetry["groups"]
            self.assertEqual(len(groups), 2)

            metric_group = next(group for group in groups if group["type"] == "metric")
            self.assertEqual(metric_group["id"], "http.server.request.duration")
            self.assertEqual(metric_group["metrics"][0]["name"], "http.server.request.duration")
            self.assertEqual(metric_group["attributes"][0]["ref"], "http.request.method")

            span_group = next(group for group in groups if group["type"] == "span")
            self.assertEqual(span_group["id"], "test-lib.SERVER")
            self.assertEqual(span_group["span_kind"], "server")
            self.assertEqual(span_group["attributes"][0]["ref"], "http.request.method")

    @patch("subprocess.run")
    def test_run_weaver_check_success(self, mock_run):
        results = self._check_sample_registry(mock_run, 0, "")

        self.assertTrue(results["http.server.request.duration"])
        self.assertTrue(results["test-lib.SERVER"])

    @patch("subprocess.run")
    def test_run_weaver_check_failure(self, mock_run):
        # stderr names the metric only, so the span must stay compliant.
        results = self._check_sample_registry(
            mock_run,
            1,
            "[Error] signals/telemetry.yaml: http.server.request.duration attribute 'foo' not found",
        )

        self.assertFalse(results["http.server.request.duration"])
        self.assertTrue(results["test-lib.SERVER"])

    @patch.object(SemconvEnricher, "_run_weaver_check")
    def test_enrich_instrumentation(self, mock_check):
        mock_check.return_value = {
            "http.server.request.duration": True,
            "test-lib.SERVER": False,
        }

        self.enricher.enrich_instrumentation(self.sample_instrumentation)

        telemetry_entry = self.sample_instrumentation["telemetry"][0]
        self.assertEqual(telemetry_entry["metrics"][0]["semconv_compliance"], ["1.37.0"])
        self.assertNotIn("semconv_compliance", telemetry_entry["spans"][0])

    def test_enrich_instrumentation_no_telemetry(self):
        instrumentation = {"name": "empty"}
        self.enricher.enrich_instrumentation(instrumentation)
        self.assertEqual(instrumentation, {"name": "empty"})