Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@

import importlib.metadata

# Resolve the installed distribution's version; fall back to a dev placeholder
# when the package is not installed (e.g. running from a source checkout).
# NOTE: the previous unguarded lookup raised PackageNotFoundError at import time.
try:
    __version__ = importlib.metadata.version("collector-watcher")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0-dev"
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@

import importlib.metadata

# Resolve the installed distribution's version; fall back to a dev placeholder
# when the package is not installed (e.g. running from a source checkout).
# NOTE: the previous unguarded lookup raised PackageNotFoundError at import time.
try:
    __version__ = importlib.metadata.version("explorer-db-builder")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0-dev"
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import json
import logging
import re
import shutil
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -46,6 +47,10 @@ def __init__(self, database_dir: str = "ecosystem-explorer/public/data/javaagent
self.files_written = 0
self.total_bytes = 0

def _sanitize_name(self, name: str) -> str:
"""Sanitizes a name for use as a filename to prevent path traversal."""
return re.sub(r"[^a-zA-Z0-9._\-]", "_", name)

def _get_file_path(self, library_name: str, library_hash: str) -> Path:
    """Get the file path for a library with the given name and hash.

    The library name is sanitized first so caller-supplied names cannot
    escape the instrumentations directory (path traversal).

    Args:
        library_name: Name of the library
        library_hash: Content hash of the library data

    Returns:
        Path to the library JSON file
    """
    safe_name = self._sanitize_name(library_name)
    instrumentations_dir = self.database_dir / "instrumentations" / safe_name
    # Make sure the per-library directory exists before handing back the path.
    instrumentations_dir.mkdir(parents=True, exist_ok=True)
    return instrumentations_dir / f"{safe_name}-{library_hash}.json"

def write_libraries(self, libraries: list[dict[str, Any]]) -> dict[str, str]:
"""Write library data to content-addressed files.
Expand Down Expand Up @@ -203,6 +209,35 @@ def write_version_list(self, versions: list[Version]) -> None:
logger.error(f"Failed to write version list: {e}")
raise

def write_markdown(self, library_name: str, markdown_hash: str, content: str) -> None:
    """Write a library README markdown file into the database.

    Files are content-addressed (sanitized name + hash), so an existing
    file with the same hash is assumed identical and the write is skipped.
    Write failures are logged but deliberately not re-raised.

    Args:
        library_name: Name of the library
        markdown_hash: Hash of the markdown content
        content: Markdown content string
    """
    markdown_dir = self.database_dir / "markdown"
    markdown_dir.mkdir(parents=True, exist_ok=True)

    safe_name = self._sanitize_name(library_name)
    target = markdown_dir / f"{safe_name}-{markdown_hash}.md"

    # Content-addressed dedup: same hash means same content.
    if target.exists():
        logger.debug(f"Markdown for '{safe_name}' with hash {markdown_hash} already exists, skipping write")
        return

    try:
        with open(target, "w", encoding="utf-8") as fh:
            fh.write(content)
        self.files_written += 1
        self.total_bytes += len(content.encode("utf-8"))
        logger.debug(f"Wrote markdown for '{safe_name}' with hash {markdown_hash}")
    except OSError as e:
        logger.error(f"Failed to write markdown for '{safe_name}': {e}")
        # README publishing failures must never fail DB generation as per requirements

Comment thread
SurbhiAgarwal1 marked this conversation as resolved.
Comment on lines +212 to +240
def get_stats(self) -> dict[str, Any]:
"""Get statistics about files written during this session.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,33 @@ def run_javaagent_builder(
versions = get_release_versions(inventory_manager)
logger.info(f"Processing {len(versions)} release versions")

# Pre-load README maps for all versions to enable augmentation and backfilling
readme_maps = {v: inventory_manager.load_library_readme_map(v) for v in versions}

# Publish all READMEs to the database
for version, readme_map in readme_maps.items():
for library_name, markdown_hash in readme_map.items():
content = inventory_manager.load_library_readme_content(version, library_name, markdown_hash)
if content is not None:
db_writer.write_markdown(library_name, markdown_hash, content)

def load_and_augment_inventory(version: Version) -> dict:
    """Load one version's inventory and stamp each library/custom entry
    with its README markdown hash, when one was discovered for this version."""
    inventory = inventory_manager.load_versioned_inventory(version)
    version_readmes = readme_maps.get(version, {})

    for section in ["libraries", "custom"]:
        if section in inventory:
            for entry in inventory[section]:
                entry_name = entry.get("name")
                if entry_name and entry_name in version_readmes:
                    entry["markdown_hash"] = version_readmes[entry_name]

    return inventory

Comment thread
SurbhiAgarwal1 marked this conversation as resolved.
Comment on lines +142 to +165
backfilled_libraries = backfill_metadata(
versions,
inventory_manager.load_versioned_inventory,
load_and_augment_inventory,
item_key="libraries",
)
backfilled_inventories = backfill_metadata(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

logger = logging.getLogger(__name__)

# Top-level item fields eligible for backfilling across versions.
# "markdown_hash" was added so README hashes propagate to older versions.
BACKFILLABLE_FIELDS = ["display_name", "description", "library_link", "has_javaagent", "markdown_hash"]
# Nested structures whose listed sub-fields are also backfillable.
NESTED_BACKFILLABLE_FIELDS: dict[str, list[str]] = {
    "configurations": ["declarative_name", "examples"],
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,54 @@ def test_multiple_versions_workflow(self, db_writer, temp_db_dir):
# Verify structure
assert (temp_db_dir / "versions" / "1.0.0-index.json").exists()
assert (temp_db_dir / "versions" / "2.0.0-index.json").exists()


class TestWriteMarkdown:
    """Tests for markdown file writing."""

    def test_write_markdown_success(self, db_writer, temp_db_dir):
        """A fresh markdown file is created and session stats are updated."""
        name, digest, body = "test-lib", "abc123def456", "# Test README"

        db_writer.write_markdown(name, digest, body)

        # File lands under markdown/ following the {name}-{hash}.md convention.
        written = temp_db_dir / "markdown" / f"{name}-{digest}.md"
        assert written.exists()
        assert written.read_text(encoding="utf-8") == body

        # Exactly one write is reflected in the stats.
        assert db_writer.files_written == 1
        assert db_writer.total_bytes == len(body.encode("utf-8"))

    def test_write_markdown_deduplication(self, db_writer, temp_db_dir, caplog):
        """Re-writing the same name+hash is a no-op and logs a skip message."""
        import logging
        caplog.set_level(logging.DEBUG)

        name, digest, body = "test-lib", "abc123def456", "# Test README"

        db_writer.write_markdown(name, digest, body)
        assert db_writer.files_written == 1

        # Second identical write must not bump the counters.
        db_writer.write_markdown(name, digest, body)
        assert db_writer.files_written == 1
        assert "already exists, skipping write" in caplog.text

    def test_write_markdown_error_handling(self, db_writer):
        """An OSError during write is logged rather than raised."""
        from unittest.mock import patch

        with patch("builtins.open", side_effect=OSError("Disk full")), \
                patch("explorer_db_builder.database_writer.logger") as mock_logger:
            db_writer.write_markdown("error-lib", "hash", "content")

        mock_logger.error.assert_called()
        args, _ = mock_logger.error.call_args
        assert "Failed to write markdown" in args[0]
47 changes: 43 additions & 4 deletions ecosystem-automation/explorer-db-builder/tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,45 @@ def test_run_builder_processes_all_versions(self, mock_inventory_manager, mock_d
assert mock_db_writer.write_libraries.call_count == 3
assert mock_db_writer.write_version_index.call_count == 3

def test_run_builder_processes_readmes(self, mock_inventory_manager, mock_db_writer):
    """Verifies READMEs are discovered, published, and hashes injected."""
    versions = [Version("1.0.0")]
    readme_map = {"lib1": "abc123def456", "custom1": "fed4321cba98"}
    readme_content = "# README content"

    mock_inventory_manager.list_versions.return_value = versions
    mock_inventory_manager.load_versioned_inventory.return_value = {
        "file_format": 0.2,
        "libraries": [{"name": "lib1"}],
        "custom": [{"name": "custom1"}],
    }
    mock_inventory_manager.load_library_readme_map.return_value = readme_map
    mock_inventory_manager.load_library_readme_content.return_value = readme_content
    mock_db_writer.write_libraries.return_value = {"lib1": "hash1"}

    assert run_javaagent_builder(mock_inventory_manager, mock_db_writer) == 0

    # READMEs were loaded once per version and published once per library.
    assert mock_inventory_manager.load_library_readme_map.call_count == 1
    assert mock_inventory_manager.load_library_readme_content.call_count == 2
    assert mock_db_writer.write_markdown.call_count == 2
    mock_db_writer.write_markdown.assert_any_call("lib1", "abc123def456", readme_content)
    mock_db_writer.write_markdown.assert_any_call("custom1", "fed4321cba98", readme_content)

    # Hashes were injected before library data was written.
    write_calls = mock_db_writer.write_libraries.call_args_list
    libs = write_calls[0][0][0]        # first call: libraries
    assert libs[0]["name"] == "lib1"
    assert libs[0]["markdown_hash"] == "abc123def456"
    custom = write_calls[1][0][0]      # second call: custom instrumentations
    assert custom[0]["name"] == "custom1"
    assert custom[0]["markdown_hash"] == "fed4321cba98"

def test_run_builder_uses_backfilled_inventories(self, mock_inventory_manager, mock_db_writer):
versions = [Version("1.0.0"), Version("2.0.0")]
inventory_1_0 = {
Expand All @@ -235,17 +274,17 @@ def test_run_builder_uses_backfilled_inventories(self, mock_inventory_manager, m

# Verify backfilled data is written: version 1.0.0 should have display_name backfilled
write_calls = mock_db_writer.write_libraries.call_args_list
# We expect 2 calls: one for version 1.0.0 libraries, one for version 2.0.0 libraries
# (Custom instrumentations are empty, so they aren't called)
assert len(write_calls) == 2

# First call is for version 1.0.0 - should have backfilled display_name
# First call is for version 1.0.0 libraries - should have backfilled display_name
libraries_v1 = write_calls[0][0][0]
assert len(libraries_v1) == 1
assert libraries_v1[0]["name"] == "lib1"
assert libraries_v1[0]["display_name"] == "Library 1"

# Second call is for version 2.0.0 - should have original display_name
# Second call is for version 2.0.0 libraries - should have original display_name
libraries_v2 = write_calls[1][0][0]
assert len(libraries_v2) == 1
assert libraries_v2[0]["name"] == "lib1"
assert libraries_v2[0]["display_name"] == "Library 1"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@

import importlib.metadata

# Resolve the installed distribution's version; fall back to a dev placeholder
# when the package is not installed (e.g. running from a source checkout).
# NOTE: the previous unguarded lookup raised PackageNotFoundError at import time.
try:
    __version__ = importlib.metadata.version("java-instrumentation-watcher")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0-dev"
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,18 @@
from collections.abc import Iterable
from typing import Any

import logging
import re

import yaml
from semantic_version import Version
from watcher_common.content_hashing import compute_content_hash
from watcher_common.inventory_manager import BaseInventoryManager


logger = logging.getLogger(__name__)


class InventoryManager(BaseInventoryManager):
"""Manages Java instrumentation inventory storage and retrieval."""

Expand Down Expand Up @@ -96,6 +102,10 @@ def readme_dir_exists(self, version: Version) -> bool:
"""Return True if the library_readmes directory exists for this version."""
return (self.get_version_dir(version) / self.README_DIR).exists()

def _sanitize_name(self, name: str) -> str:
"""Sanitizes a name for use as a filename to prevent path traversal."""
return re.sub(r"[^a-zA-Z0-9._\-]", "_", name)

def save_library_readmes(
self,
version: Version,
Expand All @@ -107,9 +117,100 @@ def save_library_readmes(
written = 0
for name, content in readmes:
digest = compute_content_hash(content)
file_path = target_dir / f"{name}-{digest}.md"
safe_name = self._sanitize_name(name)
file_path = target_dir / f"{safe_name}-{digest}.md"
if file_path.exists():
continue
file_path.write_text(content, encoding="utf-8")
written += 1
return written

def load_library_readme_map(self, version: Version) -> dict[str, str]:
"""
Scan library_readmes/ and build a map of library_name -> markdown_hash.

When multiple README files exist for one library, the newest by mtime
wins, with ties broken by lexicographically greatest filename, so the
result is deterministic; a warning lists every hash seen in that case.

Args:
version: Version to scan

Returns:
Dictionary mapping library names to their markdown content hashes
"""
readme_dir = self.get_version_dir(version) / self.README_DIR
# No README directory for this version -> nothing to map.
if not readme_dir.exists():
return {}

# Tracking for deterministic selection: library_name -> (hash, mtime_ns, filename)
selected_readmes: dict[str, tuple[str, int, str]] = {}
# Every hash observed per library; used only for the duplicate warning below.
seen_hashes: dict[str, set[str]] = {}

# Sorted iteration keeps scan order (and thus tie-breaking) stable across runs.
for item in sorted(readme_dir.iterdir(), key=lambda p: p.name):
if item.is_file() and item.suffix == ".md":
parsed = self._parse_readme_filename(item.name)
if parsed:
library_name, markdown_hash = parsed
# Record the hash even if the stat below fails, so the warning is complete.
seen_hashes.setdefault(library_name, set()).add(markdown_hash)

try:
mtime_ns = item.stat().st_mtime_ns
except OSError:
# Unreadable entry: log and keep scanning the remaining files.
logger.warning(f"Failed to stat README file in {version}: {item.name}")
continue

current = selected_readmes.get(library_name)
if current is None:
selected_readmes[library_name] = (markdown_hash, mtime_ns, item.name)
else:
_, current_mtime_ns, current_name = current
# Pick newest by mtime, fallback to lexicographical name for total determinism
if mtime_ns > current_mtime_ns or (
mtime_ns == current_mtime_ns and item.name > current_name
):
selected_readmes[library_name] = (markdown_hash, mtime_ns, item.name)
else:
# Files not matching the {name}-{12-hex-hash}.md pattern are skipped.
logger.warning(f"Malformed README filename in {version}: {item.name}")

readme_map = {}
for library_name, (markdown_hash, _, selected_name) in selected_readmes.items():
readme_map[library_name] = markdown_hash
hashes = seen_hashes.get(library_name, set())
# Surface ambiguity: multiple distinct hashes imply stale or duplicate files.
if len(hashes) > 1:
logger.warning(
f"Multiple README files found for library '{library_name}' in {version}; "
f"selected '{selected_name}' with hash '{markdown_hash}'. "
f"Available hashes: {sorted(hashes)}"
)

return readme_map

def load_library_readme_content(self, version: Version, library_name: str, markdown_hash: str) -> str | None:
    """
    Load the content of a specific library README.

    Args:
        version: Version to load from
        library_name: Name of the library
        markdown_hash: Content hash of the markdown

    Returns:
        The markdown content, or None if it doesn't exist or cannot be read
    """
    safe_name = self._sanitize_name(library_name)
    file_path = (
        self.get_version_dir(version) / self.README_DIR / f"{safe_name}-{markdown_hash}.md"
    )
    # Missing file is an expected condition, not an error.
    if not file_path.exists():
        return None

    try:
        return file_path.read_text(encoding="utf-8")
    except OSError as e:
        logger.error(f"Failed to read README file '{file_path}': {e}")
        return None

def _parse_readme_filename(self, filename: str) -> tuple[str, str] | None:
"""
Parse a README filename into (library_name, markdown_hash).
Format: {library-name}-{hash}.md
"""
match = re.match(r"^(.+)-([a-f0-9]{12})\.md$", filename)
if match:
return match.group(1), match.group(2)
return None
Loading
Loading