Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 120
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[FORMAT]
max-line-length=120
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ backend/ # (future) Backend API

## Component Descriptions
- **core/**: Implements the main logic for generating synthetic data for LLM post-training. (First component to be implemented)
- **web/**: (Planned) Web-based user interface for data generation and management.
- **backend/**: (Planned) API backend for orchestration and integration.

## Contributing
- Contributions are welcome! Please see the contributing guidelines in each component folder.
13 changes: 0 additions & 13 deletions data-gen/core/README.md

This file was deleted.

1 change: 0 additions & 1 deletion data-gen/core/tests/__init__.py

This file was deleted.

3 changes: 0 additions & 3 deletions data-gen/core/tests/test_smoke.py

This file was deleted.

Empty file added data_gen/core/__init__.py
Empty file.
59 changes: 59 additions & 0 deletions data_gen/core/generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional


class DataGenerator(ABC):
    """Abstract base class for data generation using various LLM APIs.

    Concrete subclasses implement :meth:`generate_data`. The base class keeps
    the optional API key and builds the per-record prompt texts.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Store the optional API key on the instance.

        Args:
            api_key: Value kept as ``self.api_key``; defaults to ``None``.
        """
        self.api_key = api_key

    @abstractmethod
    def generate_data(self, requirement: str, num_records: int) -> List[Dict[str, Any]]:
        """Generate synthetic data based on user requirement.

        Args:
            requirement: Natural language description of data requirements
            num_records: Number of records to generate

        Returns:
            List of generated data records
        """

    def _convert_requirement_to_prompts(self, requirement: str, num_records: int) -> List[str]:
        """Convert user requirement to specific data generation prompts.

        The first prompt is the plain base prompt. Every later prompt is the
        base prompt followed by a sentence asking for a record distinct from
        the previous ones, tagged with its 1-based record number.

        Args:
            requirement: Natural language data requirement
            num_records: Number of records to generate

        Returns:
            List of specific prompts, one per record. Empty when
            ``num_records`` is zero or negative.
        """
        # Enhanced prompt with better instructions for synthetic data generation
        base_prompt = f"""You are a synthetic data generator. Generate
realistic synthetic data based on this requirement:
{requirement}

Instructions:
- Return only valid JSON without any additional text, explanations,
or markdown formatting
- The JSON should represent a single data record that matches the requirement
- Ensure the data is realistic and varied
- Use appropriate data types (strings, numbers, booleans, arrays, objects as needed)
- Make each record unique and realistic

Generate one data record now:
"""

        # Record #1 uses the base prompt unchanged; records #2..#N get a suffix
        # that encourages diversity between records.
        prompts = [base_prompt] if num_records > 0 else []
        prompts.extend(
            f"{base_prompt}\n\nMake this record distinct from previous records (this is record #{number})."
            for number in range(2, num_records + 1)
        )
        return prompts
126 changes: 126 additions & 0 deletions data_gen/core/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import inspect
import logging
import os
from enum import Enum
from typing import Optional, Union

from colorama import Fore
from dotenv import load_dotenv


class Logger:
    """Singleton wrapper around a ``logging.Logger`` with colored messages and extra levels.

    ``__new__`` always returns the same instance and ``__init__`` configures it
    only once, so arguments passed to later constructions are ignored.
    """

    _instance = None
    _initialized = False

    class LoggingLevel(Enum):
        """Supported levels: the standard ``logging`` ones plus three custom levels (25-27)."""

        DEBUG = logging.DEBUG
        INFO = logging.INFO
        TOOL = 25
        TASK = 26
        THOUGHT_PROCESS = 27
        WARNING = logging.WARNING
        ERROR = logging.ERROR
        CRITICAL = logging.CRITICAL

        @classmethod
        def from_string(cls, level_string: str):
            """Return the member named ``level_string`` (matched case-insensitively).

            Args:
                level_string: Level name such as ``"info"`` or ``"THOUGHT_PROCESS"``.

            Returns:
                The matching ``LoggingLevel`` member.

            Raises:
                ValueError: If no member has that name.
            """
            try:
                return cls[level_string.upper()]
            except KeyError as err:
                # Keep the failed lookup as the explicit cause of the error.
                raise ValueError(f"Invalid logging level: {level_string}") from err

    def __new__(cls, *args, **kwargs):
        """Create the shared instance on first use and return it on every call."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
        self,
        logger_name: str,
        verbose: bool = True,
        display_level: Optional[Union[LoggingLevel, str]] = None,
    ):
        """Configure the underlying logger; does nothing once already initialized.

        Args:
            logger_name: Name passed to ``logging.getLogger``.
            verbose: Accepted for API compatibility; not read by this class.
            display_level: Minimum level to emit, as a ``LoggingLevel`` or its
                name. When ``None``, the ``LOGGING_LEVEL`` environment variable
                is used (default ``"INFO"``), read after ``load_dotenv``.

        Raises:
            ValueError: If a level name (argument or environment) is not a
                ``LoggingLevel`` member name.
        """
        if self._initialized:
            return

        load_dotenv(override=True)

        if display_level is None:
            env_level = os.getenv("LOGGING_LEVEL", "INFO")
            self.logging_level = self.LoggingLevel.from_string(env_level)
        elif isinstance(display_level, str):
            self.logging_level = self.LoggingLevel.from_string(display_level)
        else:
            self.logging_level = display_level

        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(self.logging_level.value)

        # Remove all existing handlers. Iterate over a copy: removeHandler
        # mutates logger.handlers, and looping over the live list would skip
        # every other handler.
        for handler in list(self.logger.handlers):
            self.logger.removeHandler(handler)

        self.formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s (%(filename)s:%(lineno)d)")
        self.console_handler = logging.StreamHandler()
        self.console_handler.setLevel(self.logging_level.value)
        self.console_handler.setFormatter(self.formatter)
        self.logger.addHandler(self.console_handler)

        # Add custom log levels so records show their names instead of "Level 25" etc.
        logging.addLevelName(self.LoggingLevel.TOOL.value, "TOOL")
        logging.addLevelName(self.LoggingLevel.TASK.value, "TASK")
        logging.addLevelName(self.LoggingLevel.THOUGHT_PROCESS.value, "THOUGHT_PROCESS")

        self._initialized = True

    def _log(self, message: str, level: LoggingLevel, color: str) -> None:
        """Emit ``message`` at ``level``, prefixed with caller info and wrapped in ``color``.

        Messages below the configured ``logging_level`` are dropped without
        inspecting the stack.

        Args:
            message: Text to log.
            level: Severity of the message.
            color: Prefix string (a colorama ``Fore`` code); ``Fore.RESET`` is appended.
        """
        if level.value < self.logging_level.value:
            return

        # One stack walk, without source-context lines (context=0): only the
        # function name and line number of the chosen frame are needed.
        stack = inspect.stack(0)
        # Prefer the frame three levels above this one, falling back to the
        # outermost available frame when the stack is shallower.
        # NOTE(review): for caller -> info() -> _log(), index 3 is the caller's
        # caller, not the direct caller — confirm this is intended.
        caller_frame = stack[min(3, len(stack) - 1)]
        message = f"{caller_frame.function}({caller_frame.lineno}): {message}"
        log_message = color + message + Fore.RESET

        # Logger.log(level, ...) is what debug()/info()/warning()/... delegate
        # to, so a single call covers standard and custom levels alike.
        self.logger.log(level.value, log_message)

    def debug(self, message: str) -> None:
        """Log ``message`` at DEBUG level in black."""
        self._log(message, self.LoggingLevel.DEBUG, Fore.BLACK)

    def info(self, message: str) -> None:
        """Log ``message`` at INFO level in white."""
        self._log(message, self.LoggingLevel.INFO, Fore.WHITE)

    def tool_log(self, message: str) -> None:
        """Log ``message`` at the custom TOOL level in yellow."""
        self._log(message, self.LoggingLevel.TOOL, Fore.YELLOW)

    def task_log(self, message: str) -> None:
        """Log ``message`` at the custom TASK level in blue."""
        self._log(message, self.LoggingLevel.TASK, Fore.BLUE)

    def thought_process_log(self, message: str) -> None:
        """Log ``message`` at the custom THOUGHT_PROCESS level in green."""
        self._log(message, self.LoggingLevel.THOUGHT_PROCESS, Fore.GREEN)

    def warning(self, message: str) -> None:
        """Log ``message`` at WARNING level in yellow."""
        self._log(message, self.LoggingLevel.WARNING, Fore.YELLOW)

    def error(self, message: str) -> None:
        """Log ``message`` at ERROR level in red."""
        self._log(message, self.LoggingLevel.ERROR, Fore.RED)

    def critical(self, message: str) -> None:
        """Log ``message`` at CRITICAL level in magenta."""
        self._log(message, self.LoggingLevel.CRITICAL, Fore.MAGENTA)
File renamed without changes.
79 changes: 79 additions & 0 deletions data_gen/core/tests/test_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Pytest style tests. Max line length: 120 (see .flake8, .pylintrc, pyproject.toml)
import pytest
from typing import List, Dict, Any

from ..generator import DataGenerator


class TestDataGenerator(DataGenerator):
    """Concrete implementation of DataGenerator for testing."""

    # The "Test" name prefix matches pytest's default class-collection pattern.
    # Opt out explicitly: this is a helper, and pytest would otherwise warn
    # that it cannot collect a test class that defines __init__.
    __test__ = False

    def generate_data(self, requirement: str, num_records: int) -> List[Dict[str, Any]]:
        """Return ``num_records`` placeholder records of the form ``{"test": "record_<i>"}``.

        Args:
            requirement: Ignored by this stub.
            num_records: Number of placeholder records to build.

        Returns:
            List of single-key dicts, numbered from 0.
        """
        return [{"test": f"record_{i}"} for i in range(num_records)]


@pytest.fixture
def generator():
    """Provide a ``TestDataGenerator`` constructed without an API key."""
    instance = TestDataGenerator()
    return instance


def test_initialization_with_api_key():
    """A key passed to the constructor is exposed unchanged as ``api_key``."""
    expected_key = "test_api_key"
    instance = TestDataGenerator(api_key=expected_key)
    assert instance.api_key == expected_key


def test_initialization_without_api_key():
    """Constructing with no arguments leaves ``api_key`` as ``None``."""
    assert TestDataGenerator().api_key is None


def test_convert_requirement_to_prompts_basic(generator):
    """One prompt per record; the first embeds the requirement and the key instructions."""
    requirement, record_count = "Generate user profiles", 3
    prompts = generator._convert_requirement_to_prompts(requirement, record_count)
    assert len(prompts) == record_count
    first_prompt = prompts[0]
    for expected_fragment in (requirement, "JSON", "synthetic data"):
        assert expected_fragment in first_prompt


def test_convert_requirement_to_prompts_variation(generator):
    """Only prompts after the first carry the numbered "distinct" suffix."""
    record_count = 5
    prompts = generator._convert_requirement_to_prompts("Generate product data", record_count)
    assert "record #" not in prompts[0]
    for position in range(1, record_count):
        later_prompt = prompts[position]
        # Record numbers in the prompt text are 1-based.
        assert f"record #{position + 1}" in later_prompt
        assert "distinct" in later_prompt


def test_convert_requirement_to_prompts_single_record(generator):
    """A single-record request yields exactly one prompt, without the numbered suffix."""
    prompts = generator._convert_requirement_to_prompts("Generate a single user", 1)
    assert len(prompts) == 1
    only_prompt = prompts[0]
    assert "record #" not in only_prompt


def test_convert_requirement_to_prompts_empty_requirement(generator):
    """An empty requirement still produces one instruction-bearing prompt per record."""
    record_count = 2
    prompts = generator._convert_requirement_to_prompts("", record_count)
    assert len(prompts) == record_count
    first_prompt = prompts[0]
    assert "JSON" in first_prompt


def test_convert_requirement_to_prompts_zero_records(generator):
    """Requesting zero records yields no prompts."""
    prompts = generator._convert_requirement_to_prompts("Generate data", 0)
    prompt_count = len(prompts)
    assert prompt_count == 0


def test_generate_data_abstract_method(generator):
    """``generate_data`` is abstract on the base class and usable through a concrete subclass."""
    # The base class leaves generate_data abstract, so it must refuse direct instantiation.
    with pytest.raises(TypeError):
        DataGenerator()  # pylint: disable=abstract-class-instantiated

    # A subclass that overrides generate_data returns one dict per requested record.
    requirement = "Generate test data"
    num_records = 3
    result = generator.generate_data(requirement, num_records)
    assert len(result) == num_records
    assert isinstance(result, list)
    assert isinstance(result[0], dict)
15 changes: 14 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ authors = [
]
requires-python = ">=3.9"
readme = "README.md"
dependencies = [
"python-dotenv>=1.0.0"
]

[project.optional-dependencies]
test = [
Expand All @@ -16,4 +19,14 @@ test = [

[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
build-backend = "setuptools.build_meta"

[tool.poetry.dependencies]
colorama = "^0.4.6"
python-dotenv = "^1.0.1"

[tool.black]
line-length = 120

[tool.isort]
line_length = 120
Loading
Loading