diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..a5d865d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,42 @@
+# Runs the test suite with coverage on pushes and pull requests targeting main.
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install uv
+        run: pip install uv
+
+      - name: Create venv
+        run: uv venv .venv
+
+      - name: Activate venv and install dependencies
+        run: |
+          source .venv/bin/activate
+          uv pip install -e ".[test]"
+
+      - name: Run tests with coverage
+        run: |
+          source .venv/bin/activate
+          pytest --cov=data-gen --cov-report=xml --cov-fail-under=0
+
+      - name: Upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-xml
+          path: coverage.xml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a916b51
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11-slim
+
+# Install uv
+RUN pip install uv
+
+# Set workdir and copy project
+WORKDIR /app
+COPY . .
+
+# requirements.txt is optional: install it only when present, so that a genuine
+# install failure aborts the build rather than being silently ignored.
+RUN if [ -f requirements.txt ]; then uv pip install --system -r requirements.txt; fi
+# Install the project itself into the system interpreter (no venv in the image).
+RUN uv pip install --system .
+
+# Default command (for development)
+CMD ["bash"]
diff --git a/README.md b/README.md
index 481a475..18ca5fd 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,57 @@
-# data-gen
-The one-stop shop to generate synthetic data for GenAI post-training.
+# Data-Gen
+
+## Project Overview
+Data-Gen is a modular toolkit for generating synthetic data tailored for LLM post-training and evaluation. The project is structured as a monorepo to support multiple components, including a core data generation module, a backend API, and a web client.
+
+## Monorepo Structure
+```
+data-gen/core/  # Core synthetic data generation logic (Python, uv-managed)
+web/            # (future) Web client
+backend/        # (future) Backend API
+```
+
+## Getting Started
+- See `data-gen/core/README.md` for details on the core module.
+- Each component is managed independently.
+
+## Local Development (Core Module)
+
+1. Install [uv](https://github.com/astral-sh/uv):
+   ```sh
+   pip install uv
+   ```
+2. Create a virtual environment (recommended):
+   ```sh
+   uv venv .venv
+   source .venv/bin/activate
+   ```
+3. Install dependencies:
+   ```sh
+   uv pip install -e .
+   ```
+4. Run tests:
+   ```sh
+   uv pip install -e ".[test]"  # installs pytest and pytest-cov
+   pytest
+   ```
+
+## Docker Usage
+
+1. Build the Docker image:
+   ```sh
+   docker build -t data-gen .
+   ```
+2. Run the container:
+   ```sh
+   docker run -it data-gen
+   ```
+
+> **Note:** If you see an error about "No virtual environment found" when building Docker, ensure the Dockerfile uses `uv pip install --system ...` for global installs (already set up in this repo).
+
+## Component Descriptions
+- **data-gen/core/**: Implements the main logic for generating synthetic data for LLM post-training. (First component to be implemented)
+- **web/**: (Planned) Web-based user interface for data generation and management.
+- **backend/**: (Planned) API backend for orchestration and integration.
+
+## Contributing
+- Contributions are welcome! Please see the contributing guidelines in each component folder.
diff --git a/data-gen/core/README.md b/data-gen/core/README.md
new file mode 100644
index 0000000..b604a3d
--- /dev/null
+++ b/data-gen/core/README.md
@@ -0,0 +1,13 @@
+# Core Module: Synthetic Data Generation
+
+This module implements the core logic for generating synthetic data for LLM post-training and evaluation.
+
+## Structure
+- `core/` — Python package with core logic
+- `tests/` — Unit tests for the core module
+
+## Setup
+- Managed with [uv](https://github.com/astral-sh/uv) and PEP 621 (`pyproject.toml`)
+
+## Usage
+- (Coming soon) Example usage and API documentation.
diff --git a/data-gen/core/__init__.py b/data-gen/core/__init__.py
new file mode 100644
index 0000000..0519ecb
--- /dev/null
+++ b/data-gen/core/__init__.py
@@ -0,0 +1 @@
+"""Core synthetic data generation logic for LLM post-training."""
diff --git a/data-gen/core/tests/__init__.py b/data-gen/core/tests/__init__.py
new file mode 100644
index 0000000..0519ecb
--- /dev/null
+++ b/data-gen/core/tests/__init__.py
@@ -0,0 +1 @@
+"""Unit tests for the core module."""
diff --git a/data-gen/core/tests/test_smoke.py b/data-gen/core/tests/test_smoke.py
new file mode 100644
index 0000000..840f7d2
--- /dev/null
+++ b/data-gen/core/tests/test_smoke.py
@@ -0,0 +1,3 @@
+def test_smoke() -> None:
+    """Always-passing placeholder so CI and the coverage step have a test to run."""
+    assert True
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..28db653
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
+[project]
+name = "core"
+version = "0.1.0"
+description = "Core synthetic data generation logic for LLM post-training."
+authors = [
+    { name = "Your Name", email = "your@email.com" }
+]
+requires-python = ">=3.9"
+readme = "README.md"
+
+[project.optional-dependencies]
+test = [
+    "pytest",
+    "pytest-cov"
+]
+
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+# The importable `core` package lives under data-gen/, which automatic
+# discovery does not pick up ("data-gen" is not a valid package name).
+where = ["data-gen"]