generate llms.txt for our docs #3273

Open · wants to merge 18 commits into base: develop
90 changes: 90 additions & 0 deletions .github/workflows/docs_summarization_check.yml
@@ -0,0 +1,90 @@
name: Check Docs Summarization

on:
  push:
    branches: [release/**]

jobs:
  check-batch:
    runs-on: ubuntu-latest
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
Contributor comment:
Not sure if I am missing something here, but it seems like the only trigger for this workflow is any push event on the release branches. I have two concerns about this:

  1. The if condition here, github.event.workflow_run.conclusion == 'success', might never evaluate to true: push events do not carry a workflow_run payload, so there is no conclusion to check (see the sketch below).

  2. The current trigger logic might also fire when we backport docs changes or similar things to the existing release branches. Is this the desired behaviour?
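
A minimal sketch of what point 1 is getting at, assuming the intent is to chain this check off the submit workflow defined later in this PR (the workflow name below mirrors that file; this is a suggestion, not part of the diff):

    on:
      workflow_run:
        workflows: ["Submit Docs Summarization"]
        types:
          - completed

    jobs:
      check-batch:
        runs-on: ubuntu-latest
        # With a workflow_run trigger, github.event.workflow_run is populated,
        # so this guard can actually evaluate to true:
        if: ${{ github.event.workflow_run.conclusion == 'success' }}

That said, the OpenAI batch has a 24-hour completion window, so it may not be finished by the time the submit workflow completes; a cron schedule trigger that polls the batch status could be the more robust choice.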

    permissions:
      contents: read
      id-token: write
      actions: read

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install openai huggingface_hub

      - name: List artifacts
        uses: actions/github-script@v6
        id: artifacts
        with:
          # Return the raw string; the default JSON encoding would wrap the
          # artifact name in quotes and break the download step below.
          result-encoding: string
          script: |
            const artifacts = await github.rest.actions.listArtifactsForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,  // the property is `repo`; `context.repo.name` does not exist
            });
            const batchArtifact = artifacts.data.artifacts
              .find(artifact => artifact.name.startsWith('batch-id-'));
            if (!batchArtifact) {
              throw new Error('No batch ID artifact found');
            }
            console.log(`Found artifact: ${batchArtifact.name}`);
            return batchArtifact.name;

      - name: Download batch ID
        uses: actions/download-artifact@v3
        with:
          name: ${{ steps.artifacts.outputs.result }}

      - name: Download repomix outputs
        uses: actions/download-artifact@v3
        with:
          name: repomix-outputs
          path: repomix-outputs

      - name: Process batch results and upload to HuggingFace
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Process OpenAI batch results
          python scripts/check_batch_output.py

          # Upload all files to HuggingFace
Contributor comment:

The first half of this step lives in the check_batch_output.py script, whereas the second half is hard-coded here. Why don't we merge the two? Wouldn't it be easier to manage that way? (A merged sketch follows the step below.)

          python -c '
          from huggingface_hub import HfApi
          import os

          api = HfApi()

          # Upload OpenAI summary
          api.upload_file(
              token=os.environ["HF_TOKEN"],
              repo_id="zenml/llms.txt",
              repo_type="dataset",
              path_in_repo="how-to-guides.txt",
              path_or_fileobj="zenml_docs.txt",
          )

          # Upload repomix outputs
          for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
              api.upload_file(
                  token=os.environ["HF_TOKEN"],
                  repo_id="zenml/llms.txt",
                  repo_type="dataset",
                  path_in_repo=filename,
                  path_or_fileobj=f"repomix-outputs/{filename}",
              )
          '
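
A sketch of the merge the comment above suggests, folding the upload into check_batch_output.py so the workflow step reduces to a single `python scripts/check_batch_output.py` call (the `upload_to_huggingface` name and the structure are hypothetical):

    import os
    from huggingface_hub import HfApi

    def upload_to_huggingface() -> None:
        """Upload the OpenAI summary and the repomix outputs to the HF dataset."""
        api = HfApi(token=os.environ["HF_TOKEN"])
        # Summary file written by main() above
        api.upload_file(
            repo_id="zenml/llms.txt",
            repo_type="dataset",
            path_in_repo="how-to-guides.txt",
            path_or_fileobj="zenml_docs.txt",
        )
        # repomix outputs downloaded by the workflow in the previous step
        for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
            api.upload_file(
                repo_id="zenml/llms.txt",
                repo_type="dataset",
                path_in_repo=filename,
                path_or_fileobj=f"repomix-outputs/{filename}",
            )

    if __name__ == "__main__":
        main()
        upload_to_huggingface()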
76 changes: 76 additions & 0 deletions .github/workflows/docs_summarization_submit.yml
@@ -0,0 +1,76 @@
name: Submit Docs Summarization

on:
  workflow_run:
    workflows: ["release-prepare"]
    types:
      - completed

jobs:
  submit-batch:
    runs-on: ubuntu-latest
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    permissions:
      contents: read
      id-token: write
      actions: write

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # pathlib is part of the standard library on Python 3; installing the
          # PyPI backport can shadow it, so it is not listed here
          pip install openai repomix

      - name: Generate repomix outputs
        run: |
          # Create directory for outputs
          mkdir -p repomix-outputs

          # Full docs
          repomix --include "docs/book/**/*.md"
          mv repomix-output.txt repomix-outputs/llms-full.txt

          # Component guide
          repomix --include "docs/book/component-guide/**/*.md"
          mv repomix-output.txt repomix-outputs/component-guide.txt

          # User guide
          repomix --include "docs/book/user-guide/**/*.md"
          mv repomix-output.txt user-guide.txt

          # Getting started
          repomix --include "docs/book/getting-started/**/*.md"
          mv repomix-output.txt getting-started.txt

          # Merge user guide and getting started into basics
          cat user-guide.txt getting-started.txt > repomix-outputs/basics.txt
          rm user-guide.txt getting-started.txt

      - name: Upload repomix outputs
        uses: actions/upload-artifact@v3
        with:
          name: repomix-outputs
          path: repomix-outputs
          retention-days: 5

      - name: Submit batch job
        id: submit
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/summarize_docs.py
          echo "batch_id=$(cat batch_id.txt)" >> $GITHUB_OUTPUT

      - name: Upload batch ID
        uses: actions/upload-artifact@v3
        with:
          name: batch-id-${{ steps.submit.outputs.batch_id }}
          path: batch_id.txt
          retention-days: 5
34 changes: 34 additions & 0 deletions scripts/check_batch_output.py
@@ -0,0 +1,34 @@
import json
Contributor comment:

All the .py files still need the license.
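
For reference, a sketch of the kind of header the comment is asking for, assuming ZenML's standard Apache-2.0 header applies (copy the exact text from an existing source file rather than from here):

    #  Copyright (c) ZenML GmbH 2025. All Rights Reserved.
    #
    #  Licensed under the Apache License, Version 2.0 (the "License");
    #  you may not use this file except in compliance with the License.
    #  You may obtain a copy of the License at:
    #
    #       https://www.apache.org/licenses/LICENSE-2.0
    #
    #  Unless required by applicable law or agreed to in writing, software
    #  distributed under the License is distributed on an "AS IS" BASIS,
    #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
    #  or implied. See the License for the specific language governing
    #  permissions and limitations under the License.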

from openai import OpenAI

client = OpenAI()

def main():
    # Read the batch ID from file
    with open("batch_id.txt", "r") as f:
        batch_id = f.read().strip()

    # Get the batch results file
    batch = client.batches.retrieve(batch_id)
    if batch.status != "completed":
        raise Exception(
            f"Batch job {batch_id} is not completed. Status: {batch.status}"
        )

    # Get the output file
    file_response = client.files.content(batch.output_file_id)
    text = file_response.text

    # Process the results and write to file
    with open("zenml_docs.txt", "w") as f:
        for line in text.splitlines():
            json_line = json.loads(line)

            # Extract and format the file path from custom_id
            # (assumes the original path contained no literal underscores)
            file_path = "-".join(json_line["custom_id"].split("-")[2:]).replace("_", "/")

            # Write the file path and content
            f.write(f"File: {file_path}\n\n")
            f.write(json_line["response"]["body"]["choices"][0]["message"]["content"])
            f.write("\n\n" + "=" * 80 + "\n\n")

if __name__ == "__main__":
    main()
118 changes: 118 additions & 0 deletions scripts/summarize_docs.py
@@ -0,0 +1,118 @@
import os
import re
import json
from openai import OpenAI
from pathlib import Path
from typing import List, Dict

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def extract_content_blocks(md_content: str) -> str:
    """Extracts content blocks while preserving order and marking code blocks."""
    parts = re.split(r'(```[\s\S]*?```)', md_content)

    processed_content = ""
    for part in parts:
        if part.startswith('```'):
            processed_content += "\n[CODE_BLOCK_START]\n" + part + "\n[CODE_BLOCK_END]\n"
        else:
            cleaned_text = re.sub(r'\s+', ' ', part).strip()
            if cleaned_text:
                processed_content += "\n" + cleaned_text + "\n"

    return processed_content

def prepare_batch_requests(md_files: List[Path]) -> List[Dict]:
    """Prepares batch requests for each markdown file."""
    batch_requests = []

    for i, file_path in enumerate(md_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            processed_content = extract_content_blocks(content)

            file_path_str_with_no_slashes = str(file_path).replace("/", "_")

            # Prepare the request for this file
            request = {
                "custom_id": f"file-{i}-{file_path_str_with_no_slashes}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4-turbo-preview",
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are a technical documentation summarizer."
                        },
                        {
                            "role": "user",
                            "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
Keep all important technical information and key points while removing redundancy and verbose explanations.
Make it concise, but ensure NO critical information is lost and the details you consider important are kept.
Shorten the code where possible, keeping only the most important parts while preserving syntax and accuracy:

{processed_content}"""
                        }
                    ],
                    "temperature": 0.3,
                    "max_tokens": 2000
                }
            }
            batch_requests.append(request)

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    return batch_requests

def submit_batch_job(batch_requests: List[Dict]) -> str:
    """Submits batch job to OpenAI and returns batch ID."""
    # Create batch input file
    batch_file_path = "batch_input.jsonl"
    with open(batch_file_path, "w") as f:
        for request in batch_requests:
            f.write(json.dumps(request) + "\n")

    # Upload the file
    with open(batch_file_path, "rb") as f:
        batch_input_file = client.files.create(
            file=f,
            purpose="batch"
        )

    # Create the batch
    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "ZenML docs summarization"
        }
    )

    # Store batch ID for later use
    with open("batch_id.txt", "w") as f:
        f.write(batch.id)

    print(f"Batch job submitted with ID: {batch.id}")
    return batch.id

def main():
    docs_dir = "docs/book"

    # Get markdown files
    exclude_files = ["toc.md"]
    md_files = list(Path(docs_dir).rglob("*.md"))
    md_files = [file for file in md_files if file.name not in exclude_files]

    # Prepare and submit batch job
    batch_requests = prepare_batch_requests(md_files)
    submit_batch_job(batch_requests)

if __name__ == "__main__":
    main()