generate llms.txt for our docs (#3273)
* Add new toc (#3255)
* Add new server management and collaboration features
* Add Python environment configuration guides
* Add understanding of ZenML artifacts and complex use-cases
* test redirect
* one more
* revert redirects
* revert redirects
* add page placeholder for collaborate with team
* add icon
* move files to the right directories
* update toc with new paths
* add all redirects
* remove .md and README from the left pane
* fix all broken links
* fix more links

---------

Co-authored-by: Jayesh Sharma <[email protected]>
(cherry picked from commit ae73e2e)

* docs and code separate
* first version
* use the batch api
* write file from batch output
* 70k version of docs
* slightly improved with filenames
* add file names to output
* add workflows
* add scripts
* fix repomix output
* add gemini script
* rm testing files
* update huggingface repo name
* add the full docs too
* rm docs file
* rm breakpoint
* rm if condition as it is useless
* move hf logic to python script
* update model
* use realtime instead of batch
* add license
* rm gemini for now
* yamlfix
* Auto-update of Starter template
* Auto-update of LLM Finetuning template
* Auto-update of E2E template
* Auto-update of NLP template

---------

Co-authored-by: Hamza Tahir <[email protected]>
Co-authored-by: GitHub Actions <[email protected]>
1 parent ffa4ec3 · commit fec0956 · Showing 3 changed files with 181 additions and 0 deletions.
@@ -0,0 +1,59 @@
---
name: Summarize and Upload Docs
on:
  push:
    branches: [release/**]
jobs:
  summarize-and-upload:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install openai pathlib huggingface_hub
          npm install -g repomix
      - name: Generate repomix outputs
        run: |
          # Create directory for outputs
          mkdir -p repomix-outputs
          # Full docs
          repomix --include "docs/book/**/*.md"
          mv repomix-output.txt repomix-outputs/llms-full.txt
          # Component guide
          repomix --include "docs/book/component-guide/**/*.md"
          mv repomix-output.txt repomix-outputs/component-guide.txt
          # User guide
          repomix --include "docs/book/user-guide/**/*.md"
          mv repomix-output.txt user-guide.txt
          # Getting started
          repomix --include "docs/book/getting-started/**/*.md"
          mv repomix-output.txt getting-started.txt
          # Merge user guide and getting started into basics
          cat user-guide.txt getting-started.txt > repomix-outputs/basics.txt
          rm user-guide.txt getting-started.txt
      - name: Upload repomix outputs
        uses: actions/upload-artifact@v4
        with:
          name: repomix-outputs
          path: repomix-outputs
          retention-days: 5
      - name: Summarize and upload to HuggingFace
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |-
          python scripts/summarize_docs.py
          python scripts/upload_to_huggingface.py
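For context, the final merge in the "Generate repomix outputs" step can be reproduced locally. The sketch below is not part of the commit; it assumes repomix has already written user-guide.txt and getting-started.txt to the working directory and simply mirrors the workflow's cat/rm commands in Python.

# Local sketch of the "merge into basics" shell step (cat + rm), assuming repomix
# has already produced user-guide.txt and getting-started.txt in the working directory.
from pathlib import Path

outputs = Path("repomix-outputs")
outputs.mkdir(exist_ok=True)

with (outputs / "basics.txt").open("w", encoding="utf-8") as merged:
    for name in ("user-guide.txt", "getting-started.txt"):
        part = Path(name)
        merged.write(part.read_text(encoding="utf-8"))
        part.unlink()  # mirrors `rm user-guide.txt getting-started.txt`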
@@ -0,0 +1,97 @@
# Copyright (c) ZenML GmbH 2025. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#       https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import os
import re
import json
from openai import OpenAI
from pathlib import Path
from typing import List, Dict

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))


def extract_content_blocks(md_content: str) -> str:
    """Extracts content blocks while preserving order and marking code blocks."""
    parts = re.split(r'(```[\s\S]*?```)', md_content)

    processed_content = ""
    for part in parts:
        if part.startswith('```'):
            processed_content += "\n[CODE_BLOCK_START]\n" + part + "\n[CODE_BLOCK_END]\n"
        else:
            cleaned_text = re.sub(r'\s+', ' ', part).strip()
            if cleaned_text:
                processed_content += "\n" + cleaned_text + "\n"

    return processed_content


def summarize_content(content: str, file_path: str) -> str:
    """Summarizes content using OpenAI API."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a technical documentation summarizer."
                },
                {
                    "role": "user",
                    "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
                    Keep all important technical information and key points while removing redundancy and verbose explanations.
                    Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
                    Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:

                    {content}"""
                }
            ],
            temperature=0.3,
            max_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing {file_path}: {e}")
        return ""


def main():
    docs_dir = "docs/book"
    output_file = "summarized_docs.txt"

    # Get markdown files
    exclude_files = ["toc.md"]
    md_files = list(Path(docs_dir).rglob("*.md"))
    md_files = [file for file in md_files if file.name not in exclude_files]

    # Process each file and write summaries
    with open(output_file, "w", encoding="utf-8") as out_f:
        for file_path in md_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                processed_content = extract_content_blocks(content)
                summary = summarize_content(processed_content, str(file_path))

                if summary:
                    out_f.write(f"=== File: {file_path} ===\n\n")
                    out_f.write(summary)
                    out_f.write("\n\n" + "="*50 + "\n\n")

                print(f"Processed: {file_path}")

            except Exception as e:
                print(f"Error processing {file_path}: {e}")


if __name__ == "__main__":
    main()
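To see what this script feeds the model, the standalone sketch below replays the splitting logic from extract_content_blocks on a tiny markdown snippet (the regex and CODE_BLOCK markers are copied from the script above; the sample string is made up): prose is whitespace-collapsed, while fenced code is kept verbatim between explicit markers.

# Standalone sketch of the splitting logic in extract_content_blocks;
# regex and markers copied from the script above, sample text is invented.
import re

sample = (
    "Some   intro prose with   extra spacing.\n\n"
    "```python\nprint('hello')\n```\n\n"
    "Closing remarks."
)

processed = ""
for part in re.split(r'(```[\s\S]*?```)', sample):
    if part.startswith('```'):
        # fenced code is preserved verbatim between explicit markers
        processed += "\n[CODE_BLOCK_START]\n" + part + "\n[CODE_BLOCK_END]\n"
    else:
        # prose is collapsed to single spaces so the prompt stays compact
        cleaned = re.sub(r'\s+', ' ', part).strip()
        if cleaned:
            processed += "\n" + cleaned + "\n"

print(processed)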
@@ -0,0 +1,25 @@
from huggingface_hub import HfApi
import os


def upload_to_huggingface():
    api = HfApi(token=os.environ["HF_TOKEN"])

    # Upload OpenAI summary
    api.upload_file(
        path_or_fileobj="summarized_docs.txt",
        path_in_repo="how-to-guides.txt",
        repo_id="zenml/llms.txt",
        repo_type="dataset"
    )

    # Upload repomix outputs
    for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
        api.upload_file(
            path_or_fileobj=f"repomix-outputs/{filename}",
            path_in_repo=filename,
            repo_id="zenml/llms.txt",
            repo_type="dataset"
        )


if __name__ == "__main__":
    upload_to_huggingface()
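After the workflow runs, the generated files live in the zenml/llms.txt dataset repo on Hugging Face. A minimal consumer-side sketch for pulling one of them back down with huggingface_hub (assuming the dataset is publicly readable; otherwise pass a token):

from huggingface_hub import hf_hub_download

# Download one of the generated bundles from the dataset repo into the local HF cache.
local_path = hf_hub_download(
    repo_id="zenml/llms.txt",
    filename="llms-full.txt",
    repo_type="dataset",
)
print(local_path)  # path to the cached copy of llms-full.txt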