generate llms.txt for our docs (#3273)
* Add new toc (#3255)

* Add new server management and collaboration features

* Add Python environment configuration guides

* Add understanding of ZenML artifacts and complex use-cases

* test redirect

* one more

* revert redirects

* revert redirects

* add page placeholder for collaborate with team

* add icon

* move files to the right directories

* update toc with new paths

* add all redirects

* remove .md and README from the left pane

* fix all broken links

* fix more links

---------

Co-authored-by: Jayesh Sharma <[email protected]>
(cherry picked from commit ae73e2e)

* docs and code separate

* first version

* use the batch api

* write file from batch output

* 70k version of docs

* slightly improved with filenames

* add file names to output

* add workflows

* add scripts

* fix repomix output

* add gemini script

* rm testing files

* update huggingface repo name

* add the full docs too

* rm docs file

* rm breakpoint

* rm if condition as it is useless

* move hf logic to python script

* update model

* use realtime instead of batch

* add license

* rm gemini for now

* yamlfix

* Auto-update of Starter template

* Auto-update of LLM Finetuning template

* Auto-update of E2E template

* Auto-update of NLP template

---------

Co-authored-by: Hamza Tahir <[email protected]>
Co-authored-by: GitHub Actions <[email protected]>
3 people authored Jan 21, 2025
1 parent ffa4ec3 commit fec0956
Showing 3 changed files with 181 additions and 0 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/docs_summarization_submit.yml
@@ -0,0 +1,59 @@
---
name: Summarize and Upload Docs
on:
  push:
    branches: [release/**]
jobs:
  summarize-and-upload:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install openai pathlib huggingface_hub
          npm install -g repomix
      - name: Generate repomix outputs
        run: |
          # Create directory for outputs
          mkdir -p repomix-outputs
          # Full docs
          repomix --include "docs/book/**/*.md"
          mv repomix-output.txt repomix-outputs/llms-full.txt
          # Component guide
          repomix --include "docs/book/component-guide/**/*.md"
          mv repomix-output.txt repomix-outputs/component-guide.txt
          # User guide
          repomix --include "docs/book/user-guide/**/*.md"
          mv repomix-output.txt user-guide.txt
          # Getting started
          repomix --include "docs/book/getting-started/**/*.md"
          mv repomix-output.txt getting-started.txt
          # Merge user guide and getting started into basics
          cat user-guide.txt getting-started.txt > repomix-outputs/basics.txt
          rm user-guide.txt getting-started.txt
      - name: Upload repomix outputs
        uses: actions/upload-artifact@v4
        with:
          name: repomix-outputs
          path: repomix-outputs
          retention-days: 5
      - name: Summarize and upload to HuggingFace
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |-
          python scripts/summarize_docs.py
          python scripts/upload_to_huggingface.py
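The repomix packaging above can also be reproduced locally before pushing a release branch. The following Python sketch is not part of the commit; it assumes repomix is installed globally (npm install -g repomix, as in the install step) and is run from the repository root.

# Hypothetical local reproduction of the "Generate repomix outputs" step above.
# Assumes `npm install -g repomix` has been run and the working directory is the repo root.
import shutil
import subprocess
from pathlib import Path

OUTPUTS = Path("repomix-outputs")
OUTPUTS.mkdir(exist_ok=True)

def pack(include: str, destination: Path) -> None:
    """Run repomix on one docs subtree and move its default output file."""
    subprocess.run(["repomix", "--include", include], check=True)
    shutil.move("repomix-output.txt", str(destination))

pack("docs/book/**/*.md", OUTPUTS / "llms-full.txt")
pack("docs/book/component-guide/**/*.md", OUTPUTS / "component-guide.txt")
pack("docs/book/user-guide/**/*.md", Path("user-guide.txt"))
pack("docs/book/getting-started/**/*.md", Path("getting-started.txt"))

# Merge user guide and getting started into basics.txt, mirroring the workflow.
basics = (
    Path("user-guide.txt").read_text(encoding="utf-8")
    + Path("getting-started.txt").read_text(encoding="utf-8")
)
(OUTPUTS / "basics.txt").write_text(basics, encoding="utf-8")
Path("user-guide.txt").unlink()
Path("getting-started.txt").unlink()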
97 changes: 97 additions & 0 deletions scripts/summarize_docs.py
@@ -0,0 +1,97 @@
# Copyright (c) ZenML GmbH 2025. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import os
import re
import json
from openai import OpenAI
from pathlib import Path
from typing import List, Dict

# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def extract_content_blocks(md_content: str) -> str:
    """Extracts content blocks while preserving order and marking code blocks."""
    parts = re.split(r'(```[\s\S]*?```)', md_content)

    processed_content = ""
    for part in parts:
        if part.startswith('```'):
            processed_content += "\n[CODE_BLOCK_START]\n" + part + "\n[CODE_BLOCK_END]\n"
        else:
            cleaned_text = re.sub(r'\s+', ' ', part).strip()
            if cleaned_text:
                processed_content += "\n" + cleaned_text + "\n"

    return processed_content

def summarize_content(content: str, file_path: str) -> str:
    """Summarizes content using OpenAI API."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a technical documentation summarizer."
                },
                {
                    "role": "user",
                    "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
                    Keep all important technical information and key points while removing redundancy and verbose explanations.
                    Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
                    Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:
                    {content}"""
                }
            ],
            temperature=0.3,
            max_tokens=2000
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error summarizing {file_path}: {e}")
        return ""

def main():
    docs_dir = "docs/book"
    output_file = "summarized_docs.txt"

    # Get markdown files
    exclude_files = ["toc.md"]
    md_files = list(Path(docs_dir).rglob("*.md"))
    md_files = [file for file in md_files if file.name not in exclude_files]

    # Process each file and write summaries
    with open(output_file, "w", encoding="utf-8") as out_f:
        for file_path in md_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                processed_content = extract_content_blocks(content)
                summary = summarize_content(processed_content, str(file_path))

                if summary:
                    out_f.write(f"=== File: {file_path} ===\n\n")
                    out_f.write(summary)
                    out_f.write("\n\n" + "="*50 + "\n\n")

                print(f"Processed: {file_path}")

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

if __name__ == "__main__":
    main()
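A quick way to sanity-check the preprocessing is to call extract_content_blocks directly. The snippet below is a hypothetical smoke test, not part of the commit; it assumes scripts/ is importable and that OPENAI_API_KEY is set, since the module builds its OpenAI client at import time.

# Hypothetical smoke test for extract_content_blocks (assumes scripts/ is on sys.path
# and OPENAI_API_KEY is exported, because importing the module constructs an OpenAI client).
import sys
sys.path.insert(0, "scripts")

from summarize_docs import extract_content_blocks

sample = (
    "# Quickstart\n\n"
    "Install ZenML and run your first pipeline.\n\n"
    "```bash\npip install zenml\n```\n"
)
print(extract_content_blocks(sample))
# Prose is collapsed to single-spaced text; the fenced block is kept verbatim
# between [CODE_BLOCK_START] and [CODE_BLOCK_END] markers.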
25 changes: 25 additions & 0 deletions scripts/upload_to_huggingface.py
@@ -0,0 +1,25 @@
from huggingface_hub import HfApi
import os

def upload_to_huggingface():
    api = HfApi(token=os.environ["HF_TOKEN"])

    # Upload OpenAI summary
    api.upload_file(
        path_or_fileobj="summarized_docs.txt",
        path_in_repo="how-to-guides.txt",
        repo_id="zenml/llms.txt",
        repo_type="dataset"
    )

    # Upload repomix outputs
    for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
        api.upload_file(
            path_or_fileobj=f"repomix-outputs/{filename}",
            path_in_repo=filename,
            repo_id="zenml/llms.txt",
            repo_type="dataset"
        )

if __name__ == "__main__":
    upload_to_huggingface()
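Once the workflow has run on a release branch, the generated files end up in the zenml/llms.txt dataset on the Hugging Face Hub. A minimal consumer sketch, assuming the dataset remains public under that id (not part of this commit):

# Hypothetical consumer: fetch the full-docs file produced by this pipeline.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="zenml/llms.txt",
    filename="llms-full.txt",
    repo_type="dataset",
)
with open(path, "r", encoding="utf-8") as f:
    print(f.read()[:500])  # preview the first 500 characters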
