-
Notifications
You must be signed in to change notification settings - Fork 465
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
generate llms.txt for our docs #3273
base: develop
Are you sure you want to change the base?
Changes from all commits
c68275b
5abcd1a
945a94c
89ecf8d
b835d5b
316e658
2e9d234
6e5dbf6
5ca3247
cfe4b57
5a03670
5476e72
1aaf1c7
50de6a7
80a1acb
f1c333d
a9fe9ea
54d8c98
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
name: Check Docs Summarization

# Runs after the submit workflow finishes: verifies the OpenAI batch job,
# assembles the summarized docs, and publishes everything to HuggingFace.
on:
  workflow_run:
    # FIX: the original triggered on `push` to release branches, but the
    # job's `if` condition reads `github.event.workflow_run.conclusion`,
    # which push events never populate — the job could never run. Trigger
    # on the submit workflow completing instead.
    workflows: ["Submit Docs Summarization"]
    types:
      - completed

jobs:
  check-batch:
    runs-on: ubuntu-latest
    # Only run when the upstream submit workflow succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    permissions:
      contents: read
      id-token: write
      actions: read

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install openai huggingface_hub

      - name: List artifacts
        uses: actions/github-script@v6
        id: artifacts
        with:
          # FIX: without this, github-script JSON-encodes the returned
          # string, and the download step would look for a quoted name.
          result-encoding: string
          script: |
            const artifacts = await github.rest.actions.listArtifactsForRepo({
              owner: context.repo.owner,
              // FIX: the repository name lives on `context.repo.repo`;
              // `context.repo.name` is undefined.
              repo: context.repo.repo,
            });
            // Pick the most recently created batch-id artifact so stale
            // artifacts from earlier runs are ignored.
            const batchArtifacts = artifacts.data.artifacts
              .filter(artifact => artifact.name.startsWith('batch-id-'))
              .sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
            if (batchArtifacts.length === 0) {
              throw new Error('No batch ID artifact found');
            }
            console.log(`Found artifact: ${batchArtifacts[0].name}`);
            return batchArtifacts[0].name;

      - name: Download batch ID
        uses: actions/download-artifact@v3
        with:
          name: ${{ steps.artifacts.outputs.result }}

      - name: Download repomix outputs
        uses: actions/download-artifact@v3
        with:
          name: repomix-outputs
          path: repomix-outputs

      - name: Process batch results and upload to HuggingFace
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Process OpenAI batch results
          python scripts/check_batch_output.py
          # Upload all files to HuggingFace
          python -c '
          from huggingface_hub import HfApi
          import os
          api = HfApi()
          # Upload OpenAI summary
          api.upload_file(
              token=os.environ["HF_TOKEN"],
              repo_id="zenml/llms.txt",
              repo_type="dataset",
              path_in_repo="how-to-guides.txt",
              path_or_fileobj="zenml_docs.txt",
          )
          # Upload repomix outputs
          for filename in ["component-guide.txt", "basics.txt", "llms-full.txt"]:
              api.upload_file(
                  token=os.environ["HF_TOKEN"],
                  repo_id="zenml/llms.txt",
                  repo_type="dataset",
                  path_in_repo=filename,
                  path_or_fileobj=f"repomix-outputs/{filename}",
              )
          '
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
name: Submit Docs Summarization

# Kicks off an OpenAI batch summarization job for the docs after a
# release-prepare workflow run succeeds, and publishes the raw repomix
# bundles as artifacts for the downstream check workflow.
on:
  workflow_run:
    workflows: ["release-prepare"]
    types:
      - completed

jobs:
  submit-batch:
    runs-on: ubuntu-latest
    # Only proceed when the upstream release-prepare run succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    permissions:
      contents: read
      id-token: write
      actions: write

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # FIX: `pathlib` is part of the Python 3 standard library (the
          # PyPI package is an obsolete backport that can shadow it), and
          # `repomix` is distributed via npm, not PyPI — installing either
          # with pip was wrong.
          pip install openai
          npm install -g repomix

      - name: Generate repomix outputs
        run: |
          # Create directory for outputs
          mkdir -p repomix-outputs

          # Full docs
          repomix --include "docs/book/**/*.md"
          mv repomix-output.txt repomix-outputs/llms-full.txt

          # Component guide
          repomix --include "docs/book/component-guide/**/*.md"
          mv repomix-output.txt repomix-outputs/component-guide.txt

          # User guide
          repomix --include "docs/book/user-guide/**/*.md"
          mv repomix-output.txt user-guide.txt

          # Getting started
          repomix --include "docs/book/getting-started/**/*.md"
          mv repomix-output.txt getting-started.txt

          # Merge user guide and getting started into basics
          cat user-guide.txt getting-started.txt > repomix-outputs/basics.txt
          rm user-guide.txt getting-started.txt

      - name: Upload repomix outputs
        uses: actions/upload-artifact@v3
        with:
          name: repomix-outputs
          path: repomix-outputs
          retention-days: 5

      - name: Submit batch job
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        id: submit
        run: |
          python scripts/summarize_docs.py
          echo "batch_id=$(cat batch_id.txt)" >> $GITHUB_OUTPUT

      - name: Upload batch ID
        uses: actions/upload-artifact@v3
        with:
          name: batch-id-${{ steps.submit.outputs.batch_id }}
          path: batch_id.txt
          retention-days: 5
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import json

from openai import OpenAI

# Module-level client; picks up OPENAI_API_KEY from the environment.
client = OpenAI()


def main():
    """Collect a completed OpenAI batch job's output into zenml_docs.txt.

    Reads the batch ID persisted by the submit step, verifies the batch has
    finished, then concatenates every per-file summary (each preceded by a
    "File:" header and followed by a separator) into one text file.

    Raises:
        Exception: If the batch job is not yet in the ``completed`` state.
    """
    # The batch ID was written to disk by scripts/summarize_docs.py.
    with open("batch_id.txt", "r") as id_file:
        batch_id = id_file.read().strip()

    batch = client.batches.retrieve(batch_id)
    if batch.status != "completed":
        raise Exception(f"Batch job {batch_id} is not completed. Status: {batch.status}")

    # The batch output is JSONL: one response object per submitted request.
    results_text = client.files.content(batch.output_file_id).text

    with open("zenml_docs.txt", "w") as out:
        for raw_line in results_text.splitlines():
            record = json.loads(raw_line)

            # custom_id looks like "file-{i}-{path with '/' encoded as '_'}".
            # NOTE(review): decoding turns every '_' back into '/', so doc
            # file names that legitimately contain underscores get mangled —
            # the encoding in summarize_docs.py is lossy; confirm the docs
            # tree has no underscored file names.
            encoded_path = record["custom_id"].split("-")[2:]
            doc_path = "-".join(encoded_path).replace("_", "/")

            summary = record["response"]["body"]["choices"][0]["message"]["content"]
            out.write(f"File: {doc_path}\n\n")
            out.write(summary)
            out.write("\n\n" + "=" * 80 + "\n\n")


if __name__ == "__main__":
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
import json
import os
import re
import time
from pathlib import Path
from typing import Dict, List

from openai import OpenAI

# Initialize the OpenAI client from the environment-provided API key.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
|
||
def extract_content_blocks(md_content: str) -> str:
    """Flatten markdown into a compact string, tagging fenced code blocks.

    Prose segments are whitespace-collapsed to single spaces; every
    ```...``` fence is kept verbatim between [CODE_BLOCK_START] /
    [CODE_BLOCK_END] markers so the summarizer can treat code specially.
    """
    result = ""
    # Split on fenced code blocks; the capture group keeps the fences.
    for segment in re.split(r'(```[\s\S]*?```)', md_content):
        if segment.startswith('```'):
            # Preserve code verbatim, wrapped in explicit markers.
            result += "\n[CODE_BLOCK_START]\n" + segment + "\n[CODE_BLOCK_END]\n"
            continue
        collapsed = re.sub(r'\s+', ' ', segment).strip()
        if collapsed:
            result += "\n" + collapsed + "\n"
    return result
|
||
def prepare_batch_requests(
    md_files: List[Path],
    model: str = "gpt-4-turbo-preview",
) -> List[Dict]:
    """Build one /v1/chat/completions batch request per markdown file.

    Args:
        md_files: Markdown files to summarize.
        model: Chat model used for every request. Generalized from the
            previously hard-coded value; the default preserves old behavior.

    Returns:
        A list of OpenAI batch request dicts. Files that cannot be read are
        skipped (best-effort) with a message printed to stdout.
    """
    batch_requests: List[Dict] = []

    for i, file_path in enumerate(md_files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            processed_content = extract_content_blocks(content)

            # custom_id cannot carry '/', so encode path separators as '_'.
            # NOTE(review): lossy for file names that already contain '_' —
            # the decoder in check_batch_output.py cannot tell them apart.
            file_path_str_with_no_slashes = str(file_path).replace("/", "_")

            # One chat-completion request per file, keyed by custom_id so the
            # batch output can be mapped back to its source document.
            request = {
                "custom_id": f"file-{i}-{file_path_str_with_no_slashes}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [
                        {
                            "role": "system",
                            "content": "You are a technical documentation summarizer."
                        },
                        {
                            "role": "user",
                            "content": f"""Please summarize the following documentation text for another LLM to be able to answer questions about it with enough detail.
Keep all important technical information and key points while removing redundancy and verbose explanations.
Make it concise but ensure NO critical information is lost and some details that you think are important are kept.
Make the code shorter where possible keeping only the most important parts while preserving syntax and accuracy:

{processed_content}"""
                        }
                    ],
                    # Low temperature for deterministic-ish summaries; cap
                    # output length per file.
                    "temperature": 0.3,
                    "max_tokens": 2000
                }
            }
            batch_requests.append(request)

        except Exception as e:
            # Best-effort: a single unreadable file should not abort the
            # whole batch submission.
            print(f"Error processing {file_path}: {e}")

    return batch_requests
|
||
def submit_batch_job(batch_requests: List[Dict]) -> str:
    """Upload the request file and start an OpenAI batch job.

    Writes the requests to batch_input.jsonl, uploads it, creates a 24h
    batch against /v1/chat/completions, records the batch ID in
    batch_id.txt (read later by CI and check_batch_output.py), and returns
    that ID.
    """
    batch_file_path = "batch_input.jsonl"
    # Serialize every request as one JSON object per line (JSONL).
    with open(batch_file_path, "w") as jsonl_out:
        jsonl_out.writelines(json.dumps(item) + "\n" for item in batch_requests)

    # Upload the JSONL file so the batch API can reference it by ID.
    with open(batch_file_path, "rb") as jsonl_in:
        batch_input_file = client.files.create(file=jsonl_in, purpose="batch")

    batch = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "ZenML docs summarization"},
    )

    # Persist the batch ID for the downstream check workflow.
    with open("batch_id.txt", "w") as id_out:
        id_out.write(batch.id)

    print(f"Batch job submitted with ID: {batch.id}")
    return batch.id
|
||
def main():
    """Collect the docs markdown files and submit the summarization batch."""
    docs_dir = "docs/book"

    # Gather all markdown files, skipping ones that carry no doc content.
    exclude_files = ["toc.md"]
    md_files = [
        path for path in Path(docs_dir).rglob("*.md")
        if path.name not in exclude_files
    ]

    # Prepare and submit the batch job. submit_batch_job persists the batch
    # ID to batch_id.txt itself, so the (previously unused) return value is
    # deliberately not captured here.
    submit_batch_job(prepare_batch_requests(md_files))


if __name__ == "__main__":
    main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not so sure if I am missing something here, but it seems like the only trigger for this workflow is any push event on the release branches. I have two concerns about this:
The if condition here
github.event.workflow_run.conclusion == 'success'
might not trigger, as push events do not have workflow runs and therefore no conclusions. Also, the current trigger logic might fire when we backport docs changes or similar things to the existing release branches. Is this the desired behaviour?