diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 893125a..858b770 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -47,6 +47,11 @@ "name": "hugging-face-trackio", "source": "./skills/hugging-face-trackio", "description": "Track and visualize ML training experiments with Trackio. Log metrics via Python API and retrieve them via CLI. Supports real-time dashboards synced to HF Spaces." + }, + { + "name": "hf-create-a-space", + "source": "./skills/hf-create-a-space", + "description": "Create and deploy Hugging Face Spaces for ML models. Supports Gradio chat interfaces via Inference API (for supported models) or ZeroGPU (for any model). Guides you through deployment method selection and potential compatibility issues." } ] -} +} \ No newline at end of file diff --git a/README.md b/README.md index 8c55381..e7784c8 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,7 @@ This repository contains a few skills to get you started. You can also contribut | Name | Description | Documentation | |------|-------------|---------------| +| `hf-create-a-space` | Create and deploy Hugging Face Spaces for ML models. Supports Gradio chat interfaces via Inference API (for supported models) or ZeroGPU (for any model). Guides you through deployment method selection and potential compatibility issues. | [SKILL.md](skills/hf-create-a-space/SKILL.md) | | `hugging-face-cli` | Execute Hugging Face Hub operations using the hf CLI. Download models/datasets, upload files, manage repos, and run cloud compute jobs. | [SKILL.md](skills/hugging-face-cli/SKILL.md) | | `hugging-face-datasets` | Create and manage datasets on Hugging Face Hub. Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. | [SKILL.md](skills/hugging-face-datasets/SKILL.md) | | `hugging-face-evaluation` | Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom evaluations with vLLM/lighteval. | [SKILL.md](skills/hugging-face-evaluation/SKILL.md) | diff --git a/agents/AGENTS.md b/agents/AGENTS.md index 99f5354..45119ed 100644 --- a/agents/AGENTS.md +++ b/agents/AGENTS.md @@ -3,6 +3,7 @@ You have additional SKILLs documented in directories containing a "SKILL.md" file. These skills are: + - hf-create-a-space -> "skills/hf-create-a-space/SKILL.md" - hugging-face-cli -> "skills/hugging-face-cli/SKILL.md" - hugging-face-datasets -> "skills/hugging-face-datasets/SKILL.md" - hugging-face-evaluation -> "skills/hugging-face-evaluation/SKILL.md" @@ -16,6 +17,7 @@ IMPORTANT: You MUST read the SKILL.md file whenever the description of the skill +hf-create-a-space: `Create and deploy Hugging Face Spaces for ML models. Supports Gradio chat interfaces via Inference API (for supported models) or ZeroGPU (for any model). Guides you through deployment method selection and potential compatibility issues.` hugging-face-cli: `Execute Hugging Face Hub operations using the `hf` CLI. Use when the user needs to download models/datasets/spaces, upload files to Hub repositories, create repos, manage local cache, or run compute jobs on HF infrastructure. Covers authentication, file transfers, repository creation, cache operations, and cloud compute.` hugging-face-datasets: `Create and manage datasets on Hugging Face Hub. 
Supports initializing repos, defining configs/system prompts, streaming row updates, and SQL-based dataset querying/transformation. Designed to work alongside HF MCP server for comprehensive dataset workflows.` hugging-face-evaluation: `Add and manage evaluation results in Hugging Face model cards. Supports extracting eval tables from README content, importing scores from Artificial Analysis API, and running custom model evaluations with vLLM/lighteval. Works with the model-index metadata format.` diff --git a/skills/hf-create-a-space/SKILL.md b/skills/hf-create-a-space/SKILL.md new file mode 100644 index 0000000..fd5d3cf --- /dev/null +++ b/skills/hf-create-a-space/SKILL.md @@ -0,0 +1,496 @@ +--- +name: hf-create-a-space +description: Create and deploy Hugging Face Spaces for ML models. Supports Gradio chat interfaces via Inference API (for supported models) or ZeroGPU (for any model). Guides you through deployment method selection and potential compatibility issues. +--- + +# HF Create a Space + +Create and deploy Hugging Face Spaces for ML models with guided deployment method selection. + +## When to Use This Skill + +Use this skill when: +- A user wants to create a demo/Space for a HuggingFace model +- A user has fine-tuned a model and wants to showcase it +- A user wants to deploy a LoRA adapter with a base model +- A user needs help choosing between Inference API and ZeroGPU + +## Key Workflow: ASK DEPLOYMENT METHOD FIRST + +**CRITICAL: Before generating any code, ASK the user:** + +> "How do you want to deploy this model?" +> +> 1. **Inference API** - Free, no GPU needed, but model must be supported by HF's serverless inference +> 2. **ZeroGPU** - Free with quota, loads model directly on GPU, works with any model + +This choice determines the entire template structure. The two approaches are NOT interchangeable. 
+ +## Deployment Method Comparison + +| Feature | Inference API | ZeroGPU | +|---------|--------------|---------| +| Cost | Free | Free (with quota) | +| Hardware | cpu-basic | zero-a10g (H200) | +| Model Support | Major providers only | Any model | +| LoRA Adapters | Never works | Fully supported | +| Fine-tuned models | Rarely works | Fully supported | +| Code Pattern | `InferenceClient` | `@spaces.GPU` + transformers | +| PRO Required | No | Yes (to host) | + +## Quick Compatibility Checks + +### For Inference API + +Before recommending Inference API, verify: + +**Likely to work:** +- Model is from major provider: `Qwen/`, `meta-llama/`, `mistralai/`, `google/`, `HuggingFaceH4/` +- Model page shows "Inference Providers" widget +- High download count (>10,000) + +**Will NOT work:** +- Personal/fine-tuned models (e.g., `username/my-model`) +- LoRA adapters (NEVER work with Inference API) +- Models without `pipeline_tag` metadata + +**Requires HF_TOKEN:** +- Gated models: `meta-llama/`, `mistralai/Mistral-`, `google/gemma-` + +### For ZeroGPU + +**Technical Specs:** +- GPU: Nvidia H200 slice with **70GB VRAM** +- SDK: **Gradio only** (not Streamlit or Docker) +- PyTorch: 2.1.0 to latest supported +- Python: 3.10.13 + +**Model Size Considerations:** +| Size | Compatibility | Notes | +|------|--------------|-------| +| < 3B params | Excellent | Fast loading, default duration=60 is fine | +| 3B - 7B params | Good | Use `duration=120` | +| 7B - 13B params | Possible | Use `duration=120`, may hit limits | +| > 13B params | Difficult | Likely OOM even with 70GB, consider quantization | + +**Duration Parameter (IMPORTANT):** +- Default: **60 seconds** - function must complete within this time +- For larger models or longer generation: use `@spaces.GPU(duration=120)` +- Can use dynamic duration: `@spaces.GPU(duration=get_duration_func)` + +**Usage Quotas (Daily):** +| Account Type | Daily Quota | Queue Priority | +|--------------|-------------|----------------| +| Unauthenticated | 2 min | Low | +| Free account | 3.5 min | Medium | +| PRO account | 25 min | Highest | +| Enterprise | 45 min | Highest | + +**Limitations:** +- `torch.compile` is **NOT supported** (use ahead-of-time compilation with torch 2.8+) +- Max 10 ZeroGPU Spaces per PRO account +- Max 50 ZeroGPU Spaces per Enterprise org + +**Special Cases:** +- **LoRA adapter?** → Needs `peft` dependency, must identify base model +- **Gated model?** → User must accept license + add `HF_TOKEN` secret + +## Templates: These Are NOT Interchangeable + +### Template 1: Inference API + +**Use when:** Model has serverless inference support + +```python +import os +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" + +# Token required for gated models (Llama, Mistral, Gemma, etc.) 
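+# Add HF_TOKEN as a Repository Secret in Space Settings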
+HF_TOKEN = os.environ.get("HF_TOKEN") +client = InferenceClient(MODEL_ID, token=HF_TOKEN) + + +def respond(message, history, system_message, max_tokens, temperature, top_p): + messages = [{"role": "system", "content": system_message}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({"role": "user", "content": user_msg}) + if assistant_msg: + messages.append({"role": "assistant", "content": assistant_msg}) + + messages.append({"role": "user", "content": message}) + + response = "" + for token in client.chat_completion( + messages, + max_tokens=max_tokens, + stream=True, + temperature=temperature, + top_p=top_p, + ): + delta = token.choices[0].delta.content or "" + response += delta + yield response + + +demo = gr.ChatInterface( + respond, + title="Chat Demo", + additional_inputs=[ + gr.Textbox(value="You are a helpful assistant.", label="System message"), + gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max tokens"), + gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), + ], + examples=[["Hello!"], ["Write a poem"]], +) + +if __name__ == "__main__": + demo.launch() +``` + +**requirements.txt:** +``` +gradio>=5.0.0 +huggingface_hub>=0.26.0 +``` + +**Hardware:** cpu-basic (free, no configuration needed) + +--- + +### Template 2: ZeroGPU (Full Model) + +**Use when:** Model doesn't have Inference API support, OR user wants direct model loading + +```python +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +MODEL_ID = "username/my-finetuned-model" + +# Load tokenizer at startup (lightweight) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Model loaded lazily inside GPU context +model = None + + +def load_model(): + global model + if model is None: + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + return model + + +@spaces.GPU(duration=120) # GPU allocated for up to 120 seconds +def generate_response(message, history, system_message, max_tokens, temperature, top_p): + model = load_model() + + messages = [{"role": "system", "content": system_message}] + for user_msg, assistant_msg in history: + if user_msg: + messages.append({"role": "user", "content": user_msg}) + if assistant_msg: + messages.append({"role": "assistant", "content": assistant_msg}) + messages.append({"role": "user", "content": message}) + + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = tokenizer([text], return_tensors="pt").to(model.device) + + outputs = model.generate( + **inputs, + max_new_tokens=int(max_tokens), + temperature=temperature, + top_p=top_p, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) + return response + + +demo = gr.ChatInterface( + generate_response, + title="Chat Demo", + additional_inputs=[ + gr.Textbox(value="You are a helpful assistant.", label="System message"), + gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"), + gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), + ], + examples=[["Hello!"], ["Write a poem"]], +) + +if __name__ == "__main__": + demo.launch() +``` + +**requirements.txt:** +``` +gradio>=5.0.0 +torch 
+transformers +accelerate +spaces +``` + +**Hardware:** Must set to `ZeroGPU` in Space Settings after deployment! + +--- + +### Template 3: ZeroGPU (LoRA Adapter) + +**Use when:** Model is a LoRA/PEFT adapter + +```python +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import PeftModel + +ADAPTER_ID = "username/my-lora-adapter" +BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct" # From adapter_config.json + +tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID) +model = None + + +def load_model(): + global model + if model is None: + base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID) + model = peft_model.merge_and_unload() + return model + + +@spaces.GPU(duration=120) +def generate_response(message, history, system_message, max_tokens, temperature, top_p): + model = load_model() + # ... same generation code as Template 2 ... + + +demo = gr.ChatInterface(generate_response, ...) + +if __name__ == "__main__": + demo.launch() +``` + +--- + +## ZeroGPU Best Practices + +### Model Loading Patterns + +**Pattern 1: Lazy Loading (Recommended for chat models)** +```python +model = None + +def load_model(): + global model + if model is None: + model = AutoModelForCausalLM.from_pretrained(..., device_map="auto") + return model + +@spaces.GPU(duration=120) +def generate(prompt): + model = load_model() + # ... use model ... +``` + +**Pattern 2: Eager Loading (From HF docs, good for diffusion)** +```python +pipe = DiffusionPipeline.from_pretrained(...) +pipe.to('cuda') + +@spaces.GPU +def generate(prompt): + return pipe(prompt).images +``` + +### Duration Tips + +```python +# Default: 60 seconds - fine for small models +@spaces.GPU +def quick_inference(x): + ... + +# For 7B+ models or long generation +@spaces.GPU(duration=120) +def longer_inference(x): + ... + +# Dynamic duration based on input +def calc_duration(prompt, max_tokens): + return min(60 + (max_tokens // 100) * 10, 120) + +@spaces.GPU(duration=calc_duration) +def dynamic_inference(prompt, max_tokens): + ... +``` + +### Things That Don't Work on ZeroGPU + +1. **`torch.compile()`** - Use ahead-of-time compilation (torch 2.8+) instead +2. **Streamlit/Docker SDK** - ZeroGPU is Gradio-only +3. 
**Persistent GPU state between requests** - GPU is released after each call + +**requirements.txt:** +``` +gradio>=5.0.0 +torch +transformers +accelerate +spaces +peft +``` + +## Post-Deployment Checklist + +### For Inference API Deployments +- [ ] Space builds successfully +- [ ] If gated model: Add `HF_TOKEN` as Repository Secret + +### For ZeroGPU Deployments +- [ ] Go to Space Settings +- [ ] Set hardware to "ZeroGPU" (requires PRO subscription) +- [ ] If gated model: Add `HF_TOKEN` as Repository Secret +- [ ] Wait for build to complete + +## Troubleshooting Reference + +| Error | Likely Cause | Fix | +|-------|--------------|-----| +| `No @spaces.GPU function detected` | Inference API code running on ZeroGPU hardware | Switch to ZeroGPU template (Template 2 or 3) | +| `No API found` (Inference API) | Model doesn't support serverless inference | Use ZeroGPU instead | +| `No API found` (gated model) | Missing HF_TOKEN | Add HF_TOKEN secret in Space Settings | +| Model not loading | Wrong template for model type | Check if LoRA vs full model | +| `OSError: does not appear to have...safetensors` | LoRA adapter loaded as full model | Use Template 3 with PEFT | +| Out of memory | Model too large for hardware | Reduce max_tokens, use quantization, or larger GPU | +| Build succeeds but app errors | Hardware not set | Set hardware to ZeroGPU in Settings | +| `ImportError: cannot import name 'HfFolder'` | Version mismatch | Use gradio>=5.0.0, huggingface_hub>=0.26.0 | +| Function timeout / killed | Exceeded duration limit | Add `@spaces.GPU(duration=120)` for longer ops | +| `torch.compile` errors | Not supported on ZeroGPU | Remove torch.compile or use ahead-of-time compilation | +| Quota exceeded | Daily GPU quota used up | Wait for reset or upgrade to PRO (25 min/day) | + +## Decision Flowchart + +``` +User wants to deploy model +│ +├─→ ASK: "How do you want to deploy?" +│ +├─→ User chooses INFERENCE API +│ │ +│ ├─→ Check: Is model from major provider? +│ │ ├─→ YES → Proceed with Template 1 +│ │ └─→ NO → Warn: "This model may not have Inference API support" +│ │ +│ ├─→ Check: Is it a LoRA adapter? +│ │ └─→ YES → STOP: "LoRA adapters don't work with Inference API. Use ZeroGPU." +│ │ +│ └─→ Check: Is model gated? +│ └─→ YES → Add note: "You'll need to add HF_TOKEN secret" +│ +└─→ User chooses ZEROGPU + │ + ├─→ Check: Is it a LoRA adapter? + │ ├─→ YES → Use Template 3, identify base model + │ └─→ NO → Use Template 2 + │ + ├─→ Check: Model size? + │ └─→ > 7B → Warn about potential memory limits + │ + └─→ Check: Is model gated? + └─→ YES → Add note: "You'll need to add HF_TOKEN secret" +``` + +## Scripts Reference + +### deploy_model.py + +```bash +# Deploy with Inference API +python scripts/deploy_model.py MODEL_ID --type chat --method inference-api + +# Deploy with ZeroGPU +python scripts/deploy_model.py MODEL_ID --type chat --method zerogpu + +# Dry run to preview +python scripts/deploy_model.py MODEL_ID --type chat --dry-run +``` + +### manage_space.py + +```bash +# Set hardware +python scripts/manage_space.py hardware username/space --tier zero-a10g + +# Add secret +python scripts/manage_space.py secret username/space --key HF_TOKEN --value hf_xxx + +# Restart space +python scripts/manage_space.py restart username/space +``` + +## Common Scenarios + +### Scenario 1: Deploy Llama 3.1 (Gated, Inference API supported) + +``` +User: "Deploy meta-llama/Llama-3.1-8B-Instruct as a chat Space" + +Response: +1. This model supports Inference API. 
Do you want to use: + - Inference API (free, simple) - Recommended for this model + - ZeroGPU (loads model directly) + +2. Note: This is a gated model. After deployment: + - Accept license at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct + - Add HF_TOKEN secret in Space Settings +``` + +### Scenario 2: Deploy a Fine-tuned Model (No Inference API) + +``` +User: "Deploy username/my-finetuned-llama as a chat Space" + +Response: +1. This appears to be a personal/fine-tuned model. These typically don't have Inference API support. + Recommended: ZeroGPU + +2. Requirements: + - PRO subscription to host ZeroGPU Space + - Set hardware to ZeroGPU in Space Settings after deployment +``` + +### Scenario 3: Deploy a LoRA Adapter + +``` +User: "Deploy username/my-lora-adapter as a chat Space" + +Response: +1. This is a LoRA adapter. LoRA adapters ONLY work with ZeroGPU (not Inference API). + +2. I found the base model in adapter_config.json: Qwen/Qwen2.5-Coder-1.5B-Instruct + +3. Using Template 3 (ZeroGPU + PEFT) +``` diff --git a/skills/hf-create-a-space/references/error_patterns.md b/skills/hf-create-a-space/references/error_patterns.md new file mode 100644 index 0000000..0420cd2 --- /dev/null +++ b/skills/hf-create-a-space/references/error_patterns.md @@ -0,0 +1,358 @@ +# Error Patterns Reference + +Comprehensive catalog of common errors in Hugging Face Spaces and their solutions. + +## Package & Import Errors + +### ModuleNotFoundError + +**Pattern**: `ModuleNotFoundError: No module named 'X'` + +**Example**: +``` +ModuleNotFoundError: No module named 'transformers' +``` + +**Cause**: Package not in requirements.txt + +**Fix**: Add the package to requirements.txt +```bash +python scripts/remediate.py fix-requirements username/space --add transformers +``` + +**Common missing packages**: +| Import | Package to add | +|--------|---------------| +| `torch` | `torch` | +| `transformers` | `transformers>=4.40.0` | +| `peft` | `peft` | +| `spaces` | `spaces` | +| `PIL` | `Pillow>=10.0.0` | +| `cv2` | `opencv-python>=4.8.0` | + +--- + +### ImportError: cannot import name 'HfFolder' + +**Pattern**: `ImportError: cannot import name 'HfFolder' from 'huggingface_hub'` + +**Cause**: Version mismatch between gradio and huggingface_hub + +**Fix**: Update both packages: +``` +gradio>=5.0.0 +huggingface_hub>=0.26.0 +``` + +**Auto-fix**: +```bash +python scripts/remediate.py auto-fix username/space +``` + +--- + +### ImportError: cannot import name 'X' from 'Y' + +**Pattern**: `ImportError: cannot import name 'AutoModel' from 'transformers'` + +**Cause**: Usually version mismatch or API change + +**Fix**: Pin a compatible version or update code + +--- + +## Model Loading Errors + +### OSError: does not appear to have a file named... + +**Pattern**: `OSError: username/model does not appear to have a file named pytorch_model.bin, model.safetensors` + +**Cause**: Trying to load a LoRA adapter as a full model + +**Detection**: Check for `adapter_config.json` in the model repo + +**Fix**: Use PEFT to load the adapter: +```python +from peft import PeftModel +from transformers import AutoModelForCausalLM + +# Load base model first +base_model = AutoModelForCausalLM.from_pretrained("base-model-id") + +# Then apply adapter +model = PeftModel.from_pretrained(base_model, "adapter-id") +model = model.merge_and_unload() # Optional: merge for faster inference +``` + +--- + +### Cannot access gated repo + +**Pattern**: `Cannot access gated repo for url ... Access to model ... 
is restricted` + +**Cause**: Model is gated and user hasn't accepted terms + +**Fix**: +1. Go to model page on HF Hub +2. Accept the terms/license +3. Add HF_TOKEN secret to Space with a token that has access + +--- + +### Repository Not Found + +**Pattern**: `404 Client Error: Repository Not Found` + +**Cause**: Model ID is incorrect or model is private + +**Fix**: +1. Verify model ID is correct +2. If private, add HF_TOKEN secret with access + +--- + +## GPU & CUDA Errors + +### CUDA out of memory + +**Pattern**: `torch.cuda.OutOfMemoryError: CUDA out of memory` + +**Cause**: Model too large for GPU VRAM + +**Fixes**: +1. **Upgrade hardware**: Use l40s (48GB) or a100 (80GB) +2. **Use quantization**: +```python +model = AutoModel.from_pretrained(MODEL_ID, load_in_8bit=True) +``` +3. **Reduce batch size / context length** +4. **Use flash attention**: +```python +model = AutoModel.from_pretrained(MODEL_ID, attn_implementation="flash_attention_2") +``` + +--- + +### CUDA is not available + +**Pattern**: `AssertionError: Torch not compiled with CUDA enabled` or `CUDA is not available` + +**Cause**: Space running on CPU but code requires GPU + +**Fix**: Set hardware to GPU tier in Space Settings +- ZeroGPU (free with PRO) +- T4/L4/A10G/A100 (paid) + +--- + +### GPU allocation timed out + +**Pattern**: `GPU allocation timed out` or `Queue timeout` + +**Cause**: ZeroGPU queue is backed up + +**Fixes**: +1. Try again later +2. Use paid GPU tier for guaranteed access +3. Optimize code to complete faster + +--- + +### Duration exceeded + +**Pattern**: Function terminated due to exceeding duration limit + +**Cause**: `@spaces.GPU(duration=X)` is too short + +**Fix**: Increase duration: +```python +@spaces.GPU(duration=120) # Up from default 60 +def generate(...): + ... +``` + +--- + +## Gradio Errors + +### ValueError: examples must be nested list + +**Pattern**: `ValueError: examples must be nested list` + +**Cause**: Gradio 5.x changed examples format + +**Fix**: Use nested lists: +```python +# WRONG (Gradio 4.x style): +examples=["Example 1", "Example 2"] + +# CORRECT (Gradio 5.x style): +examples=[["Example 1"], ["Example 2"]] +``` + +--- + +### No API found + +**Pattern**: `No API found for this Space` + +**Cause**: Gradio app not exposing API properly, often due to hardware mismatch + +**Fix**: +1. Go to Space Settings +2. Set correct hardware (ZeroGPU or paid GPU) +3. Restart Space + +--- + +### Address already in use + +**Pattern**: `OSError: [Errno 98] Address already in use` + +**Cause**: Explicit port binding in code + +**Fix**: Remove port specification: +```python +# WRONG: +demo.launch(server_port=7860) + +# CORRECT: +demo.launch() # Let Spaces handle the port +``` + +--- + +## Authentication Errors + +### 401 Unauthorized + +**Pattern**: `401 Client Error: Unauthorized` + +**Cause**: Missing or invalid HF token + +**Fix**: +1. Generate a token at https://huggingface.co/settings/tokens +2. Add as secret in Space Settings: `HF_TOKEN` +3. 
Or use environment variable in code: +```python +import os +token = os.environ.get("HF_TOKEN") +``` + +--- + +### 403 Forbidden + +**Pattern**: `403 Client Error: Forbidden` + +**Causes**: +- Token doesn't have required permissions +- Model is gated and access not granted +- Organization restrictions + +**Fix**: Check token permissions and model access + +--- + +## Model-Specific Errors + +### Cannot use chat template + +**Pattern**: `Cannot use chat template` or `Chat template is not defined` + +**Cause**: Model doesn't have a chat template defined + +**Fixes**: +1. Use base completion instead of chat +2. Manually apply a chat template: +```python +def format_prompt(message, history): + prompt = "" + for user_msg, assistant_msg in history: + prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n" + prompt += f"User: {message}\nAssistant:" + return prompt +``` + +--- + +### Tokenizer not found + +**Pattern**: `Can't load tokenizer for 'model-id'` + +**Cause**: Tokenizer files missing or model ID wrong + +**Fix**: Verify model ID and that tokenizer files exist in repo + +--- + +## Disk Space Errors + +### No space left on device + +**Pattern**: `OSError: [Errno 28] No space left on device` + +**Cause**: Model too large for disk quota + +**Fixes**: +1. Use streaming download +2. Upgrade storage tier +3. Use smaller model or quantized version + +--- + +## Network Errors + +### Connection timeout + +**Pattern**: `requests.exceptions.ConnectTimeout` + +**Cause**: HF Hub or external service timeout + +**Fix**: Add retry logic or increase timeout + +--- + +### Rate limit exceeded + +**Pattern**: `429 Too Many Requests` + +**Cause**: Too many API calls + +**Fix**: Add backoff/retry logic or reduce request frequency + +--- + +## How to Use This Reference + +### Manual Lookup + +1. Find the error pattern in your logs +2. Search this document for the pattern +3. Apply the recommended fix + +### Automated Detection + +```bash +# Analyze Space logs for known patterns +python scripts/monitor_space.py analyze-errors username/space-name + +# Auto-fix what can be fixed automatically +python scripts/remediate.py auto-fix username/space-name +``` + +### Adding New Patterns + +To add a new error pattern to automated detection, edit `scripts/monitor_space.py`: + +```python +ERROR_PATTERNS = { + # Add new pattern here + "new_error_type": { + "pattern": r"regex pattern here", + "description": "Human readable description", + "fix_template": "How to fix this error", + "auto_fixable": False, # or True if can be auto-fixed + }, +} +``` diff --git a/skills/hf-create-a-space/references/hardware_guide.md b/skills/hf-create-a-space/references/hardware_guide.md new file mode 100644 index 0000000..f0d255a --- /dev/null +++ b/skills/hf-create-a-space/references/hardware_guide.md @@ -0,0 +1,231 @@ +# Hugging Face Spaces Hardware Guide + +Comprehensive reference for hardware options available on Hugging Face Spaces. 
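+
+To check what hardware a Space is currently running on before picking a new tier, you can query the Hub from Python. A minimal sketch (the Space id is a placeholder; your token must be able to see the Space):
+
+```python
+from huggingface_hub import HfApi
+
+runtime = HfApi().get_space_runtime("username/space-name")
+print(runtime.stage)               # e.g. RUNNING, BUILDING, PAUSED
+print(runtime.hardware)            # hardware currently in use
+print(runtime.requested_hardware)  # hardware requested in Settings
+```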
+ +## Hardware Tiers Overview + +### CPU Tiers + +| Tier | vCPU | RAM | Disk | Cost | Best For | +|------|------|-----|------|------|----------| +| **cpu-basic** | 2 | 16GB | 50GB | Free | Inference API apps, simple demos | +| **cpu-upgrade** | 8 | 32GB | 50GB | $0.03/hr | Classification, embeddings, small models | + +### GPU Tiers + +| Tier | GPU | VRAM | vCPU | RAM | Disk | Cost | Best For | +|------|-----|------|------|-----|------|------|----------| +| **zero-a10g** | H200 slice | 70GB | Dynamic | Dynamic | 50GB | Free* | Most models <7B | +| **t4-small** | T4 | 16GB | 4 | 15GB | 50GB | $0.40/hr | Entry-level GPU | +| **t4-medium** | T4 | 16GB | 8 | 30GB | 100GB | $0.60/hr | More CPU/RAM | +| **l4** | L4 | 24GB | 8 | 30GB | 400GB | $0.80/hr | Great value for 3-7B | +| **l40s** | L40S | 48GB | 8 | 62GB | 380GB | $1.80/hr | 7-14B models | +| **a10g-small** | A10G | 24GB | 4 | 14GB | 110GB | $1.00/hr | 24GB alternative | +| **a10g-large** | A10G | 24GB | 12 | 46GB | 200GB | $1.50/hr | More CPU/RAM | +| **a100-large** | A100 | 80GB | 12 | 142GB | 1TB | $2.50/hr | Large 14-30B+ models | + +*ZeroGPU requires PRO subscription to host. + +## ZeroGPU Deep Dive + +### What is ZeroGPU? + +ZeroGPU is a shared GPU infrastructure that dynamically allocates NVIDIA H200 GPUs on-demand: + +- **Free GPU access** for PRO subscribers +- GPU allocated when function is called +- GPU released when function returns +- Daily quota limits based on account tier + +### ZeroGPU Specifications + +- **GPU Type**: NVIDIA H200 slice +- **Available VRAM**: 70GB per workload +- **SDK Support**: Gradio only (no Streamlit/Docker) +- **Python Version**: 3.10.13 +- **PyTorch Versions**: 2.1.0 - 2.8.0 + +### ZeroGPU Quotas + +| Account Type | Daily Quota | Queue Priority | +|--------------|-------------|----------------| +| Unauthenticated | 2 min | Low | +| Free account | 3.5 min | Medium | +| PRO account | 25 min | Highest | +| Team organization | 25 min | Highest | +| Enterprise | 45 min | Highest | + +### Hosting Requirements + +| Account Type | Can HOST ZeroGPU? | Max Spaces | +|--------------|-------------------|------------| +| Free | No | 0 | +| PRO | Yes | 10 | +| Enterprise | Yes | 50 | + +### Using ZeroGPU + +```python +import spaces + +@spaces.GPU(duration=120) # Max seconds for GPU allocation +def generate(prompt): + # GPU is available inside this function + output = model.generate(...) + return output + # GPU released after function returns +``` + +### ZeroGPU Limitations + +1. **No torch.compile** - Use ahead-of-time compilation instead +2. **No streaming** - GPU held for entire function duration +3. **Gradio only** - Streamlit and Docker not supported +4. **Queue delays** - High demand can cause wait times +5. 
**Duration limits** - Default 60s, max configurable + +## Hardware Selection Guidelines + +### By Model Size + +| Model Parameters | Minimum VRAM | Recommended Tier | +|------------------|--------------|------------------| +| < 500M | CPU works | cpu-upgrade | +| 500M - 1B | ~4GB | zero-a10g or t4-small | +| 1B - 3B | ~8GB | zero-a10g or t4-small | +| 3B - 7B | ~16GB | zero-a10g or l4 | +| 7B - 13B | ~28GB | l40s | +| 13B - 30B | ~60GB | a100-large | +| > 30B | 80GB+ | a100-large + quantization | + +### By Use Case + +| Use Case | Recommended Tier | +|----------|-----------------| +| Inference API demo | cpu-basic | +| Classification model | cpu-upgrade | +| Embedding model | cpu-upgrade | +| Chat model (supported provider) | cpu-basic | +| Chat model (custom) | zero-a10g | +| Image generation | zero-a10g or l4 | +| Video generation | l40s or a100 | +| Vision-language model | zero-a10g or l4 | +| Speech recognition | cpu-upgrade or zero-a10g | + +### Cost Optimization + +1. **Use Inference API when available** - Free, no GPU needed +2. **Use ZeroGPU for PRO users** - Free GPU access +3. **Choose L4 over A10G** - Better value at $0.80/hr +4. **Use smaller tiers for development** - Scale up for production +5. **Set sleep time** - Auto-pause when inactive +6. **Use quantization** - Reduce VRAM requirements + +## VRAM Estimation + +### Formula + +For transformer models in FP16: +``` +VRAM (GB) ≈ Parameters (B) × 2 × 1.2 +``` + +The 1.2 factor accounts for overhead (KV cache, gradients, etc.) + +### Examples + +| Model | Parameters | Estimated VRAM | +|-------|------------|----------------| +| GPT-2 | 124M | ~0.3GB | +| Llama-3-8B | 8B | ~19GB | +| Llama-3-70B | 70B | ~168GB | +| Mixtral-8x7B | 47B | ~113GB | + +### Reducing VRAM + +1. **Quantization** - INT8 halves VRAM, INT4 quarters it +2. **Smaller batch size** - Less KV cache +3. **Shorter context** - Less KV cache +4. **Flash Attention** - More efficient memory + +## Changing Hardware + +### Via UI + +1. Go to Space Settings +2. Scroll to "Space Hardware" +3. Select new tier +4. Space will restart with new hardware + +### Via API + +```python +from huggingface_hub import request_space_hardware + +request_space_hardware( + repo_id="username/space-name", + hardware="l4" +) +``` + +### Via CLI + +```bash +python scripts/manage_space.py hardware username/space-name --tier l4 +``` + +## Sleep Behavior + +### Free Tiers (cpu-basic) + +- Sleeps after 48 hours of inactivity +- Wakes automatically on visitor + +### Paid Tiers + +- Never sleeps by default +- Can set custom sleep time in Settings +- Not billed while sleeping/paused + +### Pausing Manually + +```bash +python scripts/manage_space.py pause username/space-name +python scripts/manage_space.py restart username/space-name +``` + +## Framework-Specific Setup + +### PyTorch CUDA + +``` +--extra-index-url https://download.pytorch.org/whl/cu118 +torch +``` + +### JAX CUDA + +``` +-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +jax[cuda12_pip] +``` + +### TensorFlow + +TensorFlow auto-detects CUDA, just add: +``` +tensorflow +``` + +## Persistent Storage + +In addition to hardware, you can add persistent storage: + +| Storage Tier | Size | Monthly Cost | +|--------------|------|--------------| +| Ephemeral (default) | 50GB | Free | +| Small | +20GB | $5 | +| Medium | +150GB | $25 | +| Large | +1TB | $100 | + +Ephemeral storage is wiped on restart; persistent storage survives. 
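+
+Persistent storage can also be requested programmatically rather than through the UI. A minimal sketch using `HfApi` (the Space id is a placeholder; the call assumes your token has write access to the Space):
+
+```python
+from huggingface_hub import HfApi
+
+api = HfApi()  # picks up HF_TOKEN or the cached CLI login
+
+# Request the "small" (+20GB) persistent storage tier for a Space
+api.request_space_storage(repo_id="username/space-name", storage="small")
+
+# Remove persistent storage again (this deletes the data stored on it)
+api.delete_space_storage(repo_id="username/space-name")
+```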
diff --git a/skills/hf-create-a-space/references/zerogpu_guide.md b/skills/hf-create-a-space/references/zerogpu_guide.md new file mode 100644 index 0000000..51ce830 --- /dev/null +++ b/skills/hf-create-a-space/references/zerogpu_guide.md @@ -0,0 +1,354 @@ +# ZeroGPU Complete Guide + +Everything you need to know about using ZeroGPU on Hugging Face Spaces. + +## What is ZeroGPU? + +ZeroGPU is a shared GPU infrastructure that provides free on-demand GPU access for Hugging Face Spaces. Instead of dedicating a GPU to your Space 24/7, ZeroGPU dynamically allocates an NVIDIA H200 GPU slice only when your code needs it. + +## Key Benefits + +1. **Free GPU access** - No per-hour charges (PRO subscription required to host) +2. **70GB VRAM** - H200 slice provides significant GPU memory +3. **On-demand allocation** - Pay nothing when idle +4. **Efficient sharing** - Better resource utilization across HF + +## Requirements + +### To HOST a ZeroGPU Space + +| Account Type | Can Host? | Max Spaces | +|--------------|-----------|------------| +| Free | No | 0 | +| PRO ($9/month) | Yes | 10 | +| Team/Enterprise | Yes | 50 | + +### To USE a ZeroGPU Space (as visitor) + +Anyone can use ZeroGPU Spaces, with daily quotas: + +| Account Type | Daily Quota | +|--------------|-------------| +| Unauthenticated | 2 minutes | +| Free account | 3.5 minutes | +| PRO | 25 minutes | +| Team/Enterprise | 25-45 minutes | + +## Technical Specifications + +- **GPU**: NVIDIA H200 slice +- **VRAM**: 70GB available per workload +- **SDK**: Gradio only (Streamlit/Docker not supported) +- **Python**: 3.10.13 +- **PyTorch**: 2.1.0 through 2.8.0 +- **Default duration**: 60 seconds per call +- **Max duration**: Configurable via decorator + +## How to Use ZeroGPU + +### 1. Install the spaces package + +``` +# requirements.txt +gradio>=5.0.0 +spaces +torch +transformers +``` + +### 2. Import and decorate + +```python +import spaces + +@spaces.GPU +def my_gpu_function(input): + # GPU is allocated when this function is called + output = model.generate(input) + return output + # GPU is released when function returns +``` + +### 3. Set duration (optional) + +```python +@spaces.GPU(duration=120) # Allow up to 120 seconds +def slow_generation(input): + # Long-running GPU operation + return output +``` + +### 4. 
Dynamic duration
+
+```python
+def calculate_duration(text, num_steps):
+    return min(num_steps * 2, 120)  # Scale with steps, max 120s
+
+@spaces.GPU(duration=calculate_duration)
+def generate(text, num_steps):
+    return model.generate(text, steps=num_steps)
+```
+
+## Code Patterns
+
+### Basic Chat Model
+
+```python
+import gradio as gr
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL_ID = "your-model-id"
+
+# Load on CPU at startup (this is free)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+)
+
+@spaces.GPU(duration=120)
+def generate(message, history):
+    # GPU allocated here
+    # Model automatically moved to GPU by spaces library
+
+    inputs = tokenizer(message, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=512)
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return response
+    # GPU released here
+
+demo = gr.ChatInterface(generate)
+demo.launch()
+```
+
+### Image Generation
+
+```python
+import spaces
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+)
+
+@spaces.GPU(duration=60)
+def generate_image(prompt):
+    pipe.to("cuda")
+    image = pipe(prompt).images[0]
+    return image
+```
+
+### LoRA Adapter
+
+```python
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+BASE_MODEL_ID = "meta-llama/Llama-3.1-8B"
+ADAPTER_ID = "your-adapter-id"
+
+# Load base model on CPU
+base_model = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_ID,
+    torch_dtype=torch.float16,
+)
+
+# Apply adapter
+model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
+model = model.merge_and_unload()
+
+tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
+
+@spaces.GPU(duration=120)
+def generate(text):
+    inputs = tokenizer(text, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs)
+    return tokenizer.decode(outputs[0])
+```
+
+## Important Limitations
+
+### 1. No torch.compile()
+
+ZeroGPU does not support `torch.compile()`. Use ahead-of-time (AOT) compilation instead:
+
+```python
+# DON'T do this with ZeroGPU
+model = torch.compile(model)  # Will fail
+
+# DO use AOT compilation for PyTorch 2.8+
+# See: https://huggingface.co/blog/zerogpu-aoti
+```
+
+### 2. No Streaming
+
+With ZeroGPU, the GPU is held for the entire function duration. You cannot stream tokens because:
+
+```python
+# This won't work well with ZeroGPU:
+@spaces.GPU
+def generate_stream(text):
+    for token in model.generate_streaming(...):
+        yield token  # GPU held until all tokens done
+```
+
+Instead, generate the full response and return it:
+
+```python
+@spaces.GPU
+def generate(text):
+    return model.generate(text)  # Return full response
+```
+
+### 3. Gradio Only
+
+ZeroGPU is only compatible with Gradio SDK. Streamlit and Docker Spaces cannot use ZeroGPU.
+
+### 4. Queue Delays
+
+During high demand, there may be wait times for GPU allocation. PRO users get highest priority.
+
+### 5. Duration Limits
+
+If your function exceeds the specified duration, it will be terminated.
Set appropriate durations: + +```python +# For quick operations (classification, short generation) +@spaces.GPU(duration=30) + +# For medium operations (standard generation) +@spaces.GPU(duration=60) # default + +# For slow operations (long generation, large models) +@spaces.GPU(duration=120) +``` + +## Best Practices + +### 1. Load models on CPU at startup + +```python +# Good: Load on CPU outside the GPU function +model = AutoModel.from_pretrained(MODEL_ID) + +@spaces.GPU +def infer(x): + return model(x) # Model moved to GPU automatically +``` + +### 2. Use appropriate dtypes + +```python +# Good: Use float16 to reduce VRAM +model = AutoModel.from_pretrained(MODEL_ID, torch_dtype=torch.float16) +``` + +### 3. Set realistic durations + +```python +# Match duration to expected inference time +@spaces.GPU(duration=30) # For fast models +def quick_inference(x): + return model(x) + +@spaces.GPU(duration=120) # For slow models +def slow_inference(x): + return large_model(x) +``` + +### 4. Handle errors gracefully + +```python +@spaces.GPU +def generate(text): + try: + return model.generate(text) + except RuntimeError as e: + if "out of memory" in str(e): + return "Error: Input too long. Please try shorter text." + raise +``` + +### 5. Optimize for batch processing if possible + +```python +# If you have multiple inputs, batch them +@spaces.GPU +def batch_inference(inputs): + return model(inputs) # Single GPU allocation for all inputs +``` + +## Troubleshooting + +### "GPU allocation timed out" + +**Cause**: High demand, queue is long +**Fix**: Try again later, or use paid GPU tier + +### "Duration exceeded" + +**Cause**: Function took longer than specified duration +**Fix**: Increase `@spaces.GPU(duration=X)` or optimize code + +### "CUDA out of memory" + +**Cause**: Model too large for 70GB VRAM +**Fix**: Use quantization or smaller model + +### Model not using GPU + +**Cause**: Model loaded outside @spaces.GPU function +**Fix**: The spaces library should auto-move models. Ensure you're using latest version. + +### "spaces module not found" + +**Cause**: Missing from requirements.txt +**Fix**: Add `spaces` to requirements.txt + +## Setting Up a New ZeroGPU Space + +### 1. Create Space with Gradio SDK + +```bash +hf repo create my-space --repo-type space --space-sdk gradio +``` + +### 2. Add requirements.txt + +``` +gradio>=5.0.0 +spaces +torch +transformers +accelerate +``` + +### 3. Create app.py with @spaces.GPU + +```python +import gradio as gr +import spaces +# ... your code with @spaces.GPU decorator +``` + +### 4. Upload to Space + +```bash +hf upload username/my-space . --repo-type space +``` + +### 5. Set hardware to ZeroGPU + +Go to Space Settings > Hardware > Select "ZeroGPU" + +## Resources + +- [Official ZeroGPU Documentation](https://huggingface.co/docs/hub/spaces-zerogpu) +- [ZeroGPU with AOT Compilation](https://huggingface.co/blog/zerogpu-aoti) +- [ZeroGPU Spaces Gallery](https://huggingface.co/spaces/enzostvs/zero-gpu-spaces) +- [Feedback & Discussion](https://huggingface.co/spaces/zero-gpu-explorers/README/discussions) diff --git a/skills/hf-create-a-space/scripts/create_space.py b/skills/hf-create-a-space/scripts/create_space.py new file mode 100644 index 0000000..06aa95e --- /dev/null +++ b/skills/hf-create-a-space/scripts/create_space.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +""" +Create a new Hugging Face Space with proper configuration. 
+ +Usage: + python create_space.py --sdk gradio --hardware cpu-basic + python create_space.py my-demo --sdk streamlit --private +""" + +import argparse +import os +from huggingface_hub import create_repo, upload_file, HfApi + + +SPACE_TEMPLATE_README = """--- +title: {title} +emoji: {emoji} +colorFrom: {color_from} +colorTo: {color_to} +sdk: {sdk} +sdk_version: {sdk_version} +app_file: app.py +pinned: false +license: mit +short_description: {description} +--- + +# {title} + +{description} + +## Usage + +Describe how to use your Space here. + +## Model + +Describe the model(s) used in this Space. +""" + +GRADIO_TEMPLATE = '''import gradio as gr + +def greet(name: str) -> str: + """Simple greeting function.""" + return f"Hello, {name}! Welcome to this Hugging Face Space." + +demo = gr.Interface( + fn=greet, + inputs=gr.Textbox(label="Your Name", placeholder="Enter your name..."), + outputs=gr.Textbox(label="Greeting"), + title="My Gradio Space", + description="A simple demo Space. Replace this with your own functionality!", + examples=[["World"], ["Hugging Face"]], +) + +if __name__ == "__main__": + demo.launch() +''' + +STREAMLIT_TEMPLATE = '''import streamlit as st + +st.set_page_config(page_title="My Space", page_icon="🤗") + +st.title("🤗 My Streamlit Space") + +st.write("Welcome to this Hugging Face Space!") + +name = st.text_input("Enter your name:", placeholder="Your name...") + +if name: + st.success(f"Hello, {name}! Welcome to this Space.") + +st.markdown("---") +st.markdown("Replace this template with your own functionality!") +''' + +DOCKER_TEMPLATE = '''import gradio as gr + +def greet(name: str) -> str: + return f"Hello from Docker, {name}!" + +demo = gr.Interface(fn=greet, inputs="text", outputs="text") + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860) +''' + +DOCKERFILE_TEMPLATE = '''FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app.py . 
+ +EXPOSE 7860 + +CMD ["python", "app.py"] +''' + +REQUIREMENTS = { + "gradio": "gradio>=4.0.0\nhuggingface_hub>=0.26.0\n", + "streamlit": "streamlit>=1.28.0\nhuggingface_hub>=0.26.0\n", + "docker": "gradio>=4.0.0\nhuggingface_hub>=0.26.0\n", +} + +SDK_VERSIONS = { + "gradio": "4.44.0", + "streamlit": "1.40.0", + "docker": None, +} + +EMOJIS = ["🚀", "🤖", "🔥", "✨", "💡", "🎯", "🌟", "⚡", "🎨", "🧠"] +COLORS = ["red", "yellow", "green", "blue", "indigo", "purple", "pink", "gray"] + + +def create_space( + space_name: str, + sdk: str = "gradio", + hardware: str = "cpu-basic", + private: bool = False, + description: str = "A Hugging Face Space", + emoji: str = "🚀", + color_from: str = "blue", + color_to: str = "purple", + organization: str | None = None, +) -> str: + """Create a new Hugging Face Space with all necessary files.""" + + api = HfApi() + user = api.whoami() + username = organization or user["name"] + repo_id = f"{username}/{space_name}" + + print(f"Creating Space: {repo_id}") + print(f" SDK: {sdk}") + print(f" Hardware: {hardware}") + print(f" Private: {private}") + + # Create the Space repository + create_repo( + repo_id=repo_id, + repo_type="space", + space_sdk=sdk, + space_hardware=hardware if hardware != "cpu-basic" else None, + private=private, + exist_ok=False, + ) + print(f"✓ Created repository: {repo_id}") + + # Generate README.md + readme_content = SPACE_TEMPLATE_README.format( + title=space_name.replace("-", " ").title(), + emoji=emoji, + color_from=color_from, + color_to=color_to, + sdk=sdk, + sdk_version=SDK_VERSIONS.get(sdk, ""), + description=description, + ) + + # Upload README.md + upload_file( + path_or_fileobj=readme_content.encode(), + path_in_repo="README.md", + repo_id=repo_id, + repo_type="space", + ) + print("✓ Uploaded README.md") + + # Upload app file + if sdk == "gradio": + app_content = GRADIO_TEMPLATE + elif sdk == "streamlit": + app_content = STREAMLIT_TEMPLATE + elif sdk == "docker": + app_content = DOCKER_TEMPLATE + else: + app_content = GRADIO_TEMPLATE + + upload_file( + path_or_fileobj=app_content.encode(), + path_in_repo="app.py", + repo_id=repo_id, + repo_type="space", + ) + print("✓ Uploaded app.py") + + # Upload requirements.txt + requirements_content = REQUIREMENTS.get(sdk, REQUIREMENTS["gradio"]) + upload_file( + path_or_fileobj=requirements_content.encode(), + path_in_repo="requirements.txt", + repo_id=repo_id, + repo_type="space", + ) + print("✓ Uploaded requirements.txt") + + # Upload Dockerfile for Docker SDK + if sdk == "docker": + upload_file( + path_or_fileobj=DOCKERFILE_TEMPLATE.encode(), + path_in_repo="Dockerfile", + repo_id=repo_id, + repo_type="space", + ) + print("✓ Uploaded Dockerfile") + + space_url = f"https://huggingface.co/spaces/{repo_id}" + print(f"\n✅ Space created successfully!") + print(f" URL: {space_url}") + print(f"\n Clone with: git clone https://huggingface.co/spaces/{repo_id}") + + return space_url + + +def main(): + parser = argparse.ArgumentParser( + description="Create a new Hugging Face Space", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python create_space.py my-demo + python create_space.py my-demo --sdk streamlit + python create_space.py my-demo --sdk gradio --hardware t4-small + python create_space.py my-demo --private --org my-organization + """, + ) + + parser.add_argument("name", help="Name of the Space (e.g., my-awesome-demo)") + parser.add_argument( + "--sdk", + choices=["gradio", "streamlit", "docker"], + default="gradio", + help="SDK to use (default: gradio)", + ) + 
parser.add_argument( + "--hardware", + choices=[ + "cpu-basic", + "cpu-upgrade", + "t4-small", + "t4-medium", + "a10g-small", + "a10g-large", + "a100-large", + ], + default="cpu-basic", + help="Hardware tier (default: cpu-basic)", + ) + parser.add_argument( + "--private", + action="store_true", + help="Make the Space private", + ) + parser.add_argument( + "--description", + default="A Hugging Face Space", + help="Short description of the Space", + ) + parser.add_argument( + "--emoji", + default="🚀", + help="Emoji for the Space card", + ) + parser.add_argument( + "--org", + dest="organization", + help="Organization to create the Space under", + ) + + args = parser.parse_args() + + # Check for HF token + if not os.environ.get("HF_TOKEN") and not os.path.exists( + os.path.expanduser("~/.cache/huggingface/token") + ): + print("Error: No Hugging Face token found.") + print("Please set HF_TOKEN environment variable or run `huggingface-cli login`") + return 1 + + try: + create_space( + space_name=args.name, + sdk=args.sdk, + hardware=args.hardware, + private=args.private, + description=args.description, + emoji=args.emoji, + organization=args.organization, + ) + except Exception as e: + print(f"Error creating Space: {e}") + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/skills/hf-create-a-space/scripts/deploy_model.py b/skills/hf-create-a-space/scripts/deploy_model.py new file mode 100644 index 0000000..72121a2 --- /dev/null +++ b/skills/hf-create-a-space/scripts/deploy_model.py @@ -0,0 +1,1357 @@ +#!/usr/bin/env python3 +""" +Deploy a Hugging Face model to a Space with auto-generated UI. + +This script automatically detects: +1. Whether the model is a LoRA adapter or full model +2. Whether it supports the Inference API +3. Chooses the appropriate deployment strategy + +Deployment strategies: +- Models WITH Inference API: Uses InferenceClient (free, cpu-basic) +- Full models WITHOUT Inference API: Uses transformers + ZeroGPU (free with quota) +- LoRA adapters: Uses peft + transformers + ZeroGPU (free with quota) + +Usage: + python deploy_model.py meta-llama/Llama-3-8B-Instruct --type chat + python deploy_model.py GhostScientist/my-finetuned-model --type chat + python deploy_model.py GhostScientist/my-lora-adapter --type chat --base-model Qwen/Qwen2.5-Coder-1.5B-Instruct +""" + +import argparse +import json +from huggingface_hub import HfApi, create_repo, upload_file, model_info, hf_hub_download + + +# Known providers that typically have Inference API support +INFERENCE_API_PROVIDERS = { + "meta-llama", + "mistralai", + "HuggingFaceH4", + "google", + "stabilityai", + "openai", + "microsoft", + "facebook", + "sentence-transformers", + "Qwen", + "deepseek-ai", + "THUDM", + "tiiuae", + "bigscience", + "EleutherAI", + "allenai", + "nvidia", + "black-forest-labs", +} + +# Gated model prefixes that require HF_TOKEN authentication +GATED_MODEL_PREFIXES = { + "meta-llama/", # Llama 2, Llama 3, etc. 
+ "mistralai/Mistral-", # Some Mistral models + "google/gemma-", # Gemma models + "bigscience/bloom", # BLOOM + "tiiuae/falcon-", # Some Falcon models +} + +# Hardware tiers with specs and pricing (as of 2025) +HARDWARE_TIERS = { + "cpu-basic": { + "cost_per_hour": 0.00, + "vram_gb": 0, + "vcpu": 2, + "ram_gb": 16, + "disk_gb": 50, + "description": "Free CPU tier for simple demos", + }, + "cpu-upgrade": { + "cost_per_hour": 0.03, + "vram_gb": 0, + "vcpu": 8, + "ram_gb": 32, + "disk_gb": 50, + "description": "Faster CPU for heavier workloads", + }, + "zero-a10g": { + "cost_per_hour": 0.00, + "vram_gb": 70, # H200 slice + "vcpu": 0, # Dynamic + "ram_gb": 0, # Dynamic + "disk_gb": 50, + "description": "Free GPU on-demand (requires PRO to host)", + "requires_pro": True, + }, + "t4-small": { + "cost_per_hour": 0.40, + "vram_gb": 16, + "vcpu": 4, + "ram_gb": 15, + "disk_gb": 50, + "description": "Entry-level GPU for small models", + }, + "t4-medium": { + "cost_per_hour": 0.60, + "vram_gb": 16, + "vcpu": 8, + "ram_gb": 30, + "disk_gb": 100, + "description": "More CPU/RAM with T4 GPU", + }, + "l4": { + "cost_per_hour": 0.80, + "vram_gb": 24, + "vcpu": 8, + "ram_gb": 30, + "disk_gb": 400, + "description": "Modern GPU, good price/performance", + }, + "l40s": { + "cost_per_hour": 1.80, + "vram_gb": 48, + "vcpu": 8, + "ram_gb": 62, + "disk_gb": 380, + "description": "High VRAM for 7-13B models", + }, + "a10g-small": { + "cost_per_hour": 1.00, + "vram_gb": 24, + "vcpu": 4, + "ram_gb": 14, + "disk_gb": 110, + "description": "24GB VRAM for medium models", + }, + "a10g-large": { + "cost_per_hour": 1.50, + "vram_gb": 24, + "vcpu": 12, + "ram_gb": 46, + "disk_gb": 200, + "description": "More CPU/RAM with A10G GPU", + }, + "a100-large": { + "cost_per_hour": 2.50, + "vram_gb": 80, + "vcpu": 12, + "ram_gb": 142, + "disk_gb": 1000, + "description": "Top tier for large models", + }, +} + + +def recommend_hardware( + params_billions: float | None = None, + pipeline_tag: str | None = None, + prefer_free: bool = True, + user_has_pro: bool = False, +) -> str: + """ + Recommend hardware tier based on model size and type. 
+ + Args: + params_billions: Estimated model size in billions of parameters + pipeline_tag: HF pipeline tag (e.g., "text-generation", "text-to-image") + prefer_free: Prefer free tiers when possible + user_has_pro: Whether user has PRO subscription (for ZeroGPU) + + Returns: + Recommended hardware tier name + """ + # Default to ZeroGPU if user has PRO and prefers free + if prefer_free and user_has_pro: + default = "zero-a10g" + elif prefer_free: + default = "cpu-basic" + else: + default = "t4-small" + + if params_billions is None: + return default + + # Text generation models need more VRAM + if pipeline_tag in ["text-generation", "text2text-generation", "conversational"]: + if params_billions < 0.5: + return "cpu-upgrade" if prefer_free else "t4-small" + elif params_billions < 3: + if user_has_pro: + return "zero-a10g" + return "t4-small" if prefer_free else "l4" + elif params_billions < 7: + if user_has_pro: + return "zero-a10g" # ZeroGPU H200 can handle 7B + return "l4" + elif params_billions < 14: + return "l40s" + elif params_billions < 35: + return "a100-large" + else: + return "a100-large" # May need quantization + + # Image generation models + elif pipeline_tag in ["text-to-image", "image-to-image"]: + if params_billions and params_billions > 3: + return "l4" # SDXL-class models + if user_has_pro: + return "zero-a10g" + return "t4-small" + + # Audio models + elif pipeline_tag in ["text-to-speech", "automatic-speech-recognition", "audio-to-audio"]: + if user_has_pro: + return "zero-a10g" + return "t4-small" + + # Classification/detection models (typically smaller) + elif pipeline_tag in [ + "image-classification", "object-detection", "image-segmentation", + "text-classification", "token-classification", "zero-shot-classification" + ]: + return "cpu-upgrade" + + # Embedding models + elif pipeline_tag in ["feature-extraction", "sentence-similarity"]: + return "cpu-upgrade" + + # Vision-language models + elif pipeline_tag in ["visual-question-answering", "image-to-text", "document-question-answering"]: + if user_has_pro: + return "zero-a10g" + return "l4" + + # Default based on size + if params_billions < 1: + return "cpu-upgrade" + elif params_billions < 7: + if user_has_pro: + return "zero-a10g" + return "l4" + else: + return "l40s" + + +def detect_model_type(model_id: str) -> dict: + """ + Detect whether a model is a full model or LoRA adapter. 
+ + Returns dict with: + - is_adapter: bool + - base_model: str or None (if adapter) + - has_full_weights: bool + """ + api = HfApi() + result = { + "is_adapter": False, + "base_model": None, + "has_full_weights": False, + } + + try: + # List files in the repo + files = api.list_repo_files(model_id, repo_type="model") + file_names = [f.split("/")[-1] for f in files] + + # Check for adapter files + has_adapter_config = "adapter_config.json" in file_names + has_adapter_model = any("adapter_model" in f for f in file_names) + + # Check for full model files + has_full_weights = any( + f in file_names for f in [ + "model.safetensors", + "pytorch_model.bin", + "model-00001-of-00001.safetensors", # Single shard + ] + ) or any( + "model-" in f and ".safetensors" in f for f in file_names # Sharded + ) + + result["has_full_weights"] = has_full_weights + + # If it has adapter files but no full weights, it's a LoRA adapter + if has_adapter_config and has_adapter_model and not has_full_weights: + result["is_adapter"] = True + + # Try to get base model from adapter_config.json + try: + config_path = hf_hub_download(model_id, "adapter_config.json") + with open(config_path) as f: + adapter_config = json.load(f) + result["base_model"] = adapter_config.get("base_model_name_or_path") + except Exception: + pass + + except Exception as e: + print(f"Warning: Could not fully analyze model: {e}") + + return result + + +def is_gated_model(model_id: str) -> bool: + """ + Check if a model is likely gated and requires HF_TOKEN authentication. + + Gated models require: + 1. User to accept the license on the model page + 2. HF_TOKEN secret added to Space Settings + """ + for prefix in GATED_MODEL_PREFIXES: + if model_id.startswith(prefix): + return True + return False + + +def has_inference_api(model_id: str) -> bool: + """ + Check if a model likely has Inference API support. + + This uses heuristics since the API doesn't directly expose this info: + 1. Check if the model is from a known provider + 2. User-uploaded models (personal namespaces) typically don't have it + 3. LoRA adapters never have direct Inference API + """ + # First check if it's an adapter - adapters never have Inference API + model_type = detect_model_type(model_id) + if model_type["is_adapter"]: + return False + + org = model_id.split("/")[0] if "/" in model_id else None + + if org in INFERENCE_API_PROVIDERS: + return True + + # Try to check model info for inference status + try: + info = model_info(model_id) + # Models with "inference" in pipeline_tag or with widget usually have API + if info.pipeline_tag and info.pipeline_tag in [ + "text-generation", "text2text-generation", "conversational", + "fill-mask", "text-classification", "image-classification", + "text-to-image", "image-to-text" + ]: + # Popular models from big orgs usually have it + if info.downloads and info.downloads > 10000: + return True + except Exception: + pass + + return False + + +# ============================================================================ +# TEMPLATES FOR INFERENCE API (models that support it) +# ============================================================================ + +CHAT_APP_INFERENCE = '''"""Chat interface for {model_id} using Inference API""" +import os +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "{model_id}" + +# Token required for gated models (Llama, Mistral, Gemma, etc.) 
+# Add HF_TOKEN as a Repository Secret in Space Settings +HF_TOKEN = os.environ.get("HF_TOKEN") +client = InferenceClient(MODEL_ID, token=HF_TOKEN) + + +def respond(message, history, system_message, max_tokens, temperature, top_p): + messages = [{{"role": "system", "content": system_message}}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({{"role": "user", "content": user_msg}}) + if assistant_msg: + messages.append({{"role": "assistant", "content": assistant_msg}}) + + messages.append({{"role": "user", "content": message}}) + + response = "" + for token in client.chat_completion( + messages, + max_tokens=max_tokens, + stream=True, + temperature=temperature, + top_p=top_p, + ): + delta = token.choices[0].delta.content or "" + response += delta + yield response + + +demo = gr.ChatInterface( + respond, + title="{title}", + description="Chat with {model_id}", + additional_inputs=[ + gr.Textbox(value="You are a helpful assistant.", label="System message"), + gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max tokens"), + gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), + ], + # IMPORTANT: Gradio 5.x requires nested lists for examples + examples=[ + ["Hello! How are you?"], + ["Write a Python function to sort a list"], + ["Explain this concept simply"], + ], +) + +if __name__ == "__main__": + demo.launch() +''' + + +# ============================================================================ +# TEMPLATES FOR ZEROGPU (models without Inference API) +# ============================================================================ + +CHAT_APP_ZEROGPU = '''"""Chat interface for {model_id} using ZeroGPU (free!)""" +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +MODEL_ID = "{model_id}" + +# Load tokenizer at startup (lightweight, no GPU needed) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Model will be loaded lazily on first request +model = None + + +def load_model(): + """Load model - called inside GPU context.""" + global model + if model is None: + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + return model + + +@spaces.GPU(duration=120) # GPU allocated for up to 120 seconds +def generate_response(message, history, system_message, max_tokens, temperature, top_p): + """Generate response - GPU is allocated only during this function.""" + # Load model on GPU + model = load_model() + + messages = [{{"role": "system", "content": system_message}}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({{"role": "user", "content": user_msg}}) + if assistant_msg: + messages.append({{"role": "assistant", "content": assistant_msg}}) + + messages.append({{"role": "user", "content": message}}) + + # Apply chat template + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + inputs = tokenizer([text], return_tensors="pt").to(model.device) + + # Generate (no streaming with ZeroGPU) + outputs = model.generate( + **inputs, + max_new_tokens=int(max_tokens), + temperature=temperature, + top_p=top_p, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + response = tokenizer.decode( + outputs[0][inputs['input_ids'].shape[1]:], + skip_special_tokens=True + ) + return response + + +demo = gr.ChatInterface( + generate_response, + title="{title}", 
+ description="Chat with {model_id} (powered by ZeroGPU - free!)", + additional_inputs=[ + gr.Textbox(value="You are a helpful assistant.", label="System message"), + gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"), + gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), + ], + # IMPORTANT: Gradio 5.x requires nested lists for examples + examples=[ + ["Hello! How are you?"], + ["Write a Python function to sort a list"], + ["Explain this concept simply"], + ], +) + +if __name__ == "__main__": + demo.launch() +''' + + +# ============================================================================ +# TEMPLATE FOR LORA ADAPTERS +# ============================================================================ + +CHAT_APP_LORA = '''"""Chat interface for {model_id} (LoRA adapter) using ZeroGPU""" +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import PeftModel + +# LoRA adapter +ADAPTER_ID = "{model_id}" +# Base model (from adapter_config.json) +BASE_MODEL_ID = "{base_model}" + +# Load tokenizer from adapter (lightweight, no GPU needed) +tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID) + +# Model will be loaded lazily on first request +model = None + + +def load_model(): + """Load and merge LoRA adapter - called inside GPU context.""" + global model + if model is None: + print(f"Loading base model: {{BASE_MODEL_ID}}") + base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + + print(f"Applying adapter: {{ADAPTER_ID}}") + peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID) + + # Merge for faster inference + print("Merging adapter weights...") + model = peft_model.merge_and_unload() + print("Model ready!") + return model + + +@spaces.GPU(duration=120) # GPU allocated for up to 120 seconds +def generate_response(message, history, system_message, max_tokens, temperature, top_p): + """Generate response - GPU is allocated only during this function.""" + # Load model on GPU + model = load_model() + + messages = [{{"role": "system", "content": system_message}}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({{"role": "user", "content": user_msg}}) + if assistant_msg: + messages.append({{"role": "assistant", "content": assistant_msg}}) + + messages.append({{"role": "user", "content": message}}) + + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + inputs = tokenizer([text], return_tensors="pt").to(model.device) + + outputs = model.generate( + **inputs, + max_new_tokens=int(max_tokens), + temperature=temperature, + top_p=top_p, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + response = tokenizer.decode( + outputs[0][inputs['input_ids'].shape[1]:], + skip_special_tokens=True + ) + return response + + +demo = gr.ChatInterface( + generate_response, + title="{title}", + description="LoRA fine-tuned model powered by ZeroGPU (free!)", + additional_inputs=[ + gr.Textbox(value="You are a helpful assistant.", label="System message"), + gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"), + gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"), + ], + examples=[ + ["Hello! 
How are you?"], + ["Write a Python function to sort a list"], + ["Explain this concept simply"], + ], +) + +if __name__ == "__main__": + demo.launch() +''' + + +# ============================================================================ +# OTHER TEMPLATES (unchanged patterns) +# ============================================================================ + +IMAGE_CLASSIFICATION_APP = '''"""Image classification with {model_id}""" +import gradio as gr +from transformers import pipeline + +MODEL_ID = "{model_id}" +classifier = pipeline("image-classification", model=MODEL_ID) + + +def classify(image): + if image is None: + return {{}} + results = classifier(image) + return {{r["label"]: r["score"] for r in results}} + + +demo = gr.Interface( + fn=classify, + inputs=gr.Image(type="pil", label="Upload Image"), + outputs=gr.Label(num_top_classes=5, label="Predictions"), + title="{title}", + description="Upload an image to classify it using {model_id}", + # IMPORTANT: Gradio 5.x requires nested lists for examples + examples=[], +) + +if __name__ == "__main__": + demo.launch() +''' + +TEXT_TO_IMAGE_APP = '''"""Text-to-image generation with {model_id}""" +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "{model_id}" +client = InferenceClient() + + +def generate(prompt, negative_prompt, width, height, guidance_scale, num_steps): + if not prompt: + return None + + image = client.text_to_image( + prompt, + negative_prompt=negative_prompt or None, + model=MODEL_ID, + width=width, + height=height, + guidance_scale=guidance_scale, + num_inference_steps=num_steps, + ) + return image + + +demo = gr.Interface( + fn=generate, + inputs=[ + gr.Textbox(label="Prompt", placeholder="A beautiful sunset over mountains..."), + gr.Textbox(label="Negative Prompt", placeholder="blurry, low quality"), + gr.Slider(512, 1024, value=1024, step=64, label="Width"), + gr.Slider(512, 1024, value=1024, step=64, label="Height"), + gr.Slider(1, 20, value=7.5, step=0.5, label="Guidance Scale"), + gr.Slider(10, 50, value=30, step=1, label="Steps"), + ], + outputs=gr.Image(label="Generated Image"), + title="{title}", + description="Generate images using {model_id}", +) + +if __name__ == "__main__": + demo.launch() +''' + +TEXT_GENERATION_APP = '''"""Text generation with {model_id}""" +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "{model_id}" +client = InferenceClient(MODEL_ID) + + +def generate(prompt, max_tokens, temperature, top_p): + if not prompt: + return "" + + response = client.text_generation( + prompt, + max_new_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + ) + return response + + +demo = gr.Interface( + fn=generate, + inputs=[ + gr.Textbox(label="Prompt", lines=5, placeholder="Enter your prompt..."), + gr.Slider(50, 1000, value=200, step=10, label="Max Tokens"), + gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"), + gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"), + ], + outputs=gr.Textbox(label="Generated Text", lines=10), + title="{title}", + description="Generate text using {model_id}", +) + +if __name__ == "__main__": + demo.launch() +''' + +EMBEDDING_APP = '''"""Text embeddings with {model_id}""" +import gradio as gr +from sentence_transformers import SentenceTransformer +import numpy as np + +MODEL_ID = "{model_id}" +model = SentenceTransformer(MODEL_ID) + + +def get_embeddings(texts): + if not texts.strip(): + return "Please enter some text." 
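+
+    # One text per line; the similarity matrix below uses np.inner, which only
+    # matches cosine similarity if the model returns normalized embeddings.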
+ + lines = [line.strip() for line in texts.split("\\n") if line.strip()] + embeddings = model.encode(lines) + + output = [] + for i, (text, emb) in enumerate(zip(lines, embeddings)): + output.append(f"Text {{i+1}}: {{text[:50]}}...") + output.append(f"Embedding shape: {{emb.shape}}") + output.append(f"First 5 values: {{emb[:5].tolist()}}") + output.append("") + + if len(lines) > 1: + output.append("Similarity Matrix:") + sim_matrix = np.inner(embeddings, embeddings) + for i in range(len(lines)): + row = [f"{{sim_matrix[i][j]:.3f}}" for j in range(len(lines))] + output.append(f" Text {{i+1}}: {{' | '.join(row)}}") + + return "\\n".join(output) + + +demo = gr.Interface( + fn=get_embeddings, + inputs=gr.Textbox(label="Input Texts (one per line)", lines=5), + outputs=gr.Textbox(label="Embeddings", lines=15), + title="{title}", + description="Generate embeddings using {model_id}. Enter multiple lines to see similarity scores.", +) + +if __name__ == "__main__": + demo.launch() +''' + + +# ============================================================================ +# REQUIREMENTS +# ============================================================================ + +REQUIREMENTS_INFERENCE = """gradio>=5.0.0 +huggingface_hub>=0.26.0 +""" + +REQUIREMENTS_ZEROGPU = """gradio>=5.0.0 +torch +transformers +accelerate +spaces +""" + +REQUIREMENTS_LORA = """gradio>=5.0.0 +torch +transformers +accelerate +spaces +peft +""" + +REQUIREMENTS_IMAGE_CLASS = """gradio>=5.0.0 +transformers>=4.40.0 +torch>=2.0.0 +Pillow>=10.0.0 +""" + +REQUIREMENTS_EMBEDDING = """gradio>=5.0.0 +sentence-transformers>=2.2.0 +numpy>=1.24.0 +""" + + +# ============================================================================ +# README TEMPLATES +# ============================================================================ + +README_TEMPLATE_INFERENCE = """--- +title: {title} +emoji: {emoji} +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 5.9.1 +app_file: app.py +pinned: false +license: apache-2.0 +short_description: {short_description} +--- + +# {title} + +{description} + +## Model + +This Space uses [{model_id}](https://huggingface.co/{model_id}). + +## Usage + +{usage} +""" + +README_TEMPLATE_ZEROGPU = """--- +title: {title} +emoji: {emoji} +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 5.9.1 +app_file: app.py +pinned: false +license: apache-2.0 +short_description: {short_description} +suggested_hardware: zero-a10g +--- + +# {title} + +{description} + +## Model + +This Space uses [{model_id}](https://huggingface.co/{model_id}). + +## How It Works + +This Space uses **ZeroGPU** - a free GPU allocation system: +- The app runs on CPU by default (free) +- When you send a message, a GPU is allocated on-demand +- After generation completes, the GPU is released +- You get a daily quota of free GPU time + +## Usage + +{usage} +""" + + +def deploy_model( + model_id: str, + model_type: str, + space_name: str | None = None, + hardware: str | None = None, + private: bool = False, + organization: str | None = None, + method: str = "auto", + force_zerogpu: bool = False, + base_model: str | None = None, + dry_run: bool = False, + skip_preflight: bool = False, + set_hardware: bool = False, +) -> str: + """ + Deploy a model to a new Space with smart defaults. + + Args: + model_id: HuggingFace model ID (e.g., "username/model-name") + model_type: Type of deployment (chat, text-generation, etc.) 
+ space_name: Custom name for the Space (auto-generated if not provided) + hardware: Hardware tier (auto-selected if not provided) + private: Make the Space private + organization: Create Space under an organization + method: Deployment method - "inference-api", "zerogpu", or "auto" + force_zerogpu: (Deprecated) Use method="zerogpu" instead + base_model: Base model ID for LoRA adapters + dry_run: Show what would be done without making changes + skip_preflight: Skip pre-flight checks + set_hardware: Automatically configure hardware after deployment + + Returns: + Space URL on success + """ + # Handle deprecated force_zerogpu flag + if force_zerogpu and method == "auto": + method = "zerogpu" + api = HfApi() + + # === PRE-FLIGHT CHECKS === + if not skip_preflight: + print("\n📋 Running pre-flight checks...") + + # Import preflight module + try: + from preflight import check_hf_token, check_zerogpu_eligibility, check_model_access + except ImportError: + # Fallback if not in same directory + import sys + from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent)) + from preflight import check_hf_token, check_zerogpu_eligibility, check_model_access + + # Check token + token_result = check_hf_token() + if not token_result.valid: + raise ValueError(f"Token check failed: {token_result.error}") + print(f" ✓ Token valid (user: {token_result.username})") + + # Check model access + access_result = check_model_access(model_id) + if not access_result.accessible: + raise ValueError(f"Model not accessible: {access_result.error}") + print(f" ✓ Model accessible") + + # Check ZeroGPU eligibility (for later hardware selection) + subscription_result = check_zerogpu_eligibility() + user_has_pro = subscription_result.can_host_zerogpu + if user_has_pro: + print(f" ✓ PRO subscription (can host ZeroGPU)") + else: + print(f" ⚠ Free tier (ZeroGPU hosting requires PRO)") + else: + user_has_pro = False # Conservative default + print("\n⚠ Skipping pre-flight checks") + + user = api.whoami() + username = organization or user["name"] + + # Generate space name from model if not provided + if not space_name: + space_name = model_id.split("/")[-1].lower().replace("_", "-") + "-demo" + + repo_id = f"{username}/{space_name}" + + # === MODEL ANALYSIS === + print(f"\n🔍 Analyzing model: {model_id}") + + # Detect model type (LoRA adapter vs full model) + model_info_result = detect_model_type(model_id) + is_lora_adapter = model_info_result["is_adapter"] + detected_base_model = model_info_result.get("base_model") + has_full_weights = model_info_result.get("has_full_weights", False) + + print(f" Model type: {'LoRA adapter' if is_lora_adapter else 'Full model' if has_full_weights else 'Unknown'}") + + # Use provided base model or detected one + if is_lora_adapter: + base_model = base_model or detected_base_model + if base_model: + print(f" Base model: {base_model}") + else: + raise ValueError( + f"Model {model_id} appears to be a LoRA adapter but no base model was found.\n" + f"Please provide --base-model argument with the base model ID.\n" + f"Check adapter_config.json for 'base_model_name_or_path' field." 
+ ) + + # Determine deployment strategy based on method parameter + inference_api_available = has_inference_api(model_id) + print(f" Inference API available: {'Yes' if inference_api_available else 'No'}") + + # Determine actual method to use + if method == "auto": + # Auto-detect: LoRA always uses ZeroGPU, otherwise check Inference API + if is_lora_adapter: + actual_method = "zerogpu" + elif inference_api_available: + actual_method = "inference-api" + else: + actual_method = "zerogpu" + else: + actual_method = method + + # Validate method choice + if actual_method == "inference-api" and is_lora_adapter: + print(f" ⚠️ WARNING: LoRA adapters do NOT work with Inference API!") + print(f" Switching to ZeroGPU automatically.") + actual_method = "zerogpu" + + if actual_method == "inference-api" and not inference_api_available: + print(f" ⚠️ WARNING: This model may not have Inference API support.") + print(f" Personal/fine-tuned models typically don't support Inference API.") + print(f" Consider using --method zerogpu instead if deployment fails.") + + print(f" Deployment method: {actual_method}") + + if model_type == "chat": + if is_lora_adapter: + app_template = CHAT_APP_LORA + requirements = REQUIREMENTS_LORA + readme_template = README_TEMPLATE_ZEROGPU + default_hardware = "zero-a10g" + strategy = f"LoRA Adapter + ZeroGPU (base: {base_model})" + elif actual_method == "inference-api": + app_template = CHAT_APP_INFERENCE + requirements = REQUIREMENTS_INFERENCE + readme_template = README_TEMPLATE_INFERENCE + default_hardware = "cpu-basic" + strategy = "Inference API" + else: # zerogpu + app_template = CHAT_APP_ZEROGPU + requirements = REQUIREMENTS_ZEROGPU + readme_template = README_TEMPLATE_ZEROGPU + default_hardware = "zero-a10g" + strategy = "ZeroGPU" + elif model_type == "image-classification": + app_template = IMAGE_CLASSIFICATION_APP + requirements = REQUIREMENTS_IMAGE_CLASS + readme_template = README_TEMPLATE_INFERENCE + default_hardware = "cpu-upgrade" + strategy = "Local transformers" + elif model_type == "text-to-image": + app_template = TEXT_TO_IMAGE_APP + requirements = REQUIREMENTS_INFERENCE + readme_template = README_TEMPLATE_INFERENCE + default_hardware = "cpu-basic" + strategy = "Inference API" + elif model_type == "text-generation": + app_template = TEXT_GENERATION_APP + requirements = REQUIREMENTS_INFERENCE + readme_template = README_TEMPLATE_INFERENCE + default_hardware = "cpu-basic" + strategy = "Inference API" + elif model_type == "embedding": + app_template = EMBEDDING_APP + requirements = REQUIREMENTS_EMBEDDING + readme_template = README_TEMPLATE_INFERENCE + default_hardware = "cpu-upgrade" + strategy = "Local sentence-transformers" + else: + raise ValueError(f"Unknown model type: {model_type}") + + # Smart hardware selection if not explicitly provided + if hardware is None: + # Use recommend_hardware function with user's subscription status + if not skip_preflight: + hardware = recommend_hardware( + params_billions=None, # Could estimate from model + pipeline_tag=model_type, + prefer_free=True, + user_has_pro=user_has_pro, + ) + # Override to paid if ZeroGPU selected but user doesn't have PRO + if hardware == "zero-a10g" and not user_has_pro: + print(f" ⚠ ZeroGPU requires PRO subscription, falling back to {default_hardware}") + hardware = default_hardware if default_hardware != "zero-a10g" else "t4-small" + else: + hardware = default_hardware + + # === DEPLOYMENT SUMMARY === + print(f"\n🚀 Deployment Plan") + print(f" Model: {model_id}") + print(f" Type: {model_type}") + 
print(f" Strategy: {strategy}") + print(f" Space: {repo_id}") + print(f" Hardware: {hardware}") + if hardware in HARDWARE_TIERS: + tier_info = HARDWARE_TIERS[hardware] + if tier_info["cost_per_hour"] > 0: + print(f" Cost: ${tier_info['cost_per_hour']:.2f}/hour") + else: + print(f" Cost: Free") + print(f" Private: {private}") + + # Check for gated model and warn + gated = is_gated_model(model_id) + if gated: + print(f" ⚠️ GATED MODEL: Requires HF_TOKEN secret after deployment") + + # === DRY RUN === + if dry_run: + print(f"\n[DRY RUN] Would create Space with above configuration.") + print(f"[DRY RUN] No changes made.") + if gated: + print(f"\n[DRY RUN] Note: This is a gated model. After deployment:") + print(f" 1. Accept license at: https://huggingface.co/{model_id}") + print(f" 2. Add HF_TOKEN secret in Space Settings") + return f"https://huggingface.co/spaces/{repo_id}" + + # === CREATE SPACE === + print(f"\n📦 Creating Space...") + create_repo( + repo_id=repo_id, + repo_type="space", + space_sdk="gradio", + private=private, + exist_ok=False, + ) + print(" ✓ Created Space repository") + + # Generate title + title = model_id.split("/")[-1].replace("-", " ").replace("_", " ").title() + + # Get app content + if is_lora_adapter: + app_content = app_template.format( + model_id=model_id, + title=title, + base_model=base_model, + ) + else: + app_content = app_template.format( + model_id=model_id, + title=title, + ) + + # Upload app.py + upload_file( + path_or_fileobj=app_content.encode(), + path_in_repo="app.py", + repo_id=repo_id, + repo_type="space", + ) + print(" ✓ Uploaded app.py") + + # Upload requirements.txt + upload_file( + path_or_fileobj=requirements.encode(), + path_in_repo="requirements.txt", + repo_id=repo_id, + repo_type="space", + ) + print(" ✓ Uploaded requirements.txt") + + # Generate and upload README + emoji_map = { + "chat": "💬", + "image-classification": "🖼️", + "text-to-image": "🎨", + "text-generation": "📝", + "embedding": "🔢", + } + + usage_map = { + "chat": "Type a message to start chatting with the model.", + "image-classification": "Upload an image to classify it.", + "text-to-image": "Enter a prompt to generate an image.", + "text-generation": "Enter a prompt to generate text.", + "embedding": "Enter text (one item per line) to generate embeddings.", + } + + readme = readme_template.format( + title=title, + emoji=emoji_map.get(model_type, "🤖"), + short_description=f"{model_type.replace('-', ' ').title()} demo", + description=f"Interactive demo of {model_id} using Gradio.", + model_id=model_id, + usage=usage_map.get(model_type, "Use the interface to interact with the model."), + ) + + upload_file( + path_or_fileobj=readme.encode(), + path_in_repo="README.md", + repo_id=repo_id, + repo_type="space", + ) + print(" ✓ Uploaded README.md") + + space_url = f"https://huggingface.co/spaces/{repo_id}" + settings_url = f"https://huggingface.co/spaces/{repo_id}/settings" + print(f"\n✅ Space created successfully!") + print(f" URL: {space_url}") + print(f" Strategy: {strategy}") + + # === POST-DEPLOYMENT: Hardware Configuration === + # Check if hardware needs to be set (GPU models need explicit hardware setting) + needs_gpu = hardware in ["zero-a10g", "t4-small", "t4-medium", "l4", "l40s", "a10g-small", "a10g-large", "a100-large"] + + if needs_gpu and not dry_run: + print(f"\n⚠️ IMPORTANT: GPU hardware configuration required!") + print(f" Your Space uses {strategy} which requires GPU hardware.") + print(f" Recommended hardware: {hardware}") + if hardware in HARDWARE_TIERS: + 
tier_info = HARDWARE_TIERS[hardware] + if tier_info["cost_per_hour"] == 0: + print(f" Cost: Free (requires PRO subscription to host)") + else: + print(f" Cost: ${tier_info['cost_per_hour']:.2f}/hour") + + print(f"\n To configure hardware:") + print(f" 1. Go to: {settings_url}") + print(f" 2. Select '{hardware}' under 'Space Hardware'") + print(f" 3. Click 'Apply' to save changes") + + # If set_hardware flag is True, configure it automatically + if set_hardware: + print(f"\n🔧 Configuring hardware automatically...") + try: + api.request_space_hardware(repo_id, hardware) + print(f" ✓ Hardware set to: {hardware}") + except Exception as e: + print(f" ⚠ Could not set hardware automatically: {e}") + print(f" Please configure manually at: {settings_url}") + else: + if hardware == "cpu-basic": + print(f" Hardware: cpu-basic (Free, no configuration needed)") + + if hardware == "zero-a10g": + print("\n Note: ZeroGPU provides free GPU access with daily quota") + print(" Requires PRO subscription to host") + + # === POST-DEPLOYMENT: Gated Model Authentication === + if is_gated_model(model_id) and not dry_run: + print(f"\n🔐 GATED MODEL: {model_id} requires authentication!") + print(f" This model is gated and requires HF_TOKEN to access.") + print(f"\n Required steps:") + print(f" 1. Accept the model's license at: https://huggingface.co/{model_id}") + print(f" 2. Go to Space Settings: {settings_url}") + print(f" 3. Add Repository Secret: HF_TOKEN = ") + print(f"\n Without HF_TOKEN, the Space will show 'No API found' error.") + + return space_url + + +def main(): + parser = argparse.ArgumentParser( + description="Deploy a Hugging Face model to a Space", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Model Types: + chat - Chat/instruct models + text-generation - Text completion models + image-classification - Image classifiers + text-to-image - Diffusion models + embedding - Sentence embedding models + +Deployment Methods (--method): + inference-api - Uses HF's serverless Inference API (free, cpu-basic) + Works with: Major providers (Qwen, meta-llama, mistralai, google) + Does NOT work with: LoRA adapters, personal/fine-tuned models + + zerogpu - Loads model directly using transformers + @spaces.GPU (free with quota) + Works with: Any model, including LoRA adapters and fine-tuned models + Requires: PRO subscription to host + + auto - Auto-detect best method (default) + +Examples: + # Explicit Inference API deployment + python deploy_model.py meta-llama/Llama-3.1-8B-Instruct --type chat --method inference-api + + # Explicit ZeroGPU deployment + python deploy_model.py username/my-finetuned-model --type chat --method zerogpu + + # Auto-detect (default) + python deploy_model.py meta-llama/Llama-3.1-8B-Instruct --type chat + + # LoRA adapter (always uses ZeroGPU) + python deploy_model.py username/my-lora-adapter --type chat --base-model Qwen/Qwen2.5-Coder-1.5B-Instruct + + # Dry run to preview + python deploy_model.py meta-llama/Llama-3.1-8B-Instruct --type chat --dry-run + """, + ) + + parser.add_argument("model_id", help="Model ID on Hugging Face Hub") + parser.add_argument( + "--type", + dest="model_type", + required=True, + choices=["chat", "image-classification", "text-to-image", "text-generation", "embedding"], + help="Type of model/interface", + ) + parser.add_argument( + "--name", + dest="space_name", + help="Custom Space name (default: derived from model)", + ) + parser.add_argument( + "--hardware", + choices=list(HARDWARE_TIERS.keys()), + help="Hardware tier (default: 
auto-selected based on strategy)", + ) + parser.add_argument( + "--private", + action="store_true", + help="Make Space private", + ) + parser.add_argument( + "--org", + dest="organization", + help="Organization to create Space under", + ) + parser.add_argument( + "--method", + choices=["inference-api", "zerogpu", "auto"], + default="auto", + help="Deployment method: inference-api (serverless), zerogpu (direct model loading), or auto (detect)", + ) + parser.add_argument( + "--force-zerogpu", + action="store_true", + help="(Deprecated) Use --method zerogpu instead", + ) + parser.add_argument( + "--base-model", + dest="base_model", + help="Base model ID for LoRA adapters (auto-detected from adapter_config.json if not provided)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show deployment plan without making changes", + ) + parser.add_argument( + "--skip-preflight", + action="store_true", + help="Skip pre-flight checks (token, subscription, model access)", + ) + parser.add_argument( + "--set-hardware", + action="store_true", + help="Automatically configure hardware after deployment (requires API access)", + ) + + args = parser.parse_args() + + try: + deploy_model( + model_id=args.model_id, + model_type=args.model_type, + space_name=args.space_name, + hardware=args.hardware, + private=args.private, + organization=args.organization, + method=args.method, + force_zerogpu=args.force_zerogpu, + base_model=args.base_model, + dry_run=args.dry_run, + skip_preflight=args.skip_preflight, + set_hardware=args.set_hardware, + ) + except Exception as e: + print(f"Error deploying model: {e}") + return 1 + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/skills/hf-create-a-space/scripts/manage_space.py b/skills/hf-create-a-space/scripts/manage_space.py new file mode 100644 index 0000000..7ae25a3 --- /dev/null +++ b/skills/hf-create-a-space/scripts/manage_space.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Manage Hugging Face Space settings: hardware, secrets, pause/restart. 
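+
+Requires an authenticated huggingface_hub token with write access to the Space
+(e.g. via `huggingface-cli login` or the HF_TOKEN environment variable).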
+ +Usage: + python manage_space.py status username/my-space + python manage_space.py hardware username/my-space --tier t4-small + python manage_space.py secret username/my-space --key API_KEY --value xxx + python manage_space.py pause username/my-space + python manage_space.py restart username/my-space +""" + +import argparse +import os +from huggingface_hub import ( + HfApi, + request_space_hardware, + add_space_secret, + delete_space_secret, + pause_space, + restart_space, + space_info, +) + + +def get_space_status(repo_id: str) -> dict: + """Get current status of a Space.""" + api = HfApi() + info = api.space_info(repo_id) + + status = { + "id": info.id, + "author": info.author, + "sdk": info.sdk, + "runtime": { + "stage": info.runtime.stage if info.runtime else "unknown", + "hardware": info.runtime.hardware if info.runtime else "unknown", + "storage": info.runtime.storage if info.runtime else None, + }, + "private": info.private, + "likes": info.likes, + "created_at": str(info.created_at) if info.created_at else None, + "last_modified": str(info.last_modified) if info.last_modified else None, + } + + return status + + +def print_status(repo_id: str): + """Print formatted Space status.""" + print(f"\n📊 Space Status: {repo_id}") + print("=" * 50) + + try: + status = get_space_status(repo_id) + + print(f" ID: {status['id']}") + print(f" Author: {status['author']}") + print(f" SDK: {status['sdk']}") + print(f" Stage: {status['runtime']['stage']}") + print(f" Hardware: {status['runtime']['hardware']}") + print(f" Storage: {status['runtime']['storage'] or 'None'}") + print(f" Private: {status['private']}") + print(f" Likes: {status['likes']}") + print(f" Created: {status['created_at']}") + print(f" Modified: {status['last_modified']}") + print() + print(f" URL: https://huggingface.co/spaces/{repo_id}") + + except Exception as e: + print(f" Error: {e}") + + +def set_hardware(repo_id: str, hardware: str): + """Change Space hardware tier.""" + print(f"\n⚙️ Setting hardware for {repo_id}") + print(f" New tier: {hardware}") + + request_space_hardware(repo_id=repo_id, hardware=hardware) + print(" ✓ Hardware updated successfully") + print(" Note: Space will restart with new hardware") + + +def add_secret(repo_id: str, key: str, value: str): + """Add or update a Space secret.""" + print(f"\n🔐 Adding secret to {repo_id}") + print(f" Key: {key}") + + add_space_secret(repo_id=repo_id, key=key, value=value) + print(" ✓ Secret added successfully") + print(f" Access in code: os.environ.get('{key}')") + + +def remove_secret(repo_id: str, key: str): + """Remove a Space secret.""" + print(f"\n🔐 Removing secret from {repo_id}") + print(f" Key: {key}") + + delete_space_secret(repo_id=repo_id, key=key) + print(" ✓ Secret removed successfully") + + +def pause(repo_id: str): + """Pause a Space to stop billing.""" + print(f"\n⏸️ Pausing Space: {repo_id}") + + pause_space(repo_id=repo_id) + print(" ✓ Space paused successfully") + print(" Note: No charges while paused") + + +def restart(repo_id: str): + """Restart a paused Space.""" + print(f"\n▶️ Restarting Space: {repo_id}") + + restart_space(repo_id=repo_id) + print(" ✓ Space restart initiated") + print(" Note: May take a few minutes to become available") + + +def main(): + parser = argparse.ArgumentParser( + description="Manage Hugging Face Space settings", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # Status command + status_parser = subparsers.add_parser("status", 
help="Get Space status") + status_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + # Hardware command + hw_parser = subparsers.add_parser("hardware", help="Change hardware tier") + hw_parser.add_argument("repo_id", help="Space ID (username/space-name)") + hw_parser.add_argument( + "--tier", + required=True, + choices=[ + "cpu-basic", + "cpu-upgrade", + "zero-a10g", + "t4-small", + "t4-medium", + "l4", + "l40s", + "a10g-small", + "a10g-large", + "a100-large", + ], + help="Hardware tier", + ) + + # Secret commands + secret_parser = subparsers.add_parser("secret", help="Add/update a secret") + secret_parser.add_argument("repo_id", help="Space ID (username/space-name)") + secret_parser.add_argument("--key", required=True, help="Secret key name") + secret_parser.add_argument("--value", required=True, help="Secret value") + + rm_secret_parser = subparsers.add_parser("rm-secret", help="Remove a secret") + rm_secret_parser.add_argument("repo_id", help="Space ID (username/space-name)") + rm_secret_parser.add_argument("--key", required=True, help="Secret key name") + + # Pause/Restart commands + pause_parser = subparsers.add_parser("pause", help="Pause Space (stop billing)") + pause_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + restart_parser = subparsers.add_parser("restart", help="Restart paused Space") + restart_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + args = parser.parse_args() + + # Execute command + if args.command == "status": + print_status(args.repo_id) + elif args.command == "hardware": + set_hardware(args.repo_id, args.tier) + elif args.command == "secret": + add_secret(args.repo_id, args.key, args.value) + elif args.command == "rm-secret": + remove_secret(args.repo_id, args.key) + elif args.command == "pause": + pause(args.repo_id) + elif args.command == "restart": + restart(args.repo_id) + + +if __name__ == "__main__": + main() diff --git a/skills/hf-create-a-space/scripts/monitor_space.py b/skills/hf-create-a-space/scripts/monitor_space.py new file mode 100644 index 0000000..9ad0fd3 --- /dev/null +++ b/skills/hf-create-a-space/scripts/monitor_space.py @@ -0,0 +1,532 @@ +#!/usr/bin/env python3 +""" +Monitor Hugging Face Space build and runtime status with error detection. + +Provides: +- Real-time build status monitoring +- Error pattern detection in logs +- Health checks for running Spaces +- Actionable fix suggestions + +Usage: + python monitor_space.py status username/my-space + python monitor_space.py watch username/my-space + python monitor_space.py logs username/my-space + python monitor_space.py analyze-errors username/my-space + python monitor_space.py health-check username/my-space +""" + +import argparse +import re +import sys +import time +from dataclasses import dataclass +from typing import Optional + +import requests +from huggingface_hub import HfApi, space_info + + +# Common error patterns and their fixes +ERROR_PATTERNS = { + "module_not_found": { + "pattern": r"ModuleNotFoundError: No module named ['\"]([^'\"]+)['\"]", + "description": "Missing Python package", + "fix_template": "Add '{match}' to requirements.txt", + "auto_fixable": True, + }, + "import_error": { + "pattern": r"ImportError: cannot import name ['\"]([^'\"]+)['\"]", + "description": "Import error - version mismatch or missing package", + "fix_template": "Check package versions in requirements.txt. 
May need to pin specific version.", + "auto_fixable": False, + }, + "cuda_oom": { + "pattern": r"(CUDA out of memory|OutOfMemoryError|torch\.cuda\.OutOfMemoryError)", + "description": "GPU memory exceeded", + "fix_template": "Model too large for hardware. Upgrade to larger GPU tier or use quantization.", + "auto_fixable": False, + }, + "cuda_not_available": { + "pattern": r"(CUDA is not available|AssertionError: Torch not compiled with CUDA)", + "description": "No GPU available but code requires it", + "fix_template": "Set Space hardware to GPU tier (ZeroGPU, T4, L4, etc.) in Settings.", + "auto_fixable": False, + }, + "hf_token_invalid": { + "pattern": r"(401|403).*(token|unauthorized|forbidden)", + "description": "HF token invalid or missing", + "fix_template": "Add HF_TOKEN secret in Space Settings with a valid token.", + "auto_fixable": False, + }, + "gradio_hffolder": { + "pattern": r"cannot import name ['\"]HfFolder['\"]", + "description": "Gradio/huggingface_hub version mismatch", + "fix_template": "Use gradio>=5.0.0 and huggingface_hub>=0.26.0 in requirements.txt", + "auto_fixable": True, + }, + "model_not_found": { + "pattern": r"OSError: .+ does not appear to have a file named (pytorch_model\.bin|model\.safetensors)", + "description": "Model files not found - may be LoRA adapter", + "fix_template": "Model appears to be a LoRA adapter. Use PEFT to load: PeftModel.from_pretrained(base_model, adapter_id)", + "auto_fixable": False, + }, + "chat_template_missing": { + "pattern": r"(Cannot use chat template|Chat template is not defined)", + "description": "Model doesn't have chat template", + "fix_template": "Use text-generation instead of chat, or apply a chat template manually.", + "auto_fixable": False, + }, + "zerogpu_timeout": { + "pattern": r"(GPU allocation timed out|Queue timeout|ZeroGPU.*timeout)", + "description": "ZeroGPU queue timeout", + "fix_template": "High demand on ZeroGPU. Try again later or use a paid GPU tier.", + "auto_fixable": False, + }, + "zerogpu_duration": { + "pattern": r"(GPU time limit exceeded|duration.*exceeded)", + "description": "Function exceeded ZeroGPU time limit", + "fix_template": "Increase @spaces.GPU(duration=X) or optimize code to run faster.", + "auto_fixable": False, + }, + "gradio_examples_format": { + "pattern": r"ValueError: .*(examples|must be.*(nested|list of lists))", + "description": "Gradio 5.x examples format error", + "fix_template": "Use nested lists for examples: [['ex1'], ['ex2']] not ['ex1', 'ex2']", + "auto_fixable": True, + }, + "disk_space": { + "pattern": r"(No space left on device|OSError: \[Errno 28\])", + "description": "Disk space exhausted", + "fix_template": "Model too large for disk. Upgrade storage tier or use streaming.", + "auto_fixable": False, + }, + "gated_model": { + "pattern": r"(Cannot access gated repo|gated.*access|401.*gated)", + "description": "Gated model requires access", + "fix_template": "Request access to the model on HF Hub, then add HF_TOKEN secret.", + "auto_fixable": False, + }, + "syntax_error": { + "pattern": r"SyntaxError: (.+)", + "description": "Python syntax error", + "fix_template": "Fix syntax error in app.py: {match}", + "auto_fixable": False, + }, + "port_in_use": { + "pattern": r"(Address already in use|port .* is already allocated)", + "description": "Port conflict", + "fix_template": "Remove explicit port binding. 
Gradio handles this automatically in Spaces.", + "auto_fixable": True, + }, +} + + +@dataclass +class SpaceStatus: + """Current status of a Space.""" + repo_id: str + stage: str # BUILDING, RUNNING, RUNTIME_ERROR, PAUSED, etc. + hardware: Optional[str] = None + sdk: Optional[str] = None + error_message: Optional[str] = None + url: Optional[str] = None + + +@dataclass +class DetectedError: + """An error detected in Space logs.""" + error_type: str + description: str + matched_text: str + fix_suggestion: str + auto_fixable: bool + match_groups: tuple = () + + +def get_space_status(repo_id: str) -> SpaceStatus: + """Get current status of a Space.""" + try: + info = space_info(repo_id) + runtime = info.runtime + + return SpaceStatus( + repo_id=repo_id, + stage=runtime.stage if runtime else "UNKNOWN", + hardware=runtime.hardware if runtime else None, + sdk=info.sdk, + url=f"https://huggingface.co/spaces/{repo_id}", + ) + except Exception as e: + return SpaceStatus( + repo_id=repo_id, + stage="ERROR", + error_message=str(e), + ) + + +def get_build_logs(repo_id: str, lines: int = 200) -> str: + """ + Fetch recent build logs from a Space. + + Note: This uses the HF Spaces logs endpoint which may require authentication. + """ + api = HfApi() + + try: + # Try to get logs via the API + # The logs endpoint format: https://huggingface.co/api/spaces/{repo_id}/logs/build + token = api.token + headers = {"Authorization": f"Bearer {token}"} if token else {} + + # Build logs endpoint + logs_url = f"https://huggingface.co/api/spaces/{repo_id}/logs/build" + response = requests.get(logs_url, headers=headers, timeout=30) + + if response.status_code == 200: + return response.text + elif response.status_code == 404: + return "[No build logs available yet]" + else: + return f"[Could not fetch logs: HTTP {response.status_code}]" + + except Exception as e: + return f"[Error fetching logs: {e}]" + + +def get_runtime_logs(repo_id: str, lines: int = 200) -> str: + """Fetch runtime logs from a running Space.""" + api = HfApi() + + try: + token = api.token + headers = {"Authorization": f"Bearer {token}"} if token else {} + + # Runtime logs endpoint + logs_url = f"https://huggingface.co/api/spaces/{repo_id}/logs/run" + response = requests.get(logs_url, headers=headers, timeout=30) + + if response.status_code == 200: + return response.text + elif response.status_code == 404: + return "[No runtime logs available]" + else: + return f"[Could not fetch logs: HTTP {response.status_code}]" + + except Exception as e: + return f"[Error fetching logs: {e}]" + + +def detect_errors(logs: str) -> list[DetectedError]: + """ + Parse logs to identify common error patterns. + + Returns a list of detected errors with fix suggestions. + """ + errors = [] + + for error_type, pattern_info in ERROR_PATTERNS.items(): + pattern = pattern_info["pattern"] + matches = re.finditer(pattern, logs, re.IGNORECASE | re.MULTILINE) + + for match in matches: + groups = match.groups() + fix = pattern_info["fix_template"] + + # Substitute match groups into fix template + if groups and "{match}" in fix: + fix = fix.format(match=groups[0]) + + errors.append(DetectedError( + error_type=error_type, + description=pattern_info["description"], + matched_text=match.group(0)[:200], # Truncate long matches + fix_suggestion=fix, + auto_fixable=pattern_info["auto_fixable"], + match_groups=groups, + )) + + return errors + + +def watch_space(repo_id: str, interval: int = 10, max_checks: int = 60): + """ + Watch Space build status until it's running or fails. 
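+    Polls the Space status every `interval` seconds; when the build or runtime
+    fails, it fetches the logs and runs them through the error-pattern detector.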
+ + Args: + repo_id: Space repository ID + interval: Seconds between status checks + max_checks: Maximum number of checks before timeout + """ + print(f"\nWatching Space: {repo_id}") + print(f"URL: https://huggingface.co/spaces/{repo_id}") + print("-" * 50) + + previous_stage = None + + for i in range(max_checks): + status = get_space_status(repo_id) + + if status.stage != previous_stage: + timestamp = time.strftime("%H:%M:%S") + print(f"[{timestamp}] Stage: {status.stage}") + + if status.stage == "RUNNING": + print(f"\n✅ Space is running!") + print(f" URL: {status.url}") + return True + + elif status.stage in ["RUNTIME_ERROR", "BUILD_ERROR"]: + print(f"\n❌ Space failed: {status.stage}") + print("\nAnalyzing errors...") + logs = get_build_logs(repo_id) + "\n" + get_runtime_logs(repo_id) + errors = detect_errors(logs) + + if errors: + print_error_analysis(errors) + else: + print(" No specific error patterns detected.") + print(" Check logs manually for more details.") + return False + + elif status.stage == "PAUSED": + print(f"\n⏸️ Space is paused. Restart it to continue.") + return False + + previous_stage = status.stage + + time.sleep(interval) + + print(f"\n⚠️ Timeout after {max_checks * interval} seconds") + print(f" Current stage: {status.stage}") + return False + + +def print_error_analysis(errors: list[DetectedError]): + """Pretty print detected errors with fixes.""" + print(f"\n{'=' * 60}") + print(f"DETECTED ERRORS ({len(errors)} found)") + print("=" * 60) + + # Deduplicate by error type + seen_types = set() + unique_errors = [] + for error in errors: + if error.error_type not in seen_types: + seen_types.add(error.error_type) + unique_errors.append(error) + + for i, error in enumerate(unique_errors, 1): + auto_fix = "🔧 Auto-fixable" if error.auto_fixable else "📋 Manual fix" + print(f"\n{i}. {error.description} [{auto_fix}]") + print(f" Type: {error.error_type}") + print(f" Matched: {error.matched_text[:100]}...") + print(f" Fix: {error.fix_suggestion}") + + print("\n" + "=" * 60) + + +def run_health_check(repo_id: str) -> dict: + """ + Run comprehensive health check on a Space. + + Returns dict with check results. 
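+
+    Checks performed: Space stage/hardware/SDK, known error patterns in the
+    build and runtime logs, and (for running Spaces) whether the public
+    *.hf.space URL responds. Illustrative usage:
+
+        results = run_health_check("username/my-space")  # hypothetical Space
+        if not results["healthy"]:
+            print(results["issues"])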
+ """ + results = { + "repo_id": repo_id, + "checks": {}, + "healthy": True, + "issues": [], + } + + # Check 1: Space status + status = get_space_status(repo_id) + results["checks"]["status"] = { + "stage": status.stage, + "hardware": status.hardware, + "sdk": status.sdk, + } + + if status.stage != "RUNNING": + results["healthy"] = False + results["issues"].append(f"Space is not running (stage: {status.stage})") + + # Check 2: Analyze logs for errors + logs = get_build_logs(repo_id) + "\n" + get_runtime_logs(repo_id) + errors = detect_errors(logs) + + results["checks"]["errors"] = { + "count": len(errors), + "types": list(set(e.error_type for e in errors)), + } + + if errors: + results["healthy"] = False + for error in errors[:3]: # Top 3 errors + results["issues"].append(f"{error.description}: {error.fix_suggestion}") + + # Check 3: Try to access the Space URL + if status.stage == "RUNNING": + try: + # Try to access the gradio API endpoint + space_url = f"https://{repo_id.replace('/', '-')}.hf.space" + response = requests.get(space_url, timeout=30) + results["checks"]["accessibility"] = { + "reachable": response.status_code == 200, + "status_code": response.status_code, + } + if response.status_code != 200: + results["healthy"] = False + results["issues"].append(f"Space URL returned status {response.status_code}") + except Exception as e: + results["checks"]["accessibility"] = { + "reachable": False, + "error": str(e), + } + results["healthy"] = False + results["issues"].append(f"Cannot reach Space: {e}") + + return results + + +def print_health_check(results: dict): + """Pretty print health check results.""" + print(f"\n{'=' * 60}") + print(f"HEALTH CHECK: {results['repo_id']}") + print("=" * 60) + + # Overall status + if results["healthy"]: + print("\n✅ HEALTHY - Space is running normally") + else: + print("\n❌ UNHEALTHY - Issues detected") + + # Status check + status = results["checks"].get("status", {}) + print(f"\nStatus:") + print(f" Stage: {status.get('stage', 'unknown')}") + print(f" Hardware: {status.get('hardware', 'unknown')}") + print(f" SDK: {status.get('sdk', 'unknown')}") + + # Error check + errors = results["checks"].get("errors", {}) + print(f"\nErrors:") + print(f" Detected: {errors.get('count', 0)}") + if errors.get("types"): + print(f" Types: {', '.join(errors['types'])}") + + # Accessibility check + access = results["checks"].get("accessibility", {}) + if access: + print(f"\nAccessibility:") + print(f" Reachable: {access.get('reachable', 'unknown')}") + if access.get("status_code"): + print(f" Status code: {access['status_code']}") + if access.get("error"): + print(f" Error: {access['error']}") + + # Issues + if results["issues"]: + print(f"\nIssues to address:") + for i, issue in enumerate(results["issues"], 1): + print(f" {i}. 
{issue}") + + print("\n" + "=" * 60) + + +def main(): + parser = argparse.ArgumentParser( + description="Monitor HF Space build and runtime status", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Check current status + python monitor_space.py status username/my-space + + # Watch build progress until complete + python monitor_space.py watch username/my-space + + # Get build logs + python monitor_space.py logs username/my-space --type build + + # Get runtime logs + python monitor_space.py logs username/my-space --type runtime + + # Analyze logs for errors + python monitor_space.py analyze-errors username/my-space + + # Run full health check + python monitor_space.py health-check username/my-space + """, + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # status command + status_parser = subparsers.add_parser("status", help="Get current Space status") + status_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + # watch command + watch_parser = subparsers.add_parser("watch", help="Watch build progress") + watch_parser.add_argument("repo_id", help="Space ID (username/space-name)") + watch_parser.add_argument("--interval", type=int, default=10, help="Check interval in seconds") + watch_parser.add_argument("--max-checks", type=int, default=60, help="Max number of checks") + + # logs command + logs_parser = subparsers.add_parser("logs", help="Get Space logs") + logs_parser.add_argument("repo_id", help="Space ID (username/space-name)") + logs_parser.add_argument("--type", choices=["build", "runtime", "both"], default="both", + help="Type of logs to fetch") + logs_parser.add_argument("--lines", type=int, default=200, help="Number of lines") + + # analyze-errors command + analyze_parser = subparsers.add_parser("analyze-errors", help="Analyze logs for errors") + analyze_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + # health-check command + health_parser = subparsers.add_parser("health-check", help="Run comprehensive health check") + health_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + args = parser.parse_args() + + if args.command == "status": + status = get_space_status(args.repo_id) + print(f"\nSpace: {status.repo_id}") + print(f"Stage: {status.stage}") + print(f"Hardware: {status.hardware or 'unknown'}") + print(f"SDK: {status.sdk or 'unknown'}") + print(f"URL: {status.url}") + if status.error_message: + print(f"Error: {status.error_message}") + + elif args.command == "watch": + success = watch_space(args.repo_id, args.interval, args.max_checks) + sys.exit(0 if success else 1) + + elif args.command == "logs": + if args.type in ["build", "both"]: + print("=== BUILD LOGS ===") + print(get_build_logs(args.repo_id, args.lines)) + if args.type in ["runtime", "both"]: + print("\n=== RUNTIME LOGS ===") + print(get_runtime_logs(args.repo_id, args.lines)) + + elif args.command == "analyze-errors": + print(f"Fetching logs for {args.repo_id}...") + logs = get_build_logs(args.repo_id) + "\n" + get_runtime_logs(args.repo_id) + errors = detect_errors(logs) + + if errors: + print_error_analysis(errors) + else: + print("\n✅ No common error patterns detected in logs.") + print(" If issues persist, check logs manually.") + + elif args.command == "health-check": + results = run_health_check(args.repo_id) + print_health_check(results) + sys.exit(0 if results["healthy"] else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/hf-create-a-space/scripts/preflight.py 
b/skills/hf-create-a-space/scripts/preflight.py new file mode 100644 index 0000000..6a8e644 --- /dev/null +++ b/skills/hf-create-a-space/scripts/preflight.py @@ -0,0 +1,554 @@ +#!/usr/bin/env python3 +""" +Pre-flight checks for Hugging Face Space deployment. + +Validates all prerequisites before attempting deployment: +- HF token exists and has write permissions +- User subscription status (PRO required for ZeroGPU hosting) +- Model accessibility (not gated without access) +- Model size estimation for hardware recommendations + +Usage: + python preflight.py check-all username/model-id + python preflight.py check-token + python preflight.py check-subscription + python preflight.py check-model username/model-id + python preflight.py estimate-size username/model-id +""" + +import argparse +import json +import os +import sys +from dataclasses import dataclass +from typing import Optional + +from huggingface_hub import HfApi, hf_hub_download, model_info, whoami +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError, GatedRepoError + + +@dataclass +class TokenCheckResult: + """Result of HF token validation.""" + valid: bool + username: Optional[str] = None + has_write: bool = False + error: Optional[str] = None + + +@dataclass +class SubscriptionCheckResult: + """Result of subscription status check.""" + subscription: str # "free", "pro", "team", "enterprise" + can_host_zerogpu: bool + zerogpu_quota_minutes: int # Daily quota in minutes + error: Optional[str] = None + + +@dataclass +class ModelAccessResult: + """Result of model accessibility check.""" + accessible: bool + exists: bool = True + gated: bool = False + gated_access_granted: bool = False + private: bool = False + error: Optional[str] = None + access_url: Optional[str] = None + + +@dataclass +class ModelSizeResult: + """Result of model size estimation.""" + params_billions: Optional[float] = None + estimated_vram_gb: Optional[float] = None + recommended_hardware: Optional[str] = None + model_type: Optional[str] = None # "full", "adapter", "unknown" + error: Optional[str] = None + + +def check_hf_token() -> TokenCheckResult: + """ + Verify HF token exists and has write permissions. + + Checks both environment variable and cached token file. + """ + try: + user_info = whoami() + + # Check if user has write access by looking at auth info + # The whoami() call succeeds means token is valid + # Write access is typically available unless explicitly restricted + has_write = True # Default assumption for valid tokens + + return TokenCheckResult( + valid=True, + username=user_info.get("name"), + has_write=has_write, + ) + except Exception as e: + error_msg = str(e) + if "401" in error_msg or "Invalid" in error_msg: + return TokenCheckResult( + valid=False, + error="Token is invalid or expired. Run: huggingface-cli login" + ) + elif "token" in error_msg.lower(): + return TokenCheckResult( + valid=False, + error="No HF token found. Run: huggingface-cli login" + ) + else: + return TokenCheckResult( + valid=False, + error=f"Token check failed: {error_msg}" + ) + + +def check_zerogpu_eligibility() -> SubscriptionCheckResult: + """ + Check if user has PRO subscription (required to HOST ZeroGPU Spaces). 
+ + ZeroGPU Hosting Requirements: + - Personal accounts: PRO subscription required + - Organizations: Team or Enterprise plan required + + ZeroGPU Usage Quotas (as visitor): + - Unauthenticated: 2 minutes/day + - Free account: 3.5 minutes/day + - PRO account: 25 minutes/day + - Team/Enterprise: 25-45 minutes/day + """ + try: + user_info = whoami() + + # Check subscription type + # Note: The API may not directly expose subscription status + # We infer from available fields + is_pro = user_info.get("isPro", False) + + # Check organization memberships for team/enterprise + orgs = user_info.get("orgs", []) + has_paid_org = any( + org.get("subscription") in ["team", "enterprise"] + for org in orgs + ) if orgs else False + + if is_pro: + return SubscriptionCheckResult( + subscription="pro", + can_host_zerogpu=True, + zerogpu_quota_minutes=25, + ) + elif has_paid_org: + return SubscriptionCheckResult( + subscription="team", + can_host_zerogpu=True, + zerogpu_quota_minutes=25, + ) + else: + return SubscriptionCheckResult( + subscription="free", + can_host_zerogpu=False, + zerogpu_quota_minutes=3, # 3.5 rounded down + ) + + except Exception as e: + return SubscriptionCheckResult( + subscription="unknown", + can_host_zerogpu=False, + zerogpu_quota_minutes=0, + error=f"Could not check subscription: {e}" + ) + + +def check_model_access(model_id: str) -> ModelAccessResult: + """ + Verify model is accessible (not gated without access, not private without access). + + Returns detailed information about why access might be restricted. + """ + try: + info = model_info(model_id) + + # Check if model is gated + gated = getattr(info, "gated", False) + if gated and gated != "auto": + # Model is gated, check if user has access + # If we got here without error, user has access + return ModelAccessResult( + accessible=True, + gated=True, + gated_access_granted=True, + ) + + # Model is accessible + return ModelAccessResult( + accessible=True, + gated=bool(gated), + private=getattr(info, "private", False), + ) + + except GatedRepoError: + return ModelAccessResult( + accessible=False, + exists=True, + gated=True, + gated_access_granted=False, + error="Model is gated. Request access first.", + access_url=f"https://huggingface.co/{model_id}" + ) + except RepositoryNotFoundError: + return ModelAccessResult( + accessible=False, + exists=False, + error=f"Model '{model_id}' not found. Check the model ID." + ) + except HfHubHTTPError as e: + if "403" in str(e): + return ModelAccessResult( + accessible=False, + exists=True, + private=True, + error="Model is private. You don't have access.", + ) + else: + return ModelAccessResult( + accessible=False, + error=f"Could not access model: {e}" + ) + except Exception as e: + return ModelAccessResult( + accessible=False, + error=f"Error checking model access: {e}" + ) + + +def estimate_model_size(model_id: str) -> ModelSizeResult: + """ + Estimate model size and recommend hardware. + + Downloads config.json to estimate parameters and VRAM requirements. + Also detects if model is a LoRA adapter vs full model. 
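+
+    The parameter count is a rough estimate derived from config.json
+    (embeddings + attention + FFN, roughly 12 * L * H^2 for standard
+    transformer blocks); treat it as an order-of-magnitude figure only.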
+ """ + api = HfApi() + + try: + # List files to detect model type + files = api.list_repo_files(model_id, repo_type="model") + file_names = [f.split("/")[-1] for f in files] + + # Check for adapter files (LoRA/PEFT) + has_adapter_config = "adapter_config.json" in file_names + has_adapter_model = any("adapter_model" in f for f in file_names) + + # Check for full model files + has_full_weights = any( + f in file_names for f in [ + "model.safetensors", + "pytorch_model.bin", + ] + ) or any( + "model-" in f and ".safetensors" in f for f in file_names + ) + + # Determine model type + if has_adapter_config and has_adapter_model and not has_full_weights: + model_type = "adapter" + elif has_full_weights: + model_type = "full" + else: + model_type = "unknown" + + # Try to get config.json for parameter estimation + params_billions = None + try: + config_path = hf_hub_download(model_id, "config.json") + with open(config_path) as f: + config = json.load(f) + + # Different architectures store size differently + # Common patterns: + hidden_size = config.get("hidden_size", config.get("d_model", 0)) + num_layers = config.get("num_hidden_layers", config.get("n_layer", config.get("num_layers", 0))) + vocab_size = config.get("vocab_size", 0) + intermediate_size = config.get("intermediate_size", hidden_size * 4) + + if hidden_size and num_layers: + # Rough estimation formula for transformer models + # Parameters ≈ 12 * L * H^2 (simplified) + # More accurate: embedding + attention + FFN + embedding_params = vocab_size * hidden_size if vocab_size else 0 + attention_params = 4 * hidden_size * hidden_size * num_layers # Q, K, V, O + ffn_params = 2 * hidden_size * intermediate_size * num_layers + total_params = embedding_params + attention_params + ffn_params + + params_billions = total_params / 1e9 + + except Exception: + # Config not available or parsing failed + pass + + # For adapters, also check adapter_config for base model size hint + if model_type == "adapter" and params_billions is None: + try: + adapter_config_path = hf_hub_download(model_id, "adapter_config.json") + with open(adapter_config_path) as f: + adapter_config = json.load(f) + # LoRA adapters are tiny, but base model determines hardware + # Return a flag that base model should be checked + base_model = adapter_config.get("base_model_name_or_path") + if base_model: + # Recursively estimate base model size + base_result = estimate_model_size(base_model) + params_billions = base_result.params_billions + except Exception: + pass + + # Estimate VRAM requirements (rough: 2 bytes per param for fp16) + estimated_vram_gb = None + if params_billions: + # FP16: ~2 bytes per param, plus overhead (~20%) + estimated_vram_gb = params_billions * 2 * 1.2 + + # Recommend hardware based on VRAM + recommended_hardware = _recommend_hardware(params_billions, estimated_vram_gb) + + return ModelSizeResult( + params_billions=round(params_billions, 2) if params_billions else None, + estimated_vram_gb=round(estimated_vram_gb, 1) if estimated_vram_gb else None, + recommended_hardware=recommended_hardware, + model_type=model_type, + ) + + except Exception as e: + return ModelSizeResult( + error=f"Could not estimate model size: {e}" + ) + + +def _recommend_hardware(params_billions: Optional[float], vram_gb: Optional[float]) -> str: + """ + Recommend hardware tier based on model size. 
+ + Hardware VRAM: + - cpu-basic: 0 (CPU only) + - cpu-upgrade: 0 (CPU only, more RAM) + - zero-a10g: 24GB (free with quota, requires PRO) + - t4-small: 16GB ($0.40/hr) + - l4: 24GB ($0.80/hr) + - l40s: 48GB ($1.80/hr) + - a10g-small: 24GB ($1.00/hr) + - a100-large: 80GB ($2.50/hr) + """ + if params_billions is None or vram_gb is None: + return "zero-a10g" # Default to ZeroGPU for unknown sizes + + if params_billions < 0.5: + return "cpu-upgrade" # Small models can run on CPU + elif params_billions < 3: + return "zero-a10g" # ZeroGPU handles up to ~3B comfortably + elif params_billions < 7: + return "l4" # 24GB VRAM for 3-7B models + elif params_billions < 14: + return "l40s" # 48GB VRAM for 7-14B models + elif params_billions < 30: + return "a100-large" # 80GB VRAM for 14-30B models + else: + return "a100-large" # Largest available; may need quantization + + +def run_all_checks(model_id: str) -> dict: + """ + Run all pre-flight checks and return comprehensive results. + + Returns a dict with all check results and a summary. + """ + results = { + "token": check_hf_token(), + "subscription": check_zerogpu_eligibility(), + "model_access": check_model_access(model_id), + "model_size": estimate_model_size(model_id), + } + + # Build summary + issues = [] + warnings = [] + + if not results["token"].valid: + issues.append(f"Token: {results['token'].error}") + + if not results["subscription"].can_host_zerogpu: + warnings.append( + "ZeroGPU hosting requires PRO subscription. " + "Options: upgrade to PRO, use paid GPU, or use Inference API." + ) + + if not results["model_access"].accessible: + issues.append(f"Model access: {results['model_access'].error}") + + results["summary"] = { + "ready": len(issues) == 0, + "issues": issues, + "warnings": warnings, + } + + return results + + +def print_check_results(results: dict): + """Pretty print check results.""" + print("\n" + "=" * 60) + print("PRE-FLIGHT CHECK RESULTS") + print("=" * 60) + + # Token check + token = results["token"] + if token.valid: + print(f"\n✓ Token: Valid (user: {token.username})") + else: + print(f"\n✗ Token: {token.error}") + + # Subscription check + sub = results["subscription"] + if sub.can_host_zerogpu: + print(f"✓ Subscription: {sub.subscription.upper()} (can host ZeroGPU)") + else: + print(f"⚠ Subscription: {sub.subscription.upper()} (cannot host ZeroGPU)") + print(f" Daily ZeroGPU quota as visitor: {sub.zerogpu_quota_minutes} min") + + # Model access check + access = results["model_access"] + if access.accessible: + status = "✓ Model: Accessible" + if access.gated: + status += " (gated, access granted)" + if access.private: + status += " (private)" + print(status) + else: + print(f"✗ Model: {access.error}") + if access.access_url: + print(f" Request access: {access.access_url}") + + # Model size check + size = results["model_size"] + if size.error: + print(f"⚠ Size estimation: {size.error}") + else: + print(f"✓ Model type: {size.model_type}") + if size.params_billions: + print(f" Estimated size: {size.params_billions}B parameters") + print(f" Estimated VRAM: {size.estimated_vram_gb}GB") + print(f" Recommended hardware: {size.recommended_hardware}") + + # Summary + summary = results["summary"] + print("\n" + "-" * 60) + if summary["ready"]: + print("✅ READY FOR DEPLOYMENT") + else: + print("❌ ISSUES MUST BE RESOLVED:") + for issue in summary["issues"]: + print(f" • {issue}") + + if summary["warnings"]: + print("\n⚠️ WARNINGS:") + for warning in summary["warnings"]: + print(f" • {warning}") + + print("=" * 60 + "\n") + + +def 
main(): + parser = argparse.ArgumentParser( + description="Pre-flight checks for HF Space deployment", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run all checks for a model + python preflight.py check-all meta-llama/Llama-3.1-8B-Instruct + + # Check only token status + python preflight.py check-token + + # Check subscription status + python preflight.py check-subscription + + # Check model access + python preflight.py check-model meta-llama/Llama-3.1-8B-Instruct + + # Estimate model size and get hardware recommendation + python preflight.py estimate-size meta-llama/Llama-3.1-8B-Instruct + """, + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # check-all command + all_parser = subparsers.add_parser("check-all", help="Run all pre-flight checks") + all_parser.add_argument("model_id", help="Model ID to check (e.g., username/model)") + + # check-token command + subparsers.add_parser("check-token", help="Check HF token status") + + # check-subscription command + subparsers.add_parser("check-subscription", help="Check subscription status") + + # check-model command + model_parser = subparsers.add_parser("check-model", help="Check model accessibility") + model_parser.add_argument("model_id", help="Model ID to check") + + # estimate-size command + size_parser = subparsers.add_parser("estimate-size", help="Estimate model size") + size_parser.add_argument("model_id", help="Model ID to analyze") + + args = parser.parse_args() + + if args.command == "check-all": + results = run_all_checks(args.model_id) + print_check_results(results) + sys.exit(0 if results["summary"]["ready"] else 1) + + elif args.command == "check-token": + result = check_hf_token() + if result.valid: + print(f"✓ Token valid (user: {result.username})") + else: + print(f"✗ {result.error}") + sys.exit(1) + + elif args.command == "check-subscription": + result = check_zerogpu_eligibility() + print(f"Subscription: {result.subscription}") + print(f"Can host ZeroGPU: {result.can_host_zerogpu}") + print(f"ZeroGPU quota: {result.zerogpu_quota_minutes} min/day") + if result.error: + print(f"Warning: {result.error}") + + elif args.command == "check-model": + result = check_model_access(args.model_id) + if result.accessible: + print(f"✓ Model accessible") + if result.gated: + print(" (gated, access granted)") + else: + print(f"✗ {result.error}") + if result.access_url: + print(f" Request access: {result.access_url}") + sys.exit(1) + + elif args.command == "estimate-size": + result = estimate_model_size(args.model_id) + if result.error: + print(f"Error: {result.error}") + sys.exit(1) + print(f"Model type: {result.model_type}") + if result.params_billions: + print(f"Estimated size: {result.params_billions}B parameters") + print(f"Estimated VRAM: {result.estimated_vram_gb}GB") + print(f"Recommended hardware: {result.recommended_hardware}") + + +if __name__ == "__main__": + main() diff --git a/skills/hf-create-a-space/scripts/remediate.py b/skills/hf-create-a-space/scripts/remediate.py new file mode 100644 index 0000000..912bd14 --- /dev/null +++ b/skills/hf-create-a-space/scripts/remediate.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python3 +""" +Auto-remediate common Hugging Face Space deployment issues. 
+ +Provides automated fixes for common problems: +- Missing packages in requirements.txt +- Hardware mismatches +- Missing secrets +- Version conflicts + +Usage: + python remediate.py fix-requirements username/my-space --add torch transformers + python remediate.py fix-hardware username/my-space --tier zero-a10g + python remediate.py add-secret username/my-space --key HF_TOKEN --value xxx + python remediate.py auto-fix username/my-space +""" + +import argparse +import re +import sys +import tempfile +from pathlib import Path + +from huggingface_hub import ( + HfApi, + hf_hub_download, + upload_file, + add_space_secret, + request_space_hardware, + restart_space, +) + +from monitor_space import get_build_logs, get_runtime_logs, detect_errors, DetectedError + + +# Known package fixes for common import errors +PACKAGE_FIXES = { + "torch": "torch", + "transformers": "transformers>=4.40.0", + "accelerate": "accelerate", + "spaces": "spaces", + "peft": "peft", + "gradio": "gradio>=5.0.0", + "huggingface_hub": "huggingface_hub>=0.26.0", + "sentence_transformers": "sentence-transformers>=2.2.0", + "PIL": "Pillow>=10.0.0", + "cv2": "opencv-python>=4.8.0", + "numpy": "numpy>=1.24.0", + "scipy": "scipy>=1.10.0", + "sklearn": "scikit-learn>=1.3.0", + "pandas": "pandas>=2.0.0", + "matplotlib": "matplotlib>=3.7.0", + "torchaudio": "torchaudio", + "torchvision": "torchvision", + "diffusers": "diffusers>=0.25.0", + "safetensors": "safetensors>=0.4.0", + "bitsandbytes": "bitsandbytes", + "einops": "einops", + "timm": "timm", + "soundfile": "soundfile", + "librosa": "librosa", + "tiktoken": "tiktoken", + "sentencepiece": "sentencepiece", +} + +# Version fixes for known compatibility issues +VERSION_FIXES = { + "gradio_hffolder": { + "gradio": "gradio>=5.0.0", + "huggingface_hub": "huggingface_hub>=0.26.0", + }, +} + + +def get_current_requirements(repo_id: str) -> list[str]: + """Download and parse current requirements.txt from a Space.""" + try: + req_path = hf_hub_download( + repo_id=repo_id, + filename="requirements.txt", + repo_type="space", + ) + with open(req_path) as f: + return [line.strip() for line in f if line.strip() and not line.startswith("#")] + except Exception: + return [] + + +def add_packages_to_requirements( + repo_id: str, + packages: list[str], + dry_run: bool = False, +) -> bool: + """ + Add missing packages to requirements.txt and upload. 
+ + Args: + repo_id: Space repository ID + packages: List of packages to add + dry_run: If True, show what would be changed without making changes + + Returns: + True if successful + """ + current_reqs = get_current_requirements(repo_id) + + # Normalize package names for comparison + def normalize(pkg: str) -> str: + # Extract package name without version specifier + match = re.match(r"([a-zA-Z0-9_-]+)", pkg) + return match.group(1).lower().replace("-", "_") if match else pkg.lower() + + current_normalized = {normalize(r) for r in current_reqs} + + # Find packages to add + packages_to_add = [] + for pkg in packages: + # Map common import names to package names + pkg_spec = PACKAGE_FIXES.get(pkg, pkg) + pkg_name = normalize(pkg_spec) + + if pkg_name not in current_normalized: + packages_to_add.append(pkg_spec) + + if not packages_to_add: + print("All packages already present in requirements.txt") + return True + + # Create new requirements + new_reqs = current_reqs + packages_to_add + new_content = "\n".join(new_reqs) + "\n" + + print(f"\nPackages to add: {', '.join(packages_to_add)}") + print(f"\nNew requirements.txt:") + print("-" * 40) + print(new_content) + print("-" * 40) + + if dry_run: + print("\n[DRY RUN] No changes made.") + return True + + # Upload new requirements.txt + try: + upload_file( + path_or_fileobj=new_content.encode(), + path_in_repo="requirements.txt", + repo_id=repo_id, + repo_type="space", + commit_message=f"Add packages: {', '.join(packages_to_add)}", + ) + print(f"\n✓ Updated requirements.txt") + print(" Space will automatically rebuild.") + return True + except Exception as e: + print(f"\n✗ Failed to update requirements.txt: {e}") + return False + + +def fix_version_conflicts( + repo_id: str, + error_type: str, + dry_run: bool = False, +) -> bool: + """ + Fix known version conflicts in requirements.txt. 
+ + Args: + repo_id: Space repository ID + error_type: Type of version conflict (e.g., "gradio_hffolder") + dry_run: If True, show what would be changed without making changes + + Returns: + True if successful + """ + if error_type not in VERSION_FIXES: + print(f"Unknown version conflict type: {error_type}") + return False + + fixes = VERSION_FIXES[error_type] + current_reqs = get_current_requirements(repo_id) + + # Update package versions + new_reqs = [] + for req in current_reqs: + # Extract package name + match = re.match(r"([a-zA-Z0-9_-]+)", req) + if match: + pkg_name = match.group(1).lower().replace("-", "_") + # Check if this package needs a version fix + for fix_pkg, fix_spec in fixes.items(): + if pkg_name == fix_pkg.lower().replace("-", "_"): + new_reqs.append(fix_spec) + break + else: + new_reqs.append(req) + else: + new_reqs.append(req) + + # Add any missing packages from fixes + current_normalized = {re.match(r"([a-zA-Z0-9_-]+)", r).group(1).lower().replace("-", "_") + for r in new_reqs if re.match(r"([a-zA-Z0-9_-]+)", r)} + + for fix_pkg, fix_spec in fixes.items(): + if fix_pkg.lower().replace("-", "_") not in current_normalized: + new_reqs.append(fix_spec) + + new_content = "\n".join(new_reqs) + "\n" + + print(f"\nApplying version fixes for: {error_type}") + print(f"\nNew requirements.txt:") + print("-" * 40) + print(new_content) + print("-" * 40) + + if dry_run: + print("\n[DRY RUN] No changes made.") + return True + + try: + upload_file( + path_or_fileobj=new_content.encode(), + path_in_repo="requirements.txt", + repo_id=repo_id, + repo_type="space", + commit_message=f"Fix version conflict: {error_type}", + ) + print(f"\n✓ Updated requirements.txt") + return True + except Exception as e: + print(f"\n✗ Failed to update requirements.txt: {e}") + return False + + +def set_hardware(repo_id: str, hardware: str, dry_run: bool = False) -> bool: + """ + Change Space hardware tier. + + Args: + repo_id: Space repository ID + hardware: Hardware tier (e.g., "zero-a10g", "t4-small") + dry_run: If True, show what would be changed without making changes + + Returns: + True if successful + """ + print(f"\nSetting hardware for {repo_id} to: {hardware}") + + if dry_run: + print("[DRY RUN] No changes made.") + return True + + try: + request_space_hardware(repo_id=repo_id, hardware=hardware) + print(f"✓ Hardware updated to {hardware}") + print(" Space will restart with new hardware.") + return True + except Exception as e: + print(f"✗ Failed to update hardware: {e}") + return False + + +def add_secret(repo_id: str, key: str, value: str, dry_run: bool = False) -> bool: + """ + Add or update a Space secret. + + Args: + repo_id: Space repository ID + key: Secret key name + value: Secret value + dry_run: If True, show what would be changed without making changes + + Returns: + True if successful + """ + print(f"\nAdding secret '{key}' to {repo_id}") + + if dry_run: + print("[DRY RUN] No changes made.") + return True + + try: + add_space_secret(repo_id=repo_id, key=key, value=value) + print(f"✓ Secret '{key}' added") + print(f" Access in code: os.environ.get('{key}')") + return True + except Exception as e: + print(f"✗ Failed to add secret: {e}") + return False + + +def auto_fix(repo_id: str, dry_run: bool = False) -> dict: + """ + Automatically detect and fix common issues. + + Analyzes Space logs, detects errors, and applies automatic fixes + for issues that can be resolved programmatically. 
+ + Args: + repo_id: Space repository ID + dry_run: If True, show what would be fixed without making changes + + Returns: + Dict with fix results + """ + results = { + "analyzed": True, + "errors_found": 0, + "fixes_applied": 0, + "fixes_available": 0, + "manual_fixes_needed": [], + } + + print(f"\n{'=' * 60}") + print(f"AUTO-FIX: {repo_id}") + print("=" * 60) + + # Fetch and analyze logs + print("\nFetching logs...") + logs = get_build_logs(repo_id) + "\n" + get_runtime_logs(repo_id) + errors = detect_errors(logs) + + results["errors_found"] = len(errors) + + if not errors: + print("\n✅ No errors detected in logs.") + return results + + print(f"\nFound {len(errors)} error(s)") + + # Group errors by type for deduplication + errors_by_type: dict[str, DetectedError] = {} + for error in errors: + if error.error_type not in errors_by_type: + errors_by_type[error.error_type] = error + + # Process each error type + packages_to_add = [] + + for error_type, error in errors_by_type.items(): + print(f"\n--- {error.description} ---") + + if error.auto_fixable: + results["fixes_available"] += 1 + + if error_type == "module_not_found": + # Extract package name and add to list + if error.match_groups: + pkg = error.match_groups[0] + packages_to_add.append(pkg) + print(f" Will add package: {pkg}") + + elif error_type == "gradio_hffolder": + # Fix version conflict + print(" Will fix Gradio/huggingface_hub versions") + if not dry_run: + if fix_version_conflicts(repo_id, "gradio_hffolder", dry_run): + results["fixes_applied"] += 1 + + elif error_type == "gradio_examples_format": + print(" ⚠️ Examples format needs manual fix in app.py") + print(f" Fix: {error.fix_suggestion}") + results["manual_fixes_needed"].append(error.fix_suggestion) + + elif error_type == "port_in_use": + print(" ⚠️ Port binding needs manual fix in app.py") + print(" Remove explicit port in demo.launch()") + results["manual_fixes_needed"].append(error.fix_suggestion) + + else: + results["manual_fixes_needed"].append(error.fix_suggestion) + print(f" ⚠️ Manual fix required: {error.fix_suggestion}") + + # Apply package additions + if packages_to_add: + print(f"\n--- Adding missing packages ---") + if add_packages_to_requirements(repo_id, packages_to_add, dry_run): + results["fixes_applied"] += len(packages_to_add) + + # Summary + print(f"\n{'=' * 60}") + print("SUMMARY") + print("=" * 60) + print(f"Errors found: {results['errors_found']}") + print(f"Auto-fixes available: {results['fixes_available']}") + print(f"Fixes applied: {results['fixes_applied']}") + + if results["manual_fixes_needed"]: + print(f"\nManual fixes needed ({len(results['manual_fixes_needed'])}):") + for i, fix in enumerate(results["manual_fixes_needed"], 1): + print(f" {i}. 
{fix}") + + if results["fixes_applied"] > 0 and not dry_run: + print("\n✓ Space will automatically rebuild with fixes.") + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Auto-remediate HF Space issues", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Add missing packages + python remediate.py fix-requirements username/my-space --add torch transformers + + # Fix hardware mismatch + python remediate.py fix-hardware username/my-space --tier zero-a10g + + # Add a secret + python remediate.py add-secret username/my-space --key HF_TOKEN --value hf_xxx + + # Auto-detect and fix issues + python remediate.py auto-fix username/my-space + + # Dry run (show what would be fixed) + python remediate.py auto-fix username/my-space --dry-run + """, + ) + + subparsers = parser.add_subparsers(dest="command", required=True) + + # fix-requirements command + req_parser = subparsers.add_parser("fix-requirements", help="Add packages to requirements.txt") + req_parser.add_argument("repo_id", help="Space ID (username/space-name)") + req_parser.add_argument("--add", nargs="+", required=True, help="Packages to add") + req_parser.add_argument("--dry-run", action="store_true", help="Show changes without applying") + + # fix-hardware command + hw_parser = subparsers.add_parser("fix-hardware", help="Change hardware tier") + hw_parser.add_argument("repo_id", help="Space ID (username/space-name)") + hw_parser.add_argument("--tier", required=True, help="Hardware tier") + hw_parser.add_argument("--dry-run", action="store_true", help="Show changes without applying") + + # add-secret command + secret_parser = subparsers.add_parser("add-secret", help="Add a Space secret") + secret_parser.add_argument("repo_id", help="Space ID (username/space-name)") + secret_parser.add_argument("--key", required=True, help="Secret key name") + secret_parser.add_argument("--value", required=True, help="Secret value") + secret_parser.add_argument("--dry-run", action="store_true", help="Show changes without applying") + + # auto-fix command + auto_parser = subparsers.add_parser("auto-fix", help="Auto-detect and fix issues") + auto_parser.add_argument("repo_id", help="Space ID (username/space-name)") + auto_parser.add_argument("--dry-run", action="store_true", help="Show changes without applying") + + # restart command + restart_parser = subparsers.add_parser("restart", help="Restart a Space") + restart_parser.add_argument("repo_id", help="Space ID (username/space-name)") + + args = parser.parse_args() + + if args.command == "fix-requirements": + success = add_packages_to_requirements(args.repo_id, args.add, args.dry_run) + sys.exit(0 if success else 1) + + elif args.command == "fix-hardware": + success = set_hardware(args.repo_id, args.tier, args.dry_run) + sys.exit(0 if success else 1) + + elif args.command == "add-secret": + success = add_secret(args.repo_id, args.key, args.value, args.dry_run) + sys.exit(0 if success else 1) + + elif args.command == "auto-fix": + results = auto_fix(args.repo_id, args.dry_run) + # Exit with error if manual fixes are needed + sys.exit(0 if not results["manual_fixes_needed"] else 1) + + elif args.command == "restart": + try: + restart_space(args.repo_id) + print(f"✓ Space {args.repo_id} restart initiated") + except Exception as e: + print(f"✗ Failed to restart: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/hf-create-a-space/templates/README_template.md b/skills/hf-create-a-space/templates/README_template.md 
new file mode 100644 index 0000000..c0fc034 --- /dev/null +++ b/skills/hf-create-a-space/templates/README_template.md @@ -0,0 +1,38 @@ +--- +title: {{TITLE}} +emoji: {{EMOJI}} +colorFrom: {{COLOR_FROM}} +colorTo: {{COLOR_TO}} +sdk: {{SDK}} +sdk_version: {{SDK_VERSION}} +app_file: app.py +pinned: false +license: {{LICENSE}} +short_description: {{SHORT_DESCRIPTION}} +--- + +# {{TITLE}} + +{{DESCRIPTION}} + +## Features + +- Feature 1 +- Feature 2 +- Feature 3 + +## Usage + +Describe how to use your Space here. + +## Model + +This Space uses [{{MODEL_ID}}](https://huggingface.co/{{MODEL_ID}}). + +## Examples + +Add example inputs and outputs here. + +## License + +{{LICENSE}} diff --git a/skills/hf-create-a-space/templates/README_zerogpu.md b/skills/hf-create-a-space/templates/README_zerogpu.md new file mode 100644 index 0000000..7cf43c9 --- /dev/null +++ b/skills/hf-create-a-space/templates/README_zerogpu.md @@ -0,0 +1,37 @@ +--- +title: {{TITLE}} +emoji: {{EMOJI}} +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 5.9.1 +app_file: app.py +pinned: false +license: apache-2.0 +short_description: {{SHORT_DESCRIPTION}} +suggested_hardware: zero-a10g +--- + +# {{TITLE}} + +{{DESCRIPTION}} + +## Model + +This Space uses [{{MODEL_ID}}](https://huggingface.co/{{MODEL_ID}}). + +## How It Works + +This Space uses **ZeroGPU** - a free GPU allocation system: +- The app runs on CPU by default (free) +- When you send a message, a GPU is allocated on-demand +- After generation completes, the GPU is released +- You get a daily quota of free GPU time + +## Usage + +Simply type your message in the chat box and press Enter! + +## License + +{{LICENSE}} diff --git a/skills/hf-create-a-space/templates/gradio_asr.py b/skills/hf-create-a-space/templates/gradio_asr.py new file mode 100644 index 0000000..bf65930 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_asr.py @@ -0,0 +1,47 @@ +"""Automatic Speech Recognition with {model_id}""" +import gradio as gr +from transformers import pipeline + +MODEL_ID = "{model_id}" + +# Load ASR pipeline +asr = pipeline("automatic-speech-recognition", model=MODEL_ID) + + +def transcribe(audio, return_timestamps): + """Transcribe audio to text.""" + if audio is None: + return "Please upload or record audio." 
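+    # Note: the word-level timestamps below assume the checkpoint's pipeline
+    # supports return_timestamps="word" (e.g. Whisper or CTC models); checkpoints
+    # without timestamp support may raise an error, so fall back to plain text.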
+ + # Run transcription + if return_timestamps: + result = asr(audio, return_timestamps="word") + # Format with timestamps + if "chunks" in result: + lines = [] + for chunk in result["chunks"]: + start = chunk.get("timestamp", [0, 0])[0] + end = chunk.get("timestamp", [0, 0])[1] + text = chunk.get("text", "") + lines.append(f"[{start:.2f}s - {end:.2f}s] {text}") + return "\n".join(lines) + return result.get("text", str(result)) + else: + result = asr(audio) + return result.get("text", str(result)) + + +demo = gr.Interface( + fn=transcribe, + inputs=[ + gr.Audio(type="filepath", label="Upload or Record Audio"), + gr.Checkbox(label="Return word timestamps", value=False), + ], + outputs=gr.Textbox(label="Transcription", lines=10), + title="{title}", + description="Transcribe audio to text using {model_id}", + examples=[], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_audio_class.py b/skills/hf-create-a-space/templates/gradio_audio_class.py new file mode 100644 index 0000000..9566498 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_audio_class.py @@ -0,0 +1,33 @@ +"""Audio Classification with {model_id}""" +import gradio as gr +from transformers import pipeline + +MODEL_ID = "{model_id}" + +# Load audio classification pipeline +classifier = pipeline("audio-classification", model=MODEL_ID) + + +def classify_audio(audio): + """Classify audio into categories.""" + if audio is None: + return {{"error": "Please upload or record audio"}} + + # Run classification + results = classifier(audio) + + # Format as dict for label output + return {{r["label"]: r["score"] for r in results}} + + +demo = gr.Interface( + fn=classify_audio, + inputs=gr.Audio(type="filepath", label="Upload or Record Audio"), + outputs=gr.Label(label="Classification Results", num_top_classes=5), + title="{title}", + description="Classify audio using {model_id}", + examples=[], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_chat.py b/skills/hf-create-a-space/templates/gradio_chat.py new file mode 100644 index 0000000..d5af92f --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_chat.py @@ -0,0 +1,107 @@ +""" +Gradio Chat Interface Template + +A ready-to-use chat interface for conversational models. +Replace MODEL_ID with your model of choice. + +IMPORTANT: For gated models (Llama, Mistral, Gemma, etc.): +1. Accept the model's license on its HuggingFace page +2. Add HF_TOKEN as a Repository Secret in Space Settings +""" + +import os +import gradio as gr +from huggingface_hub import InferenceClient + +# ============================================================================ +# CONFIGURATION - Modify these values +# ============================================================================ +MODEL_ID = "HuggingFaceH4/zephyr-7b-beta" # Change to your model +TITLE = "Chat Demo" +DESCRIPTION = "Chat with an AI assistant powered by Hugging Face." +DEFAULT_SYSTEM_MESSAGE = "You are a helpful, harmless, and honest assistant." + +# ============================================================================ +# APPLICATION CODE - Modify if needed +# ============================================================================ + +# Token required for gated models (Llama, Mistral, Gemma, etc.) 
+# Add HF_TOKEN as a Repository Secret in Space Settings +HF_TOKEN = os.environ.get("HF_TOKEN") +client = InferenceClient(MODEL_ID, token=HF_TOKEN) + + +def respond( + message: str, + history: list[tuple[str, str]], + system_message: str, + max_tokens: int, + temperature: float, + top_p: float, +): + """Generate a streaming response to the user's message.""" + messages = [{"role": "system", "content": system_message}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({"role": "user", "content": user_msg}) + if assistant_msg: + messages.append({"role": "assistant", "content": assistant_msg}) + + messages.append({"role": "user", "content": message}) + + response = "" + for token in client.chat_completion( + messages, + max_tokens=max_tokens, + stream=True, + temperature=temperature, + top_p=top_p, + ): + delta = token.choices[0].delta.content or "" + response += delta + yield response + + +# Build the Gradio interface +demo = gr.ChatInterface( + respond, + title=TITLE, + description=DESCRIPTION, + additional_inputs=[ + gr.Textbox( + value=DEFAULT_SYSTEM_MESSAGE, + label="System message", + lines=2, + ), + gr.Slider( + minimum=1, + maximum=4096, + value=512, + step=1, + label="Max tokens", + ), + gr.Slider( + minimum=0.1, + maximum=2.0, + value=0.7, + step=0.1, + label="Temperature", + ), + gr.Slider( + minimum=0.1, + maximum=1.0, + value=0.95, + step=0.05, + label="Top-p (nucleus sampling)", + ), + ], + examples=[ + ["Hello! How are you today?"], + ["Can you explain quantum computing in simple terms?"], + ["Write a haiku about programming."], + ], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_depth.py b/skills/hf-create-a-space/templates/gradio_depth.py new file mode 100644 index 0000000..1b6cde1 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_depth.py @@ -0,0 +1,58 @@ +"""Depth Estimation with {model_id}""" +import gradio as gr +from transformers import pipeline +import numpy as np +from PIL import Image + +MODEL_ID = "{model_id}" + +# Load depth estimation pipeline +depth_estimator = pipeline("depth-estimation", model=MODEL_ID) + + +def estimate_depth(image, colormap): + """Estimate depth from a single image.""" + if image is None: + return None + + # Run depth estimation + result = depth_estimator(image) + depth = result["depth"] + + # Convert to numpy array + depth_array = np.array(depth) + + # Normalize to 0-255 + depth_normalized = ((depth_array - depth_array.min()) / + (depth_array.max() - depth_array.min()) * 255).astype(np.uint8) + + # Apply colormap + if colormap == "Grayscale": + depth_colored = Image.fromarray(depth_normalized) + else: + import matplotlib.pyplot as plt + cmap = plt.get_cmap(colormap.lower()) + depth_colored = (cmap(depth_normalized / 255.0)[:, :, :3] * 255).astype(np.uint8) + depth_colored = Image.fromarray(depth_colored) + + return depth_colored + + +demo = gr.Interface( + fn=estimate_depth, + inputs=[ + gr.Image(type="pil", label="Upload Image"), + gr.Radio( + choices=["Grayscale", "Viridis", "Plasma", "Inferno", "Magma"], + value="Viridis", + label="Colormap", + ), + ], + outputs=gr.Image(label="Depth Map"), + title="{title}", + description="Estimate depth from a single image using {model_id}. 
Brighter = closer, darker = farther.", + examples=[], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_image_gen.py b/skills/hf-create-a-space/templates/gradio_image_gen.py new file mode 100644 index 0000000..b39adf9 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_image_gen.py @@ -0,0 +1,99 @@ +""" +Gradio Text-to-Image Template + +A ready-to-use image generation interface. +Uses Hugging Face Inference API for serverless generation. +""" + +import gradio as gr +from huggingface_hub import InferenceClient + +# ============================================================================ +# CONFIGURATION - Modify these values +# ============================================================================ +MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0" # Change to your model +TITLE = "Image Generator" +DESCRIPTION = "Generate images from text descriptions using Stable Diffusion." + +# ============================================================================ +# APPLICATION CODE - Modify if needed +# ============================================================================ +client = InferenceClient() + + +def generate( + prompt: str, + negative_prompt: str, + width: int, + height: int, + guidance_scale: float, + num_steps: int, +): + """Generate an image from the prompt.""" + if not prompt.strip(): + return None + + image = client.text_to_image( + prompt, + negative_prompt=negative_prompt if negative_prompt.strip() else None, + model=MODEL_ID, + width=width, + height=height, + guidance_scale=guidance_scale, + num_inference_steps=num_steps, + ) + return image + + +# Build the Gradio interface +with gr.Blocks(title=TITLE) as demo: + gr.Markdown(f"# {TITLE}") + gr.Markdown(DESCRIPTION) + + with gr.Row(): + with gr.Column(scale=1): + prompt = gr.Textbox( + label="Prompt", + placeholder="A majestic castle on a floating island in the sky...", + lines=3, + ) + negative_prompt = gr.Textbox( + label="Negative Prompt", + placeholder="blurry, low quality, distorted, ugly", + lines=2, + ) + + with gr.Row(): + width = gr.Slider(512, 1024, value=1024, step=64, label="Width") + height = gr.Slider(512, 1024, value=1024, step=64, label="Height") + + with gr.Row(): + guidance_scale = gr.Slider( + 1, 20, value=7.5, step=0.5, label="Guidance Scale" + ) + num_steps = gr.Slider(10, 50, value=30, step=1, label="Steps") + + generate_btn = gr.Button("Generate", variant="primary") + + with gr.Column(scale=1): + output_image = gr.Image(label="Generated Image", type="pil") + + # Example prompts + gr.Examples( + examples=[ + ["A serene Japanese garden with cherry blossoms and a koi pond", ""], + ["A cyberpunk cityscape at night with neon lights", "blurry, low quality"], + ["A cozy cabin in a snowy forest with warm light from windows", ""], + ["An astronaut riding a horse on Mars, digital art", "photorealistic"], + ], + inputs=[prompt, negative_prompt], + ) + + generate_btn.click( + generate, + inputs=[prompt, negative_prompt, width, height, guidance_scale, num_steps], + outputs=output_image, + ) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_img2img.py b/skills/hf-create-a-space/templates/gradio_img2img.py new file mode 100644 index 0000000..4f1696c --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_img2img.py @@ -0,0 +1,47 @@ +"""Image-to-Image generation with {model_id}""" +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "{model_id}" +client = 
InferenceClient() + + +def transform_image(image, prompt, negative_prompt, strength, guidance_scale, num_steps): + """Transform an image based on a text prompt.""" + if image is None: + raise gr.Error("Please upload an image") + if not prompt or not prompt.strip(): + raise gr.Error("Please enter a prompt") + + try: + result = client.image_to_image( + image=image, + prompt=prompt, + negative_prompt=negative_prompt or None, + model=MODEL_ID, + strength=strength, + guidance_scale=guidance_scale, + num_inference_steps=num_steps, + ) + return result + except Exception as e: + raise gr.Error(f"Image transformation failed: {e}") + + +demo = gr.Interface( + fn=transform_image, + inputs=[ + gr.Image(type="pil", label="Input Image"), + gr.Textbox(label="Prompt", placeholder="Describe the transformation..."), + gr.Textbox(label="Negative Prompt", placeholder="What to avoid..."), + gr.Slider(0.1, 1.0, value=0.8, step=0.05, label="Strength (how much to change)"), + gr.Slider(1, 20, value=7.5, step=0.5, label="Guidance Scale"), + gr.Slider(10, 50, value=30, step=1, label="Steps"), + ], + outputs=gr.Image(label="Transformed Image"), + title="{title}", + description="Transform images using {model_id}. Higher strength = more change from original.", +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_lora_chat.py b/skills/hf-create-a-space/templates/gradio_lora_chat.py new file mode 100644 index 0000000..5adeb95 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_lora_chat.py @@ -0,0 +1,181 @@ +""" +Gradio Chat Interface with LoRA Adapter + ZeroGPU Template + +Use this template for LoRA/PEFT adapters (models with adapter_config.json). +This is FREE with daily GPU quota on Hugging Face Spaces. + +IMPORTANT: You must identify the base model from adapter_config.json! +Look for the "base_model_name_or_path" field. + +Requirements: +- gradio>=5.0.0 +- torch +- transformers +- accelerate +- spaces +- peft # REQUIRED for LoRA adapters + +README.md must include: suggested_hardware: zero-a10g + +IMPORTANT: Hardware must be set to ZeroGPU in Space Settings after deployment! +""" + +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import PeftModel + +# ============================================================================ +# CONFIGURATION - You MUST set these correctly +# ============================================================================ + +# Your LoRA adapter (the model with adapter_config.json) +ADAPTER_ID = "YOUR_USERNAME/YOUR_LORA_ADAPTER" + +# Base model - FIND THIS in adapter_config.json -> "base_model_name_or_path" +# Example: If adapter_config.json contains: +# "base_model_name_or_path": "Qwen/Qwen2.5-Coder-1.5B-Instruct" +# Then set: +BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct" + +TITLE = "My Fine-Tuned Model" +DESCRIPTION = "LoRA fine-tuned model powered by ZeroGPU (free!)" +DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." 
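+# Optional sketch: BASE_MODEL_ID can also be read from the adapter config instead
+# of being hardcoded (assumes the adapter repo is accessible with the Space token):
+#   import json
+#   from huggingface_hub import hf_hub_download
+#   with open(hf_hub_download(ADAPTER_ID, "adapter_config.json")) as f:
+#       BASE_MODEL_ID = json.load(f)["base_model_name_or_path"]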
+ +# ============================================================================ +# MODEL LOADING - Lazy loading inside GPU context +# ============================================================================ + +# Load tokenizer from adapter (lightweight, no GPU needed) +tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID) + +# Model will be loaded lazily on first request +model = None + + +def load_model(): + """Load and merge LoRA adapter - called inside GPU context.""" + global model + if model is None: + print(f"Loading base model: {BASE_MODEL_ID}") + base_model = AutoModelForCausalLM.from_pretrained( + BASE_MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + + print(f"Applying adapter: {ADAPTER_ID}") + peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID) + + # Merge adapter into base model for faster inference + print("Merging adapter weights...") + model = peft_model.merge_and_unload() + print("Model ready!") + return model + + +# ============================================================================ +# GENERATION FUNCTION - GPU allocated only during this function +# ============================================================================ + +@spaces.GPU(duration=120) # GPU allocated for up to 120 seconds +def generate_response( + message: str, + history: list[tuple[str, str]], + system_message: str, + max_tokens: int, + temperature: float, + top_p: float, +) -> str: + """Generate response - GPU is allocated only during this call.""" + + # Load model on GPU + model = load_model() + + # Build conversation history + messages = [{"role": "system", "content": system_message}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({"role": "user", "content": user_msg}) + if assistant_msg: + messages.append({"role": "assistant", "content": assistant_msg}) + + messages.append({"role": "user", "content": message}) + + # Apply chat template (model-specific formatting) + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + + # Tokenize and move to GPU + inputs = tokenizer([text], return_tensors="pt").to(model.device) + + # Generate response (no streaming with ZeroGPU) + outputs = model.generate( + **inputs, + max_new_tokens=int(max_tokens), + temperature=temperature, + top_p=top_p, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + # Decode only the new tokens (skip the input) + response = tokenizer.decode( + outputs[0][inputs['input_ids'].shape[1]:], + skip_special_tokens=True + ) + + return response + + +# ============================================================================ +# GRADIO INTERFACE +# ============================================================================ + +demo = gr.ChatInterface( + generate_response, + title=TITLE, + description=DESCRIPTION, + additional_inputs=[ + gr.Textbox( + value=DEFAULT_SYSTEM_MESSAGE, + label="System message", + lines=2, + ), + gr.Slider( + minimum=1, + maximum=2048, + value=512, + step=1, + label="Max tokens", + ), + gr.Slider( + minimum=0.1, + maximum=2.0, + value=0.7, + step=0.1, + label="Temperature", + ), + gr.Slider( + minimum=0.1, + maximum=1.0, + value=0.95, + step=0.05, + label="Top-p (nucleus sampling)", + ), + ], + # IMPORTANT: Examples must be nested lists in Gradio 5.x! + examples=[ + ["Hello! 
How are you today?"], + ["Can you help me write a Python function?"], + ["Explain this code to me."], + ], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_object_detection.py b/skills/hf-create-a-space/templates/gradio_object_detection.py new file mode 100644 index 0000000..252776a --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_object_detection.py @@ -0,0 +1,76 @@ +"""Object Detection with {model_id}""" +import gradio as gr +from transformers import pipeline +from PIL import Image, ImageDraw, ImageFont + +MODEL_ID = "{model_id}" + +# Load object detection pipeline +detector = pipeline("object-detection", model=MODEL_ID) + + +def detect_objects(image, threshold): + """Detect objects in an image and draw bounding boxes.""" + if image is None: + return None, "Please upload an image" + + # Run detection + results = detector(image, threshold=threshold) + + if not results: + return image, "No objects detected above threshold" + + # Draw bounding boxes on image + draw = ImageDraw.Draw(image) + + # Generate colors for different labels + labels = list(set(r["label"] for r in results)) + colors = {{}} + for i, label in enumerate(labels): + # Generate distinct colors + hue = i / len(labels) + import colorsys + rgb = colorsys.hsv_to_rgb(hue, 0.8, 0.9) + colors[label] = tuple(int(c * 255) for c in rgb) + + # Draw each detection + detections_text = [] + for result in results: + box = result["box"] + label = result["label"] + score = result["score"] + + # Draw rectangle + draw.rectangle( + [box["xmin"], box["ymin"], box["xmax"], box["ymax"]], + outline=colors[label], + width=3 + ) + + # Draw label + label_text = f"{{label}} ({{score:.2f}})" + draw.text((box["xmin"], box["ymin"] - 15), label_text, fill=colors[label]) + + detections_text.append(f"{{label}}: {{score:.2%}}") + + summary = f"Detected {{len(results)}} object(s):\\n" + "\\n".join(detections_text) + return image, summary + + +demo = gr.Interface( + fn=detect_objects, + inputs=[ + gr.Image(type="pil", label="Upload Image"), + gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Confidence Threshold"), + ], + outputs=[ + gr.Image(label="Detections"), + gr.Textbox(label="Results", lines=5), + ], + title="{title}", + description="Detect objects in images using {model_id}", + examples=[], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_segmentation.py b/skills/hf-create-a-space/templates/gradio_segmentation.py new file mode 100644 index 0000000..8971a02 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_segmentation.py @@ -0,0 +1,80 @@ +"""Image Segmentation with {model_id}""" +import gradio as gr +from transformers import pipeline +from PIL import Image +import numpy as np + +MODEL_ID = "{model_id}" + +# Load segmentation pipeline +segmenter = pipeline("image-segmentation", model=MODEL_ID) + + +def segment_image(image): + """Segment an image into different regions.""" + if image is None: + return None, "Please upload an image" + + # Run segmentation + results = segmenter(image) + + if not results: + return image, "No segments detected" + + # Create colored segmentation mask + # Combine all masks into one visualization + width, height = image.size + combined_mask = np.zeros((height, width, 3), dtype=np.uint8) + + segments_text = [] + for i, result in enumerate(results): + label = result.get("label", f"Segment {{i}}") + score = result.get("score", 1.0) + mask = result.get("mask") + + if mask is not None: + # Convert 
mask to numpy if needed + if hasattr(mask, "numpy"): + mask_array = np.array(mask) + else: + mask_array = np.array(mask) + + # Generate color for this segment + import colorsys + hue = i / max(len(results), 1) + rgb = colorsys.hsv_to_rgb(hue, 0.8, 0.9) + color = tuple(int(c * 255) for c in rgb) + + # Apply color to mask + if len(mask_array.shape) == 2: + for c in range(3): + combined_mask[:, :, c] = np.where( + mask_array > 0, + color[c], + combined_mask[:, :, c] + ) + + segments_text.append(f"{{label}}: {{score:.2%}}") + + # Blend with original image + mask_image = Image.fromarray(combined_mask) + blended = Image.blend(image.convert("RGB"), mask_image, alpha=0.5) + + summary = f"Found {{len(results)}} segment(s):\\n" + "\\n".join(segments_text) + return blended, summary + + +demo = gr.Interface( + fn=segment_image, + inputs=gr.Image(type="pil", label="Upload Image"), + outputs=[ + gr.Image(label="Segmentation"), + gr.Textbox(label="Segments", lines=5), + ], + title="{title}", + description="Segment images into different regions using {model_id}", + examples=[], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_tts.py b/skills/hf-create-a-space/templates/gradio_tts.py new file mode 100644 index 0000000..a1b31ad --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_tts.py @@ -0,0 +1,39 @@ +"""Text-to-Speech with {model_id}""" +import gradio as gr +from huggingface_hub import InferenceClient + +MODEL_ID = "{model_id}" +client = InferenceClient() + + +def synthesize(text): + """Convert text to speech.""" + if not text or not text.strip(): + return None + + try: + # Use Inference API for TTS + audio = client.text_to_speech(text, model=MODEL_ID) + return audio + except Exception as e: + raise gr.Error(f"TTS failed: {e}") + + +demo = gr.Interface( + fn=synthesize, + inputs=gr.Textbox( + label="Text to speak", + placeholder="Enter text to convert to speech...", + lines=3, + ), + outputs=gr.Audio(label="Generated Speech", type="filepath"), + title="{title}", + description="Convert text to speech using {model_id}", + examples=[ + ["Hello! Welcome to this text-to-speech demo."], + ["The quick brown fox jumps over the lazy dog."], + ], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_vqa.py b/skills/hf-create-a-space/templates/gradio_vqa.py new file mode 100644 index 0000000..2c58582 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_vqa.py @@ -0,0 +1,79 @@ +"""Visual Question Answering with {model_id}""" +import gradio as gr +import spaces +import torch +from transformers import AutoProcessor, AutoModelForVision2Seq +from PIL import Image + +MODEL_ID = "{model_id}" + +# Load processor at startup +processor = AutoProcessor.from_pretrained(MODEL_ID) + +# Global model - loaded lazily +model = None + + +def load_model(): + global model + if model is None: + model = AutoModelForVision2Seq.from_pretrained( + MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + return model + + +@spaces.GPU(duration=90) +def answer_question(image, question): + """Answer a question about an image.""" + if image is None: + return "Please upload an image." + if not question or not question.strip(): + return "Please enter a question." 
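+    # Note: the processor(images=..., text=...) call below follows the generic
+    # vision2seq pattern (e.g. BLIP-style checkpoints); chat-template VLMs may
+    # instead require building the prompt with processor.apply_chat_template first.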
+ + model = load_model() + + # Prepare inputs + inputs = processor( + images=image, + text=question, + return_tensors="pt" + ).to(model.device, torch.float16) + + # Generate answer + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=256, + do_sample=False, + ) + + # Decode response + response = processor.decode(outputs[0], skip_special_tokens=True) + + # Remove the question from the response if it's echoed + if question in response: + response = response.replace(question, "").strip() + + return response + + +demo = gr.Interface( + fn=answer_question, + inputs=[ + gr.Image(type="pil", label="Upload Image"), + gr.Textbox(label="Question", placeholder="What do you see in this image?"), + ], + outputs=gr.Textbox(label="Answer", lines=5), + title="{title}", + description="Ask questions about images using {model_id} (powered by ZeroGPU)", + examples=[ + [None, "What objects are in this image?"], + [None, "Describe this image in detail."], + ], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_zero_shot.py b/skills/hf-create-a-space/templates/gradio_zero_shot.py new file mode 100644 index 0000000..2604c96 --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_zero_shot.py @@ -0,0 +1,59 @@ +"""Zero-Shot Classification with {model_id}""" +import gradio as gr +from transformers import pipeline + +MODEL_ID = "{model_id}" + +# Load zero-shot classification pipeline +classifier = pipeline("zero-shot-classification", model=MODEL_ID) + + +def classify(text, labels, multi_label): + """Classify text into custom categories.""" + if not text or not text.strip(): + return {{"error": "Please enter some text"}} + if not labels or not labels.strip(): + return {{"error": "Please enter at least one label"}} + + # Parse labels (comma or newline separated) + label_list = [l.strip() for l in labels.replace("\\n", ",").split(",") if l.strip()] + + if not label_list: + return {{"error": "Please enter valid labels"}} + + # Run classification + result = classifier(text, label_list, multi_label=multi_label) + + # Format as dict for label output + return dict(zip(result["labels"], result["scores"])) + + +demo = gr.Interface( + fn=classify, + inputs=[ + gr.Textbox( + label="Text to classify", + placeholder="Enter the text you want to classify...", + lines=3, + ), + gr.Textbox( + label="Candidate labels", + placeholder="Enter labels (comma or newline separated)\\ne.g., positive, negative, neutral", + lines=3, + ), + gr.Checkbox( + label="Multi-label (text can belong to multiple categories)", + value=False, + ), + ], + outputs=gr.Label(label="Classification Results", num_top_classes=10), + title="{title}", + description="Classify text into any categories you define using {model_id}", + examples=[ + ["I love this product! It's amazing.", "positive, negative, neutral", False], + ["The new policy will affect healthcare and education.", "politics, healthcare, education, sports, technology", True], + ], +) + +if __name__ == "__main__": + demo.launch() diff --git a/skills/hf-create-a-space/templates/gradio_zerogpu_chat.py b/skills/hf-create-a-space/templates/gradio_zerogpu_chat.py new file mode 100644 index 0000000..a4be43f --- /dev/null +++ b/skills/hf-create-a-space/templates/gradio_zerogpu_chat.py @@ -0,0 +1,155 @@ +""" +Gradio Chat Interface with ZeroGPU Template + +Use this template for models that DON'T have Inference API support. +This is FREE with daily GPU quota on Hugging Face Spaces. 
+ +Requirements: +- gradio>=5.0.0 +- torch +- transformers +- accelerate +- spaces + +README.md must include: suggested_hardware: zero-a10g + +IMPORTANT: Hardware must be set to ZeroGPU in Space Settings after deployment! +""" + +import gradio as gr +import spaces +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +# ============================================================================ +# CONFIGURATION - Modify these values +# ============================================================================ +MODEL_ID = "YOUR_USERNAME/YOUR_MODEL" # Your fine-tuned model +TITLE = "My Fine-Tuned Model" +DESCRIPTION = "Chat with my custom model, powered by ZeroGPU (free!)" +DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." + +# ============================================================================ +# MODEL LOADING - Lazy loading inside GPU context +# ============================================================================ +# Load tokenizer at startup (lightweight, no GPU needed) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Model will be loaded lazily on first request +model = None + + +def load_model(): + """Load model - called inside GPU context.""" + global model + if model is None: + model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + torch_dtype=torch.float16, + device_map="auto", + ) + return model + + +# ============================================================================ +# GENERATION FUNCTION - GPU allocated only during this function +# ============================================================================ +@spaces.GPU(duration=120) # GPU allocated for up to 120 seconds +def generate_response( + message: str, + history: list[tuple[str, str]], + system_message: str, + max_tokens: int, + temperature: float, + top_p: float, +) -> str: + """Generate response using the model. 
GPU is allocated only during this call.""" + + # Load model on GPU + model = load_model() + + # Build conversation history + messages = [{"role": "system", "content": system_message}] + + for user_msg, assistant_msg in history: + if user_msg: + messages.append({"role": "user", "content": user_msg}) + if assistant_msg: + messages.append({"role": "assistant", "content": assistant_msg}) + + messages.append({"role": "user", "content": message}) + + # Apply chat template (model-specific formatting) + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + + # Tokenize and move to GPU + inputs = tokenizer([text], return_tensors="pt").to(model.device) + + # Generate response (no streaming with ZeroGPU) + outputs = model.generate( + **inputs, + max_new_tokens=int(max_tokens), + temperature=temperature, + top_p=top_p, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + # Decode only the new tokens (skip the input) + response = tokenizer.decode( + outputs[0][inputs['input_ids'].shape[1]:], + skip_special_tokens=True + ) + + return response + + +# ============================================================================ +# GRADIO INTERFACE +# ============================================================================ +demo = gr.ChatInterface( + generate_response, + title=TITLE, + description=DESCRIPTION, + additional_inputs=[ + gr.Textbox( + value=DEFAULT_SYSTEM_MESSAGE, + label="System message", + lines=2, + ), + gr.Slider( + minimum=1, + maximum=2048, + value=512, + step=1, + label="Max tokens", + ), + gr.Slider( + minimum=0.1, + maximum=2.0, + value=0.7, + step=0.1, + label="Temperature", + ), + gr.Slider( + minimum=0.1, + maximum=1.0, + value=0.95, + step=0.05, + label="Top-p (nucleus sampling)", + ), + ], + # IMPORTANT: Examples must be nested lists in Gradio 5.x! + examples=[ + ["Hello! 
+        ["Can you help me write a Python function?"],
+        ["Explain this code to me."],
+    ],
+)
+
+if __name__ == "__main__":
+    demo.launch()
diff --git a/skills/hf-create-a-space/templates/requirements_inference_api.txt b/skills/hf-create-a-space/templates/requirements_inference_api.txt
new file mode 100644
index 0000000..f26b07c
--- /dev/null
+++ b/skills/hf-create-a-space/templates/requirements_inference_api.txt
@@ -0,0 +1,4 @@
+# For models that support HF Inference API (InferenceClient)
+# Use with: cpu-basic hardware (free)
+gradio>=5.0.0
+huggingface_hub>=0.26.0
diff --git a/skills/hf-create-a-space/templates/requirements_lora.txt b/skills/hf-create-a-space/templates/requirements_lora.txt
new file mode 100644
index 0000000..06f0ade
--- /dev/null
+++ b/skills/hf-create-a-space/templates/requirements_lora.txt
@@ -0,0 +1,8 @@
+# For LoRA/PEFT adapters with ZeroGPU
+# Use with: zero-a10g hardware (free with quota)
+gradio>=5.0.0
+torch
+transformers
+accelerate
+spaces
+peft
diff --git a/skills/hf-create-a-space/templates/requirements_zerogpu.txt b/skills/hf-create-a-space/templates/requirements_zerogpu.txt
new file mode 100644
index 0000000..af3d938
--- /dev/null
+++ b/skills/hf-create-a-space/templates/requirements_zerogpu.txt
@@ -0,0 +1,7 @@
+# For models that need local loading with ZeroGPU
+# Use with: zero-a10g hardware (free with quota)
+gradio>=5.0.0
+torch
+transformers
+accelerate
+spaces
diff --git a/skills/hf-create-a-space/templates/streamlit_app.py b/skills/hf-create-a-space/templates/streamlit_app.py
new file mode 100644
index 0000000..4826dd7
--- /dev/null
+++ b/skills/hf-create-a-space/templates/streamlit_app.py
@@ -0,0 +1,100 @@
+"""
+Streamlit Chat App Template
+
+A ready-to-use chat interface for conversational models using Streamlit.
+Replace MODEL_ID with your model of choice.
+"""
+
+import streamlit as st
+from huggingface_hub import InferenceClient
+
+# ============================================================================
+# CONFIGURATION - Modify these values
+# ============================================================================
+MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"  # Change to your model
+PAGE_TITLE = "Chat Assistant"
+PAGE_ICON = "🤖"
+WELCOME_MESSAGE = "Hello! I'm an AI assistant. How can I help you today?"
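+# Note (assumption, not part of the original template): MODEL_ID must be a
+# model that is actually served by HF's serverless Inference API, otherwise
+# the InferenceClient calls below will fail. An illustrative alternative you
+# could try - availability should be verified first - is, e.g.:
+# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"  # example only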
+
+# ============================================================================
+# PAGE SETUP
+# ============================================================================
+st.set_page_config(
+    page_title=PAGE_TITLE,
+    page_icon=PAGE_ICON,
+    layout="centered",
+)
+
+st.title(f"{PAGE_ICON} {PAGE_TITLE}")
+
+
+# ============================================================================
+# APPLICATION CODE
+# ============================================================================
+@st.cache_resource
+def get_client():
+    """Initialize the Hugging Face client (cached)."""
+    return InferenceClient(MODEL_ID)
+
+
+client = get_client()
+
+# Sidebar for settings
+with st.sidebar:
+    st.header("Settings")
+    system_message = st.text_area(
+        "System Message",
+        value="You are a helpful, harmless, and honest assistant.",
+        height=100,
+    )
+    max_tokens = st.slider("Max Tokens", 50, 2048, 512)
+    temperature = st.slider("Temperature", 0.1, 2.0, 0.7, 0.1)
+    top_p = st.slider("Top-p", 0.1, 1.0, 0.95, 0.05)
+
+    st.divider()
+    if st.button("Clear Chat"):
+        st.session_state.messages = []
+        st.rerun()
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display welcome message if no history
+if not st.session_state.messages:
+    with st.chat_message("assistant"):
+        st.markdown(WELCOME_MESSAGE)
+
+# Display chat history
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Handle user input
+if prompt := st.chat_input("Type your message..."):
+    # Add user message to history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+
+    # Display user message
+    with st.chat_message("user"):
+        st.markdown(prompt)
+
+    # Generate response
+    with st.chat_message("assistant"):
+        # Build messages for API
+        messages = [{"role": "system", "content": system_message}]
+        messages.extend(st.session_state.messages)
+
+        # Request the full response (this template does not stream)
+        with st.spinner("Thinking..."):
+            response = client.chat_completion(
+                messages=messages,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+            )
+            reply = response.choices[0].message.content
+            st.markdown(reply)
+
+    # Add assistant response to history
+    st.session_state.messages.append({"role": "assistant", "content": reply})
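+
+# ----------------------------------------------------------------------------
+# Optional streaming variant (a sketch, not wired into the template above).
+# The template waits for the full reply; if you prefer token-by-token output,
+# something like the following should work, assuming streamlit>=1.31 for
+# st.write_stream. Replace the chat_completion block inside the spinner with:
+#
+#     stream = client.chat_completion(
+#         messages=messages,
+#         max_tokens=max_tokens,
+#         temperature=temperature,
+#         top_p=top_p,
+#         stream=True,
+#     )
+#     reply = st.write_stream(
+#         chunk.choices[0].delta.content or "" for chunk in stream
+#     )
+# ----------------------------------------------------------------------------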