Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions FL_Image_Randomizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
import numpy as np
import torch
from PIL import Image, ImageOps
import cv2
import random

class FL_ImageRandomizer:
    """ComfyUI node that deterministically picks one image or video from a directory.

    The candidate files are sorted by path and the one at index
    ``seed % number_of_candidates`` is loaded, so the same seed over the same
    directory contents always selects the same file.
    """

    # Accepted extensions (lower-case, without the leading dot) per media type.
    _IMAGE_EXTENSIONS = ("jpg", "jpeg", "png", "bmp", "gif", "webp")
    _VIDEO_EXTENSIONS = ("mp4", "avi", "mov", "mkv", "wmv", "webm")

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the node's input sockets/widgets."""
        return {
            "required": {
                "mode": (["Image", "Video"], {"default": "Image"}),
                "directory_path": ("STRING", {"default": ""}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                "search_subdirectories": ("BOOLEAN", {"default": False}),
            }
        }

    RETURN_TYPES = ("IMAGE", "PATH", "IMAGE", "STRING")
    RETURN_NAMES = ("image_batch", "selected_path", "image_list", "filename")
    # Only the third output ("image_list") is flagged as a list output.
    OUTPUT_IS_LIST = (False, False, True, False)
    FUNCTION = "select_media"
    CATEGORY = "🏵️Fill Nodes/Image"

    def select_media(self, mode, directory_path, seed, search_subdirectories=False):
        """Select and load one media file from ``directory_path``.

        Args:
            mode: ``"Image"`` to pick an image; any other value picks a video.
            directory_path: Directory to search. Must be non-empty.
            seed: Non-negative integer; the file at ``seed % count`` is chosen.
            search_subdirectories: If True, walk the directory recursively.

        Returns:
            Tuple ``(tensor, selected_path, [tensor], filename)`` where
            ``tensor`` is the loaded image (with a leading batch dimension) or
            the stacked video frames, and ``filename`` is the base name of
            ``selected_path``.

        Raises:
            ValueError: If ``directory_path`` is empty, no matching file is
                found, or the selected video cannot be read.
        """
        if not directory_path:
            raise ValueError("Directory path is not provided.")

        if mode == "Image":
            loader = self.select_image_data
        else:  # Video mode
            loader = self.select_video_data

        tensor, selected_path = loader(directory_path, seed, search_subdirectories)
        filename = os.path.basename(selected_path)
        # Video frames are already a batch; they are still wrapped in a
        # one-element list so both modes return the same output structure.
        return (tensor, selected_path, [tensor], filename)

    def select_image_data(self, directory_path, seed, search_subdirectories=False):
        """Load the seed-selected image as a float32 tensor scaled to [0, 1].

        Returns:
            Tuple ``(image_tensor, selected_image_path)``; the tensor is the
            RGB pixel array with one extra leading dimension.

        Raises:
            ValueError: If no image files are found.
        """
        images = self.load_files(directory_path, search_subdirectories, file_type="image")
        if not images:
            raise ValueError("No images found in the specified directory.")

        selected_image_path = images[seed % len(images)]

        # Use a context manager so the underlying file handle is closed once
        # the pixel data has been copied into the numpy array.
        with Image.open(selected_image_path) as source:
            # Apply the EXIF orientation before converting, so the pixels
            # match how viewers display the picture.
            image = ImageOps.exif_transpose(source).convert("RGB")
            image_np = np.array(image).astype(np.float32) / 255.0

        image_tensor = torch.from_numpy(image_np)[None,]
        return image_tensor, selected_image_path

    def select_video_data(self, directory_path, seed, search_subdirectories=False):
        """Load every frame of the seed-selected video as one float32 tensor.

        Frames are converted from OpenCV's BGR order to RGB and scaled to
        [0, 1] before being stacked along a new leading axis.

        Returns:
            Tuple ``(frames_tensor, selected_video_path)``.

        Raises:
            ValueError: If no video files are found, the file cannot be
                opened, it reports no frames, or no frame could be decoded.
        """
        videos = self.load_files(directory_path, search_subdirectories, file_type="video")
        if not videos:
            raise ValueError("No videos found in the specified directory.")

        selected_video_path = videos[seed % len(videos)]

        cap = cv2.VideoCapture(selected_video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Could not open video file: {selected_video_path}")

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count <= 0:
                raise ValueError(f"No frames found in video: {selected_video_path}")

            frames = []
            while True:
                success, frame = cap.read()
                if not success:
                    break
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(np.array(rgb).astype(np.float32) / 255.0)
        finally:
            # Always release the capture handle, including on the error
            # paths above, so the file is not left open.
            cap.release()

        if not frames:
            raise ValueError(f"Failed to extract frames from video: {selected_video_path}")

        frames_tensor = torch.from_numpy(np.stack(frames))
        return frames_tensor, selected_video_path

    def load_files(self, directory, search_subdirectories=False, file_type="image"):
        """Return the sorted paths of supported media files under ``directory``.

        Args:
            directory: Directory to scan.
            search_subdirectories: If True, include files from all nested
                directories; otherwise only direct children are considered.
            file_type: ``"image"`` for image extensions; any other value
                selects the video extensions.

        Returns:
            Sorted list of file paths whose extension (case-insensitive)
            is in the supported set.
        """
        if file_type == "image":
            supported_formats = self._IMAGE_EXTENSIONS
        else:  # video
            supported_formats = self._VIDEO_EXTENSIONS

        def has_supported_extension(name):
            # Require an actual "." separator: a file literally named "png"
            # or "mp4" has no extension and must not be treated as media.
            _, dot, extension = name.rpartition(".")
            return bool(dot) and extension.lower() in supported_formats

        if search_subdirectories:
            file_paths = [
                os.path.join(root, name)
                for root, _, names in os.walk(directory)
                for name in names
                if has_supported_extension(name)
            ]
        else:
            file_paths = [
                os.path.join(directory, name)
                for name in os.listdir(directory)
                if os.path.isfile(os.path.join(directory, name))
                and has_supported_extension(name)
            ]

        return sorted(file_paths)
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# ComfyUI_Fill-ChatterBox

If you enjoy this project, consider supporting me on Patreon!
<p align="left">
<a href="https://www.patreon.com/c/Machinedelusions">
<img src="assets/Patreon.png" width="150px" alt="Patreon">
</a>
</p>

A custom node extension for ComfyUI that adds text-to-speech (TTS) and voice conversion (VC) capabilities using the Chatterbox library.
Supports a MAXIMUM of 40 seconds. I've tried removing this limitation, but the model falls apart really badly with anything longer than that, so it remains.

Expand All @@ -18,6 +25,12 @@ Supports a MAXIMUM of 40 seconds. Iv tried removing this limitation, but the mod
pip install -r ComfyUI_Fill-ChatterBox/requirements.txt
```

3. (Optional) Install watermarking support:
```bash
pip install resemble-perth
```
**Note**: The `resemble-perth` package may have compatibility issues with Python 3.12+. If you encounter import errors, the nodes will still function without watermarking.


## Usage

Expand All @@ -33,7 +46,15 @@ Supports a MAXIMUM of 40 seconds. Iv tried removing this limitation, but the mod

## Change Log

### 6/24/2025
- Added seed parameter to both TTS and VC nodes for reproducible generation
- Seed range: 0 to 4,294,967,295 (32-bit integer)
- Enables consistent audio output for debugging and workflow control
- Made Perth watermarking optional to fix Python 3.12+ compatibility issues
- Nodes now function without watermarking if resemble-perth import fails

### 5/31/2025
- Added persistent model loading and loading bar functionality
- Added Mac support (needs to be tested so HMU)
- Removed the chatterbox-tts library and implemented native inference code.

Binary file added assets/Patreon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 28 additions & 2 deletions chatterbox_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def INPUT_TYPES(cls):
"exaggeration": ("FLOAT", {"default": 0.5, "min": 0.25, "max": 2.0, "step": 0.05}),
"cfg_weight": ("FLOAT", {"default": 0.5, "min": 0.2, "max": 1.0, "step": 0.05}),
"temperature": ("FLOAT", {"default": 0.8, "min": 0.05, "max": 5.0, "step": 0.05}),
"seed": ("INT", {"default": 0, "min": 0, "max": 4294967295}),
},
"optional": {
"audio_prompt": ("AUDIO",),
Expand All @@ -70,7 +71,7 @@ def INPUT_TYPES(cls):
FUNCTION = "generate_speech"
CATEGORY = "ChatterBox"

def generate_speech(self, text, exaggeration, cfg_weight, temperature, audio_prompt=None, use_cpu=False, keep_model_loaded=False):
def generate_speech(self, text, exaggeration, cfg_weight, temperature, seed, audio_prompt=None, use_cpu=False, keep_model_loaded=False):
"""
Generate speech from text.

Expand All @@ -79,13 +80,25 @@ def generate_speech(self, text, exaggeration, cfg_weight, temperature, audio_pro
exaggeration: Controls emotion intensity (0.25-2.0).
cfg_weight: Controls pace/classifier-free guidance (0.2-1.0).
temperature: Controls randomness in generation (0.05-5.0).
seed: Random seed for reproducible generation.
audio_prompt: AUDIO object containing the reference voice for TTS voice cloning.
use_cpu: If True, forces CPU usage even if CUDA is available.
keep_model_loaded: If True, keeps the model loaded in memory after generation.

Returns:
Tuple of (audio, message)
"""
# Set random seeds for reproducibility
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if torch.backends.mps.is_available():
torch.mps.manual_seed(seed)
import numpy as np
import random
np.random.seed(seed)
random.seed(seed)
# Determine device to use
device = "cpu" if use_cpu else ("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
if use_cpu:
Expand Down Expand Up @@ -291,6 +304,7 @@ def INPUT_TYPES(cls):
"required": {
"input_audio": ("AUDIO",),
"target_voice": ("AUDIO",),
"seed": ("INT", {"default": 0, "min": 0, "max": 4294967295}),
},
"optional": {
"use_cpu": ("BOOLEAN", {"default": False}),
Expand All @@ -303,19 +317,31 @@ def INPUT_TYPES(cls):
FUNCTION = "convert_voice"
CATEGORY = "ChatterBox"

def convert_voice(self, input_audio, target_voice, use_cpu=False, keep_model_loaded=False):
def convert_voice(self, input_audio, target_voice, seed, use_cpu=False, keep_model_loaded=False):
"""
Convert the voice in an audio file to match a target voice.

Args:
input_audio: AUDIO object containing the audio to convert.
target_voice: AUDIO object containing the target voice.
seed: Random seed for reproducible generation.
use_cpu: If True, forces CPU usage even if CUDA is available.
keep_model_loaded: If True, keeps the model loaded in memory after conversion.

Returns:
Tuple of (audio, message)
"""
# Set random seeds for reproducibility
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if torch.backends.mps.is_available():
torch.mps.manual_seed(seed)
import numpy as np
import random
np.random.seed(seed)
random.seed(seed)
# Determine device to use
device = "cpu" if use_cpu else ("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
if use_cpu:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@ def attention_forward_hook(module, input, output):
- When `output_attentions=True`, `LlamaSdpaAttention.forward` calls `LlamaAttention.forward`.
- `attn_output` has shape [B, H, T0, T0] for the 0th entry, and [B, H, 1, T0+i] for the rest i-th.
"""
step_attention = output[1].cpu() # (B, 16, N, N)
if output[1] is None:
print("⚠️ [DEBUG] Attention output is None – skipping hook.")
return
step_attention = output[1].cpu()

self.last_aligned_attn = step_attention[0].mean(0) # (N, N)

target_layer = tfmr.layers[alignment_layer_idx].self_attn
Expand Down
18 changes: 14 additions & 4 deletions local_chatterbox/chatterbox/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,15 @@

import librosa
import torch
import perth
import torch.nn.functional as F

# Optional Perth watermarking - gracefully handle import failure
try:
import perth
PERTH_AVAILABLE = True
except ImportError:
PERTH_AVAILABLE = False
print("Warning: Perth watermarking not available. Audio will be generated without watermarking.")
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

Expand Down Expand Up @@ -123,7 +130,7 @@ def __init__(
self.tokenizer = tokenizer
self.device = device
self.conds = conds
self.watermarker = perth.PerthImplicitWatermarker()
self.watermarker = perth.PerthImplicitWatermarker() if PERTH_AVAILABLE else None

@classmethod
def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
Expand Down Expand Up @@ -259,5 +266,8 @@ def generate(
ref_dict=self.conds.gen,
)
wav = wav.squeeze(0).detach().cpu().numpy()
watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
return torch.from_numpy(watermarked_wav).unsqueeze(0)
if self.watermarker is not None:
watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
return torch.from_numpy(watermarked_wav).unsqueeze(0)
else:
return torch.from_numpy(wav).unsqueeze(0)
18 changes: 14 additions & 4 deletions local_chatterbox/chatterbox/vc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@

import librosa
import torch
import perth
from huggingface_hub import hf_hub_download

# Optional Perth watermarking - gracefully handle import failure
try:
import perth
PERTH_AVAILABLE = True
except ImportError:
PERTH_AVAILABLE = False
print("Warning: Perth watermarking not available. Audio will be generated without watermarking.")

from .models.s3tokenizer import S3_SR
from .models.s3gen import S3GEN_SR, S3Gen

Expand All @@ -25,7 +32,7 @@ def __init__(
self.sr = S3GEN_SR
self.s3gen = s3gen
self.device = device
self.watermarker = perth.PerthImplicitWatermarker()
self.watermarker = perth.PerthImplicitWatermarker() if PERTH_AVAILABLE else None
if ref_dict is None:
self.ref_dict = None
else:
Expand Down Expand Up @@ -99,5 +106,8 @@ def generate(
ref_dict=self.ref_dict,
)
wav = wav.squeeze(0).detach().cpu().numpy()
watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
return torch.from_numpy(watermarked_wav).unsqueeze(0)
if self.watermarker is not None:
watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
return torch.from_numpy(watermarked_wav).unsqueeze(0)
else:
return torch.from_numpy(wav).unsqueeze(0)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "comfyui_fill-chatterbox"
description = "Voice Clone and TTS model."
version = "1.0.0"
version = "1.0.1"
license = "LICENSE"
dependencies = ["numpy", "resampy", "librosa", "s3tokenizer", "transformers", "diffusers", "resemble-perth", "omegaconf", "conformer"]

Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ librosa
s3tokenizer
transformers
diffusers
resemble-perth
omegaconf
conformer
safetensors
safetensors
# Optional watermarking (may have Python 3.12+ compatibility issues)
# resemble-perth