filliptm · millerlight · Jun 24, 2025 · Jun 24, 2025 · Jun 24, 2025 · Jun 25, 2025
diff --git a/FL_Image_Randomizer.py b/FL_Image_Randomizer.py
@@ -0,0 +1,162 @@
+import os
+import numpy as np
+import torch
+from PIL import Image, ImageOps
+import cv2
+import random
+
+class FL_ImageRandomizer:
+    """Pick one image or video file from a directory, indexed by a seed.
+
+    Candidate files are sorted by path and the seed is reduced modulo the
+    number of candidates, so the same seed and directory contents always
+    select the same file.
+    """
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Describe the node inputs: mode, directory, seed and recursion flag."""
+        return {
+            "required": {
+                "mode": (["Image", "Video"], {"default": "Image"}),
+                "directory_path": ("STRING", {"default": ""}),
+                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+                "search_subdirectories": ("BOOLEAN", {"default": False}),
+            }
+        }
+
+    RETURN_TYPES = ("IMAGE", "PATH", "IMAGE", "STRING")
+    RETURN_NAMES = ("image_batch", "selected_path", "image_list", "filename")
+    OUTPUT_IS_LIST = (False, False, True, False)
+    FUNCTION = "select_media"
+    CATEGORY = "🏵️Fill Nodes/Image"
+
+    def select_media(self, mode, directory_path, seed, search_subdirectories=False):
+        """Select a file for ``mode`` and return its pixels, path and name.
+
+        Args:
+            mode: "Image" selects an image; any other value selects a video.
+            directory_path: Directory to search.
+            seed: Non-negative integer used to index the sorted file list.
+            search_subdirectories: If True, also search nested directories.
+
+        Returns:
+            Tuple ``(tensor, selected_path, [tensor], filename)``. The third
+            element wraps the same tensor in a one-element list because
+            that output is declared as a list in ``OUTPUT_IS_LIST``.
+
+        Raises:
+            ValueError: If ``directory_path`` is empty, no matching file is
+                found, or the selected video cannot be read.
+        """
+        if not directory_path:
+            raise ValueError("Directory path is not provided.")
+
+        loader = self.select_image_data if mode == "Image" else self.select_video_data
+        tensor, selected_path = loader(directory_path, seed, search_subdirectories)
+        return (tensor, selected_path, [tensor], os.path.basename(selected_path))
+
+    def select_image_data(self, directory_path, seed, search_subdirectories=False):
+        """Load the seed-selected image as a float32 tensor.
+
+        Returns:
+            Tuple ``(image_tensor, path)``; the tensor has a leading batch
+            axis of size 1 and RGB values scaled to the 0..1 range.
+
+        Raises:
+            ValueError: If no supported image file is found.
+        """
+        images = self.load_files(directory_path, search_subdirectories, file_type="image")
+        if not images:
+            raise ValueError("No images found in the specified directory.")
+
+        selected_image_path = images[seed % len(images)]
+
+        # The context manager closes the file handle once the pixels have
+        # been copied into the array.
+        with Image.open(selected_image_path) as image:
+            rgb = ImageOps.exif_transpose(image).convert("RGB")
+            image_np = np.array(rgb).astype(np.float32) / 255.0
+
+        return torch.from_numpy(image_np)[None,], selected_image_path
+
+    def select_video_data(self, directory_path, seed, search_subdirectories=False):
+        """Decode every frame of the seed-selected video into a float32 tensor.
+
+        Returns:
+            Tuple ``(frames_tensor, path)``; frames are stacked along the
+            first axis, converted from BGR to RGB and scaled to 0..1.
+
+        Raises:
+            ValueError: If no supported video is found, the file cannot be
+                opened, or no frames can be read from it.
+        """
+        videos = self.load_files(directory_path, search_subdirectories, file_type="video")
+        if not videos:
+            raise ValueError("No videos found in the specified directory.")
+
+        selected_video_path = videos[seed % len(videos)]
+
+        cap = cv2.VideoCapture(selected_video_path)
+        try:
+            if not cap.isOpened():
+                raise ValueError(f"Could not open video file: {selected_video_path}")
+
+            if int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) <= 0:
+                raise ValueError(f"No frames found in video: {selected_video_path}")
+
+            frames = []
+            while True:
+                success, frame = cap.read()
+                if not success:
+                    break
+                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        finally:
+            # Release the capture on every path, including the early raises.
+            cap.release()
+
+        if not frames:
+            raise ValueError(f"Failed to extract frames from video: {selected_video_path}")
+
+        # Stack the uint8 frames and convert once, rather than keeping a
+        # float32 copy of every frame alongside the stacked array.
+        frames_np = np.stack(frames).astype(np.float32) / 255.0
+        return torch.from_numpy(frames_np), selected_video_path
+
+    def load_files(self, directory, search_subdirectories=False, file_type="image"):
+        """Return the sorted paths of supported media files in ``directory``.
+
+        Args:
+            directory: Directory to scan.
+            search_subdirectories: If True, walk nested directories as well.
+            file_type: "image" selects the image extensions; any other value
+                selects the video extensions.
+
+        Returns:
+            Sorted list of matching file paths.
+        """
+        if file_type == "image":
+            supported_formats = ("jpg", "jpeg", "png", "bmp", "gif", "webp")
+        else:  # video
+            supported_formats = ("mp4", "avi", "mov", "mkv", "wmv", "webm")
+
+        def is_supported(name):
+            # splitext only reports a real extension, so a file literally
+            # named "png" (no dot) is not mistaken for an image.
+            return os.path.splitext(name)[1][1:].lower() in supported_formats
+
+        if search_subdirectories:
+            file_paths = [
+                os.path.join(root, name)
+                for root, _, names in os.walk(directory)
+                for name in names
+                if is_supported(name)
+            ]
+        else:
+            file_paths = [
+                os.path.join(directory, name)
+                for name in os.listdir(directory)
+                if is_supported(name) and os.path.isfile(os.path.join(directory, name))
+            ]
+
+        return sorted(file_paths)
diff --git a/README.md b/README.md
@@ -1,5 +1,12 @@
 # ComfyUI_Fill-ChatterBox
 
+If you enjoy this project, consider supporting me on Patreon!
+<p align="left">
+  <a href="https://www.patreon.com/c/Machinedelusions">
+    <img src="assets/Patreon.png" width="150px" alt="Patreon">
+  </a>
+</p>
+
 A custom node extension for ComfyUI that adds text-to-speech (TTS) and voice conversion (VC) capabilities using the Chatterbox library.
 Supports a MAXIMUM of 40 seconds. Iv tried removing this limitation, but the model falls apart really badly with anything longer than that, so it remains.
 
@@ -18,6 +25,12 @@ Supports a MAXIMUM of 40 seconds. Iv tried removing this limitation, but the mod
    pip install -r ComfyUI_Fill-ChatterBox/requirements.txt
    ```
 
+3. (Optional) Install watermarking support:
+   ```bash
+   pip install resemble-perth
+   ```
+   **Note**: The `resemble-perth` package may have compatibility issues with Python 3.12+. If you encounter import errors, the nodes will still function without watermarking.
+
 
 ## Usage
 
@@ -33,7 +46,15 @@ Supports a MAXIMUM of 40 seconds. Iv tried removing this limitation, but the mod
 
 ## Change Log
 
+### 6/24/2025
+- Added seed parameter to both TTS and VC nodes for reproducible generation
+- Seed range: 0 to 4,294,967,295 (unsigned 32-bit integer)
+- Enables consistent audio output for debugging and workflow control
+- Made Perth watermarking optional to fix Python 3.12+ compatibility issues
+- Nodes now function without watermarking if resemble-perth import fails
+
 ### 5/31/2025
 - Added Persistent model loading, and loading bar functionality
 - Added Mac support (needs to be tested so HMU)
 - removed the chatterbox-tts library and implemented native inference code.
+
diff --git a/assets/Patreon.png b/assets/Patreon.png
diff --git a/chatterbox_node.py b/chatterbox_node.py
@@ -57,6 +57,7 @@ def INPUT_TYPES(cls):
                 "exaggeration": ("FLOAT", {"default": 0.5, "min": 0.25, "max": 2.0, "step": 0.05}),
                 "cfg_weight": ("FLOAT", {"default": 0.5, "min": 0.2, "max": 1.0, "step": 0.05}),
                 "temperature": ("FLOAT", {"default": 0.8, "min": 0.05, "max": 5.0, "step": 0.05}),
+                "seed": ("INT", {"default": 0, "min": 0, "max": 4294967295}),
             },
             "optional": {
                 "audio_prompt": ("AUDIO",),
@@ -70,7 +71,7 @@ def INPUT_TYPES(cls):
     FUNCTION = "generate_speech"
     CATEGORY = "ChatterBox"
 
-    def generate_speech(self, text, exaggeration, cfg_weight, temperature, audio_prompt=None, use_cpu=False, keep_model_loaded=False):
+    def generate_speech(self, text, exaggeration, cfg_weight, temperature, seed, audio_prompt=None, use_cpu=False, keep_model_loaded=False):
         """
         Generate speech from text.
 
@@ -79,13 +80,25 @@ def generate_speech(self, text, exaggeration, cfg_weight, temperature, audio_pro
             exaggeration: Controls emotion intensity (0.25-2.0).
             cfg_weight: Controls pace/classifier-free guidance (0.2-1.0).
             temperature: Controls randomness in generation (0.05-5.0).
+            seed: Random seed for reproducible generation.
             audio_prompt: AUDIO object containing the reference voice for TTS voice cloning.
             use_cpu: If True, forces CPU usage even if CUDA is available.
             keep_model_loaded: If True, keeps the model loaded in memory after generation.
 
         Returns:
             Tuple of (audio, message)
         """
+        # Set random seeds for reproducibility
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+        if torch.backends.mps.is_available():
+            torch.mps.manual_seed(seed)
+        import numpy as np
+        import random
+        np.random.seed(seed)
+        random.seed(seed)
         # Determine device to use
         device = "cpu" if use_cpu else ("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
         if use_cpu:
@@ -291,6 +304,7 @@ def INPUT_TYPES(cls):
             "required": {
                 "input_audio": ("AUDIO",),
                 "target_voice": ("AUDIO",),
+                "seed": ("INT", {"default": 0, "min": 0, "max": 4294967295}),
             },
             "optional": {
                 "use_cpu": ("BOOLEAN", {"default": False}),
@@ -303,19 +317,31 @@ def INPUT_TYPES(cls):
     FUNCTION = "convert_voice"
     CATEGORY = "ChatterBox"
 
-    def convert_voice(self, input_audio, target_voice, use_cpu=False, keep_model_loaded=False):
+    def convert_voice(self, input_audio, target_voice, seed, use_cpu=False, keep_model_loaded=False):
         """
         Convert the voice in an audio file to match a target voice.
 
         Args:
             input_audio: AUDIO object containing the audio to convert.
             target_voice: AUDIO object containing the target voice.
+            seed: Random seed for reproducible generation.
             use_cpu: If True, forces CPU usage even if CUDA is available.
             keep_model_loaded: If True, keeps the model loaded in memory after conversion.
 
         Returns:
             Tuple of (audio, message)
         """
+        # Set random seeds for reproducibility
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+        if torch.backends.mps.is_available():
+            torch.mps.manual_seed(seed)
+        import numpy as np
+        import random
+        np.random.seed(seed)
+        random.seed(seed)
         # Determine device to use
         device = "cpu" if use_cpu else ("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
         if use_cpu:

diff --git a/local_chatterbox/chatterbox/models/t3/inference/alignment_stream_analyzer.py b/local_chatterbox/chatterbox/models/t3/inference/alignment_stream_analyzer.py
@@ -71,7 +71,11 @@ def attention_forward_hook(module, input, output):
             - When `output_attentions=True`, `LlamaSdpaAttention.forward` calls `LlamaAttention.forward`.
             - `attn_output` has shape [B, H, T0, T0] for the 0th entry, and [B, H, 1, T0+i] for the rest i-th.
             """
-            step_attention = output[1].cpu() # (B, 16, N, N)
+            if output[1] is None:
+                print("⚠️ [DEBUG] Attention output is None – skipping hook.")
+                return
+            step_attention = output[1].cpu()
+
             self.last_aligned_attn = step_attention[0].mean(0) # (N, N)
 
         target_layer = tfmr.layers[alignment_layer_idx].self_attn

diff --git a/local_chatterbox/chatterbox/tts.py b/local_chatterbox/chatterbox/tts.py
@@ -3,8 +3,15 @@
 
 import librosa
 import torch
-import perth
 import torch.nn.functional as F
+
+# Optional Perth watermarking - gracefully handle import failure
+try:
+    import perth
+    PERTH_AVAILABLE = True
+except ImportError:
+    PERTH_AVAILABLE = False
+    print("Warning: Perth watermarking not available. Audio will be generated without watermarking.")
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 
@@ -123,7 +130,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.device = device
         self.conds = conds
-        self.watermarker = perth.PerthImplicitWatermarker()
+        self.watermarker = perth.PerthImplicitWatermarker() if PERTH_AVAILABLE else None
 
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -259,5 +266,8 @@ def generate(
                 ref_dict=self.conds.gen,
             )
             wav = wav.squeeze(0).detach().cpu().numpy()
-            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
-        return torch.from_numpy(watermarked_wav).unsqueeze(0)
+            if self.watermarker is not None:
+                watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
+                return torch.from_numpy(watermarked_wav).unsqueeze(0)
+            else:
+                return torch.from_numpy(wav).unsqueeze(0)
diff --git a/local_chatterbox/chatterbox/vc.py b/local_chatterbox/chatterbox/vc.py
@@ -2,9 +2,16 @@
 
 import librosa
 import torch
-import perth
 from huggingface_hub import hf_hub_download
 
+# Optional Perth watermarking - gracefully handle import failure
+try:
+    import perth
+    PERTH_AVAILABLE = True
+except ImportError:
+    PERTH_AVAILABLE = False
+    print("Warning: Perth watermarking not available. Audio will be generated without watermarking.")
+
 from .models.s3tokenizer import S3_SR
 from .models.s3gen import S3GEN_SR, S3Gen
 
@@ -25,7 +32,7 @@ def __init__(
         self.sr = S3GEN_SR
         self.s3gen = s3gen
         self.device = device
-        self.watermarker = perth.PerthImplicitWatermarker()
+        self.watermarker = perth.PerthImplicitWatermarker() if PERTH_AVAILABLE else None
         if ref_dict is None:
             self.ref_dict = None
         else:
@@ -99,5 +106,8 @@ def generate(
                 ref_dict=self.ref_dict,
             )
             wav = wav.squeeze(0).detach().cpu().numpy()
-            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
-        return torch.from_numpy(watermarked_wav).unsqueeze(0)
+            if self.watermarker is not None:
+                watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
+                return torch.from_numpy(watermarked_wav).unsqueeze(0)
+            else:
+                return torch.from_numpy(wav).unsqueeze(0)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui_fill-chatterbox"
 description = "Voice Clone and TTS model."
-version = "1.0.0"
+version = "1.0.1"
 license = "LICENSE"
 dependencies = ["numpy", "resampy", "librosa", "s3tokenizer", "transformers", "diffusers", "resemble-perth", "omegaconf", "conformer"]
 

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,8 @@ librosa
 s3tokenizer
 transformers
 diffusers
-resemble-perth
 omegaconf
 conformer
-safetensors
+safetensors
+# Optional watermarking (may have Python 3.12+ compatibility issues)
+# resemble-perth