Skip to content

The model doesn't quite work well it seems #17

@codename0og

Description

@codename0og

Yo ~ Currently doing some tests in terms of integrating LavaSR in my rvc-based fork ( voice conversion stuff. )

Original output from inference ( In this particular case, doing tests on 32khz models. (( from available 24, 32, 40 and 48)) )
Image

After LavaSR v2 ( input sr set to 16khz )
Image

After LavaSR v2 ( input sr set to 32khz )
Image

The resulting audio quality is... pretty bad, to put it lightly.
Too much of the vibrance and "aspiration" quality is gone.
Is there something I don't understand? ( Quite possible ~ ) or is it by design? ( and my case simply seems unfortunate? )

Here's how I currently handle it in my inference code:

    @staticmethod
    def apply_lavasr(
        audio: np.ndarray,
        current_sr: int,
    ):
        """
        Args:
            audio:      Audio as a float32 NumPy array (mono).
            current_sr: Sample rate of `audio` (the model's target SR).

        Returns:
            Tuple (enhanced_numpy_array, 48000).
        """
        try:
            from LavaSR.model import LavaEnhance2

        try:
            import tempfile
            import torch as _torch
            from huggingface_hub import snapshot_download

            # Resolve / download model to rvc/models/enhancement/LavaSR
            _here = os.getcwd()
            local_model_dir = os.path.join(_here, "rvc", "models", "enhancement", "LavaSR")
            os.makedirs(local_model_dir, exist_ok=True)

            if not os.path.isdir(os.path.join(local_model_dir, "enhancer_v2")):
                print("[LavaSR] Model not found locally – downloading from HuggingFace...")
                print(f"[LavaSR] Saving to: {local_model_dir}")
                snapshot_download("YatharthS/LavaSR", local_dir=local_model_dir)
                print("[LavaSR] Download complete.")
            else:
                print(f"[LavaSR] Using cached model at: {local_model_dir}")

            # Write inference output to a temp WAV
            tmp_fd, tmp_path = tempfile.mkstemp(suffix=".wav")
            os.close(tmp_fd)
            sf.write(tmp_path, audio, current_sr)

            # Load model
            device = "cuda" if _torch.cuda.is_available() else "cpu"
            print(f"[LavaSR] Loading LavaSR v2 on {device}...")
            lava_model = LavaEnhance2(local_model_dir, device)

            # Run inference
            print(f"[LavaSR] Enhancing: {current_sr} Hz → 48000 Hz ")
            input_audio, _ = lava_model.load_audio(tmp_path, input_sr=32000)
            enhanced = lava_model.enhance(input_audio, denoise=False, batch=False)
            enhanced_np = enhanced.cpu().numpy().squeeze()

            # Cleanup
            os.remove(tmp_path)
            del lava_model
            if _torch.cuda.is_available():
                _torch.cuda.empty_cache()

            print("[LavaSR] Enhancement complete. Output SR: 48000 Hz")
            return enhanced_np, 48000

        except Exception as error:
            print(f"[LavaSR] Enhancement failed: {error}")
            import traceback as _tb
            print(_tb.format_exc())
            return audio, current_sr

.......

            converted_chunks = []
            for c in chunks:
                audio_opt = self.vc.pipeline(
                    model=self.hubert_model,
                    net_g=self.net_g,
                    sid=sid,
                    audio=c,
                    pitch=pitch,
                    f0_method=f0_method,
                    file_index=file_index,
                    index_rate=index_rate,
                    pitch_guidance=self.use_f0,
                    filter_radius=filter_radius,
                    volume_envelope=volume_envelope,
                    version=self.version,
                    protect=protect,
                    f0_autotune=f0_autotune,
                    f0_autotune_strength=f0_autotune_strength,
                    f0_file=f0_file,
                    seed=seed,
                    loaded_index=self.loaded_index,
                )
                converted_chunks.append(audio_opt)
                if split_audio:
                    print(f"Converted audio chunk {len(converted_chunks)}")

            if split_audio:
                audio_opt = merge_audio(chunks, converted_chunks, intervals, 16000, self.tgt_sr)
            else:
                audio_opt = converted_chunks[0]

            if clean_audio:
                cleaned_audio = self.remove_audio_noise(
                    audio_opt, self.tgt_sr, clean_strength
                )
                if cleaned_audio is not None:
                    audio_opt = cleaned_audio

            if post_process:
                audio_opt = self.post_process_audio(
                    audio_input=audio_opt,
                    sample_rate=self.tgt_sr,
                    **kwargs,
                )

            output_sr = self.tgt_sr
            if lavasr_enhance:
                audio_opt, output_sr = self.apply_lavasr(
                    audio_opt, self.tgt_sr
                )
                # output_sr is now 48000 — keep it as-is.

            sf.write(audio_output_path, audio_opt, output_sr, format="WAV")
            output_path_format = audio_output_path.replace(
                ".wav", f".{export_format.lower()}"
            )
            audio_output_path = self.convert_audio_format(
                audio_output_path, output_path_format, export_format
            )

            elapsed_time = time.time() - start_time
            print(
                f"Conversion completed! Result available in: '{audio_output_path}'. Time taken: {elapsed_time:.2f} seconds."
            )

Thanks in advance!

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions