-
Notifications
You must be signed in to change notification settings - Fork 43
The model doesn't quite work well it seems #17
Copy link
Copy link
Open
Description
Yo ~ I'm currently testing the integration of LavaSR into my RVC-based fork (voice conversion stuff).
Original output from inference (in this particular case, testing on 32 kHz models, out of the available 24, 32, 40 and 48 kHz):

After LavaSR v2 ( input sr set to 16khz )

After LavaSR v2 ( input sr set to 32khz )

The resulting audio quality is... pretty bad, to put it lightly.
You know, too much of vibrance and " aspiration " quality is gone.
Is there something I don't understand (quite possible ~), or is it by design, and my case is simply an unfortunate one?
Here's how I currently handle it in my inference code:
@staticmethod
def apply_lavasr(
audio: np.ndarray,
current_sr: int,
):
"""
Args:
audio: Audio as a float32 NumPy array (mono).
current_sr: Sample rate of `audio` (the model's target SR).
Returns:
Tuple (enhanced_numpy_array, 48000).
"""
try:
from LavaSR.model import LavaEnhance2
try:
import tempfile
import torch as _torch
from huggingface_hub import snapshot_download
# Resolve / download model to rvc/models/enhancement/LavaSR
_here = os.getcwd()
local_model_dir = os.path.join(_here, "rvc", "models", "enhancement", "LavaSR")
os.makedirs(local_model_dir, exist_ok=True)
if not os.path.isdir(os.path.join(local_model_dir, "enhancer_v2")):
print("[LavaSR] Model not found locally – downloading from HuggingFace...")
print(f"[LavaSR] Saving to: {local_model_dir}")
snapshot_download("YatharthS/LavaSR", local_dir=local_model_dir)
print("[LavaSR] Download complete.")
else:
print(f"[LavaSR] Using cached model at: {local_model_dir}")
# Write inference output to a temp WAV
tmp_fd, tmp_path = tempfile.mkstemp(suffix=".wav")
os.close(tmp_fd)
sf.write(tmp_path, audio, current_sr)
# Load model
device = "cuda" if _torch.cuda.is_available() else "cpu"
print(f"[LavaSR] Loading LavaSR v2 on {device}...")
lava_model = LavaEnhance2(local_model_dir, device)
# Run inference
print(f"[LavaSR] Enhancing: {current_sr} Hz → 48000 Hz ")
input_audio, _ = lava_model.load_audio(tmp_path, input_sr=32000)
enhanced = lava_model.enhance(input_audio, denoise=False, batch=False)
enhanced_np = enhanced.cpu().numpy().squeeze()
# Cleanup
os.remove(tmp_path)
del lava_model
if _torch.cuda.is_available():
_torch.cuda.empty_cache()
print("[LavaSR] Enhancement complete. Output SR: 48000 Hz")
return enhanced_np, 48000
except Exception as error:
print(f"[LavaSR] Enhancement failed: {error}")
import traceback as _tb
print(_tb.format_exc())
return audio, current_sr
.......
converted_chunks = []
for c in chunks:
audio_opt = self.vc.pipeline(
model=self.hubert_model,
net_g=self.net_g,
sid=sid,
audio=c,
pitch=pitch,
f0_method=f0_method,
file_index=file_index,
index_rate=index_rate,
pitch_guidance=self.use_f0,
filter_radius=filter_radius,
volume_envelope=volume_envelope,
version=self.version,
protect=protect,
f0_autotune=f0_autotune,
f0_autotune_strength=f0_autotune_strength,
f0_file=f0_file,
seed=seed,
loaded_index=self.loaded_index,
)
converted_chunks.append(audio_opt)
if split_audio:
print(f"Converted audio chunk {len(converted_chunks)}")
if split_audio:
audio_opt = merge_audio(chunks, converted_chunks, intervals, 16000, self.tgt_sr)
else:
audio_opt = converted_chunks[0]
if clean_audio:
cleaned_audio = self.remove_audio_noise(
audio_opt, self.tgt_sr, clean_strength
)
if cleaned_audio is not None:
audio_opt = cleaned_audio
if post_process:
audio_opt = self.post_process_audio(
audio_input=audio_opt,
sample_rate=self.tgt_sr,
**kwargs,
)
output_sr = self.tgt_sr
if lavasr_enhance:
audio_opt, output_sr = self.apply_lavasr(
audio_opt, self.tgt_sr
)
# output_sr is 48000 after a successful enhancement; on failure apply_lavasr returns self.tgt_sr.
sf.write(audio_output_path, audio_opt, output_sr, format="WAV")
output_path_format = audio_output_path.replace(
".wav", f".{export_format.lower()}"
)
audio_output_path = self.convert_audio_format(
audio_output_path, output_path_format, export_format
)
elapsed_time = time.time() - start_time
print(
f"Conversion completed! Result available in: '{audio_output_path}'. Time taken: {elapsed_time:.2f} seconds."
)
Thanks in advance!
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels