Support loading model from local path
This commit is contained in:
@@ -87,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`).
|
||||
|
||||
```bash
|
||||
# 1) Direct synthesis (single text)
|
||||
voxcpm --text "Hello VoxCPM" --output out.wav
|
||||
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav
|
||||
|
||||
# 2) Voice cloning (reference audio + transcript)
|
||||
voxcpm --text "Hello" \
|
||||
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
|
||||
--prompt-audio path/to/voice.wav \
|
||||
--prompt-text "reference transcript" \
|
||||
--output out.wav \
|
||||
|
||||
@@ -2,8 +2,6 @@ import torch
|
||||
import torchaudio
|
||||
import os
|
||||
import tempfile
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
from huggingface_hub import snapshot_download
|
||||
from .model.voxcpm import VoxCPMModel
|
||||
from .utils.text_normalize import TextNormalizer
|
||||
@@ -29,9 +27,8 @@ class VoxCPM:
|
||||
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
|
||||
self.text_normalizer = TextNormalizer()
|
||||
if enable_denoiser and zipenhancer_model_path is not None:
|
||||
self.denoiser = pipeline(
|
||||
Tasks.acoustic_noise_suppression,
|
||||
model=zipenhancer_model_path)
|
||||
from .zipenhancer import ZipEnhancer
|
||||
self.denoiser = ZipEnhancer(zipenhancer_model_path)
|
||||
else:
|
||||
self.denoiser = None
|
||||
print("Warm up VoxCPMModel...")
|
||||
@@ -41,7 +38,7 @@ class VoxCPM:
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls,
|
||||
hf_model_id: str = "openbmb/VoxCPM-0.5B",
|
||||
hf_model_id: str = "openbmb/VoxCPM",
|
||||
load_denoiser: bool = True,
|
||||
zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
|
||||
cache_dir: str = None,
|
||||
@@ -50,7 +47,7 @@ class VoxCPM:
|
||||
"""Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
|
||||
|
||||
Args:
|
||||
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo").
|
||||
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
|
||||
load_denoiser: Whether to initialize the denoiser pipeline.
|
||||
zipenhancer_model_id: Denoiser model id or path for ModelScope
|
||||
acoustic noise suppression.
|
||||
@@ -67,26 +64,25 @@ class VoxCPM:
|
||||
``hf_model_id`` is provided.
|
||||
"""
|
||||
repo_id = hf_model_id
|
||||
if not repo_id or repo_id.strip() == "":
|
||||
raise ValueError("You must provide a valid hf_model_id")
|
||||
|
||||
local_path = snapshot_download(
|
||||
repo_id=repo_id,
|
||||
cache_dir=cache_dir,
|
||||
local_files_only=local_files_only,
|
||||
)
|
||||
if not repo_id:
|
||||
raise ValueError("You must provide hf_model_id")
|
||||
|
||||
# Load from local path if provided
|
||||
if os.path.isdir(repo_id):
|
||||
local_path = repo_id
|
||||
else:
|
||||
# Otherwise, try from_pretrained (Hub); exit on failure
|
||||
local_path = snapshot_download(
|
||||
repo_id=repo_id,
|
||||
cache_dir=cache_dir,
|
||||
local_files_only=local_files_only,
|
||||
)
|
||||
|
||||
return cls(
|
||||
voxcpm_model_path=local_path,
|
||||
zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
|
||||
enable_denoiser=load_denoiser,
|
||||
)
|
||||
|
||||
def _normalize_loudness(self, wav_path: str):
|
||||
audio, sr = torchaudio.load(wav_path)
|
||||
loudness = torchaudio.functional.loudness(audio, sr)
|
||||
normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
|
||||
torchaudio.save(wav_path, normalized_audio, sr)
|
||||
|
||||
def generate(self,
|
||||
text : str,
|
||||
@@ -135,9 +131,7 @@ class VoxCPM:
|
||||
if denoise and self.denoiser is not None:
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
|
||||
temp_prompt_wav_path = tmp_file.name
|
||||
|
||||
self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path)
|
||||
self._normalize_loudness(temp_prompt_wav_path)
|
||||
self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
|
||||
prompt_wav_path = temp_prompt_wav_path
|
||||
fixed_prompt_cache = self.tts_model.build_prompt_cache(
|
||||
prompt_wav_path=prompt_wav_path,
|
||||
|
||||
76
src/voxcpm/zipenhancer.py
Normal file
76
src/voxcpm/zipenhancer.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
ZipEnhancer Module - Audio Denoising Enhancer
|
||||
|
||||
Provides the ZipEnhancer wrapper for audio denoising, intended to be imported on demand.
|
||||
Related dependencies are imported only when denoising functionality is needed.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional, Union
|
||||
import torchaudio
|
||||
import torch
|
||||
from modelscope.pipelines import pipeline
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
class ZipEnhancer:
    """ZipEnhancer Audio Denoising Enhancer.

    Thin wrapper around a ModelScope ``acoustic_noise_suppression`` pipeline
    that denoises an audio file and, optionally, normalizes its loudness.
    """

    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
        """
        Initialize ZipEnhancer by building the ModelScope pipeline.

        Args:
            model_path: ModelScope model path or local path
        """
        self.model_path = model_path
        self._pipeline = pipeline(
            Tasks.acoustic_noise_suppression,
            model=self.model_path
        )

    def _normalize_loudness(self, wav_path: str):
        """
        Normalize the loudness of an audio file in place.

        Loads ``wav_path``, measures its loudness with
        ``torchaudio.functional.loudness``, applies a gain of
        ``-20 - loudness`` and overwrites the same file with the result.

        Args:
            wav_path: Audio file path (read and then overwritten)
        """
        audio, sr = torchaudio.load(wav_path)
        loudness = torchaudio.functional.loudness(audio, sr)
        # Gain that moves the measured loudness to the -20 target.
        normalized_audio = torchaudio.functional.gain(audio, -20 - loudness)
        torchaudio.save(wav_path, normalized_audio, sr)

    def enhance(self, input_path: str, output_path: Optional[str] = None,
                normalize_loudness: bool = True) -> str:
        """
        Denoise an audio file and return the path of the result.

        Args:
            input_path: Input audio file path
            output_path: Output audio file path (optional; a ``.wav``
                temporary file is created when omitted)
            normalize_loudness: Whether to perform loudness normalization
                on the denoised output

        Returns:
            str: Output audio file path

        Raises:
            FileNotFoundError: If ``input_path`` does not exist
            RuntimeError: If denoising or loudness normalization fails; the
                original exception is attached as ``__cause__``. The output
                file is removed in this case, including when the caller
                supplied ``output_path``.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")

        # Create temporary file if no output path is specified.
        # delete=False keeps the file on disk after the handle is closed so
        # the pipeline can write to it by name.
        if output_path is None:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                output_path = tmp_file.name

        try:
            # Perform denoising processing
            self._pipeline(input_path, output_path=output_path)
            # Loudness normalization
            if normalize_loudness:
                self._normalize_loudness(output_path)
            return output_path
        except Exception as e:
            # Best-effort cleanup of a possibly half-written output file;
            # a failed unlink must not mask the real error.
            if output_path and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            # Chain the original exception so its traceback is preserved.
            raise RuntimeError(f"Audio denoising processing failed: {e}") from e
|
||||
Reference in New Issue
Block a user