Support loading the model from a local path

This commit is contained in:
刘鑫
2025-09-16 16:46:44 +08:00
parent 706403187e
commit 11574ae93d
3 changed files with 96 additions and 26 deletions

View File

@@ -87,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`).
```bash
# 1) Direct synthesis (single text)
voxcpm --text "Hello VoxCPM" --output out.wav
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav
# 2) Voice cloning (reference audio + transcript)
voxcpm --text "Hello" \
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
--prompt-audio path/to/voice.wav \
--prompt-text "reference transcript" \
--output out.wav \

View File

@@ -2,8 +2,6 @@ import torch
import torchaudio
import os
import tempfile
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from huggingface_hub import snapshot_download
from .model.voxcpm import VoxCPMModel
from .utils.text_normalize import TextNormalizer
@@ -29,9 +27,8 @@ class VoxCPM:
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
self.text_normalizer = TextNormalizer()
if enable_denoiser and zipenhancer_model_path is not None:
self.denoiser = pipeline(
Tasks.acoustic_noise_suppression,
model=zipenhancer_model_path)
from .zipenhancer import ZipEnhancer
self.denoiser = ZipEnhancer(zipenhancer_model_path)
else:
self.denoiser = None
print("Warm up VoxCPMModel...")
@@ -41,7 +38,7 @@ class VoxCPM:
@classmethod
def from_pretrained(cls,
hf_model_id: str = "openbmb/VoxCPM-0.5B",
hf_model_id: str = "openbmb/VoxCPM",
load_denoiser: bool = True,
zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
cache_dir: str = None,
@@ -50,7 +47,7 @@ class VoxCPM:
"""Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
Args:
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo").
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
load_denoiser: Whether to initialize the denoiser pipeline.
zipenhancer_model_id: Denoiser model id or path for ModelScope
acoustic noise suppression.
@@ -67,14 +64,19 @@ class VoxCPM:
``hf_model_id`` is provided.
"""
repo_id = hf_model_id
if not repo_id or repo_id.strip() == "":
raise ValueError("You must provide a valid hf_model_id")
if not repo_id:
raise ValueError("You must provide hf_model_id")
local_path = snapshot_download(
repo_id=repo_id,
cache_dir=cache_dir,
local_files_only=local_files_only,
)
# Load from local path if provided
if os.path.isdir(repo_id):
local_path = repo_id
else:
# Otherwise, try from_pretrained (Hub); exit on failure
local_path = snapshot_download(
repo_id=repo_id,
cache_dir=cache_dir,
local_files_only=local_files_only,
)
return cls(
voxcpm_model_path=local_path,
@@ -82,12 +84,6 @@ class VoxCPM:
enable_denoiser=load_denoiser,
)
def _normalize_loudness(self, wav_path: str):
audio, sr = torchaudio.load(wav_path)
loudness = torchaudio.functional.loudness(audio, sr)
normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
torchaudio.save(wav_path, normalized_audio, sr)
def generate(self,
text : str,
prompt_wav_path : str = None,
@@ -135,9 +131,7 @@ class VoxCPM:
if denoise and self.denoiser is not None:
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
temp_prompt_wav_path = tmp_file.name
self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path)
self._normalize_loudness(temp_prompt_wav_path)
self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
prompt_wav_path = temp_prompt_wav_path
fixed_prompt_cache = self.tts_model.build_prompt_cache(
prompt_wav_path=prompt_wav_path,

76
src/voxcpm/zipenhancer.py Normal file
View File

@@ -0,0 +1,76 @@
"""
ZipEnhancer Module - Audio Denoising Enhancer
Provides the ZipEnhancer wrapper for audio denoising, intended to be imported on demand.
Related dependencies are imported only when denoising functionality is needed.
"""
import os
import tempfile
from typing import Optional, Union
import torchaudio
import torch
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
class ZipEnhancer:
    """Audio denoising enhancer built on a ModelScope noise-suppression pipeline.

    The pipeline is created once in ``__init__`` and reused by every
    ``enhance`` call. Denoised output can optionally be loudness-normalized
    in place.
    """

    # Target level used when computing the gain in ``_normalize_loudness``
    # (same unit as the value returned by ``torchaudio.functional.loudness``).
    _TARGET_LOUDNESS = -20

    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
        """
        Initialize ZipEnhancer.

        Args:
            model_path: ModelScope model id or local path handed to the
                acoustic noise suppression pipeline.
        """
        self.model_path = model_path
        self._pipeline = pipeline(
            Tasks.acoustic_noise_suppression,
            model=self.model_path,
        )

    def _normalize_loudness(self, wav_path: str) -> None:
        """
        Normalize the loudness of an audio file in place.

        The file is loaded, its loudness is measured, and a gain of
        ``_TARGET_LOUDNESS - loudness`` is applied before the result is
        written back to the same path. When the measured loudness is not
        finite the file is left untouched.

        Args:
            wav_path: Audio file path (read and overwritten).
        """
        audio, sr = torchaudio.load(wav_path)
        loudness = torchaudio.functional.loudness(audio, sr)
        # A non-finite measurement (presumably what silent input yields --
        # TODO confirm) would turn every sample into NaN/inf once the gain
        # is applied, so skip normalization instead of corrupting the file.
        if not bool(torch.isfinite(loudness).all()):
            return
        normalized_audio = torchaudio.functional.gain(
            audio, self._TARGET_LOUDNESS - loudness
        )
        torchaudio.save(wav_path, normalized_audio, sr)

    def enhance(self, input_path: str, output_path: Optional[str] = None,
                normalize_loudness: bool = True) -> str:
        """
        Denoise an audio file and return the path of the result.

        Args:
            input_path: Input audio file path.
            output_path: Output audio file path. When ``None`` a temporary
                ``.wav`` file is created and its path is returned; the caller
                is responsible for deleting it.
            normalize_loudness: Whether to loudness-normalize the denoised
                output in place.

        Returns:
            str: Output audio file path.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            RuntimeError: If denoising or loudness normalization fails. The
                original exception is attached as ``__cause__``, and the file
                at ``output_path`` (temporary or caller-supplied) is removed
                so no partial output is left behind.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")

        # Create temporary file if no output path is specified.
        # delete=False keeps the file on disk after the handle is closed so
        # the pipeline can write to it by name.
        if output_path is None:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                output_path = tmp_file.name

        try:
            # Perform denoising processing
            self._pipeline(input_path, output_path=output_path)
            # Loudness normalization
            if normalize_loudness:
                self._normalize_loudness(output_path)
            return output_path
        except Exception as e:
            # Best-effort removal of the (possibly partial) output file; a
            # failure to delete must not mask the original error.
            if output_path and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            # Chain explicitly so the root cause is preserved for callers.
            raise RuntimeError(f"Audio denoising processing failed: {e}") from e