From 11574ae93dd92065139ca007699ef4e6475e4379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Tue, 16 Sep 2025 16:46:44 +0800 Subject: [PATCH] support loading model from local path --- README.md | 4 +-- src/voxcpm/core.py | 42 ++++++++++------------ src/voxcpm/zipenhancer.py | 76 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 26 deletions(-) create mode 100644 src/voxcpm/zipenhancer.py diff --git a/README.md b/README.md index a5262b5..bd04d17 100644 --- a/README.md +++ b/README.md @@ -87,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`). ```bash # 1) Direct synthesis (single text) -voxcpm --text "Hello VoxCPM" --output out.wav +voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav # 2) Voice cloning (reference audio + transcript) -voxcpm --text "Hello" \ +voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." 
\ --prompt-audio path/to/voice.wav \ --prompt-text "reference transcript" \ --output out.wav \ diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index baa52f7..55662a9 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -2,8 +2,6 @@ import torch import torchaudio import os import tempfile -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks from huggingface_hub import snapshot_download from .model.voxcpm import VoxCPMModel from .utils.text_normalize import TextNormalizer @@ -29,9 +27,8 @@ class VoxCPM: self.tts_model = VoxCPMModel.from_local(voxcpm_model_path) self.text_normalizer = TextNormalizer() if enable_denoiser and zipenhancer_model_path is not None: - self.denoiser = pipeline( - Tasks.acoustic_noise_suppression, - model=zipenhancer_model_path) + from .zipenhancer import ZipEnhancer + self.denoiser = ZipEnhancer(zipenhancer_model_path) else: self.denoiser = None print("Warm up VoxCPMModel...") @@ -41,7 +38,7 @@ class VoxCPM: @classmethod def from_pretrained(cls, - hf_model_id: str = "openbmb/VoxCPM-0.5B", + hf_model_id: str = "openbmb/VoxCPM", load_denoiser: bool = True, zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base", cache_dir: str = None, @@ -50,7 +47,7 @@ class VoxCPM: """Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot. Args: - hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo"). + hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path. load_denoiser: Whether to initialize the denoiser pipeline. zipenhancer_model_id: Denoiser model id or path for ModelScope acoustic noise suppression. @@ -67,26 +64,25 @@ class VoxCPM: ``hf_model_id`` is provided. 
""" repo_id = hf_model_id - if not repo_id or repo_id.strip() == "": - raise ValueError("You must provide a valid hf_model_id") - - local_path = snapshot_download( - repo_id=repo_id, - cache_dir=cache_dir, - local_files_only=local_files_only, - ) + if not repo_id: + raise ValueError("You must provide hf_model_id") + + # Load from local path if provided + if os.path.isdir(repo_id): + local_path = repo_id + else: + # Otherwise, try from_pretrained (Hub); exit on failure + local_path = snapshot_download( + repo_id=repo_id, + cache_dir=cache_dir, + local_files_only=local_files_only, + ) return cls( voxcpm_model_path=local_path, zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None, enable_denoiser=load_denoiser, ) - - def _normalize_loudness(self, wav_path: str): - audio, sr = torchaudio.load(wav_path) - loudness = torchaudio.functional.loudness(audio, sr) - normalized_audio = torchaudio.functional.gain(audio, -20-loudness) - torchaudio.save(wav_path, normalized_audio, sr) def generate(self, text : str, @@ -135,9 +131,7 @@ class VoxCPM: if denoise and self.denoiser is not None: with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: temp_prompt_wav_path = tmp_file.name - - self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path) - self._normalize_loudness(temp_prompt_wav_path) + self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path) prompt_wav_path = temp_prompt_wav_path fixed_prompt_cache = self.tts_model.build_prompt_cache( prompt_wav_path=prompt_wav_path, diff --git a/src/voxcpm/zipenhancer.py b/src/voxcpm/zipenhancer.py new file mode 100644 index 0000000..f8b8e6f --- /dev/null +++ b/src/voxcpm/zipenhancer.py @@ -0,0 +1,76 @@ +""" +ZipEnhancer Module - Audio Denoising Enhancer + +Provides on-demand import ZipEnhancer functionality for audio denoising processing. +Related dependencies are imported only when denoising functionality is needed. 
+""" + +import os +import tempfile +from typing import Optional, Union +import torchaudio +import torch +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + + +class ZipEnhancer: + """ZipEnhancer Audio Denoising Enhancer""" + def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"): + """ + Initialize ZipEnhancer + Args: + model_path: ModelScope model path or local path + """ + self.model_path = model_path + self._pipeline = pipeline( + Tasks.acoustic_noise_suppression, + model=self.model_path + ) + + def _normalize_loudness(self, wav_path: str): + """ + Audio loudness normalization + + Args: + wav_path: Audio file path + """ + audio, sr = torchaudio.load(wav_path) + loudness = torchaudio.functional.loudness(audio, sr) + normalized_audio = torchaudio.functional.gain(audio, -20-loudness) + torchaudio.save(wav_path, normalized_audio, sr) + + def enhance(self, input_path: str, output_path: Optional[str] = None, + normalize_loudness: bool = True) -> str: + """ + Audio denoising enhancement + Args: + input_path: Input audio file path + output_path: Output audio file path (optional, creates temp file by default) + normalize_loudness: Whether to perform loudness normalization + Returns: + str: Output audio file path + Raises: + RuntimeError: If pipeline is not initialized or processing fails + """ + if not os.path.exists(input_path): + raise FileNotFoundError(f"Input audio file does not exist: {input_path}") + # Create temporary file if no output path is specified + if output_path is None: + with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: + output_path = tmp_file.name + try: + # Perform denoising processing + self._pipeline(input_path, output_path=output_path) + # Loudness normalization + if normalize_loudness: + self._normalize_loudness(output_path) + return output_path + except Exception as e: + # Clean up possibly created temporary files + if output_path and 
os.path.exists(output_path): + try: + os.unlink(output_path) + except OSError: + pass + raise RuntimeError(f"Audio denoising processing failed: {e}") \ No newline at end of file