set default repo id

support loading model from local path
update requirements for zipenhancer
2025-12-12 19:58:12 +00:00 · 2025-09-16 16:52:42 +08:00 · 2025-09-16 16:46:44 +08:00 · 2025-09-16 16:15:10 +08:00 · 2025-09-16 16:06:03 +08:00
4 changed files with 99 additions and 26 deletions
--- a/README.md
+++ b/README.md
@@ -87,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`).
 ```bash
 # 1) Direct synthesis (single text)
-voxcpm --text "Hello VoxCPM" --output out.wav
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav
 # 2) Voice cloning (reference audio + transcript)
-voxcpm --text "Hello" \
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
  --prompt-audio path/to/voice.wav \
  --prompt-text "reference transcript" \
  --output out.wav \
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,10 +35,13 @@ dependencies = [
    "inflect",
    "addict",
    "WeTextProcessing",
-    "modelscope",
+    "modelscope>=1.22.0",
    "datasets>=2,<4",
    "huggingface-hub",
    "pydantic",
    "tqdm",
    "simplejson",
    "sortedcontainers",
    "soundfile",
    "funasr",
    "spaces"
--- a/src/voxcpm/core.py
+++ b/src/voxcpm/core.py
@@ -2,8 +2,6 @@ import torch
 import torchaudio
 import os
 import tempfile
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
 from .utils.text_normalize import TextNormalizer
@@ -29,9 +27,8 @@ class VoxCPM:
        self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
        self.text_normalizer = TextNormalizer()
        if enable_denoiser and zipenhancer_model_path is not None:
-            self.denoiser = pipeline(
+            from .zipenhancer import ZipEnhancer
-                Tasks.acoustic_noise_suppression,
+            self.denoiser = ZipEnhancer(zipenhancer_model_path)
                model=zipenhancer_model_path)
        else:
            self.denoiser = None
        print("Warm up VoxCPMModel...")
@@ -50,7 +47,7 @@ class VoxCPM:
        """Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
        Args:
-            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo").
+            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
            load_denoiser: Whether to initialize the denoiser pipeline.
            zipenhancer_model_id: Denoiser model id or path for ModelScope
                acoustic noise suppression.
@@ -67,26 +64,25 @@ class VoxCPM:
                ``hf_model_id`` is provided.
        """
        repo_id = hf_model_id
-        if not repo_id or repo_id.strip() == "":
+        if not repo_id:
-            raise ValueError("You must provide a valid hf_model_id")
+            raise ValueError("You must provide hf_model_id")
-
+        
-        local_path = snapshot_download(
+        # Load from local path if provided
-            repo_id=repo_id,
+        if os.path.isdir(repo_id):
-            cache_dir=cache_dir,
+            local_path = repo_id
-            local_files_only=local_files_only,
+        else:
-        )
+            # Otherwise, try from_pretrained (Hub); exit on failure
            local_path = snapshot_download(
                repo_id=repo_id,
                cache_dir=cache_dir,
                local_files_only=local_files_only,
            )
        return cls(
            voxcpm_model_path=local_path,
            zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
            enable_denoiser=load_denoiser,
        )
    def _normalize_loudness(self, wav_path: str):
        """Apply a loudness-dependent gain to an audio file, overwriting it in place.

        The file is loaded with torchaudio, its loudness is measured with
        ``torchaudio.functional.loudness``, and a gain of ``-20 - loudness``
        is applied before the result is saved back to the same path with the
        original sample rate.

        Args:
            wav_path: Path of the audio file to read and overwrite.
        """
        waveform, sample_rate = torchaudio.load(wav_path)
        # Gain needed to move the measured loudness to the -20 reference level.
        gain_db = -20 - torchaudio.functional.loudness(waveform, sample_rate)
        torchaudio.save(
            wav_path,
            torchaudio.functional.gain(waveform, gain_db),
            sample_rate,
        )
    def generate(self, 
            text : str,
@@ -135,9 +131,7 @@ class VoxCPM:
                if denoise and self.denoiser is not None:
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                        temp_prompt_wav_path = tmp_file.name
-                    
+                    self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
                    self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path)
                    self._normalize_loudness(temp_prompt_wav_path)
                    prompt_wav_path = temp_prompt_wav_path
                fixed_prompt_cache = self.tts_model.build_prompt_cache(
                    prompt_wav_path=prompt_wav_path,
--- a/src/voxcpm/zipenhancer.py
+++ b/src/voxcpm/zipenhancer.py
@@ -0,0 +1,76 @@
 """
 ZipEnhancer Module - Audio Denoising Enhancer
 Provides the ZipEnhancer wrapper for audio denoising, intended to be imported on demand.
 Its dependencies are imported only when denoising functionality is needed.
 """
 import os
 import tempfile
 from typing import Optional, Union
 import torchaudio
 import torch
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 class ZipEnhancer:
    """Audio denoiser wrapping a ModelScope acoustic-noise-suppression pipeline."""

    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
        """
        Initialize ZipEnhancer
        Args:
            model_path: ModelScope model path or local path
        """
        self.model_path = model_path
        self._pipeline = pipeline(
            Tasks.acoustic_noise_suppression,
            model=self.model_path,
        )

    def _normalize_loudness(self, wav_path: str):
        """
        Audio loudness normalization (overwrites the file in place)
        Args:
            wav_path: Audio file path
        """
        audio, sr = torchaudio.load(wav_path)
        loudness = torchaudio.functional.loudness(audio, sr)
        # Gain that shifts the measured loudness to the -20 reference level.
        normalized_audio = torchaudio.functional.gain(audio, -20 - loudness)
        torchaudio.save(wav_path, normalized_audio, sr)

    def enhance(self, input_path: str, output_path: Optional[str] = None,
                normalize_loudness: bool = True) -> str:
        """
        Audio denoising enhancement
        Args:
            input_path: Input audio file path
            output_path: Output audio file path (optional, creates temp file by default)
            normalize_loudness: Whether to perform loudness normalization
        Returns:
            str: Output audio file path
        Raises:
            FileNotFoundError: If ``input_path`` does not exist
            RuntimeError: If denoising or loudness normalization fails; the
                underlying exception is attached as ``__cause__``. The output
                file is removed on failure, including a caller-supplied
                ``output_path``.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")
        # Create temporary file if no output path is specified; delete=False
        # keeps the file on disk so the pipeline can write to it by name.
        if output_path is None:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                output_path = tmp_file.name
        try:
            # Perform denoising processing
            self._pipeline(input_path, output_path=output_path)
            # Loudness normalization
            if normalize_loudness:
                self._normalize_loudness(output_path)
        except Exception as e:
            # Best-effort removal of a partially written output file.
            if output_path and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except OSError:
                    pass
            # Chain the original error so its traceback is preserved.
            raise RuntimeError(f"Audio denoising processing failed: {e}") from e
        return output_path
Author	SHA1	Message	Date
刘鑫	436e8cd6e5	set default repo id	2025-09-16 16:52:42 +08:00
刘鑫	11574ae93d	support loading model from local path	2025-09-16 16:46:44 +08:00
zengguoyang	706403187e	update requirements for zipenhancer	2025-09-16 16:15:10 +08:00
zengguoyang	38a76704ee	update requirements for zipenhancer	2025-09-16 16:06:03 +08:00