From 11574ae93dd92065139ca007699ef4e6475e4379 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=88=98=E9=91=AB?= <liuxin@modelbest.cn>
Date: Tue, 16 Sep 2025 16:46:44 +0800
Subject: [PATCH] support loading model from local path

---
 README.md                 |  4 +--
 src/voxcpm/core.py        | 42 ++++++++++------------
 src/voxcpm/zipenhancer.py | 76 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 26 deletions(-)
 create mode 100644 src/voxcpm/zipenhancer.py

diff --git a/README.md b/README.md
index a5262b5..bd04d17 100644
--- a/README.md
+++ b/README.md
@@ -87,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`).
 
 ```bash
 # 1) Direct synthesis (single text)
-voxcpm --text "Hello VoxCPM" --output out.wav
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav
 
 # 2) Voice cloning (reference audio + transcript)
-voxcpm --text "Hello" \
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
   --prompt-audio path/to/voice.wav \
   --prompt-text "reference transcript" \
   --output out.wav \
diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py
index baa52f7..55662a9 100644
--- a/src/voxcpm/core.py
+++ b/src/voxcpm/core.py
@@ -2,8 +2,6 @@ import torch
 import torchaudio
 import os
 import tempfile
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
 from .utils.text_normalize import TextNormalizer
@@ -29,9 +27,8 @@ class VoxCPM:
         self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
         self.text_normalizer = TextNormalizer()
         if enable_denoiser and zipenhancer_model_path is not None:
-            self.denoiser = pipeline(
-                Tasks.acoustic_noise_suppression,
-                model=zipenhancer_model_path)
+            from .zipenhancer import ZipEnhancer
+            self.denoiser = ZipEnhancer(zipenhancer_model_path)
         else:
             self.denoiser = None
         print("Warm up VoxCPMModel...")
@@ -41,7 +38,7 @@ class VoxCPM:
 
     @classmethod
     def from_pretrained(cls,
-            hf_model_id: str = "openbmb/VoxCPM-0.5B",
+            hf_model_id: str = "openbmb/VoxCPM",
             load_denoiser: bool = True,
             zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
             cache_dir: str = None,
@@ -50,7 +47,7 @@ class VoxCPM:
         """Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
 
         Args:
-            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo").
+            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
             load_denoiser: Whether to initialize the denoiser pipeline.
             zipenhancer_model_id: Denoiser model id or path for ModelScope
                 acoustic noise suppression.
@@ -67,26 +64,25 @@ class VoxCPM:
                 ``hf_model_id`` is provided.
         """
         repo_id = hf_model_id
-        if not repo_id or repo_id.strip() == "":
-            raise ValueError("You must provide a valid hf_model_id")
-
-        local_path = snapshot_download(
-            repo_id=repo_id,
-            cache_dir=cache_dir,
-            local_files_only=local_files_only,
-        )
+        if not repo_id:
+            raise ValueError("You must provide hf_model_id")
+        
+        # Load from local path if provided
+        if os.path.isdir(repo_id):
+            local_path = repo_id
+        else:
+            # Otherwise, resolve the snapshot via the Hugging Face Hub; errors propagate to the caller
+            local_path = snapshot_download(
+                repo_id=repo_id,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+            )
 
         return cls(
             voxcpm_model_path=local_path,
             zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
             enable_denoiser=load_denoiser,
         )
-        
-    def _normalize_loudness(self, wav_path: str):
-        audio, sr = torchaudio.load(wav_path)
-        loudness = torchaudio.functional.loudness(audio, sr)
-        normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
-        torchaudio.save(wav_path, normalized_audio, sr)
 
     def generate(self, 
             text : str,
@@ -135,9 +131,7 @@ class VoxCPM:
                 if denoise and self.denoiser is not None:
                     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                         temp_prompt_wav_path = tmp_file.name
-                    
-                    self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path)
-                    self._normalize_loudness(temp_prompt_wav_path)
+                    self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
                     prompt_wav_path = temp_prompt_wav_path
                 fixed_prompt_cache = self.tts_model.build_prompt_cache(
                     prompt_wav_path=prompt_wav_path,
diff --git a/src/voxcpm/zipenhancer.py b/src/voxcpm/zipenhancer.py
new file mode 100644
index 0000000..f8b8e6f
--- /dev/null
+++ b/src/voxcpm/zipenhancer.py
@@ -0,0 +1,76 @@
+"""
+ZipEnhancer Module - Audio Denoising Enhancer
+
+Provides the ZipEnhancer wrapper used for audio denoising. The module is imported on demand,
+so the ModelScope dependencies are loaded only when denoising functionality is needed.
+"""
+
+import os
+import tempfile
+from typing import Optional, Union
+import torchaudio
+import torch
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+
+class ZipEnhancer:
+    """Audio denoiser wrapping the ModelScope acoustic-noise-suppression pipeline."""
+    def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
+        """
+        Build the ModelScope noise-suppression pipeline.
+        Args:
+            model_path: ModelScope model id or local path
+        """
+        self.model_path = model_path
+        self._pipeline = pipeline(
+                Tasks.acoustic_noise_suppression,
+                model=self.model_path
+            )
+
+    def _normalize_loudness(self, wav_path: str):
+        """
+        Normalize the loudness of an audio file in place (gain of -20 minus measured loudness).
+
+        Args:
+            wav_path: Audio file path; the file is overwritten with the normalized audio
+        """
+        audio, sr = torchaudio.load(wav_path)
+        loudness = torchaudio.functional.loudness(audio, sr)
+        normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
+        torchaudio.save(wav_path, normalized_audio, sr)
+
+    def enhance(self, input_path: str, output_path: Optional[str] = None,
+                normalize_loudness: bool = True) -> str:
+        """
+        Denoise an audio file and write the result to ``output_path``.
+        Args:
+            input_path: Input audio file path
+            output_path: Output audio file path (optional, creates temp file by default)
+            normalize_loudness: Whether to perform loudness normalization
+        Returns:
+            str: Output audio file path
+        Raises:
+            FileNotFoundError: If input_path does not exist
+            RuntimeError: If denoising or loudness normalization fails
+        """
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(f"Input audio file does not exist: {input_path}")
+        # Create temporary file if no output path is specified
+        if output_path is None:
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+                output_path = tmp_file.name
+        try:
+            # Perform denoising processing
+            self._pipeline(input_path, output_path=output_path)
+            if normalize_loudness:
+                self._normalize_loudness(output_path)
+            return output_path
+        except Exception as e:
+            # Remove the (possibly partial) output so a failed run leaves no file behind
+            if output_path and os.path.exists(output_path):
+                try:
+                    os.unlink(output_path)
+                except OSError:
+                    pass
+            raise RuntimeError(f"Audio denoising processing failed: {e}") from e
\ No newline at end of file