Update: disable denoise and normalize by default

2025-12-12 03:48:12 +00:00 · 2025-12-05 22:16:27 +08:00
parent 6a5e713698
commit b1f7593ae0
2 changed files with 9 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ By default, when you first run the script, the model will be downloaded automati
    from huggingface_hub import snapshot_download
    snapshot_download("openbmb/VoxCPM-0.5B")
    ```
- Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo.
+- Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo. 
    ```
    from modelscope import snapshot_download
    snapshot_download('iic/speech_zipenhancer_ans_multiloss_16k_base')
@@ -98,8 +98,8 @@ wav = model.generate(
    prompt_text=None,          # optional: reference text
    cfg_value=2.0,             # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
    inference_timesteps=10,   # LocDiT inference timesteps, higher for better result, lower for fast speed
-    normalize=True,           # enable external TN tool, but will disable native raw text support
+    normalize=False,           # enable external text normalization (TN) tool, but will disable native raw text support
-    denoise=True,             # enable external Denoise tool, but it may cause some distortion and restrict the sampling rate to 16kHz
+    denoise=False,             # enable external Denoise tool, but it may cause some distortion and restrict the sampling rate to 16kHz
    retry_badcase=True,        # enable retrying mode for some bad cases (unstoppable)
    retry_badcase_max_times=3,  # maximum retrying times
    retry_badcase_ratio_threshold=6.0, # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow pace speech
@@ -134,14 +134,14 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi
  --prompt-audio path/to/voice.wav \
  --prompt-text "reference transcript" \
  --output out.wav \
-  --denoise
+  # --denoise
 # (Optional) Voice cloning (reference audio + transcript file)
 voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
  --prompt-audio path/to/voice.wav \
  --prompt-file "/path/to/text-file" \
  --output out.wav \
-  --denoise
+  # --denoise
 # 3) Batch processing (one text per line)
 voxcpm --input examples/input.txt --output-dir outs
@@ -149,7 +149,7 @@ voxcpm --input examples/input.txt --output-dir outs
 voxcpm --input examples/input.txt --output-dir outs \
  --prompt-audio path/to/voice.wav \
  --prompt-text "reference transcript" \
-  --denoise
+  # --denoise
 # 4) Inference parameters (quality/speed)
 voxcpm --text "..." --output out.wav \
--- a/src/voxcpm/core.py
+++ b/src/voxcpm/core.py
@@ -40,7 +40,7 @@ class VoxCPM:
    @classmethod
    def from_pretrained(cls,
-            hf_model_id: str = "openbmb/VoxCPM-0.5B",
+            hf_model_id: str = "openbmb/VoxCPM1.5",
            load_denoiser: bool = True,
            zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
            cache_dir: str = None,
@@ -107,8 +107,8 @@ class VoxCPM:
            inference_timesteps : int = 10,
            min_len : int = 2,
            max_len : int = 4096,
-            normalize : bool = True,
+            normalize : bool = False,
-            denoise : bool = True,
+            denoise : bool = False,
            retry_badcase : bool = True,
            retry_badcase_max_times : int = 3,
            retry_badcase_ratio_threshold : float = 6.0,