mirror of
https://github.com/OpenBMB/VoxCPM
synced 2025-12-12 03:48:12 +00:00
Update: default no denoise & normalize
This commit is contained in:
12
README.md
12
README.md
@@ -76,7 +76,7 @@ By default, when you first run the script, the model will be downloaded automati
|
|||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
snapshot_download("openbmb/VoxCPM-0.5B")
|
snapshot_download("openbmb/VoxCPM-0.5B")
|
||||||
```
|
```
|
||||||
- Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo.
|
- Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo.
|
||||||
```
|
```
|
||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
snapshot_download('iic/speech_zipenhancer_ans_multiloss_16k_base')
|
snapshot_download('iic/speech_zipenhancer_ans_multiloss_16k_base')
|
||||||
@@ -98,8 +98,8 @@ wav = model.generate(
|
|||||||
prompt_text=None, # optional: reference text
|
prompt_text=None, # optional: reference text
|
||||||
cfg_value=2.0, # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
|
cfg_value=2.0, # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
|
||||||
inference_timesteps=10, # LocDiT inference timesteps, higher for better result, lower for fast speed
|
inference_timesteps=10, # LocDiT inference timesteps, higher for better result, lower for fast speed
|
||||||
normalize=True, # enable external TN tool, but will disable native raw text support
|
normalize=False, # enable external TN tool, but will disable native raw text support
|
||||||
denoise=True, # enable external Denoise tool, but it may cause some distortion and restrict the sampling rate to 16kHz
|
denoise=False, # enable external Denoise tool, but it may cause some distortion and restrict the sampling rate to 16kHz
|
||||||
retry_badcase=True, # enable retrying mode for some bad cases (unstoppable)
|
retry_badcase=True, # enable retrying mode for some bad cases (unstoppable)
|
||||||
retry_badcase_max_times=3, # maximum retrying times
|
retry_badcase_max_times=3, # maximum retrying times
|
||||||
retry_badcase_ratio_threshold=6.0, # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow-paced speech
|
retry_badcase_ratio_threshold=6.0, # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow-paced speech
|
||||||
@@ -134,14 +134,14 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi
|
|||||||
--prompt-audio path/to/voice.wav \
|
--prompt-audio path/to/voice.wav \
|
||||||
--prompt-text "reference transcript" \
|
--prompt-text "reference transcript" \
|
||||||
--output out.wav \
|
--output out.wav \
|
||||||
--denoise
|
# --denoise
|
||||||
|
|
||||||
# (Optional) Voice cloning (reference audio + transcript file)
|
# (Optional) Voice cloning (reference audio + transcript file)
|
||||||
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
|
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
|
||||||
--prompt-audio path/to/voice.wav \
|
--prompt-audio path/to/voice.wav \
|
||||||
--prompt-file "/path/to/text-file" \
|
--prompt-file "/path/to/text-file" \
|
||||||
--output out.wav \
|
--output out.wav \
|
||||||
--denoise
|
# --denoise
|
||||||
|
|
||||||
# 3) Batch processing (one text per line)
|
# 3) Batch processing (one text per line)
|
||||||
voxcpm --input examples/input.txt --output-dir outs
|
voxcpm --input examples/input.txt --output-dir outs
|
||||||
@@ -149,7 +149,7 @@ voxcpm --input examples/input.txt --output-dir outs
|
|||||||
voxcpm --input examples/input.txt --output-dir outs \
|
voxcpm --input examples/input.txt --output-dir outs \
|
||||||
--prompt-audio path/to/voice.wav \
|
--prompt-audio path/to/voice.wav \
|
||||||
--prompt-text "reference transcript" \
|
--prompt-text "reference transcript" \
|
||||||
--denoise
|
# --denoise
|
||||||
|
|
||||||
# 4) Inference parameters (quality/speed)
|
# 4) Inference parameters (quality/speed)
|
||||||
voxcpm --text "..." --output out.wav \
|
voxcpm --text "..." --output out.wav \
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class VoxCPM:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls,
|
def from_pretrained(cls,
|
||||||
hf_model_id: str = "openbmb/VoxCPM-0.5B",
|
hf_model_id: str = "openbmb/VoxCPM1.5",
|
||||||
load_denoiser: bool = True,
|
load_denoiser: bool = True,
|
||||||
zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
|
zipenhancer_model_id: str = "iic/speech_zipenhancer_ans_multiloss_16k_base",
|
||||||
cache_dir: str = None,
|
cache_dir: str = None,
|
||||||
@@ -107,8 +107,8 @@ class VoxCPM:
|
|||||||
inference_timesteps : int = 10,
|
inference_timesteps : int = 10,
|
||||||
min_len : int = 2,
|
min_len : int = 2,
|
||||||
max_len : int = 4096,
|
max_len : int = 4096,
|
||||||
normalize : bool = True,
|
normalize : bool = False,
|
||||||
denoise : bool = True,
|
denoise : bool = False,
|
||||||
retry_badcase : bool = True,
|
retry_badcase : bool = True,
|
||||||
retry_badcase_max_times : int = 3,
|
retry_badcase_max_times : int = 3,
|
||||||
retry_badcase_ratio_threshold : float = 6.0,
|
retry_badcase_ratio_threshold : float = 6.0,
|
||||||
|
|||||||
Reference in New Issue
Block a user