merge from main

刘鑫
2025-09-19 22:08:56 +08:00
6 changed files with 38 additions and 19 deletions

README.md

@@ -39,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses
## Quick Start
### 🔧 Install from PyPI
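The install command itself falls outside this hunk's context; from the heading, it is presumably the PyPI package:
```
pip install voxcpm
```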
@@ -55,7 +50,7 @@ By default, when you first run the script, the model will be downloaded automati
- Download VoxCPM-0.5B
```
from huggingface_hub import snapshot_download
-snapshot_download("openbmb/VoxCPM-0.5B",local_files_only=local_files_only)
+snapshot_download("openbmb/VoxCPM-0.5B")
```
- Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo.
```
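The hunk ends just inside the next code fence, cutting off the ZipEnhancer/SenseVoice download block. A minimal sketch of both downloads; the ModelScope model IDs are assumptions, not visible in this hunk:
```
# VoxCPM weights from the Hugging Face Hub (cached after the first run).
from huggingface_hub import snapshot_download
snapshot_download("openbmb/VoxCPM-0.5B")
# huggingface_hub still accepts local_files_only for offline reuse of a cache:
# snapshot_download("openbmb/VoxCPM-0.5B", local_files_only=True)

# Assumed ModelScope IDs for the web-demo helpers (denoiser + prompt ASR).
from modelscope import snapshot_download as ms_snapshot_download
ms_snapshot_download("iic/speech_zipenhancer_ans_multiloss_16k_base")
ms_snapshot_download("iic/SenseVoiceSmall")
```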
@@ -103,6 +98,13 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi
--output out.wav \
--denoise
+# (Optional) Voice cloning (reference audio + transcript file)
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
+--prompt-audio path/to/voice.wav \
+--prompt-file "/path/to/text-file" \
+--output out.wav \
+--denoise
# 3) Batch processing (one text per line)
voxcpm --input examples/input.txt --output-dir outs
# (optional) Batch + cloning
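The batch-plus-cloning command itself is cut off at the hunk boundary; given the flags shown above, it would presumably combine the two modes along these lines:
```
voxcpm --input examples/input.txt --output-dir outs \
--prompt-audio path/to/voice.wav \
--prompt-text "reference transcript"
```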
@@ -245,6 +247,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:
+## 📝 TO-DO List
+Please stay tuned for updates!
+- [ ] Release the VoxCPM technical report.
+- [ ] Support higher sampling rate (next version).
## 📄 License
The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license.
@@ -265,6 +274,8 @@ This project is developed by the following institutions:
- <img src="assets/thuhcsi_logo.png" width="28px"> [THUHCSI](https://github.com/thuhcsi)
+## ⭐ Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date)
## 📚 Citation

app.py

@@ -194,10 +194,6 @@ def create_demo_interface(demo: VoxCPMDemo):
**调低**:合成速度更快。
- **Higher** for better synthesis quality.
**调高**:合成质量更佳。
-### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音)
-While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually.
-虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。
""")
# Main controls
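The deleted guidance told users to break very long text into paragraphs with empty lines, each synthesized separately. A minimal sketch of that behavior, following the English wording (blank-line splitting); the helper name is mine, not from the repo:
```
import re

def split_paragraphs(text: str) -> list[str]:
    # Split on blank lines and drop empty chunks; each paragraph would then
    # be synthesized on its own and the resulting audio concatenated.
    return [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]

print(split_paragraphs("Intro paragraph.\n\nSecond paragraph.\n\n\nThird."))
# ['Intro paragraph.', 'Second paragraph.', 'Third.']
```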
@@ -244,14 +240,13 @@ def create_demo_interface(demo: VoxCPMDemo):
text = gr.Textbox(
value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
label="Target Text",
-info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio."
)
with gr.Row():
DoNormalizeText = gr.Checkbox(
value=False,
label="Text Normalization",
elem_id="chk_normalize",
-info="We use WeTextProcessing library to normalize the input text."
+info="We use the wetext library to normalize the input text."
)
audio_output = gr.Audio(label="Output Audio")

pyproject.toml

@@ -36,7 +36,7 @@ dependencies = [
"addict", "addict",
"wetext", "wetext",
"modelscope>=1.22.0", "modelscope>=1.22.0",
"datasets>=2,<4", "datasets>=3,<4",
"huggingface-hub", "huggingface-hub",
"pydantic", "pydantic",
"tqdm", "tqdm",

src/voxcpm/cli.py

@@ -240,6 +240,7 @@ Examples:
# Prompt audio (for voice cloning)
parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path")
parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio")
+parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio")
parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)")
# Generation parameters
@@ -279,6 +280,12 @@ def main():
# If prompt audio+text provided → voice cloning
if args.prompt_audio or args.prompt_text:
+if not args.prompt_text and args.prompt_file:
+assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible."
+with open(args.prompt_file, 'r', encoding='utf-8') as f:
+args.prompt_text = f.read()
if not args.prompt_audio or not args.prompt_text:
print("Error: Voice cloning requires both --prompt-audio and --prompt-text")
sys.exit(1)
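A standalone sketch of the new fallback logic, pulled out of `main()` for clarity; the trailing `.strip()` is my addition and is not in the diff:
```
import os

def resolve_prompt_text(prompt_text: str | None, prompt_file: str | None) -> str | None:
    """Prefer an inline --prompt-text; otherwise read it from --prompt-file."""
    if prompt_text:
        return prompt_text
    if prompt_file:
        assert os.path.isfile(prompt_file), "Prompt file does not exist or is not accessible."
        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read().strip()  # strip() is an assumption, not in the diff
    return None
```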

src/voxcpm/core.py

@@ -1,6 +1,7 @@
import torch
import torchaudio
import os
+import re
import tempfile
from huggingface_hub import snapshot_download
from .model.voxcpm import VoxCPMModel
@@ -130,6 +131,8 @@ class VoxCPM:
if (prompt_wav_path is None) != (prompt_text is None):
raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None")
+text = text.replace("\n", " ")
+text = re.sub(r'\s+', ' ', text)
temp_prompt_wav_path = None
try:
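The two added lines flatten the target text before synthesis: newlines become spaces, then any whitespace run collapses to a single space. A quick demonstration of the effect:
```
import re

text = "line one\nline two\n\n  spaced   out"
text = text.replace("\n", " ")
text = re.sub(r"\s+", " ", text)
print(repr(text))  # 'line one line two spaced out'
```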

src/voxcpm/model/voxcpm.py

@@ -160,8 +160,8 @@ class VoxCPMModel(nn.Module):
self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True)
except Exception as e:
-print(e)
-print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
+print(f"Error: {e}")
+print("Warning: VoxCPMModel cannot be optimized by torch.compile, using original forward_step functions")
self.base_lm.forward_step = self.base_lm.forward_step
self.residual_lm.forward_step = self.residual_lm.forward_step
self.feat_encoder_step = self.feat_encoder
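The pattern in this hunk, compile with a fallback to eager execution, in isolation; note that `torch.compile` is lazy, so some failures only surface on the first call, which is why the original also rebinds the uncompiled functions:
```
import torch

def maybe_compile(module: torch.nn.Module) -> torch.nn.Module:
    # Try to compile; fall back to the eager module if compilation is
    # unsupported on this platform or PyTorch build.
    try:
        return torch.compile(module, mode="reduce-overhead", fullgraph=True)
    except Exception as e:
        print(f"Error: {e}")
        print("Warning: falling back to eager execution")
        return module
```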
@@ -284,7 +284,10 @@ class VoxCPMModel(nn.Module):
break
else:
break
-return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
+return decode_audio
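The new trim drops 640 samples from each end of the decoded waveform; at a 16 kHz output rate (an assumption, the rate is not stated in this hunk) that is 40 ms per side. In isolation:
```
import torch

TRIM = 640  # samples; 40 ms at an assumed 16 kHz
audio = torch.randn(1, 32000)     # stand-in for audio_vae.decode(...) output
trimmed = audio[..., TRIM:-TRIM]  # remove both edges along the time axis
print(trimmed.shape)              # torch.Size([1, 30720])
```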
@torch.inference_mode()
def build_prompt_cache(
@@ -468,6 +471,7 @@ class VoxCPMModel(nn.Module):
else:
break
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
return (
decode_audio,
@@ -580,7 +584,6 @@ class VoxCPMModel(nn.Module):
pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d
feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
-feat_pred = feat_pred[..., 1:-1] # trick: remove the first and last token
return feat_pred, pred_feat_seq.squeeze(0).cpu()
@classmethod
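For reference, the patch-to-frame reshape on the context line above, run with toy sizes; the deleted `[..., 1:-1]` edge trim at the feature level is what the waveform-level 640-sample trim replaces:
```
import torch
from einops import rearrange

B, t, p, d = 1, 4, 2, 3                 # batch, patch groups, patch size, feat dim
pred_feat_seq = torch.randn(B, t, p, d)
feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=p)
print(feat_pred.shape)                  # torch.Size([1, 3, 8])
```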