merge from main
README.md (23 lines changed)
@@ -39,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses
 
 
 
-
-
-
-
-
 ## Quick Start
 
 ### 🔧 Install from PyPI
@@ -55,7 +50,7 @@ By default, when you first run the script, the model will be downloaded automati
 - Download VoxCPM-0.5B
 ```
 from huggingface_hub import snapshot_download
-snapshot_download("openbmb/VoxCPM-0.5B",local_files_only=local_files_only)
+snapshot_download("openbmb/VoxCPM-0.5B")
 ```
 - Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo.
 ```
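The removed keyword referenced an undefined `local_files_only` variable, so the README snippet raised a NameError when pasted verbatim; the fixed call simply downloads the snapshot. For completeness, a minimal sketch of how `local_files_only` is meant to be used, resolving from the local cache first and only hitting the network on a miss (the import path for the error class may vary across huggingface_hub versions):

```
from huggingface_hub import snapshot_download
from huggingface_hub.utils import LocalEntryNotFoundError

def fetch_voxcpm() -> str:
    try:
        # Resolve from the local cache without touching the network.
        return snapshot_download("openbmb/VoxCPM-0.5B", local_files_only=True)
    except LocalEntryNotFoundError:
        # Not cached yet: download the full snapshot.
        return snapshot_download("openbmb/VoxCPM-0.5B")

print(fetch_voxcpm())
```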
@@ -103,6 +98,13 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi
     --output out.wav \
     --denoise
 
+# (Optional) Voice cloning (reference audio + transcript file)
+voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
+    --prompt-audio path/to/voice.wav \
+    --prompt-file "/path/to/text-file" \
+    --output out.wav \
+    --denoise
+
 # 3) Batch processing (one text per line)
 voxcpm --input examples/input.txt --output-dir outs
 # (optional) Batch + cloning
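The new `--prompt-file` flag (added to the CLI later in this diff) reads the reference transcript from a file instead of requiring it inline with `--prompt-text`. A hedged sketch of the equivalent flow through the Python API; the `from_pretrained` loader shown here is an assumption, not something this commit adds:

```
from voxcpm import VoxCPM

model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")  # assumed loader

# Read the transcript the same way the new --prompt-file flag does.
with open("path/to/transcript.txt", "r", encoding="utf-8") as f:
    prompt_text = f.read()

wav = model.generate(
    text="VoxCPM is an innovative end-to-end TTS model from ModelBest.",
    prompt_wav_path="path/to/voice.wav",  # reference audio to clone
    prompt_text=prompt_text,              # transcript loaded from the file
)
```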
@@ -245,6 +247,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:
 
 
 
+## 📝 TO-DO List
+Please stay tuned for updates!
+- [ ] Release the VoxCPM technical report.
+- [ ] Support higher sampling rate (next version).
+
+
+
 ## 📄 License
 The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license.
 
@@ -265,6 +274,8 @@ This project is developed by the following institutions:
 - <img src="assets/thuhcsi_logo.png" width="28px"> [THUHCSI](https://github.com/thuhcsi)
 
 
+## ⭐ Star History
+[](https://star-history.com/#OpenBMB/VoxCPM&Date)
 
 
 ## 📚 Citation
app.py (7 lines changed)
@@ -194,10 +194,6 @@ def create_demo_interface(demo: VoxCPMDemo):
     **调低**:合成速度更快。
     - **Higher** for better synthesis quality.
     **调高**:合成质量更佳。
-
-    ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音)
-    While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually.
-    虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。
     """)
 
     # Main controls
@@ -244,14 +240,13 @@ def create_demo_interface(demo: VoxCPMDemo):
         text = gr.Textbox(
             value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
             label="Target Text",
-            info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio."
         )
         with gr.Row():
             DoNormalizeText = gr.Checkbox(
                 value=False,
                 label="Text Normalization",
                 elem_id="chk_normalize",
-                info="We use WeTextPorcessing library to normalize the input text."
+                info="We use wetext library to normalize the input text."
             )
         audio_output = gr.Audio(label="Output Audio")
 
pyproject.toml
@@ -36,7 +36,7 @@ dependencies = [
     "addict",
     "wetext",
     "modelscope>=1.22.0",
-    "datasets>=2,<4",
+    "datasets>=3,<4",
     "huggingface-hub",
     "pydantic",
     "tqdm",
@@ -240,6 +240,7 @@ Examples:
     # Prompt audio (for voice cloning)
     parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path")
     parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio")
+    parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio")
    parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)")
 
     # Generation parameters
@@ -279,6 +280,12 @@ def main():
 
     # If prompt audio+text provided → voice cloning
     if args.prompt_audio or args.prompt_text:
+        if not args.prompt_text and args.prompt_file:
+            assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible."
+
+            with open(args.prompt_file, 'r', encoding='utf-8') as f:
+                args.prompt_text = f.read()
+
         if not args.prompt_audio or not args.prompt_text:
             print("Error: Voice cloning requires both --prompt-audio and --prompt-text")
             sys.exit(1)
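Note the ordering: the file is only consulted when `--prompt-text` is absent, so an inline transcript always takes precedence, and the existing both-or-nothing check still fires if neither source yields text. The same precedence expressed as a standalone helper (names are illustrative, not part of this commit):

```
import os
from typing import Optional

def resolve_prompt_text(prompt_text: Optional[str], prompt_file: Optional[str]) -> Optional[str]:
    """Illustrative helper: inline --prompt-text wins over --prompt-file."""
    if prompt_text:
        return prompt_text
    if prompt_file:
        assert os.path.isfile(prompt_file), "Prompt file does not exist or is not accessible."
        with open(prompt_file, "r", encoding="utf-8") as f:
            return f.read()
    return None
```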
@@ -1,6 +1,7 @@
 import torch
 import torchaudio
 import os
+import re
 import tempfile
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
@@ -130,6 +131,8 @@ class VoxCPM:
         if (prompt_wav_path is None) != (prompt_text is None):
             raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None")
 
+        text = text.replace("\n", " ")
+        text = re.sub(r'\s+', ' ', text)
         temp_prompt_wav_path = None
 
         try:
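The two added lines flatten the target text before synthesis: newlines become spaces, then any run of whitespace collapses to a single space. A quick illustration of the effect (the `replace` is technically subsumed by the regex, but mirrors the committed code):

```
import re

text = "Line one.\nLine two.\t  Line   three."
text = text.replace("\n", " ")
text = re.sub(r'\s+', ' ', text)
print(text)  # Line one. Line two. Line three.
```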
@@ -160,8 +160,8 @@ class VoxCPMModel(nn.Module):
             self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
             self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True)
         except Exception as e:
-            print(e)
-            print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
+            print(f"Error: {e}")
+            print("Warning: VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
             self.base_lm.forward_step = self.base_lm.forward_step
             self.residual_lm.forward_step = self.residual_lm.forward_step
             self.feat_encoder_step = self.feat_encoder
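The reworded messages make the fallback path self-describing: the exception is printed as an error, and the eager-mode fallback is flagged as a warning rather than logged bare. The surrounding pattern, compile the hot modules and keep the eager versions when torch.compile is unavailable, reduces to something like this sketch:

```
import torch
import torch.nn as nn

def compile_or_fallback(module: nn.Module) -> nn.Module:
    """Try torch.compile; hand back the eager module if compilation fails."""
    try:
        return torch.compile(module, mode="reduce-overhead", fullgraph=True)
    except Exception as e:
        print(f"Error: {e}")
        print("Warning: module could not be optimized by torch.compile, using eager mode")
        return module

layer = compile_or_fallback(nn.Linear(16, 16))
out = layer(torch.randn(2, 16))
```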
@@ -284,7 +284,10 @@ class VoxCPMModel(nn.Module):
                 break
             else:
                 break
-        return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+        decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+        decode_audio = decode_audio[..., 640:-640]  # trick: trim the start and end of the audio
+        return decode_audio
+
 
     @torch.inference_mode()
     def build_prompt_cache(
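Instead of returning the decoded waveform directly, the method now trims 640 samples from each end to cut boundary artifacts from the VAE decoder; the second decode path below receives the same fix, and a later hunk removes the older token-level trim (`feat_pred[..., 1:-1]`), presumably superseded by this sample-level trim. Assuming VoxCPM-0.5B's 16 kHz output rate (an assumption, not stated in this diff), 640 samples is 40 ms per side:

```
import torch

decode_audio = torch.randn(1, 16000)      # stand-in for audio_vae.decode(...) output
trimmed = decode_audio[..., 640:-640]     # drop 640 samples (~40 ms at 16 kHz) per side
print(decode_audio.shape, trimmed.shape)  # torch.Size([1, 16000]) torch.Size([1, 14720])
```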
@@ -468,6 +471,7 @@ class VoxCPMModel(nn.Module):
             else:
                 break
         decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
+        decode_audio = decode_audio[..., 640:-640]  # trick: trim the start and end of the audio
 
         return (
             decode_audio,
@@ -580,7 +584,6 @@ class VoxCPMModel(nn.Module):
         pred_feat_seq = torch.cat(pred_feat_seq, dim=1)  # b, t, p, d
 
         feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
-        feat_pred = feat_pred[..., 1:-1]  # trick: remove the first and last token
         return feat_pred, pred_feat_seq.squeeze(0).cpu()
 
     @classmethod