diff --git a/README.md b/README.md index bc25618..f81bc57 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses - - - - - ## Quick Start ### 🔧 Install from PyPI @@ -55,7 +50,7 @@ By default, when you first run the script, the model will be downloaded automati - Download VoxCPM-0.5B ``` from huggingface_hub import snapshot_download - snapshot_download("openbmb/VoxCPM-0.5B",local_files_only=local_files_only) + snapshot_download("openbmb/VoxCPM-0.5B") ``` - Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo. ``` @@ -103,6 +98,13 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi --output out.wav \ --denoise +# (Optional) Voice cloning (reference audio + transcript file) +voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \ + --prompt-audio path/to/voice.wav \ + --prompt-file "/path/to/text-file" \ + --output out.wav \ + --denoise + # 3) Batch processing (one text per line) voxcpm --input examples/input.txt --output-dir outs # (optional) Batch + cloning @@ -245,6 +247,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks: +## 📝TO-DO List +Please stay tuned for updates! +- [ ] Release the VoxCPM technical report. +- [ ] Support higher sampling rate (next version). + + + ## 📄 License The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license. 
@@ -265,6 +274,8 @@ This project is developed by the following institutions: - [THUHCSI](https://github.com/thuhcsi) +## ⭐ Star History + [![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date) ## 📚 Citation diff --git a/app.py b/app.py index 3f64801..f109c09 100644 --- a/app.py +++ b/app.py @@ -194,10 +194,6 @@ def create_demo_interface(demo: VoxCPMDemo): **调低**:合成速度更快。 - **Higher** for better synthesis quality. **调高**:合成质量更佳。 - - ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音) - While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually. - 虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。 """) # Main controls @@ -244,14 +240,13 @@ def create_demo_interface(demo: VoxCPMDemo): text = gr.Textbox( value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.", label="Target Text", - info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio." ) with gr.Row(): DoNormalizeText = gr.Checkbox( value=False, label="Text Normalization", elem_id="chk_normalize", - info="We use WeTextPorcessing library to normalize the input text." + info="We use wetext library to normalize the input text." 
) audio_output = gr.Audio(label="Output Audio") diff --git a/pyproject.toml b/pyproject.toml index dfb3399..8f9d5ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "addict", "wetext", "modelscope>=1.22.0", - "datasets>=2,<4", + "datasets>=3,<4", "huggingface-hub", "pydantic", "tqdm", diff --git a/src/voxcpm/cli.py b/src/voxcpm/cli.py index 801266f..f58e8b1 100644 --- a/src/voxcpm/cli.py +++ b/src/voxcpm/cli.py @@ -240,6 +240,7 @@ Examples: # Prompt audio (for voice cloning) parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path") parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio") + parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio") parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)") # Generation parameters @@ -279,6 +280,12 @@ def main(): # If prompt audio+text provided → voice cloning if args.prompt_audio or args.prompt_text: + if not args.prompt_text and args.prompt_file: + assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible." 
+ + with open(args.prompt_file, 'r', encoding='utf-8') as f: + args.prompt_text = f.read() + if not args.prompt_audio or not args.prompt_text: print("Error: Voice cloning requires both --prompt-audio and --prompt-text") sys.exit(1) diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index 7ff1d08..3b88b55 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -1,6 +1,7 @@ import torch import torchaudio import os +import re import tempfile from huggingface_hub import snapshot_download from .model.voxcpm import VoxCPMModel @@ -130,6 +131,8 @@ class VoxCPM: if (prompt_wav_path is None) != (prompt_text is None): raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + text = text.replace("\n", " ") + text = re.sub(r'\s+', ' ', text) temp_prompt_wav_path = None try: diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index 3af0af9..1f5fdec 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -160,8 +160,8 @@ class VoxCPMModel(nn.Module): self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True) self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True) except Exception as e: - print(e) - print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") + print(f"Error: {e}") + print("Warning: VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") self.base_lm.forward_step = self.base_lm.forward_step self.residual_lm.forward_step = self.residual_lm.forward_step self.feat_encoder_step = self.feat_encoder @@ -283,8 +283,11 @@ class VoxCPMModel(nn.Module): else: break else: - break - return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + break + + decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of 
the audio + return decode_audio @torch.inference_mode() def build_prompt_cache( @@ -468,7 +471,8 @@ class VoxCPMModel(nn.Module): else: break decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() - + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio + return ( decode_audio, target_text_token, @@ -580,7 +584,6 @@ class VoxCPMModel(nn.Module): pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size) - feat_pred = feat_pred[..., 1:-1] # trick: remove the first and last token return feat_pred, pred_feat_seq.squeeze(0).cpu() @classmethod