5 Commits
1.0.4 ... 1.0.5

| Author | SHA1 | Message | Date |
|---|---|---|---|
| Labmem-Zhouyx | d1bb6aaf41 | update technical report | 2025-09-30 10:47:39 +08:00 |
| 刘鑫 | 2eb4d39719 | FX: Add MPS support | 2025-09-28 21:06:35 +08:00 |
| 刘鑫 | fbf8984d4e | Merge branch 'main' into dev | 2025-09-27 16:20:47 +08:00 |
| 刘鑫 | 961569e76d | merge from main | 2025-09-19 22:08:56 +08:00 |
| 刘鑫 | f26a1ea2f7 | Remove segment text logic | 2025-09-18 12:01:26 +08:00 |
3 changed files with 42 additions and 17 deletions

View File

@@ -1,7 +1,7 @@
 ## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning
-[![Project Page](https://img.shields.io/badge/Project%20Page-GitHub-blue)](https://github.com/OpenBMB/VoxCPM/) [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OpenBMB-yellow)](https://huggingface.co/openbmb/VoxCPM-0.5B) [![ModelScope](https://img.shields.io/badge/ModelScope-OpenBMB-purple)](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) [![Live Playground](https://img.shields.io/badge/Live%20PlayGround-Demo-orange)](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [![Samples](https://img.shields.io/badge/Page-Samples-red)](https://openbmb.github.io/VoxCPM-demopage)
+[![Project Page](https://img.shields.io/badge/Project%20Page-GitHub-blue)](https://github.com/OpenBMB/VoxCPM/) [![Technical Report](https://img.shields.io/badge/Technical%20Report-Arxiv-red)](https://arxiv.org/abs/2509.24650) [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OpenBMB-yellow)](https://huggingface.co/openbmb/VoxCPM-0.5B) [![ModelScope](https://img.shields.io/badge/ModelScope-OpenBMB-purple)](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) [![Live Playground](https://img.shields.io/badge/Live%20PlayGround-Demo-orange)](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [![Samples](https://img.shields.io/badge/Audio%20Samples-Page-green)](https://openbmb.github.io/VoxCPM-demopage)
@@ -16,6 +16,7 @@
 </div>
 ## News
+* [2025.09.30] 🔥 🔥 🔥 We Release VoxCPM [Technical Report](https://arxiv.org/abs/2509.24650)!
 * [2025.09.16] 🔥 🔥 🔥 We Open Source the VoxCPM-0.5B [weights](https://huggingface.co/openbmb/VoxCPM-0.5B)!
 * [2025.09.16] 🎉 🎉 🎉 We Provide the [Gradio PlayGround](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now!
@@ -195,6 +196,19 @@ Happy creating! 🎉 Start with the default settings and tweak from there to sui
 ---
+## 🌟 Community Projects
+We're excited to see the VoxCPM community growing! Here are some amazing projects and features built by our community:
+- **[ComfyUI-VoxCPM](https://github.com/wildminder/ComfyUI-VoxCPM)**
+- **[ComfyUI-VoxCPMTTS](https://github.com/1038lab/ComfyUI-VoxCPMTTS)**
+- **[WebUI-VoxCPM](https://github.com/rsxdalv/tts_webui_extension.vox_cpm)**
+- **[PR: Streaming API Support (by AbrahamSanders)](https://github.com/OpenBMB/VoxCPM/pull/26)**
+*Have you built something cool with VoxCPM? We'd love to feature it here! Please open an issue or pull request to add your project.*
 ## 📊 Performance Highlights
@@ -263,7 +277,7 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:
 ## 📝TO-DO List
 Please stay tuned for updates!
-- [ ] Release the VoxCPM technical report.
+- [x] Release the VoxCPM technical report.
 - [ ] Support higher sampling rate (next version).
@@ -294,16 +308,13 @@ This project is developed by the following institutions:
 ## 📚 Citation
-The techical report is coming soon, please wait for the release 😊
 If you find our model helpful, please consider citing our projects 📝 and staring us ⭐️!
 ```bib
-@misc{voxcpm2025,
-    author = {{Yixuan Zhou, Guoyang Zeng, Xin Liu, Xiang Li, Renjie Yu, Ziyang Wang, Runchuan Ye, Weiyue Sun, Jiancheng Gui, Kehan Li, Zhiyong Wu, Zhiyuan Liu}},
-    title = {{VoxCPM}},
+@article{voxcpm2025,
+    title = {VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning},
+    author = {Zhou, Yixuan and Zeng, Guoyang and Liu, Xin and Li, Xiang and Yu, Renjie and Wang, Ziyang and Ye, Runchuan and Sun, Weiyue and Gui, Jiancheng and Li, Kehan and Wu, Zhiyong and Liu, Zhiyuan},
+    journal = {arXiv preprint arXiv:2509.24650},
     year = {2025},
-    publish = {\url{https://github.com/OpenBMB/VoxCPM}},
-    note = {GitHub repository}
 }
 ```

View File

@@ -85,11 +85,15 @@ class VoxCPMModel(nn.Module):
         self.patch_size = config.patch_size
         self.device = config.device
         if not torch.cuda.is_available():
-            self.device = "cpu"
+            if torch.backends.mps.is_available():
+                self.device = "mps"
+            else:
+                self.device = "cpu"
+        print(f"Running on device: {self.device}, dtype: {self.config.dtype}")

         # Text-Semantic LM
         self.base_lm = MiniCPMModel(config.lm_config)
-        self.base_lm.setup_cache(1, config.max_length, self.device, get_dtype(config.dtype))
+        self.base_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

         self.text_tokenizer = mask_multichar_chinese_tokens(tokenizer)
         self.audio_start_token = 101
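The hunk above adds an Apple MPS fallback between the existing CUDA and CPU paths. Below is a minimal standalone sketch of the same CUDA → MPS → CPU selection logic; the `pick_device` helper name is illustrative and not part of the repository.

```python
import torch

def pick_device(preferred: str = "cuda") -> str:
    """Fall back from CUDA to Apple MPS to CPU, mirroring the hunk above."""
    if preferred == "cuda" and torch.cuda.is_available():
        return "cuda"
    # torch.backends.mps.is_available() is True on Apple Silicon with an MPS-enabled PyTorch build
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = pick_device()
print(f"Running on device: {device}")
```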
@@ -100,7 +104,7 @@ class VoxCPMModel(nn.Module):
         residual_lm_config.num_hidden_layers = config.residual_lm_num_layers
         residual_lm_config.vocab_size = 0
         self.residual_lm = MiniCPMModel(residual_lm_config)
-        self.residual_lm.setup_cache(1, config.max_length, self.device, get_dtype(config.dtype))
+        self.residual_lm.setup_cache(1, config.max_length, self.device, get_dtype(self.config.dtype))

         # Local Encoder
         encoder_config = config.lm_config.model_copy(deep=True)
@@ -271,7 +275,7 @@ class VoxCPMModel(nn.Module):
         text_token = text_token.unsqueeze(0).to(self.device)
         text_mask = text_mask.unsqueeze(0).to(self.device)
-        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(torch.bfloat16)
+        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
         audio_mask = audio_mask.unsqueeze(0).to(self.device)

         target_text_length = len(self.text_tokenizer(target_text))
@@ -484,7 +488,7 @@ class VoxCPMModel(nn.Module):
         text_token = text_token.unsqueeze(0).to(self.device)
         text_mask = text_mask.unsqueeze(0).to(self.device)
-        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(torch.bfloat16)
+        audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
         audio_mask = audio_mask.unsqueeze(0).to(self.device)

         # run inference
@@ -670,7 +674,7 @@ class VoxCPMModel(nn.Module):
)["state_dict"] )["state_dict"]
model = cls(config, tokenizer, audio_vae) model = cls(config, tokenizer, audio_vae)
lm_dtype = get_dtype(config.dtype) lm_dtype = get_dtype(model.config.dtype)
model = model.to(lm_dtype) model = model.to(lm_dtype)
model.audio_vae = model.audio_vae.to(torch.float32) model.audio_vae = model.audio_vae.to(torch.float32)
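The dtype hunks above replace hard-coded `torch.bfloat16` casts with `get_dtype(self.config.dtype)`, so the tensor dtype follows the model config. The diff does not show `get_dtype` itself; below is a hedged sketch of what such a string-to-dtype helper might look like, assuming it simply maps config names like `"bfloat16"` to `torch.dtype` objects (the mapping and error handling are illustrative, not the repository's implementation).

```python
import torch

# Hypothetical stand-in for the repo's get_dtype helper (not shown in this diff).
_DTYPE_MAP = {
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

def get_dtype(name: str) -> torch.dtype:
    """Map a config dtype string to the corresponding torch dtype."""
    try:
        return _DTYPE_MAP[name]
    except KeyError:
        raise ValueError(f"Unsupported dtype name: {name}")

# With such a helper, the hard-coded cast in the hunks above becomes config-driven, e.g.:
# audio_feat = audio_feat.to(device).to(get_dtype(config.dtype))
```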

View File

@@ -154,6 +154,11 @@ class MiniCPMAttention(nn.Module):
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

+        # ref: https://github.com/pytorch/pytorch/issues/163597
+        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
+        query_states = query_states.contiguous()
+        key_states = key_states.contiguous()
+        value_states = value_states.contiguous()
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
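A small self-contained illustration of the workaround above: calling `.contiguous()` on the query/key/value tensors before `torch.nn.functional.scaled_dot_product_attention`, which avoids the MPS issue tracked in pytorch/pytorch#163597. The shapes and device fallback below are illustrative only.

```python
import torch
import torch.nn.functional as F

device = "mps" if torch.backends.mps.is_available() else "cpu"

# [batch, seq, heads, head_dim] -> transpose to [batch, heads, seq, head_dim];
# the transpose yields non-contiguous views, which is the case the workaround targets.
q = torch.randn(1, 16, 8, 64, device=device).transpose(1, 2)
k = torch.randn(1, 16, 8, 64, device=device).transpose(1, 2)
v = torch.randn(1, 16, 8, 64, device=device).transpose(1, 2)

# Workaround from the hunk above: force contiguous memory before SDPA.
q, k, v = q.contiguous(), k.contiguous(), v.contiguous()

out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 8, 16, 64])
```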
@@ -198,6 +203,11 @@ class MiniCPMAttention(nn.Module):
         attn_mask = torch.arange(key_cache.size(2), device=key_cache.device) <= position_id

+        # ref: https://github.com/pytorch/pytorch/issues/163597
+        # there is a bug in MPS for non-contiguous tensors, so we need to make them contiguous
+        query_states = query_states.contiguous()
+        key_cache = key_cache.contiguous()
+        value_cache = value_cache.contiguous()
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_cache,