15 Commits
1.0.1 ... 1.0.2

Author SHA1 Message Date
周逸轩
10f48ba330 update README 2025-09-17 19:36:32 +08:00
周逸轩
639b2272ab update README 2025-09-17 19:34:08 +08:00
周逸轩
7e8f754ba1 update README 2025-09-17 19:33:37 +08:00
刘鑫
032c7fe403 capture torch compile error 2025-09-17 18:09:09 +08:00
刘鑫
5390a47862 Merge branch 'dev'; Replace the text normalization library 2025-09-16 22:17:30 +08:00
刘鑫
e7012f1a94 Replace the text normalization library 2025-09-16 22:17:14 +08:00
刘鑫
82332cfc99 Replace the text normalization library 2025-09-16 22:17:14 +08:00
刘鑫
605ac2d8e4 Replace the text normalization library 2025-09-16 22:16:40 +08:00
周逸轩
0fa8d894d1 update README 2025-09-16 21:33:57 +08:00
周逸轩
776c0d19fb FX: typo 2025-09-16 19:40:27 +08:00
周逸轩
ed6e6b4dac FX: typo 2025-09-16 19:37:55 +08:00
周逸轩
e3108d4a12 FX: typo 2025-09-16 19:36:17 +08:00
周逸轩
59fe3f30a1 update README 2025-09-16 19:05:00 +08:00
周逸轩
6f2fb45756 ModelScope 2025-09-16 17:12:52 +08:00
周逸轩
91128d823d ModelScope 2025-09-16 17:12:52 +08:00
7 changed files with 35 additions and 78 deletions

View File

@@ -1,13 +1,20 @@
## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning ## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning
[![Project Page](https://img.shields.io/badge/Project%20Page-GitHub-blue)](https://github.com/OpenBMB/VoxCPM/) [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OpenBMB-yellow)](https://huggingface.co/openbmb/VoxCPM-0.5B) [![Live Playground](https://img.shields.io/badge/Live%20PlayGround-Demo-orange)](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [![Samples](https://img.shields.io/badge/Page-Samples-red)](https://thuhcsi.github.io/VoxCPM/) [![Project Page](https://img.shields.io/badge/Project%20Page-GitHub-blue)](https://github.com/OpenBMB/VoxCPM/) [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-OpenBMB-yellow)](https://huggingface.co/openbmb/VoxCPM-0.5B) [![ModelScope](https://img.shields.io/badge/ModelScope-OpenBMB-purple)](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) [![Live Playground](https://img.shields.io/badge/Live%20PlayGround-Demo-orange)](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [![Samples](https://img.shields.io/badge/Page-Samples-red)](https://openbmb.github.io/VoxCPM-demopage)
<div align="center"> <div align="center">
<img src="assets/voxcpm_logo.png" alt="VoxCPM Logo" width="40%"> <img src="assets/voxcpm_logo.png" alt="VoxCPM Logo" width="40%">
</div> </div>
<div align="center">
👋 Contact us on [WeChat](assets/wechat.png)
</div>
## News ## News
* [2025.09.16] 🔥 🔥 🔥 We Open Source the VoxCPM-0.5B [weights](https://huggingface.co/openbmb/VoxCPM-0.5B)! * [2025.09.16] 🔥 🔥 🔥 We Open Source the VoxCPM-0.5B [weights](https://huggingface.co/openbmb/VoxCPM-0.5B)!
* [2025.09.16] 🎉 🎉 🎉 We Provide the [Gradio PlayGround](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now! * [2025.09.16] 🎉 🎉 🎉 We Provide the [Gradio PlayGround](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now!
@@ -32,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses
## Quick Start ## Quick Start
### 🔧 Install from PyPI ### 🔧 Install from PyPI
@@ -238,6 +240,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:
## 📝 TO-DO List
Please stay tuned for updates!
- [ ] Release the VoxCPM technical report.
- [ ] Support higher sampling rate (next version).
## 📄 License ## 📄 License
The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license. The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license.
@@ -262,6 +271,8 @@ This project is developed by the following institutions:
## 📚 Citation ## 📚 Citation
The technical report is coming soon, please wait for the release 😊
If you find our model helpful, please consider citing our projects 📝 and starring us ⭐️! If you find our model helpful, please consider citing our projects 📝 and starring us ⭐️!
```bib ```bib

2
app.py
View File

@@ -170,7 +170,7 @@ def create_demo_interface(demo: VoxCPMDemo):
# Pro Tips # Pro Tips
with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"): with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
gr.Markdown(f""" gr.Markdown("""
### Prompt Speech Enhancement参考语音降噪 ### Prompt Speech Enhancement参考语音降噪
- **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component. - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
**启用**:通过 ZipEnhancer 组件消除背景噪音,获得更好的音质。 **启用**:通过 ZipEnhancer 组件消除背景噪音,获得更好的音质。

BIN
assets/wechat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.5 KiB

View File

@@ -34,7 +34,7 @@ dependencies = [
"gradio", "gradio",
"inflect", "inflect",
"addict", "addict",
"WeTextProcessing", "wetext",
"modelscope>=1.22.0", "modelscope>=1.22.0",
"datasets>=2,<4", "datasets>=2,<4",
"huggingface-hub", "huggingface-hub",

View File

@@ -4,8 +4,6 @@ import os
import tempfile import tempfile
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from .model.voxcpm import VoxCPMModel from .model.voxcpm import VoxCPMModel
from .utils.text_normalize import TextNormalizer
class VoxCPM: class VoxCPM:
def __init__(self, def __init__(self,
@@ -25,7 +23,7 @@ class VoxCPM:
""" """
print(f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}") print(f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}")
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path) self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
self.text_normalizer = TextNormalizer() self.text_normalizer = None
if enable_denoiser and zipenhancer_model_path is not None: if enable_denoiser and zipenhancer_model_path is not None:
from .zipenhancer import ZipEnhancer from .zipenhancer import ZipEnhancer
self.denoiser = ZipEnhancer(zipenhancer_model_path) self.denoiser = ZipEnhancer(zipenhancer_model_path)
@@ -33,7 +31,8 @@ class VoxCPM:
self.denoiser = None self.denoiser = None
print("Warm up VoxCPMModel...") print("Warm up VoxCPMModel...")
self.tts_model.generate( self.tts_model.generate(
target_text="Hello, this is the first test sentence." target_text="Hello, this is the first test sentence.",
max_len=10,
) )
@classmethod @classmethod
@@ -145,6 +144,9 @@ class VoxCPM:
continue continue
print("sub_text:", sub_text) print("sub_text:", sub_text)
if normalize: if normalize:
if self.text_normalizer is None:
from .utils.text_normalize import TextNormalizer
self.text_normalizer = TextNormalizer()
sub_text = self.text_normalizer.normalize(sub_text) sub_text = self.text_normalizer.normalize(sub_text)
wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache( wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache(
target_text=sub_text, target_text=sub_text,

View File

@@ -148,12 +148,15 @@ class VoxCPMModel(nn.Module):
def optimize(self): def optimize(self):
if self.device == "cuda": try:
if self.device != "cuda":
raise ValueError("VoxCPMModel can only be optimized on CUDA device")
self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True) self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True)
self.residual_lm.forward_step = torch.compile(self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True) self.residual_lm.forward_step = torch.compile(self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True)
self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True) self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True) self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True)
else: except:
print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
self.base_lm.forward_step = self.base_lm.forward_step self.base_lm.forward_step = self.base_lm.forward_step
self.residual_lm.forward_step = self.residual_lm.forward_step self.residual_lm.forward_step = self.residual_lm.forward_step
self.feat_encoder_step = self.feat_encoder self.feat_encoder_step = self.feat_encoder

View File

@@ -3,40 +3,7 @@ import re
import regex import regex
import inflect import inflect
from functools import partial from functools import partial
from tn.chinese.normalizer import Normalizer as ZhNormalizer from wetext import Normalizer
from tn.english.normalizer import Normalizer as EnNormalizer
def normal_cut_sentence(text):
# 先处理括号内的逗号,将其替换为特殊标记
text = re.sub(r'([(][^)]*)([,])([^)]*[)])', r'\1&&&\3', text)
text = re.sub('([。!,?\?])([^’”])',r'\1\n\2',text)#普通断句符号且后面没有引号
text = re.sub('(\.{6})([^’”])',r'\1\n\2',text)#英文省略号且后面没有引号
text = re.sub('(\{2})([^’”])',r'\1\n\2',text)#中文省略号且后面没有引号
text = re.sub('([. ,。!;?\?\.{6}\{2}][’”])([^’”])',r'\1\n\2',text)#断句号+引号且后面没有引号
# 处理英文句子的分隔
text = re.sub(r'([.,!?])([^’”\'"])', r'\1\n\2', text) # 句号、感叹号、问号后面没有引号
text = re.sub(r'([.!?][’”\'"])([^’”\'"])', r'\1\n\2', text) # 句号、感叹号、问号加引号后面的部分
text = re.sub(r'([(][^)]*)(&&&)([^)]*[)])', r'\1\3', text)
text = [t for t in text.split("\n") if t]
return text
def cut_sentence_with_fix_length(text : str, length : int):
sentences = normal_cut_sentence(text)
cur_length = 0
res = ""
for sentence in sentences:
if not sentence:
continue
if cur_length > length or cur_length + len(sentence) > length:
yield res
res = ""
cur_length = 0
res += sentence
cur_length += len(sentence)
if res:
yield res
chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+') chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')
@@ -195,8 +162,8 @@ def clean_text(text):
class TextNormalizer: class TextNormalizer:
def __init__(self, tokenizer=None): def __init__(self, tokenizer=None):
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, remove_interjections=False, overwrite_cache=True) self.zh_tn_model = Normalizer(lang="zh", operator="tn", remove_erhua=True)
self.en_tn_model = EnNormalizer() self.en_tn_model = Normalizer(lang="en", operator="tn")
self.inflect_parser = inflect.engine() self.inflect_parser = inflect.engine()
def normalize(self, text, split=False): def normalize(self, text, split=False):
@@ -207,38 +174,12 @@ class TextNormalizer:
text = text.replace("=", "等于") # 修复 ”550 + 320 等于 870 千卡。“ 被错误正则为 ”五百五十加三百二十等于八七十千卡.“ text = text.replace("=", "等于") # 修复 ”550 + 320 等于 870 千卡。“ 被错误正则为 ”五百五十加三百二十等于八七十千卡.“
if re.search(r'([\d$%^*_+≥≤≠×÷?=])', text): # 避免 英文连字符被错误正则为减 if re.search(r'([\d$%^*_+≥≤≠×÷?=])', text): # 避免 英文连字符被错误正则为减
text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text) # 修复 x-2 被正则为 x负2 text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text) # 修复 x-2 被正则为 x负2
text = self.zh_tn_model.normalize(text)
text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text) # 修复 x-2 被正则为 x负2
text = self.zh_tn_model.normalize(text) text = self.zh_tn_model.normalize(text)
text = replace_blank(text) text = replace_blank(text)
text = replace_corner_mark(text) text = replace_corner_mark(text)
text = remove_bracket(text) text = remove_bracket(text)
text = re.sub(r'[,]+$', '', text)
else: else:
text = self.en_tn_model.normalize(text) text = self.en_tn_model.normalize(text)
text = spell_out_number(text, self.inflect_parser) text = spell_out_number(text, self.inflect_parser)
if split is False: if split is False:
return text return text
if __name__ == "__main__":
text_normalizer = TextNormalizer()
text = r"""今天我们学习一元二次方程。一元二次方程的标准形式是:
ax2+bx+c=0ax^2 + bx + c = 0ax2+bx+c=0
其中aaa、bbb 和 ccc 是常数xxx 是变量。这个方程的解可以通过求根公式来找到。
一元二次方程的解法有几种:
- 因式分解法通过将方程因式分解来求解。我们首先尝试将方程表达成两个括号的形式解决方程的解。比如方程x25x+6=0x^2 - 5x + 6 = 0x25x+6=0可以因式分解为(x2)(x3)=0(x - 2)(x - 3) = 0(x2)(x3)=0因此根为2和3。
- 配方法:通过配方将方程转化为完全平方的形式,从而解出。我们通过加上或减去适当的常数来完成这一过程,使得方程可以直接写成一个完全平方的形式。
- 求根公式:我们可以使用求根公式直接求出方程的解。这个公式适用于所有的一元二次方程,即使我们无法通过因式分解或配方法来解决时,也能使用该公式。
公式x=b±b24ac2ax = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}x=2ab±b24ac这个公式可以帮助我们求解任何一元二次方程的根。
对于一元二次方程,我们需要了解判别式。判别式的作用是帮助我们判断方程的解的个数和性质。判别式 Δ\DeltaΔ 由下式给出:Δ=b24ac\Delta = b^2 - 4acΔ=b24ac 根据判别式的值,我们可以知道:
- 如果 Δ>0\Delta > 0Δ>0方程有两个不相等的实数解。这是因为判别式大于0时根号内的值是正数所以我们可以得到两个不同的解。
- 如果 Δ=0\Delta = 0Δ=0方程有一个实数解。这是因为根号内的值为零导致两个解相等也就是说方程有一个解。
- 如果 Δ<0\Delta < 0Δ<0方程没有实数解。这意味着根号内的值是负数无法进行实数运算因此方程没有实数解可能有复数解。"""
texts = ["这是一个公式 (a+b)³=a³+3a²b+3ab²+b³ S=(a×b)÷2", "这样的发展为AI仅仅作为“工具”这一观点提出了新的挑战", "550 + 320 = 870千卡。", "解一元二次方程3x^2+x-2=0", "你好啊"]
texts = [text]
for text in texts:
text = text_normalizer.normalize(text)
print(text)
for t in cut_sentence_with_fix_length(text, 15):
print(t)