Mirror of https://github.com/OpenBMB/VoxCPM, synced 2025-12-12 03:48:12 +00:00

Compare commits
15 Commits
- 10f48ba330
- 639b2272ab
- 7e8f754ba1
- 032c7fe403
- 5390a47862
- e7012f1a94
- 82332cfc99
- 605ac2d8e4
- 0fa8d894d1
- 776c0d19fb
- ed6e6b4dac
- e3108d4a12
- 59fe3f30a1
- 6f2fb45756
- 91128d823d
README.md (23 lines changed)
````diff
@@ -1,13 +1,20 @@
 ## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning

-[GitHub](https://github.com/OpenBMB/VoxCPM/) [Hugging Face](https://huggingface.co/openbmb/VoxCPM-0.5B) [Demo](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [Demo Page](https://thuhcsi.github.io/VoxCPM/)
+[GitHub](https://github.com/OpenBMB/VoxCPM/) [Hugging Face](https://huggingface.co/openbmb/VoxCPM-0.5B) [ModelScope](https://modelscope.cn/models/OpenBMB/VoxCPM-0.5B) [Demo](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [Demo Page](https://openbmb.github.io/VoxCPM-demopage)

 <div align="center">
   <img src="assets/voxcpm_logo.png" alt="VoxCPM Logo" width="40%">
 </div>

+<div align="center">
+  👋 Contact us on [WeChat](assets/wechat.png)
+</div>

 ## News
 * [2025.09.16] 🔥 🔥 🔥 We open-source the VoxCPM-0.5B [weights](https://huggingface.co/openbmb/VoxCPM-0.5B)!
 * [2025.09.16] 🎉 🎉 🎉 We provide a [Gradio Playground](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now!
@@ -32,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses
 ## Quick Start

 ### 🔧 Install from PyPI
@@ -238,6 +240,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:

+## 📝 TO-DO List
+Please stay tuned for updates!
+- [ ] Release the VoxCPM technical report.
+- [ ] Support a higher sampling rate (next version).
+
 ## 📄 License
 The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license.
@@ -262,6 +271,8 @@ This project is developed by the following institutions:
 ## 📚 Citation

+The technical report is coming soon; please wait for the release 😊
+
 If you find our model helpful, please consider citing our projects 📝 and starring us ⭐️!

 ```bib
````
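The Quick Start section referenced above pairs with the `VoxCPM` wrapper class changed later in this comparison. As rough orientation, here is a minimal usage sketch based only on the API surface visible in these diffs (`VoxCPM.from_pretrained`, generation driven by `target_text`, and a `normalize` flag); the top-level import, the model id being accepted here, the return value, and the 16 kHz rate are assumptions, not confirmed by this page:

```python
# Minimal usage sketch -- NOT the documented API. Everything beyond
# from_pretrained / target_text / normalize is an assumption.
import soundfile as sf  # assumed helper for writing audio to disk

from voxcpm import VoxCPM  # import path assumed from the package layout

model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")  # HF repo id, assumed accepted here
wav = model.generate(
    target_text="VoxCPM is a tokenizer-free TTS system.",
    normalize=True,  # routes text through TextNormalizer, per the core diff below
)
sf.write("output.wav", wav, 16000)  # sample rate assumed; the TO-DO list says "higher" is still pending
```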
app.py (2 lines changed)
````diff
@@ -170,7 +170,7 @@ def create_demo_interface(demo: VoxCPMDemo):
     # Pro Tips
     with gr.Accordion("💡 Pro Tips |使用建议", open=False, elem_id="acc_tips"):
-        gr.Markdown(f"""
+        gr.Markdown("""
         ### Prompt Speech Enhancement|参考语音降噪
         - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
           **启用**:通过 ZipEnhancer 组件消除背景噪音,获得更好的音质。
````
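Dropping the `f` prefix here is more than style: an f-string would try to interpolate every `{...}` in the tips Markdown, while a plain triple-quoted string keeps braces literal. A standalone illustration (the `note` variable is mine, purely for demonstration):

```python
note = "Enable denoising for cleaner prompt audio."

# f-string: "{note}" is an interpolation site, so the name must exist
dynamic_tips = f"### Tips\n{note}"

# plain string: "{note}" stays literal text, which is what a static
# Markdown block wants -- hence gr.Markdown("""...""") without the f
static_tips = "### Tips\n{note}"

print(dynamic_tips)  # ### Tips / Enable denoising for cleaner prompt audio.
print(static_tips)   # ### Tips / {note}
```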
assets/wechat.png: new binary file (9.5 KiB), binary content not shown.
pyproject.toml

````diff
@@ -34,7 +34,7 @@ dependencies = [
     "gradio",
     "inflect",
     "addict",
-    "WeTextProcessing",
+    "wetext",
     "modelscope>=1.22.0",
     "datasets>=2,<4",
     "huggingface-hub",
````
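The import-level counterpart of this dependency swap appears in the src/voxcpm/utils/text_normalize.py diff further down, where the two WeTextProcessing normalizers are replaced by wetext's single `Normalizer` class; a short sketch of that API follows that diff.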
src/voxcpm/core.py

````diff
@@ -4,8 +4,6 @@ import os
 import tempfile
 from huggingface_hub import snapshot_download
 from .model.voxcpm import VoxCPMModel
-from .utils.text_normalize import TextNormalizer


 class VoxCPM:
     def __init__(self,
@@ -25,7 +23,7 @@ class VoxCPM:
         """
         print(f"voxcpm_model_path: {voxcpm_model_path}, zipenhancer_model_path: {zipenhancer_model_path}, enable_denoiser: {enable_denoiser}")
         self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
-        self.text_normalizer = TextNormalizer()
+        self.text_normalizer = None
         if enable_denoiser and zipenhancer_model_path is not None:
             from .zipenhancer import ZipEnhancer
             self.denoiser = ZipEnhancer(zipenhancer_model_path)
@@ -33,8 +31,9 @@
             self.denoiser = None
         print("Warm up VoxCPMModel...")
         self.tts_model.generate(
-            target_text="Hello, this is the first test sentence."
-        )
+            target_text="Hello, this is the first test sentence.",
+            max_len=10,
+        )

     @classmethod
     def from_pretrained(cls,
@@ -145,6 +144,9 @@
                 continue
             print("sub_text:", sub_text)
             if normalize:
+                if self.text_normalizer is None:
+                    from .utils.text_normalize import TextNormalizer
+                    self.text_normalizer = TextNormalizer()
                 sub_text = self.text_normalizer.normalize(sub_text)
             wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache(
                 target_text=sub_text,
````
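Together these hunks make `TextNormalizer` lazily constructed: the import and the object are paid for only on the first `normalize=True` call, and the warm-up `generate` is capped at `max_len=10`, both presumably to cut start-up cost. A minimal sketch of the same lazy-initialization pattern, with hypothetical names and an absolute import path that this page does not confirm:

```python
class LazyNormalizer:
    """Sketch of the lazy-init pattern from the diff: defer a costly
    dependency until first use instead of building it in __init__."""

    def __init__(self) -> None:
        self._impl = None  # nothing imported or constructed yet

    def normalize(self, text: str) -> str:
        if self._impl is None:
            # heavy import and construction happen once, on first call
            from voxcpm.utils.text_normalize import TextNormalizer
            self._impl = TextNormalizer()
        return self._impl.normalize(text)
```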
src/voxcpm/model/voxcpm.py

````diff
@@ -148,12 +148,15 @@ class VoxCPMModel(nn.Module):

     def optimize(self):
-        if self.device == "cuda":
+        try:
+            if self.device != "cuda":
+                raise ValueError("VoxCPMModel can only be optimized on CUDA device")
             self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True)
             self.residual_lm.forward_step = torch.compile(self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True)
             self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True)
             self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True)
-        else:
+        except:
             print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions")
             self.base_lm.forward_step = self.base_lm.forward_step
             self.residual_lm.forward_step = self.residual_lm.forward_step
             self.feat_encoder_step = self.feat_encoder
````
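The rewrite turns a hard device check into graceful degradation: any `torch.compile` failure (unsupported backend, missing compiler toolchain, non-CUDA device) now falls back to the eager functions instead of raising. A standalone sketch of the same pattern, with a hypothetical helper name; the narrow `except Exception` is my tightening of the diff's bare `except:`:

```python
import torch

def compile_or_eager(fn, device: str):
    """Sketch of optimize()'s pattern: try torch.compile, fall back to
    the eager callable if compilation is unavailable on this setup."""
    try:
        if device != "cuda":
            raise ValueError("compilation path is only enabled on CUDA here")
        return torch.compile(fn, mode="reduce-overhead", fullgraph=True)
    except Exception as exc:
        print(f"torch.compile unavailable ({exc}); using the eager function")
        return fn

# usage: step = compile_or_eager(model.forward_step, device)
```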
src/voxcpm/utils/text_normalize.py

````diff
@@ -3,41 +3,8 @@ import re
 import regex
 import inflect
 from functools import partial
-from tn.chinese.normalizer import Normalizer as ZhNormalizer
-from tn.english.normalizer import Normalizer as EnNormalizer
+from wetext import Normalizer


-def normal_cut_sentence(text):
-    # First protect commas inside brackets by swapping them for a placeholder
-    text = re.sub(r'([((][^))]*)([,,])([^))]*[))])', r'\1&&&\3', text)
-    text = re.sub('([。!,?\?])([^’”])', r'\1\n\2', text)  # ordinary sentence-final punctuation not followed by a closing quote
-    text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # six-dot (English-style) ellipsis not followed by a closing quote
-    text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # Chinese ellipsis not followed by a closing quote
-    text = re.sub('([. ,。!;?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # punctuation plus closing quote, not followed by another quote
-    # Split English sentences
-    text = re.sub(r'([.,!?])([^’”\'"])', r'\1\n\2', text)  # period/comma/!/? not followed by a quote
-    text = re.sub(r'([.!?][’”\'"])([^’”\'"])', r'\1\n\2', text)  # sentence punctuation plus quote, splitting off what follows
-    text = re.sub(r'([((][^))]*)(&&&)([^))]*[))])', r'\1,\3', text)  # restore the protected in-bracket commas
-    text = [t for t in text.split("\n") if t]
-    return text
-
-
-def cut_sentence_with_fix_length(text: str, length: int):
-    sentences = normal_cut_sentence(text)
-    cur_length = 0
-    res = ""
-    for sentence in sentences:
-        if not sentence:
-            continue
-        if cur_length > length or cur_length + len(sentence) > length:
-            yield res
-            res = ""
-            cur_length = 0
-        res += sentence
-        cur_length += len(sentence)
-    if res:
-        yield res
-
-
 chinese_char_pattern = re.compile(r'[\u4e00-\u9fff]+')

 # whether the text contains any Chinese character
````
````diff
@@ -195,8 +162,8 @@ def clean_text(text):
 class TextNormalizer:
     def __init__(self, tokenizer=None):
         self.tokenizer = tokenizer
-        self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False, remove_interjections=False, overwrite_cache=True)
-        self.en_tn_model = EnNormalizer()
+        self.zh_tn_model = Normalizer(lang="zh", operator="tn", remove_erhua=True)
+        self.en_tn_model = Normalizer(lang="en", operator="tn")
         self.inflect_parser = inflect.engine()

     def normalize(self, text, split=False):
````
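For orientation, the swapped-in wetext API as this diff uses it: one `Normalizer` class parameterized by language and operator instead of two separate WeTextProcessing classes. A minimal probe using only the constructor arguments and `normalize` method visible above; the example inputs are mine:

```python
from wetext import Normalizer

# One class for both languages; "tn" selects text normalization, and
# remove_erhua strips the Mandarin 儿-suffix during normalization.
zh_tn = Normalizer(lang="zh", operator="tn", remove_erhua=True)
en_tn = Normalizer(lang="en", operator="tn")

print(zh_tn.normalize("共550千卡"))            # digits verbalized in Chinese
print(en_tn.normalize("Meet me at 3:45 pm."))  # time spelled out in English
```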
````diff
@@ -207,38 +174,12 @@
             text = text.replace("=", "等于")  # fix: "550 + 320 等于 870 千卡。" being wrongly normalized to "五百五十加三百二十等于八七十千卡."
-            if re.search(r'([\d$%^*_+≥≤≠×÷?=])', text):  # keep English hyphens from being normalized as minus signs
-                text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text)  # fix: "x-2" being normalized as "x负2" (x negative two)
-                text = self.zh_tn_model.normalize(text)
+            text = re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text)  # fix: "x-2" being normalized as "x负2" (x negative two)
+            text = self.zh_tn_model.normalize(text)
             text = replace_blank(text)
             text = replace_corner_mark(text)
             text = remove_bracket(text)
             text = re.sub(r'[,,]+$', '。', text)
         else:
             text = self.en_tn_model.normalize(text)
             text = spell_out_number(text, self.inflect_parser)
         if split is False:
             return text


-if __name__ == "__main__":
-    text_normalizer = TextNormalizer()
-    text = r"""今天我们学习一元二次方程。一元二次方程的标准形式是:
-ax2+bx+c=0ax^2 + bx + c = 0ax2+bx+c=0
-其中,aaa、bbb 和 ccc 是常数,xxx 是变量。这个方程的解可以通过求根公式来找到。
-一元二次方程的解法有几种:
-- 因式分解法:通过将方程因式分解来求解。我们首先尝试将方程表达成两个括号的形式,解决方程的解。比如,方程x2−5x+6=0x^2 - 5x + 6 = 0x2−5x+6=0可以因式分解为(x−2)(x−3)=0(x - 2)(x - 3) = 0(x−2)(x−3)=0,因此根为2和3。
-- 配方法:通过配方将方程转化为完全平方的形式,从而解出。我们通过加上或减去适当的常数来完成这一过程,使得方程可以直接写成一个完全平方的形式。
-- 求根公式:我们可以使用求根公式直接求出方程的解。这个公式适用于所有的一元二次方程,即使我们无法通过因式分解或配方法来解决时,也能使用该公式。
-公式:x=−b±b2−4ac2ax = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}x=2a−b±b2−4ac这个公式可以帮助我们求解任何一元二次方程的根。
-对于一元二次方程,我们需要了解判别式。判别式的作用是帮助我们判断方程的解的个数和性质。判别式 Δ\DeltaΔ 由下式给出:Δ=b2−4ac\Delta = b^2 - 4acΔ=b2−4ac 根据判别式的值,我们可以知道:
-- 如果 Δ>0\Delta > 0Δ>0,方程有两个不相等的实数解。这是因为判别式大于0时,根号内的值是正数,所以我们可以得到两个不同的解。
-- 如果 Δ=0\Delta = 0Δ=0,方程有一个实数解。这是因为根号内的值为零,导致两个解相等,也就是说方程有一个解。
-- 如果 Δ<0\Delta < 0Δ<0,方程没有实数解。这意味着根号内的值是负数,无法进行实数运算,因此方程没有实数解,可能有复数解。"""
-    texts = ["这是一个公式 (a+b)³=a³+3a²b+3ab²+b³ S=(a×b)÷2", "这样的发展为AI仅仅作为“工具”这一观点提出了新的挑战,", "550 + 320 = 870千卡。", "解一元二次方程:3x^2+x-2=0", "你好啊"]
-    texts = [text]
-    for text in texts:
-        text = text_normalizer.normalize(text)
-        print(text)
-        for t in cut_sentence_with_fix_length(text, 15):
-            print(t)
````
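The two pre-normalization guards in `normalize` are easy to check in isolation. A standalone probe; the function name and inputs are mine, for illustration only:

```python
import re

def pre_fix(text: str) -> str:
    # Mirror the two guards from normalize(): verbalize "=" as 等于
    # ("equals"), then space out hyphens like "x-2" so the Chinese
    # normalizer does not read them as a negative sign (负).
    text = text.replace("=", "等于")
    return re.sub(r'(?<=[a-zA-Z0-9])-(?=\d)', ' - ', text)

print(pre_fix("550 + 320 = 870千卡"))        # 550 + 320 等于 870千卡
print(pre_fix("解一元二次方程:3x^2+x-2=0"))  # 解一元二次方程:3x^2+x - 2等于0
```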