From 7e8f754ba1e6ae42fb7ae589f3358a4171b4cbe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E9=80=B8=E8=BD=A9?= Date: Wed, 17 Sep 2025 19:33:37 +0800 Subject: [PATCH 01/13] update README --- README.md | 16 ++++++++++++++++ src/voxcpm/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 230 bytes src/voxcpm/__pycache__/core.cpython-312.pyc | Bin 0 -> 8095 bytes 3 files changed, 16 insertions(+) create mode 100644 src/voxcpm/__pycache__/__init__.cpython-312.pyc create mode 100644 src/voxcpm/__pycache__/core.cpython-312.pyc diff --git a/README.md b/README.md index bc25618..9cb09d2 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,22 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses +### 📝 TODO List +🎉 Please stay tuned for updates! + + - + + - + + + + diff --git a/src/voxcpm/__pycache__/__init__.cpython-312.pyc b/src/voxcpm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3419d8a8edf635ec176c45fb1ee326003bc2422d GIT binary patch literal 230 zcmX@j%ge<81P?!*$S?!ak3k$5V1hC}ivbza8B!Qh7;_kM8KW2(8B&(Lo>`JnnxqdgOux7&S-&j5BDo+}KR!M)FS8^*Uaz3? v7KaT`YiUlZT@feHJdgv5MS#QyW=2NFy9^Q^*ce!OKQJ?KGc~dou>&Olj5Rum literal 0 HcmV?d00001 diff --git a/src/voxcpm/__pycache__/core.cpython-312.pyc b/src/voxcpm/__pycache__/core.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba3ed5a573eb556fc9441f6c8a3f21a36524fcde GIT binary patch literal 8095 zcmbt3NpKricE7O$B(V_S0+J%XM2R9K60~-i+N{woTa;|kR*4=PY@!=PQ$RQTyCE(E zOxYT2)0Sx70t1VY)$|v0*ODQLw9P<8eG!}|- zO_C9%$#$uciQe{4; zrWGwAD6?`#;f1Fh48cg#d&=C2OBXIEf|yB~Yt3&*rSbf4pt6KSMA4bBd65?hodPG? z1sl9}(UEcqjvIiI=%QohpAZ#XFzOZ^f(J@3ygr!e6Pj)~?^OU#xUJdsrO+ib!|WD# z{qP1vj}R2SLI~bgfEI=~0&g3%+eJ^RUFd+lee=#(x8X96VuY?MbIDvbk)>x!<*HX6m6;`U;#ivw#xSnSuja)!79*X)oA5i3m=?m2qZWMF(YZ z!BMc)t#wxbBH7JWSKD=Xx~=&fDxevcwW3zeglcuGCU^wzvd9nQ};iMv`q-m&#S~4Dsfc}5f2s8Lbm>yMl>Sv2p(6K|4T=JHffj)=+Y5xe7j2KDTi2o^E76hCYsXik zzgTqK^y_{7i;jn$o<~hxrM`WozT@9DzxJqSduj0KYR|F7lYf5UNe^B;dGo?!PfN)^ z@X)he_cVXz|J1+c>09yi>7B>*NPF3dy!bzm-RpY}sG%{VDV?e)R!J~gsD-I2RU=YI z<=P}~l24{V^-9DDGEOC!GDs+LWf8l^dV3NFyP&Sl%Zil#2QK@X#XyI>=pg1ul{c32~q zsU>J=$DI5gSawcUHC_hAsfl=U(q!)BB&Gt2d3-iMJuS)8_%xpsp)rLmL;bVrw2s!J z_|#l3BPAuR5f%}1iYhU+2Rdd@j8Dh$7KKcY(19(nby}xEFEudl|ENfqGI`17O2Q~U zX=xvJPbvzK>HZXGLZ%g%E^R=Gf_t@YZPbKSu^PoEKp6v>Sr4w+0lE>@GBsMW8(jIU zm&H-4lE+CNR5BG!oL9w)!e;V{d07Cf!4x#Ffg#IjFQE=P*s?2ie0_(Rqji?j6cE_6 zrszR`QxO6InN2IO`Q)UjJn9@75U892+7Gm+(y%){rnNN~KI_CaV$|cqgA24C9X^v6 zPZ6RJMg~~~MFvKN$JarF3V=$TC;}3oT44-Ko|3sv9jgXhD1Fy20DYe$%5_N)>+IP= z&s8Olv%oxS5Z7%Jvm0(I#0g0t;7=pc2cNN+eVuv+t?hfSioIb|RKvl1L8qjGC=j@s zDCRNj$&5<6DVQGm*+f58sAMxV4WB6ybg@cJKt)b4>7=6pW@^C9YlH_kNLXmEDDZXt z129=t;PWvm??>U@MfamX`{UN0(xyYJt%pm^hhMl*v;VULpC0(^$frl{bUbX@tVi*u z=b^i6r{3A8clGO=cfD{sy8MghpEaXkSnnNtbm(lU|A5{TgZ`kej2vFyld*Ri+N6tO z>0s6}0JH}AOw97vO~1IQ(!oZ($<%ZL_?j1uuq3BMLd|YMld@pT#GGpQDzdx$}^;tD*tPmgvv>~f^CT`Vg<;<>H z-KrG=*1n{}i?m^@e`dE;tCh969J;!)i@SMa&VCim za{Wl#Yr)uxh@H;3jAoHF3V5IXf8jmwzrmYg`Z7MVReWl-Scs2tXsp8y(3 zax#+_L@bN5H07Zl!Xm^l;PuGE)bE?ev==b0qG{1mBO>_!0;Do13+oQw=(?^W^eDbm z$%*iAgo;7Nd|r?gmN;SLG}pk{HUwp!W1NZ+WJw^9rX>MN&Dr8rH{uMX6!hvd%qjCKPo&;nF^4TeHBb5}8)H)Y)G8)&`91h5`@N-f& zp9Kf^8e9wr`N0jrLLOrNX0M z*5rX9qMDyoBbi*KHgmveO6X4px6V5o z#p9NvW?p70jKdoIQxRj6YskH`QGf%;59EUEuBeHIOUq-_Zl6II-&G1dr@BynX-5-79~8^;T%9<95eyqMx5ybe4h#AQ!jjg^XUw(^Gcr@<#PQZz-_ji(i#S zPCp8F-O>KK_?N{}-@(Vx&1=!UE784Qo_a9wP4w;3yC3Ky2ksZ|7E7H&KiW~(JKU2< z=hB7S7uF)Nl}PN1kOOe>a$S?JQ(OVbwZLz=o@OK}sZ9BcP?XboBPo)o}mk@?OltVUx}U9x9`-qAJn%WD!b8+H(`-uM|&wU0OtsG ztp)K)5Pu&2Cb+G<4{h4=;vkAdmtMd9`kkv^9Df*&uZ8!og!li$N2}r2*TP3u!beua z$JWB9R>G%N!)KnkP;l$A?Y{f2`@a9KzjWZuhl6jGZT3*~aio3e-0gFv-re6s_5k@} z7r0WO|G$1n0@Dxv=%-9riN5{E7ZV)n?E3+6q3EMXXW0(rAD_AZB%uHKxTCjW;ZJZn zbDN)35#;PI+RhwwEVshvD`)I%6nzzMIdjDR)uB^%X#5_xo%Pv%f5ds#?S5eMK+gl8 z_iVfKK^sTE(Jklt?GFyOoojV`9pa$=b*uMWulwt68?A5RXua17^_cCaVM$7km@aLi zKKskdkc$T|P0A>$n%K4f8l($UaXjW+2*d9u@>JGJk`KQY-}j4gk^r2?PRD#>W7ANh z&l`k>x)K7dHa9tS1{rqxg95#?cM(gYn3tFa_Jn?DT(5ROYi z->wf0>w6D8>v*4YM)h!e*-k4@dN-Gyv;rG+_mOs-ovX@nTblWCYZAM+2$}Ozw zN6}5?0ILR3M|U~Is;y|#=5m--BWU}MavQ6*qv72z*rq1UD5b!#UVy^zF+;&LrfCyS z(%wMY%z~#N37&x8Nmy>qXl4oGxS8u@KkG1}6A6~oxM6q>n_NChvM@jceU(}pxHcU3 gEo%N2g}z1JZ&BboH2fXf_M=0xanT Date: Wed, 17 Sep 2025 19:34:08 +0800 Subject: [PATCH 02/13] update README --- src/voxcpm/__pycache__/__init__.cpython-312.pyc | Bin 230 -> 0 bytes src/voxcpm/__pycache__/core.cpython-312.pyc | Bin 8095 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/voxcpm/__pycache__/__init__.cpython-312.pyc delete mode 100644 src/voxcpm/__pycache__/core.cpython-312.pyc diff --git a/src/voxcpm/__pycache__/__init__.cpython-312.pyc b/src/voxcpm/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 3419d8a8edf635ec176c45fb1ee326003bc2422d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 230 zcmX@j%ge<81P?!*$S?!ak3k$5V1hC}ivbza8B!Qh7;_kM8KW2(8B&(Lo>`JnnxqdgOux7&S-&j5BDo+}KR!M)FS8^*Uaz3? v7KaT`YiUlZT@feHJdgv5MS#QyW=2NFy9^Q^*ce!OKQJ?KGc~dou>&Olj5Rum diff --git a/src/voxcpm/__pycache__/core.cpython-312.pyc b/src/voxcpm/__pycache__/core.cpython-312.pyc deleted file mode 100644 index ba3ed5a573eb556fc9441f6c8a3f21a36524fcde..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8095 zcmbt3NpKricE7O$B(V_S0+J%XM2R9K60~-i+N{woTa;|kR*4=PY@!=PQ$RQTyCE(E zOxYT2)0Sx70t1VY)$|v0*ODQLw9P<8eG!}|- zO_C9%$#$uciQe{4; zrWGwAD6?`#;f1Fh48cg#d&=C2OBXIEf|yB~Yt3&*rSbf4pt6KSMA4bBd65?hodPG? z1sl9}(UEcqjvIiI=%QohpAZ#XFzOZ^f(J@3ygr!e6Pj)~?^OU#xUJdsrO+ib!|WD# z{qP1vj}R2SLI~bgfEI=~0&g3%+eJ^RUFd+lee=#(x8X96VuY?MbIDvbk)>x!<*HX6m6;`U;#ivw#xSnSuja)!79*X)oA5i3m=?m2qZWMF(YZ z!BMc)t#wxbBH7JWSKD=Xx~=&fDxevcwW3zeglcuGCU^wzvd9nQ};iMv`q-m&#S~4Dsfc}5f2s8Lbm>yMl>Sv2p(6K|4T=JHffj)=+Y5xe7j2KDTi2o^E76hCYsXik zzgTqK^y_{7i;jn$o<~hxrM`WozT@9DzxJqSduj0KYR|F7lYf5UNe^B;dGo?!PfN)^ z@X)he_cVXz|J1+c>09yi>7B>*NPF3dy!bzm-RpY}sG%{VDV?e)R!J~gsD-I2RU=YI z<=P}~l24{V^-9DDGEOC!GDs+LWf8l^dV3NFyP&Sl%Zil#2QK@X#XyI>=pg1ul{c32~q zsU>J=$DI5gSawcUHC_hAsfl=U(q!)BB&Gt2d3-iMJuS)8_%xpsp)rLmL;bVrw2s!J z_|#l3BPAuR5f%}1iYhU+2Rdd@j8Dh$7KKcY(19(nby}xEFEudl|ENfqGI`17O2Q~U zX=xvJPbvzK>HZXGLZ%g%E^R=Gf_t@YZPbKSu^PoEKp6v>Sr4w+0lE>@GBsMW8(jIU zm&H-4lE+CNR5BG!oL9w)!e;V{d07Cf!4x#Ffg#IjFQE=P*s?2ie0_(Rqji?j6cE_6 zrszR`QxO6InN2IO`Q)UjJn9@75U892+7Gm+(y%){rnNN~KI_CaV$|cqgA24C9X^v6 zPZ6RJMg~~~MFvKN$JarF3V=$TC;}3oT44-Ko|3sv9jgXhD1Fy20DYe$%5_N)>+IP= z&s8Olv%oxS5Z7%Jvm0(I#0g0t;7=pc2cNN+eVuv+t?hfSioIb|RKvl1L8qjGC=j@s zDCRNj$&5<6DVQGm*+f58sAMxV4WB6ybg@cJKt)b4>7=6pW@^C9YlH_kNLXmEDDZXt z129=t;PWvm??>U@MfamX`{UN0(xyYJt%pm^hhMl*v;VULpC0(^$frl{bUbX@tVi*u z=b^i6r{3A8clGO=cfD{sy8MghpEaXkSnnNtbm(lU|A5{TgZ`kej2vFyld*Ri+N6tO z>0s6}0JH}AOw97vO~1IQ(!oZ($<%ZL_?j1uuq3BMLd|YMld@pT#GGpQDzdx$}^;tD*tPmgvv>~f^CT`Vg<;<>H z-KrG=*1n{}i?m^@e`dE;tCh969J;!)i@SMa&VCim za{Wl#Yr)uxh@H;3jAoHF3V5IXf8jmwzrmYg`Z7MVReWl-Scs2tXsp8y(3 zax#+_L@bN5H07Zl!Xm^l;PuGE)bE?ev==b0qG{1mBO>_!0;Do13+oQw=(?^W^eDbm z$%*iAgo;7Nd|r?gmN;SLG}pk{HUwp!W1NZ+WJw^9rX>MN&Dr8rH{uMX6!hvd%qjCKPo&;nF^4TeHBb5}8)H)Y)G8)&`91h5`@N-f& zp9Kf^8e9wr`N0jrLLOrNX0M z*5rX9qMDyoBbi*KHgmveO6X4px6V5o z#p9NvW?p70jKdoIQxRj6YskH`QGf%;59EUEuBeHIOUq-_Zl6II-&G1dr@BynX-5-79~8^;T%9<95eyqMx5ybe4h#AQ!jjg^XUw(^Gcr@<#PQZz-_ji(i#S zPCp8F-O>KK_?N{}-@(Vx&1=!UE784Qo_a9wP4w;3yC3Ky2ksZ|7E7H&KiW~(JKU2< z=hB7S7uF)Nl}PN1kOOe>a$S?JQ(OVbwZLz=o@OK}sZ9BcP?XboBPo)o}mk@?OltVUx}U9x9`-qAJn%WD!b8+H(`-uM|&wU0OtsG ztp)K)5Pu&2Cb+G<4{h4=;vkAdmtMd9`kkv^9Df*&uZ8!og!li$N2}r2*TP3u!beua z$JWB9R>G%N!)KnkP;l$A?Y{f2`@a9KzjWZuhl6jGZT3*~aio3e-0gFv-re6s_5k@} z7r0WO|G$1n0@Dxv=%-9riN5{E7ZV)n?E3+6q3EMXXW0(rAD_AZB%uHKxTCjW;ZJZn zbDN)35#;PI+RhwwEVshvD`)I%6nzzMIdjDR)uB^%X#5_xo%Pv%f5ds#?S5eMK+gl8 z_iVfKK^sTE(Jklt?GFyOoojV`9pa$=b*uMWulwt68?A5RXua17^_cCaVM$7km@aLi zKKskdkc$T|P0A>$n%K4f8l($UaXjW+2*d9u@>JGJk`KQY-}j4gk^r2?PRD#>W7ANh z&l`k>x)K7dHa9tS1{rqxg95#?cM(gYn3tFa_Jn?DT(5ROYi z->wf0>w6D8>v*4YM)h!e*-k4@dN-Gyv;rG+_mOs-ovX@nTblWCYZAM+2$}Ozw zN6}5?0ILR3M|U~Is;y|#=5m--BWU}MavQ6*qv72z*rq1UD5b!#UVy^zF+;&LrfCyS z(%wMY%z~#N37&x8Nmy>qXl4oGxS8u@KkG1}6A6~oxM6q>n_NChvM@jceU(}pxHcU3 gEo%N2g}z1JZ&BboH2fXf_M=0xanT Date: Wed, 17 Sep 2025 19:36:32 +0800 Subject: [PATCH 03/13] update README --- README.md | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 9cb09d2..9b96d54 100644 --- a/README.md +++ b/README.md @@ -37,27 +37,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses -### 📝 TODO List -🎉 Please stay tuned for updates! - - - - - - - - - - - - - - - ## Quick Start @@ -261,6 +240,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks: +## 📝TO-DO List +Please stay tuned for updates! +- [ ] Release the VoxCPM technical report. +- [ ] Support higher sampling rate (next version). + + + ## 📄 License The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license. From 1fa9e2ca02ad9414a9a44cefea87a99fbd0d6dce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E9=80=B8=E8=BD=A9?= Date: Thu, 18 Sep 2025 01:21:45 +0800 Subject: [PATCH 04/13] update README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9b96d54..2f18ae2 100644 --- a/README.md +++ b/README.md @@ -267,6 +267,8 @@ This project is developed by the following institutions: - [THUHCSI](https://github.com/thuhcsi) +## ⭐ Star History + [![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date) ## 📚 Citation From e5bcb735f05c84d0d2b1d298329a34e23861c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Thu, 18 Sep 2025 12:01:26 +0800 Subject: [PATCH 05/13] Remove segment text logic --- app.py | 2 +- src/voxcpm/core.py | 61 ++++++++++++++++++-------------------- src/voxcpm/model/voxcpm.py | 9 ++++-- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/app.py b/app.py index eacba7d..3f64801 100644 --- a/app.py +++ b/app.py @@ -206,7 +206,7 @@ def create_demo_interface(demo: VoxCPMDemo): prompt_wav = gr.Audio( sources=["upload", 'microphone'], type="filepath", - label="Prompt Speech", + label="Prompt Speech (Optional, or let VoxCPM improvise)", value="./examples/example.wav", ) DoDenoisePromptAudio = gr.Checkbox( diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index 533497d..7ff1d08 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -120,10 +120,17 @@ class VoxCPM: Returns: numpy.ndarray: 1D waveform array (float32) on CPU. """ - texts = text.split("\n") - texts = [t.strip() for t in texts if t.strip()] - final_wav = [] - temp_prompt_wav_path = None + if not text.strip() or not isinstance(text, str): + raise ValueError("target text must be a non-empty string") + + if prompt_wav_path is not None: + if not os.path.exists(prompt_wav_path): + raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}") + + if (prompt_wav_path is None) != (prompt_text is None): + raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + + temp_prompt_wav_path = None try: if prompt_wav_path is not None and prompt_text is not None: @@ -139,35 +146,25 @@ class VoxCPM: else: fixed_prompt_cache = None # will be built from the first inference - for sub_text in texts: - if sub_text.strip() == "": - continue - print("sub_text:", sub_text) - if normalize: - if self.text_normalizer is None: - from .utils.text_normalize import TextNormalizer - self.text_normalizer = TextNormalizer() - sub_text = self.text_normalizer.normalize(sub_text) - wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache( - target_text=sub_text, - prompt_cache=fixed_prompt_cache, - min_len=2, - max_len=max_length, - inference_timesteps=inference_timesteps, - cfg_value=cfg_value, - retry_badcase=retry_badcase, - retry_badcase_max_times=retry_badcase_max_times, - retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, - ) - if fixed_prompt_cache is None: - fixed_prompt_cache = self.tts_model.merge_prompt_cache( - original_cache=None, - new_text_token=target_text_token, - new_audio_feat=generated_audio_feat - ) - final_wav.append(wav) + if normalize: + if self.text_normalizer is None: + from .utils.text_normalize import TextNormalizer + self.text_normalizer = TextNormalizer() + text = self.text_normalizer.normalize(text) + + wav, target_text_token, generated_audio_feat = self.tts_model.generate_with_prompt_cache( + target_text=text, + prompt_cache=fixed_prompt_cache, + min_len=2, + max_len=max_length, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) - return torch.cat(final_wav, dim=1).squeeze(0).cpu().numpy() + return wav.squeeze(0).cpu().numpy() finally: if temp_prompt_wav_path and os.path.exists(temp_prompt_wav_path): diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index 7268704..3af0af9 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -151,11 +151,16 @@ class VoxCPMModel(nn.Module): try: if self.device != "cuda": raise ValueError("VoxCPMModel can only be optimized on CUDA device") + try: + import triton + except: + raise ValueError("triton is not installed") self.base_lm.forward_step = torch.compile(self.base_lm.forward_step, mode="reduce-overhead", fullgraph=True) self.residual_lm.forward_step = torch.compile(self.residual_lm.forward_step, mode="reduce-overhead", fullgraph=True) self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True) self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True) - except: + except Exception as e: + print(e) print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") self.base_lm.forward_step = self.base_lm.forward_step self.residual_lm.forward_step = self.residual_lm.forward_step @@ -317,7 +322,7 @@ class VoxCPMModel(nn.Module): audio = torch.nn.functional.pad(audio, (0, patch_len - audio.size(1) % patch_len)) # extract audio features - audio_feat = self.audio_vae.encode(audio.cuda(), self.sample_rate).cpu() + audio_feat = self.audio_vae.encode(audio.to(self.device), self.sample_rate).cpu() audio_feat = audio_feat.view( self.audio_vae.latent_dim, From 11568f07762fa4b42bb1bf3caa27e0a29b8f13aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Thu, 18 Sep 2025 12:58:27 +0800 Subject: [PATCH 06/13] remove target text anotation --- app.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app.py b/app.py index 3f64801..0eb53ff 100644 --- a/app.py +++ b/app.py @@ -244,14 +244,13 @@ def create_demo_interface(demo: VoxCPMDemo): text = gr.Textbox( value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.", label="Target Text", - info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio." ) with gr.Row(): DoNormalizeText = gr.Checkbox( value=False, label="Text Normalization", elem_id="chk_normalize", - info="We use WeTextPorcessing library to normalize the input text." + info="We use wetext library to normalize the input text." ) audio_output = gr.Audio(label="Output Audio") From bdd516b579e783506fdaf976a2a54a837f08af9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Thu, 18 Sep 2025 13:07:43 +0800 Subject: [PATCH 07/13] remove target text anotation --- app.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/app.py b/app.py index 0eb53ff..f109c09 100644 --- a/app.py +++ b/app.py @@ -194,10 +194,6 @@ def create_demo_interface(demo: VoxCPMDemo): **调低**:合成速度更快。 - **Higher** for better synthesis quality. **调高**:合成质量更佳。 - - ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音) - While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually. - 虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。 """) # Main controls From 5257ec3dc564f96b2ef741cb4b05ee4607f0f236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E9=80=B8=E8=BD=A9?= Date: Thu, 18 Sep 2025 14:50:01 +0800 Subject: [PATCH 08/13] FX: noise point --- src/voxcpm/model/voxcpm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index 3af0af9..0d8f1c2 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -283,8 +283,10 @@ class VoxCPMModel(nn.Module): else: break else: - break - return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + + decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio + return decode_audio @torch.inference_mode() def build_prompt_cache( @@ -468,7 +470,8 @@ class VoxCPMModel(nn.Module): else: break decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() - + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio + return ( decode_audio, target_text_token, @@ -580,7 +583,6 @@ class VoxCPMModel(nn.Module): pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size) - feat_pred = feat_pred[..., 1:-1] # trick: remove the first and last token return feat_pred, pred_feat_seq.squeeze(0).cpu() @classmethod From 1a46c5d1ad724097200d1b3fa713e1baad595cf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E9=80=B8=E8=BD=A9?= Date: Thu, 18 Sep 2025 14:53:37 +0800 Subject: [PATCH 09/13] update README --- src/voxcpm/model/voxcpm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index 0d8f1c2..df13188 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -283,10 +283,11 @@ class VoxCPMModel(nn.Module): else: break else: + break decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio - return decode_audio + return decode_audio @torch.inference_mode() def build_prompt_cache( From cef6aefb3d6151f46a82a9b27c8f2fba3d58c2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Thu, 18 Sep 2025 14:52:22 +0800 Subject: [PATCH 10/13] remove \n from input text --- src/voxcpm/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index 7ff1d08..ccb548e 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -130,6 +130,7 @@ class VoxCPM: if (prompt_wav_path is None) != (prompt_text is None): raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + text = text.replace("\n", " ") temp_prompt_wav_path = None try: From dc6b6d1d1c8ff49f3ec237e07b3230b25bc48f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Thu, 18 Sep 2025 19:23:13 +0800 Subject: [PATCH 11/13] Fx: capture compile error on Windows --- pyproject.toml | 2 +- src/voxcpm/core.py | 2 ++ src/voxcpm/model/voxcpm.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfb3399..8f9d5ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "addict", "wetext", "modelscope>=1.22.0", - "datasets>=2,<4", + "datasets>=3,<4", "huggingface-hub", "pydantic", "tqdm", diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index ccb548e..3b88b55 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -1,6 +1,7 @@ import torch import torchaudio import os +import re import tempfile from huggingface_hub import snapshot_download from .model.voxcpm import VoxCPMModel @@ -131,6 +132,7 @@ class VoxCPM: raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") text = text.replace("\n", " ") + text = re.sub(r'\s+', ' ', text) temp_prompt_wav_path = None try: diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index df13188..1f5fdec 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -160,8 +160,8 @@ class VoxCPMModel(nn.Module): self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True) self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True) except Exception as e: - print(e) - print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") + print(f"Error: {e}") + print("Warning: VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") self.base_lm.forward_step = self.base_lm.forward_step self.residual_lm.forward_step = self.residual_lm.forward_step self.feat_encoder_step = self.feat_encoder From 996c69a1a891e81b24fde31e4cb5cb0b4de27faa Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Fri, 19 Sep 2025 12:53:23 +0800 Subject: [PATCH 12/13] add prompt-file option to set prompt text --- src/voxcpm/cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/voxcpm/cli.py b/src/voxcpm/cli.py index 801266f..f58e8b1 100644 --- a/src/voxcpm/cli.py +++ b/src/voxcpm/cli.py @@ -240,6 +240,7 @@ Examples: # Prompt audio (for voice cloning) parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path") parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio") + parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio") parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)") # Generation parameters @@ -279,6 +280,12 @@ def main(): # If prompt audio+text provided → voice cloning if args.prompt_audio or args.prompt_text: + if not args.prompt_text and args.prompt_file: + assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible." + + with open(args.prompt_file, 'r', encoding='utf-8') as f: + args.prompt_text = f.read() + if not args.prompt_audio or not args.prompt_text: print("Error: Voice cloning requires both --prompt-audio and --prompt-text") sys.exit(1) From 5f56d5ff5d9972b4b41f13bcc4d159f10efd2739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=91=AB?= Date: Fri, 19 Sep 2025 13:44:33 +0800 Subject: [PATCH 13/13] FX: update README --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f18ae2..f81bc57 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ By default, when you first run the script, the model will be downloaded automati - Download VoxCPM-0.5B ``` from huggingface_hub import snapshot_download - snapshot_download("openbmb/VoxCPM-0.5B",local_files_only=local_files_only) + snapshot_download("openbmb/VoxCPM-0.5B") ``` - Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo. ``` @@ -98,6 +98,13 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi --output out.wav \ --denoise +# (Optinal) Voice cloning (reference audio + transcript file) +voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \ + --prompt-audio path/to/voice.wav \ + --prompt-file "/path/to/text-file" \ + --output out.wav \ + --denoise + # 3) Batch processing (one text per line) voxcpm --input examples/input.txt --output-dir outs # (optional) Batch + cloning