mirror of
https://github.com/OpenBMB/VoxCPM
synced 2025-12-12 11:58:11 +00:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
436e8cd6e5 | ||
|
|
11574ae93d | ||
|
|
706403187e | ||
|
|
38a76704ee | ||
|
|
dfd487f5af | ||
|
|
081845b35b |
77
README.md
77
README.md
@@ -1,7 +1,7 @@
|
|||||||
## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning
|
## 🎙️ VoxCPM: Tokenizer-Free TTS for Context-Aware Speech Generation and True-to-Life Voice Cloning
|
||||||
|
|
||||||
|
|
||||||
[](https://github.com/OpenBMB/VoxCPM/) [](hhttps://huggingface.co/openbmb/VoxCPM-0.5B) [](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [](https://thuhcsi.github.io/VoxCPM/)
|
[](https://github.com/OpenBMB/VoxCPM/) [](https://huggingface.co/openbmb/VoxCPM-0.5B) [](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) [](https://thuhcsi.github.io/VoxCPM/)
|
||||||
|
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
@@ -9,17 +9,17 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
## News
|
## News
|
||||||
* [2025.09.16] 🔥 🔥 🔥 We Open Source the VoxCPM-0.5B weights!
|
* [2025.09.16] 🔥 🔥 🔥 We Open Source the VoxCPM-0.5B [weights](https://huggingface.co/openbmb/VoxCPM-0.5B)!
|
||||||
* [2025.09.16] 🎉 🎉 🎉 We Provide the [Gradio PlayGround](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now!
|
* [2025.09.16] 🎉 🎉 🎉 We Provide the [Gradio PlayGround](https://huggingface.co/spaces/OpenBMB/VoxCPM-Demo) for VoxCPM-0.5B, try it now!
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
VoxCPM is a novel tokenizer-free Text-to-Speech (TTS) system that redefines realism in speech synthesis. By modeling speech in a continuous space, it overcomes the limitations of discrete tokenization and enables two flagship capabilities: context-aware speech generation and true-to-life zero-shot voice cloning.
|
VoxCPM is a novel tokenizer-free Text-to-Speech (TTS) system that redefines realism in speech synthesis. By modeling speech in a continuous space, it overcomes the limitations of discrete tokenization and enables two flagship capabilities: context-aware speech generation and true-to-life zero-shot voice cloning.
|
||||||
|
|
||||||
Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses an end-to-end diffusion autoregressive architecture that directly generates continuous speech representations from text. Built on [MiniCPM-4](https://huggingface.co/openbmb/MiniCPM4-0.5B), it achieves implicit semantic-acoustic decoupling through hierachical language modeling and FSQ constraints, greatly enhancing both expressiveness and generation stability.
|
Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses an end-to-end diffusion autoregressive architecture that directly generates continuous speech representations from text. Built on the [MiniCPM-4](https://huggingface.co/openbmb/MiniCPM4-0.5B) backbone, it achieves implicit semantic-acoustic decoupling through hierarchical language modeling and FSQ constraints, greatly enhancing both expressiveness and generation stability.
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
<img src="assets/voxcpm_model.png" alt="VoxCPM Model Architecture" width="500">
|
<img src="assets/voxcpm_model.png" alt="VoxCPM Model Architecture" width="90%">
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
@@ -30,6 +30,13 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
### 🔧 Install from PyPI
|
### 🔧 Install from PyPI
|
||||||
@@ -61,13 +68,13 @@ wav = model.generate(
|
|||||||
text="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
|
text="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
|
||||||
prompt_wav_path=None, # optional: path to a prompt speech for voice cloning
|
prompt_wav_path=None, # optional: path to a prompt speech for voice cloning
|
||||||
prompt_text=None, # optional: reference text
|
prompt_text=None, # optional: reference text
|
||||||
cfg_value=2.0,
|
cfg_value=2.0, # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
|
||||||
inference_timesteps=10,
|
inference_timesteps=10, # LocDiT inference timesteps, higher for better result, lower for faster speed
|
||||||
normalize=True,
|
normalize=True, # enable external TN tool
|
||||||
denoise=True,
|
denoise=True, # enable external Denoise tool
|
||||||
retry_badcase=True, # optional: enable retrying mode
|
retry_badcase=True, # enable retrying mode for some bad cases (unstoppable)
|
||||||
retry_badcase_max_times=3,
|
retry_badcase_max_times=3, # maximum retrying times
|
||||||
retry_badcase_ratio_threshold=6.0,
|
retry_badcase_ratio_threshold=6.0, # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow pace speech
|
||||||
)
|
)
|
||||||
|
|
||||||
sf.write("output.wav", wav, 16000)
|
sf.write("output.wav", wav, 16000)
|
||||||
@@ -80,10 +87,10 @@ After installation, the entry point is `voxcpm` (or use `python -m voxcpm.cli`).
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1) Direct synthesis (single text)
|
# 1) Direct synthesis (single text)
|
||||||
voxcpm --text "Hello VoxCPM" --output out.wav
|
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." --output out.wav
|
||||||
|
|
||||||
# 2) Voice cloning (reference audio + transcript)
|
# 2) Voice cloning (reference audio + transcript)
|
||||||
voxcpm --text "Hello" \
|
voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \
|
||||||
--prompt-audio path/to/voice.wav \
|
--prompt-audio path/to/voice.wav \
|
||||||
--prompt-text "reference transcript" \
|
--prompt-text "reference transcript" \
|
||||||
--output out.wav \
|
--output out.wav \
|
||||||
@@ -175,41 +182,41 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks:
|
|||||||
| Model | Parameters | Open-Source | test-EN | | test-ZH | | test-Hard | |
|
| Model | Parameters | Open-Source | test-EN | | test-ZH | | test-Hard | |
|
||||||
|------|------|------|:------------:|:--:|:------------:|:--:|:-------------:|:--:|
|
|------|------|------|:------------:|:--:|:------------:|:--:|:-------------:|:--:|
|
||||||
| | | | WER/%⬇ | SIM/%⬆| CER/%⬇| SIM/%⬆ | CER/%⬇ | SIM/%⬆ |
|
| | | | WER/%⬇ | SIM/%⬆| CER/%⬇| SIM/%⬆ | CER/%⬇ | SIM/%⬆ |
|
||||||
|
| MegaTTS3 | 0.5B | ❌ | 2.79 | 77.1 | 1.52 | 79.0 | - | - |
|
||||||
|
| DiTAR | 0.6B | ❌ | 1.69 | 73.5 | 1.02 | 75.3 | - | - |
|
||||||
|
| CosyVoice3 | 0.5B | ❌ | 2.02 | 71.8 | 1.16 | 78.0 | 6.08 | 75.8 |
|
||||||
|
| CosyVoice3 | 1.5B | ❌ | 2.22 | 72.0 | 1.12 | 78.1 | 5.83 | 75.8 |
|
||||||
|
| Seed-TTS | - | ❌ | 2.25 | 76.2 | 1.12 | 79.6 | 7.59 | 77.6 |
|
||||||
|
| MiniMax-Speech | - | ❌ | 1.65 | 69.2 | 0.83 | 78.3 | - | - |
|
||||||
| CosyVoice | 0.3B | ✅ | 4.29 | 60.9 | 3.63 | 72.3 | 11.75 | 70.9 |
|
| CosyVoice | 0.3B | ✅ | 4.29 | 60.9 | 3.63 | 72.3 | 11.75 | 70.9 |
|
||||||
| CosyVoice2 | 0.5B | ✅ | 3.09 | 65.9 | 1.38 | 75.7 | 6.83 | 72.4 |
|
| CosyVoice2 | 0.5B | ✅ | 3.09 | 65.9 | 1.38 | 75.7 | **6.83** | 72.4 |
|
||||||
| F5-TTS | 0.3B | ✅ | 2.00 | 67.0 | 1.53 | 76.0 | 8.67 | 71.3 |
|
| F5-TTS | 0.3B | ✅ | 2.00 | 67.0 | 1.53 | 76.0 | 8.67 | 71.3 |
|
||||||
| SparkTTS | 0.5B | ✅ | 3.14 | 57.3 | 1.54 | 66.0 | - | - |
|
| SparkTTS | 0.5B | ✅ | 3.14 | 57.3 | 1.54 | 66.0 | - | - |
|
||||||
| FireRedTTS | 0.5B | ✅ | 3.82 | 46.0 | 1.51 | 63.5 | 17.45 | 62.1 |
|
| FireRedTTS | 0.5B | ✅ | 3.82 | 46.0 | 1.51 | 63.5 | 17.45 | 62.1 |
|
||||||
| FireRedTTS-2 | 1.5B | ✅ | 1.95 | 66.5 | 1.14 | 73.6 | - | - |
|
| FireRedTTS-2 | 1.5B | ✅ | 1.95 | 66.5 | 1.14 | 73.6 | - | - |
|
||||||
| Qwen2.5-Omni | 7B | ✅ | 2.72 | 63.2 | 1.70 | 75.2 | 7.97 | 74.7 |
|
| Qwen2.5-Omni | 7B | ✅ | 2.72 | 63.2 | 1.70 | 75.2 | 7.97 | **74.7** |
|
||||||
| OpenAudio-s1-mini | 0.5B | ✅ | 1.94 | 55.0 | 1.18 | 68.5 | - | - |
|
| OpenAudio-s1-mini | 0.5B | ✅ | 1.94 | 55.0 | 1.18 | 68.5 | - | - |
|
||||||
| IndexTTS2 | 1.5B | ✅ | 2.23 | 70.6 | 1.03 | 76.5 | - | - |
|
| IndexTTS2 | 1.5B | ✅ | 2.23 | 70.6 | 1.03 | 76.5 | - | - |
|
||||||
| VibeVoice | 1.5B | ✅ | 3.04 | 68.9 | 1.16 | 74.4 | - | - |
|
| VibeVoice | 1.5B | ✅ | 3.04 | 68.9 | 1.16 | 74.4 | - | - |
|
||||||
| HiggsAudio-v2 | 3B | ✅ | 2.44 | 67.7 | 1.50 | 74.0 | - | - |
|
| HiggsAudio-v2 | 3B | ✅ | 2.44 | 67.7 | 1.50 | 74.0 | - | - |
|
||||||
| CosyVoice3 | 0.5B | ❌ | 2.02 | 71.8 | 1.16 | 78.0 | 6.08 | 75.8 |
|
| **VoxCPM** | 0.5B | ✅ | **1.85** | **72.9** | **0.93** | **77.2** | 8.87 | 73.0 |
|
||||||
| CosyVoice3 | 1.5B | ❌ | 2.22 | 72.0 | 1.12 | 78.1 | 5.83 | 75.8 |
|
|
||||||
| MegaTTS3 | 0.5B | ❌ | 2.79 | 77.1 | 1.52 | 79.0 | - | - |
|
|
||||||
| DiTAR | 0.6B | ❌ | 1.69 | 73.5 | 1.02 | 75.3 | - | - |
|
|
||||||
| Seed-TTS | - | ❌ | 2.25 | 76.2 | 1.12 | 79.6 | 7.59 | 77.6 |
|
|
||||||
| MiniMax-Speech | - | ❌ | 1.65 | 69.2 | 0.83 | 78.3 | - | - |
|
|
||||||
| **VoxCPM** | **0.5B** | **✅** | **1.85** | **72.9** | **0.93** | **77.2** | 8.87 | 73.0 |
|
|
||||||
|
|
||||||
|
|
||||||
### CV3-eval Benchmark
|
### CV3-eval Benchmark
|
||||||
|
|
||||||
| Model | zh | en | hard-zh | | | hard-en | | | |
|
| Model | zh | en | hard-zh | | | hard-en | | |
|
||||||
|-------|:--:|:--:|:-------:|:--:|:--:|:-------:|:--:|:--:|:--:|
|
|-------|:--:|:--:|:-------:|:--:|:--:|:-------:|:--:|:--:|
|
||||||
| | CER/%⬇ | WER/%⬇ | CER/%⬇ | SIM/%⬆ | DNSMOS⬆ | WER/%⬇ | SIM/%⬆ | DNSMOS⬆ | |
|
| | CER/%⬇ | WER/%⬇ | CER/%⬇ | SIM/%⬆ | DNSMOS⬆ | WER/%⬇ | SIM/%⬆ | DNSMOS⬆ |
|
||||||
| F5-TTS | 5.47 | 8.90 | - | - | - | - | - | - | |
|
| F5-TTS | 5.47 | 8.90 | - | - | - | - | - | - |
|
||||||
| SparkTTS | 5.15 | 11.0 | - | - | - | - | - | - | |
|
| SparkTTS | 5.15 | 11.0 | - | - | - | - | - | - |
|
||||||
| GPT-SoVits | 7.34 | 12.5 | - | - | - | - | - | - | |
|
| GPT-SoVits | 7.34 | 12.5 | - | - | - | - | - | - |
|
||||||
| CosyVoice2 | 4.08 | 6.32 | 12.58 | 72.6 | 3.81 | 11.96 | 66.7 | 3.95 | |
|
| CosyVoice2 | 4.08 | 6.32 | 12.58 | 72.6 | 3.81 | 11.96 | 66.7 | 3.95 |
|
||||||
| OpenAudio-s1-mini | 4.00 | 5.54 | 18.1 | 58.2 | 3.77 | 12.4 | 55.7 | 3.89 | |
|
| OpenAudio-s1-mini | 4.00 | 5.54 | 18.1 | 58.2 | 3.77 | 12.4 | 55.7 | 3.89 |
|
||||||
| IndexTTS2 | 3.58 | 4.45 | 12.8 | 74.6 | 3.65 | fail | fail | fail | |
|
| IndexTTS2 | 3.58 | 4.45 | 12.8 | 74.6 | 3.65 | - | - | - |
|
||||||
| HiggsAudio-v2 | 9.54 | 7.89 | 41.0 | 60.2 | 3.39 | 10.3 | 61.8 | 3.68 | |
|
| HiggsAudio-v2 | 9.54 | 7.89 | 41.0 | 60.2 | 3.39 | 10.3 | 61.8 | 3.68 |
|
||||||
| CosyVoice3-0.5B | 3.89 | 5.24 | 14.15 | 78.6 | 3.75 | 9.04 | 75.9 | 3.92 | |
|
| CosyVoice3-0.5B | 3.89 | 5.24 | 14.15 | 78.6 | 3.75 | 9.04 | 75.9 | 3.92 |
|
||||||
| CosyVoice3-1.5B | 3.91 | 4.99 | 9.77 | 78.5 | 3.79 | 10.55 | 76.1 | 3.95 | |
|
| CosyVoice3-1.5B | 3.91 | 4.99 | 9.77 | 78.5 | 3.79 | 10.55 | 76.1 | 3.95 |
|
||||||
| **VoxCPM** | **3.40** | **4.04** | 12.9 | 66.1 | 3.59 | **7.89** | 64.3 | 3.74 | |
|
| **VoxCPM** | **3.40** | **4.04** | 12.9 | 66.1 | 3.59 | **7.89** | 64.3 | 3.74 |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -27,22 +27,21 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
requires-python = ">=3.8"
|
requires-python = ">=3.8"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"torch==2.5.1",
|
"torch>=2.5.0",
|
||||||
"torchaudio==2.5.1",
|
"torchaudio>=2.5.0",
|
||||||
"transformers==4.50.1",
|
"transformers>=4.36.2",
|
||||||
"einops",
|
"einops",
|
||||||
"gradio",
|
"gradio",
|
||||||
"inflect",
|
"inflect",
|
||||||
"WeTextProcessing",
|
|
||||||
"addict",
|
"addict",
|
||||||
"modelscope==1.22.0",
|
"WeTextProcessing",
|
||||||
"simplejson",
|
"modelscope>=1.22.0",
|
||||||
"datasets==2.18.0",
|
"datasets>=2,<4",
|
||||||
"sortedcontainers",
|
|
||||||
"librosa",
|
|
||||||
"huggingface-hub",
|
"huggingface-hub",
|
||||||
"pydantic",
|
"pydantic",
|
||||||
"tqdm",
|
"tqdm",
|
||||||
|
"simplejson",
|
||||||
|
"sortedcontainers",
|
||||||
"soundfile",
|
"soundfile",
|
||||||
"funasr",
|
"funasr",
|
||||||
"spaces"
|
"spaces"
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
torch==2.5.1
|
|
||||||
torchaudio==2.5.1
|
|
||||||
transformers==4.50.1
|
|
||||||
einops
|
|
||||||
gradio
|
|
||||||
inflect
|
|
||||||
WeTextProcessing
|
|
||||||
addicts
|
|
||||||
modelscope==1.22.0
|
|
||||||
simplejson
|
|
||||||
datasets==2.18.0
|
|
||||||
addicts
|
|
||||||
sortedcontainers
|
|
||||||
librosa
|
|
||||||
huggingface-hub
|
|
||||||
spaces
|
|
||||||
@@ -2,8 +2,6 @@ import torch
|
|||||||
import torchaudio
|
import torchaudio
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from modelscope.pipelines import pipeline
|
|
||||||
from modelscope.utils.constant import Tasks
|
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from .model.voxcpm import VoxCPMModel
|
from .model.voxcpm import VoxCPMModel
|
||||||
from .utils.text_normalize import TextNormalizer
|
from .utils.text_normalize import TextNormalizer
|
||||||
@@ -29,9 +27,8 @@ class VoxCPM:
|
|||||||
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
|
self.tts_model = VoxCPMModel.from_local(voxcpm_model_path)
|
||||||
self.text_normalizer = TextNormalizer()
|
self.text_normalizer = TextNormalizer()
|
||||||
if enable_denoiser and zipenhancer_model_path is not None:
|
if enable_denoiser and zipenhancer_model_path is not None:
|
||||||
self.denoiser = pipeline(
|
from .zipenhancer import ZipEnhancer
|
||||||
Tasks.acoustic_noise_suppression,
|
self.denoiser = ZipEnhancer(zipenhancer_model_path)
|
||||||
model=zipenhancer_model_path)
|
|
||||||
else:
|
else:
|
||||||
self.denoiser = None
|
self.denoiser = None
|
||||||
print("Warm up VoxCPMModel...")
|
print("Warm up VoxCPMModel...")
|
||||||
@@ -50,7 +47,7 @@ class VoxCPM:
|
|||||||
"""Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
|
"""Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo").
|
hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
|
||||||
load_denoiser: Whether to initialize the denoiser pipeline.
|
load_denoiser: Whether to initialize the denoiser pipeline.
|
||||||
zipenhancer_model_id: Denoiser model id or path for ModelScope
|
zipenhancer_model_id: Denoiser model id or path for ModelScope
|
||||||
acoustic noise suppression.
|
acoustic noise suppression.
|
||||||
@@ -67,9 +64,14 @@ class VoxCPM:
|
|||||||
``hf_model_id`` is provided.
|
``hf_model_id`` is provided.
|
||||||
"""
|
"""
|
||||||
repo_id = hf_model_id
|
repo_id = hf_model_id
|
||||||
if not repo_id or repo_id.strip() == "":
|
if not repo_id:
|
||||||
raise ValueError("You must provide a valid hf_model_id")
|
raise ValueError("You must provide hf_model_id")
|
||||||
|
|
||||||
|
# Load from local path if provided
|
||||||
|
if os.path.isdir(repo_id):
|
||||||
|
local_path = repo_id
|
||||||
|
else:
|
||||||
|
# Otherwise, try from_pretrained (Hub); exit on failure
|
||||||
local_path = snapshot_download(
|
local_path = snapshot_download(
|
||||||
repo_id=repo_id,
|
repo_id=repo_id,
|
||||||
cache_dir=cache_dir,
|
cache_dir=cache_dir,
|
||||||
@@ -82,12 +84,6 @@ class VoxCPM:
|
|||||||
enable_denoiser=load_denoiser,
|
enable_denoiser=load_denoiser,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _normalize_loudness(self, wav_path: str):
|
|
||||||
audio, sr = torchaudio.load(wav_path)
|
|
||||||
loudness = torchaudio.functional.loudness(audio, sr)
|
|
||||||
normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
|
|
||||||
torchaudio.save(wav_path, normalized_audio, sr)
|
|
||||||
|
|
||||||
def generate(self,
|
def generate(self,
|
||||||
text : str,
|
text : str,
|
||||||
prompt_wav_path : str = None,
|
prompt_wav_path : str = None,
|
||||||
@@ -135,9 +131,7 @@ class VoxCPM:
|
|||||||
if denoise and self.denoiser is not None:
|
if denoise and self.denoiser is not None:
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
|
||||||
temp_prompt_wav_path = tmp_file.name
|
temp_prompt_wav_path = tmp_file.name
|
||||||
|
self.denoiser.enhance(prompt_wav_path, output_path=temp_prompt_wav_path)
|
||||||
self.denoiser(prompt_wav_path, output_path=temp_prompt_wav_path)
|
|
||||||
self._normalize_loudness(temp_prompt_wav_path)
|
|
||||||
prompt_wav_path = temp_prompt_wav_path
|
prompt_wav_path = temp_prompt_wav_path
|
||||||
fixed_prompt_cache = self.tts_model.build_prompt_cache(
|
fixed_prompt_cache = self.tts_model.build_prompt_cache(
|
||||||
prompt_wav_path=prompt_wav_path,
|
prompt_wav_path=prompt_wav_path,
|
||||||
|
|||||||
@@ -88,7 +88,7 @@ class UnifiedCFM(torch.nn.Module):
|
|||||||
shape: (n_timesteps + 1,)
|
shape: (n_timesteps + 1,)
|
||||||
mu (torch.Tensor): output of encoder
|
mu (torch.Tensor): output of encoder
|
||||||
shape: (batch_size, n_feats)
|
shape: (batch_size, n_feats)
|
||||||
cond: Not used but kept for future purposes
|
cond: condition -- prefix prompt
|
||||||
cfg_value (float, optional): cfg value for guidance. Defaults to 1.0.
|
cfg_value (float, optional): cfg value for guidance. Defaults to 1.0.
|
||||||
"""
|
"""
|
||||||
t, _, dt = t_span[0], t_span[-1], t_span[0] - t_span[1]
|
t, _, dt = t_span[0], t_span[-1], t_span[0] - t_span[1]
|
||||||
|
|||||||
76
src/voxcpm/zipenhancer.py
Normal file
76
src/voxcpm/zipenhancer.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""
|
||||||
|
ZipEnhancer Module - Audio Denoising Enhancer
|
||||||
|
|
||||||
|
Provides the ZipEnhancer wrapper for audio denoising, designed to be imported on demand.
|
||||||
|
Related dependencies are imported only when denoising functionality is needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from typing import Optional, Union
|
||||||
|
import torchaudio
|
||||||
|
import torch
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
|
||||||
|
|
||||||
|
class ZipEnhancer:
|
||||||
|
"""ZipEnhancer Audio Denoising Enhancer"""
|
||||||
|
def __init__(self, model_path: str = "iic/speech_zipenhancer_ans_multiloss_16k_base"):
|
||||||
|
"""
|
||||||
|
Initialize ZipEnhancer
|
||||||
|
Args:
|
||||||
|
model_path: ModelScope model path or local path
|
||||||
|
"""
|
||||||
|
self.model_path = model_path
|
||||||
|
self._pipeline = pipeline(
|
||||||
|
Tasks.acoustic_noise_suppression,
|
||||||
|
model=self.model_path
|
||||||
|
)
|
||||||
|
|
||||||
|
def _normalize_loudness(self, wav_path: str):
|
||||||
|
"""
|
||||||
|
Audio loudness normalization
|
||||||
|
|
||||||
|
Args:
|
||||||
|
wav_path: Audio file path
|
||||||
|
"""
|
||||||
|
audio, sr = torchaudio.load(wav_path)
|
||||||
|
loudness = torchaudio.functional.loudness(audio, sr)
|
||||||
|
normalized_audio = torchaudio.functional.gain(audio, -20-loudness)
|
||||||
|
torchaudio.save(wav_path, normalized_audio, sr)
|
||||||
|
|
||||||
|
def enhance(self, input_path: str, output_path: Optional[str] = None,
|
||||||
|
normalize_loudness: bool = True) -> str:
|
||||||
|
"""
|
||||||
|
Audio denoising enhancement
|
||||||
|
Args:
|
||||||
|
input_path: Input audio file path
|
||||||
|
output_path: Output audio file path (optional, creates temp file by default)
|
||||||
|
normalize_loudness: Whether to perform loudness normalization
|
||||||
|
Returns:
|
||||||
|
str: Output audio file path
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If pipeline is not initialized or processing fails
|
||||||
|
"""
|
||||||
|
if not os.path.exists(input_path):
|
||||||
|
raise FileNotFoundError(f"Input audio file does not exist: {input_path}")
|
||||||
|
# Create temporary file if no output path is specified
|
||||||
|
if output_path is None:
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
|
||||||
|
output_path = tmp_file.name
|
||||||
|
try:
|
||||||
|
# Perform denoising processing
|
||||||
|
self._pipeline(input_path, output_path=output_path)
|
||||||
|
# Loudness normalization
|
||||||
|
if normalize_loudness:
|
||||||
|
self._normalize_loudness(output_path)
|
||||||
|
return output_path
|
||||||
|
except Exception as e:
|
||||||
|
# Clean up possibly created temporary files
|
||||||
|
if output_path and os.path.exists(output_path):
|
||||||
|
try:
|
||||||
|
os.unlink(output_path)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
raise RuntimeError(f"Audio denoising processing failed: {e}")
|
||||||
Reference in New Issue
Block a user