Initial commit with large files ignored

This commit is contained in:
admin
2025-12-11 00:12:18 +08:00
parent a266c0a88d
commit 1e44eba871
17 changed files with 179286 additions and 329 deletions

84
app.py
View File

@@ -45,7 +45,8 @@ class VoxCPMDemo:
repo_id = os.environ.get("HF_REPO_ID", "").strip()
if len(repo_id) > 0:
target_dir = os.path.join("models", repo_id.replace("/", "__"))
if not os.path.isdir(target_dir):
# Check if directory exists AND contains config.json
if not os.path.isdir(target_dir) or not os.path.exists(os.path.join(target_dir, "config.json")):
try:
from huggingface_hub import snapshot_download # type: ignore
os.makedirs(target_dir, exist_ok=True)
@@ -155,45 +156,33 @@ def create_demo_interface(demo: VoxCPMDemo):
gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo"></div>')
# Quick Start
with gr.Accordion("📋 Quick Start Guide 快速入门", open=False, elem_id="acc_quick"):
with gr.Accordion("📋 快速入门", open=False, elem_id="acc_quick"):
gr.Markdown("""
### How to Use 使用说明
1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
**(可选)提供参考声音** - 上传或录制一段音频,为声音合成提供音色、语调和情感等个性化特征
2. **(Optional) Enter prompt text** - If you provided a voice prompt, enter the corresponding transcript here (auto-recognition available).
**(可选项)输入参考文本** - 如果提供了参考语音,请输入其对应的文本内容(支持自动识别)
3. **Enter target text** - Type the text you want the model to speak.
**输入目标文本** - 输入您希望模型朗读的文字内容。
4. **Generate Speech** - Click the "Generate" button to create your audio.
**生成语音** - 点击"生成"按钮,即可为您创造出音频。
### 使用说明
1. **(可选)提供参考声音** - 上传或录制一段音频,为声音合成提供音色、语调和情感等个性化特征。
2. **(可选)输入参考文本** - 如果提供了参考语音,请输入其对应的文本内容(支持自动识别)。
3. **输入目标文本** - 输入您希望模型朗读的文字内容。
4. **生成语音** - 点击"生成语音"按钮,即可为您创造出音频
""")
# Pro Tips
with gr.Accordion("💡 Pro Tips 使用建议", open=False, elem_id="acc_tips"):
with gr.Accordion("💡 使用建议", open=False, elem_id="acc_tips"):
gr.Markdown("""
### Prompt Speech Enhancement|参考语音降噪
- **Enable** to remove background noise for a clean voice, with an external ZipEnhancer component. However, this will limit the audio sampling rate to 16kHz, restricting the cloning quality ceiling.
**启用**:通过 ZipEnhancer 组件消除背景噪音,但会将音频采样率限制在16kHz,限制克隆上限。
- **Disable** to preserve the original audio's all information, including background atmosphere, and support audio cloning up to 44.1kHz sampling rate.
**禁用**保留原始音频的全部信息包括背景环境声最高支持44.1kHz的音频复刻。
### 参考语音降噪
- **启用**:通过 ZipEnhancer 组件消除背景噪音,但会将音频采样率限制在16kHz,限制克隆上限。
- **禁用**:保留原始音频的全部信息,包括背景环境声,最高支持44.1kHz的音频复刻。
### Text Normalization|文本正则化
- **Enable** to process general text with an external WeTextProcessing component.
**启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
- **Disable** to use VoxCPM's native text understanding ability. For example, it supports phonemes input (For Chinese, phonemes are converted using pinyin, {ni3}{hao3}; For English, phonemes are converted using CMUDict, {HH AH0 L OW1}), try it!
**禁用**:将使用 VoxCPM 内置的文本理解能力。如,支持音素输入(如中文转拼音:{ni3}{hao3};英文转CMUDict:{HH AH0 L OW1})和公式符号合成,尝试一下!
### 文本正则化
- **启用**:使用 WeTextProcessing 组件,可支持常见文本的正则化处理。
- **禁用**:使用 VoxCPM 内置的文本理解能力。如,支持音素输入(如中文转拼音:{ni3}{hao3};英文转CMUDict:{HH AH0 L OW1})和公式符号合成,尝试一下!
### CFG Value|CFG 值
- **Lower CFG** if the voice prompt sounds strained or expressive, or instability occurs with long text input.
**调低**:如果提示语音听起来不自然或过于夸张,或者长文本输入出现稳定性问题。
- **Higher CFG** for better adherence to the prompt speech style or input text, or instability occurs with too short text input.
**调高**:为更好地贴合提示音频的风格或输入文本, 或者极短文本输入出现稳定性问题。
### CFG 值
- **调低**:如果提示语音听起来不自然或过于夸张,或者长文本输入出现稳定性问题。
- **调高**:为更好地贴合提示音频的风格或输入文本, 或者极短文本输入出现稳定性问题。
### Inference Timesteps|推理时间步
- **Lower** for faster synthesis speed.
**调低**:合成速度更快。
- **Higher** for better synthesis quality.
**调高**:合成质量更佳。
### 推理时间步
- **调低**:合成速度更快。
- **调高**:合成质量更佳。
""")
# Main controls
@@ -202,22 +191,22 @@ def create_demo_interface(demo: VoxCPMDemo):
prompt_wav = gr.Audio(
sources=["upload", 'microphone'],
type="filepath",
label="Prompt Speech (Optional, or let VoxCPM improvise)",
label="参考语音(可选,或让 VoxCPM 自由发挥)",
value="./examples/example.wav",
)
DoDenoisePromptAudio = gr.Checkbox(
value=False,
label="Prompt Speech Enhancement",
label="参考语音增强",
elem_id="chk_denoise",
info="We use ZipEnhancer model to denoise the prompt audio."
info="使用 ZipEnhancer 模型对参考音频进行降噪。"
)
with gr.Row():
prompt_text = gr.Textbox(
value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
label="Prompt Text",
placeholder="Please enter the prompt text. Automatic recognition is supported, and you can correct the results yourself..."
label="参考文本",
placeholder="请输入参考文本。支持自动识别,您也可以自行修改结果..."
)
run_btn = gr.Button("Generate Speech", variant="primary")
run_btn = gr.Button("生成语音", variant="primary")
with gr.Column():
cfg_value = gr.Slider(
@@ -225,30 +214,31 @@ def create_demo_interface(demo: VoxCPMDemo):
maximum=3.0,
value=2.0,
step=0.1,
label="CFG Value (Guidance Scale)",
info="Higher values increase adherence to prompt, lower values allow more creativity"
label="CFG 值 (引导比例)",
info="值越高越贴合提示,值越低允许更多的创造性"
)
inference_timesteps = gr.Slider(
minimum=4,
maximum=30,
value=10,
step=1,
label="Inference Timesteps",
info="Number of inference timesteps for generation (higher values may improve quality but slower)"
label="推理时间步",
info="生成的推理时间步数(值越高可能质量越好,但速度更慢)"
)
with gr.Row():
text = gr.Textbox(
value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
label="Target Text",
value="VoxCPM 是 ModelBest 推出的一款创新型端到端 TTS 模型,旨在生成极具表现力的语音。",
label="目标文本",
)
with gr.Row():
DoNormalizeText = gr.Checkbox(
value=False,
label="Text Normalization",
label="文本正则化",
elem_id="chk_normalize",
info="We use wetext library to normalize the input text."
info="使用 wetext 库对输入文本进行标准化。"
)
audio_output = gr.Audio(label="Output Audio")
audio_output = gr.Audio(label="输出音频")
# Wiring
run_btn.click(