Initial commit with large files ignored

2025-12-11 00:12:18 +08:00
parent a266c0a88d
commit 1e44eba871
17 changed files with 179286 additions and 329 deletions
--- a/app.py
+++ b/app.py
@@ -45,7 +45,8 @@ class VoxCPMDemo:
        repo_id = os.environ.get("HF_REPO_ID", "").strip()
        if len(repo_id) > 0:
            target_dir = os.path.join("models", repo_id.replace("/", "__"))
-            if not os.path.isdir(target_dir):
+            # Take the download path unless the directory exists AND contains config.json
+            if not os.path.isdir(target_dir) or not os.path.exists(os.path.join(target_dir, "config.json")):
                try:
                    from huggingface_hub import snapshot_download  # type: ignore
                    os.makedirs(target_dir, exist_ok=True)
@@ -155,45 +156,33 @@ def create_demo_interface(demo: VoxCPMDemo):
        gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm_logo.png" alt="VoxCPM Logo"></div>')

        # Quick Start
-        with gr.Accordion("📋 Quick Start Guide ｜快速入门", open=False, elem_id="acc_quick"):
+        with gr.Accordion("📋 快速入门", open=False, elem_id="acc_quick"):
            gr.Markdown("""
-            ### How to Use ｜使用说明
-            1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.  
-               **（可选）提供参考声音** - 上传或录制一段音频，为声音合成提供音色、语调和情感等个性化特征
-            2. **(Optional) Enter prompt text** - If you provided a voice prompt, enter the corresponding transcript here (auto-recognition available).  
-               **（可选项）输入参考文本** - 如果提供了参考语音，请输入其对应的文本内容（支持自动识别）。
-            3. **Enter target text** - Type the text you want the model to speak.  
-               **输入目标文本** - 输入您希望模型朗读的文字内容。
-            4. **Generate Speech** - Click the "Generate" button to create your audio.  
-               **生成语音** - 点击"生成"按钮，即可为您创造出音频。
+            ### 使用说明
+            1. **（可选）提供参考声音** - 上传或录制一段音频，为声音合成提供音色、语调和情感等个性化特征。
+            2. **（可选）输入参考文本** - 如果提供了参考语音，请输入其对应的文本内容（支持自动识别）。
+            3. **输入目标文本** - 输入您希望模型朗读的文字内容。
+            4. **生成语音** - 点击"生成语音"按钮，即可为您创造出音频。
            """)

        # Pro Tips
-        with gr.Accordion("💡 Pro Tips ｜使用建议", open=False, elem_id="acc_tips"):
+        with gr.Accordion("💡 使用建议", open=False, elem_id="acc_tips"):
            gr.Markdown("""
-            ### Prompt Speech Enhancement｜参考语音降噪
-            - **Enable** to remove background noise for a clean voice, with an external ZipEnhancer component. However, this will limit the audio sampling rate to 16kHz, restricting the cloning quality ceiling.  
-              **启用**：通过 ZipEnhancer 组件消除背景噪音，但会将音频采样率限制在16kHz，限制克隆上限。
-            - **Disable** to preserve the original audio's all information, including background atmosphere, and support audio cloning up to 44.1kHz sampling rate.  
-              **禁用**：保留原始音频的全部信息，包括背景环境声，最高支持44.1kHz的音频复刻。
+            ### 参考语音降噪
+            - **启用**：通过 ZipEnhancer 组件消除背景噪音，但会将音频采样率限制在16kHz，限制克隆上限。
+            - **禁用**：保留原始音频的全部信息，包括背景环境声，最高支持44.1kHz的音频复刻。

-            ### Text Normalization｜文本正则化
-            - **Enable** to process general text with an external WeTextProcessing component.  
-              **启用**：使用 WeTextProcessing 组件，可支持常见文本的正则化处理。
-            - **Disable** to use VoxCPM's native text understanding ability. For example, it supports phonemes input (For Chinese, phonemes are converted using pinyin, {ni3}{hao3}; For English, phonemes are converted using CMUDict, {HH AH0 L OW1}), try it!  
-              **禁用**：将使用 VoxCPM 内置的文本理解能力。如，支持音素输入（如中文转拼音：{ni3}{hao3}；英文转CMUDict：{HH AH0 L OW1}）和公式符号合成，尝试一下！
+            ### 文本正则化
+            - **启用**：使用 WeTextProcessing 组件，可支持常见文本的正则化处理。
+            - **禁用**：将使用 VoxCPM 内置的文本理解能力。如，支持音素输入（如中文转拼音：{ni3}{hao3}；英文转CMUDict：{HH AH0 L OW1}）和公式符号合成，尝试一下！

-            ### CFG Value｜CFG 值
-            - **Lower CFG** if the voice prompt sounds strained or expressive, or instability occurs with long text input.  
-              **调低**：如果提示语音听起来不自然或过于夸张，或者长文本输入出现稳定性问题。
-            - **Higher CFG** for better adherence to the prompt speech style or input text, or instability occurs with too short text input.
-              **调高**：为更好地贴合提示音频的风格或输入文本， 或者极短文本输入出现稳定性问题。
+            ### CFG 值
+            - **调低**：如果提示语音听起来不自然或过于夸张，或者长文本输入出现稳定性问题。
+            - **调高**：为更好地贴合提示音频的风格或输入文本， 或者极短文本输入出现稳定性问题。

-            ### Inference Timesteps｜推理时间步
-            - **Lower** for faster synthesis speed.  
-              **调低**：合成速度更快。
-            - **Higher** for better synthesis quality.  
-              **调高**：合成质量更佳。
+            ### 推理时间步
+            - **调低**：合成速度更快。
+            - **调高**：合成质量更佳。
            """)

        # Main controls
@@ -202,22 +191,22 @@ def create_demo_interface(demo: VoxCPMDemo):
                prompt_wav = gr.Audio(
                    sources=["upload", 'microphone'],
                    type="filepath",
-                    label="Prompt Speech (Optional, or let VoxCPM improvise)",
+                    label="参考语音（可选，或让 VoxCPM 自由发挥）",
                    value="./examples/example.wav",
                )
                DoDenoisePromptAudio = gr.Checkbox(
                    value=False,
-                    label="Prompt Speech Enhancement",
+                    label="参考语音增强",
                    elem_id="chk_denoise",
-                    info="We use ZipEnhancer model to denoise the prompt audio."
+                    info="使用 ZipEnhancer 模型对参考音频进行降噪。"
                )
                with gr.Row():
                    prompt_text = gr.Textbox(
                        value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
-                        label="Prompt Text",
-                        placeholder="Please enter the prompt text. Automatic recognition is supported, and you can correct the results yourself..."
+                        label="参考文本",
+                        placeholder="请输入参考文本。支持自动识别，您也可以自行修改结果..."
                    )
-                run_btn = gr.Button("Generate Speech", variant="primary")
+                run_btn = gr.Button("生成语音", variant="primary")

            with gr.Column():
                cfg_value = gr.Slider(
@@ -225,30 +214,31 @@ def create_demo_interface(demo: VoxCPMDemo):
                    maximum=3.0,
                    value=2.0,
                    step=0.1,
-                    label="CFG Value (Guidance Scale)",
-                    info="Higher values increase adherence to prompt, lower values allow more creativity"
+                    label="CFG 值 (引导比例)",
+                    info="值越高越贴合提示，值越低允许更多的创造性"
                )
                inference_timesteps = gr.Slider(
                    minimum=4,
                    maximum=30,
                    value=10,
                    step=1,
-                    label="Inference Timesteps",
-                    info="Number of inference timesteps for generation (higher values may improve quality but slower)"
+                    label="推理时间步",
+                    info="生成的推理时间步数（值越高可能质量越好，但速度更慢）"
                )
                with gr.Row():
                    text = gr.Textbox(
-                        value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.",
-                        label="Target Text",
+                        value="VoxCPM 是 ModelBest 推出的一款创新型端到端 TTS 模型，旨在生成极具表现力的语音。",
+                        label="目标文本",
                    )
                with gr.Row():
                    DoNormalizeText = gr.Checkbox(
                        value=False,
-                        label="Text Normalization",
+                        label="文本正则化",
                        elem_id="chk_normalize",
-                        info="We use wetext library to normalize the input text."
+                        info="使用 wetext 库对输入文本进行标准化。"
                    )
-                audio_output = gr.Audio(label="Output Audio")
+                audio_output = gr.Audio(label="输出音频")
+

        # Wiring
        run_btn.click(