Update: VoxCPM1.5 and fine-tuning support

2025-12-05 21:00:01 +08:00
parent d1bb6aaf41
commit 3443dbb212
29 changed files with 2928 additions and 228 deletions
--- a/scripts/test_voxcpm_ft_infer.py
+++ b/scripts/test_voxcpm_ft_infer.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""
+Full finetune inference script (no LoRA).
+
+The checkpoint directory contains the complete model files (pytorch_model.bin, config.json, audiovae.pth, etc.)
+and can be loaded directly via VoxCPMModel.from_local().
+
+Usage:
+
+    python scripts/test_voxcpm_ft_infer.py \
+        --ckpt_dir /path/to/checkpoints/step_0001000 \
+        --text "Hello, I am the finetuned VoxCPM." \
+        --output ft_test.wav
+
+With voice cloning:
+
+    python scripts/test_voxcpm_ft_infer.py \
+        --ckpt_dir /path/to/checkpoints/step_0001000 \
+        --text "Hello, this is voice cloning result." \
+        --prompt_audio path/to/ref.wav \
+        --prompt_text "Reference audio transcript" \
+        --output ft_clone.wav
+"""
+
+import argparse
+from pathlib import Path
+
+import soundfile as sf
+import torch
+
+from voxcpm.model import VoxCPMModel
+
+
+def parse_args():
+    """Parse the command-line options of the full-finetune inference test.
+
+    The options are registered from two small tables; table order is --help order.
+
+    Returns:
+        argparse.Namespace: parsed values with these attributes:
+            ckpt_dir (str): checkpoint directory, required.
+            text (str): target text to synthesize, required.
+            prompt_audio (str): reference audio path, empty string when omitted.
+            prompt_text (str): reference transcript, empty string when omitted.
+            output (str): output wav path, default ft_test.wav.
+            cfg_value (float): CFG scale, default 2.0.
+            inference_timesteps (int): diffusion steps, default 10.
+            max_len (int): max generation steps, default 600.
+
+    Raises:
+        SystemExit: raised by argparse on missing or malformed options.
+    """
+    cli = argparse.ArgumentParser("VoxCPM full-finetune inference test (no LoRA)")
+
+    # Mandatory string options as (flag, help text), in --help order.
+    mandatory = (
+        (
+            "--ckpt_dir",
+            "Checkpoint directory (contains pytorch_model.bin, config.json, audiovae.pth, etc.)",
+        ),
+        (
+            "--text",
+            "Target text to synthesize",
+        ),
+    )
+    for flag, description in mandatory:
+        cli.add_argument(flag, type=str, required=True, help=description)
+
+    # Options with a fallback as (flag, type, default, help text), in --help order.
+    defaulted = (
+        # Voice-cloning inputs.
+        ("--prompt_audio", str, "", "Optional: reference audio path for voice cloning"),
+        ("--prompt_text", str, "", "Optional: transcript of reference audio"),
+        # Destination of the synthesized audio.
+        ("--output", str, "ft_test.wav", "Output wav file path"),
+        # Generation settings.
+        ("--cfg_value", float, 2.0, "CFG scale (default: 2.0)"),
+        ("--inference_timesteps", int, 10, "Diffusion inference steps (default: 10)"),
+        ("--max_len", int, 600, "Max generation steps"),
+    )
+    for flag, value_type, fallback, description in defaulted:
+        cli.add_argument(flag, type=value_type, default=fallback, help=description)
+
+    return cli.parse_args()
+
+
+def main():
+    """Load a fully finetuned checkpoint, synthesize one utterance, save it as wav."""
+    opts = parse_args()
+
+    print(f"[FT Inference] Loading model: {opts.ckpt_dir}")
+    tts = VoxCPMModel.from_local(opts.ckpt_dir, optimize=True, training=False)
+
+    # Normalize the optional voice-cloning inputs to plain strings.
+    ref_wav = opts.prompt_audio or ""
+    ref_text = opts.prompt_text or ""
+    print(f"[FT Inference] Synthesizing: text='{opts.text}'")
+    if ref_wav:
+        print(f"[FT Inference] Using reference audio: {ref_wav}")
+        print(f"[FT Inference] Reference text: {ref_text}")
+
+    generate_kwargs = dict(
+        target_text=opts.text,
+        prompt_text=ref_text,
+        prompt_wav_path=ref_wav,
+        max_len=opts.max_len,
+        inference_timesteps=opts.inference_timesteps,
+        cfg_value=opts.cfg_value,
+    )
+    with torch.inference_mode():
+        audio = tts.generate(**generate_kwargs)
+
+    # Guard clause: only a tensor result can be converted and written.
+    if not isinstance(audio, torch.Tensor):
+        raise TypeError(f"Unexpected return type from model.generate: {type(audio)}")
+    samples = audio.squeeze(0).cpu().numpy()
+
+    wav_path = Path(opts.output)
+    wav_path.parent.mkdir(parents=True, exist_ok=True)
+    sf.write(str(wav_path), samples, tts.sample_rate)
+
+    seconds = len(samples) / tts.sample_rate
+    print(f"[FT Inference] Saved to: {wav_path}, duration: {seconds:.2f}s")
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    main()
+
+