Update: VoxCPM1.5 and fine-tuning support

Labmem-Zhouyx
2025-12-05 21:00:01 +08:00
parent d1bb6aaf41
commit 461ad7e506
29 changed files with 2928 additions and 228 deletions

scripts/test_voxcpm_ft_infer.py

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Full finetune inference script (no LoRA).

The checkpoint directory contains the complete model files (pytorch_model.bin,
config.json, audiovae.pth, etc.) and can be loaded directly via
VoxCPMModel.from_local().

Usage:
    python scripts/test_voxcpm_ft_infer.py \
        --ckpt_dir /path/to/checkpoints/step_0001000 \
        --text "Hello, I am the finetuned VoxCPM." \
        --output ft_test.wav

With voice cloning:
    python scripts/test_voxcpm_ft_infer.py \
        --ckpt_dir /path/to/checkpoints/step_0001000 \
        --text "Hello, this is the voice cloning result." \
        --prompt_audio path/to/ref.wav \
        --prompt_text "Reference audio transcript" \
        --output ft_clone.wav
"""

import argparse
from pathlib import Path

import soundfile as sf
import torch

from voxcpm.model import VoxCPMModel


def parse_args():
    parser = argparse.ArgumentParser("VoxCPM full-finetune inference test (no LoRA)")
    parser.add_argument(
        "--ckpt_dir",
        type=str,
        required=True,
        help="Checkpoint directory (contains pytorch_model.bin, config.json, audiovae.pth, etc.)",
    )
    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="Target text to synthesize",
    )
    parser.add_argument(
        "--prompt_audio",
        type=str,
        default="",
        help="Optional: reference audio path for voice cloning",
    )
    parser.add_argument(
        "--prompt_text",
        type=str,
        default="",
        help="Optional: transcript of reference audio",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="ft_test.wav",
        help="Output wav file path",
    )
    parser.add_argument(
        "--cfg_value",
        type=float,
        default=2.0,
        help="CFG scale (default: 2.0)",
    )
    parser.add_argument(
        "--inference_timesteps",
        type=int,
        default=10,
        help="Diffusion inference steps (default: 10)",
    )
    parser.add_argument(
        "--max_len",
        type=int,
        default=600,
        help="Max generation steps (default: 600)",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    # Load model from checkpoint directory
    print(f"[FT Inference] Loading model: {args.ckpt_dir}")
    model = VoxCPMModel.from_local(args.ckpt_dir, optimize=True, training=False)

    # Run inference
    prompt_wav_path = args.prompt_audio or ""
    prompt_text = args.prompt_text or ""
    print(f"[FT Inference] Synthesizing: text='{args.text}'")
    if prompt_wav_path:
        print(f"[FT Inference] Using reference audio: {prompt_wav_path}")
        print(f"[FT Inference] Reference text: {prompt_text}")

    with torch.inference_mode():
        audio = model.generate(
            target_text=args.text,
            prompt_text=prompt_text,
            prompt_wav_path=prompt_wav_path,
            max_len=args.max_len,
            inference_timesteps=args.inference_timesteps,
            cfg_value=args.cfg_value,
        )

    # Squeeze and save audio
    if isinstance(audio, torch.Tensor):
        audio_np = audio.squeeze(0).cpu().numpy()
    else:
        raise TypeError(f"Unexpected return type from model.generate: {type(audio)}")

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(out_path), audio_np, model.sample_rate)
    print(f"[FT Inference] Saved to: {out_path}, duration: {len(audio_np) / model.sample_rate:.2f}s")


if __name__ == "__main__":
    main()
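
For reference, a minimal programmatic sketch of the same flow without the CLI wrapper, using only the calls that appear in the script above (VoxCPMModel.from_local, model.generate, model.sample_rate); the checkpoint path and output filename are placeholders:

    import soundfile as sf
    import torch
    from voxcpm.model import VoxCPMModel

    # Load a full-finetune checkpoint directory (placeholder path)
    model = VoxCPMModel.from_local("checkpoints/step_0001000", optimize=True, training=False)

    with torch.inference_mode():
        # Empty prompt_text/prompt_wav_path means zero-shot synthesis, no voice cloning
        audio = model.generate(
            target_text="Hello, I am the finetuned VoxCPM.",
            prompt_text="",
            prompt_wav_path="",
            max_len=600,
            inference_timesteps=10,
            cfg_value=2.0,
        )

    # Assumes a torch.Tensor return with a leading batch dimension, as the script above does
    sf.write("ft_api_test.wav", audio.squeeze(0).cpu().numpy(), model.sample_rate)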