FX: noise point

This commit is contained in:
周逸轩
2025-09-18 14:50:01 +08:00
parent bdd516b579
commit 5257ec3dc5

View File

@@ -283,8 +283,10 @@ class VoxCPMModel(nn.Module):
else: else:
break break
else: else:
break
return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
return decode_audio
@torch.inference_mode() @torch.inference_mode()
def build_prompt_cache( def build_prompt_cache(
@@ -468,7 +470,8 @@ class VoxCPMModel(nn.Module):
else: else:
break break
decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu()
decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio
return ( return (
decode_audio, decode_audio,
target_text_token, target_text_token,
@@ -580,7 +583,6 @@ class VoxCPMModel(nn.Module):
pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d
feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size) feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size)
feat_pred = feat_pred[..., 1:-1] # trick: remove the first and last token
return feat_pred, pred_feat_seq.squeeze(0).cpu() return feat_pred, pred_feat_seq.squeeze(0).cpu()
@classmethod @classmethod