diff --git a/README.md b/README.md index bc25618..f81bc57 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,6 @@ Unlike mainstream approaches that convert speech to discrete tokens, VoxCPM uses - - - - - ## Quick Start ### 🔧 Install from PyPI @@ -55,7 +50,7 @@ By default, when you first run the script, the model will be downloaded automati - Download VoxCPM-0.5B ``` from huggingface_hub import snapshot_download - snapshot_download("openbmb/VoxCPM-0.5B",local_files_only=local_files_only) + snapshot_download("openbmb/VoxCPM-0.5B") ``` - Download ZipEnhancer and SenseVoice-Small. We use ZipEnhancer to enhance speech prompts and SenseVoice-Small for speech prompt ASR in the web demo. ``` @@ -103,6 +98,13 @@ voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, desi --output out.wav \ --denoise +# (Optional) Voice cloning (reference audio + transcript file) +voxcpm --text "VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech." \ + --prompt-audio path/to/voice.wav \ + --prompt-file "/path/to/text-file" \ + --output out.wav \ + --denoise + # 3) Batch processing (one text per line) voxcpm --input examples/input.txt --output-dir outs # (optional) Batch + cloning @@ -245,6 +247,13 @@ VoxCPM achieves competitive results on public zero-shot TTS benchmarks: +## 📝TO-DO List +Please stay tuned for updates! +- [ ] Release the VoxCPM technical report. +- [ ] Support higher sampling rate (next version). + + + ## 📄 License The VoxCPM model weights and code are open-sourced under the [Apache-2.0](LICENSE) license. 
@@ -265,6 +274,8 @@ This project is developed by the following institutions: - [THUHCSI](https://github.com/thuhcsi) +## ⭐ Star History + [![Star History Chart](https://api.star-history.com/svg?repos=OpenBMB/VoxCPM&type=Date)](https://star-history.com/#OpenBMB/VoxCPM&Date) ## 📚 Citation diff --git a/app.py b/app.py index 3f64801..f109c09 100644 --- a/app.py +++ b/app.py @@ -194,10 +194,6 @@ def create_demo_interface(demo: VoxCPMDemo): **调低**:合成速度更快。 - **Higher** for better synthesis quality. **调高**:合成质量更佳。 - - ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音) - While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually. - 虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。 """) # Main controls @@ -244,14 +240,13 @@ def create_demo_interface(demo: VoxCPMDemo): text = gr.Textbox( value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly realistic speech.", label="Target Text", - info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio." ) with gr.Row(): DoNormalizeText = gr.Checkbox( value=False, label="Text Normalization", elem_id="chk_normalize", - info="We use WeTextPorcessing library to normalize the input text." + info="We use wetext library to normalize the input text." 
) audio_output = gr.Audio(label="Output Audio") diff --git a/pyproject.toml b/pyproject.toml index dfb3399..8f9d5ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "addict", "wetext", "modelscope>=1.22.0", - "datasets>=2,<4", + "datasets>=3,<4", "huggingface-hub", "pydantic", "tqdm", diff --git a/src/voxcpm/cli.py b/src/voxcpm/cli.py index 801266f..f58e8b1 100644 --- a/src/voxcpm/cli.py +++ b/src/voxcpm/cli.py @@ -240,6 +240,7 @@ Examples: # Prompt audio (for voice cloning) parser.add_argument("--prompt-audio", "-pa", help="Reference audio file path") parser.add_argument("--prompt-text", "-pt", help="Reference text corresponding to the audio") + parser.add_argument("--prompt-file", "-pf", help="Reference text file corresponding to the audio") parser.add_argument("--denoise", action="store_true", help="Enable prompt speech enhancement (denoising)") # Generation parameters @@ -279,6 +280,12 @@ def main(): # If prompt audio+text provided → voice cloning if args.prompt_audio or args.prompt_text: + if not args.prompt_text and args.prompt_file: + assert os.path.isfile(args.prompt_file), "Prompt file does not exist or is not accessible." 
+ + with open(args.prompt_file, 'r', encoding='utf-8') as f: + args.prompt_text = f.read() + if not args.prompt_audio or not args.prompt_text: print("Error: Voice cloning requires both --prompt-audio and --prompt-text") sys.exit(1) diff --git a/src/voxcpm/core.py b/src/voxcpm/core.py index 7ff1d08..3b88b55 100644 --- a/src/voxcpm/core.py +++ b/src/voxcpm/core.py @@ -1,6 +1,7 @@ import torch import torchaudio import os +import re import tempfile from huggingface_hub import snapshot_download from .model.voxcpm import VoxCPMModel @@ -130,6 +131,8 @@ class VoxCPM: if (prompt_wav_path is None) != (prompt_text is None): raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + text = text.replace("\n", " ") + text = re.sub(r'\s+', ' ', text) temp_prompt_wav_path = None try: diff --git a/src/voxcpm/model/voxcpm.py b/src/voxcpm/model/voxcpm.py index 3af0af9..1f5fdec 100644 --- a/src/voxcpm/model/voxcpm.py +++ b/src/voxcpm/model/voxcpm.py @@ -160,8 +160,8 @@ class VoxCPMModel(nn.Module): self.feat_encoder_step = torch.compile(self.feat_encoder, mode="reduce-overhead", fullgraph=True) self.feat_decoder.estimator = torch.compile(self.feat_decoder.estimator, mode="reduce-overhead", fullgraph=True) except Exception as e: - print(e) - print("VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") + print(f"Error: {e}") + print("Warning: VoxCPMModel can not be optimized by torch.compile, using original forward_step functions") self.base_lm.forward_step = self.base_lm.forward_step self.residual_lm.forward_step = self.residual_lm.forward_step self.feat_encoder_step = self.feat_encoder @@ -283,8 +283,11 @@ class VoxCPMModel(nn.Module): else: break else: - break - return self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + break + + decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of 
the audio + return decode_audio @torch.inference_mode() def build_prompt_cache( @@ -468,7 +471,8 @@ class VoxCPMModel(nn.Module): else: break decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)).squeeze(1).cpu() - + decode_audio = decode_audio[..., 640:-640] # trick: trim the start and end of the audio + return ( decode_audio, target_text_token, @@ -580,7 +584,6 @@ class VoxCPMModel(nn.Module): pred_feat_seq = torch.cat(pred_feat_seq, dim=1) # b, t, p, d feat_pred = rearrange(pred_feat_seq, "b t p d -> b d (t p)", b=B, p=self.patch_size) - feat_pred = feat_pred[..., 1:-1] # trick: remove the first and last token return feat_pred, pred_feat_seq.squeeze(0).cpu() @classmethod