Add Guidance for Repetition Penalty

https://github.com/huggingface/transformers/pull/37625 added support for excluding the input (prompt) tokens from `RepetitionPenaltyLogitsProcessor`. This change updates the README code snippet to use that option, applying a repetition penalty of 3 to the generated tokens only.

Files changed (1) hide show

README.md +10 -4

README.md CHANGED Viewed

@@ -51,7 +51,7 @@ Then run the code:
 ```python
 import torch
 import torchaudio
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 from huggingface_hub import hf_hub_download
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -64,7 +64,6 @@ speech_granite = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_name).to(device)
 # prepare speech and text prompt, using the appropriate prompt template
 audio_path = hf_hub_download(repo_id=model_name, filename='10226_10111_000000.wav')
 wav, sr = torchaudio.load(audio_path, normalize=True)
 assert wav.shape[0] == 1 and sr == 16000 # mono, 16khz
@@ -92,7 +91,14 @@ model_inputs = speech_granite_processor(
     device=device, # Computation device; returned tensors are put on CPU
     return_tensors="pt",
 ).to(device)
 model_outputs = speech_granite.generate(
     **model_inputs,
     max_new_tokens=200,
@@ -100,9 +106,9 @@ model_outputs = speech_granite.generate(
     do_sample=False,
     min_length=1,
     top_p=1.0,
-    repetition_penalty=1.0,
     length_penalty=1.0,
     temperature=1.0,
     bos_token_id=tokenizer.bos_token_id,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.pad_token_id,

 ```python
 import torch
 import torchaudio
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, RepetitionPenaltyLogitsProcessor
 from huggingface_hub import hf_hub_download
 device = "cuda" if torch.cuda.is_available() else "cpu"
     model_name).to(device)
 # prepare speech and text prompt, using the appropriate prompt template
 audio_path = hf_hub_download(repo_id=model_name, filename='10226_10111_000000.wav')
 wav, sr = torchaudio.load(audio_path, normalize=True)
 assert wav.shape[0] == 1 and sr == 16000 # mono, 16khz
     device=device, # Computation device; returned tensors are put on CPU
     return_tensors="pt",
 ).to(device)
+# The recommended repetition penalty is 3 as long as input IDs are excluded.
+# Otherwise, you should use a repetition penalty of 1 to keep results stable.
+reptition_penalty_processor = RepetitionPenaltyLogitsProcessor(
+    penalty=3.0,
+    prompt_ignore_length=model_inputs["input_ids"].shape[-1],
+)
 model_outputs = speech_granite.generate(
     **model_inputs,
     max_new_tokens=200,
     do_sample=False,
     min_length=1,
     top_p=1.0,
     length_penalty=1.0,
     temperature=1.0,
+    logits_processor=[reptition_penalty_processor],
     bos_token_id=tokenizer.bos_token_id,
     eos_token_id=tokenizer.eos_token_id,
     pad_token_id=tokenizer.pad_token_id,