Commit 
							
							·
						
						266740b
	
1
								Parent(s):
							
							fd6acee
								
Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -60,4 +60,47 @@ To transcribe audio files the model can be used as a standalone acoustic model a | |
| 60 | 
             
             # take argmax and decode
         | 
| 61 | 
             
             predicted_ids = torch.argmax(logits, dim=-1)
         | 
| 62 | 
             
             transcription = tokenizer.batch_decode(predicted_ids)
         | 
| 63 | 
            -
             ```
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 60 | 
             
             # take argmax and decode
         | 
| 61 | 
             
             predicted_ids = torch.argmax(logits, dim=-1)
         | 
| 62 | 
             
             transcription = tokenizer.batch_decode(predicted_ids)
         | 
| 63 | 
            +
             ```
         | 
| 64 | 
            +
             
         | 
| 65 | 
            +
             ## Evaluation
         | 
| 66 | 
            +
             
         | 
| 67 | 
            +
             This short script shows how the model can be evaluated on the LibriSpeech "clean" and "other" test data.
         | 
| 68 | 
            +
             
         | 
| 69 | 
            +
            ```python
         | 
| 70 | 
            +
            from datasets import load_dataset
         | 
| 71 | 
            +
            from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer
         | 
| 72 | 
            +
            import soundfile as sf
         | 
| 73 | 
            +
            import torch
         | 
| 74 | 
            +
            from jiwer import wer
         | 
| 75 | 
            +
             | 
| 76 | 
            +
             | 
| 77 | 
            +
            librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")
         | 
| 80 | 
            +
            tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
         | 
| 81 | 
            +
             | 
| 82 | 
            +
            def map_to_array(batch):
         | 
| 83 | 
            +
                speech, _ = sf.read(batch["file"])
         | 
| 84 | 
            +
                batch["speech"] = speech
         | 
| 85 | 
            +
                return batch
         | 
| 86 | 
            +
             | 
| 87 | 
            +
            librispeech_eval = librispeech_eval.map(map_to_array)
         | 
| 88 | 
            +
             | 
| 89 | 
            +
            def map_to_pred(batch):
         | 
| 90 | 
            +
                input_values = tokenizer(batch["speech"], return_tensors="pt", padding="longest").input_values
         | 
| 91 | 
            +
                with torch.no_grad():
         | 
| 92 | 
            +
                    logits = model(input_values.to("cuda")).logits
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                predicted_ids = torch.argmax(logits, dim=-1)
         | 
| 95 | 
            +
                transcription = tokenizer.batch_decode(predicted_ids)
         | 
| 96 | 
            +
                batch["transcription"] = transcription
         | 
| 97 | 
            +
                return batch
         | 
| 98 | 
            +
             | 
| 99 | 
            +
            result = librispeech_eval.map(map_to_pred, batched=True, batch_size=16, remove_columns=["speech"])
         | 
| 100 | 
            +
             | 
| 101 | 
            +
            print("WER:", wer(result["text"], result["transcription"]))
         | 
| 102 | 
            +
            ```
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            | "clean" | "other" |
         | 
| 105 | 
            +
            |---|---|
         | 
| 106 | 
            +
            | 4.1 |   |
         | 

