Update eval.py
eval.py
CHANGED
@@ -6,8 +6,8 @@ from typing import Dict
 import torch
 from datasets import Audio, Dataset, load_dataset, load_metric
 
-from transformers import AutoFeatureExtractor, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
-from pyctcdecode import BeamSearchDecoderCTC
+from transformers import AutoFeatureExtractor, AutoModelForCTC, pipeline, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2FeatureExtractor
+# from pyctcdecode import BeamSearchDecoderCTC
 
 
 def log_results(result: Dataset, args: Dict[str, str]):
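Since the decoder is now taken from Wav2Vec2ProcessorWithLM inside main() (see the last hunk below), the direct pyctcdecode dependency is no longer needed at import time, hence the commented-out import.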
@@ -15,8 +15,8 @@ def log_results(result: Dataset, args: Dict[str, str]):
 
     log_outputs = args.log_outputs
     lm = "withLM" if args.use_lm else "noLM"
-    model_id = args.model_id.replace("/", "_")
-    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split, lm])
+    model_id = args.model_id.replace("/", "_").replace(".", "")
+    dataset_id = "_".join([model_id] + args.dataset.split("/") + [args.config, args.split, lm])
 
     # load metric
     wer = load_metric("wer")
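Prepending the sanitized model id to dataset_id keeps result files from different models from overwriting each other, and stripping dots avoids filenames that read like extensions. A quick illustration (all ids made up):

    model_id = "some-org/some-model-v1.0".replace("/", "_").replace(".", "")
    dataset_id = "_".join([model_id] + "NbAiLab/NPSC".split("/") + ["16K_mp3", "test", "withLM"])
    print(dataset_id)  # -> "some-org_some-model-v10_NbAiLab_NPSC_16K_mp3_test_withLM"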
@@ -27,7 +27,7 @@ def log_results(result: Dataset, args: Dict[str, str]):
     cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
 
     # print & log results
-    result_str = f"WER: {wer_result}\nCER: {cer_result}"
+    result_str = f"{dataset_id}\nWER: {wer_result}\nCER: {cer_result}"
     print(result_str)
 
     with open(f"{dataset_id}_eval_results.txt", "w") as f:
@@ -57,7 +57,7 @@ def normalize_text(text: str, dataset: str) -> str:
 
     if dataset.lower().endswith("nst"):
         text = text.lower()
-        text = text.replace("(...
+        text = text.replace("(...vær stille under dette opptaket...)", "")
         text = re.sub('[áàâ]', 'a', text)
         text = re.sub('[ä]', 'æ', text)
         text = re.sub('[éèëê]', 'e', text)
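For readers without Norwegian: the phrase being stripped, "vær stille under dette opptaket", means "be quiet during this recording"; it is a session instruction embedded in NST transcripts, presumably removed so it cannot inflate WER.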
@@ -78,10 +78,10 @@ def normalize_text(text: str, dataset: str) -> str:
     text = re.sub('[ç]', 'c', text)
     text = re.sub('[úùüû]', 'u', text)
     text = re.sub('\s', ' ', text)
-
-
-
-
+    text = re.sub("<ee(eh)?>", "e", text)
+    text = re.sub("<mmm?>", "m", text)
+    text = re.sub("<qq>", "q", text)
+    text = re.sub("<inaudible>", "i", text)
 
     # # In addition, we can normalize the target text, e.g. removing new lines characters etc...
     # # note that order is important here!
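The four added substitutions collapse dataset-specific hesitation and noise tags into the single letters the reference transcripts apparently use. A quick illustration on an invented string:

    import re

    text = "<eeeh> det er <mm> vanskelig <inaudible>"  # invented example
    text = re.sub("<ee(eh)?>", "e", text)    # "<ee>" / "<eeeh>" -> "e"
    text = re.sub("<mmm?>", "m", text)       # "<mm>" / "<mmm>" -> "m"
    text = re.sub("<qq>", "q", text)         # "<qq>" -> "q"
    text = re.sub("<inaudible>", "i", text)  # "<inaudible>" -> "i"
    print(text)  # -> "e det er m vanskelig i"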
@@ -112,11 +112,27 @@ def main(args):
     args.device = 0 if torch.cuda.is_available() else -1
     # asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
 
-    feature_extractor_dict, _ = Wav2Vec2FeatureExtractor.get_feature_extractor_dict(args.model_id)
-    feature_extractor_dict["processor_class"] = "Wav2Vec2Processor" if not args.use_lm else "Wav2Vec2ProcessorWithLM"
-    feature_extractor = Wav2Vec2FeatureExtractor.from_dict(feature_extractor_dict)
+    model_instance = AutoModelForCTC.from_pretrained(args.model_id)
+    if args.use_lm:
+        processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
+        decoder = processor.decoder
+    else:
+        processor = Wav2Vec2Processor.from_pretrained(args.model_id)
+        decoder = None
+    asr = pipeline(
+        "automatic-speech-recognition",
+        model=model_instance,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        decoder=decoder,
+        device=args.device
+    )
+
+    # feature_extractor_dict, _ = Wav2Vec2FeatureExtractor.get_feature_extractor_dict(args.model_id)
+    # feature_extractor_dict["processor_class"] = "Wav2Vec2Processor" if not args.use_lm else "Wav2Vec2ProcessorWithLM"
+    # feature_extractor = Wav2Vec2FeatureExtractor.from_dict(feature_extractor_dict)
 
-    asr = pipeline("automatic-speech-recognition", model=args.model_id, feature_extractor=feature_extractor, device=args.device, decoder=BeamSearchDecoderCTC.load_from_dir("./"))
+    # asr = pipeline("automatic-speech-recognition", model=args.model_id, feature_extractor=feature_extractor, device=args.device, decoder=BeamSearchDecoderCTC.load_from_dir("./"))
 
     # map function to decode audio
     def map_to_pred(batch):
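The new block replaces the earlier workaround (rewriting the feature extractor's processor_class and loading a beam-search decoder from the working directory) with an explicit load: the model and matching processor come straight from the checkpoint, and a decoder is handed to the pipeline only when --use_lm is set. Roughly, the with-LM path does the following under the hood; this is a sketch, not part of eval.py, and the model id is a placeholder:

    import numpy as np
    import torch
    from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

    model = AutoModelForCTC.from_pretrained("some-org/some-model")  # placeholder id
    processor = Wav2Vec2ProcessorWithLM.from_pretrained("some-org/some-model")

    audio = np.zeros(16_000, dtype=np.float32)  # one second of dummy audio
    inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # pyctcdecode beam search with the KenLM bundled in the checkpoint:
    transcription = processor.batch_decode(logits.numpy()).text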
@@ -172,4 +188,4 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    main(args)
+    main(args)
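For completeness, an invocation of the updated script could look like the line below; every value is a placeholder, and the flag spellings are inferred from the args attributes the script reads (model_id, dataset, config, split, use_lm, log_outputs):

    python eval.py --model_id some-org/some-model --dataset NbAiLab/NPSC --config 16K_mp3 --split test --use_lm --log_outputs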