Shuwei Hou
committed on
Commit
·
a213dac
1
Parent(s):
ed8d94c
update_speaker_id_to_json
Browse files- app.py +7 -4
- requirements.txt +0 -2
- speaker/speaker_identification.py +79 -5
- transcription/transcription.py +5 -0
app.py
CHANGED
|
@@ -11,6 +11,7 @@ from segmentation import reorganize_transcription_c_unit
|
|
| 11 |
from annotation import annotate_maze_for_mazewhisper
|
| 12 |
from morpheme import stanza_v1
|
| 13 |
from morpheme import annotate_morpheme
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
|
|
@@ -43,13 +44,17 @@ def process_audio():
|
|
| 43 |
result, session_id = translate_audio_file(model="mazeWhisper", audio_path = audio_path, device=device, original_filename=filename)
|
| 44 |
|
| 45 |
cunit_count, ignored_count = reorganize_transcription_c_unit(session_id, segment_batchalign)
|
| 46 |
-
print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
annotate_maze_for_mazewhisper(session_id)
|
| 49 |
|
|
|
|
| 50 |
annotate_morpheme(session_id = session_id, morpheme_function = stanza_v1)
|
| 51 |
|
| 52 |
-
|
| 53 |
# annotate_pauses(session_id, pause_threshold)
|
| 54 |
# annotate_repetitions(session_id)
|
| 55 |
# # annotate_syllables(session_id)
|
|
@@ -58,8 +63,6 @@ def process_audio():
|
|
| 58 |
# annotate_morpheme(session_id)
|
| 59 |
# annotate_morpheme_omission(session_id)
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
json_path = f"session_data/{session_id}/transcription_cunit.json"
|
| 64 |
if not os.path.isfile(json_path):
|
| 65 |
return jsonify({'error': f"Annotation file {json_path} not found"}), 500
|
|
|
|
| 11 |
from annotation import annotate_maze_for_mazewhisper
|
| 12 |
from morpheme import stanza_v1
|
| 13 |
from morpheme import annotate_morpheme
|
| 14 |
+
from speaker import assign_speaker
|
| 15 |
|
| 16 |
|
| 17 |
|
|
|
|
| 44 |
result, session_id = translate_audio_file(model="mazeWhisper", audio_path = audio_path, device=device, original_filename=filename)
|
| 45 |
|
| 46 |
cunit_count, ignored_count = reorganize_transcription_c_unit(session_id, segment_batchalign)
|
| 47 |
+
# print(f"Created {cunit_count} C-units, ignored {ignored_count} boundaries")
|
| 48 |
|
| 49 |
+
print("Processing speaker identification ... ...")
|
| 50 |
+
assign_speaker(session_id = session_id)
|
| 51 |
+
|
| 52 |
+
print("Processing maze detection ... ...")
|
| 53 |
annotate_maze_for_mazewhisper(session_id)
|
| 54 |
|
| 55 |
+
print("Processing morpheme detection ... ...")
|
| 56 |
annotate_morpheme(session_id = session_id, morpheme_function = stanza_v1)
|
| 57 |
|
|
|
|
| 58 |
# annotate_pauses(session_id, pause_threshold)
|
| 59 |
# annotate_repetitions(session_id)
|
| 60 |
# # annotate_syllables(session_id)
|
|
|
|
| 63 |
# annotate_morpheme(session_id)
|
| 64 |
# annotate_morpheme_omission(session_id)
|
| 65 |
|
|
|
|
|
|
|
| 66 |
json_path = f"session_data/{session_id}/transcription_cunit.json"
|
| 67 |
if not os.path.isfile(json_path):
|
| 68 |
return jsonify({'error': f"Annotation file {json_path} not found"}), 500
|
requirements.txt
CHANGED
|
@@ -21,5 +21,3 @@ matplotlib>=3.3.0
|
|
| 21 |
seaborn>=0.11.0
|
| 22 |
|
| 23 |
# install ffmpeg
|
| 24 |
-
librosa>=0.8.0
|
| 25 |
-
transformers>=4.0.0
|
|
|
|
| 21 |
seaborn>=0.11.0
|
| 22 |
|
| 23 |
# install ffmpeg
|
|
|
|
|
|
speaker/speaker_identification.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
from typing import List, Union, Optional
|
| 2 |
import os
|
| 3 |
-
|
| 4 |
import numpy as np
|
| 5 |
import librosa
|
| 6 |
from transformers import pipeline
|
|
@@ -21,6 +21,7 @@ def get_predictor():
|
|
| 21 |
if _PREDICTOR_INSTANCE is None:
|
| 22 |
_PREDICTOR_INSTANCE = Predictor()
|
| 23 |
return _PREDICTOR_INSTANCE
|
|
|
|
| 24 |
class Predictor:
|
| 25 |
def __init__(self, model_path: Optional[str] = None):
|
| 26 |
"""
|
|
@@ -100,7 +101,7 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L
|
|
| 100 |
|
| 101 |
Returns:
|
| 102 |
List[str]: List of speaker IDs corresponding to each audio segment.
|
| 103 |
-
"
|
| 104 |
"""
|
| 105 |
if not audio_list:
|
| 106 |
return []
|
|
@@ -111,12 +112,85 @@ def assign_speaker_for_audio_list(audio_list: List[Union[str, np.ndarray]]) -> L
|
|
| 111 |
# Get list of 0 (child) or 1 (adult)
|
| 112 |
numeric_labels = predictor.predict(audio_list)
|
| 113 |
|
| 114 |
-
# Map to
|
| 115 |
-
speaker_ids = [
|
| 116 |
return speaker_ids
|
| 117 |
|
| 118 |
|
| 119 |
# you don't have to implement this function
|
| 120 |
def assign_speaker(session_id: str):
|
| 121 |
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import List, Union, Optional
|
| 2 |
import os
|
| 3 |
+
import json
|
| 4 |
import numpy as np
|
| 5 |
import librosa
|
| 6 |
from transformers import pipeline
|
|
|
|
| 21 |
if _PREDICTOR_INSTANCE is None:
|
| 22 |
_PREDICTOR_INSTANCE = Predictor()
|
| 23 |
return _PREDICTOR_INSTANCE
|
| 24 |
+
|
| 25 |
class Predictor:
|
| 26 |
def __init__(self, model_path: Optional[str] = None):
|
| 27 |
"""
|
|
|
|
| 101 |
|
| 102 |
Returns:
|
| 103 |
List[str]: List of speaker IDs corresponding to each audio segment.
|
| 104 |
+
"Child" for child, "Examiner" for adult.
|
| 105 |
"""
|
| 106 |
if not audio_list:
|
| 107 |
return []
|
|
|
|
| 112 |
# Get list of 0 (child) or 1 (adult)
|
| 113 |
numeric_labels = predictor.predict(audio_list)
|
| 114 |
|
| 115 |
+
# Map to Child and Examiner, preserving order
|
| 116 |
+
speaker_ids = ["Child" if label == 0 else "Examiner" if label == 1 else "Unknown" for label in numeric_labels]
|
| 117 |
return speaker_ids
|
| 118 |
|
| 119 |
|
| 120 |
# Called by app.py after C-unit segmentation: labels every segment of a session with a speaker.
|
| 121 |
def assign_speaker(session_id: str):
    """Label every segment of a session's C-unit transcription with a speaker.

    Reads ``session_data/<session_id>/transcription_cunit.json``, cuts
    ``session_data/<session_id>/audio.wav`` into one snippet per segment using
    the segment's ``start``/``end`` values (seconds), classifies the snippets
    with ``assign_speaker_for_audio_list`` and stores the label under each
    segment's ``"speaker"`` key. The JSON file is then rewritten in place.

    Segments are labelled ``"Unknown"`` without being sent to the model when
    their timestamps are missing, non-numeric (including booleans), NaN,
    empty/reversed, or lie entirely outside the loaded audio.

    Args:
        session_id: Name of the session directory under ``session_data``.

    Returns:
        None. When the JSON has no segments the file is left untouched and the
        audio is never loaded.
    """
    base_dir = os.path.join("session_data", session_id)
    json_path = os.path.join(base_dir, "transcription_cunit.json")
    wav_path = os.path.join(base_dir, "audio.wav")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    segments = data.get("segments", [])

    if not segments:
        return

    # Load as mono at DEFAULT_SAMPLE_RATE so that second offsets map to sample
    # indices through a single constant; the returned rate is not needed.
    audio, _ = librosa.load(wav_path, sr=DEFAULT_SAMPLE_RATE, mono=True)
    n_samples = len(audio)
    dur_sec = n_samples / float(DEFAULT_SAMPLE_RATE)

    model_inputs: List[np.ndarray] = []
    model_indices: List[int] = []

    for i, seg in enumerate(segments):
        start = seg.get("start")
        end = seg.get("end")

        # bool is a subclass of int, so it must be rejected explicitly;
        # a JSON true/false is not a timestamp.
        if isinstance(start, bool) or isinstance(end, bool):
            continue
        if not isinstance(start, (int, float)) or not isinstance(end, (int, float)):
            continue
        # NaN is the only value unequal to itself. Every ordering comparison
        # against NaN is False, so without this guard a NaN timestamp would
        # slip through the range checks and make int(round(...)) raise.
        if start != start or end != end:
            continue
        if end <= start or start >= dur_sec:
            continue

        # Clip the segment to the audio that actually exists.
        s = max(0.0, float(start))
        e = min(float(end), dur_sec)
        if e <= s:
            continue

        s_idx = max(0, min(int(round(s * DEFAULT_SAMPLE_RATE)), n_samples))
        e_idx = max(0, min(int(round(e * DEFAULT_SAMPLE_RATE)), n_samples))
        # Rounding can collapse a very short segment to zero samples.
        if e_idx <= s_idx:
            continue

        model_inputs.append(audio[s_idx:e_idx])
        model_indices.append(i)

    # Everything starts as "Unknown"; only segments that produced a usable
    # snippet are overwritten with the model's prediction.
    speakers = ["Unknown"] * len(segments)
    if model_inputs:
        predicted = assign_speaker_for_audio_list(model_inputs)  # ["Child"/"Examiner"/"Unknown"]
        for seg_idx, spk in zip(model_indices, predicted):
            speakers[seg_idx] = spk

    for seg, spk in zip(segments, speakers):
        seg["speaker"] = spk

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
|
transcription/transcription.py
CHANGED
|
@@ -298,6 +298,11 @@ def translate_audio_file(model: str = "mazeWhisper", audio_path: str = "", devic
|
|
| 298 |
|
| 299 |
audio = load_audio(audio_path)
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
print("Starting transcription...")
|
| 302 |
result = pipeline.transcribe(audio_path, verbose=True)
|
| 303 |
|
|
|
|
| 298 |
|
| 299 |
audio = load_audio(audio_path)
|
| 300 |
|
| 301 |
+
# Save the entire audio as audio.wav in the session directory
|
| 302 |
+
audio_output_path = session_dir / "audio.wav"
|
| 303 |
+
sf.write(audio_output_path, audio, SAMPLE_RATE)
|
| 304 |
+
print(f"Audio saved: {audio_output_path}")
|
| 305 |
+
|
| 306 |
print("Starting transcription...")
|
| 307 |
result = pipeline.transcribe(audio_path, verbose=True)
|
| 308 |
|