Spaces:

subhannadeem1
/

Audio_to_text_classification

Runtime error

App Files Files Community

subhannadeem1 committed on Nov 19, 2023

Commit

d26e658

1 Parent(s): a0f51ae

Create app.py

Browse files

Files changed (1) hide show

app.py +107 -0

app.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import os
+os.system("pip install git+https://github.com/openai/whisper.git")
+import gradio as gr
+import whisper
+from huggingface_hub import from_pretrained_keras
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import pipeline
+from sklearn.preprocessing import StandardScaler
+import logging
+import librosa
+import numpy as np
+import pickle
+#call tokenizer and NLP model for text classification
+tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+# call whisper model for audio/speech processing
+model = whisper.load_model("small")
+# call model for audio emotions
+reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
+# call scaler and decoder
+with open("scaler.pkl", "rb") as f:
+    scaler = pickle.load(f)
+with open("encoder.pkl", "rb") as f:
+    encoder = pickle.load(f)
+def inference_audio(audio):
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    _, probs = model.detect_language(mel)
+    options = whisper.DecodingOptions(fp16 = False)
+    result = whisper.decode(model, mel, options)
+    return result.text
+def inference_text(audio):
+    text =inference_audio(audio)
+    sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
+    res=sentiment_task(text)[0]
+    return text,res['label'],res['score']
+def extract_features(data):
+    # ZCR
+    result = np.array([])
+    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+    result=np.hstack((result, zcr)) # stacking horizontally
+    # Chroma_stft
+    stft = np.abs(librosa.stft(data))
+    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, chroma_stft)) # stacking horizontally
+    # MFCC
+    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mfcc)) # stacking horizontally
+    # Root Mean Square Value
+    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+    result = np.hstack((result, rms)) # stacking horizontally
+    # MelSpectogram
+    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mel)) # stacking horizontally
+    return result
+"""
+def audio_emotions(audio):
+    sr,data = audio
+    features_audio = extract_features(data)
+    features_audio = np.array(features_audio)
+    scaled_features=scaler.transform(features_audio)
+    scaled_features = np.expand_dims(scaled_features, axis=2)
+    prediction=reloaded_model.predict(scaled_features)
+    y_pred = encoder.inverse_transform(prediction)
+    return y_pred
+"""
+def main(audio):
+    r1,r2,r3=inference_text(audio)
+    #r3=audio_emotions(audio)
+    return r1,r2,r3
+audio = gr.Audio(
+                    label="Input Audio",
+                    show_label=False,
+                    source="microphone",
+                    type="filepath"
+                )
+app=gr.Interface(title="Sentiment Audio Analysis",fn=main,inputs=audio, outputs=["text","text","text"]).launch(debug = True)