diff --git "a/exploration/speech_recognition.ipynb" "b/exploration/speech_recognition.ipynb" --- "a/exploration/speech_recognition.ipynb" +++ "b/exploration/speech_recognition.ipynb" @@ -1,3 +1,1318 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1123c2d76f235f9458ced0a7b8ba516a40cdc28406daa8d0ce7fa8d7bfa685ed -size 105242 +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1a35bf05", + "metadata": {}, + "source": [ + "# Exploration for Speech Recognition" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b97abbe3", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade pip\n", + "%pip install devtools\n", + "%pip install torch transformers smolagents openai" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a8eaca42", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "from pathlib import Path\n", + "import sys\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent))\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "id": "be97b71d", + "metadata": {}, + "source": [ + "We are going to use `openai/whisper-large-v3-turbo` model from HuggingFace Hub for speech recognition.\n", + "\n", + "First, we are going to create valid configuration.\n", + "\n", + "Be sure `ffmpeg` is installed in the system (e.g. with `brew install fffmpeg`) or with `pip install static-ffmpeg`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "805efb89", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Device set to use cpu\n", + "/Users/kublytsk/Projects/huggingface_agents/Final_Assignment_Agent/.venv_speech_recognition/lib/python3.13/site-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n", + " warnings.warn(\n", + "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text:\n", + " Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm. 
Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134\n", + "Chunks:\n", + "[\n", + " {\n", + " 'timestamp': (\n", + " 0.0,\n", + " 3.62,\n", + " ),\n", + " 'text': ' Before you all go, I want to remind you that the midterm is next week.',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 4.2,\n", + " 4.98,\n", + " ),\n", + " 'text': \" Here's a little hint.\",\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 5.48,\n", + " 9.0,\n", + " ),\n", + " 'text': ' You should be familiar with the differential equations on page 245.',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 9.64,\n", + " 14.98,\n", + " ),\n", + " 'text': ' Problems that are very similar to problems 32, 33 and 44 from that page might be on the test.',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 15.24,\n", + " 20.38,\n", + " ),\n", + " 'text': ' And also some of you might want to brush up on the last page in the integration section, page 197.',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 21.3,\n", + " 23.28,\n", + " ),\n", + " 'text': \" I know some of you struggled on last week's quiz.\",\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 23.82,\n", + " 27.52,\n", + " ),\n", + " 'text': ' I foresee problem 22 from page 197 being on your midterm.',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 28.18,\n", + " 0.0,\n", + " ),\n", + " 'text': '',\n", + " },\n", + " {\n", + " 'timestamp': (\n", + " 3.18,\n", + " 7.28,\n", + " ),\n", + " 'text': \" Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134\",\n", + " },\n", + "]\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n", + "from devtools import pprint\n", + "\n", + "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", + "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", + "\n", + "model_id = \"openai/whisper-large-v3-turbo\"\n", + "model = AutoModelForSpeechSeq2Seq.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch_dtype,\n", + " low_cpu_mem_usage=True,\n", + " use_safetensors=True,\n", + ")\n", + "model.to(device)\n", + "processor = AutoProcessor.from_pretrained(model_id)\n", + "\n", + "pipe = pipeline(\n", + " \"automatic-speech-recognition\",\n", + " model=model,\n", + " tokenizer=processor.tokenizer,\n", + " feature_extractor=processor.feature_extractor,\n", + " torch_dtype=torch_dtype,\n", + " device=device,\n", + " return_timestamps=True,\n", + ")\n", + "\n", + "result = pipe(\n", + " \"data/tasks/1f975693-876d-457b-a649-393859e79bf3/1f975693-876d-457b-a649-393859e79bf3.mp3\"\n", + ")\n", + "\n", + "print(f\"Text:\\n{result['text']}\")\n", + "print(\"Chunks:\")\n", + "pprint(result[\"chunks\"])" + ] + }, + { + "cell_type": "markdown", + "id": "9aa5d5d2", + "metadata": {}, + "source": [ + "Working with URL directly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b3b6e02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kublytsk/Projects/huggingface_agents/Final_Assignment_Agent/.venv_speech_recognition/lib/python3.13/site-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. 
Please make sure to use `input_features` instead.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Text:\n",
+      " Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134\n"
+     ]
+    }
+   ],
+   "source": [
+    "result = pipe(\n",
+    "    \"https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3\"\n",
+    ")\n",
+    "print(f\"Text:\\n{result['text']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2538b5b",
+   "metadata": {},
+   "source": [
+    "We are going to wrap this code as a smolagents tool. Since chunk timestamps are relative to their chunk, we fix the chunk length and recalculate absolute timestamps so they can be matched with other data (e.g. video frames). To clean up the output, we also suppress undesired messages and warnings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "8469e251",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from smolagents import Tool\n",
+    "import torch\n",
+    "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging\n",
+    "import warnings\n",
+    "\n",
+    "\n",
+    "class SpeechRecognitionTool(Tool):\n",
+    "    name = \"speech_to_text\"\n",
+    "    description = \"\"\"Transcribes speech from audio.\"\"\"\n",
+    "\n",
+    "    inputs = {\n",
+    "        \"audio\": {\n",
+    "            \"type\": \"string\",\n",
+    "            \"description\": \"Path to the audio file to transcribe.\",\n",
+    "        },\n",
+    "        \"with_time_markers\": {\n",
+    "            \"type\": \"boolean\",\n",
+    "            \"description\": \"Whether to include timestamps in the transcription output. 
Each timestamp appears on its own line in the format [float, float], indicating the number of seconds elapsed from the start of the audio.\",\n", + " \"nullable\": True,\n", + " \"default\": False,\n", + " },\n", + " }\n", + " output_type = \"string\"\n", + "\n", + " chunk_length_s = 30\n", + "\n", + " def __new__(cls, *args, **kwargs):\n", + " device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", + " torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", + "\n", + " model_id = \"openai/whisper-large-v3-turbo\"\n", + " model = AutoModelForSpeechSeq2Seq.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch_dtype,\n", + " low_cpu_mem_usage=True,\n", + " use_safetensors=True,\n", + " )\n", + " model.to(device)\n", + " processor = AutoProcessor.from_pretrained(model_id)\n", + "\n", + " logging.set_verbosity_error()\n", + " warnings.filterwarnings(\n", + " \"ignore\",\n", + " category=FutureWarning,\n", + " message=r\".*The input name `inputs` is deprecated.*\",\n", + " )\n", + " cls.pipe = pipeline(\n", + " \"automatic-speech-recognition\",\n", + " model=model,\n", + " tokenizer=processor.tokenizer,\n", + " feature_extractor=processor.feature_extractor,\n", + " torch_dtype=torch_dtype,\n", + " device=device,\n", + " chunk_length_s=cls.chunk_length_s,\n", + " return_timestamps=True,\n", + " )\n", + "\n", + " return super().__new__(cls, *args, **kwargs)\n", + "\n", + " def forward(self, audio: str, with_time_markers: bool = False) -> str:\n", + " \"\"\"\n", + " Transcribes speech from audio.\n", + "\n", + " Args:\n", + " audio (str): Path to the audio file to transcribe.\n", + " with_time_markers (bool): Whether to include timestamps in the transcription output. Each timestamp appears on its own line in the format [float], indicating the number of seconds elapsed from the start of the audio.\n", + "\n", + " Returns:\n", + " str: The transcribed text.\n", + " \"\"\"\n", + " result = self.pipe(audio)\n", + " if not with_time_markers:\n", + " return result[\"text\"].strip()\n", + "\n", + " txt = \"\"\n", + " chunk_length_s = self.chunk_length_s\n", + " absolute_offset = 0.0\n", + " chunk_offset = 0.0\n", + " for chunk in result[\"chunks\"]:\n", + " timestamp_start = chunk[\"timestamp\"][0]\n", + " timestamp_end = chunk[\"timestamp\"][1]\n", + " if timestamp_start < chunk_offset:\n", + " absolute_offset += chunk_length_s\n", + " chunk_offset = timestamp_start\n", + " absolute_start = absolute_offset + timestamp_start\n", + "\n", + " if timestamp_end < timestamp_start:\n", + " absolute_offset += chunk_length_s\n", + " absolute_end = absolute_offset + timestamp_end\n", + " chunk_offset = timestamp_end\n", + "\n", + " chunk_text = chunk[\"text\"].strip()\n", + " if chunk_text:\n", + " txt += f\"[{absolute_start:.2f}]\\n{chunk_text}\\n[{absolute_end:.2f}]\\n\"\n", + " return txt.strip()\n", + "\n", + "\n", + "speech_to_text = SpeechRecognitionTool()" + ] + }, + { + "cell_type": "markdown", + "id": "e0801c20", + "metadata": {}, + "source": [ + "Verify tool implementation:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "fa663b9f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.00]\n", + "Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. 
And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm.\n",
+      "[27.52]\n",
+      "[28.20]\n",
+      "Oh, and don't forget to brush up on the section on related rates\n",
+      "[30.70]\n",
+      "[30.70]\n",
+      "on pages 132, 133 and 134.\n",
+      "[34.78]\n"
+     ]
+    }
+   ],
+   "source": [
+    "transcription = speech_to_text(\n",
+    "    audio=\"data/tasks/1f975693-876d-457b-a649-393859e79bf3/1f975693-876d-457b-a649-393859e79bf3.mp3\",\n",
+    "    with_time_markers=True,\n",
+    ")\n",
+    "\n",
+    "print(transcription)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b4fe3d85",
+   "metadata": {},
+   "source": [
+    "Now it's time to verify that the agent can use our tools in the GAIA challenge environment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c62eb0f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">
╭──────────────────────────────────────────────────── New run ────────────────────────────────────────────────────╮\n",
+       "                                                                                                                 \n",
+       " Transcribe attached audio                                                                                       \n",
+       "                                                                                                                 \n",
+       "╰─ OpenAIServerModel - gpt-4.1 ───────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m╭─\u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[1;38;2;212;183;2mNew run\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╮\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mTranscribe attached audio\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m╰─\u001b[0m\u001b[38;2;212;183;2m OpenAIServerModel - gpt-4.1 \u001b[0m\u001b[38;2;212;183;2m──────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m1\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'}                                                   │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3\n",
+       "
\n" + ], + "text/plain": [ + "Observations: \u001b[4;94mhttps://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 1: Duration 1.58 seconds| Input tokens: 1,238 | Output tokens: 15]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 1: Duration 1.58 seconds| Input tokens: 1,238 | Output tokens: 15]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m2\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'speech_to_text' with arguments: {'audio':                                                        │\n",
+       "│ 'https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3'}                      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'speech_to_text' with arguments: {'audio': │\n", + "│ 'https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \n",
+       "should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 \n",
+       "and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the \n",
+       "integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197\n",
+       "being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134.\n",
+       "
\n" + ], + "text/plain": [ + "Observations: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \n", + "should be familiar with the differential equations on page \u001b[1;36m245\u001b[0m. Problems that are very similar to problems \u001b[1;36m32\u001b[0m, \u001b[1;36m33\u001b[0m \n", + "and \u001b[1;36m44\u001b[0m from that page might be on the test. And also some of you might want to brush up on the last page in the \n", + "integration section, page \u001b[1;36m197\u001b[0m. I know some of you struggled on last week's quiz. I foresee problem \u001b[1;36m22\u001b[0m from page \u001b[1;36m197\u001b[0m\n", + "being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages \u001b[1;36m132\u001b[0m, \u001b[1;36m133\u001b[0m and \u001b[1;36m134\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 2: Duration 18.06 seconds| Input tokens: 2,590 | Output tokens: 63]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 2: Duration 18.06 seconds| Input tokens: 2,590 | Output tokens: 63]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m3\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'final_answer' with arguments: {'answer': \"Before you all go, I want to remind you that the       │\n",
+       "│ midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. │\n",
+       "│ Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of  │\n",
+       "│ you might want to brush up on the last page in the integration section, page 197. I know some of you struggled  │\n",
+       "│ on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up │\n",
+       "│ on the section on related rates on pages 132, 133 and 134.\"}                                                    │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'final_answer' with arguments: {'answer': \"Before you all go, I want to remind you that the │\n", + "│ midterm is next week. Here's a little hint. You should be familiar with the differential equations on page 245. │\n", + "│ Problems that are very similar to problems 32, 33 and 44 from that page might be on the test. And also some of │\n", + "│ you might want to brush up on the last page in the integration section, page 197. I know some of you struggled │\n", + "│ on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up │\n", + "│ on the section on related rates on pages 132, 133 and 134.\"} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final answer: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \n",
+       "should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 \n",
+       "and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the \n",
+       "integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197\n",
+       "being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;212;183;2mFinal answer: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \u001b[0m\n", + "\u001b[1;38;2;212;183;2mshould be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 \u001b[0m\n", + "\u001b[1;38;2;212;183;2mand 44 from that page might be on the test. And also some of you might want to brush up on the last page in the \u001b[0m\n", + "\u001b[1;38;2;212;183;2mintegration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197\u001b[0m\n", + "\u001b[1;38;2;212;183;2mbeing on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134.\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 3: Duration 1.55 seconds| Input tokens: 4,219 | Output tokens: 211]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 3: Duration 1.55 seconds| Input tokens: 4,219 | Output tokens: 211]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭──────────────────────────────────────────────────── New run ────────────────────────────────────────────────────╮\n",
+       "                                                                                                                 \n",
+       " Transcribe attached audio                                                                                       \n",
+       "                                                                                                                 \n",
+       "╰─ OpenAIServerModel - gpt-4.1 ───────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m╭─\u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[1;38;2;212;183;2mNew run\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╮\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mTranscribe attached audio\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m╰─\u001b[0m\u001b[38;2;212;183;2m OpenAIServerModel - gpt-4.1 \u001b[0m\u001b[38;2;212;183;2m──────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m1\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'}                                                   │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\n",
+       "
\n" + ], + "text/plain": [ + "Observations: \u001b[4;94mhttps://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 1: Duration 0.61 seconds| Input tokens: 1,238 | Output tokens: 15]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 1: Duration 0.61 seconds| Input tokens: 1,238 | Output tokens: 15]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m2\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'speech_to_text' with arguments: {'audio':                                                        │\n",
+       "│ 'https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3'}                      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'speech_to_text' with arguments: {'audio': │\n", + "│ 'https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \n",
+       "cornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \n",
+       "Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \n",
+       "it as a delicious and fruity filling for your pie crust.\n",
+       "
\n" + ], + "text/plain": [ + "Observations: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \n", + "cornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \n", + "Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \n", + "it as a delicious and fruity filling for your pie crust.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 2: Duration 7.78 seconds| Input tokens: 2,595 | Output tokens: 71]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 2: Duration 7.78 seconds| Input tokens: 2,595 | Output tokens: 71]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m3\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'final_answer' with arguments: {'answer': 'In a saucepan, combine ripe strawberries, granulated   │\n",
+       "│ sugar, freshly squeezed lemon juice and cornstarch. Cook the mixture over medium heat, stirring constantly      │\n",
+       "│ until it thickens to a smooth consistency. Remove from heat and stir in a dash of pure vanilla extract. Allow   │\n",
+       "│ the strawberry pie filling to cool before using it as a delicious and fruity filling for your pie crust.'}      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'final_answer' with arguments: {'answer': 'In a saucepan, combine ripe strawberries, granulated │\n", + "│ sugar, freshly squeezed lemon juice and cornstarch. Cook the mixture over medium heat, stirring constantly │\n", + "│ until it thickens to a smooth consistency. Remove from heat and stir in a dash of pure vanilla extract. Allow │\n", + "│ the strawberry pie filling to cool before using it as a delicious and fruity filling for your pie crust.'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final answer: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \n",
+       "cornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \n",
+       "Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \n",
+       "it as a delicious and fruity filling for your pie crust.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;212;183;2mFinal answer: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \u001b[0m\n", + "\u001b[1;38;2;212;183;2mcornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \u001b[0m\n", + "\u001b[1;38;2;212;183;2mRemove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \u001b[0m\n", + "\u001b[1;38;2;212;183;2mit as a delicious and fruity filling for your pie crust.\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 3: Duration 1.34 seconds| Input tokens: 4,184 | Output tokens: 158]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 3: Duration 1.34 seconds| Input tokens: 4,184 | Output tokens: 158]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from smolagents import ToolCallingAgent, OpenAIServerModel\n", + "from tools import GetAttachmentTool\n", + "\n", + "get_attachment = GetAttachmentTool()\n", + "model = OpenAIServerModel(model_id=\"gpt-4.1\")\n", + "agent = ToolCallingAgent(\n", + " model=model,\n", + " tools=[get_attachment, speech_to_text],\n", + ")\n", + "\n", + "for task_with_audio_attachment in [\n", + " \"1f975693-876d-457b-a649-393859e79bf3\",\n", + " \"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\",\n", + "]:\n", + " get_attachment.attachment_for(task_with_audio_attachment)\n", + " agent.run(\"Transcribe attached audio\")" + ] + }, + { + "cell_type": "markdown", + "id": "1c56fe71", + "metadata": {}, + "source": [ + "And now let's verify full GAIA task execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575ce70a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭──────────────────────────────────────────────────── New run ────────────────────────────────────────────────────╮\n",
+       "                                                                                                                 \n",
+       " Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus  \n",
+       " mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the     \n",
+       " recommended reading for the test, but my headphones are broken :(                                               \n",
+       " Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've      \n",
+       " attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a           \n",
+       " comma-delimited list. And please provide the list in ascending order.                                           \n",
+       "                                                                                                                 \n",
+       "╰─ OpenAIServerModel - gpt-4.1 ───────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m╭─\u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[1;38;2;212;183;2mNew run\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╮\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mHi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mmid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mrecommended reading for the test, but my headphones are broken :(\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mattached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mcomma-delimited list. And please provide the list in ascending order.\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m╰─\u001b[0m\u001b[38;2;212;183;2m OpenAIServerModel - gpt-4.1 \u001b[0m\u001b[38;2;212;183;2m──────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m1\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━���━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'}                                                   │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3\n",
+       "
\n" + ], + "text/plain": [ + "Observations: \u001b[4;94mhttps://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 1: Duration 0.83 seconds| Input tokens: 1,351 | Output tokens: 15]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 1: Duration 0.83 seconds| Input tokens: 1,351 | Output tokens: 15]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m2\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'speech_to_text' with arguments: {'audio':                                                        │\n",
+       "│ 'https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3'}                      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'speech_to_text' with arguments: {'audio': │\n", + "│ 'https://agents-course-unit4-scoring.hf.space/files/1f975693-876d-457b-a649-393859e79bf3'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \n",
+       "should be familiar with the differential equations on page 245. Problems that are very similar to problems 32, 33 \n",
+       "and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the \n",
+       "integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197\n",
+       "being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages 132, 133 and 134.\n",
+       "
\n" + ], + "text/plain": [ + "Observations: Before you all go, I want to remind you that the midterm is next week. Here's a little hint. You \n", + "should be familiar with the differential equations on page \u001b[1;36m245\u001b[0m. Problems that are very similar to problems \u001b[1;36m32\u001b[0m, \u001b[1;36m33\u001b[0m \n", + "and \u001b[1;36m44\u001b[0m from that page might be on the test. And also some of you might want to brush up on the last page in the \n", + "integration section, page \u001b[1;36m197\u001b[0m. I know some of you struggled on last week's quiz. I foresee problem \u001b[1;36m22\u001b[0m from page \u001b[1;36m197\u001b[0m\n", + "being on your midterm. Oh, and don't forget to brush up on the section on related rates on pages \u001b[1;36m132\u001b[0m, \u001b[1;36m133\u001b[0m and \u001b[1;36m134\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 2: Duration 17.90 seconds| Input tokens: 2,812 | Output tokens: 63]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 2: Duration 17.90 seconds| Input tokens: 2,812 | Output tokens: 63]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m3\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'final_answer' with arguments: {'answer': '132,133,134,197,245'}                                  │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'final_answer' with arguments: {'answer': '132,133,134,197,245'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final answer: 132,133,134,197,245\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;212;183;2mFinal answer: 132,133,134,197,245\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 3: Duration 0.84 seconds| Input tokens: 4,549 | Output tokens: 86]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 3: Duration 0.84 seconds| Input tokens: 4,549 | Output tokens: 86]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭──────────────────────────────────────────────────── New run ────────────────────────────────────────────────────╮\n",
+       "                                                                                                                 \n",
+       " Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust,   \n",
+       " but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and  \n",
+       " the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the  \n",
+       " recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling,   \n",
+       " as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.      \n",
+       " In your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch   \n",
+       " of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe               \n",
+       " strawberries\".                                                                                                  \n",
+       " Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients. \n",
+       "                                                                                                                 \n",
+       "╰─ OpenAIServerModel - gpt-4.1 ───────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m╭─\u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[1;38;2;212;183;2mNew run\u001b[0m\u001b[38;2;212;183;2m \u001b[0m\u001b[38;2;212;183;2m───────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╮\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mHi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mbut I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mthe speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mrecipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mas I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mof salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe \u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mstrawberries\".\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[1mPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m│\u001b[0m \u001b[38;2;212;183;2m│\u001b[0m\n", + "\u001b[38;2;212;183;2m╰─\u001b[0m\u001b[38;2;212;183;2m OpenAIServerModel - gpt-4.1 \u001b[0m\u001b[38;2;212;183;2m──────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[38;2;212;183;2m─╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 1 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m1\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'}                                                   │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'get_attachment' with arguments: {'fmt': 'URL'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\n",
+       "
\n" + ], + "text/plain": [ + "Observations: \u001b[4;94mhttps://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 1: Duration 0.84 seconds| Input tokens: 1,424 | Output tokens: 15]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 1: Duration 0.84 seconds| Input tokens: 1,424 | Output tokens: 15]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 2 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m2\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'speech_to_text' with arguments: {'audio':                                                        │\n",
+       "│ 'https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3'}                      │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'speech_to_text' with arguments: {'audio': │\n", + "│ 'https://agents-course-unit4-scoring.hf.space/files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Observations: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \n",
+       "cornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \n",
+       "Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \n",
+       "it as a delicious and fruity filling for your pie crust.\n",
+       "
\n" + ], + "text/plain": [ + "Observations: In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice and \n", + "cornstarch. Cook the mixture over medium heat, stirring constantly until it thickens to a smooth consistency. \n", + "Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using \n", + "it as a delicious and fruity filling for your pie crust.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 2: Duration 8.32 seconds| Input tokens: 2,966 | Output tokens: 71]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 2: Duration 8.32 seconds| Input tokens: 2,966 | Output tokens: 71]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ Step 3 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;212;183;2m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ \u001b[0m\u001b[1mStep \u001b[0m\u001b[1;36m3\u001b[0m\u001b[38;2;212;183;2m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n",
+       "│ Calling tool: 'final_answer' with arguments: {'answer': 'cornstarch, granulated sugar, freshly squeezed lemon   │\n",
+       "│ juice, pure vanilla extract, ripe strawberries'}                                                                │\n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "╭─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮\n", + "│ Calling tool: 'final_answer' with arguments: {'answer': 'cornstarch, granulated sugar, freshly squeezed lemon │\n", + "│ juice, pure vanilla extract, ripe strawberries'} │\n", + "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final answer: cornstarch, granulated sugar, freshly squeezed lemon juice, pure vanilla extract, ripe strawberries\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;212;183;2mFinal answer: cornstarch, granulated sugar, freshly squeezed lemon juice, pure vanilla extract, ripe strawberries\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[Step 3: Duration 1.62 seconds| Input tokens: 4,739 | Output tokens: 104]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2m[Step 3: Duration 1.62 seconds| Input tokens: 4,739 | Output tokens: 104]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for task_with_audio_attachment, question in {\n", + " \"1f975693-876d-457b-a649-393859e79bf3\": \"\"\"\\\n", + "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n", + "Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.\n", + "\"\"\",\n", + " \"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3\": \"\"\"\\\n", + "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n", + "In your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n", + "Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.\n", + "\"\"\",\n", + "}.items():\n", + " get_attachment.attachment_for(task_with_audio_attachment)\n", + " agent.run(question)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_speech_recognition (3.13.3)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}