Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,13 +10,14 @@ from base64 import b64encode
|
|
| 10 |
from speech_recognition import AudioFile, Recognizer
|
| 11 |
import numpy as np
|
| 12 |
from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
|
|
|
|
| 13 |
|
| 14 |
model = YOLO('ultralyticsplus/yolov8s')
|
| 15 |
CLASS = model.model.names
|
| 16 |
defaul_bot_voice = "おはようございます"
|
| 17 |
area_thres = 0.3
|
| 18 |
|
| 19 |
-
def infer(image):
|
| 20 |
results = model.predict(image, show=False)[0]
|
| 21 |
masks, boxes = results.masks, results.boxes
|
| 22 |
area_image = image.width * image.height
|
|
@@ -33,6 +34,10 @@ def infer(image):
|
|
| 33 |
if area_rate >= most_close:
|
| 34 |
out_img = image.crop(tuple(box)).resize((128, 128))
|
| 35 |
most_close = area_rate
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
print(most_close, diff_value)
|
| 37 |
if most_close >= area_thres and diff_value >= 0.5:
|
| 38 |
voice_bot = tts(defaul_bot_voice, language="ja")
|
|
@@ -41,7 +46,7 @@ def infer(image):
|
|
| 41 |
iface = gr.Interface(
|
| 42 |
fn=infer,
|
| 43 |
title="aisatsu api",
|
| 44 |
-
inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
|
| 45 |
outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
|
| 46 |
article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
|
| 47 |
).launch(enable_queue=True, debug=True)
|
|
|
|
| 10 |
from speech_recognition import AudioFile, Recognizer
|
| 11 |
import numpy as np
|
| 12 |
from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
|
| 13 |
+
from scipy.spatial import distance as dist
|
| 14 |
|
| 15 |
model = YOLO('ultralyticsplus/yolov8s')
|
| 16 |
CLASS = model.model.names
|
| 17 |
defaul_bot_voice = "おはようございます"
|
| 18 |
area_thres = 0.3
|
| 19 |
|
| 20 |
+
def infer(image, last_seen):
|
| 21 |
results = model.predict(image, show=False)[0]
|
| 22 |
masks, boxes = results.masks, results.boxes
|
| 23 |
area_image = image.width * image.height
|
|
|
|
| 34 |
if area_rate >= most_close:
|
| 35 |
out_img = image.crop(tuple(box)).resize((128, 128))
|
| 36 |
most_close = area_rate
|
| 37 |
+
if last_seen != "":
|
| 38 |
+
last_seen = base64_to_pil(last_seen)
|
| 39 |
+
if out_img is not None:
|
| 40 |
+
diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
|
| 41 |
print(most_close, diff_value)
|
| 42 |
if most_close >= area_thres and diff_value >= 0.5:
|
| 43 |
voice_bot = tts(defaul_bot_voice, language="ja")
|
|
|
|
| 46 |
iface = gr.Interface(
|
| 47 |
fn=infer,
|
| 48 |
title="aisatsu api",
|
| 49 |
+
inputs=[gr.Image(label="image", type="pil", shape=(960, 640)), gr.Textbox(label="last seen", value="")],
|
| 50 |
outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
|
| 51 |
article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
|
| 52 |
).launch(enable_queue=True, debug=True)
|