Spaces:

Detomo
/

aisatsu-app-api

Paused

vumichien committed on Mar 25, 2023

Commit

2b9ea7f

1 Parent(s): 0ac7fe7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,13 +10,14 @@ from base64 import b64encode
 from speech_recognition import AudioFile, Recognizer
 import numpy as np
 from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
 model = YOLO('ultralyticsplus/yolov8s')
 CLASS = model.model.names
 defaul_bot_voice = "おはいようございます"
 area_thres = 0.3
-def infer(image):
     results = model.predict(image, show=False)[0]
     masks, boxes = results.masks, results.boxes
     area_image = image.width * image.height
@@ -33,6 +34,10 @@ def infer(image):
             if area_rate >= most_close:
                 out_img = image.crop(tuple(box)).resize((128, 128))
                 most_close = area_rate
     print(most_close, diff_value)
     if most_close >= area_thres and diff_value >= 0.5:
         voice_bot = tts(defaul_bot_voice, language="ja")
@@ -41,7 +46,7 @@ def infer(image):
 iface = gr.Interface(
     fn=infer,
     title="aisatsu api",
-    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
     outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
     article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
 ).launch(enable_queue=True, debug=True)

 from speech_recognition import AudioFile, Recognizer
 import numpy as np
 from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
+from scipy.spatial import distance as dist
 model = YOLO('ultralyticsplus/yolov8s')
 CLASS = model.model.names
 defaul_bot_voice = "おはいようございます"
 area_thres = 0.3
+def infer(image, last_seen):
     results = model.predict(image, show=False)[0]
     masks, boxes = results.masks, results.boxes
     area_image = image.width * image.height
             if area_rate >= most_close:
                 out_img = image.crop(tuple(box)).resize((128, 128))
                 most_close = area_rate
+    if last_seen != "":
+        last_seen = base64_to_pil(last_seen)
+        if out_img is not None:
+            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
     print(most_close, diff_value)
     if most_close >= area_thres and diff_value >= 0.5:
         voice_bot = tts(defaul_bot_voice, language="ja")
 iface = gr.Interface(
     fn=infer,
     title="aisatsu api",
+    inputs=[gr.Image(label="image", type="pil", shape=(960, 640)), gr.Textbox(label="last seen", value="")],
     outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
     article = "Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
 ).launch(enable_queue=True, debug=True)