Spaces:

chripto
/

desc_compare

Sleeping

App Files Files Community

chripto committed 13 days ago

Commit

8f315c1

verified ·

1 Parent(s): 9e6f606

Update main.py

Browse files

Files changed (1) hide show

main.py +68 -3

main.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import os
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 from huggingface_hub import InferenceClient
 import uvicorn
 app = FastAPI()
@@ -49,6 +52,56 @@ def format_prompt_for_text_generation(system_prompt: str, history: list, prompt:
     return out
 class Item(BaseModel):
     """Payload per /generate/. Per Ministral-3: temperature < 0.1 in produzione (raccomandato)."""
     prompt: str
@@ -102,7 +155,19 @@ def generate(item: Item) -> str:
     except Exception as e2:
         last_error = e2
-    # 3) Ultima risorsa: text_generation (solo per modelli che lo supportano su hf-inference)
     try:
         formatted = format_prompt_for_text_generation(
             item.system_prompt, item.history or [], item.prompt
@@ -123,8 +188,8 @@ def generate(item: Item) -> str:
                 for r in stream
             )
         return str(stream)
-    except Exception as e3:
-        last_error = e3
     raise HTTPException(status_code=502, detail=f"Inference fallita: {str(last_error)}")

 import os
+import json
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 from huggingface_hub import InferenceClient
 import uvicorn
+import urllib.request
+import urllib.error
 app = FastAPI()
     return out
+def chat_completion_via_http(messages: list, max_tokens: int, temperature: float, top_p: float) -> str | None:
+    """
+    Chiamata diretta all'endpoint HF chat completions (v1).
+    Usata quando il SDK fallisce perché il modello non dichiara il task (es. Ministral-3).
+    """
+    if not HF_TOKEN:
+        return None
+    base = INFERENCE_ENDPOINT_URL.rstrip("/") if INFERENCE_ENDPOINT_URL else f"https://api-inference.huggingface.co/models/{MODEL_ID}"
+    url = f"{base}/v1/chat/completions"
+    body = {
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+    }
+    data = json.dumps(body).encode("utf-8")
+    req = urllib.request.Request(
+        url,
+        data=data,
+        headers={
+            "Authorization": f"Bearer {HF_TOKEN}",
+            "Content-Type": "application/json",
+        },
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=120) as resp:
+            out = json.loads(resp.read().decode())
+    except urllib.error.HTTPError as e:
+        if e.code == 503:
+            err_body = e.read().decode() if e.fp else ""
+            try:
+                err_json = json.loads(err_body)
+                if "estimated_time" in err_json:
+                    raise HTTPException(
+                        status_code=503,
+                        detail=f"Modello in caricamento. Riprova tra {err_json.get('estimated_time', 0):.0f}s.",
+                    )
+            except (ValueError, TypeError):
+                pass
+        return None
+    except Exception:
+        return None
+    choices = out.get("choices") or []
+    if not choices:
+        return None
+    msg = choices[0].get("message") or {}
+    return (msg.get("content") or "").strip()
 class Item(BaseModel):
     """Payload per /generate/. Per Ministral-3: temperature < 0.1 in produzione (raccomandato)."""
     prompt: str
     except Exception as e2:
         last_error = e2
+    # 3) Chat completions via HTTP (endpoint v1) – funziona per modelli che non dichiarano il task (es. Ministral-3)
+    try:
+        content = chat_completion_via_http(
+            messages, item.max_new_tokens, temperature, top_p
+        )
+        if content is not None and content != "":
+            return content
+    except HTTPException:
+        raise
+    except Exception as e3:
+        last_error = e3
+    # 4) Ultima risorsa: text_generation (solo per modelli che lo supportano su hf-inference)
     try:
         formatted = format_prompt_for_text_generation(
             item.system_prompt, item.history or [], item.prompt
                 for r in stream
             )
         return str(stream)
+    except Exception as e4:
+        last_error = e4
     raise HTTPException(status_code=502, detail=f"Inference fallita: {str(last_error)}")