from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")
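# Note: this file assumes the Space installs fastapi, uvicorn, pydantic, llama-cpp-python,
# huggingface_hub and psutil (e.g. via requirements.txt or the Dockerfile); none of these
# dependency files are shown here.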

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str

class LocalLLMAgent:
    def __init__(self):
        # Download the model if it is not already present locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already exists, loading...", flush=True)

        # Use every available CPU core for inference
        n_threads = multiprocessing.cpu_count()
        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # System prompt (kept in Portuguese): tells the model to always answer
        # in Brazilian Portuguese, in a natural and conversational way
        self.messages = [
            {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")

        # Keep the running conversation history; note that it grows without bound
        # and will eventually exceed the 2048-token context window
        self.messages.append({"role": "user", "content": message})
        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})

        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message
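
# Direct (non-HTTP) usage sketch for local testing outside the API. This is a hypothetical
# example, shown as a comment so it does not run at import time:
#
#   agent = LocalLLMAgent()
#   print(agent.chat("Olá, tudo bem?", max_tokens=64))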

# The agent is initialized globally at startup
agent = None

@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)
    global agent
    agent = LocalLLMAgent()
    print("✅ API ready!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)

@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        # Returned while the model is still loading ("Model still loading, try again.")
        return ChatResponse(response="Modelo ainda carregando, tente novamente.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

@app.get("/system")
async def system_info():
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()
    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }

# Removed - uvicorn is launched by the Dockerfile
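
# Example requests against the running service. This is a sketch: it assumes the Dockerfile
# starts uvicorn on port 7860 (the usual Hugging Face Spaces port); adjust host/port as needed.
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Olá, tudo bem?", "max_tokens": 100, "temperature": 0.7}'
#
#   curl http://localhost:7860/health
#   curl http://localhost:7860/system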