from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import psutil
import multiprocessing

app = FastAPI(title="LLM Agent API", version="1.0.0")
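# Note: this file assumes the Space installs fastapi, uvicorn, pydantic, llama-cpp-python,
# huggingface_hub and psutil (e.g. via requirements.txt or the Dockerfile); none of these
# dependency files are shown here.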

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 100
    temperature: float = 0.7


class ChatResponse(BaseModel):
    response: str

class LocalLLMAgent:
    def __init__(self):
        # Download the model if it is not already present locally
        model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
        if not os.path.exists(model_path):
            print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...", flush=True)
            print("   This may take a few minutes...", flush=True)
            model_path = hf_hub_download(
                repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                filename="llama-2-7b-chat.Q4_K_M.gguf",
                local_dir="./"
            )
            print("✅ Model downloaded successfully!", flush=True)
        else:
            print("📁 Model already exists, loading...", flush=True)

        # Use every available CPU core for inference
        n_threads = multiprocessing.cpu_count()
        print("🔧 Configuring llama-cpp-python:", flush=True)
        print(f"   - Available CPUs: {n_threads}", flush=True)
        print(f"   - Threads: {n_threads}", flush=True)
        print("   - Context: 2048 tokens", flush=True)

        print("🚀 Initializing model...", flush=True)
        self.llm = Llama(
            model_path=model_path,
            chat_format="llama-2",
            n_ctx=2048,
            n_threads=n_threads,
            n_threads_batch=n_threads,
            verbose=False
        )
        print(f"✅ Model loaded! Using {n_threads} threads", flush=True)

        # System prompt (kept in Portuguese): tells the model to always answer
        # in Brazilian Portuguese, in a natural and conversational way
        self.messages = [
            {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
        ]

    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
        print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
        print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")

        # Keep the running conversation history; note that it grows without bound
        # and will eventually exceed the 2048-token context window
        self.messages.append({"role": "user", "content": message})
        response = self.llm.create_chat_completion(
            messages=self.messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        assistant_message = response['choices'][0]['message']['content']
        self.messages.append({"role": "assistant", "content": assistant_message})

        print(f"✅ Response generated ({len(assistant_message)} chars)")
        return assistant_message
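
# Direct (non-HTTP) usage sketch for local testing outside the API. This is a hypothetical
# example, shown as a comment so it does not run at import time:
#
#   agent = LocalLLMAgent()
#   print(agent.chat("Olá, tudo bem?", max_tokens=64))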

# The agent is initialized globally at startup
agent = None

@app.on_event("startup")
async def startup_event():
    print("=== STARTING LLM AGENT API ===", flush=True)
    print(f"Available CPUs: {multiprocessing.cpu_count()}", flush=True)
    print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)
    global agent
    agent = LocalLLMAgent()
    print("✅ API ready!", flush=True)
    print("Available endpoints:", flush=True)
    print("  - POST /chat", flush=True)
    print("  - GET /health", flush=True)
    print("  - GET /system", flush=True)

@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    if agent is None:
        # Returned while the model is still loading ("Model still loading, try again.")
        return ChatResponse(response="Modelo ainda carregando, tente novamente.")
    response = agent.chat(request.message, request.max_tokens, request.temperature)
    return ChatResponse(response=response)

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

@app.get("/system")
async def system_info():
    cpu_count = multiprocessing.cpu_count()
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()
    return {
        "cpu_cores": cpu_count,
        "cpu_usage_per_core": cpu_percent,
        "cpu_usage_total": psutil.cpu_percent(interval=1),
        "memory_total_gb": round(memory.total / (1024**3), 2),
        "memory_used_gb": round(memory.used / (1024**3), 2),
        "memory_percent": memory.percent
    }

# Removed - uvicorn is launched by the Dockerfile
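
# Example requests against the running service. This is a sketch: it assumes the Dockerfile
# starts uvicorn on port 7860 (the usual Hugging Face Spaces port); adjust host/port as needed.
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Olá, tudo bem?", "max_tokens": 100, "temperature": 0.7}'
#
#   curl http://localhost:7860/health
#   curl http://localhost:7860/system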