Spaces:

caiiofc
/

llm-agent-api

Sleeping

App Files Community

caiocampos-hotmart committed on Jul 31

Commit

fcba0d8

1 Parent(s): a6981fd

chore: improve logs

Browse files

Files changed (2) hide show

Dockerfile +4 -0
app.py +18 -18

Dockerfile CHANGED Viewed

@@ -8,6 +8,10 @@ ENV PATH="/home/user/.local/bin:$PATH"
 ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
 ENV FORCE_CMAKE=1
 WORKDIR /app
 COPY --chown=user ./requirements.txt requirements.txt

 ENV CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
 ENV FORCE_CMAKE=1
+# Forçar logs em tempo real
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=utf-8
 WORKDIR /app
 COPY --chown=user ./requirements.txt requirements.txt

app.py CHANGED Viewed

@@ -22,25 +22,25 @@ class LocalLLMAgent:
         model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
         if not os.path.exists(model_path):
-            print("📥 Baixando modelo Llama-2-7B-Chat (Q4_K_M)...")
-            print("   Isso pode levar alguns minutos...")
             model_path = hf_hub_download(
                 repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                 filename="llama-2-7b-chat.Q4_K_M.gguf",
                 local_dir="./"
             )
-            print("✅ Modelo baixado com sucesso!")
         else:
-            print("📁 Modelo já existe, carregando...")
         # Configura para usar todas as CPUs disponíveis
         n_threads = multiprocessing.cpu_count()
-        print(f"🔧 Configurando llama-cpp-python:")
-        print(f"   - CPUs disponíveis: {n_threads}")
-        print(f"   - Threads: {n_threads}")
-        print(f"   - Contexto: 2048 tokens")
-        print("🚀 Inicializando modelo...")
         self.llm = Llama(
             model_path=model_path,
             chat_format="llama-2",
@@ -49,7 +49,7 @@ class LocalLLMAgent:
             n_threads_batch=n_threads,
             verbose=False
         )
-        print(f"✅ Modelo carregado! Usando {n_threads} threads")
         self.messages = [
             {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
         ]
@@ -77,18 +77,18 @@ agent = None
 @app.on_event("startup")
 async def startup_event():
-    print("=== INICIANDO LLM AGENT API ===")
-    print(f"CPUs disponíveis: {multiprocessing.cpu_count()}")
-    print(f"Memória total: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
     global agent
     agent = LocalLLMAgent()
-    print("✅ API pronta para uso!")
-    print("Endpoints disponíveis:")
-    print("  - POST /chat")
-    print("  - GET /health")
-    print("  - GET /system")
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):

         model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
         if not os.path.exists(model_path):
+            print("📥 Baixando modelo Llama-2-7B-Chat (Q4_K_M)...", flush=True)
+            print("   Isso pode levar alguns minutos...", flush=True)
             model_path = hf_hub_download(
                 repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                 filename="llama-2-7b-chat.Q4_K_M.gguf",
                 local_dir="./"
             )
+            print("✅ Modelo baixado com sucesso!", flush=True)
         else:
+            print("📁 Modelo já existe, carregando...", flush=True)
         # Configura para usar todas as CPUs disponíveis
         n_threads = multiprocessing.cpu_count()
+        print(f"🔧 Configurando llama-cpp-python:", flush=True)
+        print(f"   - CPUs disponíveis: {n_threads}", flush=True)
+        print(f"   - Threads: {n_threads}", flush=True)
+        print(f"   - Contexto: 2048 tokens", flush=True)
+        print("🚀 Inicializando modelo...", flush=True)
         self.llm = Llama(
             model_path=model_path,
             chat_format="llama-2",
             n_threads_batch=n_threads,
             verbose=False
         )
+        print(f"✅ Modelo carregado! Usando {n_threads} threads", flush=True)
         self.messages = [
             {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
         ]
 @app.on_event("startup")
 async def startup_event():
+    print("=== INICIANDO LLM AGENT API ===", flush=True)
+    print(f"CPUs disponíveis: {multiprocessing.cpu_count()}", flush=True)
+    print(f"Memória total: {round(psutil.virtual_memory().total / (1024**3), 2)} GB", flush=True)
     global agent
     agent = LocalLLMAgent()
+    print("✅ API pronta para uso!", flush=True)
+    print("Endpoints disponíveis:", flush=True)
+    print("  - POST /chat", flush=True)
+    print("  - GET /health", flush=True)
+    print("  - GET /system", flush=True)
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):