Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

App Files Community

sethmcknight committed on Oct 24

Commit

3916e13

1 Parent(s): 6d37c4a

fix: Add detailed logging and improved locking for ingestion startup

Browse files

Files changed (1) hide show

src/app_factory.py +130 -43

src/app_factory.py CHANGED Viewed

@@ -27,30 +27,58 @@ class InitializationTimeoutError(Exception):
 def ensure_embeddings_on_startup():
     """
     Ensure embeddings exist and have the correct dimension on app startup.
-    This is critical for Render deployments where the vector store is ephemeral.
     Uses a file-based lock to prevent race conditions between workers.
     """
     lock_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data", "locks")
     if not os.path.exists(lock_dir):
-        os.makedirs(lock_dir)
     lock_file = os.path.join(lock_dir, "ingestion.lock")
-    lock_timeout = 180  # 3 minutes
-    start_time = time.time()
-    while os.path.exists(lock_file):
-        if time.time() - start_time > lock_timeout:
-            logging.error(f"Lock file {lock_file} has been present for over {lock_timeout} seconds. Aborting wait.")
-            # In a real-world scenario, you might want to raise an exception
-            # or attempt to delete a stale lock file. For now, we just stop waiting.
             return
-        logging.info(f"Another process is handling ingestion. Waiting for lock file {lock_file} to be released...")
-        time.sleep(5)
     try:
-        # Acquire lock
-        with open(lock_file, "w") as f:
-            f.write(str(os.getpid()))
-        logging.info(f"Acquired ingestion lock: {lock_file}")
         from src.config import (
             COLLECTION_NAME,
@@ -65,54 +93,112 @@ def ensure_embeddings_on_startup():
         from src.ingestion.ingestion_pipeline import IngestionPipeline
         from src.vector_store.vector_db import VectorDatabase
-        logging.info("Checking vector store on startup...")
         # Initialize vector database to check its state
-        vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)
         # Check if embeddings exist and have correct dimension
-        if not vector_db.has_valid_embeddings(EMBEDDING_DIMENSION):
             logging.warning(
-                f"Vector store is empty or has wrong dimension. "
-                f"Expected: {EMBEDDING_DIMENSION}, "
-                f"Current: {vector_db.get_embedding_dimension()}"
             )
-            logging.info(f"Running ingestion pipeline with model: {EMBEDDING_MODEL_NAME}")
             # Run ingestion pipeline to rebuild embeddings
-            ingestion_pipeline = IngestionPipeline(
-                chunk_size=DEFAULT_CHUNK_SIZE,
-                overlap=DEFAULT_OVERLAP,
-                seed=RANDOM_SEED,
-                store_embeddings=True,
-            )
             # Process the corpus directory
-            results = ingestion_pipeline.process_directory(CORPUS_DIRECTORY)
             if not results or len(results) == 0:
                 logging.error(
-                    "Ingestion failed or processed 0 chunks. "
-                    "Please check the corpus directory and "
-                    "ingestion pipeline for errors."
                 )
             else:
-                logging.info(f"Ingestion completed: {len(results)} chunks processed")
         else:
             logging.info(
-                f"Vector store is valid with {vector_db.get_count()} embeddings "
-                f"of dimension {vector_db.get_embedding_dimension()}"
             )
     except Exception as e:
-        logging.error(f"Failed to ensure embeddings on startup: {e}")
         # Don't crash the app, but log the error
         # The app will still start but searches may fail
     finally:
         # Release lock
-        if os.path.exists(lock_file):
-            os.remove(lock_file)
-            logging.info(f"Released ingestion lock: {lock_file}")
 def create_app(
@@ -1119,13 +1205,14 @@ def create_app(
         except Exception as e:
             logging.warning(f"Failed to register document management blueprint: {e}")
-        # Conditionally run ingestion pipeline on startup based on environment variable
         if os.getenv("REBUILD_EMBEDDINGS_ON_START", "false").lower() == "true":
             with app.app_context():
-                logging.info("REBUILD_EMBEDDINGS_ON_START is true, ensuring embeddings exist.")
                 ensure_embeddings_on_startup()
         else:
-            logging.info("REBUILD_EMBEDDINGS_ON_START is not set to true, skipping initial embedding.")
         # Add Render-specific memory middleware if running on Render and
         # memory monitoring is enabled

 def ensure_embeddings_on_startup():
     """
     Ensure embeddings exist and have the correct dimension on app startup.
+    This is critical for Hugging Face deployments where the vector store needs to be built on startup.
     Uses a file-based lock to prevent race conditions between workers.
     """
+    import fcntl
+    logging.info(f"[PID {os.getpid()}] Starting ensure_embeddings_on_startup function")
     lock_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data", "locks")
     if not os.path.exists(lock_dir):
+        try:
+            os.makedirs(lock_dir)
+            logging.info(f"[PID {os.getpid()}] Created lock directory: {lock_dir}")
+        except Exception as e:
+            logging.error(f"[PID {os.getpid()}] Failed to create lock directory: {e}")
+            return
     lock_file = os.path.join(lock_dir, "ingestion.lock")
+    lock_timeout = 300  # 5 minutes for Hugging Face with more resources
+    logging.info(f"[PID {os.getpid()}] Attempting to acquire lock: {lock_file}")
+    # Use proper file locking with fcntl for better reliability
+    try:
+        lock_fd = open(lock_file, "w")
+        fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+        logging.info(f"[PID {os.getpid()}] Successfully acquired exclusive lock")
+        # Write PID to lock file for debugging
+        lock_fd.write(f"{os.getpid()}\n")
+        lock_fd.flush()
+    except (IOError, OSError):
+        logging.info(f"[PID {os.getpid()}] Lock is held by another process, waiting...")
+        lock_fd.close()
+        # Wait for lock to be released
+        start_time = time.time()
+        while time.time() - start_time < lock_timeout:
+            try:
+                lock_fd = open(lock_file, "w")
+                fcntl.flock(lock_fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+                logging.info(f"[PID {os.getpid()}] Lock acquired after waiting {time.time() - start_time:.1f}s")
+                break
+            except (IOError, OSError):
+                lock_fd.close()
+                time.sleep(2)
+        else:
+            logging.error(f"[PID {os.getpid()}] Timeout waiting for lock after {lock_timeout}s")
             return
     try:
+        logging.info(f"[PID {os.getpid()}] Lock acquired, starting ingestion process")
         from src.config import (
             COLLECTION_NAME,
         from src.ingestion.ingestion_pipeline import IngestionPipeline
         from src.vector_store.vector_db import VectorDatabase
+        logging.info(f"[PID {os.getpid()}] Imported modules successfully")
+        logging.info(f"[PID {os.getpid()}] Checking vector store at: {VECTOR_DB_PERSIST_PATH}")
+        logging.info(f"[PID {os.getpid()}] Collection name: {COLLECTION_NAME}")
+        logging.info(f"[PID {os.getpid()}] Corpus directory: {CORPUS_DIRECTORY}")
+        logging.info(f"[PID {os.getpid()}] Expected embedding dimension: {EMBEDDING_DIMENSION}")
         # Initialize vector database to check its state
+        try:
+            vector_db = VectorDatabase(VECTOR_DB_PERSIST_PATH, COLLECTION_NAME)
+            logging.info(f"[PID {os.getpid()}] Vector database initialized successfully")
+        except Exception as e:
+            logging.error(f"[PID {os.getpid()}] Failed to initialize vector database: {e}")
+            raise
         # Check if embeddings exist and have correct dimension
+        try:
+            current_count = vector_db.get_count()
+            current_dimension = vector_db.get_embedding_dimension()
+            logging.info(
+                f"[PID {os.getpid()}] Current database state: {current_count} embeddings, dimension {current_dimension}"
+            )
+            has_valid = vector_db.has_valid_embeddings(EMBEDDING_DIMENSION)
+            logging.info(f"[PID {os.getpid()}] Has valid embeddings: {has_valid}")
+        except Exception as e:
+            logging.error(f"[PID {os.getpid()}] Failed to check vector database state: {e}")
+            # Assume we need to rebuild
+            has_valid = False
+            current_count = 0
+            current_dimension = 0
+        if not has_valid:
             logging.warning(
+                f"[PID {os.getpid()}] Vector store is empty or has wrong dimension. "
+                f"Expected: {EMBEDDING_DIMENSION}, Current: {current_dimension}, "
+                f"Count: {current_count}"
             )
+            logging.info(f"[PID {os.getpid()}] Starting ingestion pipeline with model: {EMBEDDING_MODEL_NAME}")
+            # Check if corpus directory exists
+            if not os.path.exists(CORPUS_DIRECTORY):
+                logging.error(f"[PID {os.getpid()}] Corpus directory does not exist: {CORPUS_DIRECTORY}")
+                return
+            corpus_files = os.listdir(CORPUS_DIRECTORY)
+            logging.info(f"[PID {os.getpid()}] Found {len(corpus_files)} files in corpus directory")
             # Run ingestion pipeline to rebuild embeddings
+            try:
+                ingestion_pipeline = IngestionPipeline(
+                    chunk_size=DEFAULT_CHUNK_SIZE,
+                    overlap=DEFAULT_OVERLAP,
+                    seed=RANDOM_SEED,
+                    store_embeddings=True,
+                )
+                logging.info(f"[PID {os.getpid()}] Ingestion pipeline created successfully")
+            except Exception as e:
+                logging.error(f"[PID {os.getpid()}] Failed to create ingestion pipeline: {e}")
+                raise
             # Process the corpus directory
+            try:
+                logging.info(f"[PID {os.getpid()}] Starting to process corpus directory...")
+                results = ingestion_pipeline.process_directory(CORPUS_DIRECTORY)
+                logging.info(f"[PID {os.getpid()}] Process directory completed, got results: {type(results)}")
+            except Exception as e:
+                logging.error(f"[PID {os.getpid()}] Failed during directory processing: {e}", exc_info=True)
+                raise
             if not results or len(results) == 0:
                 logging.error(
+                    f"[PID {os.getpid()}] Ingestion failed or processed 0 chunks. "
+                    "Please check the corpus directory and ingestion pipeline for errors."
                 )
             else:
+                logging.info(f"[PID {os.getpid()}] Ingestion completed successfully: {len(results)} chunks processed")
+                # Verify the embeddings were actually stored
+                try:
+                    final_count = vector_db.get_count()
+                    final_dimension = vector_db.get_embedding_dimension()
+                    logging.info(
+                        f"[PID {os.getpid()}] Final database state: {final_count} embeddings, "
+                        f"dimension {final_dimension}"
+                    )
+                except Exception as e:
+                    logging.error(f"[PID {os.getpid()}] Failed to verify final database state: {e}")
         else:
             logging.info(
+                f"[PID {os.getpid()}] Vector store is valid with {current_count} embeddings "
+                f"of dimension {current_dimension}"
             )
     except Exception as e:
+        logging.error(f"[PID {os.getpid()}] Failed to ensure embeddings on startup: {e}", exc_info=True)
         # Don't crash the app, but log the error
         # The app will still start but searches may fail
     finally:
         # Release lock
+        try:
+            fcntl.flock(lock_fd.fileno(), fcntl.LOCK_UN)
+            lock_fd.close()
+            logging.info(f"[PID {os.getpid()}] Released ingestion lock")
+        except Exception as e:
+            logging.error(f"[PID {os.getpid()}] Failed to release lock: {e}")
 def create_app(
         except Exception as e:
             logging.warning(f"Failed to register document management blueprint: {e}")
+        # Use pre-built embeddings by default for reliable deployment
+        # Only rebuild embeddings if explicitly requested via environment variable
         if os.getenv("REBUILD_EMBEDDINGS_ON_START", "false").lower() == "true":
             with app.app_context():
+                logging.info("REBUILD_EMBEDDINGS_ON_START is true, rebuilding embeddings on startup.")
                 ensure_embeddings_on_startup()
         else:
+            logging.info("Using pre-built embeddings. Set REBUILD_EMBEDDINGS_ON_START=true to rebuild.")
         # Add Render-specific memory middleware if running on Render and
         # memory monitoring is enabled