Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

App Files Files Community

vikramvasudevan committed on Sep 17

Commit

7a031ac

verified ·

1 Parent(s): b24fcf4

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

config.py +6 -0
db.py +58 -2

config.py CHANGED Viewed

@@ -533,6 +533,12 @@ class SanatanConfig:
             "collection_name": "yt_metadata",
             "collection_embedding_fn": "openai",
             "unit": "video",
             "metadata_fields": [
                 {
                     "name": "video_id",

             "collection_name": "yt_metadata",
             "collection_embedding_fn": "openai",
             "unit": "video",
+            "field_mapping": {
+                "text": "description",
+                "title": "video_title",
+                "author": "channel_title",
+                "reference_link": lambda doc: f"https://www.youtube.com/watch?v={doc.get('video_id','')}",
+            },
             "metadata_fields": [
                 {
                     "name": "video_id",

db.py CHANGED Viewed

@@ -155,13 +155,20 @@ class SanatanDatabase:
             return result
         else:
             print("No data available")
             # show a sample data record
-            response = collection.get(
                 limit=2,
                 # offset=index,  # pagination via offset
                 include=["metadatas", "documents"],
             )
-            print("sample data : ",response)
             return {"error": "No data available."}
@@ -393,3 +400,52 @@ class SanatanDatabase:
                 embeddings=embeddings,
             )
         print("All documents re-embedded and added to new collection successfully!")

             return result
         else:
             print("No data available")
+            if index == 1:
+                # there should be atleast one row in the collection?
+                # add index
+                self.add_unit_index_to_collection(
+                    collection_name=collection_name, unit_field=unit_name
+                )
             # show a sample data record
+            response1 = collection.get(
                 limit=2,
                 # offset=index,  # pagination via offset
                 include=["metadatas", "documents"],
             )
+            print("sample data : ", response1)
             return {"error": "No data available."}
                 embeddings=embeddings,
             )
         print("All documents re-embedded and added to new collection successfully!")
+    def add_unit_index_to_collection(self, collection_name: str, unit_field: str):
+        if collection_name != "yt_metadata":
+            # safeguard just incase
+            return
+        collection = self.chroma_client.get_collection(name=collection_name)
+        # fetch everything in batches (in case your collection is large)
+        batch_size = 100
+        offset = 0
+        unit_counter = 1
+        while True:
+            result = collection.get(
+                limit=batch_size,
+                offset=offset,
+                include=["documents", "metadatas", "embeddings"],
+            )
+            ids = result["ids"]
+            if not ids:
+                break  # no more docs
+            docs = result["documents"]
+            metas = result["metadatas"]
+            embeddings = result["embeddings"]
+            # add unit_index to metadata
+            updated_metas = []
+            for meta in metas:
+                # ensure meta is not None
+                m = meta.copy() if meta else {}
+                m[unit_field] = unit_counter
+                updated_metas.append(m)
+                unit_counter += 1
+            # upsert with same IDs (will overwrite metadata but keep same id+doc)
+            collection.upsert(
+                ids=ids,
+                documents=docs,
+                metadatas=updated_metas,
+                embeddings=embeddings,
+            )
+            offset += batch_size
+        print(
+            f"✅ Finished adding {unit_field} to {unit_counter-1} documents in {collection_name}."
+        )