Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload folder using huggingface_hub
Browse files
config.py
CHANGED
|
@@ -533,6 +533,12 @@ class SanatanConfig:
|
|
| 533 |
"collection_name": "yt_metadata",
|
| 534 |
"collection_embedding_fn": "openai",
|
| 535 |
"unit": "video",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
"metadata_fields": [
|
| 537 |
{
|
| 538 |
"name": "video_id",
|
|
|
|
| 533 |
"collection_name": "yt_metadata",
|
| 534 |
"collection_embedding_fn": "openai",
|
| 535 |
"unit": "video",
|
| 536 |
+
"field_mapping": {
|
| 537 |
+
"text": "description",
|
| 538 |
+
"title": "video_title",
|
| 539 |
+
"author": "channel_title",
|
| 540 |
+
"reference_link": lambda doc: f"https://www.youtube.com/watch?v={doc.get('video_id','')}",
|
| 541 |
+
},
|
| 542 |
"metadata_fields": [
|
| 543 |
{
|
| 544 |
"name": "video_id",
|
db.py
CHANGED
|
@@ -155,13 +155,20 @@ class SanatanDatabase:
|
|
| 155 |
return result
|
| 156 |
else:
|
| 157 |
print("No data available")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
# show a sample data record
|
| 159 |
-
|
| 160 |
limit=2,
|
| 161 |
# offset=index, # pagination via offset
|
| 162 |
include=["metadatas", "documents"],
|
| 163 |
)
|
| 164 |
-
print("sample data : ",
|
| 165 |
|
| 166 |
return {"error": "No data available."}
|
| 167 |
|
|
@@ -393,3 +400,52 @@ class SanatanDatabase:
|
|
| 393 |
embeddings=embeddings,
|
| 394 |
)
|
| 395 |
print("All documents re-embedded and added to new collection successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
return result
|
| 156 |
else:
|
| 157 |
print("No data available")
|
| 158 |
+
if index == 1:
|
| 159 |
+
# there should be at least one row in the collection?
|
| 160 |
+
# add index
|
| 161 |
+
self.add_unit_index_to_collection(
|
| 162 |
+
collection_name=collection_name, unit_field=unit_name
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
# show a sample data record
|
| 166 |
+
response1 = collection.get(
|
| 167 |
limit=2,
|
| 168 |
# offset=index, # pagination via offset
|
| 169 |
include=["metadatas", "documents"],
|
| 170 |
)
|
| 171 |
+
print("sample data : ", response1)
|
| 172 |
|
| 173 |
return {"error": "No data available."}
|
| 174 |
|
|
|
|
| 400 |
embeddings=embeddings,
|
| 401 |
)
|
| 402 |
print("All documents re-embedded and added to new collection successfully!")
|
| 403 |
+
|
| 404 |
+
def add_unit_index_to_collection(
    self, collection_name: str, unit_field: str, batch_size: int = 100
):
    """Write a running 1-based index into the metadata of every record.

    Records are read page by page in the order ``collection.get`` returns
    them, and each record's position is stored in its metadata under
    ``unit_field``. Existing metadata keys are carried over unchanged.

    Args:
        collection_name: Collection to process. Any name other than
            ``"yt_metadata"`` makes the method return without doing anything.
        unit_field: Metadata key that receives the index.
        batch_size: Number of records fetched and updated per page.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if collection_name != "yt_metadata":
        # Safeguard, just in case: every other collection is left untouched.
        return
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    collection = self.chroma_client.get_collection(name=collection_name)

    offset = 0
    indexed = 0  # number of records stamped so far

    # NOTE(review): this pages by offset while writing to the same collection;
    # it presumes an in-place metadata update does not reorder the results of
    # `get` - confirm against the Chroma version in use.
    while True:
        # Only ids and metadata are needed, so documents and embeddings are
        # neither fetched nor written back.
        page = collection.get(
            limit=batch_size,
            offset=offset,
            include=["metadatas"],
        )

        ids = page["ids"]
        if not ids:
            break  # no more records

        # A record may have no metadata at all (None); start from an empty
        # dict in that case.
        updated_metas = [
            {**(meta or {}), unit_field: position}
            for position, meta in enumerate(page["metadatas"], start=indexed + 1)
        ]

        # Metadata-only update for the same ids.
        collection.update(ids=ids, metadatas=updated_metas)

        indexed += len(updated_metas)
        offset += batch_size

    print(
        f"✅ Finished adding {unit_field} to {indexed} documents in {collection_name}."
    )
|