vikramvasudevan committed on
Commit
7a031ac
·
verified ·
1 Parent(s): b24fcf4

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. config.py +6 -0
  2. db.py +58 -2
config.py CHANGED
@@ -533,6 +533,12 @@ class SanatanConfig:
533
  "collection_name": "yt_metadata",
534
  "collection_embedding_fn": "openai",
535
  "unit": "video",
 
 
 
 
 
 
536
  "metadata_fields": [
537
  {
538
  "name": "video_id",
 
533
  "collection_name": "yt_metadata",
534
  "collection_embedding_fn": "openai",
535
  "unit": "video",
536
+ "field_mapping": {
537
+ "text": "description",
538
+ "title": "video_title",
539
+ "author": "channel_title",
540
+ "reference_link": lambda doc: f"https://www.youtube.com/watch?v={doc.get('video_id','')}",
541
+ },
542
  "metadata_fields": [
543
  {
544
  "name": "video_id",
db.py CHANGED
@@ -155,13 +155,20 @@ class SanatanDatabase:
155
  return result
156
  else:
157
  print("No data available")
 
 
 
 
 
 
 
158
  # show a sample data record
159
- response = collection.get(
160
  limit=2,
161
  # offset=index, # pagination via offset
162
  include=["metadatas", "documents"],
163
  )
164
- print("sample data : ",response)
165
 
166
  return {"error": "No data available."}
167
 
@@ -393,3 +400,52 @@ class SanatanDatabase:
393
  embeddings=embeddings,
394
  )
395
  print("All documents re-embedded and added to new collection successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  return result
156
  else:
157
  print("No data available")
158
+ if index == 1:
159
+ # there should be at least one row in the collection?
160
+ # add index
161
+ self.add_unit_index_to_collection(
162
+ collection_name=collection_name, unit_field=unit_name
163
+ )
164
+
165
  # show a sample data record
166
+ response1 = collection.get(
167
  limit=2,
168
  # offset=index, # pagination via offset
169
  include=["metadatas", "documents"],
170
  )
171
+ print("sample data : ", response1)
172
 
173
  return {"error": "No data available."}
174
 
 
400
  embeddings=embeddings,
401
  )
402
  print("All documents re-embedded and added to new collection successfully!")
403
+
404
+ def add_unit_index_to_collection(self, collection_name: str, unit_field: str):
405
+ if collection_name != "yt_metadata":
406
+ # safeguard just in case
407
+ return
408
+ collection = self.chroma_client.get_collection(name=collection_name)
409
+
410
+ # fetch everything in batches (in case your collection is large)
411
+ batch_size = 100
412
+ offset = 0
413
+ unit_counter = 1
414
+
415
+ while True:
416
+ result = collection.get(
417
+ limit=batch_size,
418
+ offset=offset,
419
+ include=["documents", "metadatas", "embeddings"],
420
+ )
421
+
422
+ ids = result["ids"]
423
+ if not ids:
424
+ break # no more docs
425
+
426
+ docs = result["documents"]
427
+ metas = result["metadatas"]
428
+ embeddings = result["embeddings"]
429
+
430
+ # add unit_index to metadata
431
+ updated_metas = []
432
+ for meta in metas:
433
+ # ensure meta is not None
434
+ m = meta.copy() if meta else {}
435
+ m[unit_field] = unit_counter
436
+ updated_metas.append(m)
437
+ unit_counter += 1
438
+
439
+ # upsert with same IDs (will overwrite metadata but keep same id+doc)
440
+ collection.upsert(
441
+ ids=ids,
442
+ documents=docs,
443
+ metadatas=updated_metas,
444
+ embeddings=embeddings,
445
+ )
446
+
447
+ offset += batch_size
448
+
449
+ print(
450
+ f"✅ Finished adding {unit_field} to {unit_counter-1} documents in {collection_name}."
451
+ )