import json
import logging
from collections import defaultdict
from dataclasses import dataclass

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Data files are resolved relative to the current working directory.
_AZHWARS_PATH = "./data/azhwars.json"
_DIVYA_DESAMS_PATH = "./data/divya_desams.json"


@dataclass
class Pasuram:
    """One row of the azhwars table, keyed by the table's header names."""

    prabandham_code: str
    azhwar_name: str
    prabandham_name: str


def _load_json(path: str):
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _load_azhwar_table() -> tuple[list, list]:
    """Return ``(header, rows)`` from the azhwars file.

    The file is a JSON array whose first element is the header row and whose
    remaining elements are data rows. An empty array yields ``([], [])``.
    """
    table = _load_json(_AZHWARS_PATH)
    if not table:
        return [], []
    return table[0], table[1:]


def _is_taniyan(meta) -> bool:
    """Tell whether *meta* has a ``section_type`` starting with "taniyan".

    A missing metadata dict, or a missing / non-string ``section_type``,
    counts as "not a taniyan" instead of raising.
    """
    section_type = (meta or {}).get("section_type")
    return isinstance(section_type, str) and section_type.startswith("taniyan")


def get_standardized_prabandham_names() -> list[Pasuram]:
    """Return every row of the azhwars table as a ``Pasuram``.

    Rows are sorted by the prabandham name (3rd field, index 2). Each row is
    paired with the header row to build the keyword arguments, so the header
    names must match the ``Pasuram`` field names.
    """
    header, rows = _load_azhwar_table()
    rows.sort(key=lambda row: row[2])
    return [Pasuram(**dict(zip(header, row))) for row in rows]


def get_standardized_azhwar_names() -> list[str]:
    """Return the distinct azhwar names from the azhwars table, sorted.

    The azhwar name is the 2nd field (index 1) of each data row.
    """
    _, rows = _load_azhwar_table()
    return sorted({row[1] for row in rows})


def get_standardized_divya_desam_names() -> list[str]:
    """Return the distinct divya desam titles, sorted.

    Titles are read from the ``title`` key of every entry under
    ``pageProps.hits`` in the divya desams file. Entries without a ``title``
    are skipped.
    """
    divya_desams = _load_json(_DIVYA_DESAMS_PATH)
    hits = divya_desams["pageProps"]["hits"]
    return sorted({hit["title"] for hit in hits if "title" in hit})


def reorder_taniyan(collection):
    """Renumber a collection so each prabandham's taniyans come first.

    Records are ordered by their current ``_global_index`` (records without
    one sort last) and grouped by ``prabandham_code``; groups keep the order
    in which they first appear. Within a group, taniyan records (those whose
    ``section_type`` starts with "taniyan") get ``verse`` 1, 2, ...; the
    remaining records follow, renumbered consecutively from the smallest
    ``verse`` they currently carry. ``_global_index`` is rewritten as a
    running counter starting at 1 across all groups.

    Args:
        collection: Object exposing ``get(include=[...])`` returning a mapping
            with ``ids`` and ``metadatas``, and ``update(ids=..., metadatas=...)``
            -- presumably a Chroma collection; confirm against the caller.

    Raises:
        KeyError: If a non-taniyan record has no ``verse``. This is raised
            before anything is written back, so the collection is untouched.
    """
    logger.info("reorder_taniyan: started")

    # Fetch all docs with ids + metadatas
    data = collection.get(include=["metadatas"])
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])

    if not ids or not metas:
        logger.warning("reorder_taniyan: no data found in collection")
        return

    # Sort globally by current _global_index. sorted() is stable, so records
    # sharing an index (or lacking one) keep their fetch order. Entries with
    # no metadata are dropped here: they have no prabandham_code to group by.
    records = sorted(
        ((i, m) for i, m in enumerate(metas) if m),
        key=lambda rec: rec[1].get("_global_index", float("inf")),
    )

    # Group by prabandham_code; dict insertion order keeps the groups in
    # order of first appearance.
    # NOTE(review): records without a prabandham_code are left untouched and
    # keep their old _global_index, which may collide with the renumbered
    # ones -- confirm this is intended.
    grouped = defaultdict(list)
    for i, meta in records:
        prabandham = meta.get("prabandham_code")
        if prabandham:
            grouped[prabandham].append((i, meta))

    update_ids = []
    update_metas = []
    global_counter = 1  # running _global_index across the collection

    for items in grouped.values():
        # Partitioning preserves the global order established above, so no
        # per-group re-sort is needed.
        taniyan_items = [(i, m) for i, m in items if _is_taniyan(m)]
        non_taniyan_items = [(i, m) for i, m in items if not _is_taniyan(m)]

        # Taniyans first (verse starts from 1).
        numbered = [
            (i, meta, verse_no)
            for verse_no, (i, meta) in enumerate(taniyan_items, start=1)
        ]

        # Non-taniyans continue from their base verse.
        if non_taniyan_items:
            base_verse = min(m["verse"] for _, m in non_taniyan_items)
            numbered.extend(
                (i, meta, base_verse + offset)
                for offset, (i, meta) in enumerate(non_taniyan_items)
            )

        for i, meta, verse in numbered:
            update_ids.append(ids[i])
            update_metas.append(
                {**meta, "_global_index": global_counter, "verse": verse}
            )
            global_counter += 1

    if update_ids:
        logger.info("reorder_taniyan: updating %d records...", len(update_ids))
        collection.update(ids=update_ids, metadatas=update_metas)
        logger.info("reorder_taniyan: update complete.")
    else:
        logger.info("reorder_taniyan: nothing to update")

    logger.info("reorder_taniyan: finished")


def delete_taniyan(collection):
    """Delete every record whose ``section_type`` starts with "taniyan".

    Args:
        collection: Object exposing ``get(include=[...])`` returning a mapping
            with ``ids`` and ``metadatas``, and ``delete(ids=...)`` --
            presumably a Chroma collection; confirm against the caller.
    """
    logger.info("delete_taniyan: started")

    # Fetch all docs (only ids + metadata needed)
    data = collection.get(include=["metadatas"])
    ids = data["ids"]
    metas = data["metadatas"] or []

    # Collect ids where section_type starts with "taniyan"
    taniyan_ids = [
        doc_id for doc_id, meta in zip(ids, metas) if _is_taniyan(meta)
    ]

    if taniyan_ids:
        logger.info("delete_taniyan: Deleting %d taniyan records...", len(taniyan_ids))
        collection.delete(ids=taniyan_ids)
        logger.info("delete_taniyan: Deleted %d taniyan records", len(taniyan_ids))
    else:
        logger.info("delete_taniyan: No taniyan records found")

    logger.info("delete_taniyan: finished")


if __name__ == "__main__":
    logger.info(get_standardized_azhwar_names())