vikramvasudevan committed
Commit 2a96fbf (verified) · Parent(s): 2f2633c

Upload folder using huggingface_hub

Files changed (7):
  1. app.py +37 -14
  2. embeddings.py +27 -10
  3. modules/nodes/init.py +31 -9
  4. modules/nodes/state.py +2 -1
  5. pyproject.toml +1 -0
  6. requirements.txt +2 -0
  7. uv.lock +11 -0
app.py CHANGED
@@ -25,6 +25,17 @@ logger.setLevel(logging.INFO)
 
 graph = generate_graph()
 
+import pycountry
+
+
+def get_all_languages():
+    """
+    Returns a sorted list of all languages by their English names.
+    Uses ISO 639 data from pycountry.
+    """
+    languages = [lang.name for lang in pycountry.languages if hasattr(lang, "name")]
+    return sorted(set(languages))  # remove duplicates and sort alphabetically
+
 
 def init():
     load_dotenv(override=True)
@@ -93,19 +104,29 @@ thinking_verbs = [
 ]
 
 
-async def chat_wrapper(message, history, thread_id, debug):
+async def chat_wrapper(
+    message, history, thread_id, debug, preferred_language="English"
+):
     if debug:
-        async for chunk in chat_streaming(debug, message, history, thread_id):
+        async for chunk in chat_streaming(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        ):
             yield chunk
     else:
-        response = chat(debug, message, history, thread_id)
+        response = chat(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        )
         yield response
 
 
-def chat(debug_mode, message, history, thread_id):
+def chat(debug_mode, message, history, thread_id, preferred_language="English"):
     config = {"configurable": {"thread_id": thread_id}}
     response = graph.invoke(
-        {"debug_mode": debug_mode, "messages": [{"role": "user", "content": message}]},
+        {
+            "debug_mode": debug_mode,
+            "messages": [{"role": "user", "content": message}],
+            "language": preferred_language,
+        },
         config=config,
     )
     return response["messages"][-1].content
@@ -140,10 +161,13 @@ def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
     )
 
 
-async def chat_streaming(debug_mode: bool, message, history, thread_id):
+async def chat_streaming(
+    debug_mode: bool, message, history, thread_id, preferred_language="English"
+):
     state = {
         "debug_mode": debug_mode,
         "messages": (history or []) + [{"role": "user", "content": message}],
+        "language": preferred_language,
     }
     config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 15}
     start_time = time.time()
@@ -184,9 +208,7 @@ async def chat_streaming(debug_mode: bool, message, history, thread_id):
     truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
 
     def generate_processing_message():
-        return (
-            f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
-        )
+        return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
 
     if (
         not isinstance(msg, ToolMessage)
@@ -197,9 +219,7 @@ async def chat_streaming(debug_mode: bool, message, history, thread_id):
     if isinstance(msg, ToolMessage):
         logger.debug("tool message = %s", msg)
 
-        html = (
-            f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
-        )
+        html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
         yield f"### { ' → '.join(node_tree)}\n{html}"
     elif isinstance(msg, AIMessageChunk):
 
@@ -438,6 +458,9 @@ with gr.Blocks(
 
     gr.Markdown(value="------")
     debug_checkbox = gr.Checkbox(label="Debug (Streaming)", value=True)
+    preferred_language = gr.Dropdown(
+        choices=get_all_languages(), value="English", label="Preferred Language"
+    )
     chatbot = gr.Chatbot(
         elem_id="chatbot",
         avatar_images=("assets/avatar_user.png", "assets/adiyen_bot.png"),
@@ -453,9 +476,9 @@ with gr.Blocks(
     chatInterface = gr.ChatInterface(
         title="Sanatan-AI",
         fn=chat_wrapper,
-        additional_inputs=[thread_id, debug_checkbox],
+        additional_inputs=[thread_id, debug_checkbox, preferred_language],
         chatbot=chatbot,
         textbox=message_textbox,
     )
 
-# app.launch()
+# app.launch()
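
For reference, the new `get_all_languages()` helper is self-contained and can be exercised outside the app. A minimal sketch, assuming only that `pycountry` is installed (the printed count is illustrative):

```python
# Standalone check of the helper added above (assumes pycountry is installed).
import pycountry

def get_all_languages():
    # pycountry.languages iterates ISO 639 records; the hasattr guard skips
    # any record that lacks an English name.
    languages = [lang.name for lang in pycountry.languages if hasattr(lang, "name")]
    return sorted(set(languages))  # deduplicate, then sort alphabetically

langs = get_all_languages()
print(len(langs))          # several thousand ISO 639-3 language names
assert "English" in langs  # the dropdown's default value must be a valid choice
```

Since `gr.ChatInterface` passes `additional_inputs` positionally after `(message, history)`, the new `preferred_language` dropdown must stay last in the list so it lines up with the final parameter of `chat_wrapper`.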
embeddings.py CHANGED
@@ -27,30 +27,47 @@ def chunk_text(text: str, max_tokens: int = 1000) -> list[str]:
     tokens = tokenizer.encode(text)
     return [tokenizer.decode(tokens[i:i+max_tokens]) for i in range(0, len(tokens), max_tokens)]
 
+import numpy as np
+
+EMBED_DIM = 3072  # dimension of text-embedding-3-large
+
 def _get_openai_embedding(texts: list[str]) -> list[list[float]]:
     """Get embeddings for a list of texts. If a text is too long, chunk + average."""
     final_embeddings = []
 
     for text in texts:
+        if not text or not isinstance(text, str) or not text.strip():
+            # fallback: skip or append zero vector
+            final_embeddings.append([0.0] * EMBED_DIM)
+            continue
+
         # Split into chunks if too long
         if len(tokenizer.encode(text)) > 8192:
             chunks = chunk_text(text)
         else:
             chunks = [text]
 
-        # Call API on all chunks at once
-        response = client.embeddings.create(
-            model="text-embedding-3-large",
-            input=chunks
-        )
-        chunk_embeddings = [np.array(d.embedding) for d in response.data]
-
-        # Average embeddings if multiple chunks
-        avg_embedding = np.mean(chunk_embeddings, axis=0)
-        final_embeddings.append(avg_embedding.tolist())
+        # Clean chunks
+        clean_chunks = [c.strip() for c in chunks if isinstance(c, str) and c.strip()]
+        if not clean_chunks:
+            final_embeddings.append([0.0] * EMBED_DIM)
+            continue
+
+        try:
+            response = client.embeddings.create(
+                model="text-embedding-3-large",
+                input=clean_chunks
+            )
+            chunk_embeddings = [np.array(d.embedding) for d in response.data]
+            avg_embedding = np.mean(chunk_embeddings, axis=0)
+            final_embeddings.append(avg_embedding.tolist())
+        except Exception as e:
+            print(f"Embedding failed for text[:100]={text[:100]!r}, error={e}")
+            final_embeddings.append([0.0] * EMBED_DIM)  # fallback
 
     return final_embeddings
 
+
 embedding_cache = {}
 
 def get_embedding(texts: list[str], backend: Literal["hf","openai"] = "hf") -> list[list[float]]:
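
The reworked `_get_openai_embedding` now degrades gracefully: empty or whitespace-only inputs get a zero vector instead of raising, and per-text API failures are caught. A minimal sketch of the chunk-average-fallback flow, with a hypothetical `fake_embed` stub standing in for `client.embeddings.create` (the stub is an assumption for illustration; `EMBED_DIM` matches the 3072-dimensional output of text-embedding-3-large):

```python
# Offline illustration of the new fallback behavior; no API key required.
import numpy as np

EMBED_DIM = 3072  # output dimension of text-embedding-3-large

def fake_embed(chunks: list[str]) -> list[np.ndarray]:
    # Hypothetical stand-in for client.embeddings.create: one vector per chunk.
    return [np.full(EMBED_DIM, float(len(c))) for c in chunks]

def embed_with_fallback(text) -> list[float]:
    if not isinstance(text, str) or not text.strip():
        return [0.0] * EMBED_DIM  # same zero-vector fallback the commit adds
    chunks = [text]  # chunk_text() would split here past the 8192-token limit
    vectors = fake_embed([c.strip() for c in chunks])
    return np.mean(vectors, axis=0).tolist()  # average the chunk embeddings

print(embed_with_fallback("   ")[:3])    # [0.0, 0.0, 0.0]: fallback path
print(embed_with_fallback("hello")[:3])  # [5.0, 5.0, 5.0]: averaged stub vectors
```

One caveat: a zero vector has no direction, so cosine similarity against it is undefined; downstream search code should treat these entries as "no embedding" rather than as a real match.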
modules/nodes/init.py CHANGED
@@ -6,7 +6,11 @@ from modules.nodes.state import ChatState
 
 def init_system_prompt_node(state: ChatState) -> ChatState:
     messages = state["messages"] or []
-    initialized = state["initialized"] if "initialized" in state else False
+    initialized = state["initialized"] if "initialized" in state else False
+
+    if "language" not in state:
+        # Set default language
+        state["language"] = "English"
 
     # Check if system prompts were already added
     if not initialized:
@@ -76,7 +80,7 @@ Example user queries and tool usage:
 """
         ),
         SystemMessage(
-            content="""You are a knowledgeable assistant on the scripture *{collection_name}*, well-versed in **Sanskrit**, **English** and **Tamil**.
+            content="""You are a knowledgeable assistant on the scripture *{collection_name}*, well-versed in **Sanskrit**, **{user_preferred_language}** and **Tamil**.
 You must answer the question using **only** the content from *{collection_name}* provided in the context below.
 - Do **not** bring in information from **any other scripture or source**, or from prior knowledge, even if the answer seems obvious or well-known.
 - Do **not** quote any Sanskrit/Tamil verses unless they appear **explicitly** in the provided context.
@@ -88,7 +92,8 @@ If the answer asks for translation to another language of their choice and you a
 If the answer WAS indeed found in the context, use the following response format (in Markdown), otherwise clearly state **"I do not have enough information from the {collection_name} to answer this. I searched using {search_methodology}. Do you want me to try another search like {alternative_searchmethod}?"**
 
 ### 🧾 Answer
-- Present a brief summary of your response in concise **English**. Mention only the scripture(s), chapter(s) and verse number(s) available if multiple matches are available.
+- Present a brief summary of your response in concise **{user_preferred_language}**. Mention only the scripture(s), chapter(s) and verse number(s) available if multiple matches are available.
+- This needs to begin with "`author` says".
 
 The following format should be used to show only the most relevant match. Do not show all matches at once.
 
@@ -98,6 +103,9 @@ The following format should be used to show only the most relevant match. Do not
 ### 🕮 Chapter Title(s)
 - Mention the chapter(s) from which the references were taken. Use the field *title* here from the context if available. For example `TVM 1.8.3`
 
+### 🕮 Author(s)
+- Mention the name of the Author. In the case of divya_prabandham, it is the `azhwar_name`
+
 ### 🕮 Verse Number(s)
 - Mention the *verse number* from which the references were taken.
 
@@ -117,16 +125,16 @@ The following format should be used to show only the most relevant match. Do not
 - Do not translate, transliterate, or interpret.
 - Do not hallucinate or generate new verses.
 - Output should only be the **cleaned, original verses**.
-- The output in this section **MUST** be in native script, not English or transliterated English.
+- The output in this section **MUST** be in native script, not {user_preferred_language} or transliterated {user_preferred_language}.
 > If you are unsure about a character, leave it as it is rather than guessing.
 
 
-### 📜 English Transliteration(s)
-- For each verse above, provide the **matching English transliteration**.
-- Maintain the **same order** as the verses listed above.
+### 📜 {user_preferred_language} Transliteration(s)
+- If `{user_preferred_language}` is the SAME as the native verse language, **omit this entire section completely** (do not output even the heading).
+- Otherwise, provide the transliterations in {user_preferred_language}, matching the order of verses above.
 
-### 📜 English Translation(s)
-- Provide the **English meaning** for each verse listed above.
+### 📜 {user_preferred_language} Translation(s)
+- Provide the **{user_preferred_language} meaning** for each verse listed above.
 - Again, follow the **same order**.
 - Do **not** repeat the original verse here — just the translation.
@@ -155,6 +163,20 @@ Respond in **Markdown** format only. Ensure native Sanskrit/Tamil verses are alw
         ),
     ]
     state["initialized"] = True
+
+    state["messages"].append(
+        SystemMessage(
+            content=(
+                f"Note: `user_preferred_language` is {state['language']}. "
+                f"Carefully translate all other sections (including the section headings) in the response "
+                f"**except the Native verses** to {state['language']}. "
+                f"While translating, meticulously correct any spelling mistakes, typos, conversion errors, "
+                f"and remove any untranslated words or foreign characters. "
+                f"Ensure the output text is **fully natural, grammatically correct, and orthographically valid** "
+                f"in {state['language']}."
+            )
+        )
+    )
     state["tool_calls"] = 0
     state["seen_tool_calls"] = set()
     state["skip_tool"] = False
modules/nodes/state.py CHANGED
@@ -9,4 +9,5 @@ class ChatState(TypedDict):
     tool_calls: int
     seen_tool_calls: set[tuple[str, str]]  # (tool_name, params_hash)
     skip_tool: bool
-    initialized : bool
+    initialized : bool
+    language : str
pyproject.toml CHANGED
@@ -17,5 +17,6 @@ dependencies = [
     "langchain-openai>=0.3.28",
     "langgraph>=0.6.2",
     "oauth2client>=4.1.3",
+    "pycountry>=24.6.1",
     "sentence-transformers>=5.0.0",
 ]
requirements.txt CHANGED
@@ -332,6 +332,8 @@ pyasn1-modules==0.4.2
     # oauth2client
 pybase64==1.4.2
     # via chromadb
+pycountry==24.6.1
+    # via sanatan-ai (pyproject.toml)
 pydantic==2.11.7
     # via
         # chromadb
uv.lock CHANGED
@@ -2273,6 +2273,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3c/52/5600104ef7b85f89fb8ec54f73504ead3f6f0294027e08d281f3cafb5c1a/pybase64-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:f25140496b02db0e7401567cd869fb13b4c8118bf5c2428592ec339987146d8b", size = 31600, upload-time = "2025-07-27T13:05:52.24Z" },
 ]
 
+[[package]]
+name = "pycountry"
+version = "24.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/76/57/c389fa68c50590881a75b7883eeb3dc15e9e73a0fdc001cdd45c13290c92/pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", size = 6043910, upload-time = "2024-06-01T04:12:15.05Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b1/ec/1fb891d8a2660716aadb2143235481d15ed1cbfe3ad669194690b0604492/pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f", size = 6335189, upload-time = "2024-06-01T04:11:49.711Z" },
+]
+
 [[package]]
 name = "pycparser"
 version = "2.22"
@@ -2750,6 +2759,7 @@ dependencies = [
     { name = "langchain-openai" },
     { name = "langgraph" },
     { name = "oauth2client" },
+    { name = "pycountry" },
     { name = "sentence-transformers" },
 ]
 
@@ -2767,6 +2777,7 @@ requires-dist = [
     { name = "langchain-openai", specifier = ">=0.3.28" },
     { name = "langgraph", specifier = ">=0.6.2" },
     { name = "oauth2client", specifier = ">=4.1.3" },
+    { name = "pycountry", specifier = ">=24.6.1" },
     { name = "sentence-transformers", specifier = ">=5.0.0" },
 ]