vikramvasudevan committed
Commit 2a96fbf (verified) · Parent(s): 2f2633c

Upload folder using huggingface_hub

Files changed (7):
  1. app.py +37 -14
  2. embeddings.py +27 -10
  3. modules/nodes/init.py +31 -9
  4. modules/nodes/state.py +2 -1
  5. pyproject.toml +1 -0
  6. requirements.txt +2 -0
  7. uv.lock +11 -0
app.py CHANGED
@@ -25,6 +25,17 @@ logger.setLevel(logging.INFO)
 
 graph = generate_graph()
 
+import pycountry
+
+
+def get_all_languages():
+    """
+    Returns a sorted list of all languages by their English names.
+    Uses ISO 639 data from pycountry.
+    """
+    languages = [lang.name for lang in pycountry.languages if hasattr(lang, "name")]
+    return sorted(set(languages))  # remove duplicates and sort alphabetically
+
 
 def init():
     load_dotenv(override=True)
@@ -93,19 +104,29 @@ thinking_verbs = [
 ]
 
 
-async def chat_wrapper(message, history, thread_id, debug):
+async def chat_wrapper(
+    message, history, thread_id, debug, preferred_language="English"
+):
     if debug:
-        async for chunk in chat_streaming(debug, message, history, thread_id):
+        async for chunk in chat_streaming(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        ):
             yield chunk
     else:
-        response = chat(debug, message, history, thread_id)
+        response = chat(
+            debug, message, history, thread_id, preferred_language=preferred_language
+        )
         yield response
 
 
-def chat(debug_mode, message, history, thread_id):
+def chat(debug_mode, message, history, thread_id, preferred_language="English"):
     config = {"configurable": {"thread_id": thread_id}}
     response = graph.invoke(
-        {"debug_mode": debug_mode, "messages": [{"role": "user", "content": message}]},
+        {
+            "debug_mode": debug_mode,
+            "messages": [{"role": "user", "content": message}],
+            "language": preferred_language,
+        },
         config=config,
     )
     return response["messages"][-1].content
@@ -140,10 +161,13 @@ def get_args_for_toolcall(tool_calls_buffer: dict, tool_call_id: str):
     )
 
 
-async def chat_streaming(debug_mode: bool, message, history, thread_id):
+async def chat_streaming(
+    debug_mode: bool, message, history, thread_id, preferred_language="English"
+):
     state = {
         "debug_mode": debug_mode,
         "messages": (history or []) + [{"role": "user", "content": message}],
+        "language": preferred_language,
     }
     config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 15}
     start_time = time.time()
@@ -184,9 +208,7 @@ async def chat_streaming(debug_mode: bool, message, history, thread_id):
     truncated = (full[:MAX_CONTENT] + "…") if len(full) > MAX_CONTENT else full
 
     def generate_processing_message():
-        return (
-            f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
-        )
+        return f"<div class='thinking-bubble'><em>🤔{random.choice(thinking_verbs)} ...</em></div>"
 
     if (
         not isinstance(msg, ToolMessage)
@@ -197,9 +219,7 @@ async def chat_streaming(debug_mode: bool, message, history, thread_id):
     if isinstance(msg, ToolMessage):
         logger.debug("tool message = %s", msg)
 
-        html = (
-            f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
-        )
+        html = f"<div class='thinking-bubble'><em>🤔 {msg.name} tool: {random.choice(thinking_verbs)} ...</em></div>"
         yield f"### { ' → '.join(node_tree)}\n{html}"
     elif isinstance(msg, AIMessageChunk):
 
@@ -438,6 +458,9 @@ with gr.Blocks(
 
     gr.Markdown(value="------")
     debug_checkbox = gr.Checkbox(label="Debug (Streaming)", value=True)
+    preferred_language = gr.Dropdown(
+        choices=get_all_languages(), value="English", label="Preferred Language"
+    )
     chatbot = gr.Chatbot(
         elem_id="chatbot",
         avatar_images=("assets/avatar_user.png", "assets/adiyen_bot.png"),
@@ -453,9 +476,9 @@ with gr.Blocks(
     chatInterface = gr.ChatInterface(
         title="Sanatan-AI",
         fn=chat_wrapper,
-        additional_inputs=[thread_id, debug_checkbox],
+        additional_inputs=[thread_id, debug_checkbox, preferred_language],
         chatbot=chatbot,
         textbox=message_textbox,
     )
 
-# app.launch()
+# app.launch()
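
For reference, the new `get_all_languages()` helper is self-contained and can be exercised outside the app. A minimal sketch, assuming only that `pycountry` is installed (the printed count is illustrative):

```python
# Standalone check of the helper added above (assumes pycountry is installed).
import pycountry

def get_all_languages():
    # pycountry.languages iterates ISO 639 records; the hasattr guard skips
    # any record that lacks an English name.
    languages = [lang.name for lang in pycountry.languages if hasattr(lang, "name")]
    return sorted(set(languages))  # deduplicate, then sort alphabetically

langs = get_all_languages()
print(len(langs))          # several thousand ISO 639-3 language names
assert "English" in langs  # the dropdown's default value must be a valid choice
```

Since `gr.ChatInterface` passes `additional_inputs` positionally after `(message, history)`, the new `preferred_language` dropdown must stay last in the list so it lines up with the final parameter of `chat_wrapper`.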
embeddings.py CHANGED
@@ -27,30 +27,47 @@ def chunk_text(text: str, max_tokens: int = 1000) -> list[str]:
     tokens = tokenizer.encode(text)
     return [tokenizer.decode(tokens[i:i+max_tokens]) for i in range(0, len(tokens), max_tokens)]
 
+import numpy as np
+
+EMBED_DIM = 3072  # dimension of text-embedding-3-large
+
 def _get_openai_embedding(texts: list[str]) -> list[list[float]]:
     """Get embeddings for a list of texts. If a text is too long, chunk + average."""
     final_embeddings = []
 
     for text in texts:
+        if not text or not isinstance(text, str) or not text.strip():
+            # fallback: skip or append zero vector
+            final_embeddings.append([0.0] * EMBED_DIM)
+            continue
+
         # Split into chunks if too long
         if len(tokenizer.encode(text)) > 8192:
             chunks = chunk_text(text)
         else:
             chunks = [text]
 
-        # Call API on all chunks at once
-        response = client.embeddings.create(
-            model="text-embedding-3-large",
-            input=chunks
-        )
-        chunk_embeddings = [np.array(d.embedding) for d in response.data]
-
-        # Average embeddings if multiple chunks
-        avg_embedding = np.mean(chunk_embeddings, axis=0)
-        final_embeddings.append(avg_embedding.tolist())
+        # Clean chunks
+        clean_chunks = [c.strip() for c in chunks if isinstance(c, str) and c.strip()]
+        if not clean_chunks:
+            final_embeddings.append([0.0] * EMBED_DIM)
+            continue
+
+        try:
+            response = client.embeddings.create(
+                model="text-embedding-3-large",
+                input=clean_chunks
+            )
+            chunk_embeddings = [np.array(d.embedding) for d in response.data]
+            avg_embedding = np.mean(chunk_embeddings, axis=0)
+            final_embeddings.append(avg_embedding.tolist())
+        except Exception as e:
+            print(f"Embedding failed for text[:100]={text[:100]!r}, error={e}")
+            final_embeddings.append([0.0] * EMBED_DIM)  # fallback
 
     return final_embeddings
 
+
 embedding_cache = {}
 
 def get_embedding(texts: list[str], backend: Literal["hf","openai"] = "hf") -> list[list[float]]:
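
The reworked `_get_openai_embedding` now degrades gracefully: empty or whitespace-only inputs get a zero vector instead of raising, and per-text API failures are caught. A minimal sketch of the chunk-average-fallback flow, with a hypothetical `fake_embed` stub standing in for `client.embeddings.create` (the stub is an assumption for illustration; `EMBED_DIM` matches the 3072-dimensional output of text-embedding-3-large):

```python
# Offline illustration of the new fallback behavior; no API key required.
import numpy as np

EMBED_DIM = 3072  # output dimension of text-embedding-3-large

def fake_embed(chunks: list[str]) -> list[np.ndarray]:
    # Hypothetical stand-in for client.embeddings.create: one vector per chunk.
    return [np.full(EMBED_DIM, float(len(c))) for c in chunks]

def embed_with_fallback(text) -> list[float]:
    if not isinstance(text, str) or not text.strip():
        return [0.0] * EMBED_DIM  # same zero-vector fallback the commit adds
    chunks = [text]  # chunk_text() would split here past the 8192-token limit
    vectors = fake_embed([c.strip() for c in chunks])
    return np.mean(vectors, axis=0).tolist()  # average the chunk embeddings

print(embed_with_fallback("   ")[:3])    # [0.0, 0.0, 0.0]: fallback path
print(embed_with_fallback("hello")[:3])  # [5.0, 5.0, 5.0]: averaged stub vectors
```

One caveat: a zero vector has no direction, so cosine similarity against it is undefined; downstream search code should treat these entries as "no embedding" rather than as a real match.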
modules/nodes/init.py CHANGED
@@ -6,7 +6,11 @@ from modules.nodes.state import ChatState
 
 def init_system_prompt_node(state: ChatState) -> ChatState:
     messages = state["messages"] or []
-    initialized = state["initialized"] if "initialized" in state else False
+    initialized = state["initialized"] if "initialized" in state else False
+
+    if "language" not in state:
+        # Set default language
+        state["language"] = "English"
 
     # Check if system prompts were already added
     if not initialized:
@@ -76,7 +80,7 @@ Example user queries and tool usage:
 """
         ),
         SystemMessage(
-            content="""You are a knowledgeable assistant on the scripture *{collection_name}*, well-versed in **Sanskrit**, **English** and **Tamil**.
+            content="""You are a knowledgeable assistant on the scripture *{collection_name}*, well-versed in **Sanskrit**, **{user_preferred_language}** and **Tamil**.
 You must answer the question using **only** the content from *{collection_name}* provided in the context below.
 - Do **not** bring in information from **any other scripture or source**, or from prior knowledge, even if the answer seems obvious or well-known.
 - Do **not** quote any Sanskrit/Tamil verses unless they appear **explicitly** in the provided context.
@@ -88,7 +92,8 @@ If the answer asks for translation to another language of their choice and you a
 If the answer WAS indeed found in the context, use the following response format (in Markdown), otherwise clearly state **"I do not have enough information from the {collection_name} to answer this. I searched using {search_methodology}. Do you want me to try another search like {alternative_searchmethod}?"**
 
 ### 🧾 Answer
-- Present a brief summary of your response in concise **English**. Mention only the scripture(s), chapter(s) and verse number(s) available if multiple matches are available.
+- Present a brief summary of your response in concise **{user_preferred_language}**. Mention only the scripture(s), chapter(s) and verse number(s) available if multiple matches are available.
+- This needs to begin with "`author` says".
 
 The following format should be used to show only the most relevant match. Do not show all matches at once.
 
@@ -98,6 +103,9 @@ The following format should be used to show only the most relevant match. Do not
 ### 🕮 Chapter Title(s)
 - Mention the chapter(s) from which the references were taken. Use the field *title* here from the context if available. For example `TVM 1.8.3`
 
+### 🕮 Author(s)
+- Mention the name of the Author. In the case of divya_prabandham, it is the `azhwar_name`
+
 ### 🕮 Verse Number(s)
 - Mention the *verse number* from which the references were taken.
 
@@ -117,16 +125,16 @@ The following format should be used to show only the most relevant match. Do not
 - Do not translate, transliterate, or interpret.
 - Do not hallucinate or generate new verses.
 - Output should only be the **cleaned, original verses**.
-- The output in this section **MUST** be in native script, not English or transliterated English.
+- The output in this section **MUST** be in native script, not {user_preferred_language} or transliterated {user_preferred_language}.
 > If you are unsure about a character, leave it as it is rather than guessing.
 
 
-### 📜 English Transliteration(s)
-- For each verse above, provide the **matching English transliteration**.
-- Maintain the **same order** as the verses listed above.
+### 📜 {user_preferred_language} Transliteration(s)
+- If `{user_preferred_language}` is the SAME as the native verse language, **omit this entire section completely** (do not output even the heading).
+- Otherwise, provide the transliterations in {user_preferred_language}, matching the order of verses above.
 
-### 📜 English Translation(s)
-- Provide the **English meaning** for each verse listed above.
+### 📜 {user_preferred_language} Translation(s)
+- Provide the **{user_preferred_language} meaning** for each verse listed above.
 - Again, follow the **same order**.
 - Do **not** repeat the original verse here — just the translation.
@@ -155,6 +163,20 @@ Respond in **Markdown** format only. Ensure native Sanskrit/Tamil verses are alw
         ),
     ]
     state["initialized"] = True
+
+    state["messages"].append(
+        SystemMessage(
+            content=(
+                f"Note: `user_preferred_language` is {state['language']}. "
+                f"Carefully translate all other sections (including the section headings) in the response "
+                f"**except the Native verses** to {state['language']}. "
+                f"While translating, meticulously correct any spelling mistakes, typos, conversion errors, "
+                f"and remove any untranslated words or foreign characters. "
+                f"Ensure the output text is **fully natural, grammatically correct, and orthographically valid** "
+                f"in {state['language']}."
+            )
+        )
+    )
     state["tool_calls"] = 0
     state["seen_tool_calls"] = set()
     state["skip_tool"] = False
modules/nodes/state.py CHANGED
@@ -9,4 +9,5 @@ class ChatState(TypedDict):
     tool_calls: int
     seen_tool_calls: set[tuple[str, str]]  # (tool_name, params_hash)
     skip_tool: bool
-    initialized : bool
+    initialized : bool
+    language : str
pyproject.toml CHANGED
@@ -17,5 +17,6 @@ dependencies = [
     "langchain-openai>=0.3.28",
     "langgraph>=0.6.2",
     "oauth2client>=4.1.3",
+    "pycountry>=24.6.1",
     "sentence-transformers>=5.0.0",
 ]
requirements.txt CHANGED
@@ -332,6 +332,8 @@ pyasn1-modules==0.4.2
     # oauth2client
 pybase64==1.4.2
     # via chromadb
+pycountry==24.6.1
+    # via sanatan-ai (pyproject.toml)
 pydantic==2.11.7
     # via
         # chromadb
uv.lock CHANGED
@@ -2273,6 +2273,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3c/52/5600104ef7b85f89fb8ec54f73504ead3f6f0294027e08d281f3cafb5c1a/pybase64-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:f25140496b02db0e7401567cd869fb13b4c8118bf5c2428592ec339987146d8b", size = 31600, upload-time = "2025-07-27T13:05:52.24Z" },
 ]
 
+[[package]]
+name = "pycountry"
+version = "24.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/76/57/c389fa68c50590881a75b7883eeb3dc15e9e73a0fdc001cdd45c13290c92/pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", size = 6043910, upload-time = "2024-06-01T04:12:15.05Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b1/ec/1fb891d8a2660716aadb2143235481d15ed1cbfe3ad669194690b0604492/pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f", size = 6335189, upload-time = "2024-06-01T04:11:49.711Z" },
+]
+
 [[package]]
 name = "pycparser"
 version = "2.22"
@@ -2750,6 +2759,7 @@ dependencies = [
     { name = "langchain-openai" },
     { name = "langgraph" },
     { name = "oauth2client" },
+    { name = "pycountry" },
     { name = "sentence-transformers" },
 ]
 
@@ -2767,6 +2777,7 @@ requires-dist = [
     { name = "langchain-openai", specifier = ">=0.3.28" },
     { name = "langgraph", specifier = ">=0.6.2" },
     { name = "oauth2client", specifier = ">=4.1.3" },
+    { name = "pycountry", specifier = ">=24.6.1" },
     { name = "sentence-transformers", specifier = ">=5.0.0" },
 ]