NeerajAhire committed on
Commit 925fa6b · verified · 1 Parent(s): 28e97f2

Upload mcp_server.py

Files changed (1): Server/mcp_server.py +596 -0
Server/mcp_server.py ADDED
@@ -0,0 +1,596 @@
import os
import logging
import socket
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import fitz  # PyMuPDF

from fastmcp import FastMCP
from openai import OpenAI
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---------- Config ----------
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)-7s %(message)s",
    datefmt="%m/%d/%y %H:%M:%S",
)
logger = logging.getLogger("rag-mcp-server")

mcp = FastMCP(name="rag-mcp-server", version="1.1.0")

# Paths
EXCEL_PATH = "Data/IIT_Opening_Closing_Ranks.xlsx"

PDF_FILES: Dict[str, str] = {
    "eng_design": "Data/Engineering_design_Course_Details.pdf",
    "aero_curriculum": "Data/Aerospace_curriculum.pdf",
    "nirf_2024": "Data/IR2024_Report.pdf",
    "iitm_curriculum_2024": "Data/Curriculum_-_2024_Batch_B.Tech_Version_1 (1).pdf",
    "iitb_cse_curriculum": "Data/IITB_CSE_Btech_Curriculum.pdf",
    "iitb_civil_curriculum": "Data/IITB_Civil_Btech_Curriculum.pdf",
    "iitb_mech_curriculum": "Data/IITB_Mechanical_Engg_Curriculum.pdf",
    "iitb_elec_curriculum": "Data/IITD_Electrical_Btech_Curriculum.pdf",
    "iitd_allprogrammes_curriculum": "Data/IITD_Programmes_Curriculum.pdf",
}

LINK_FILES: Dict[str, str] = {
    "linkedin_profile_iit_d": "https://alumni.iitd.ac.in/distinguished-alum-awards",
    "linkedin_profile_iit_m": "https://www.vaave.com/blog/iit-madras-notable-alumni/",
    "linkedin_profile_iit_b": "https://acr.iitbombay.org/distinguished-alumnus/",
    "linkedin_profile_iit_kgp": "http://alumni.iitkgp.ac.in/",
}

# Models
EMBED_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-4o-mini"
TOP_K = 5
# Read the API key from the environment instead of hardcoding a secret in source.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
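# Assumed setup (not part of the original commit): the key is supplied via the
# environment, e.g. `export OPENAI_API_KEY="sk-..."` before launching the server.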

# ---------- Utility ----------
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity between a query vector `a` and each row of matrix `b`."""
    a_norm = a / (np.linalg.norm(a) + 1e-12)
    b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-12)
    return b_norm @ a_norm
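# Illustration (toy values): with a 2-row index,
#   _cosine_similarity(np.array([1.0, 0.0]), np.array([[1.0, 0.0], [0.0, 1.0]]))
# returns array([1., 0.]) -- row 0 is a perfect match, row 1 is orthogonal.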

# EXCEL PIPELINE

EXCEL_INDEX = {
    "texts": None,
    "embeddings": None,
    "row_ids": None,
    "columns": None,
}

def _excel_to_texts(excel_path: str, sheet: int | str = 0) -> Tuple[List[str], List[int], List[str]]:
    df = pd.read_excel(excel_path, sheet_name=sheet)  # requires openpyxl
    df = df.fillna("")
    cols = list(df.columns)

    texts, row_ids = [], []
    for i, row in df.iterrows():
        parts = [f"Row {i}"]
        for c in cols:
            parts.append(f"{c}: {row[c]}")
        texts.append(" | ".join(parts))
        row_ids.append(i)
    return texts, row_ids, cols

def _build_excel_index(force: bool = False, sheet: int | str = 0):
    if not force and EXCEL_INDEX["texts"] is not None and EXCEL_INDEX["embeddings"] is not None:
        return
    if not os.path.exists(EXCEL_PATH):
        raise FileNotFoundError(f"Excel not found at {EXCEL_PATH}")

    logger.info("Loading Excel and building embeddings index...")
    texts, row_ids, cols = _excel_to_texts(EXCEL_PATH, sheet)
    emb = client.embeddings.create(model=EMBED_MODEL, input=texts)
    vectors = np.array([e.embedding for e in emb.data], dtype=np.float32)

    EXCEL_INDEX.update({
        "texts": texts,
        "embeddings": vectors,
        "row_ids": row_ids,
        "columns": cols,
    })
    logger.info(f"[EXCEL INDEX] rows={len(texts)} emb.shape={vectors.shape} cols={len(cols)}")

def _retrieve_excel(question: str, top_k: int = TOP_K) -> List[Tuple[int, str]]:
    q_emb = client.embeddings.create(model=EMBED_MODEL, input=[question]).data[0].embedding
    q = np.array(q_emb, dtype=np.float32)
    sims = _cosine_similarity(q, EXCEL_INDEX["embeddings"])
    idxs = np.argsort(sims)[::-1][:top_k]
    out = [(int(EXCEL_INDEX["row_ids"][i]), EXCEL_INDEX["texts"][i]) for i in idxs]
    top_info = [(int(EXCEL_INDEX["row_ids"][i]), float(sims[i])) for i in idxs]
    logger.info(f"[EXCEL RETRIEVE] q='{question[:80]}...' top_k={top_k} -> {top_info}")
    return out


def _make_excel_prompt(question: str, retrieved_rows: List[Tuple[int, str]], subquery_context: str | None = None) -> List[dict]:
    context_lines = [f"[Row {rid}] {rtext}" for rid, rtext in retrieved_rows]
    context = "\n".join(context_lines) or "(no relevant rows found)"
    logger.info(f"[EXCEL PROMPT] context_len={len(context)}; preview:\n{context[:500]}")

    system = (
        "You are a helpful assistant. Answer the user's question STRICTLY using the provided Excel context. "
        "If the answer is not present, say you don't have enough information."
    )

    user = (
        f"Context (from Excel):\n{context}\n\n"
        f"User question: {question}\n\n"
    )
    # Append subquery_context if provided
    if subquery_context:
        user += f"Additional context:\n{subquery_context}\n\n"

    user += (
        "Instructions:\n"
        "- Use only the context above.\n"
        "- Keep answers concise and accurate.\n"
        "- Do not include any bracketed tags or citations."
    )

    return [{"role": "system", "content": system}, {"role": "user", "content": user}]

@mcp.tool("ask_excel", description="RAG over an Excel file; answer questions grounded in the sheet.")
def ask_excel(question: str, top_k: int = TOP_K, sheet: int | str = 0, temperature: float = 0.1, subquery_context: str | None = None) -> str:
    try:
        _build_excel_index(False, sheet)
        retrieved = _retrieve_excel(question, top_k)
        messages = _make_excel_prompt(question, retrieved, subquery_context)
        for m in messages:
            logger.info(f"[EXCEL MESSAGES] role={m['role']} len={len(m['content'])}")
        completion = client.chat.completions.create(model=CHAT_MODEL, messages=messages, temperature=temperature)
        answer = completion.choices[0].message.content or "I couldn't generate an answer."
        logger.info(f"[EXCEL ANSWER] len={len(answer)}; preview: {answer[:200]}")
        return answer.strip()
    except Exception as e:
        logger.exception("ask_excel failed: %s", e)
        return f"❌ Error: {e}"

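# Note: the Excel index is built lazily on the first ask_excel call and cached
# in EXCEL_INDEX, so later calls only embed the question and run the chat
# completion; call _build_excel_index(force=True) to rebuild after the
# spreadsheet changes.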
160
+
161
+ # PDF PIPELINE (Multi-file)
162
+
163
+
164
+ # Per-PDF indices
165
+ PDF_INDEXES: Dict[str, Dict[str, object]] = {key: {"chunks": None, "embeddings": None, "chunk_ids": None} for key in PDF_FILES}
166
+
167
+ # Router: keyword heuristics to quickly select a PDF
168
+ PDF_ROUTER_KEYWORDS: Dict[str, List[str]] = {
169
+ "eng_design": [
170
+ "finite element", "non-linear", "lagrangian", "continuum mechanics", "contact mechanics",
171
+ "ed5015", "ed5012", "ergonomics", "human factors", "design", "galerkin", "variational"
172
+ ],
173
+ "aero_curriculum": [
174
+ "aerospace", "b.tech", "semester", "credits", "as1010", "fluid mechanics", "gas dynamics",
175
+ "strength of materials", "lab", "workshop", "curriculum"
176
+ ],
177
+ "nirf_2024": [
178
+ "nirf", "ranking", "perception", "outreach", "inclusivity", "graduation outcome",
179
+ "research", "teaching", "learning", "resources", "department of higher education"
180
+ ],
181
+ "iitm_curriculum_2024": [
182
+ "curriculum", "credit requirements", "branch-wise", "data science", "computer science",
183
+ "electrical", "mechanical", "metallurgical", "naval architecture", "engineering physics",
184
+ "2024 batch", "2023 batch", "programme"
185
+ ],
186
+ }
187
+
188
+ # Descriptors used for embedding-based fallback routing (short, representative strings)
189
+ PDF_DESCRIPTORS: Dict[str, str] = {
190
+ "eng_design": "Engineering Design course details including ED5015 finite element methods and ED5012 human factors.",
191
+ "aero_curriculum": "IIT Madras Aerospace Engineering B.Tech curriculum semester-wise credits and course list.",
192
+ "nirf_2024": "India Rankings 2024 NIRF categories: teaching, research, graduation outcomes, outreach, inclusivity, perception.",
193
+ "iitm_curriculum_2024": "IIT Madras B.Tech curriculum 2024 batch branch-wise credit requirements across departments.",
194
+ "iitb_cse_curriculum": "IIT Bombay Computer Science Engineering B.Tech curriculum semester-wise credits and course list.",
195
+ "iitb_civil_curriculum": "IIT Bombay Civil Engineering B.Tech curriculum semester-wise credits and course list.",
196
+ "iitb_mech_curriculum": "IIT Bombay Mechanical Engineering B.Tech curriculum semester-wise credits and course list.",
197
+ "iitb_elec_curriculum": "IIT Bombay Electrical Engineering B.Tech curriculum semester-wise credits and course list.",
198
+ "iitd_allprogrammes_curriculum": "IIT Delhi All B.Tech programmes curriculum semester-wise credits and course list."
199
+ }
200
+
201
+
202
+ PDF_DESC_EMB: Dict[str, np.ndarray] = {} # cached descriptor embeddings
203
+
204
+ def _build_pdf_router_embeddings():
205
+ if PDF_DESC_EMB:
206
+ return
207
+ inputs = [PDF_DESCRIPTORS[k] for k in PDF_FILES.keys()]
208
+ emb = client.embeddings.create(model=EMBED_MODEL, input=inputs)
209
+ vecs = [np.array(e.embedding, dtype=np.float32) for e in emb.data]
210
+ for k, v in zip(PDF_FILES.keys(), vecs):
211
+ PDF_DESC_EMB[k] = v
212
+ logger.info(f"[PDF ROUTER] cached descriptor embeddings for {len(PDF_DESC_EMB)} PDFs")
213
+
def _pdf_to_chunks(pdf_path: str) -> List[str]:
    doc = fitz.open(pdf_path)
    chunks: List[str] = []
    for pno, page in enumerate(doc, start=1):
        text = page.get_text("text")
        if not text:
            continue
        # Split into paragraphs to improve retrieval granularity
        paras = [p.strip() for p in text.split("\n\n") if p.strip()]
        for para in paras:
            para = " ".join(para.split())  # collapse whitespace
            chunks.append(f"Page {pno}: {para}")
    return chunks

def _build_pdf_index(pdf_key: str, force: bool = False):
    idx = PDF_INDEXES[pdf_key]
    if not force and idx["chunks"] is not None and idx["embeddings"] is not None:
        return

    pdf_path = PDF_FILES[pdf_key]
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    logger.info(f"[PDF INDEX] building for '{pdf_key}' -> {pdf_path}")
    chunks = _pdf_to_chunks(pdf_path)
    if not chunks:
        logger.warning(f"[PDF INDEX] No text extracted for '{pdf_key}'.")
        idx["chunks"], idx["embeddings"], idx["chunk_ids"] = [], np.zeros((0, 1), dtype=np.float32), []
        return

    emb = client.embeddings.create(model=EMBED_MODEL, input=chunks)
    vectors = np.array([e.embedding for e in emb.data], dtype=np.float32)

    idx["chunks"] = chunks
    idx["embeddings"] = vectors
    idx["chunk_ids"] = list(range(len(chunks)))
    logger.info(f"[PDF INDEX] '{pdf_key}' chunks={len(chunks)} emb.shape={vectors.shape}")

def _retrieve_pdf(pdf_key: str, question: str, top_k: int = TOP_K) -> List[Tuple[int, str]]:
    idx = PDF_INDEXES[pdf_key]
    embeddings = idx["embeddings"]
    if embeddings is None or len(embeddings) == 0:
        logger.warning(f"[PDF RETRIEVE] Empty embeddings for '{pdf_key}'.")
        return []

    q_emb = client.embeddings.create(model=EMBED_MODEL, input=[question]).data[0].embedding
    q = np.array(q_emb, dtype=np.float32)
    sims = _cosine_similarity(q, embeddings)
    idxs = np.argsort(sims)[::-1][:top_k]
    out = [(int(i), idx["chunks"][i]) for i in idxs]
    top_info = [(int(i), float(sims[i])) for i in idxs]
    logger.info(f"[PDF RETRIEVE] '{pdf_key}' q='{question[:80]}...' top_k={top_k} -> {top_info}")
    return out

def _route_pdf(question: str) -> str:
    q_lower = question.lower()

    # 1) Keyword heuristic
    for key, kws in PDF_ROUTER_KEYWORDS.items():
        if any(k in q_lower for k in kws):
            logger.info(f"[PDF ROUTER] keyword matched '{key}'")
            return key

    # 2) Embedding fallback (compare question to PDF descriptors)
    _build_pdf_router_embeddings()
    q_emb = client.embeddings.create(model=EMBED_MODEL, input=[question]).data[0].embedding
    q_vec = np.array(q_emb, dtype=np.float32)
    q_vec = q_vec / (np.linalg.norm(q_vec) + 1e-12)

    keys = list(PDF_DESC_EMB.keys())
    desc_mat = np.stack([PDF_DESC_EMB[k] / (np.linalg.norm(PDF_DESC_EMB[k]) + 1e-12) for k in keys], axis=0)
    sims = desc_mat @ q_vec
    best_idx = int(np.argmax(sims))
    chosen = keys[best_idx]
    logger.info(f"[PDF ROUTER] embed sims={[(k, float(s)) for k, s in zip(keys, sims.tolist())]} -> '{chosen}'")
    return chosen
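
# Illustration: "What is the NIRF ranking of IIT Madras?" hits the keyword
# "nirf" and routes to "nirf_2024" without any embedding call, while a
# paraphrase containing no listed keyword (e.g. "Which institute topped the
# government's 2024 assessment?") falls through to the descriptor-embedding
# comparison above.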

def _make_pdf_prompt(question: str, retrieved_chunks: List[Tuple[int, str]], pdf_key: str, subquery_context: str | None = None) -> List[dict]:
    tagged_preview = [f"[{pdf_key} | Chunk {cid}] {text}" for cid, text in retrieved_chunks]
    # Join outside the f-string: backslashes inside f-string expressions are a
    # syntax error before Python 3.12.
    preview = "\n".join(tagged_preview)[:500]
    logger.info(f"[PDF PROMPT] '{pdf_key}' preview:\n{preview}")

    context_lines = [text for _, text in retrieved_chunks]
    context = "\n\n".join(context_lines) or "(no relevant chunks found)"

    system = (
        "You are a helpful assistant. Answer the user's question STRICTLY using the provided PDF context. "
        "If the answer is not present, say you don't have enough information. "
        "Do not include file names, chunk ids, or any bracketed metadata in your answer."
    )

    user = (
        f"Context:\n{context}\n\n"
        f"User question: {question}\n\n"
    )
    if subquery_context:
        user += f"Additional context:\n{subquery_context}\n\n"

    user += (
        "Instructions:\n"
        "- Use only the context above.\n"
        "- Keep answers concise.\n"
        "- Do not include any bracketed tags or source identifiers."
    )

    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]


@mcp.tool("ask_pdf", description="RAG over multiple PDFs; auto-select the best-matching document and answer.")
def ask_pdf(question: str, top_k: int = TOP_K, temperature: float = 0.1, pdf_key: str | None = None, subquery_context: str | None = None) -> str:
    try:
        # Route once and reuse the result (the old debug print re-ran routing
        # and its embedding call a second time).
        chosen = pdf_key or _route_pdf(question)
        _build_pdf_index(chosen, force=False)
        retrieved = _retrieve_pdf(chosen, question, top_k)
        messages = _make_pdf_prompt(question, retrieved, chosen, subquery_context)
        for m in messages:
            logger.info(f"[PDF MESSAGES] role={m['role']} len={len(m['content'])}")
        completion = client.chat.completions.create(model=CHAT_MODEL, messages=messages, temperature=temperature)
        answer = completion.choices[0].message.content or "I couldn't generate an answer."
        logger.info(f"[PDF ANSWER] '{chosen}' len={len(answer)}; preview: {answer[:200]}")
        return answer.strip()
    except Exception as e:
        logger.exception("ask_pdf failed: %s", e)
        return f"❌ Error: {e}"


# ---------- LINK PIPELINE ----------

LINK_DESCRIPTORS: Dict[str, str] = {
    "linkedin_profile_iit_m": "IIT Madras Alumni.",
    "linkedin_profile_iit_d": "IIT Delhi Alumni.",
    "linkedin_profile_iit_b": "IIT Bombay Alumni.",
    "linkedin_profile_iit_kgp": "IIT Kharagpur Alumni.",
}
LINK_DESC_EMB: Dict[str, np.ndarray] = {}  # cached descriptor embeddings

def _build_link_router_embeddings():
    if LINK_DESC_EMB:
        return
    inputs = [LINK_DESCRIPTORS[k] for k in LINK_FILES.keys()]
    emb = client.embeddings.create(model=EMBED_MODEL, input=inputs)
    vecs = [np.array(e.embedding, dtype=np.float32) for e in emb.data]
    for k, v in zip(LINK_FILES.keys(), vecs):
        LINK_DESC_EMB[k] = v

def _route_link(question: str) -> str:
    _build_link_router_embeddings()
    q_emb = client.embeddings.create(model=EMBED_MODEL, input=[question]).data[0].embedding
    q_vec = np.array(q_emb, dtype=np.float32)
    q_vec = q_vec / (np.linalg.norm(q_vec) + 1e-12)

    keys = list(LINK_DESC_EMB.keys())
    desc_mat = np.stack([LINK_DESC_EMB[k] / (np.linalg.norm(LINK_DESC_EMB[k]) + 1e-12) for k in keys], axis=0)
    sims = desc_mat @ q_vec
    best_idx = int(np.argmax(sims))
    chosen = keys[best_idx]
    return chosen


@mcp.tool("ask_link", description="RAG over a webpage (LinkedIn or any site); answer questions grounded in the page content.")
def ask_link(
    query: str,
    link_key: str | None = None,
    url: str | None = None,
    temperature: float = 0.1,
    subquery_context: str | None = None,
    top_k: int = TOP_K
) -> str:
    """
    Implements RAG for a webpage:
    - Loads content using LangChain WebBaseLoader.
    - Splits into chunks.
    - Embeds chunks and retrieves top_k relevant ones.
    - Builds prompt with retrieved chunks + optional subquery_context.
    """
    try:
        # Resolve URL: an explicit url wins; otherwise use a known link_key,
        # else route by the query (the old code indexed LINK_FILES with a raw
        # URL, which raised KeyError whenever url was supplied).
        if url:
            target_url = url
        else:
            if link_key not in LINK_FILES:
                link_key = _route_link(query)
            target_url = LINK_FILES.get(link_key)
        if not target_url:
            return f"❌ Error: No URL resolved for link_key='{link_key}'."

        logger.info(f"[LINK TOOL] Fetching and processing content from: {target_url}")

        # Load webpage content
        loader = WebBaseLoader(target_url, verify_ssl=False)
        documents = loader.load()
        if not documents or not documents[0].page_content.strip():
            return "❌ Error: Could not extract readable content from the URL."

        page_text = documents[0].page_content.strip()

        # Split into chunks using langchain-text-splitters
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunks = splitter.split_text(page_text)
        if not chunks:
            return "❌ Error: No chunks generated from page content."

        # Embed chunks
        emb = client.embeddings.create(model=EMBED_MODEL, input=chunks)
        chunk_vectors = np.array([e.embedding for e in emb.data], dtype=np.float32)

        # Embed question
        q_emb = client.embeddings.create(model=EMBED_MODEL, input=[query]).data[0].embedding
        q_vec = np.array(q_emb, dtype=np.float32)

        # Compute cosine similarity
        chunk_norms = np.linalg.norm(chunk_vectors, axis=1)
        q_norm = np.linalg.norm(q_vec)
        sims = (chunk_vectors @ q_vec) / (chunk_norms * q_norm + 1e-12)

        # Sort and select top_k safely
        idxs = np.argsort(sims)[::-1][:min(top_k, len(chunks))]
        retrieved_chunks = [(i, chunks[i]) for i in idxs]

        logger.info(f"[LINK RETRIEVE] top_k={top_k} -> {[(i, float(sims[i])) for i in idxs]}")

        # Build prompt
        context_lines = [text for _, text in retrieved_chunks]
        context = "\n\n".join(context_lines) or "(no relevant chunks found)"

        system = (
            "You are a helpful assistant. Answer the user's question STRICTLY using the provided webpage context. "
            "If the answer is not present, say you don't have enough information. "
            "Do not include URLs or any bracketed metadata in your answer."
        )

        user = (
            f"Context:\n{context}\n\n"
            f"User question: {query}\n\n"
        )
        if subquery_context:
            user += f"Additional context:\n{subquery_context}\n\n"

        user += (
            "Instructions:\n"
            "- Use only the context above.\n"
            "- Keep answers concise.\n"
            "- Do not include any bracketed tags or source identifiers."
        )

        messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]

        # LLM call
        completion = client.chat.completions.create(model=CHAT_MODEL, messages=messages, temperature=temperature)
        answer = completion.choices[0].message.content or "I couldn't generate an answer."
        logger.info(f"[LINK ANSWER] len={len(answer)}; preview: {answer[:200]}")
        return answer.strip()

    except Exception as e:
        logger.exception("ask_link failed: %s", e)
        return f"❌ Error: {e}"


def find_available_port(start_port=8001) -> int:
    port = start_port
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex(("127.0.0.1", port)) != 0:
                return port
        port += 1
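# Caveat: connect_ex probing is best-effort; another process could claim the
# port between this scan and mcp.run() binding it.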

if __name__ == "__main__":
    try:
        port = find_available_port(8001)
        logger.info(f"Starting RAG MCP server (Excel + multi-PDF) on port {port}")
        mcp.run(transport="sse", host="127.0.0.1", port=port)
    except Exception as e:
        logger.error(f"Failed to start server: {e}")
        print(f"Error starting server: {e}")