import os
import traceback

from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# Validate API keys
if not PINECONE_API_KEY:
    raise ValueError("❌ PINECONE_API_KEY not found in environment variables")
if not GEMINI_API_KEY:
    raise ValueError("❌ GEMINI_API_KEY not found in environment variables")

os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "doc-embeddings"

# Create the index only if it does not exist yet; calling delete_index on a
# missing index would raise, so no delete is needed here.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding size of MiniLM-L12-v2
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(index_name)
encoder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cpu"},
)
def encode(pdf_id, page_numb, docs, encoder=encoder):
    """Embed document chunks and upsert them into Pinecone."""
    if not docs:
        print("⚠️ No documents to encode")
        return
    try:
        embeddings = encoder.embed_documents(docs)
        vectors = [
            (f"{pdf_id}_{page_numb}_{i}", emb, {
                "pdf_id": str(pdf_id),
                "chunk_index": i,
                "page_no": page_numb,
                "text": docs[i],
            })
            for i, emb in enumerate(embeddings)
        ]
        index.upsert(vectors, namespace=str(pdf_id))
        print(f"✅ {len(vectors)} chunks stored in Pinecone for {pdf_id} page {page_numb}")
    except Exception as e:
        print(f"❌ Error encoding documents: {e}")
# Broad retrieval query covering the topics the summary should surface, given
# in both English and Malayalam so the multilingual encoder matches either language.
query = (
    "Key organizational operations, critical urgent tasks and deadlines, compliance and regulatory updates, "
    "inter-departmental coordination issues, staffing and HR priorities, safety bulletins, procurement status, "
    "knowledge retention challenges, and strategic initiatives impacting timely decision-making and operational efficiency. "
    "Financial performance, budgets, payments, audits, cost control, funding, procurement finance. "
    # Malayalam rendering of the same query:
    "പ്രധാന സംഘടനാ പ്രവർത്തനങ്ങൾ, അടിയന്തരമായ നിർണായക ജോലികളും അവസാന തീയതികളും, അനുസരണവും നിയന്ത്രണാത്മകമായ പുതുക്കലുകളും, അന്തർ-വകുപ്പ് ഏകോപന പ്രശ്നങ്ങൾ, സ്റ്റാഫിംഗ്യും മാനവ വിഭവശേഷി മുൻഗണനകളും, സുരക്ഷാ ബുള്ളറ്റിനുകൾ, വാങ്ങൽ നില, അറിവ് സംരക്ഷണ വെല്ലുവിളികൾ, സമയബന്ധിതമായ തീരുമാനം കൈക്കൊള്ളലിനെയും പ്രവർത്തന കാര്യക്ഷമതയെയും ബാധിക്കുന്ന തന്ത്രപരമായ പ്രവർത്തനങ്ങൾ. "
    "സാമ്പത്തിക പ്രകടനം, ബജറ്റുകൾ, പേയ്മെന്റുകൾ, ഓഡിറ്റുകൾ, ചെലവ് നിയന്ത്രണം, ഫണ്ടിംഗ്, വാങ്ങൽ ധനകാര്യം."
)
def query_pinecone_top_k(pdf_id, top_k=10, query=query):
    """Query Pinecone for the chunks most relevant to the summary query."""
    print(f"\n🔍 Querying Pinecone for pdf_id: {pdf_id}")
    try:
        q_emb = encoder.embed_query(query)
        results = index.query(
            vector=q_emb,
            top_k=top_k,
            include_metadata=True,
            namespace=str(pdf_id),
        )
        docs = [
            Document(
                page_content=match["metadata"].get("text", ""),
                metadata=match["metadata"],
            )
            for match in results.get("matches", [])
            if match["metadata"].get("text", "").strip()  # filter out empty texts
        ]
        if not docs:
            # Fallback: fetch whatever is stored in the namespace.
            # NOTE: Pinecone may reject an all-zero query vector on a cosine
            # index, in which case this fallback needs a non-zero probe vector.
            print("⚠️ No relevant chunks found with semantic search, fetching all chunks...")
            all_results = index.query(
                vector=[0.0] * 384,
                top_k=top_k,
                include_metadata=True,
                namespace=str(pdf_id),
            )
            docs = [
                Document(
                    page_content=match["metadata"].get("text", ""),
                    metadata=match["metadata"],
                )
                for match in all_results.get("matches", [])
                if match["metadata"].get("text", "").strip()
            ]
        print(f"✅ Retrieved {len(docs)} chunks from Pinecone")
        # Debug: print a preview of the first chunk
        if docs:
            print(f"📄 First chunk preview: {docs[0].page_content[:200]}...")
        else:
            print("❌ No chunks found for this PDF!")
        return docs
    except Exception as e:
        print(f"❌ Error querying Pinecone: {e}")
        return []
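
# A usage sketch for query_pinecone_top_k(); "circular_2024_17" is a
# hypothetical pdf_id whose chunks were previously stored via encode():
#
# docs = query_pinecone_top_k("circular_2024_17", top_k=5)
# for d in docs:
#     print(d.metadata["page_no"], d.page_content[:80])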
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert organizational analyst. Generate a brief, actionable summary that highlights the most important and urgent points "
        "from the given document chunks. The summary should focus on tasks department heads need to act on immediately, critical deadlines, compliance, "
        "and cross-department coordination issues. Strictly use only the provided context.\n\n"
        "If the text is in English, write the summary in English; otherwise, write it in a hybrid of Malayalam and English.\n"
        "Structure:\n"
        "1. Overview of Main Operations and Activities\n"
        "2. Critical Urgent Tasks and Immediate Deadlines\n"
        "3. Compliance and Regulatory Highlights\n"
        "4. Key Departmental Responsibilities and Coordination Needs\n"
        "5. Safety, Staffing, Procurement, and Strategic Initiatives\n"
    ),
    ("user", "Summarize the following document accordingly:\n\n{context}"),
])
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=0,
    max_tokens=1000,
    timeout=60,
)
output_parser = StrOutputParser()

# "Stuff" chain: concatenates the retrieved Documents into {context} and
# calls the LLM once.
chain = create_stuff_documents_chain(llm, prompt=prompt, output_parser=output_parser)
def create_summary(pdf_id):
    """Generate a summary from the retrieved chunks."""
    print(f"\n📝 Generating summary for {pdf_id}...")
    try:
        # Step 1: Retrieve chunks
        docs = query_pinecone_top_k(pdf_id)
        if not docs:
            error_msg = f"❌ No documents found in Pinecone for pdf_id: {pdf_id}"
            print(error_msg)
            return error_msg

        # Step 2: Generate summary
        print(f"🤖 Sending {len(docs)} chunks to Gemini API...")
        try:
            summary = chain.invoke({"context": docs})

            # Validate output; retry once with fewer chunks on an empty response
            if not summary or not summary.strip():
                print("⚠️ Gemini returned empty response, retrying with fewer chunks...")
                if len(docs) > 5:
                    docs = docs[:5]
                    summary = chain.invoke({"context": docs})

            if summary and summary.strip():
                print("✅ Summary generated successfully!")
                print(f"📊 Summary length: {len(summary)} characters")
                return summary
            else:
                error_msg = "❌ Gemini API returned empty summary"
                print(error_msg)
                return error_msg
        except Exception as api_error:
            print(f"❌ Gemini API error: {api_error}")
            # Retry with reduced context
            if len(docs) > 3:
                print("🔄 Retrying with top 3 chunks only...")
                try:
                    summary = chain.invoke({"context": docs[:3]})
                    if summary and summary.strip():
                        print("✅ Summary generated with reduced context")
                        return summary
                except Exception as retry_error:
                    print(f"❌ Retry also failed: {retry_error}")
            return f"Summary generation failed: {api_error}"
    except Exception as e:
        error_msg = f"❌ Unexpected error in create_summary: {e}"
        print(error_msg)
        traceback.print_exc()
        return error_msg
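
if __name__ == "__main__":
    # End-to-end sketch. The pdf_id and chunk texts below are hypothetical; in
    # the real pipeline they come from an upstream PDF extraction/splitting step.
    # Note: Pinecone upserts are eventually consistent, so a query issued
    # immediately after encode() may briefly return no matches.
    demo_pdf_id = "demo_pdf"
    encode(
        pdf_id=demo_pdf_id,
        page_numb=1,
        docs=[
            "Department heads must submit Q3 compliance reports by Friday.",
            "Procurement of safety equipment is pending budget approval.",
        ],
    )
    print(create_summary(demo_pdf_id))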