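"""RAG summarization pipeline.

Embeds PDF page chunks with a multilingual sentence-transformer, stores them
in a Pinecone serverless index, and asks Gemini (via LangChain) for a brief,
action-oriented summary of the most decision-relevant chunks.
"""
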
from pinecone import Pinecone
import os
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from pinecone import ServerlessSpec

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Validate API keys
if not PINECONE_API_KEY:
    raise ValueError("❌ PINECONE_API_KEY not found in environment variables")
if not GEMINI_API_KEY:
    raise ValueError("❌ GEMINI_API_KEY not found in environment variables")

os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "doc-embeddings"
if not pc.has_index(index_name):
    # Create the index only when it does not already exist (the previous
    # delete_index call here would fail on a nonexistent index).
    # dimension=384 matches paraphrase-multilingual-MiniLM-L12-v2 embeddings.
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

encoder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    model_kwargs={"device": "cpu"}
)


def encode(pdf_id, page_numb, docs, encoder=encoder):
    """Embed document chunks and upsert them into Pinecone.

    Each vector ID encodes the PDF, page, and chunk position so a chunk can
    be traced back to its source; the raw text is kept in the metadata.
    """
    if not docs:
        print("⚠️  No documents to encode")
        return

    try:
        embeddings = encoder.embed_documents(docs)
        vectors = [
            (f"{pdf_id}_{page_numb}_{i}", emb, {
                "pdf_id": str(pdf_id),
                "chunk_index": i,
                "page_no": page_numb,
                "text": docs[i]
            })
            for i, emb in enumerate(embeddings)
        ]
        index.upsert(vectors, namespace=str(pdf_id))
        print(f"✅ {len(vectors)} chunks stored in Pinecone for {pdf_id} page {page_numb}")
    except Exception as e:
        print(f"❌ Error encoding documents: {e}")

# Retrieval query used to surface the most decision-relevant chunks. The
# Malayalam sentences repeat the English topics so the multilingual embedding
# model can match documents written in either language. (Trailing spaces are
# needed so the implicitly concatenated literals do not run together.)
query = (
    "Key organizational operations, critical urgent tasks and deadlines, compliance and regulatory updates, "
    "inter-departmental coordination issues, staffing and HR priorities, safety bulletins, procurement status, "
    "knowledge retention challenges, and strategic initiatives impacting timely decision-making and operational efficiency. "
    "Financial performance, budgets, payments, audits, cost control, funding, procurement finance. "
    "പ്രധാന സംഘടനാ പ്രവർത്തനങ്ങൾ, അടിയന്തരമായ നിർണായക ജോലികളും അവസാന തീയതികളും, അനുസരണവും നിയന്ത്രണാത്മകമായ പുതുക്കലുകളും, അന്തർ-വകുപ്പ് ഏകോപന പ്രശ്നങ്ങൾ, സ്റ്റാഫിംഗ്‌യും മാനവ വിഭവശേഷി മുൻഗണനകളും, സുരക്ഷാ ബുള്ളറ്റിനുകൾ, വാങ്ങൽ നില, അറിവ് സംരക്ഷണ വെല്ലുവിളികൾ, സമയബന്ധിതമായ തീരുമാനം കൈക്കൊള്ളലിനെയും പ്രവർത്തന കാര്യക്ഷമതയെയും ബാധിക്കുന്ന തന്ത്രപരമായ പ്രവർത്തനങ്ങൾ. "
    "സാമ്പത്തിക പ്രകടനം, ബജറ്റുകൾ, പേയ്‌മെന്റുകൾ, ഓഡിറ്റുകൾ, ചെലവ് നിയന്ത്രണം, ഫണ്ടിംഗ്, വാങ്ങൽ ധനകാര്യം."
)

def query_pinecone_top_k(pdf_id, top_k=10, query=query):
    """Query Pinecone for relevant chunks"""
    print(f"\n🔍 Querying Pinecone for pdf_id: {pdf_id}")

    try:
        q_emb = encoder.embed_query(query)
        results = index.query(
            vector=q_emb,
            top_k=top_k,
            include_metadata=True,
            namespace=str(pdf_id)
        )

        docs = [
            Document(
                page_content=match['metadata'].get('text', ''),
                metadata=match['metadata']
            )
            for match in results.get('matches', [])
            if match['metadata'].get('text', '').strip()  # Filter out empty texts
        ]

        if not docs:
            # Fallback: a zero vector carries no semantic signal, so this
            # simply pulls whichever chunks exist in the namespace rather
            # than the most relevant ones.
            print("⚠️  No relevant chunks found with semantic search, fetching all chunks...")
            all_results = index.query(
                vector=[0.0] * 384,
                top_k=top_k,
                include_metadata=True,
                namespace=str(pdf_id)
            )
            docs = [
                Document(
                    page_content=match["metadata"].get("text", ""),
                    metadata=match["metadata"]
                )
                for match in all_results.get("matches", [])
                if match["metadata"].get("text", "").strip()
            ]

        print(f"✅ Retrieved {len(docs)} chunks from Pinecone")

        # Debug: Print first chunk preview
        if docs:
            print(f"📄 First chunk preview: {docs[0].page_content[:200]}...")
        else:
            print("❌ No chunks found for this PDF!")

        return docs

    except Exception as e:
        print(f"❌ Error querying Pinecone: {e}")
        return []
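
# A minimal retrieval sketch (hypothetical pdf_id), assuming encode() has
# already populated that namespace:
#
#     docs = query_pinecone_top_k("report-2024", top_k=5)
#     for d in docs:
#         print(d.metadata["page_no"], d.page_content[:80])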

prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert organizational analyst. Generate a brief, actionable summary that highlights the most important and urgent points "
        "from the given document chunks. The summary should focus on tasks department heads need to act on immediately, critical deadlines, compliance, "
        "and cross-department coordination issues. Use only the provided context strictly.\n\n"
        "If the text is in english print summary in english else print in hybrid language malayalam and english\n"
        "Structure:\n"
        "1. Overview of Main Operations and Activities\n"
        "2. Critical Urgent Tasks and Immediate Deadlines\n"
        "3. Compliance and Regulatory Highlights\n"
        "4. Key Departmental Responsibilities and Coordination Needs\n"
        "5. Safety, Staffing, Procurement, and Strategic Initiatives\n"
    ),
    ("user", "Summarize the following document accordingly:\n\n{context}")
])

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=0,    # deterministic output for repeatable summaries
    max_tokens=1000,  # cap the summary length
    timeout=60,       # request timeout in seconds
)

output_parser = StrOutputParser()
chain = create_stuff_documents_chain(llm, prompt=prompt, output_parser=output_parser)
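
# create_stuff_documents_chain "stuffs" every retrieved Document into the
# prompt's {context} slot and makes a single LLM call; that is why
# create_summary() below retries with fewer chunks when a call fails or
# comes back empty.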


def create_summary(pdf_id):
    """Generate summary from retrieved chunks"""
    print(f"\n📝 Generating summary for {pdf_id}...")

    try:
        # Step 1: Retrieve chunks
        docs = query_pinecone_top_k(pdf_id)

        if not docs:
            error_msg = f"❌ No documents found in Pinecone for pdf_id: {pdf_id}"
            print(error_msg)
            return error_msg

        # Step 2: Generate summary
        print(f"🤖 Sending {len(docs)} chunks to Gemini API...")

        try:
            summary = chain.invoke({"context": docs})

            # Validate output
            if not summary or not summary.strip():
                print("⚠️  Gemini returned empty response, retrying with fewer chunks...")
                if len(docs) > 5:
                    docs = docs[:5]
                    summary = chain.invoke({"context": docs})

            if summary and summary.strip():
                print("✅ Summary generated successfully!")
                print(f"📊 Summary length: {len(summary)} characters")
                return summary
            else:
                error_msg = "❌ Gemini API returned empty summary"
                print(error_msg)
                return error_msg

        except Exception as api_error:
            print(f"❌ Gemini API error: {api_error}")

            # Retry with reduced context
            if len(docs) > 3:
                print("🔄 Retrying with top 3 chunks only...")
                try:
                    summary = chain.invoke({"context": docs[:3]})
                    if summary and summary.strip():
                        print("✅ Summary generated with reduced context")
                        return summary
                except Exception as retry_error:
                    print(f"❌ Retry also failed: {retry_error}")

            return f"Summary generation failed: {str(api_error)}"

    except Exception as e:
        error_msg = f"❌ Unexpected error in create_summary: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg
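

# A minimal end-to-end sketch; the pdf_id and chunk texts are hypothetical
# placeholders, and running it requires valid PINECONE_API_KEY and
# GEMINI_API_KEY values in the environment.
if __name__ == "__main__":
    import time

    sample_chunks = [
        "Quarterly audit report must reach the finance board by March 31.",
        "HR to finalize the staffing plan for the new regional office this week.",
    ]
    encode(pdf_id="demo-pdf", page_numb=1, docs=sample_chunks)

    # Pinecone indexing is eventually consistent, so freshly upserted
    # vectors may take a few seconds to become queryable.
    time.sleep(10)
    print(create_summary("demo-pdf"))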