from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import os
from openai import OpenAI # or your preferred client
from pinecone import Pinecone, ServerlessSpec
import gradio as gr
import json
from tqdm.auto import tqdm
# Set up API keys from environment variables.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)  # OpenAI client; not used below, kept for optional extensions
papi_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("PINECONE_ENVIRONMENT")  # not required for serverless indexes
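# Optional sanity check (an addition, not in the original app): fail fast with a
# clear message if the required Pinecone key is missing from the environment.
if not papi_key:
    raise RuntimeError("PINECONE_API_KEY must be set before running this app.")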
# Load a slice of the Quora dataset and consolidate its question pairs into a
# single deduplicated list.
dataset = load_dataset('quora', trust_remote_code=True, split='train[240000:290000]')
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
questions = list(set(questions))  # deduplicate repeated questions
# Load the embedding model: all-MiniLM-L6-v2 produces 384-dimensional vectors
# and runs comfortably on CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
# Initialize the Pinecone client and pick an index name.
pc = Pinecone(api_key=papi_key)
INDEX_NAME = 'semanticsearch'
# Delete the index if it already exists, so each run starts fresh.
if INDEX_NAME in [index.name for index in pc.list_indexes()]:
    pc.delete_index(INDEX_NAME)
# Create a new serverless index on AWS, sized to the model's embedding
# dimension and using cosine similarity.
pc.create_index(name=INDEX_NAME,
                dimension=model.get_sentence_embedding_dimension(),
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1'))
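# Serverless index creation is asynchronous, so wait until the index reports
# ready before connecting and upserting (an illustrative addition following
# the Pinecone quickstart pattern).
import time
while not pc.describe_index(INDEX_NAME).status['ready']:
    time.sleep(1)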
index = pc.Index(INDEX_NAME)
batch_size = 200
vector_limit = 80000
# Cap the number of questions to embed and index.
questions = questions[:vector_limit]
for i in tqdm(range(0, len(questions), batch_size)):
    # Find the end of this batch.
    i_end = min(i + batch_size, len(questions))
    # Create string IDs for the batch of vectors.
    ids = [str(x) for x in range(i, i_end)]
    # Store the raw question text as metadata so results are human-readable.
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # Embed the batch of questions.
    xc = model.encode(questions[i:i_end])
    # Build (id, vector, metadata) tuples for the upsert.
    records = list(zip(ids, xc.tolist(), metadatas))
    # Upsert the batch to Pinecone.
    index.upsert(vectors=records)
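# Optional verification (an addition): confirm the upserts landed by checking
# the index statistics.
stats = index.describe_index_stats()
print(f"Vectors in index: {stats['total_vector_count']}")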
def run_query(query):
    # Embed the user query with the same model used for indexing.
    embedding = model.encode(query).tolist()
    # Query Pinecone for the five nearest neighbours.
    results = index.query(top_k=5, vector=embedding, include_metadata=True, include_values=False)
    answers = []
    for result in results['matches']:
        answers.append([round(result['score'], 2), result['metadata']['text']])
    return "\n".join(f"Similarity percentage: {score * 100:.2f} % , {text}" for score, text in answers)
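# Quick smoke test (optional, illustrative; the query string is just an example):
# print(run_query("How can I learn to play the guitar?"))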
# Create the Gradio interface.
demo = gr.Interface(
    fn=run_query,
    inputs=gr.Textbox(label="User Input", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Matching Questions from Vector Database"),
)
# Launch the app.
demo.launch(share=True)