from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import os
from openai import OpenAI  # or your preferred client
from pinecone import Pinecone, ServerlessSpec
import gradio as gr
import json
from tqdm.auto import tqdm
# Set up API keys from the environment.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)  # or openai.api_key = api_key
papi_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("PINECONE_ENVIRONMENT")
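# Illustrative guard (not in the original listing): fail fast if the
# Pinecone key is missing instead of erroring later at client construction.
if not papi_key:
    raise RuntimeError("PINECONE_API_KEY is not set")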
# Load the dataset and consolidate its question pairs into a single
# deduplicated list.
dataset = load_dataset('quora', trust_remote_code=True, split='train[240000:290000]')
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
questions = list(set(questions))
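# Quick sanity check (illustrative addition): report how many unique
# questions survived deduplication.
print(f"{len(questions)} unique questions")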
# Load the embedding model: Sentence Transformers on CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
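# Illustrative check: all-MiniLM-L6-v2 produces 384-dimensional embeddings,
# which must match the Pinecone index dimension created below.
print("Embedding dimension:", model.get_sentence_embedding_dimension())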
# Initialize the Pinecone client with the API key and set an index name.
pc = Pinecone(api_key=papi_key)
INDEX_NAME = 'semanticsearch'
# Delete the index if it already exists.
if INDEX_NAME in [index.name for index in pc.list_indexes()]:
    pc.delete_index(INDEX_NAME)
# Create a new serverless index on AWS, sized to the model's embedding
# dimension and using cosine similarity.
pc.create_index(name=INDEX_NAME,
                dimension=model.get_sentence_embedding_dimension(),
                metric='cosine',
                spec=ServerlessSpec(cloud='aws', region='us-east-1'))
index = pc.Index(INDEX_NAME)
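# Optional wait (illustrative addition, not in the original listing):
# serverless index creation is asynchronous, so poll describe_index()
# until the index reports ready before upserting.
import time
while not pc.describe_index(INDEX_NAME).status['ready']:
    time.sleep(1)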
# Embed and upsert the questions in batches.
batch_size = 200
vector_limit = 80000
questions = questions[:vector_limit]
for i in tqdm(range(0, len(questions), batch_size)):
    # Find the end of this batch.
    i_end = min(i + batch_size, len(questions))
    # Create an ID for each vector in the batch.
    ids = [str(x) for x in range(i, i_end)]
    # Attach the original question text as metadata.
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # Create the vector embeddings for the batch.
    xc = model.encode(questions[i:i_end])
    # Zip into (ID, embedding, metadata) records for upsert.
    records = zip(ids, xc, metadatas)
    # Upsert the batch to Pinecone.
    index.upsert(vectors=records)
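# Sanity check (illustrative addition): describe_index_stats() is a standard
# Pinecone client call; the reported count can lag briefly after upserts.
stats = index.describe_index_stats()
print("Vectors in index:", stats.total_vector_count)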
def run_query(query):
    # Build a vector embedding of the user query.
    embedding = model.encode(query).tolist()
    # Query Pinecone for the five nearest matches.
    results = index.query(top_k=5, vector=embedding, include_metadata=True, include_values=False)
    answers = []
    for result in results['matches']:
        answers.append((round(result['score'], 2), result['metadata']['text']))
    return "\n".join(f"Similarity percentage: {score * 100:.2f}%, {text}" for score, text in answers)
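# Example call (illustrative; the query string is hypothetical):
# print(run_query("How do I improve my programming skills?"))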
# Create the Gradio interface.
demo = gr.Interface(fn=run_query,
                    inputs=gr.Textbox(label="User Input", placeholder="Type your question here..."),
                    outputs=gr.Textbox(label="Matching Questions from Vector Database"))
# Launch the app.
demo.launch(share=True)