# -*- coding: utf-8 -*-
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

# Load the documents to index from the local ./data folder
documents = SimpleDirectoryReader("./data").load_data()
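# Quick sanity check on ingestion (illustrative only; the count depends on what is in ./data)
print(f"Loaded {len(documents)} document(s)")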
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window somewhat below the model's maximum to leave room for the prompt
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU; set to 0 to run on CPU only
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama 2 prompt format expected by llama_utils
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
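# Optional smoke test of the local LLM before building the index. A minimal sketch:
# complete() is the standard LlamaIndex LLM call, but the first invocation also
# downloads the GGUF file, so it is left commented out here.
# print(llm.complete("Say hello in one sentence."))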
from llama_index.embeddings import HuggingFaceEmbedding

# HuggingFaceEmbedding() with no arguments loads BAAI/bge-small-en by default;
# here we explicitly initialize our embedding model with BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
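# Optional check that the embedding model loads and produces fixed-size vectors.
# get_text_embedding returns a plain Python list; bge-small-en-v1.5 should give
# 384-dimensional vectors (per the model card).
sample_vector = embed_model.get_text_embedding("hello world")
print(f"Embedding dimension: {len(sample_vector)}")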
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model
)
| """Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy""" | |
| from llama_index.indices.postprocessor import SentenceTransformerRerank | |
| # Initialize the reranker | |
| rerank = SentenceTransformerRerank( | |
| model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3) # Retrives top 3 chunks | |
# Create a baseline query engine without the reranker
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()  # Without reranker
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
# Create a query engine with the cross-encoder reranker.
# Note: this overrides the baseline query_engine and predict defined above.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
# Retrieve the top 10 chunks first, then let the reranker narrow them down to top_n=3
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
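# To compare the two setups side by side, you could keep separate handles instead of
# overwriting query_engine (hypothetical variable names, shown for illustration):
# query_engine_base = index.as_query_engine()
# query_engine_rerank = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])
# print(query_engine_base.query("your question here"))
# print(query_engine_rerank.query("your question here"))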
import time
import gradio as gr

# Final predict function used by the Gradio UI: answers the query and reports the response time
def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken
    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)

# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)