# -*- coding: utf-8 -*-
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

# Load the documents to index from the local ./data folder
documents = SimpleDirectoryReader("./data").load_data()
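# Quick sanity check on ingestion (illustrative only; the count depends on what is in ./data)
print(f"Loaded {len(documents)} document(s)")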
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window somewhat below the model's maximum to leave room for the prompt
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU; set to 0 to run on CPU only
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama 2 prompt format expected by llama_utils
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
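# Optional smoke test of the local LLM before building the index. A minimal sketch:
# complete() is the standard LlamaIndex LLM call, but the first invocation also
# downloads the GGUF file, so it is left commented out here.
# print(llm.complete("Say hello in one sentence."))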
from llama_index.embeddings import HuggingFaceEmbedding

# HuggingFaceEmbedding() with no arguments loads BAAI/bge-small-en by default;
# here we explicitly initialize our embedding model with BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
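# Optional check that the embedding model loads and produces fixed-size vectors.
# get_text_embedding returns a plain Python list; bge-small-en-v1.5 should give
# 384-dimensional vectors (per the model card).
sample_vector = embed_model.get_text_embedding("hello world")
print(f"Embedding dimension: {len(sample_vector)}")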
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model
)
| """Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy""" | |
| from llama_index.indices.postprocessor import SentenceTransformerRerank | |
| # Initialize the reranker | |
| rerank = SentenceTransformerRerank( | |
| model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3) # Retrives top 3 chunks | |
# Create a baseline query engine without the reranker
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()  # Without reranker
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
# Create a query engine with the cross-encoder reranker.
# Note: this overrides the baseline query_engine and predict defined above.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
# Retrieve the top 10 chunks first, then let the reranker narrow them down to top_n=3
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
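# To compare the two setups side by side, you could keep separate handles instead of
# overwriting query_engine (hypothetical variable names, shown for illustration):
# query_engine_base = index.as_query_engine()
# query_engine_rerank = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])
# print(query_engine_base.query("your question here"))
# print(query_engine_rerank.query("your question here"))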
import time
import gradio as gr

# Final predict function used by the Gradio UI: answers the query and reports the response time
def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken
    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)

# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)