import os
import time
import uuid
from enum import Enum
from threading import Thread
from typing import Any, Iterator, Union, List

from llama2_wrapper.types import (
    Completion,
    CompletionChunk,
    ChatCompletion,
    ChatCompletionChunk,
    # ChatCompletionMessage,
    Message,
    B_INST,
    E_INST,
    B_SYS,
    E_SYS,
)


class LLAMA2_WRAPPER:
    def __init__(
        self,
        model_path: str = "",
        backend_type: str = "llama.cpp",
        max_tokens: int = 4000,
        load_in_8bit: bool = True,
        verbose: bool = False,
    ):
        """Load a llama2 model from `model_path`.

        Args:
            model_path: Path to the model.
            backend_type: Backend for llama2, options: llama.cpp, gptq, transformers
            max_tokens: Maximum context size.
            load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models).
            verbose: Print verbose output to stderr.

        Raises:
            ValueError: If the model path does not exist.

        Returns:
            A LLAMA2_WRAPPER instance.
        """
        self.model_path = model_path
        self.backend_type = BackendType.get_type(backend_type)
        self.max_tokens = max_tokens
        self.load_in_8bit = load_in_8bit

        self.model = None
        self.tokenizer = None

        self.verbose = verbose

        if self.backend_type is BackendType.LLAMA_CPP:
            print("Running on backend llama.cpp.")
        else:
            import torch

            if torch.cuda.is_available():
                print("Running on GPU with backend torch transformers.")
            else:
                print("GPU CUDA not found.")

        self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf"
        self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ"
        # Download the default GGUF / GPTQ model if no model path is given.
        if self.model_path == "":
            print("Model path is empty.")
            if self.backend_type is BackendType.LLAMA_CPP:
                print("Use default llama.cpp model path: " + self.default_llamacpp_path)
                if not os.path.exists(self.default_llamacpp_path):
                    print("Start downloading model to: " + self.default_llamacpp_path)
                    from huggingface_hub import hf_hub_download

                    hf_hub_download(
                        repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
                        filename="llama-2-7b-chat.Q4_0.gguf",
                        local_dir="./models/",
                    )
                else:
                    print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.")
                self.model_path = self.default_llamacpp_path
            elif self.backend_type is BackendType.GPTQ:
                print("Use default gptq model path: " + self.default_gptq_path)
                if not os.path.exists(self.default_gptq_path):
                    print("Start downloading model to: " + self.default_gptq_path)
                    from huggingface_hub import snapshot_download

                    snapshot_download(
                        "TheBloke/Llama-2-7b-Chat-GPTQ",
                        local_dir=self.default_gptq_path,
                    )
                else:
                    print("Model exists in " + self.default_gptq_path)
                self.model_path = self.default_gptq_path

        self.init_tokenizer()
        self.init_model()

    def init_model(self):
        if self.model is None:
            self.model = LLAMA2_WRAPPER.create_llama2_model(
                self.model_path,
                self.backend_type,
                self.max_tokens,
                self.load_in_8bit,
                self.verbose,
            )
        if self.backend_type is not BackendType.LLAMA_CPP:
            self.model.eval()

    def init_tokenizer(self):
        if self.backend_type is not BackendType.LLAMA_CPP:
            if self.tokenizer is None:
                self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path)

    @classmethod
    def create_llama2_model(
        cls, model_path, backend_type, max_tokens, load_in_8bit, verbose
    ):
        if backend_type is BackendType.LLAMA_CPP:
            from llama_cpp import Llama

            model = Llama(
                model_path=model_path,
                n_ctx=max_tokens,
                n_batch=max_tokens,
                verbose=verbose,
            )
        elif backend_type is BackendType.GPTQ:
            from auto_gptq import AutoGPTQForCausalLM

            model = AutoGPTQForCausalLM.from_quantized(
                model_path,
                use_safetensors=True,
                trust_remote_code=True,
                device="cuda:0",
                use_triton=False,
                quantize_config=None,
            )
        elif backend_type is BackendType.TRANSFORMERS:
            import torch
            from transformers import AutoModelForCausalLM

            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.float16,
                load_in_8bit=load_in_8bit,
            )
        else:
            raise NotImplementedError(f"Backend {backend_type} is not implemented.")
        return model

    @classmethod
    def create_llama2_tokenizer(cls, model_path):
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        return tokenizer

    def get_token_length(
        self,
        prompt: str,
    ) -> int:
        if self.backend_type is BackendType.LLAMA_CPP:
            input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
            return len(input_ids)
        else:
            input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
            return input_ids.shape[-1]

    def get_input_token_length(
        self,
        message: str,
        chat_history: list[tuple[str, str]] = [],
        system_prompt: str = "",
    ) -> int:
        prompt = get_prompt(message, chat_history, system_prompt)
        return self.get_token_length(prompt)

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 1000,
        temperature: float = 0.9,
        top_p: float = 1.0,
        top_k: int = 40,
        repetition_penalty: float = 1.0,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Create a generator of response from a prompt.

        Examples:
            >>> llama2_wrapper = LLAMA2_WRAPPER()
            >>> prompt = get_prompt("Hi do you know Pytorch?")
            >>> for response in llama2_wrapper.generate(prompt):
            ...     print(response)

        Args:
            prompt: The prompt to generate text from.
            max_new_tokens: The maximum number of tokens to generate.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for sampling.
            top_k: The top-k value to use for sampling.
            repetition_penalty: The penalty to apply to repeated tokens.
            kwargs: All other arguments passed to the backend's generate call.

        Yields:
            The generated text so far (each item is the accumulated output).
        """
        if self.backend_type is BackendType.LLAMA_CPP:
            result = self.model(
                prompt=prompt,
                stream=True,
                max_tokens=max_new_tokens,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                repeat_penalty=repetition_penalty,
                **kwargs,
            )
            outputs = []
            for part in result:
                text = part["choices"][0]["text"]
                outputs.append(text)
                yield "".join(outputs)
        else:
            from transformers import TextIteratorStreamer

            inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")

            streamer = TextIteratorStreamer(
                self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
            )
            generate_kwargs = dict(
                inputs,
                streamer=streamer,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                # num_beams=1,
            )
            generate_kwargs = {**generate_kwargs, **kwargs}

            # Run generation in a background thread and stream tokens as they arrive.
            t = Thread(target=self.model.generate, kwargs=generate_kwargs)
            t.start()

            outputs = []
            for text in streamer:
                outputs.append(text)
                yield "".join(outputs)

    def run(
        self,
        message: str,
        chat_history: list[tuple[str, str]] = [],
        system_prompt: str = "",
        max_new_tokens: int = 1000,
        temperature: float = 0.9,
        top_p: float = 1.0,
        top_k: int = 40,
        repetition_penalty: float = 1.0,
    ) -> Iterator[str]:
        """Create a generator of response from a chat message.

        Processes the message into a llama2 prompt with chat history
        and system_prompt for the chatbot.

        Args:
            message: The original chat message to generate text from.
            chat_history: Chat history list from chatbot.
            system_prompt: System prompt for chatbot.
            max_new_tokens: The maximum number of tokens to generate.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for sampling.
            top_k: The top-k value to use for sampling.
            repetition_penalty: The penalty to apply to repeated tokens.

        Yields:
            The generated text.
        """
        prompt = get_prompt(message, chat_history, system_prompt)
        return self.generate(
            prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty
        )
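
    # Example (hedged sketch, not part of the original API surface): driving `run`
    # in a simple chatbot loop, accumulating chat_history between turns. The system
    # prompt below is an illustrative assumption.
    #
    #     llm = LLAMA2_WRAPPER()  # uses the default downloaded llama.cpp model
    #     history: list[tuple[str, str]] = []
    #     while True:
    #         user_msg = input("You: ")
    #         answer = ""
    #         for partial in llm.run(user_msg, history, "You are a helpful assistant."):
    #             answer = partial  # each item is the accumulated response so far
    #         history.append((user_msg, answer))
    #         print("Assistant:", answer)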

    def __call__(
        self,
        prompt: str,
        stream: bool = False,
        max_new_tokens: int = 1000,
        temperature: float = 0.9,
        top_p: float = 1.0,
        top_k: int = 40,
        repetition_penalty: float = 1.0,
        **kwargs: Any,
    ) -> Union[str, Iterator[str]]:
        """Generate text from a prompt.

        Examples:
            >>> llama2_wrapper = LLAMA2_WRAPPER()
            >>> prompt = get_prompt("Hi do you know Pytorch?")
            >>> print(llama2_wrapper(prompt))

        Args:
            prompt: The prompt to generate text from.
            stream: Whether to stream the results.
            max_new_tokens: The maximum number of tokens to generate.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for sampling.
            top_k: The top-k value to use for sampling.
            repetition_penalty: The penalty to apply to repeated tokens.
            kwargs: All other arguments passed to the backend.

        Raises:
            ValueError: If the requested tokens exceed the context window.
            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.

        Returns:
            Generated text, or an iterator of text chunks if `stream` is True.
        """
        if self.backend_type is BackendType.LLAMA_CPP:
            completion_or_chunks = self.model.__call__(
                prompt,
                stream=stream,
                max_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repeat_penalty=repetition_penalty,
                **kwargs,
            )
            if stream:

                def chunk_generator(chunks):
                    for part in chunks:
                        chunk = part["choices"][0]["text"]
                        yield chunk

                chunks: Iterator[str] = chunk_generator(completion_or_chunks)
                return chunks
            return completion_or_chunks["choices"][0]["text"]
        else:
            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
            prompt_tokens_len = len(inputs[0])
            inputs = inputs.to("cuda")
            generate_kwargs = dict(
                inputs=inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                # num_beams=1,
            )
            generate_kwargs = {**generate_kwargs, **kwargs}
            if stream:
                from transformers import TextIteratorStreamer

                streamer = TextIteratorStreamer(
                    self.tokenizer,
                    timeout=10.0,
                    skip_prompt=True,
                    skip_special_tokens=True,
                )
                generate_kwargs["streamer"] = streamer

                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
                t.start()
                return streamer
            else:
                output_ids = self.model.generate(
                    **generate_kwargs,
                )
                # Skip prompt tokens and special tokens when decoding.
                output = self.tokenizer.decode(
                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
                )
                return output

    def completion(
        self,
        prompt: str,
        stream: bool = False,
        max_new_tokens: int = 1000,
        temperature: float = 0.9,
        top_p: float = 1.0,
        top_k: int = 40,
        repetition_penalty: float = 1.0,
        **kwargs: Any,
    ) -> Union[Completion, Iterator[CompletionChunk]]:
        """For OpenAI compatible API /v1/completions.

        Generate text from a prompt.

        Examples:
            >>> llama2_wrapper = LLAMA2_WRAPPER()
            >>> prompt = get_prompt("Hi do you know Pytorch?")
            >>> print(llama2_wrapper.completion(prompt))

        Args:
            prompt: The prompt to generate text from.
            stream: Whether to stream the results.
            max_new_tokens: The maximum number of tokens to generate.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for sampling.
            top_k: The top-k value to use for sampling.
            repetition_penalty: The penalty to apply to repeated tokens.
            kwargs: All other arguments passed to the backend.

        Raises:
            ValueError: If the requested tokens exceed the context window.
            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.

        Returns:
            Response object containing the generated text.
        """
        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
        created: int = int(time.time())
        model_name: str = (
            f"{self.backend_type} default model"
            if self.model_path == ""
            else self.model_path
        )
        if self.backend_type is BackendType.LLAMA_CPP:
            completion_or_chunks = self.model.__call__(
                prompt,
                stream=stream,
                max_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repeat_penalty=repetition_penalty,
                **kwargs,
            )
            if stream:
                chunks: Iterator[CompletionChunk] = completion_or_chunks
                return chunks
            return completion_or_chunks
        else:
            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
            prompt_tokens_len = len(inputs[0])
            inputs = inputs.to("cuda")
            generate_kwargs = dict(
                inputs=inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                # num_beams=1,
            )
            generate_kwargs = {**generate_kwargs, **kwargs}
            if stream:
                from transformers import TextIteratorStreamer

                streamer = TextIteratorStreamer(
                    self.tokenizer,
                    timeout=10.0,
                    skip_prompt=True,
                    skip_special_tokens=True,
                )
                generate_kwargs["streamer"] = streamer
                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
                t.start()

                def chunk_generator(chunks):
                    for part in chunks:
                        yield {
                            "id": completion_id,
                            "object": "text_completion",
                            "created": created,
                            "model": model_name,
                            "choices": [
                                {
                                    "text": part,
                                    "index": 0,
                                    "logprobs": None,
                                    "finish_reason": None,
                                }
                            ],
                        }

                chunks: Iterator[CompletionChunk] = chunk_generator(streamer)
                return chunks
            else:
                output_ids = self.model.generate(
                    **generate_kwargs,
                )
                total_tokens_len = len(output_ids[0])
                output = self.tokenizer.decode(
                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
                )
                completion: Completion = {
                    "id": completion_id,
                    "object": "text_completion",
                    "created": created,
                    "model": model_name,
                    "choices": [
                        {
                            "text": output,
                            "index": 0,
                            "logprobs": None,
                            "finish_reason": None,
                        }
                    ],
                    "usage": {
                        "prompt_tokens": prompt_tokens_len,
                        "completion_tokens": total_tokens_len - prompt_tokens_len,
                        "total_tokens": total_tokens_len,
                    },
                }
                return completion
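
    # Example (hedged sketch): consuming the streamed text_completion chunks
    # produced above. Both backends yield objects with the same "choices" shape.
    #
    #     for chunk in llm.completion(prompt, stream=True):
    #         print(chunk["choices"][0]["text"], end="", flush=True)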

    def chat_completion(
        self,
        messages: List[Message],
        stream: bool = False,
        max_new_tokens: int = 1000,
        temperature: float = 0.9,
        top_p: float = 1.0,
        top_k: int = 40,
        repetition_penalty: float = 1.0,
        **kwargs: Any,
    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
        """For OpenAI compatible API /v1/chat/completions.

        Generate text from a dialog (chat history).

        Examples:
            >>> llama2_wrapper = LLAMA2_WRAPPER()
            >>> dialog = [
            ...     {
            ...         "role": "system",
            ...         "content": "You are a helpful, respectful and honest assistant. ",
            ...     },
            ...     {
            ...         "role": "user",
            ...         "content": "Hi do you know Pytorch?",
            ...     },
            ... ]
            >>> print(llama2_wrapper.chat_completion(dialog))

        Args:
            messages: The dialog (chat history) to generate text from.
            stream: Whether to stream the results.
            max_new_tokens: The maximum number of tokens to generate.
            temperature: The temperature to use for sampling.
            top_p: The top-p value to use for sampling.
            top_k: The top-k value to use for sampling.
            repetition_penalty: The penalty to apply to repeated tokens.
            kwargs: All other arguments passed to the backend.

        Raises:
            ValueError: If the requested tokens exceed the context window.
            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.

        Returns:
            Response object containing the generated text.
        """
        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
        created: int = int(time.time())
        model_name: str = (
            f"{self.backend_type} default model"
            if self.model_path == ""
            else self.model_path
        )
        if self.backend_type is BackendType.LLAMA_CPP:
            completion_or_chunks = self.model.create_chat_completion(
                messages,
                stream=stream,
                max_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repeat_penalty=repetition_penalty,
                **kwargs,
            )
            if stream:
                chunks: Iterator[ChatCompletionChunk] = completion_or_chunks
                return chunks
            return completion_or_chunks
        else:
            prompt = get_prompt_for_dialog(messages)
            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
            prompt_tokens_len = len(inputs[0])
            inputs = inputs.to("cuda")
            generate_kwargs = dict(
                inputs=inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                # num_beams=1,
            )
            generate_kwargs = {**generate_kwargs, **kwargs}
            if stream:
                from transformers import TextIteratorStreamer

                streamer = TextIteratorStreamer(
                    self.tokenizer,
                    timeout=10.0,
                    skip_prompt=True,
                    skip_special_tokens=True,
                )
                generate_kwargs["streamer"] = streamer
                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
                t.start()

                def chunk_generator(chunks):
                    # First chunk announces the assistant role, as in the OpenAI API.
                    yield {
                        "id": "chat" + completion_id,
                        "model": model_name,
                        "created": created,
                        "object": "chat.completion.chunk",
                        "choices": [
                            {
                                "index": 0,
                                "delta": {
                                    "role": "assistant",
                                },
                                "finish_reason": None,
                            }
                        ],
                    }
                    for part in chunks:
                        yield {
                            "id": "chat" + completion_id,
                            "model": model_name,
                            "created": created,
                            "object": "chat.completion.chunk",
                            "choices": [
                                {
                                    "index": 0,
                                    "delta": {
                                        "content": part,
                                    },
                                    "finish_reason": None,
                                }
                            ],
                        }

                chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer)
                return chunks
            else:
                output_ids = self.model.generate(
                    **generate_kwargs,
                )
                total_tokens_len = len(output_ids[0])
                output = self.tokenizer.decode(
                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
                )
                chatcompletion: ChatCompletion = {
                    "id": "chat" + completion_id,
                    "object": "chat.completion",
                    "created": created,
                    "model": model_name,
                    "choices": [
                        {
                            "index": 0,
                            "message": {
                                "role": "assistant",
                                "content": output,
                            },
                            "finish_reason": None,
                        }
                    ],
                    "usage": {
                        "prompt_tokens": prompt_tokens_len,
                        "completion_tokens": total_tokens_len - prompt_tokens_len,
                        "total_tokens": total_tokens_len,
                    },
                }
                return chatcompletion
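

# Example (hedged sketch): consuming the streaming chat.completion.chunk objects
# produced by chat_completion, concatenating "delta" content the same way an
# OpenAI client would. `llm` and `dialog` are assumed to exist as in the docstrings.
#
#     for chunk in llm.chat_completion(dialog, stream=True):
#         delta = chunk["choices"][0]["delta"]
#         if "content" in delta:
#             print(delta["content"], end="", flush=True)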


def get_prompt_for_dialog(dialog: List[Message]) -> str:
    """Process dialog (chat history) to a llama2 prompt for the
    OpenAI compatible API /v1/chat/completions.

    Examples:
        >>> dialog = [
        ...     {
        ...         "role": "system",
        ...         "content": "You are a helpful, respectful and honest assistant. ",
        ...     },
        ...     {
        ...         "role": "user",
        ...         "content": "Hi do you know Pytorch?",
        ...     },
        ... ]
        >>> prompt = get_prompt_for_dialog(dialog)

    Args:
        dialog: The dialog (chat history) to generate text from.

    Returns:
        The prompt string.
    """
    # Fold the system message into the first user message as
    # "<<SYS>>\n{system_prompt}\n<</SYS>>\n\n".
    if dialog[0]["role"] == "system":
        dialog = [
            {
                "role": dialog[1]["role"],
                "content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"],
            }
        ] + dialog[2:]
    # Check that roles alternate user/assistant after the optional system message.
    assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
        [msg["role"] == "assistant" for msg in dialog[1::2]]
    ), (
        "model only supports 'system', 'user' and 'assistant' roles, "
        "starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
    )
    # Add chat history.
    texts = []
    for prompt, answer in zip(
        dialog[::2],
        dialog[1::2],
    ):
        texts.append(
            f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} "
        )
    # The last message must be from the user; append it to the prompt text.
    assert (
        dialog[-1]["role"] == "user"
    ), f"Last message must be from user, got {dialog[-1]['role']}"
    texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}")
    return "".join(texts)
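

# For the dialog shown in the docstring above, the resulting prompt looks roughly
# like the following (illustrative; exact whitespace depends on the B_SYS / E_SYS /
# B_INST / E_INST constants defined in llama2_wrapper.types):
#
#     [INST] <<SYS>>
#     You are a helpful, respectful and honest assistant.
#     <</SYS>>
#
#     Hi do you know Pytorch? [/INST]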


def get_prompt(
    message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = ""
) -> str:
    """Process message to llama2 prompt with chat history
    and system_prompt for the chatbot.

    Examples:
        >>> prompt = get_prompt("Hi do you know Pytorch?")

    Args:
        message: The original chat message to generate text from.
        chat_history: Chat history list from chatbot.
        system_prompt: System prompt for chatbot.

    Returns:
        The prompt string.
    """
    texts = [f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
    for user_input, response in chat_history:
        texts.append(f"{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ")
    texts.append(f"{message.strip()} [/INST]")
    return "".join(texts)
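

# With chat_history = [("Hi", "Hello! How can I help?")] and
# message = "Tell me about Pytorch", get_prompt chains turns as (illustrative):
#
#     [INST] <<SYS>>
#     {system_prompt}
#     <</SYS>>
#
#     Hi [/INST] Hello! How can I help? </s><s> [INST] Tell me about Pytorch [/INST]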


class BackendType(Enum):
    UNKNOWN = 0
    TRANSFORMERS = 1
    GPTQ = 2
    LLAMA_CPP = 3

    @classmethod
    def get_type(cls, backend_name: str):
        backend_type = None
        backend_name_lower = backend_name.lower()
        if "transformers" in backend_name_lower:
            backend_type = BackendType.TRANSFORMERS
        elif "gptq" in backend_name_lower:
            backend_type = BackendType.GPTQ
        elif "cpp" in backend_name_lower:
            backend_type = BackendType.LLAMA_CPP
        else:
            raise Exception("Unknown backend: " + backend_name)
            # backend_type = BackendType.UNKNOWN
        return backend_type
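

# Hedged usage sketch (not part of the original module): a minimal smoke test for
# this wrapper. Leaving model_path empty triggers the default download logic in
# __init__; point it at a local GGUF file to skip the download.
if __name__ == "__main__":
    llm = LLAMA2_WRAPPER(
        model_path="",  # assumed: empty path downloads the default llama.cpp model
        backend_type="llama.cpp",
    )
    demo_prompt = get_prompt("Hi do you know Pytorch?")
    # Non-streaming call returns the generated text directly.
    print(llm(demo_prompt, max_new_tokens=64))
    # OpenAI-style completion object with usage accounting.
    print(llm.completion(demo_prompt, max_new_tokens=64))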