Commit 52cd289
johnsmith253325 committed
Parent(s): 7691698

feat: add preliminary LLaMA.cpp support

Files changed:
- .gitignore +1 -0
- modules/models/{azure.py → Azure.py} +0 -0 (renamed)
- modules/models/LLaMA.py +48 -66
- modules/presets.py +10 -6
.gitignore CHANGED

@@ -141,6 +141,7 @@ api_key.txt
 config.json
 auth.json
 .models/
+models/*
 lora/
 .idea
 templates/*
modules/models/{azure.py → Azure.py} RENAMED

File without changes
modules/models/LLaMA.py CHANGED

@@ -3,11 +3,40 @@ from __future__ import annotations
 import json
 import os
 
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+
 from ..index_func import *
 from ..presets import *
 from ..utils import *
 from .base_model import BaseLLMModel
 
+import json
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+
+def download(repo_id, filename, retry=10):
+    if os.path.exists("./models/downloaded_models.json"):
+        with open("./models/downloaded_models.json", "r") as f:
+            downloaded_models = json.load(f)
+        if repo_id in downloaded_models:
+            return downloaded_models[repo_id]["path"]
+    else:
+        downloaded_models = {}
+    while retry > 0:
+        try:
+            model_path = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir="models", resume_download=True)
+            downloaded_models[repo_id] = {"path": model_path}
+            with open("./models/downloaded_models.json", "w") as f:
+                json.dump(downloaded_models, f)
+            break
+        except:
+            print("Error downloading model, retrying...")
+            retry -= 1
+    if retry == 0:
+        raise Exception("Error downloading model, please try again later.")
+    return model_path
+
 
 class LLaMA_Client(BaseLLMModel):
     def __init__(
@@ -17,51 +46,28 @@ class LLaMA_Client(BaseLLMModel):
         user_name=""
     ) -> None:
         super().__init__(model_name=model_name, user=user_name)
-        from lmflow.args import (DatasetArguments, InferencerArguments,
-                                 ModelArguments)
-        from lmflow.datasets.dataset import Dataset
-        from lmflow.models.auto_model import AutoModel
-        from lmflow.pipeline.auto_pipeline import AutoPipeline
 
         self.max_generation_token = 1000
         self.end_string = "\n\n"
         # We don't need input data
-        data_args = DatasetArguments(dataset_path=None)
-        self.dataset = Dataset(data_args)
+        path_to_model = download(MODEL_METADATA[model_name]["repo_id"], MODEL_METADATA[model_name]["filelist"][0])
         self.system_prompt = ""
 
-        global LLAMA_MODEL
-        if LLAMA_MODEL is None:
-            model_path = None
-            if os.path.exists("models"):
-                model_dirs = os.listdir("models")
-                if model_name in model_dirs:
-                    model_path = f"models/{model_name}"
-            if model_path is not None:
-                model_source = model_path
-            else:
-                model_source = f"decapoda-research/{model_name}"
+        global LLAMA_MODEL
+        if LLAMA_MODEL is None:
+            LLAMA_MODEL = Llama(model_path=path_to_model)
+            # model_path = None
+            # if os.path.exists("models"):
+            #     model_dirs = os.listdir("models")
+            #     if model_name in model_dirs:
+            #         model_path = f"models/{model_name}"
+            # if model_path is not None:
+            #     model_source = model_path
+            # else:
+            #     model_source = f"decapoda-research/{model_name}"
                 # raise Exception(f"models目录下没有这个模型: {model_name}")
-            if lora_path is not None:
-                lora_path = f"lora/{lora_path}"
-            model_args = ModelArguments(model_name_or_path=model_source, lora_model_path=lora_path, model_type=None, config_overrides=None, config_name=None, tokenizer_name=None, cache_dir=None,
-                                        use_fast_tokenizer=True, model_revision='main', use_auth_token=False, torch_dtype=None, use_lora=False, lora_r=8, lora_alpha=32, lora_dropout=0.1, use_ram_optimized_load=True)
-            pipeline_args = InferencerArguments(
-                local_rank=0, random_seed=1, deepspeed='configs/ds_config_chatbot.json', mixed_precision='bf16')
-
-            with open(pipeline_args.deepspeed, "r", encoding="utf-8") as f:
-                ds_config = json.load(f)
-            LLAMA_MODEL = AutoModel.get_model(
-                model_args,
-                tune_strategy="none",
-                ds_config=ds_config,
-            )
-            LLAMA_INFERENCER = AutoPipeline.get_pipeline(
-                pipeline_name="inferencer",
-                model_args=model_args,
-                data_args=data_args,
-                pipeline_args=pipeline_args,
-            )
+            # if lora_path is not None:
+            #     lora_path = f"lora/{lora_path}"
 
     def _get_llama_style_input(self):
         history = []
@@ -79,38 +85,14 @@
 
     def get_answer_at_once(self):
         context = self._get_llama_style_input()
-
-        input_dataset = self.dataset.from_dict(
-            {"type": "text_only", "instances": [{"text": context}]}
-        )
-
-        output_dataset = LLAMA_INFERENCER.inference(
-            model=LLAMA_MODEL,
-            dataset=input_dataset,
-            max_new_tokens=self.max_generation_token,
-            temperature=self.temperature,
-        )
-
-        response = output_dataset.to_dict()["instances"][0]["text"]
+        response = LLAMA_MODEL(context, max_tokens=self.max_generation_token, stop=[], echo=False, stream=False)
         return response, len(response)
 
     def get_answer_stream_iter(self):
         context = self._get_llama_style_input()
+        iter = LLAMA_MODEL(context, max_tokens=self.max_generation_token, stop=[], echo=False, stream=True)
         partial_text = ""
-        step = 1
-        for _ in range(0, self.max_generation_token, step):
-            input_dataset = self.dataset.from_dict(
-                {"type": "text_only", "instances": [
-                    {"text": context + partial_text}]}
-            )
-            output_dataset = LLAMA_INFERENCER.inference(
-                model=LLAMA_MODEL,
-                dataset=input_dataset,
-                max_new_tokens=step,
-                temperature=self.temperature,
-            )
-            response = output_dataset.to_dict()["instances"][0]["text"]
-            if response == "" or response == self.end_string:
-                break
+        for i in iter:
+            response = i["choices"][0]["text"]
             partial_text += response
             yield partial_text
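For reference, the new generation methods follow the llama-cpp-python completion API: calling the Llama object returns a completion dict when stream=False and an iterator of chunk dicts when stream=True, with the generated text under ["choices"][0]["text"] in both cases. A minimal standalone sketch of that pattern (the model path and prompt are illustrative placeholders, not values from this commit):

from llama_cpp import Llama

llm = Llama(model_path="models/llama-2-7b.Q6_K.gguf")  # placeholder path, not from the commit

# One-shot completion: the call returns a dict, and the text lives under
# ["choices"][0]["text"] (note that get_answer_at_once above returns the raw
# dict rather than extracting the text field).
completion = llm("Q: Name one fact about llamas. A:", max_tokens=64, stop=["\n\n"], echo=False, stream=False)
print(completion["choices"][0]["text"])

# Streaming completion: the call yields chunk dicts with the same "choices"
# shape, which is what get_answer_stream_iter accumulates into partial_text.
for chunk in llm("Q: Name one fact about llamas. A:", max_tokens=64, stop=["\n\n"], echo=False, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)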
modules/presets.py CHANGED

@@ -83,10 +83,7 @@ LOCAL_MODELS = [
     "chatglm2-6b-int4",
     "StableLM",
     "MOSS",
-    "llama-7b-hf",
-    "llama-13b-hf",
-    "llama-30b-hf",
-    "llama-65b-hf",
+    "Llama-2-7B",
 ]
 
 if os.environ.get('HIDE_LOCAL_MODELS', 'false') == 'true':
@@ -134,8 +131,8 @@ REPLY_LANGUAGES = [
 ]
 
 HISTORY_NAME_METHODS = [
-    i18n("根据日期时间"),
-    i18n("第一条提问"),
+    i18n("根据日期时间"),
+    i18n("第一条提问"),
     i18n("模型自动总结(消耗tokens)"),
 ]
 
@@ -266,3 +263,10 @@ small_and_beautiful_theme = gr.themes.Soft(
     chatbot_code_background_color_dark="*neutral_950",
 )
 
+# Additional metadata for local models
+MODEL_METADATA = {
+    "Llama-2-7B":{
+        "repo_id": "TheBloke/Llama-2-7B-GGUF",
+        "filelist": ["llama-2-7b.Q6_K.gguf"],
+    }
+}
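The MODEL_METADATA entry above is what the new download() helper in modules/models/LLaMA.py consumes: the repo_id and the first filelist entry are passed to hf_hub_download, the resulting local path is cached in models/downloaded_models.json, and that path is handed to Llama(). A simplified, self-contained sketch of that flow (the caching and retry logic of the real helper are omitted; the wiring shown is an assumption based on the diff, not code from the commit):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Mirrors the new entry in modules/presets.py.
MODEL_METADATA = {
    "Llama-2-7B": {
        "repo_id": "TheBloke/Llama-2-7B-GGUF",
        "filelist": ["llama-2-7b.Q6_K.gguf"],
    }
}

def download(repo_id, filename):
    # Simplified stand-in for the helper in modules/models/LLaMA.py: fetch the
    # GGUF file into ./models and return the local path (the real helper also
    # records paths in models/downloaded_models.json and retries on failure).
    return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir="models", resume_download=True)

meta = MODEL_METADATA["Llama-2-7B"]
path_to_model = download(meta["repo_id"], meta["filelist"][0])
llm = Llama(model_path=path_to_model)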