Fluctuation-based Adaptive Structured Pruning for Large Language Models
Paper • 2312.11983 • Published
How to use npc0/llama3.1-41B-raw with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline for the checkpoint named in this card.
pipe = pipeline("text-generation", model="npc0/llama3.1-41B-raw")

# Chat-style input: a list of {"role", "content"} messages.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the tokenizer and the causal-LM weights for the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("npc0/llama3.1-41B-raw")
model = AutoModelForCausalLM.from_pretrained("npc0/llama3.1-41B-raw")

# A single user turn, rendered through the tokenizer's chat template.
messages = [{"role": "user", "content": "Who are you?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the encoded prompt onto the device that holds the model.
inputs = inputs.to(model.device)

# Generate at most 40 new tokens after the prompt.
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
How to use npc0/llama3.1-41B-raw with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "npc0/llama3.1-41B-raw"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "npc0/llama3.1-41B-raw",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
docker model run hf.co/npc0/llama3.1-41B-raw
How to use npc0/llama3.1-41B-raw with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "npc0/llama3.1-41B-raw" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "npc0/llama3.1-41B-raw",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "npc0/llama3.1-41B-raw" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "npc0/llama3.1-41B-raw",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
How to use npc0/llama3.1-41B-raw with Docker Model Runner:
docker model run hf.co/npc0/llama3.1-41B-raw
This text-only Llama 3.1 41B model was pruned from the instruction-finetuned, text-only Llama 3.1 70B model using the FLAP method.
TL;DR: Not under maintenance. Poor performance and no practical value; this is a side product of an experiment.
Hyper parameters used for pruning:
metrics: WIFV
structure: AL-AM
pruning_ratio: 0.5
This llama3.1-41B-raw model produces unstable output.
Fine-tuning on an instruction dataset is recommended.
The model is not supported by any library at the moment because its layer shapes are inconsistent after pruning.
The following is a workaround.
from functools import reduce
def get_module_by_name(module, access_string):
    """Resolve a dotted attribute path starting from ``module``.

    Args:
        module: Root object to start the lookup from.
        access_string: Attribute names joined by ``'.'``, e.g. ``"a.b.c"``.
            An empty string refers to ``module`` itself.

    Returns:
        The object reached by applying ``getattr`` once per path component.

    Raises:
        AttributeError: If any component of the path does not exist.
    """
    # Splitting "" would yield [""] and fail on getattr(module, "");
    # treat the empty path as "the root itself" instead.
    if not access_string:
        return module
    names = access_string.split(sep='.')
    return reduce(getattr, names, module)
import json
import os

import torch
from safetensors import safe_open
from transformers import AutoTokenizer, LlamaForCausalLM
class MyLlamaForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose projection sizes are taken from the checkpoint.

    After the regular ``LlamaForCausalLM`` construction, every ``mlp.`` /
    ``attn.`` weight listed in ``model.safetensors.index.json`` is compared
    with the freshly built layer; layers whose stored shape differs are
    re-created with the stored shape, and the head counts of the attention
    modules are recomputed from the resulting ``q_proj`` width.
    """

    def __init__(self, config):
        """Build the model, then resize layers to match the stored weights.

        Args:
            config: Model configuration. ``config._name_or_path`` is joined
                with file names and opened directly, so it presumably has to
                be a local checkpoint directory — TODO confirm.
        """
        super().__init__(config)
        checkpoint_dir = config._name_or_path
        index_path = os.path.join(
            checkpoint_dir, "model.safetensors.index.json")
        with open(index_path, encoding="utf-8") as index_file:
            # Maps each parameter name to the shard file that stores it.
            weight_map = json.load(index_file)["weight_map"]

        for name, shard in weight_map.items():
            module_name = name.replace('.weight', '')
            if '.bias' in module_name:
                continue
            # Only MLP and attention projections are resized.
            if 'mlp.' not in name and 'attn.' not in name:
                continue
            layer = get_module_by_name(self, module_name)
            with safe_open(
                    os.path.join(checkpoint_dir, shard),
                    framework="pt") as shard_file:
                # Read only the shape; loading every full tensor just to
                # inspect its dimensions is unnecessary.
                shape = tuple(shard_file.get_slice(name).get_shape())
            if shape != (layer.out_features, layer.in_features):
                # Re-run the constructor in place so the same object stays
                # registered in its parent module. ``__init__`` returns None,
                # so its result must not be assigned back to ``layer``.
                # ``bias`` has to be a bool, not the bias tensor itself.
                layer.__init__(
                    shape[1],
                    shape[0],
                    bias=layer.bias is not None,
                    dtype=layer.weight.dtype,
                    device=layer.weight.device)

        # Each attention module appears once per projection weight; visit
        # every module a single time, in first-seen order.
        attention_paths = dict.fromkeys(
            '.'.join(name.split('.')[:-2])
            for name in weight_map
            if 'attn.' in name)
        for path in attention_paths:
            module = get_module_by_name(self, path)
            module.num_heads = module.q_proj.out_features // module.head_dim
            # NOTE(review): key/value heads are set equal to the query heads
            # (so the group count is 1); this presumably relies on k_proj and
            # v_proj having the same width as q_proj — confirm against the
            # pruned checkpoint.
            module.num_key_value_heads = module.num_heads
            module.num_key_value_groups = (
                module.num_heads // module.num_key_value_heads)
# Load the pruned checkpoint through the shape-aware subclass.
model = MyLlamaForCausalLM.from_pretrained(
    "npc0/llama3.1-41B-raw",
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(
    "FLAP/llm_weights/flap_p0.5_WIFV_ALAM_llama_70b")
model = model.eval()

# Multi-turn conversation that ends with a user question.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

# add_generation_prompt=True appends the assistant header so the model
# answers the last user turn instead of continuing the user's message.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(model_inputs, max_new_tokens=128)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])
Base model
meta-llama/Llama-3.1-70B