---
base_model: unsloth/Llama-3.2-1B-Instruct
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
- sft
license: apache-2.0
language:
- en
---

# Uploaded model

- **Developed by:** metascroy
- **License:** apache-2.0
- **Finetuned from model:** unsloth/Llama-3.2-1B-Instruct

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
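
# Loading the uploaded checkpoint

A minimal loading sketch is below. The repo id is an assumption derived from the `save_to = f"{username}/{model_name}-{qat_scheme}"` naming used in the script further down; loading the torchao-quantized weights requires `torchao` to be installed alongside `transformers`.

```python
# Minimal sketch (assumptions: repo id follows the script's save_to naming; torchao is installed).
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "metascroy/Llama-3.2-1B-Instruct-int8-int4"  # assumed, not confirmed by the card

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

messages = [{"role": "user", "content": "Explain quantization-aware training in one sentence."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```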

# Script

```python
# =========================================================================================
# Fine-tuning script based on https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_%281B_and_3B%29-Conversational.ipynb
#
# This script and the HF checkpoint are only intended to showcase how to do finetuning in a way that is compatible with ExecuTorch.
# Only 100 training steps are run, and the quality of the finetuned model is not evaluated.
# =========================================================================================

from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
    standardize_data_formats,
    standardize_sharegpt,
    train_on_responses_only,
)

from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
import torch
import torch.nn as nn

batch_size = 2
learning_rate = 2e-5
gradient_accumulation_steps = 4
max_steps = 100
full_finetuning = True
qat_scheme = "int8-int4"
output_dir = "/tmp/unsloth_example"
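# Note (assumption, inferred from the convert step at the end of this script):
# qat_scheme="int8-int4" enables quantization-aware training that fake-quantizes activations
# to int8 and weights to int4 during fine-tuning, matching the
# Int8DynamicActivationIntxWeightConfig applied at convert time.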

model_id = "unsloth/Llama-3.2-1B-Instruct"
chat_template = "llama-3.1"
max_seq_length = 2048
dtype = torch.bfloat16
load_in_4bit = False

################################################################################
# Define model/tokenizer
################################################################################

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    full_finetuning=full_finetuning,
    qat_scheme=qat_scheme,
)
tokenizer = get_chat_template(tokenizer, chat_template = chat_template)
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)

print("MODEL AFTER LOADING")
print(model)

################################################################################
# Untie model weights
################################################################################
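# Rationale (assumption): Llama-3.2-1B ties embed_tokens and lm_head to the same tensor.
# Later in this script the embedding is quantized with an 8-bit weight-only config while the
# linear layers (including lm_head) get the int8-dynamic-activation/int4-weight config, so the
# two modules need their own, separate weight storage.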

def untie_word_embeddings_(model):
    """Untie input and output embeddings in a Hugging Face causal LM."""
    # 1) Persist setting in config
    if hasattr(model.config, "tie_word_embeddings"):
        model.config.tie_word_embeddings = False

    # 2) Find input and output embeddings
    in_emb = model.get_input_embeddings()  # nn.Embedding
    out_proj = model.get_output_embeddings() or getattr(model, "lm_head", None)
    if out_proj is None:
        raise AttributeError("Couldn't locate output projection (lm_head).")

    # (Optional) sanity check: shapes should match [vocab, hidden]
    assert out_proj.weight.shape == in_emb.weight.shape, (
        f"Shape mismatch: out_proj {out_proj.weight.shape} vs in_emb {in_emb.weight.shape}"
    )

    # 3) Only clone if they are actually tied (shared storage)
    if out_proj.weight.data_ptr() == in_emb.weight.data_ptr():
        with torch.no_grad():
            W = in_emb.weight.detach().clone()
        out_proj.weight = nn.Parameter(W)  # new storage, keeps dtype/device

    # 4) Prevent future automatic re-tying
    def _no_tie(self):
        return
    model.tie_weights = _no_tie.__get__(model, model.__class__)

    # 5) Verify no shared storage
    assert out_proj.weight.data_ptr() != in_emb.weight.data_ptr(), "Embeddings still tied!"

    return model

model = untie_word_embeddings_(model)

print("MODEL AFTER UNTYING")
print(model)
print(model.config)

################################################################################
# Process dataset
################################################################################

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        for convo in convos
    ]
    return { "text" : texts, }

dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)
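# Each formatted "text" entry now roughly follows the llama-3.1 chat format, e.g.
#   <|start_header_id|>user<|end_header_id|>\n\n ... <|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n ... <|eot_id|>
# (sketch only; the exact string for one example is printed just below)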

print("DATASET ENTRY")
print(dataset[0])
print("\n\n")

################################################################################
# Define trainer
################################################################################

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=max_steps,
        learning_rate=learning_rate,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)
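# train_on_responses_only masks out everything except the assistant turns: prompt tokens get
# a label of -100 (the ignore index of the cross-entropy loss), so only response tokens
# contribute to the training loss. The check below prints one masked example to confirm this.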

print("VERIFYING PROMPT MASKING ON EXAMPLE")
idx = 5
print("Original: ", tokenizer.decode(trainer.train_dataset[idx]["input_ids"]))
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
print("Masked: ", tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[idx]["labels"]]))
print("\n\n")

################################################################################
# Do fine tuning
################################################################################
print("DOING FINETUNING")
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)

################################################################################
# Save model
################################################################################
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

################################################################################
# Convert model
################################################################################
from torchao.quantization import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)
from torchao.quantization.qat import QATConfig
from torchao.quantization.granularity import PerGroup, PerAxis
from transformers import TorchAoConfig

base_config = Int8DynamicActivationIntxWeightConfig(weight_dtype=torch.int4, weight_granularity=PerGroup(32))
quantize_(model, QATConfig(base_config, step="convert"))
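# QATConfig(base_config, step="convert") swaps the fake-quantized modules inserted during QAT
# training for modules carrying actual int8-dynamic-activation / int4-weight quantized tensors,
# as described by base_config (behaviour as understood from torchao's QAT flow).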

################################################################################
# Quantize embeddings to 8-bit with PTQ since they are not supported by QAT yet
################################################################################

embedding_fqn = "model.embed_tokens"
embedding_config = IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0))
quantize_(model, embedding_config, lambda m, fqn: fqn == embedding_fqn)

################################################################################
# Attach quantization config to model
################################################################################

quant_config = ModuleFqnToConfig({"_default": base_config, embedding_fqn: embedding_config})
quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])
model.config.quantization_config = quantization_config
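# Attaching the TorchAoConfig (covering both the default linear config and the embedding
# config) records how each module was quantized, so AutoModelForCausalLM.from_pretrained can
# rebuild the quantized modules when the checkpoint is reloaded from the Hub below.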

print('MODEL AFTER CONVERT', model)

################################################################################
# Push converted model to hub
################################################################################
from huggingface_hub import get_token, whoami

def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username

username = _get_username()
model_name = model_id.split("/")[-1]
save_to = f"{username}/{model_name}-{qat_scheme}"
model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)

################################################################################
# Load converted model from hub and inspect
################################################################################
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(save_to)
print('model', model)
print("model.embed_tokens.weight", model.model.embed_tokens.weight)
print("model.layers[0].self_attn.q_proj.weight", model.model.layers[0].self_attn.q_proj.weight)
print("lm_head.weight", model.lm_head.weight)
```