---
base_model: unsloth/Llama-3.2-1B-Instruct
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
- sft
license: apache-2.0
language:
- en
---

# Uploaded model

- **Developed by:** metascroy
- **License:** apache-2.0
- **Finetuned from model:** unsloth/Llama-3.2-1B-Instruct

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
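
# Loading the uploaded checkpoint

A minimal loading sketch is below. The repo id is an assumption derived from the `save_to = f"{username}/{model_name}-{qat_scheme}"` naming used in the script further down; loading the torchao-quantized weights requires `torchao` to be installed alongside `transformers`.

```python
# Minimal sketch (assumptions: repo id follows the script's save_to naming; torchao is installed).
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "metascroy/Llama-3.2-1B-Instruct-int8-int4"  # assumed, not confirmed by the card

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

messages = [{"role": "user", "content": "Explain quantization-aware training in one sentence."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```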

# Script

```python
# =========================================================================================
# Fine-tuning script based on https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_%281B_and_3B%29-Conversational.ipynb
#
# This script and the HF checkpoint are only intended to showcase how to do finetuning in a way that is compatible with ExecuTorch.
# Only 100 training steps are run, and the quality of the finetuned model is not evaluated.
# =========================================================================================

from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
    standardize_data_formats,
    standardize_sharegpt,
    train_on_responses_only,
)

from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
import torch
import torch.nn as nn

batch_size = 2
learning_rate = 2e-5
gradient_accumulation_steps = 4
max_steps = 100
full_finetuning = True
qat_scheme = "int8-int4"
output_dir = "/tmp/unsloth_example"
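# Note (assumption, inferred from the convert step at the end of this script):
# qat_scheme="int8-int4" enables quantization-aware training that fake-quantizes activations
# to int8 and weights to int4 during fine-tuning, matching the
# Int8DynamicActivationIntxWeightConfig applied at convert time.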

model_id = "unsloth/Llama-3.2-1B-Instruct"
chat_template = "llama-3.1"
max_seq_length = 2048
dtype = torch.bfloat16
load_in_4bit = False

################################################################################
# Define model/tokenizer
################################################################################

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    full_finetuning=full_finetuning,
    qat_scheme=qat_scheme,
)
tokenizer = get_chat_template(tokenizer, chat_template = chat_template)
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)

print("MODEL AFTER LOADING")
print(model)

################################################################################
# Untie model weights
################################################################################
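# Rationale (assumption): Llama-3.2-1B ties embed_tokens and lm_head to the same tensor.
# Later in this script the embedding is quantized with an 8-bit weight-only config while the
# linear layers (including lm_head) get the int8-dynamic-activation/int4-weight config, so the
# two modules need their own, separate weight storage.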

def untie_word_embeddings_(model):
    """Untie input and output embeddings in a Hugging Face causal LM."""
    # 1) Persist setting in config
    if hasattr(model.config, "tie_word_embeddings"):
        model.config.tie_word_embeddings = False

    # 2) Find input and output embeddings
    in_emb = model.get_input_embeddings()  # nn.Embedding
    out_proj = model.get_output_embeddings() or getattr(model, "lm_head", None)
    if out_proj is None:
        raise AttributeError("Couldn't locate output projection (lm_head).")

    # (Optional) sanity check: shapes should match [vocab, hidden]
    assert out_proj.weight.shape == in_emb.weight.shape, (
        f"Shape mismatch: out_proj {out_proj.weight.shape} vs in_emb {in_emb.weight.shape}"
    )

    # 3) Only clone if they are actually tied (shared storage)
    if out_proj.weight.data_ptr() == in_emb.weight.data_ptr():
        with torch.no_grad():
            W = in_emb.weight.detach().clone()
        out_proj.weight = nn.Parameter(W)  # new storage, keeps dtype/device

    # 4) Prevent future automatic re-tying
    def _no_tie(self):
        return
    model.tie_weights = _no_tie.__get__(model, model.__class__)

    # 5) Verify no shared storage
    assert out_proj.weight.data_ptr() != in_emb.weight.data_ptr(), "Embeddings still tied!"

    return model

model = untie_word_embeddings_(model)

print("MODEL AFTER UNTYING")
print(model)
print(model.config)

################################################################################
# Process dataset
################################################################################

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        for convo in convos
    ]
    return { "text" : texts, }

dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)
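# Each formatted "text" entry now roughly follows the llama-3.1 chat format, e.g.
#   <|start_header_id|>user<|end_header_id|>\n\n ... <|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n ... <|eot_id|>
# (sketch only; the exact string for one example is printed just below)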

print("DATASET ENTRY")
print(dataset[0])
print("\n\n")

################################################################################
# Define trainer
################################################################################

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=data_collator,
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=max_steps,
        learning_rate=learning_rate,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)
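# train_on_responses_only masks out everything except the assistant turns: prompt tokens get
# a label of -100 (the ignore index of the cross-entropy loss), so only response tokens
# contribute to the training loss. The check below prints one masked example to confirm this.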

print("VERIFYING PROMPT MASKING ON EXAMPLE")
idx = 5
print("Original: ", tokenizer.decode(trainer.train_dataset[idx]["input_ids"]))
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
print("Masked: ", tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[idx]["labels"]]))
print("\n\n")

################################################################################
# Do fine tuning
################################################################################
print("DOING FINETUNING")
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)

################################################################################
# Save model
################################################################################
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

################################################################################
# Convert model
################################################################################
from torchao.quantization import (
    Int8DynamicActivationIntxWeightConfig,
    IntxWeightOnlyConfig,
    ModuleFqnToConfig,
    quantize_,
)
from torchao.quantization.qat import QATConfig
from torchao.quantization.granularity import PerGroup, PerAxis
from transformers import TorchAoConfig

base_config = Int8DynamicActivationIntxWeightConfig(weight_dtype=torch.int4, weight_granularity=PerGroup(32))
quantize_(model, QATConfig(base_config, step="convert"))
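# QATConfig(base_config, step="convert") swaps the fake-quantized modules inserted during QAT
# training for modules carrying actual int8-dynamic-activation / int4-weight quantized tensors,
# as described by base_config (behaviour as understood from torchao's QAT flow).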

################################################################################
# Quantize embeddings to 8-bit with PTQ since they are not supported by QAT yet
################################################################################

embedding_fqn = "model.embed_tokens"
embedding_config = IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0))
quantize_(model, embedding_config, lambda m, fqn: fqn == embedding_fqn)

################################################################################
# Attach quantization config to model
################################################################################

quant_config = ModuleFqnToConfig({"_default": base_config, embedding_fqn: embedding_config})
quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])
model.config.quantization_config = quantization_config
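# Attaching the TorchAoConfig (covering both the default linear config and the embedding
# config) records how each module was quantized, so AutoModelForCausalLM.from_pretrained can
# rebuild the quantized modules when the checkpoint is reloaded from the Hub below.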

print('MODEL AFTER CONVERT', model)

################################################################################
# Push converted model to hub
################################################################################
from huggingface_hub import get_token, whoami

def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username

username = _get_username()
model_name = model_id.split("/")[-1]
save_to = f"{username}/{model_name}-{qat_scheme}"
model.push_to_hub(save_to, safe_serialization=False)
tokenizer.push_to_hub(save_to)

################################################################################
# Load converted model from hub and inspect
################################################################################
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(save_to)
print('model', model)
print("model.embed_tokens.weight", model.model.embed_tokens.weight)
print("model.layers[0].self_attn.q_proj.weight", model.model.layers[0].self_attn.q_proj.weight)
print("lm_head.weight", model.lm_head.weight)
```