update config and modelling files

Files changed (3) hide show

config.json +1 -3
configuration_nllbllm2vec.py +15 -2
modeling_nllbllm2vec.py +243 -407

config.json CHANGED Viewed

@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "fdschmidt93/NLLBLLM2Vec",
   "architectures": [
     "NLLBLLM2Vec"
   ],
@@ -37,6 +36,5 @@
     "vocab_size": 256206
   },
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.2"
 }

 {
   "architectures": [
     "NLLBLLM2Vec"
   ],
     "vocab_size": 256206
   },
   "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2"
 }

configuration_nllbllm2vec.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.llama.configuration_llama import LlamaConfig
@@ -36,6 +37,7 @@ DEFAULT_M2M100_CONFIG = {
     "vocab_size": 256206,
     "tokenizer_class": "NllbTokenizer",
     "max_length": 200,
 }
 DEFAULT_LLAMA_CONFIG = {
@@ -61,6 +63,7 @@ DEFAULT_LLAMA_CONFIG = {
     "transformers_version": "4.40.0.dev0",
     "use_cache": False,
     "vocab_size": 128256,
 }
@@ -70,13 +73,23 @@ class NLLBLLM2VecConfig(PretrainedConfig):
     def __init__(
         self,
-        nllb_config: dict = DEFAULT_M2M100_CONFIG,
-        llm2vec_config: dict = DEFAULT_LLAMA_CONFIG,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.nllb_config = M2M100Config(**nllb_config)
         self.llm2vec_config = LlamaConfig(**llm2vec_config)
 AutoConfig.register(NLLBLLM2VEC_TYPE, NLLBLLM2VecConfig)

+from typing import Optional, Dict
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.llama.configuration_llama import LlamaConfig
     "vocab_size": 256206,
     "tokenizer_class": "NllbTokenizer",
     "max_length": 200,
+    "_attn_implementation": "flash_attention_2",
 }
 DEFAULT_LLAMA_CONFIG = {
     "transformers_version": "4.40.0.dev0",
     "use_cache": False,
     "vocab_size": 128256,
+    "_attn_implementation": "flash_attention_2",
 }
     def __init__(
         self,
         nllb_config: Dict = DEFAULT_M2M100_CONFIG,
         llm2vec_config: Dict = DEFAULT_LLAMA_CONFIG,
         _attn_implementation="sdpa",
         initializer_range: Optional[float] = None,
         **kwargs,
     ):
         """Build the composite config from the two sub-model config dicts.

         Args:
             nllb_config: Keyword arguments used to construct the `M2M100Config`.
             llm2vec_config: Keyword arguments used to construct the `LlamaConfig`.
             _attn_implementation: Attention backend name; it is written to this
                 config and to both sub-configs, replacing whatever value the
                 sub-config dicts carried.
             initializer_range: Weight-init range. When `None`, the value of the
                 Llama sub-config is adopted; otherwise the given value is stored
                 here and pushed down into the Llama sub-config.
             **kwargs: Forwarded to `PretrainedConfig.__init__`.
         """
         super().__init__(**kwargs)
         self._attn_implementation = _attn_implementation
         self.nllb_config = M2M100Config(**nllb_config)
         self.nllb_config._attn_implementation = _attn_implementation
         self.llm2vec_config = LlamaConfig(**llm2vec_config)
         self.llm2vec_config._attn_implementation = _attn_implementation
         if initializer_range is None:
             self.initializer_range = self.llm2vec_config.initializer_range
         else:
             self.initializer_range = initializer_range
             # Keep the sub-config in sync; this line used to be a bare attribute
             # access that silently discarded the explicit value.
             self.llm2vec_config.initializer_range = initializer_range
 AutoConfig.register(NLLBLLM2VEC_TYPE, NLLBLLM2VecConfig)

modeling_nllbllm2vec.py CHANGED Viewed

@@ -1,24 +1,69 @@
-from typing import Any, Dict, List, Optional, Tuple, cast, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers.models.auto import AutoModel, AutoModelForSequenceClassification
 from transformers.modeling_outputs import (
     BaseModelOutputWithPooling,
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder
-from transformers.cache_utils import Cache
 from .configuration_nllbllm2vec import NLLBLLM2VecConfig
 from .modeling_llama_encoder import LlamaEncoderModel
 class NLLBLLM2Vec(PreTrainedModel):
     config_class = NLLBLLM2VecConfig
     model_type = "nllb-llm2vec"
     """
     NLLBLLM2Vec model combining NLLB and LLama encoders.
@@ -46,9 +91,13 @@ class NLLBLLM2Vec(PreTrainedModel):
         if config is not None:
             super().__init__(config, *inputs, **kwargs)
             self.nllb_encoder = nllb_encoder or M2M100Encoder(config.nllb_config)
             self.llm2vec = llm2vec or LlamaEncoderModel(config.llm2vec_config)
             self.config = config
         else:
             # Both encoders are provided
             self.nllb_encoder = cast(M2M100Encoder, nllb_encoder)
@@ -64,7 +113,15 @@ class NLLBLLM2Vec(PreTrainedModel):
             self.llm2vec.config.hidden_size,
             bias=False,
         )
-        # Additional initialization logic can go here
     def forward(
         self,
@@ -91,14 +148,12 @@ class NLLBLLM2Vec(PreTrainedModel):
         else:
             seq_indices, seq_offsets = indices
-        with torch.inference_mode():
-            nllb_outputs = self.nllb_encoder(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-            )
-            nllb_last_hidden_state = nllb_outputs.last_hidden_state
-            nllb_last_hidden_state = self.up_proj(nllb_last_hidden_state)
-        nllb_last_hidden_state = nllb_last_hidden_state.detach().clone()
         outputs = self.llm2vec(
             inputs_embeds=nllb_last_hidden_state,
             attention_mask=attention_mask,
@@ -133,14 +188,22 @@ class NLLBLLM2Vec(PreTrainedModel):
         self,
         inputs: List[str],
         src_lang: str = "eng_Latn",
         tokenize_kwargs: Optional[Dict[str, Any]] = None,
     ) -> torch.Tensor:
         """
         Encode input texts into embeddings.
         Args:
             inputs (List[str]): List of input texts.
-            src_lang (str): Source language code.
             tokenize_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for the tokenizer.
                 Defaults to:
                 >>    tokenize_kwargs = {
@@ -149,26 +212,54 @@ class NLLBLLM2Vec(PreTrainedModel):
                 >>        "max_length": 512,
                 >>        "return_tensors": "pt",
                 >>    }
         Returns:
             torch.Tensor: Mean-pooled sequence embeddings of the inputs.
         """
-        if tokenize_kwargs is None:
-            tokenize_kwargs = {
-                "padding": True,
-                "truncation": True,
-                "max_length": 512,
-                "return_tensors": "pt",
-            }
         tokenizer = self.tokenizer
         tokenizer.src_lang = src_lang
         device = next(self.parameters()).device
-        batch = tokenizer(inputs, **tokenize_kwargs).to(device)
-        device_type = device.type  # e.g., 'cuda' or 'cpu'
-        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
-            return self(**batch).pooler_output
     @staticmethod
     def _get_input_offsets(
@@ -192,12 +283,8 @@ class NLLBLLM2Vec(PreTrainedModel):
         non_padded_lengths = attention_mask.sum(
             dim=1
         )  # Count non-padded tokens per sequence
-        offsets = torch.cat(
-            [
-                torch.tensor([0], device=attention_mask.device),
-                non_padded_lengths.cumsum(dim=0)[:-1],
-            ]
-        )
         return input_indices, offsets
     @staticmethod
@@ -235,10 +322,13 @@ class NLLBLLM2VecForSequenceClassification(PreTrainedModel):
     config_class = NLLBLLM2VecConfig
     model_type = "nllb-llm2vec"
     base_model_prefix = "model"
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
         self.model = NLLBLLM2Vec(config)
         self.score = nn.Linear(
             config.llm2vec_config.hidden_size, self.num_labels, bias=False
@@ -247,114 +337,29 @@ class NLLBLLM2VecForSequenceClassification(PreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
     def get_input_embeddings(self):
         """Return the token-embedding module of the wrapped NLLB encoder."""
         # The wrapped model stores its NLLB encoder as `nllb_encoder`.
         return self.model.nllb_encoder.embed_tokens
     def set_input_embeddings(self, value):
         """Replace the token-embedding module of the wrapped NLLB encoder."""
         self.model.nllb_encoder.embed_tokens = value
-    # We need to modify the adapter config and state dict at runtime
-    # such that adapter weights are correctly loaded from an AutoModel-suitable
-    # adapter_config.json and adapter_config.safetensors
-    def load_adapter(
-        self,
-        peft_model_id: Optional[str] = None,
-        adapter_name: Optional[str] = None,
-        revision: Optional[str] = None,
-        token: Optional[str] = None,
-        device_map: Optional[str] = "auto",
-        max_memory: Optional[str] = None,
-        offload_folder: Optional[str] = None,
-        offload_index: Optional[int] = None,
-        peft_config: Optional[Dict[str, Any]] = None,
-        adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None,
-        adapter_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        from peft import PeftConfig, load_peft_weights  # type: ignore
-        from transformers.utils import find_adapter_config_file
-        if adapter_kwargs is None:
-            adapter_kwargs = {}
-        if "device" not in adapter_kwargs:
-            device = (
-                self.device
-                if not hasattr(self, "hf_device_map")
-                else list(self.hf_device_map.values())[0]
-            )
-        else:
-            device = adapter_kwargs["device"]
-        # To avoid PEFT errors later on with safetensors.
-        if isinstance(device, torch.device):
-            device = str(device)
-        # Override token with adapter_kwargs' token
-        if "token" in adapter_kwargs:
-            token = adapter_kwargs["token"]
-        if peft_model_id is None and (
-            adapter_state_dict is None and peft_config is None
-        ):
-            raise ValueError(
-                "You should either pass a `peft_model_id` or a `peft_config` and `adapter_state_dict` to load an adapter."
-            )
-        if peft_config is None:
-            assert isinstance(peft_model_id, str)
-            adapter_config_file = find_adapter_config_file(
-                peft_model_id,
-                token=token,
-                **adapter_kwargs,
-            )
-            if adapter_config_file is None:
-                raise ValueError(
-                    f"adapter model file not found in {peft_model_id}. Make sure you are passing the correct path to the "
-                    "adapter model."
-                )
-            peft_config = cast(
-                Dict[str, Any],
-                PeftConfig.from_pretrained(
-                    peft_model_id,
-                    token=token,
-                    **adapter_kwargs,
-                ),
-            )
-            peft_config.target_modules = [  # type: ignore
-                "model." + module
-                for module in peft_config.target_modules  # type: ignore
-            ]
-        if peft_model_id is not None:
-            adapter_state_dict = load_peft_weights(
-                peft_model_id, token=token, device=device, **adapter_kwargs
-            )
-        assert isinstance(adapter_state_dict, dict)
-        # correctly set the name
-        processed_adapter_state_dict = {}
-        prefix = "base_model."
-        for key, value in adapter_state_dict.items():
-            if key.startswith(prefix):
-                new_key = key[len(prefix) :]
-            else:
-                new_key = key
-            processed_adapter_state_dict[new_key] = value
-        return super().load_adapter(
-            peft_model_id=None,
-            adapter_name=adapter_name,
-            revision=revision,
-            token=token,
-            device_map=device_map,
-            max_memory=max_memory,
-            offload_folder=offload_folder,
-            offload_index=offload_index,
-            peft_config=peft_config,
-            adapter_state_dict=processed_adapter_state_dict,
-            adapter_kwargs=adapter_kwargs,
-        )
     def forward(
         self,
@@ -420,10 +425,110 @@ class NLLBLLM2VecForSequenceClassification(PreTrainedModel):
             output = (pooled_logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
-        return SequenceClassifierOutputWithPast(
             loss=loss,
             hidden_states=hidden_states,
             logits=pooled_logits,
         )
@@ -431,275 +536,6 @@ AutoModel.register(NLLBLLM2VecConfig, NLLBLLM2Vec)
 AutoModelForSequenceClassification.register(
     NLLBLLM2VecConfig, NLLBLLM2VecForSequenceClassification
 )
-def repl():
-    from transformers import AutoModel
-    cfg = NLLBLLM2VecConfig()
-    model = NLLBLLM2Vec(cfg)
-    nllb = AutoModel.from_pretrained(
-        "facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16
-    ).encoder
-    # llm2vec = AutoModel.from_pretrained(
-    #     "fdschmidt93/LLM2Vec-Meta-Llama-3.1-8B-Instruct-mntp-unsup-simcse",
-    #     trust_remote_code=True,
-    #     torch_dtype=torch.bfloat16,
-    # )
-    llama = LlamaEncoderModel.from_pretrained("../trident-nllb-llm2vec/data/model/llm2vec_llama3-1_unsupervised/", torch_dtype=torch.bfloat16)
-    model.nllb_encoder.load_state_dict(nllb.state_dict())
-    model.llm2vec.load_state_dict(llama.state_dict())
-    ckpt = torch.load("./step=20000-weights.ckpt", map_location="cpu")
-    model.up_proj.load_state_dict({"weight": ckpt["model.up_proj.weight"]})
-    model.save_pretrained("../weights_new")
-    from peft.mapping import get_peft_model
-    from peft.tuners.lora.config import LoraConfig
-    lora_config = LoraConfig(
-        r=16,
-        lora_alpha=32,
-        lora_dropout=0.0,
-        bias="none",
-        task_type="FEATURE_EXTRACTION",
-        target_modules=[
-            "llm2vec.layers.0.self_attn.q_proj",
-            "llm2vec.layers.0.self_attn.k_proj",
-            "llm2vec.layers.0.self_attn.v_proj",
-            "llm2vec.layers.0.self_attn.o_proj",
-            "llm2vec.layers.0.mlp.gate_proj",
-            "llm2vec.layers.0.mlp.up_proj",
-            "llm2vec.layers.0.mlp.down_proj",
-            "llm2vec.layers.1.self_attn.q_proj",
-            "llm2vec.layers.1.self_attn.k_proj",
-            "llm2vec.layers.1.self_attn.v_proj",
-            "llm2vec.layers.1.self_attn.o_proj",
-            "llm2vec.layers.1.mlp.gate_proj",
-            "llm2vec.layers.1.mlp.up_proj",
-            "llm2vec.layers.1.mlp.down_proj",
-            "llm2vec.layers.2.self_attn.q_proj",
-            "llm2vec.layers.2.self_attn.k_proj",
-            "llm2vec.layers.2.self_attn.v_proj",
-            "llm2vec.layers.2.self_attn.o_proj",
-            "llm2vec.layers.2.mlp.gate_proj",
-            "llm2vec.layers.2.mlp.up_proj",
-            "llm2vec.layers.2.mlp.down_proj",
-            "llm2vec.layers.3.self_attn.q_proj",
-            "llm2vec.layers.3.self_attn.k_proj",
-            "llm2vec.layers.3.self_attn.v_proj",
-            "llm2vec.layers.3.self_attn.o_proj",
-            "llm2vec.layers.3.mlp.gate_proj",
-            "llm2vec.layers.3.mlp.up_proj",
-            "llm2vec.layers.3.mlp.down_proj",
-            "llm2vec.layers.4.self_attn.q_proj",
-            "llm2vec.layers.4.self_attn.k_proj",
-            "llm2vec.layers.4.self_attn.v_proj",
-            "llm2vec.layers.4.self_attn.o_proj",
-            "llm2vec.layers.4.mlp.gate_proj",
-            "llm2vec.layers.4.mlp.up_proj",
-            "llm2vec.layers.4.mlp.down_proj",
-            "llm2vec.layers.5.self_attn.q_proj",
-            "llm2vec.layers.5.self_attn.k_proj",
-            "llm2vec.layers.5.self_attn.v_proj",
-            "llm2vec.layers.5.self_attn.o_proj",
-            "llm2vec.layers.5.mlp.gate_proj",
-            "llm2vec.layers.5.mlp.up_proj",
-            "llm2vec.layers.5.mlp.down_proj",
-            "llm2vec.layers.6.self_attn.q_proj",
-            "llm2vec.layers.6.self_attn.k_proj",
-            "llm2vec.layers.6.self_attn.v_proj",
-            "llm2vec.layers.6.self_attn.o_proj",
-            "llm2vec.layers.6.mlp.gate_proj",
-            "llm2vec.layers.6.mlp.up_proj",
-            "llm2vec.layers.6.mlp.down_proj",
-            "llm2vec.layers.7.self_attn.q_proj",
-            "llm2vec.layers.7.self_attn.k_proj",
-            "llm2vec.layers.7.self_attn.v_proj",
-            "llm2vec.layers.7.self_attn.o_proj",
-            "llm2vec.layers.7.mlp.gate_proj",
-            "llm2vec.layers.7.mlp.up_proj",
-            "llm2vec.layers.7.mlp.down_proj",
-            "llm2vec.layers.8.self_attn.q_proj",
-            "llm2vec.layers.8.self_attn.k_proj",
-            "llm2vec.layers.8.self_attn.v_proj",
-            "llm2vec.layers.8.self_attn.o_proj",
-            "llm2vec.layers.8.mlp.gate_proj",
-            "llm2vec.layers.8.mlp.up_proj",
-            "llm2vec.layers.8.mlp.down_proj",
-            "llm2vec.layers.9.self_attn.q_proj",
-            "llm2vec.layers.9.self_attn.k_proj",
-            "llm2vec.layers.9.self_attn.v_proj",
-            "llm2vec.layers.9.self_attn.o_proj",
-            "llm2vec.layers.9.mlp.gate_proj",
-            "llm2vec.layers.9.mlp.up_proj",
-            "llm2vec.layers.9.mlp.down_proj",
-            "llm2vec.layers.10.self_attn.q_proj",
-            "llm2vec.layers.10.self_attn.k_proj",
-            "llm2vec.layers.10.self_attn.v_proj",
-            "llm2vec.layers.10.self_attn.o_proj",
-            "llm2vec.layers.10.mlp.gate_proj",
-            "llm2vec.layers.10.mlp.up_proj",
-            "llm2vec.layers.10.mlp.down_proj",
-            "llm2vec.layers.11.self_attn.q_proj",
-            "llm2vec.layers.11.self_attn.k_proj",
-            "llm2vec.layers.11.self_attn.v_proj",
-            "llm2vec.layers.11.self_attn.o_proj",
-            "llm2vec.layers.11.mlp.gate_proj",
-            "llm2vec.layers.11.mlp.up_proj",
-            "llm2vec.layers.11.mlp.down_proj",
-            "llm2vec.layers.12.self_attn.q_proj",
-            "llm2vec.layers.12.self_attn.k_proj",
-            "llm2vec.layers.12.self_attn.v_proj",
-            "llm2vec.layers.12.self_attn.o_proj",
-            "llm2vec.layers.12.mlp.gate_proj",
-            "llm2vec.layers.12.mlp.up_proj",
-            "llm2vec.layers.12.mlp.down_proj",
-            "llm2vec.layers.13.self_attn.q_proj",
-            "llm2vec.layers.13.self_attn.k_proj",
-            "llm2vec.layers.13.self_attn.v_proj",
-            "llm2vec.layers.13.self_attn.o_proj",
-            "llm2vec.layers.13.mlp.gate_proj",
-            "llm2vec.layers.13.mlp.up_proj",
-            "llm2vec.layers.13.mlp.down_proj",
-            "llm2vec.layers.14.self_attn.q_proj",
-            "llm2vec.layers.14.self_attn.k_proj",
-            "llm2vec.layers.14.self_attn.v_proj",
-            "llm2vec.layers.14.self_attn.o_proj",
-            "llm2vec.layers.14.mlp.gate_proj",
-            "llm2vec.layers.14.mlp.up_proj",
-            "llm2vec.layers.14.mlp.down_proj",
-            "llm2vec.layers.15.self_attn.q_proj",
-            "llm2vec.layers.15.self_attn.k_proj",
-            "llm2vec.layers.15.self_attn.v_proj",
-            "llm2vec.layers.15.self_attn.o_proj",
-            "llm2vec.layers.15.mlp.gate_proj",
-            "llm2vec.layers.15.mlp.up_proj",
-            "llm2vec.layers.15.mlp.down_proj",
-            "llm2vec.layers.16.self_attn.q_proj",
-            "llm2vec.layers.16.self_attn.k_proj",
-            "llm2vec.layers.16.self_attn.v_proj",
-            "llm2vec.layers.16.self_attn.o_proj",
-            "llm2vec.layers.16.mlp.gate_proj",
-            "llm2vec.layers.16.mlp.up_proj",
-            "llm2vec.layers.16.mlp.down_proj",
-            "llm2vec.layers.17.self_attn.q_proj",
-            "llm2vec.layers.17.self_attn.k_proj",
-            "llm2vec.layers.17.self_attn.v_proj",
-            "llm2vec.layers.17.self_attn.o_proj",
-            "llm2vec.layers.17.mlp.gate_proj",
-            "llm2vec.layers.17.mlp.up_proj",
-            "llm2vec.layers.17.mlp.down_proj",
-            "llm2vec.layers.18.self_attn.q_proj",
-            "llm2vec.layers.18.self_attn.k_proj",
-            "llm2vec.layers.18.self_attn.v_proj",
-            "llm2vec.layers.18.self_attn.o_proj",
-            "llm2vec.layers.18.mlp.gate_proj",
-            "llm2vec.layers.18.mlp.up_proj",
-            "llm2vec.layers.18.mlp.down_proj",
-            "llm2vec.layers.19.self_attn.q_proj",
-            "llm2vec.layers.19.self_attn.k_proj",
-            "llm2vec.layers.19.self_attn.v_proj",
-            "llm2vec.layers.19.self_attn.o_proj",
-            "llm2vec.layers.19.mlp.gate_proj",
-            "llm2vec.layers.19.mlp.up_proj",
-            "llm2vec.layers.19.mlp.down_proj",
-            "llm2vec.layers.20.self_attn.q_proj",
-            "llm2vec.layers.20.self_attn.k_proj",
-            "llm2vec.layers.20.self_attn.v_proj",
-            "llm2vec.layers.20.self_attn.o_proj",
-            "llm2vec.layers.20.mlp.gate_proj",
-            "llm2vec.layers.20.mlp.up_proj",
-            "llm2vec.layers.20.mlp.down_proj",
-            "llm2vec.layers.21.self_attn.q_proj",
-            "llm2vec.layers.21.self_attn.k_proj",
-            "llm2vec.layers.21.self_attn.v_proj",
-            "llm2vec.layers.21.self_attn.o_proj",
-            "llm2vec.layers.21.mlp.gate_proj",
-            "llm2vec.layers.21.mlp.up_proj",
-            "llm2vec.layers.21.mlp.down_proj",
-            "llm2vec.layers.22.self_attn.q_proj",
-            "llm2vec.layers.22.self_attn.k_proj",
-            "llm2vec.layers.22.self_attn.v_proj",
-            "llm2vec.layers.22.self_attn.o_proj",
-            "llm2vec.layers.22.mlp.gate_proj",
-            "llm2vec.layers.22.mlp.up_proj",
-            "llm2vec.layers.22.mlp.down_proj",
-            "llm2vec.layers.23.self_attn.q_proj",
-            "llm2vec.layers.23.self_attn.k_proj",
-            "llm2vec.layers.23.self_attn.v_proj",
-            "llm2vec.layers.23.self_attn.o_proj",
-            "llm2vec.layers.23.mlp.gate_proj",
-            "llm2vec.layers.23.mlp.up_proj",
-            "llm2vec.layers.23.mlp.down_proj",
-            "llm2vec.layers.24.self_attn.q_proj",
-            "llm2vec.layers.24.self_attn.k_proj",
-            "llm2vec.layers.24.self_attn.v_proj",
-            "llm2vec.layers.24.self_attn.o_proj",
-            "llm2vec.layers.24.mlp.gate_proj",
-            "llm2vec.layers.24.mlp.up_proj",
-            "llm2vec.layers.24.mlp.down_proj",
-            "llm2vec.layers.25.self_attn.q_proj",
-            "llm2vec.layers.25.self_attn.k_proj",
-            "llm2vec.layers.25.self_attn.v_proj",
-            "llm2vec.layers.25.self_attn.o_proj",
-            "llm2vec.layers.25.mlp.gate_proj",
-            "llm2vec.layers.25.mlp.up_proj",
-            "llm2vec.layers.25.mlp.down_proj",
-            "llm2vec.layers.26.self_attn.q_proj",
-            "llm2vec.layers.26.self_attn.k_proj",
-            "llm2vec.layers.26.self_attn.v_proj",
-            "llm2vec.layers.26.self_attn.o_proj",
-            "llm2vec.layers.26.mlp.gate_proj",
-            "llm2vec.layers.26.mlp.up_proj",
-            "llm2vec.layers.26.mlp.down_proj",
-            "llm2vec.layers.27.self_attn.q_proj",
-            "llm2vec.layers.27.self_attn.k_proj",
-            "llm2vec.layers.27.self_attn.v_proj",
-            "llm2vec.layers.27.self_attn.o_proj",
-            "llm2vec.layers.27.mlp.gate_proj",
-            "llm2vec.layers.27.mlp.up_proj",
-            "llm2vec.layers.27.mlp.down_proj",
-            "llm2vec.layers.28.self_attn.q_proj",
-            "llm2vec.layers.28.self_attn.k_proj",
-            "llm2vec.layers.28.self_attn.v_proj",
-            "llm2vec.layers.28.self_attn.o_proj",
-            "llm2vec.layers.28.mlp.gate_proj",
-            "llm2vec.layers.28.mlp.up_proj",
-            "llm2vec.layers.28.mlp.down_proj",
-            "llm2vec.layers.29.self_attn.q_proj",
-            "llm2vec.layers.29.self_attn.k_proj",
-            "llm2vec.layers.29.self_attn.v_proj",
-            "llm2vec.layers.29.self_attn.o_proj",
-            "llm2vec.layers.29.mlp.gate_proj",
-            "llm2vec.layers.29.mlp.up_proj",
-            "llm2vec.layers.29.mlp.down_proj",
-            "llm2vec.layers.30.self_attn.q_proj",
-            "llm2vec.layers.30.self_attn.k_proj",
-            "llm2vec.layers.30.self_attn.v_proj",
-            "llm2vec.layers.30.self_attn.o_proj",
-            "llm2vec.layers.30.mlp.gate_proj",
-            "llm2vec.layers.30.mlp.up_proj",
-            "llm2vec.layers.30.mlp.down_proj",
-            "llm2vec.layers.31.self_attn.q_proj",
-            "llm2vec.layers.31.self_attn.k_proj",
-            "llm2vec.layers.31.self_attn.v_proj",
-            "llm2vec.layers.31.self_attn.o_proj",
-            "llm2vec.layers.31.mlp.gate_proj",
-            "llm2vec.layers.31.mlp.up_proj",
-            "llm2vec.layers.31.mlp.down_proj",
-        ],
-    )
-    peft_model = get_peft_model(model, lora_config)
-    peft_model.save_pretrained("../nllb-llm2vec-saved")
-    import json
-    with open("./model.safetensors.index.json", "r") as f:
-        print(json.load(f))
-    from transformers import AutoModelForSequenceClassification
-    model = AutoModelForSequenceClassification.from_pretrained(
-        ".", trust_remote_code=True, device_map="cuda"
-    )

+import math
+import warnings
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import transformers
+from packaging import version
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
     BaseModelOutputWithPooling,
+    ModelOutput,
     SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto import AutoModel, AutoModelForSequenceClassification
 from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder
+from transformers.tokenization_utils import BatchEncoding
 from .configuration_nllbllm2vec import NLLBLLM2VecConfig
 from .modeling_llama_encoder import LlamaEncoderModel
+DEFAULT_TOKENIZE_KWARGS = {
+    "padding": True,
+    "truncation": True,
+    "max_length": 512,
+    "return_tensors": "pt",
+}
+DEFAULT_DATALOADER_KWARGS = {
+    "shuffle": False,
+    "batch_size": 32,
+    "pin_memory": True,
+}
+def default_collate_fn_closure(tokenizer, tokenize_kwargs) -> Callable:
+    def collate_fn(batch: list[str]) -> BatchEncoding:
+        return tokenizer(batch, **tokenize_kwargs)
+    return collate_fn
+def defaulter(kwd_dict: Optional[Dict], default_dict: Dict) -> Dict:
+    return default_dict if kwd_dict is None else {**default_dict, **kwd_dict}
+@dataclass
+class SequenceClassifierOutputWithPastAndPooler(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    pooler_output: torch.FloatTensor = None
 class NLLBLLM2Vec(PreTrainedModel):
     config_class = NLLBLLM2VecConfig
     model_type = "nllb-llm2vec"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
     """
     NLLBLLM2Vec model combining NLLB and LLama encoders.
         if config is not None:
             super().__init__(config, *inputs, **kwargs)
+            # from_pretrained overwrites this after config instantiation, so we make sure it's correctly set
+            config.nllb_config._attn_implementation = config._attn_implementation
+            config.llm2vec_config._attn_implementation = config._attn_implementation
             self.nllb_encoder = nllb_encoder or M2M100Encoder(config.nllb_config)
             self.llm2vec = llm2vec or LlamaEncoderModel(config.llm2vec_config)
             self.config = config
         else:
             # Both encoders are provided
             self.nllb_encoder = cast(M2M100Encoder, nllb_encoder)
             self.llm2vec.config.hidden_size,
             bias=False,
         )
+        # TODO: update this once commit is included
+        min_version = "4.46.0"
+        if self.config.nllb_config._attn_implementation == "flash_attention_2":
+            if version.parse(transformers.__version__) < version.parse(min_version):
+                warnings.warn(
+                    f"Installed transformers version ({transformers.__version__}) never sets NLLB-encoder dropout to `False` with FlashAttention2. See https://github.com/huggingface/transformers/pull/33844 for more info. Consider upgrading to latest to {min_version} or master.",
+                    UserWarning,
+                )
     def forward(
         self,
         else:
             seq_indices, seq_offsets = indices
+        nllb_outputs = self.nllb_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        nllb_last_hidden_state = nllb_outputs.last_hidden_state
+        nllb_last_hidden_state = self.up_proj(nllb_last_hidden_state)
         outputs = self.llm2vec(
             inputs_embeds=nllb_last_hidden_state,
             attention_mask=attention_mask,
         self,
         inputs: List[str],
         src_lang: str = "eng_Latn",
+        dataloader_kwargs: Optional[Dict[str, Any]] = None,
         tokenize_kwargs: Optional[Dict[str, Any]] = None,
+        collate_fn_closure: Optional[Callable] = None,
     ) -> torch.Tensor:
         """
         Encode input texts into embeddings.
         Args:
             inputs (List[str]): List of input texts.
+            src_lang (str): Source language code for the tokenizer (default: `"eng_Latn"`).
+            dataloader_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for the dataloader excl. `collate_fn`.
+                Defaults to:
+                >>    dataloader_kwargs = {
+                >>        "shuffle": False,
+                >>        "batch_size": 32,
+                >>        "pin_memory": True,
+                >>    }
             tokenize_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for the tokenizer.
                 Defaults to:
                 >>    tokenize_kwargs = {
                 >>        "max_length": 512,
                 >>        "return_tensors": "pt",
                 >>    }
+            collate_fn_closure (Optional[Callable]): Closure that should return a `collate_fn`.
+                Defaults to:
+                >>    def default_collate_fn_closure(tokenizer, tokenize_kwargs) -> Callable:
+                >>        def collate_fn(batch: list[str]) -> BatchEncoding:
+                >>            return tokenizer(batch, **tokenize_kwargs)
+                >>        return collate_fn
         Returns:
             torch.Tensor: Mean-pooled sequence embeddings of the inputs.
         """
+        # merge user kwargs with defaults, giving priority to user kwargs
+        tokenize_kwargs = defaulter(tokenize_kwargs, DEFAULT_TOKENIZE_KWARGS)
+        dataloader_kwargs = defaulter(dataloader_kwargs, DEFAULT_DATALOADER_KWARGS)
         tokenizer = self.tokenizer
         tokenizer.src_lang = src_lang
         device = next(self.parameters()).device
+        if collate_fn_closure is None:
+            collate_fn = default_collate_fn_closure(tokenizer, tokenize_kwargs)
+        else:
+            collate_fn = collate_fn_closure(tokenizer, tokenize_kwargs)
+        assert (
+            "collate_fn" not in dataloader_kwargs
+        ), "`collate_fn` should be created via `collate_fn_closure`"
+        self.eval()
+        if len(inputs) > dataloader_kwargs.get("batch_size", 1):
+            dataloader = DataLoader(inputs, collate_fn=collate_fn, **dataloader_kwargs)  # type: ignore
+            all_embeddings = []
+            # Iterate through the dataloader with a progress bar and autocast
+            with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
+                for batch in tqdm(dataloader, desc="Encoding"):
+                    # Move batch to device
+                    batch = {k: v.to(device) for k, v in batch.items()}
+                    # Forward pass through the model (assumes model returns embeddings)
+                    with torch.inference_mode():
+                        pooled_embeddings = cast(
+                            SequenceClassifierOutputWithPastAndPooler, self(**batch)
+                        ).pooler_output  # Assuming model returns sequence embeddings
+                    all_embeddings.append(pooled_embeddings)
+            # Concatenate all pooled embeddings along the batch dimension
+            all_embeddings = torch.cat(all_embeddings, dim=0)
+        else:
+            batch = {k: v.to(device) for k, v in collate_fn(inputs)}
+            with torch.inference_mode():
+                all_embeddings = cast(
+                    SequenceClassifierOutputWithPastAndPooler, self(**batch)
+                ).pooler_output  # Assuming model returns sequence embeddings
+        return all_embeddings
     @staticmethod
     def _get_input_offsets(
         non_padded_lengths = attention_mask.sum(
             dim=1
         )  # Count non-padded tokens per sequence
+        offsets = non_padded_lengths.cumsum(dim=0).roll(shifts=1)
+        offsets[0] = 0
         return input_indices, offsets
     @staticmethod
     config_class = NLLBLLM2VecConfig
     model_type = "nllb-llm2vec"
     base_model_prefix = "model"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
     def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels
         self.model = NLLBLLM2Vec(config)
         self.score = nn.Linear(
             config.llm2vec_config.hidden_size, self.num_labels, bias=False
         # Initialize weights and apply final processing
         self.post_init()
+    def _init_weights(self, module):
+        if module is self.score:
+            # INFO:
+            # - critical that clf head is in float32 (NusaX perf. drops funky otherwise)
+            # - Initialization needs to be redone, otherwise borked
+            #   - Use kaiming uniform, b/c Llama init (cf. `nn.Linear` below) performs worse
+            self.score = self.score.to(torch.float32)
+            torch.nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+        elif isinstance(module, nn.Linear):
+            if isinstance(module, nn.Linear):
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+                if module.bias is not None:
+                    module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
     def get_input_embeddings(self):
         """Return the input token-embedding module, ``self.model.nllb.embed_tokens``."""
         nllb = self.model.nllb
         return nllb.embed_tokens
     def set_input_embeddings(self, value):
         """Replace the input token-embedding module, ``self.model.nllb.embed_tokens``, with ``value``."""
         setattr(self.model.nllb, "embed_tokens", value)
     def forward(
         self,
             output = (pooled_logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPastAndPooler(
             loss=loss,
             hidden_states=hidden_states,
             logits=pooled_logits,
+            pooler_output=transformer_outputs.pooler_output,
+        )
+class NLLBLLM2VecForTokenClassification(PreTrainedModel):
+    config_class = NLLBLLM2VecConfig
+    model_type = "nllb-llm2vec"
+    base_model_prefix = "model"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def __init__(self, config: NLLBLLM2VecConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = NLLBLLM2Vec(config)
+        self.classifier = nn.Linear(
+            config.llm2vec_config.hidden_size, self.num_labels, bias=False
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+    def _init_weights(self, module):
+        if module is self.classifier:
+            # INFO:
+            # - critical that clf head is in float32 (NusaX perf. drops funky otherwise)
+            # - Initialization needs to be redone, otherwise borked
+            #   - Use kaiming uniform, b/c Llama init (cf. `nn.Linear` below) performs worse
+            self.classifier = self.classifier.to(torch.float32)
+            torch.nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+        elif isinstance(module, nn.Linear):
+            if isinstance(module, nn.Linear):
+                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+                if module.bias is not None:
+                    module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def get_input_embeddings(self):
+        return self.model.nllb.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.nllb.embed_tokens = value
+    # adapted from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification
+    # - removed classifier dropout
+    # - use F.cross_entropy
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )
 # Imported here so this registration block is self-contained.
 from transformers import AutoModelForTokenClassification

 AutoModelForSequenceClassification.register(
     NLLBLLM2VecConfig, NLLBLLM2VecForSequenceClassification
 )
 # The token-classification model must be registered with its own auto class:
 # registering it under `AutoModelForSequenceClassification` again would replace
 # the mapping created above for the same config class.
 AutoModelForTokenClassification.register(
     NLLBLLM2VecConfig, NLLBLLM2VecForTokenClassification
 )