OpenNLPLab
/

TransNormerLLM3-15B-Intermediate-Checkpoints

@@ -53,8 +53,13 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "TransnormerConfig"
 use_triton = eval(os.environ.get("use_triton", default="True"))
 debug = eval(os.environ.get("debug", default="False"))
 if use_triton:
     try:
@@ -84,6 +89,7 @@ if not has_lightning_attention:
 ########## start Transnormer
 ##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
 class Lrpe(nn.Module):
     def __init__(
         self,
         num_heads=8,
@@ -93,9 +99,8 @@ class Lrpe(nn.Module):
         d = num_heads * embed_dim
         self.index = torch.empty(0)
-        self.theta = nn.Parameter(
-            10000 ** (-2 / d * torch.arange(d)).reshape(num_heads, 1, -1)
-        )
     def extra_repr(self):
         return print_module(self)
@@ -114,6 +119,7 @@ class Lrpe(nn.Module):
 class GLU(nn.Module):
     def __init__(self, d1, d2, bias=False):
         super().__init__()
         if debug:
@@ -136,6 +142,7 @@ class GLU(nn.Module):
 class NormLinearAttention(nn.Module):
     def __init__(
         self,
         embed_dim,
@@ -169,7 +176,7 @@ class NormLinearAttention(nn.Module):
             )
         self.qkv_proj = nn.Linear(embed_dim, 3 * hidden_dim, bias=bias)
-        self.output_gate =  nn.Sequential(
             nn.Linear(embed_dim, gate_dim, bias=bias),
             nn.Linear(gate_dim, hidden_dim, bias=bias),
         )
@@ -187,7 +194,6 @@ class NormLinearAttention(nn.Module):
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
     ):
-        do_eval = eval(os.environ.get("do_eval", default="False"))
         if (not self.training) and (not do_eval):
             return self.inference(
                 x,
@@ -203,11 +209,11 @@ class NormLinearAttention(nn.Module):
         # linear map
         qkv = self.act(self.qkv_proj(x))
         q, k, v = qkv.split([d, d, d], dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
-        )
         q_offset = 0
         # lrpe relies on position, get cache first
@@ -222,12 +228,12 @@ class NormLinearAttention(nn.Module):
         # lrpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=q_offset)
-            k = self.lrpe(k)
         if attn_padding_mask is not None:
             v = v.masked_fill(
-                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(torch.bool), 0
-            )
         if not has_lightning_attention:
             if attn_mask == None:
@@ -236,9 +242,8 @@ class NormLinearAttention(nn.Module):
                 attn_mask = torch.exp(slope_rate * attn_mask)
             output = linear_attention(q, k, v, attn_mask)
         else:
-            output = lightning_attention(
-                q, k, v, True, slope_rate.squeeze(-1).squeeze(-1)
-            )
         # reshape
         output = rearrange(output, "b h n d -> b n (h d)")
@@ -257,14 +262,14 @@ class NormLinearAttention(nn.Module):
         return output, attn_weights, past_key_value
     def inference(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
-        attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
-        output_attentions: bool = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        use_cache: bool = False,
-        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         # x: b n d
         b, n, d = x.shape
@@ -273,13 +278,13 @@ class NormLinearAttention(nn.Module):
         q, k, v = qkv.split([d, d, d], dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
-        )
         # rpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=self.offset)
-            k = self.lrpe(k)
         if past_key_value == None:
             self.offset = q.shape[-2]
@@ -290,38 +295,47 @@ class NormLinearAttention(nn.Module):
         # only use for the first time
         if past_key_value == None:
-            if attn_mask == None:
-                attn_mask = (torch.tril(torch.ones(n, n))).to(q)
-            if slope_rate != None:
-                attn_mask = torch.exp(slope_rate * attn_mask)
             if attn_padding_mask is not None:
-                attn_mask = attn_mask.masked_fill(
-                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(2).to(torch.bool),
-                    0,
-                )
-            energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
-            if attn_mask != None:
-                energy = energy * attn_mask
-            output = torch.einsum("... n m, ... m d -> ... n d", energy, v)
-            eval_and_not_generate = eval(
-                os.environ.get("eval_and_not_generate", default="False")
-            )
-            if eval_and_not_generate:
-                kv = None
-            else:
-                # b, h, n, e, d
-                kv_outproduct = torch.einsum("... n e, ... n d -> ... n e d", k, v)
-                # 1, 1, n, 1, 1
-                index = torch.arange(n - 1, -1, -1).reshape(1, 1, -1, 1, 1).to(x)
-                # (h, 1, 1) -> (1, h, 1, 1, 1); (1, h, 1, 1, 1), (1, 1, n, 1, 1) -> (1, h, n, 1, 1)
-                decay = ratio.unsqueeze(0).unsqueeze(-1) ** index
-                kv_outproduct_with_decay = kv_outproduct * decay
-                kv = torch.sum(kv_outproduct_with_decay, dim=-3)
         else:
             kv = past_key_value
@@ -329,12 +343,11 @@ class NormLinearAttention(nn.Module):
             for i in range(n):
                 kv = ratio * kv + torch.einsum(
                     "... n d, ... n e -> ... d e",
-                    k[:, :, i : i + 1],
-                    v[:, :, i : i + 1],
-                )
-                qkv = torch.einsum(
-                    "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv
                 )
                 output.append(qkv)
             output = torch.concat(output, dim=-2)
@@ -353,6 +366,7 @@ class NormLinearAttention(nn.Module):
 class TransnormerDecoderLayer(nn.Module):
     def __init__(self, config: TransnormerConfig):
         super().__init__()
         self.embed_dim = config.decoder_embed_dim
@@ -392,14 +406,14 @@ class TransnormerDecoderLayer(nn.Module):
         return residual + x
     def forward(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None,
-        attn_padding_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         residual = x
         x = self.token_norm(x)
@@ -419,13 +433,13 @@ class TransnormerDecoderLayer(nn.Module):
         x = self.channel_mixer(x)
         x = self.residual_connection(x, residual)
-        outputs = (x,)
         if output_attentions:
-            outputs += (self_attn_weights,)
         if use_cache:
-            outputs += (present_key_value,)
         return outputs
@@ -447,9 +461,7 @@ TRANSNORMER_START_DOCSTRING = r"""
 """
-@add_start_docstrings(
-    TRANSNORMER_START_DOCSTRING,
-)
 class TransnormerPreTrainedModel(PreTrainedModel):
     config_class = TransnormerConfig
     base_model_prefix = "model"
@@ -534,9 +546,7 @@ TRANSNORMER_INPUTS_DOCSTRING = r"""
 """
-@add_start_docstrings(
-    TRANSNORMER_START_DOCSTRING,
-)
 class TransnormerModel(TransnormerPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
@@ -560,29 +570,31 @@ class TransnormerModel(TransnormerPreTrainedModel):
         self.slopes = self._build_slope_tensor(config.decoder_attention_heads)
         # params
-        self.embed_tokens = nn.Embedding(
-            config.vocab_size, config.decoder_embed_dim, self.padding_idx
-        )
         self.layers = nn.ModuleList([])
         for i in range(config.decoder_layers):
             if len(self.linear_use_lrpe_list) > 0:
                 config.linear_use_lrpe = self.linear_use_lrpe_list[i]
             self.layers.append(TransnormerDecoderLayer(config))
-        self.final_norm = get_norm_fn(config.norm_type)(config.decoder_embed_dim)
         self.embed_dim = config.decoder_embed_dim
-        self.embed_scale = (
-            1.0 if config.no_scale_embedding else math.sqrt(self.embed_dim)
-        )
         # Initialize weights and apply final processing
         self.post_init()
     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
         def get_slopes(n):
             def get_slopes_power_of_2(n):
-                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
                 ratio = start
                 return [start * ratio**i for i in range(n)]
@@ -591,18 +603,15 @@ class TransnormerModel(TransnormerPreTrainedModel):
                     n
                 )  # In the paper, we only train models that have 2^a heads for some a. This function has
             else:  # some good properties that only occur when the input is a power of 2. To maintain that even
-                closest_power_of_2 = 2 ** math.floor(
                     math.log2(n)
                 )  # when the number of heads is not a power of 2, we use this workaround.
-                return (
-                    get_slopes_power_of_2(closest_power_of_2)
-                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
-                )
         # h, 1, 1
         slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
-            n_attention_heads, 1, 1
-        )
         return slopes
@@ -615,26 +624,26 @@ class TransnormerModel(TransnormerPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    def _prepare_decoder_linear_attn_mask(
-        self, input_shape, inputs_embeds, past_key_values_length
-    ):
         bsz, tgt_len = input_shape
         src_len = tgt_len + past_key_values_length
         def power_log(x):
-            return 2 ** (math.ceil(math.log(x, 2)))
         n = power_log(max(tgt_len, src_len))
         if self._linear_attn_mask.shape[-1] < n:
             def get_mask(n):
-                mask = torch.triu(torch.zeros(n, n).float().fill_(float("-inf")), 1)
                 # no slope version
                 # -n, ..., -2, -1, 0
                 for i in range(n):
                     x = torch.arange(i + 1)
                     y = x
-                    mask[i, : i + 1] = -torch.flip(y, [0])
                 return mask
@@ -646,7 +655,8 @@ class TransnormerModel(TransnormerPreTrainedModel):
         linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
         num_heads = linear_attn_mask.shape[0]
-        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len, src_len)
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
     def forward(
@@ -660,21 +670,15 @@ class TransnormerModel(TransnormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -696,7 +700,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[-2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             # !!! use embed_scale
             inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
@@ -718,23 +722,23 @@ class TransnormerModel(TransnormerPreTrainedModel):
         ##### norm linear layers
         linear_attn_padding_mask = attn_padding_mask
         linear_attn_mask = self._prepare_decoder_linear_attn_mask(
-            (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
-        slope_rates = [self.slopes.to(input_ids.device) for _ in range(self.num_layers)]
         for idx, layer in enumerate(self.layers):
             if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            past_key_value = (
-                past_key_values[idx] if past_key_values is not None else None
-            )
             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
             layer_outputs = layer(
                 hidden_states,
                 attn_mask=mask,
@@ -748,27 +752,24 @@ class TransnormerModel(TransnormerPreTrainedModel):
             hidden_states = layer_outputs[0]
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
             if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-            # if idx == 0:
-            #     break
         hidden_states = self.final_norm(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
-            all_hidden_states += (hidden_states,)
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
             return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
-                if v is not None
-            )
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -778,6 +779,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
 class TransnormerForCausalLM(TransnormerPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = TransnormerModel(config)
@@ -785,9 +787,9 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
             logging_info(self.model)
         # the lm_head weight is automatically tied to the embed tokens weight
-        self.lm_head = nn.Linear(
-            config.decoder_embed_dim, config.vocab_size, bias=False
-        )
         # Initialize weights and apply final processing
         self.post_init()
@@ -811,9 +813,8 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         return self.model
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -851,19 +852,13 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
         ```"""
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
@@ -894,8 +889,8 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
             loss = loss_fct(shift_logits, shift_labels)
         if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
@@ -922,22 +917,18 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         else:
             model_inputs = {"input_ids": input_ids}
-        model_inputs.update(
-            {
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-            }
-        )
         return model_inputs
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
         for layer_past in past_key_values:
-            reordered_past += (
-                tuple(
-                    past_state.index_select(0, beam_idx) for past_state in layer_past
-                ),
-            )
         return reordered_past

 # Config class name handed to `replace_return_docstrings(config_class=...)`
 # on TransnormerForCausalLM.forward.
 _CONFIG_FOR_DOC = "TransnormerConfig"
+# TODO: fix environment: https://huggingface.co/OpenNLPLab/TransNormerLLM-7B/discussions/1
 # Runtime switches, read once at import time from environment variables.
 # NOTE(review): every value goes through eval(), so the variable's text is
 # executed as a Python expression. Anything other than "True"/"False" can
 # raise or run arbitrary code; a strict string-to-bool parse would be safer.
 # `use_triton` guards the `if use_triton: try:` block that follows.
 use_triton = eval(os.environ.get("use_triton", default="True"))
 # `debug` is checked inside module constructors (e.g. GLU.__init__).
 debug = eval(os.environ.get("debug", default="False"))
 # When false (the default) and the module is not in training mode,
 # NormLinearAttention.forward returns early via self.inference(...).
+do_eval = eval(os.environ.get("do_eval", default="False"))
 # Hoisted to module level by this change; no use of it is visible in the
 # new-side code shown here.
+eval_and_not_generate = eval(
+    os.environ.get("eval_and_not_generate", default="False"))
 # Chunk length for the block-wise loop in NormLinearAttention.inference
 # (NUM_BLOCK = ceil(n / BLOCK)).
+BLOCK = 256
 if use_triton:
     try:
 ########## start Transnormer
 ##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
 class Lrpe(nn.Module):
     def __init__(
         self,
         num_heads=8,
         d = num_heads * embed_dim
         self.index = torch.empty(0)
+        self.theta = nn.Parameter(10000**(-2 / d * torch.arange(d)).reshape(
+            num_heads, 1, -1))
     def extra_repr(self):
         # Delegates to print_module(self), a helper defined outside this view.
         # extra_repr is the nn.Module hook consulted when the module is printed.
         return print_module(self)
 class GLU(nn.Module):
     def __init__(self, d1, d2, bias=False):
         super().__init__()
         if debug:
 class NormLinearAttention(nn.Module):
     def __init__(
         self,
         embed_dim,
             )
         self.qkv_proj = nn.Linear(embed_dim, 3 * hidden_dim, bias=bias)
+        self.output_gate = nn.Sequential(
             nn.Linear(embed_dim, gate_dim, bias=bias),
             nn.Linear(gate_dim, hidden_dim, bias=bias),
         )
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
     ):
         if (not self.training) and (not do_eval):
             return self.inference(
                 x,
         # linear map
         qkv = self.act(self.qkv_proj(x))
         q, k, v = qkv.split([d, d, d], dim=-1)
         # reshape
         q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
+            [q, k, v])
         q_offset = 0
         # lrpe relies on position, get cache first
         # lrpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=q_offset)
+            k = self.lrpe(k, offset=q_offset)
         if attn_padding_mask is not None:
             v = v.masked_fill(
+                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(
+                    torch.bool), 0)
         if not has_lightning_attention:
             if attn_mask == None:
                 attn_mask = torch.exp(slope_rate * attn_mask)
             output = linear_attention(q, k, v, attn_mask)
         else:
+            output = lightning_attention(q, k, v, True,
+                                         slope_rate.squeeze(-1).squeeze(-1))
         # reshape
         output = rearrange(output, "b h n d -> b n (h d)")
         return output, attn_weights, past_key_value
     def inference(
+            self,
+            x,
+            attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+            attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
+            output_attentions: bool = False,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            use_cache: bool = False,
+            slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         # x: b n d
         b, n, d = x.shape
         q, k, v = qkv.split([d, d, d], dim=-1)
         # reshape
         q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
+            [q, k, v])
         # rpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=self.offset)
+            k = self.lrpe(k, offset=self.offset)
         if past_key_value == None:
             self.offset = q.shape[-2]
         # only use for the first time
         if past_key_value == None:
+            slope_rate = slope_rate.to(torch.float32)
             if attn_padding_mask is not None:
+                v = v.masked_fill(
+                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(
+                        torch.bool), 0)
+            NUM_BLOCK = (n + BLOCK - 1) // BLOCK
+            b, h, n, d = q.shape
+            e = v.shape[-1]
+            # other
+            array = torch.arange(BLOCK).to(q) + 1  ## !!!! important
+            q_decay = torch.exp(-slope_rate * array.reshape(-1, 1))
+            k_decay = torch.exp(-slope_rate * (BLOCK - array.reshape(-1, 1)))
+            index = array[:, None] - array[None, :]
+            s_index = slope_rate * index[
+                None,
+                None,
+            ]
+            s_index = torch.where(index >= 0, -s_index, float("-inf"))
+            diag_decay = torch.exp(s_index)
+            kv = torch.zeros(b, h, d, e).to(torch.float32).to(q.device)
+            output = torch.empty((b, h, n, e), dtype=q.dtype, device=q.device)
+            for i in range(NUM_BLOCK):
+                si = i * BLOCK
+                ei = min(si + BLOCK, n)
+                m = ei - si
+                qi = q[:, :, si:ei].contiguous()
+                ki = k[:, :, si:ei].contiguous()
+                vi = v[:, :, si:ei].contiguous()
+                qkv_none_diag = torch.matmul(qi * q_decay[:, :m],
+                                             kv).to(torch.float32)
+                # diag
+                qk = torch.matmul(qi, ki.transpose(-1, -2)).to(
+                    torch.float32) * diag_decay[:, :, :m, :m]
+                qkv_diag = torch.matmul(qk, vi.to(torch.float32))
+                block_decay = torch.exp(-slope_rate * m)
+                output[:, :, si:ei] = qkv_none_diag + qkv_diag
+                kv = block_decay * kv + torch.matmul(
+                    (ki * k_decay[:, -m:]).transpose(-1, -2).to(vi.dtype), vi)
         else:
             kv = past_key_value
             for i in range(n):
                 kv = ratio * kv + torch.einsum(
                     "... n d, ... n e -> ... d e",
+                    k[:, :, i:i + 1],
+                    v[:, :, i:i + 1],
                 )
+                qkv = torch.einsum("... n e, ... e d -> ... n d",
+                                   q[:, :, i:i + 1], kv)
                 output.append(qkv)
             output = torch.concat(output, dim=-2)
 class TransnormerDecoderLayer(nn.Module):
     def __init__(self, config: TransnormerConfig):
         super().__init__()
         self.embed_dim = config.decoder_embed_dim
         return residual + x
     def forward(
+            self,
+            x,
+            attn_mask: Optional[torch.Tensor] = None,
+            attn_padding_mask: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+            slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         residual = x
         x = self.token_norm(x)
         x = self.channel_mixer(x)
         x = self.residual_connection(x, residual)
+        outputs = (x, )
         if output_attentions:
+            outputs += (self_attn_weights, )
         if use_cache:
+            outputs += (present_key_value, )
         return outputs
 """
+@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
 class TransnormerPreTrainedModel(PreTrainedModel):
     config_class = TransnormerConfig
     base_model_prefix = "model"
 """
+@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
 class TransnormerModel(TransnormerPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
         self.slopes = self._build_slope_tensor(config.decoder_attention_heads)
         # params
+        self.embed_tokens = nn.Embedding(config.vocab_size,
+                                         config.decoder_embed_dim,
+                                         self.padding_idx)
         self.layers = nn.ModuleList([])
         for i in range(config.decoder_layers):
             if len(self.linear_use_lrpe_list) > 0:
                 config.linear_use_lrpe = self.linear_use_lrpe_list[i]
             self.layers.append(TransnormerDecoderLayer(config))
+        self.final_norm = get_norm_fn(config.norm_type)(
+            config.decoder_embed_dim)
         self.embed_dim = config.decoder_embed_dim
+        self.embed_scale = (1.0 if config.no_scale_embedding else math.sqrt(
+            self.embed_dim))
         # Initialize weights and apply final processing
         self.post_init()
     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
         def get_slopes(n):
             def get_slopes_power_of_2(n):
+                start = 2**(-(2**-(math.log2(n) - 3)))
                 ratio = start
                 return [start * ratio**i for i in range(n)]
                     n
                 )  # In the paper, we only train models that have 2^a heads for some a. This function has
             else:  # some good properties that only occur when the input is a power of 2. To maintain that even
+                closest_power_of_2 = 2**math.floor(
                     math.log2(n)
                 )  # when the number of heads is not a power of 2, we use this workaround.
+                return (get_slopes_power_of_2(closest_power_of_2) + get_slopes(
+                    2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
         # h, 1, 1
         slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
+            n_attention_heads, 1, 1)
         return slopes
     def set_input_embeddings(self, value):
         # Replace the token-embedding module (self.embed_tokens, created as an
         # nn.Embedding in __init__) with `value`. No validation is performed.
         self.embed_tokens = value
+    def _prepare_decoder_linear_attn_mask(self, input_shape, inputs_embeds,
+                                          past_key_values_length):
         bsz, tgt_len = input_shape
         src_len = tgt_len + past_key_values_length
         def power_log(x):
+            return 2**(math.ceil(math.log(x, 2)))
         n = power_log(max(tgt_len, src_len))
         if self._linear_attn_mask.shape[-1] < n:
             def get_mask(n):
+                mask = torch.triu(
+                    torch.zeros(n, n).float().fill_(float("-inf")), 1)
                 # no slope version
                 # -n, ..., -2, -1, 0
                 for i in range(n):
                     x = torch.arange(i + 1)
                     y = x
+                    mask[i, :i + 1] = -torch.flip(y, [0])
                 return mask
         linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
         num_heads = linear_attn_mask.shape[0]
+        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len,
+                                                      src_len)
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
     def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = (output_attentions if output_attentions is not None
+                             else self.config.output_attentions)
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (return_dict if return_dict is not None else
+                       self.config.use_return_dict)
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[-2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             # !!! use embed_scale
             inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
         ##### norm linear layers
         linear_attn_padding_mask = attn_padding_mask
         linear_attn_mask = self._prepare_decoder_linear_attn_mask(
+            (batch_size, seq_length), inputs_embeds, past_key_values_length)
+        slope_rates = [
+            self.slopes.to(input_ids.device) for _ in range(self.num_layers)
+        ]
         for idx, layer in enumerate(self.layers):
             if output_hidden_states:
+                all_hidden_states += (hidden_states, )
+            past_key_value = (past_key_values[idx]
+                              if past_key_values is not None else None)
             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
             layer_outputs = layer(
                 hidden_states,
                 attn_mask=mask,
             hidden_states = layer_outputs[0]
             if use_cache:
+                next_decoder_cache += (
+                    layer_outputs[2 if output_attentions else 1], )
             if output_attentions:
+                all_self_attns += (layer_outputs[1], )
         hidden_states = self.final_norm(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
+            all_hidden_states += (hidden_states, )
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
             return tuple(
+                v for v in
+                [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
 class TransnormerForCausalLM(TransnormerPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = TransnormerModel(config)
             logging_info(self.model)
         # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(config.decoder_embed_dim,
+                                 config.vocab_size,
+                                 bias=False)
         # Initialize weights and apply final processing
         self.post_init()
         return self.model
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast,
+                               config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
         ```"""
+        output_attentions = (output_attentions if output_attentions is not None
+                             else self.config.output_attentions)
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
+        return_dict = (return_dict if return_dict is not None else
+                       self.config.use_return_dict)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             loss = loss_fct(shift_logits, shift_labels)
         if not return_dict:
+            output = (logits, ) + outputs[1:]
+            return (loss, ) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
         else:
             model_inputs = {"input_ids": input_ids}
+        model_inputs.update({
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache"),
+            "attention_mask": attention_mask,
+        })
         return model_inputs
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         # Build a new cache in which every cached tensor has its dim-0 entries
         # re-selected according to `beam_idx`.
         # past_key_values: iterable of per-layer iterables of tensors.
         # beam_idx: index tensor passed to Tensor.index_select along dim 0.
         # Returns a tuple of per-layer tuples; the inputs are not modified.
         # NOTE(review): presumably invoked by beam search during generation
         # to follow the surviving beams - confirm against the caller.
         reordered_past = ()
         for layer_past in past_key_values:
             # One inner tuple per layer, appended as a single element.
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
         return reordered_past