Commit cba2f63
Parent(s): 11599d0
Update modelling_RW.py

modelling_RW.py CHANGED (+49 -49)

@@ -52,11 +52,10 @@ class RotaryEmbedding(torch.nn.Module):
 
     def __init__(
         self,
-        config,
+        head_dim: int,
         base=10000,
+        use_cache=False,
     ):
-        head_dim = config.head_dim
-        self.use_cache = config.use_cache
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
@@ -65,6 +64,7 @@ class RotaryEmbedding(torch.nn.Module):
         self.batch_size_cached = None
         self.cos_cached: torch.Tensor | None = None
         self.sin_cached: torch.Tensor | None = None
+        self.use_cache = use_cache
 
     def cos_sin(
         self,
@@ -107,10 +107,7 @@ class RotaryEmbedding(torch.nn.Module):
     def forward(self, q, k):
         batch, seq_len, head_dim = q.shape
         cos, sin = self.cos_sin(seq_len, q.device, q.dtype)
-        try:
-            return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
-        except Exception as e:
-            raise
+        return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
 
 
 def _make_causal_mask(
@@ -187,7 +184,7 @@ class Attention(nn.Module):
                 f" {self.num_heads})."
             )
 
-        self.maybe_rotary = RotaryEmbedding(config) if config.rotary else lambda q, k: (q, k)
+        self.maybe_rotary = RotaryEmbedding(config.head_dim) if config.rotary else lambda q, k: (q, k)
 
         # Layer-wise attention scaling
         self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
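
Note on the rotary change above (an editor's sketch, not part of the commit): `RotaryEmbedding` now takes `head_dim` directly instead of a config object, so the attention module constructs it as `RotaryEmbedding(config.head_dim)`. The inverse-frequency buffer it registers can be reproduced standalone, assuming only `torch` and an illustrative `head_dim` of 64:

    import torch

    head_dim = 64  # illustrative per-head width, e.g. hidden_size // n_head
    base = 10000

    # Same formula as the registered "inv_freq" buffer: one frequency per even channel.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    print(inv_freq.shape)  # torch.Size([32])
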
@@ -195,34 +192,44 @@ class Attention(nn.Module):
 
         self.query_key_value = Linear(
             self.hidden_size,
-
+            (config.n_head_kv * 2 + config.n_head) * self.head_dim,
             bias=config.bias,
         )
-        self.multi_query = config.multi_query
         self.dense = Linear(self.hidden_size, self.hidden_size, bias=config.bias)
         self.attention_dropout = nn.Dropout(config.attention_dropout)
-        self.num_kv = config.
+        self.num_kv = config.n_head_kv
 
     def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
-        Split the last dimension into (num_heads, head_dim)
+        Split the last dimension into (num_heads, head_dim), results share same memory
         storage as `fused_qkv`
 
         Args:
             fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
 
         Returns:
-            query: [batch_size, seq_length, num_heads, head_dim]
+            query: [batch_size, seq_length, num_heads, head_dim]
+            key: [batch_size, seq_length, num_heads, head_dim]
             value: [batch_size, seq_length, num_heads, head_dim]
         """
-
-
-
-
-
-
-
-
+        batch, seq_len, _ = fused_qkv.shape
+        qkv = fused_qkv.view(batch, seq_len, -1, self.num_heads // self.num_kv + 2, 64)
+        q = qkv[:, :, :, :-2]
+        k = qkv[:, :, :, [-2]]
+        v = qkv[:, :, :, [-1]]
+        k = torch.broadcast_to(k, q.shape)
+        v = torch.broadcast_to(v, q.shape)
+
+        q, k, v = [
+            rearrange(
+                x,
+                "batch seq_len group num_heads head_dim ->\
+                batch seq_len (group num_heads) head_dim",
+                head_dim=self.head_dim,
+            )
+            for x in [q, k, v]
+        ]
+        return q, k, v
 
     def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
         """
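
For orientation (a sketch under assumed sizes, not code from the commit): the new `_split_heads` assumes the fused QKV projection packs, for each of `n_head_kv` groups, `n_head // n_head_kv` query heads followed by one key head and one value head, then broadcasts that key/value pair across the group's query heads; the hard-coded 64 in the view corresponds to the model's head_dim. The slicing can be reproduced with made-up sizes:

    import torch

    batch, seq_len = 2, 3
    n_head, n_head_kv, head_dim = 8, 2, 64

    # Fused projection width: (queries per group + 1 key + 1 value) per KV group.
    fused = torch.randn(batch, seq_len, n_head_kv * (n_head // n_head_kv + 2) * head_dim)
    qkv = fused.view(batch, seq_len, n_head_kv, n_head // n_head_kv + 2, head_dim)

    q = qkv[:, :, :, :-2]                              # [batch, seq, group, heads_per_group, head_dim]
    k = torch.broadcast_to(qkv[:, :, :, [-2]], q.shape)
    v = torch.broadcast_to(qkv[:, :, :, [-1]], q.shape)
    print(q.shape, k.shape, v.shape)                   # all torch.Size([2, 3, 2, 4, 64])
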
@@ -268,11 +275,11 @@ class Attention(nn.Module):
 
         query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
         key_layer = key_layer.transpose(1, 2).reshape(
-            batch_size * self.
+            batch_size * self.num_heads,
             q_length,
             self.head_dim,
         )
-        value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.
+        value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
 
         query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
 
@@ -293,15 +300,12 @@ class Attention(nn.Module):
 
         if alibi is None:
             query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
-            key_layer_ = key_layer.reshape(batch_size, self.
-            value_layer_ = value_layer.reshape(batch_size, self.
+            key_layer_ = key_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
+            value_layer_ = value_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
 
-            try:
-
-
-                )
-            except Exception as e:
-                raise
+            attn_output = F.scaled_dot_product_attention(
+                query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
+            )
 
             x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
             x = x.permute(0, 2, 1, 3)
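
Context for the block above (not from the commit itself): `F.scaled_dot_product_attention`, available since PyTorch 2.0, replaces the removed try/except path when no alibi bias is used; the positional arguments pass no explicit mask, zero attention dropout, and `is_causal=True` for internal causal masking. A minimal self-contained call with illustrative shapes:

    import torch
    import torch.nn.functional as F

    # Illustrative [batch, heads, seq, head_dim] tensors.
    q = torch.randn(2, 4, 5, 64)
    k = torch.randn(2, 4, 5, 64)
    v = torch.randn(2, 4, 5, 64)

    # None -> no explicit mask, 0.0 -> no dropout, is_causal=True -> lower-triangular masking.
    out = F.scaled_dot_product_attention(q, k, v, None, 0.0, is_causal=True)
    print(out.shape)  # torch.Size([2, 4, 5, 64])
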
@@ -326,7 +330,8 @@ class Attention(nn.Module):
             attention_scores = attention_scores.to(torch.float32)
             # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
             attention_probs = F.softmax(
-                (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * self.inv_norm_factor
+                (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * self.inv_norm_factor
+                + attention_mask_float,
                 dim=-1,
                 dtype=hidden_states.dtype,
             )
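
Aside (illustrative only, not from the diff): the alibi branch keeps the additive-mask convention, where `attention_mask_float` is 0 at visible positions and a large negative number at masked ones, so adding it before the softmax drives masked probabilities to zero:

    import torch
    import torch.nn.functional as F

    scores = torch.zeros(1, 1, 1, 4)                   # uniform raw scores
    mask = torch.tensor([[[[0.0, 0.0, -1e9, -1e9]]]])  # last two positions masked
    print(F.softmax(scores + mask, dim=-1))            # ~[0.5, 0.5, 0.0, 0.0]
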
@@ -375,14 +380,12 @@ class DecoderLayer(nn.Module):
         super().__init__()
         hidden_size = config.hidden_size
 
-        self.
+        self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
         self.num_heads = config.n_head
         self.self_attention = Attention(config)
 
-        if not config.parallel_attn:
-            # unused if parallel attn
-            self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-
         self.mlp = MLP(config)
 
         self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
@@ -401,12 +404,14 @@ class DecoderLayer(nn.Module):
         output_attentions: bool = False,
     ):
 
-
+        ln_attn = self.ln_attn(hidden_states)
+        ln_mlp = self.ln_mlp(hidden_states)
+
         residual = hidden_states
 
         # Self attention.
         attn_outputs = self.self_attention(
-
+            ln_attn,
             layer_past=layer_past,
             attention_mask=attention_mask,
             alibi=alibi,
@@ -417,19 +422,14 @@ class DecoderLayer(nn.Module):
 
         attention_output = attn_outputs[0]
 
-        if not self.config.parallel_attn:
-            residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training)
-            layernorm_output = self.post_attention_layernorm(residual)
-
         outputs = attn_outputs[1:]
 
         # MLP.
-        mlp_output = self.mlp(
-
-        if self.config.parallel_attn:
-            mlp_output += attention_output
+        mlp_output = self.mlp(ln_mlp)
 
-        output = dropout_add(
+        output = dropout_add(
+            mlp_output + attention_output, residual, self.config.hidden_dropout, training=self.training
+        )
 
         if use_cache:
             outputs = (output,) + outputs
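
Reading note on the decoder change (a stand-in sketch, not the model code): the rewritten layer normalizes the same hidden states twice (`ln_attn`, `ln_mlp`), runs attention and MLP as parallel branches, and folds their sum into a single residual update, replacing the sequential post-attention-layernorm path that was deleted. The dataflow with toy modules:

    import torch
    import torch.nn as nn

    hidden_size = 16
    ln_attn, ln_mlp = nn.LayerNorm(hidden_size), nn.LayerNorm(hidden_size)
    attention = nn.Linear(hidden_size, hidden_size)  # stand-in for self-attention
    mlp = nn.Linear(hidden_size, hidden_size)        # stand-in for the MLP block

    hidden_states = torch.randn(2, 3, hidden_size)
    residual = hidden_states

    attention_output = attention(ln_attn(hidden_states))
    mlp_output = mlp(ln_mlp(hidden_states))
    # dropout_add with p=0 reduces to a plain residual add of both branches.
    output = residual + (mlp_output + attention_output)
    print(output.shape)  # torch.Size([2, 3, 16])
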
@@ -1120,4 +1120,4 @@ class RWForQuestionAnswering(RWPreTrainedModel):
             end_logits=end_logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )