efederici
/

ipt-350m

@@ -1,27 +1,29 @@
 """A HuggingFace-style model configuration."""
-from typing import Dict, Optional, Union
 from transformers import PretrainedConfig
 attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
 init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
 class MPTConfig(PretrainedConfig):
     model_type = 'mpt'
-    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
         """The MPT configuration class.
         Args:
             d_model (int): The size of the embedding dimension of the model.
             n_heads (int): The number of attention heads.
             n_layers (int): The number of layers in the model.
-            expansion_ratio (int): The ratio of the up/down scale in the MLP.
             max_seq_len (int): The maximum sequence length of the model.
             vocab_size (int): The size of the vocabulary.
             resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
             emb_pdrop (float): The dropout probability for the embedding layer.
             learned_pos_emb (bool): Whether to use learned positional embeddings
-            attn_config (Dict):  A dictionary used to configure the model's attention module:
-                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                 attn_pdrop (float): The dropout probability for the attention layers.
                 attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                 qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
@@ -38,13 +40,15 @@ class MPTConfig(PretrainedConfig):
                     Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                 alibi (bool): Whether to use the alibi bias instead of position embeddings.
                 alibi_bias_max (int): The maximum value of the alibi bias.
             init_device (str): The device to use for parameter initialization.
             logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
             no_bias (bool): Whether to use bias in all layers.
             verbose (int): The verbosity level. 0 is silent.
             embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
             norm_type (str): choose type of norm to use
-            multiquery_attention (bool): Whether to use multiquery attention implementation.
             use_cache (bool): Whether or not the model should return the last key/values attentions
             init_config (Dict): A dictionary used to configure the model initialization:
                 init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
@@ -61,6 +65,7 @@ class MPTConfig(PretrainedConfig):
                 init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                 ---
                 See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
         """
         self.d_model = d_model
         self.n_heads = n_heads
@@ -72,29 +77,36 @@ class MPTConfig(PretrainedConfig):
         self.emb_pdrop = emb_pdrop
         self.learned_pos_emb = learned_pos_emb
         self.attn_config = attn_config
         self.init_device = init_device
         self.logit_scale = logit_scale
         self.no_bias = no_bias
-        self.verbose = verbose
         self.embedding_fraction = embedding_fraction
         self.norm_type = norm_type
         self.use_cache = use_cache
         self.init_config = init_config
         if 'name' in kwargs:
             del kwargs['name']
         if 'loss_fn' in kwargs:
             del kwargs['loss_fn']
         super().__init__(**kwargs)
         self._validate_config()
-    def _set_config_defaults(self, config, config_defaults):
         for (k, v) in config_defaults.items():
             if k not in config:
                 config[k] = v
         return config
-    def _validate_config(self):
         self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
         self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
@@ -115,4 +127,14 @@ class MPTConfig(PretrainedConfig):
         if self.init_config.get('name', None) is None:
             raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
         if not self.learned_pos_emb and (not self.attn_config['alibi']):
-            raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')

 """A HuggingFace-style model configuration."""
+import warnings
+from typing import Any, Dict, Optional, Union
 from transformers import PretrainedConfig
 attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
+ffn_config_defaults: Dict = {'ffn_type': 'mptmlp'}
 init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
 class MPTConfig(PretrainedConfig):
     model_type = 'mpt'
+    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, ffn_config: Dict=ffn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, fc_type: str='torch', verbose: Optional[int]=None, **kwargs: Any):
         """The MPT configuration class.
         Args:
             d_model (int): The size of the embedding dimension of the model.
             n_heads (int): The number of attention heads.
             n_layers (int): The number of layers in the model.
+            expansion_ratio (int): The ratio of the up/down scale in the ffn.
             max_seq_len (int): The maximum sequence length of the model.
             vocab_size (int): The size of the vocabulary.
             resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
             emb_pdrop (float): The dropout probability for the embedding layer.
             learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict): A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention
                 attn_pdrop (float): The dropout probability for the attention layers.
                 attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                 qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                     Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                 alibi (bool): Whether to use the alibi bias instead of position embeddings.
                 alibi_bias_max (int): The maximum value of the alibi bias.
+                kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+            ffn_config (Dict): A dictionary used to configure the model's ffn module:
+                ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp
             init_device (str): The device to use for parameter initialization.
             logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
             no_bias (bool): Whether to use bias in all layers.
             verbose (int): The verbosity level. 0 is silent.
             embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
             norm_type (str): choose type of norm to use
             use_cache (bool): Whether or not the model should return the last key/values attentions
             init_config (Dict): A dictionary used to configure the model initialization:
                 init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                 init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                 ---
                 See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+            fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
         """
         self.d_model = d_model
         self.n_heads = n_heads
         self.emb_pdrop = emb_pdrop
         self.learned_pos_emb = learned_pos_emb
         self.attn_config = attn_config
+        self.ffn_config = ffn_config
         self.init_device = init_device
         self.logit_scale = logit_scale
         self.no_bias = no_bias
         self.embedding_fraction = embedding_fraction
         self.norm_type = norm_type
         self.use_cache = use_cache
         self.init_config = init_config
+        self.fc_type = fc_type
+        if verbose is not None:
+            warnings.warn(DeprecationWarning('verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'))
         if 'name' in kwargs:
             del kwargs['name']
         if 'loss_fn' in kwargs:
             del kwargs['loss_fn']
+        if self.attn_config.get('alibi', False):
+            self.learned_pos_emb = False
+            warnings.warn(f'alibi is turned on, setting `learned_pos_emb` to `False.`')
         super().__init__(**kwargs)
         self._validate_config()
+    def _set_config_defaults(self, config: Dict[str, Any], config_defaults: Dict[str, Any]) -> Dict[str, Any]:
         for (k, v) in config_defaults.items():
             if k not in config:
                 config[k] = v
         return config
+    def _validate_config(self) -> None:
         self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
+        self.ffn_config = self._set_config_defaults(self.ffn_config, ffn_config_defaults)
         self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
         if self.d_model % self.n_heads != 0:
             raise ValueError('d_model must be divisible by n_heads')
         if self.init_config.get('name', None) is None:
             raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
         if not self.learned_pos_emb and (not self.attn_config['alibi']):
+            warnings.warn(f'Positional information not being provided to the model using either learned_pos_emb or alibi.')
+        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            try:
+                import transformer_engine.pytorch as te
+                del te
+            except:
+                raise ImportError('TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' + 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156')
+        if self.ffn_config['ffn_type'] == 'mptmlp':
+            self.ffn_config['fc_type'] = self.fc_type
+        elif self.ffn_config['ffn_type'] == 'te_ln_mlp':
+            self.ffn_config['bias'] = not self.no_bias