Matt committed
Commit 12c569a · Parent(s): 8bda090

Revert to Falcon naming
config.json CHANGED

@@ -6,12 +6,12 @@
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_RW.RWConfig",
-    "AutoModel": "modeling_RW.RWModel",
-    "AutoModelForSequenceClassification": "modeling_RW.RWForSequenceClassification",
-    "AutoModelForTokenClassification": "modeling_RW.RWForTokenClassification",
-    "AutoModelForQuestionAnswering": "modeling_RW.RWForQuestionAnswering",
-    "AutoModelForCausalLM": "modeling_RW.RWForCausalLM"
+    "AutoConfig": "configuration_falcon.FalconConfig",
+    "AutoModel": "modeling_falcon.FalconModel",
+    "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification",
+    "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering",
+    "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM"
   },
   "bias": false,
   "bos_token_id": 11,
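Note: the `auto_map` entries above are what `transformers` uses to resolve this repo's custom classes when a checkpoint is loaded with `trust_remote_code=True`. A minimal sketch of how the renamed entries get picked up — the repo id is taken from `_CHECKPOINT_FOR_DOC` below and is illustrative only:

```python
# Sketch: resolving the custom Falcon classes through auto_map (illustrative repo id).
from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True makes transformers import configuration_falcon.py /
# modeling_falcon.py from the repo instead of using its built-in model classes.
config = AutoConfig.from_pretrained("Rocketknight1/falcon-rw-1b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Rocketknight1/falcon-rw-1b", trust_remote_code=True)

print(type(config).__name__)  # expected: FalconConfig
print(type(model).__name__)   # expected: FalconForCausalLM
```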
configuration_RW.py → configuration_falcon.py RENAMED

@@ -25,7 +25,7 @@ FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class RWConfig(PretrainedConfig):
+class FalconConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -80,10 +80,10 @@ class RWConfig(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import FalconModel, RWConfig
+    >>> from transformers import FalconModel, FalconConfig
 
     >>> # Initializing a small (2-layer) Falcon configuration
-    >>> configuration = RWConfig(num_hidden_layers=2)
+    >>> configuration = FalconConfig(num_hidden_layers=2)
 
     >>> # Initializing a model from the small configuration
     >>> model = FalconModel(configuration)
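As a quick sanity check, the renamed class can be exercised exactly as the updated docstring example shows. A minimal sketch — the direct module import assumes a local clone of this repo on `sys.path`:

```python
# Minimal sketch, assuming configuration_falcon.py from this repo is importable locally.
from configuration_falcon import FalconConfig

# Mirrors the docstring example above: a small 2-layer Falcon configuration.
configuration = FalconConfig(num_hidden_layers=2)

print(configuration.num_hidden_layers)   # 2
print(configuration.__class__.__name__)  # FalconConfig
```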
modeling_RW.py → modeling_falcon.py RENAMED

@@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from .configuration_RW import RWConfig
+from .configuration_falcon import FalconConfig
 
 
 logger = logging.get_logger(__name__)
@@ -46,7 +46,7 @@ FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "tiiuae/falcon-rw-1b",
 ]
 _CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b"
-_CONFIG_FOR_DOC = "RWConfig"
+_CONFIG_FOR_DOC = "FalconConfig"
 
 
 # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations.
@@ -188,7 +188,7 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 
 
 class FalconAttention(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
 
         self.hidden_size = config.hidden_size
@@ -396,7 +396,7 @@ class FalconAttention(nn.Module):
 
 
 class FalconMLP(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
 
@@ -412,7 +412,7 @@ class FalconMLP(nn.Module):
 
 
 class FalconDecoderLayer(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -499,7 +499,7 @@ FALCON_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`RWConfig`]): Model configuration class with all the parameters of the model.
+        config ([`FalconConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
@@ -559,13 +559,13 @@ FALCON_INPUTS_DOCSTRING = r"""
 """
 
 
-class RWPreTrainedModel(PreTrainedModel):
+class FalconPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
     """
 
-    config_class = RWConfig
+    config_class = FalconConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["FalconDecoderLayer"]
@@ -589,9 +589,9 @@ class RWPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->RWModel
+    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->FalconModel
     def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, RWModel):
+        if isinstance(module, FalconModel):
             module.gradient_checkpointing = value
 
     @staticmethod
@@ -635,8 +635,8 @@ class RWPreTrainedModel(PreTrainedModel):
     "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
     FALCON_START_DOCSTRING,
 )
-class RWModel(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconModel(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
 
         self.embed_dim = config.hidden_size
@@ -835,12 +835,12 @@ class RWModel(RWPreTrainedModel):
     "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
     FALCON_START_DOCSTRING,
 )
-class RWForCausalLM(RWPreTrainedModel):
+class FalconForCausalLM(FalconPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
@@ -965,7 +965,7 @@ class RWForCausalLM(RWPreTrainedModel):
     """
     The Falcon Model transformer with a sequence classification head on top (linear layer).
 
-    [`RWForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-1) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -976,11 +976,11 @@ class RWForCausalLM(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForSequenceClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForSequenceClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1092,12 +1092,12 @@ class RWForSequenceClassification(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForTokenClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForTokenClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
 
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:
@@ -1181,10 +1181,10 @@ class RWForTokenClassification(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForQuestionAnswering(RWPreTrainedModel):
+class FalconForQuestionAnswering(FalconPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
 
         # Initialize weights and apply final processing