| ModernBertForSequenceClassification( |
| (model): ModernBertModel( |
| (embeddings): ModernBertEmbeddings( |
| (tok_embeddings): Embedding(50368, 768, padding_idx=50283) |
| (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| (drop): Dropout(p=0.0, inplace=False) |
| ) |
| (layers): ModuleList( |
| (0): ModernBertEncoderLayer( |
| (attn_norm): Identity() |
| (attn): ModernBertAttention( |
| (Wqkv): Linear(in_features=768, out_features=2304, bias=False) |
| (rotary_emb): ModernBertRotaryEmbedding() |
| (Wo): Linear(in_features=768, out_features=768, bias=False) |
| (out_drop): Identity() |
| ) |
| (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| (mlp): ModernBertMLP( |
| (Wi): Linear(in_features=768, out_features=2304, bias=False) |
| (act): GELUActivation() |
| (drop): Dropout(p=0.0, inplace=False) |
| (Wo): Linear(in_features=1152, out_features=768, bias=False) |
| ) |
| ) |
| (1-21): 21 x ModernBertEncoderLayer( |
| (attn_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| (attn): ModernBertAttention( |
| (Wqkv): Linear(in_features=768, out_features=2304, bias=False) |
| (rotary_emb): ModernBertRotaryEmbedding() |
| (Wo): Linear(in_features=768, out_features=768, bias=False) |
| (out_drop): Identity() |
| ) |
| (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| (mlp): ModernBertMLP( |
| (Wi): Linear(in_features=768, out_features=2304, bias=False) |
| (act): GELUActivation() |
| (drop): Dropout(p=0.0, inplace=False) |
| (Wo): Linear(in_features=1152, out_features=768, bias=False) |
| ) |
| ) |
| ) |
| (final_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| ) |
| (head): ModernBertPredictionHead( |
| (dense): Linear(in_features=768, out_features=768, bias=False) |
| (act): GELUActivation() |
| (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) |
| ) |
| (drop): Dropout(p=0.0, inplace=False) |
| (classifier): Linear(in_features=768, out_features=3240, bias=True) |
| ) |
|
|