Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +157 -0
blocks_standalone.py +520 -0
config.json +60 -0
configuration_tx.py +177 -0
model.safetensors +3 -0
model_standalone.py +318 -0
modeling_tx_standalone.py +157 -0
requirements.txt +7 -0
tokenizer_config.json +5 -0
vocab.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,157 @@

+---
+license: apache-2.0
+language: en
+tags:
+- biology
+- genomics
+- single-cell
+library_name: transformers
+---
+# TXModel - Standalone Version
+**No extra training-framework dependencies!** This model requires only three standard packages:
+- `transformers`
+- `torch`
+- `safetensors`
+No llmfoundry, composer, or other libraries needed!
+## 🚀 Quick Start
+```python
+from transformers import AutoModel
+import torch
+# Load model (downloads automatically from Hub)
+model = AutoModel.from_pretrained(
+    "your-username/tx-model-standalone",
+    trust_remote_code=True
+)
+# Prepare inputs
+genes = torch.randint(0, 100, (2, 10))
+values = torch.rand(2, 10)
+masks = torch.ones(2, 10).bool()
+# Inference
+model.eval()
+with torch.no_grad():
+    output = model(genes=genes, values=values, gen_masks=masks)
+print(output.last_hidden_state.shape)  # [2, 10, d_model]
+```
+## 📦 Installation
+```bash
+pip install transformers torch safetensors
+```
+That's it! No other dependencies required.
+## 🎯 Usage
+The model works exactly like any other HuggingFace model:
+```python
+import torch
+from transformers import AutoModel
+# Load from Hub
+model = AutoModel.from_pretrained(
+    "your-username/tx-model-standalone",
+    trust_remote_code=True
+)
+# Or load locally
+model = AutoModel.from_pretrained(
+    "./path/to/model",
+    trust_remote_code=True
+)
+# Move to GPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+model.eval()
+# Your inference code here
+```
+## ⚡ Features
+- ✅ **No extra dependencies** (only transformers, torch, and safetensors)
+- ✅ **Works with AutoModel** out of the box
+- ✅ **Hub-ready** - upload and share easily
+- ✅ **Same architecture** as original model
+- ✅ **Full compatibility** with existing weights
+## 📊 Model Details
+| Property | Value |
+|----------|-------|
+| Parameters | ~70M |
+| Architecture | Transformer Encoder |
+| Hidden Size | 512 |
+| Layers | 12 |
+| Attention Heads | 8 |
+## 🔧 Advanced Usage
+### Accessing Model Internals
+```python
+# Access the TXModel directly
+tx_model = model.tx_model
+# Get cell embeddings
+output = model(genes, values, masks)
+cell_emb = output.last_hidden_state[:, 0, :]  # CLS token
+# Get gene embeddings
+tx_output = tx_model(genes, values, masks, key_padding_mask=~genes.eq(0))
+gene_embs = tx_output["gene_embeddings"]  # If return_gene_embeddings=True
+```
+### Batch Processing
+```python
+from torch.utils.data import DataLoader
+# Your dataloader
+dataloader = DataLoader(dataset, batch_size=32)
+results = []
+for batch in dataloader:
+    with torch.no_grad():
+        output = model(
+            genes=batch['genes'],
+            values=batch['values'],
+            gen_masks=batch['masks']
+        )
+        results.append(output.last_hidden_state)
+```
+## 🆚 vs Original Version
+This standalone version:
+- ✅ Removes dependencies on llmfoundry and composer
+- ✅ Uses only PyTorch and Transformers components
+- ✅ Works with standard HuggingFace tools
+- ✅ Maintains same model architecture and weights
+- ✅ Easier to install and deploy
+## 📝 Citation
+If you use this model, please cite the original work:
+```bibtex
+@article{tahoe2024,
+  title={Tahoe-x1: Foundation Model for Genomics},
+  author={...},
+  year={2024}
+}
+```
+## 📄 License
+Apache 2.0

blocks_standalone.py ADDED Viewed

	@@ -0,0 +1,520 @@

+# Copyright (C) Tahoe Therapeutics 2025. All rights reserved.
+"""
+Standalone implementation of TXModel blocks without external dependencies.
+Only requires: torch, transformers
+"""
+import math
+from typing import Optional, Dict, Any, Tuple
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
class MultiheadAttention(nn.Module):
    """Multi-head self-attention with optional grouped-query key/value heads.

    Queries use ``n_heads`` heads. Keys and values use ``kv_n_heads`` heads,
    each of which is repeated so that ``n_heads // kv_n_heads`` query heads
    share one key/value head.

    Args:
        d_model: Width of the input and output features.
        n_heads: Number of query heads; must divide ``d_model``.
        kv_n_heads: Number of key/value heads. Defaults to ``n_heads`` and
            must divide ``n_heads``.
        dropout: Dropout probability applied to the attention weights.
        bias: Whether the four projection layers carry a bias term.
        device: Device on which the projection layers are created.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_heads`` or
            ``n_heads`` is not divisible by ``kv_n_heads``.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        kv_n_heads: Optional[int] = None,
        dropout: float = 0.0,
        bias: bool = True,
        device: Optional[str] = None,
    ):
        super().__init__()
        if kv_n_heads is None:
            kv_n_heads = n_heads
        # Fail early: floor division below would otherwise hide the mismatch
        # until an opaque ``view`` error (or wrong head grouping) in forward.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        if kv_n_heads <= 0 or n_heads % kv_n_heads != 0:
            raise ValueError(
                f"n_heads ({n_heads}) must be divisible by kv_n_heads ({kv_n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.kv_n_heads = kv_n_heads
        self.head_dim = d_model // n_heads
        self.dropout = dropout
        # Number of query heads served by each key/value head.
        self.n_rep = n_heads // kv_n_heads
        kv_width = kv_n_heads * self.head_dim
        self.q_proj = nn.Linear(d_model, d_model, bias=bias, device=device)
        self.k_proj = nn.Linear(d_model, kv_width, bias=bias, device=device)
        self.v_proj = nn.Linear(d_model, kv_width, bias=bias, device=device)
        self.out_proj = nn.Linear(d_model, d_model, bias=bias, device=device)
        self.attn_dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: Tensor,
        attn_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
        is_causal: bool = False,
        **kwargs
    ) -> Tuple[Tensor, None, None]:
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape ``(batch, seq_len, d_model)``.
            attn_bias: Optional additive bias, broadcastable to the
                ``(batch, n_heads, seq_len, seq_len)`` score tensor.
            key_padding_mask: Optional ``(batch, seq_len)`` mask that is
                truthy for valid key positions; other keys are ignored.
            is_causal: If True, each query only attends to itself and
                earlier positions.
            **kwargs: Accepted and ignored.

        Returns:
            ``(output, None, None)`` where ``output`` has the shape of ``x``.
        """
        batch_size, seq_len, _ = x.shape
        # Project, split into heads and move heads ahead of the sequence axis:
        # (batch, heads, seq, head_dim).
        q = self.q_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.kv_n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.kv_n_heads, self.head_dim).transpose(1, 2)
        # Repeat k/v heads so they line up with the query heads.
        if self.n_rep > 1:
            k = k.repeat_interleave(self.n_rep, dim=1)
            v = v.repeat_interleave(self.n_rep, dim=1)
        scale = 1.0 / math.sqrt(self.head_dim)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) * scale
        if attn_bias is not None:
            attn_scores = attn_scores + attn_bias
        # Use the most negative finite value rather than -inf: masked keys still
        # receive zero weight, but a row whose keys are all masked no longer
        # turns into NaN after the softmax.
        min_val = torch.finfo(attn_scores.dtype).min
        if key_padding_mask is not None:
            # Cast so that ``~`` is a logical NOT even for integer masks.
            valid = key_padding_mask.to(torch.bool).unsqueeze(1).unsqueeze(2)
            attn_scores = attn_scores.masked_fill(~valid, min_val)
        if is_causal:
            future = torch.triu(
                torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool),
                diagonal=1,
            )
            attn_scores = attn_scores.masked_fill(future, min_val)
        attn_weights = self.attn_dropout(F.softmax(attn_scores, dim=-1))
        output = torch.matmul(attn_weights, v)
        # Merge the heads back: (batch, seq, d_model).
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.out_proj(output), None, None
class TXBlock(nn.Module):
    """Transformer encoder block: self-attention plus feed-forward sub-layers.

    Each sub-layer has a residual connection. With ``norm_scheme="pre"`` the
    LayerNorm is applied to the sub-layer input; with ``"post"`` it is applied
    to the residual sum.

    Args:
        d_model: Feature width.
        n_heads: Number of attention heads.
        expansion_ratio: Feed-forward hidden width is ``d_model * expansion_ratio``.
        attn_config: Optional dict; ``"attn_pdrop"`` sets attention dropout and
            ``"kv_n_heads"`` (or the ``"kv_nheads"`` spelling) sets the number
            of key/value heads.
        norm_config: Optional dict; ``"eps"`` sets the LayerNorm epsilon.
        dropout: Dropout after each sub-layer; ``None`` is treated as ``0.0``.
        activation: One of ``gelu``, ``relu``, ``silu``/``swish``, ``leaky_relu``.
        device: Device on which parameters are created.
        norm_scheme: ``"pre"`` or ``"post"``.
        use_glu: If True, the feed-forward uses an additional gate projection.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: For an unknown ``norm_scheme`` or ``activation``.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        expansion_ratio: int,
        attn_config: Optional[Dict] = None,
        norm_config: Optional[Dict] = None,
        dropout: Optional[float] = 0.0,
        activation: Optional[str] = "gelu",
        device: Optional[str] = None,
        norm_scheme: str = "pre",
        use_glu: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__()
        # Validate before allocating any parameters.
        if norm_scheme not in ("pre", "post"):
            raise ValueError("norm_scheme must be either pre or post")
        if attn_config is None:
            attn_config = {}
        if norm_config is None:
            norm_config = {}
        if dropout is None:
            # The signature allows None; nn.Dropout does not.
            dropout = 0.0
        self.d_model = d_model
        self.n_heads = n_heads
        self.device = device
        self.norm_scheme = norm_scheme
        self.use_glu = use_glu
        # Attention. Both key spellings are honoured so that a config written
        # with "kv_nheads" is not silently ignored.
        kv_n_heads = attn_config.get(
            "kv_n_heads", attn_config.get("kv_nheads", n_heads)
        )
        self.self_attn = MultiheadAttention(
            d_model=d_model,
            n_heads=n_heads,
            kv_n_heads=kv_n_heads,
            dropout=attn_config.get("attn_pdrop", 0.0),
            device=device,
        )
        # Feed-forward network
        dim_feedforward = d_model * expansion_ratio
        self.up_proj = nn.Linear(d_model, dim_feedforward, device=device)
        self.down_proj = nn.Linear(dim_feedforward, d_model, device=device)
        if use_glu:
            self.gate_proj = nn.Linear(d_model, dim_feedforward, device=device)
        # Normalization
        eps = norm_config.get("eps", 1e-5)
        self.norm1 = nn.LayerNorm(d_model, eps=eps, device=device)
        self.norm2 = nn.LayerNorm(d_model, eps=eps, device=device)
        # Dropout
        self.post_sa_dropout = nn.Dropout(dropout)
        self.post_ffn_dropout = nn.Dropout(dropout)
        # Activation
        self.activation = self._get_activation_fn(activation)

    @staticmethod
    def _get_activation_fn(activation: str):
        """Return a new activation module for ``activation``.

        Raises:
            ValueError: If the name is not recognised.
        """
        if activation == "gelu":
            return nn.GELU()
        if activation == "relu":
            return nn.ReLU()
        if activation in ("silu", "swish"):
            return nn.SiLU()
        if activation == "leaky_relu":
            return nn.LeakyReLU()
        raise ValueError(f"Unknown activation: {activation}")

    def forward(
        self,
        x: Tensor,
        attn_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
        **kwargs
    ) -> Tensor:
        """Run the block on ``x`` and return a tensor of the same shape.

        ``attn_bias`` and ``key_padding_mask`` are forwarded to the attention
        layer; extra keyword arguments are ignored.
        """
        if self.norm_scheme == "pre":
            # Pre-norm: norm -> sub-layer -> residual add
            x = x + self._sa_block(
                self.norm1(x),
                attn_bias=attn_bias,
                key_padding_mask=key_padding_mask,
            )
            x = x + self._ff_block(self.norm2(x))
        else:
            # Post-norm: sub-layer -> residual add -> norm
            x = self.norm1(
                x + self._sa_block(
                    x,
                    attn_bias=attn_bias,
                    key_padding_mask=key_padding_mask,
                )
            )
            x = self.norm2(x + self._ff_block(x))
        return x

    def _sa_block(
        self,
        x: Tensor,
        attn_bias: Optional[Tensor] = None,
        key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """Non-causal self-attention followed by dropout."""
        x, _, _ = self.self_attn(
            x,
            attn_bias=attn_bias,
            key_padding_mask=key_padding_mask,
            is_causal=False,
        )
        return self.post_sa_dropout(x)

    def _ff_block(self, x: Tensor) -> Tensor:
        """Feed-forward sub-layer followed by dropout."""
        if self.use_glu:
            # GLU variant: down(activation(gate(x)) * up(x))
            x = self.down_proj(self.activation(self.gate_proj(x)) * self.up_proj(x))
        else:
            # Standard FFN: down(activation(up(x)))
            x = self.down_proj(self.activation(self.up_proj(x)))
        return self.post_ffn_dropout(x)
class TXEncoder(nn.Module):
    """Stack of ``num_layers`` freshly initialised ``TXBlock`` layers.

    ``encoder_layer`` serves only as a template: its hyper-parameters are
    read, but its weights are not shared with or copied into the stack.

    Args:
        encoder_layer: Template block providing width, head count, expansion
            ratio, activation, dropout, device, norm scheme and GLU flag.
        num_layers: Number of blocks to build.
        use_norm: If True, apply a final LayerNorm to the stack output.
        norm_config: Optional dict with ``"eps"``; when omitted the template's
            LayerNorm epsilon is used.
        attn_config: Optional attention config passed to every block; when
            omitted the template's key/value head count and attention dropout
            are used.
    """

    def __init__(
        self,
        encoder_layer: "TXBlock",
        num_layers: int,
        use_norm: bool = False,
        norm_config: Optional[Dict] = None,
        attn_config: Optional[Dict] = None,
    ):
        # Local import: only needed here to copy the template's activation.
        import copy

        super().__init__()
        if norm_config is None:
            norm_config = {"eps": encoder_layer.norm1.eps}
        if attn_config is None:
            attn_config = {
                "kv_n_heads": encoder_layer.self_attn.kv_n_heads,
                "attn_pdrop": encoder_layer.self_attn.dropout,
            }
        d_model = encoder_layer.d_model
        expansion_ratio = encoder_layer.up_proj.out_features // d_model
        blocks = []
        for _ in range(num_layers):
            block = TXBlock(
                d_model=d_model,
                n_heads=encoder_layer.n_heads,
                expansion_ratio=expansion_ratio,
                attn_config=attn_config,
                norm_config=norm_config,
                dropout=encoder_layer.post_sa_dropout.p,
                activation="gelu",
                device=encoder_layer.device,
                norm_scheme=encoder_layer.norm_scheme,
                use_glu=encoder_layer.use_glu,
            )
            # TXBlock does not keep the activation *name*, so "gelu" above is a
            # placeholder; replace it with a copy of the template's module so
            # the configured activation (e.g. relu) is actually used.
            block.activation = copy.deepcopy(encoder_layer.activation)
            blocks.append(block)
        self.layers = nn.ModuleList(blocks)
        self.use_norm = use_norm
        if use_norm:
            eps = norm_config.get("eps", 1e-5)
            # Create the final norm on the same device as the blocks.
            self.norm = nn.LayerNorm(d_model, eps=eps, device=encoder_layer.device)

    def forward(
        self,
        total_embs: Tensor,
        key_padding_mask: Optional[Tensor] = None,
        output_hidden_states: bool = False,
    ) -> Tuple[Tensor, Optional[list]]:
        """Run every block in order.

        Args:
            total_embs: Input embeddings fed to the first block.
            key_padding_mask: Optional mask forwarded to every block.
            output_hidden_states: If True, also collect each block's output
                (taken before the final norm).

        Returns:
            ``(output, hidden_states)``; ``hidden_states`` is ``None`` unless
            ``output_hidden_states`` is True.
        """
        x = total_embs
        hidden_states = [] if output_hidden_states else None
        for layer in self.layers:
            x = layer(
                x,
                attn_bias=None,
                key_padding_mask=key_padding_mask,
            )
            if hidden_states is not None:
                hidden_states.append(x)
        if self.use_norm:
            x = self.norm(x)
        return x, hidden_states
class GeneEncoder(nn.Module):
    """Embedding table for gene token ids with an optional LayerNorm.

    Args:
        num_embeddings: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
        use_norm: If True, normalise the embeddings with LayerNorm.
        gene_encoder_cfg: Accepted for interface compatibility; not read.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx: int = 0,
        use_norm: bool = False,
        gene_encoder_cfg: Optional[Dict] = None,
    ):
        super().__init__()
        self.use_norm = use_norm
        self.embedding = nn.Embedding(
            num_embeddings, embedding_dim, padding_idx=padding_idx
        )
        # The standalone version has no extra embeddings, so the projection
        # is a pass-through.
        self.project = nn.Identity()
        if use_norm:
            self.enc_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x: Tensor) -> Tensor:
        """Look up embeddings for the ids in ``x``; adds a trailing feature axis."""
        embedded = self.project(self.embedding(x))
        if not self.use_norm:
            return embedded
        return self.enc_norm(embedded)
class ChemEncoder(nn.Module):
    """Map drug ids to ``d_out``-wide vectors via a fingerprint table and MLP.

    The fingerprint table is created filled with zeros; real fingerprints are
    expected to be loaded afterwards (e.g. from pretrained weights).

    Args:
        d_out: Output width.
        padding_idx: Passed to the embedding as ``padding_idx``.
        activation: ``leaky_relu``, ``relu`` or ``gelu``; any other value
            falls back to ReLU.
        use_norm: If True, apply LayerNorm to the output.
        freeze: If True, the fingerprint table is not trainable.
        num_drugs: Number of rows in the fingerprint table.
        fp_dim: Width of each fingerprint.
    """

    def __init__(
        self,
        d_out: int,
        padding_idx: int = 0,
        activation: str = "leaky_relu",
        use_norm: bool = True,
        freeze: bool = False,
        num_drugs: int = 1000,
        fp_dim: int = 2048,
    ):
        super().__init__()
        # Zero placeholder table with one row per drug.
        self.embedding = nn.Embedding.from_pretrained(
            torch.zeros((num_drugs, fp_dim), dtype=torch.float32),
            padding_idx=padding_idx,
            freeze=freeze,
        )
        self.fc = nn.Linear(fp_dim, d_out)
        known_activations = {
            "leaky_relu": nn.LeakyReLU,
            "relu": nn.ReLU,
            "gelu": nn.GELU,
        }
        # Unrecognised names fall back to ReLU.
        self.activation = known_activations.get(activation, nn.ReLU)()
        self.proj = nn.Linear(d_out, d_out)
        self.use_norm = use_norm
        if use_norm:
            self.norm = nn.LayerNorm(d_out)

    def forward(self, x: Tensor) -> Tensor:
        """Encode the drug ids in ``x``; adds a trailing axis of size ``d_out``."""
        fingerprints = self.embedding(x)
        encoded = self.proj(self.activation(self.fc(fingerprints)))
        return self.norm(encoded) if self.use_norm else encoded
class ContinuousValueEncoder(nn.Module):
    """Turn scalar values into ``d_model``-wide embeddings with a two-layer MLP.

    Args:
        d_model: Output width.
        dropout: Dropout probability applied to the final embedding.
        max_value: Upper bound; larger inputs are clamped to it.
        activation: ``relu``, ``gelu`` or ``leaky_relu``; any other value
            falls back to ReLU.
        use_norm: If True, apply LayerNorm before the dropout.
    """

    def __init__(
        self,
        d_model: int,
        dropout: float = 0.1,
        max_value: int = 512,
        activation: str = "relu",
        use_norm: bool = False,
    ):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.linear1 = nn.Linear(1, d_model)
        known_activations = {
            "relu": nn.ReLU,
            "gelu": nn.GELU,
            "leaky_relu": nn.LeakyReLU,
        }
        # Unrecognised names fall back to ReLU.
        self.activation = known_activations.get(activation, nn.ReLU)()
        self.linear2 = nn.Linear(d_model, d_model)
        self.use_norm = use_norm
        if use_norm:
            self.norm = nn.LayerNorm(d_model)
        self.max_value = max_value

    def forward(self, x: Tensor) -> Tensor:
        """Embed each value of ``x``; adds a trailing axis of size ``d_model``."""
        # Add the feature axis, then cap the values at max_value.
        values = torch.clamp(x.unsqueeze(-1), max=self.max_value)
        hidden = self.linear2(self.activation(self.linear1(values)))
        if self.use_norm:
            hidden = self.norm(hidden)
        return self.dropout(hidden)
class ExprDecoder(nn.Module):
    """MLP head that predicts expression values from token embeddings.

    Args:
        d_model: Input width (also the width of the hidden layers).
        n_outputs: Number of values predicted per token.
        n_layers: Number of hidden ``Linear`` + activation layers.
        activation: ``leaky_relu``, ``relu`` or ``gelu``; any other value
            falls back to LeakyReLU.
    """

    def __init__(
        self,
        d_model: int,
        n_outputs: int = 1,
        n_layers: int = 2,
        activation: str = "leaky_relu",
    ):
        super().__init__()
        known_activations = {
            "leaky_relu": nn.LeakyReLU,
            "relu": nn.ReLU,
            "gelu": nn.GELU,
        }
        # Unrecognised names fall back to LeakyReLU.
        self.activation = known_activations.get(activation, nn.LeakyReLU)()
        hidden_layers = []
        for _ in range(n_layers):
            hidden_layers.append(nn.Linear(d_model, d_model))
        self.linear_layers = nn.ModuleList(hidden_layers)
        self.out_proj = nn.Linear(d_model, n_outputs)

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        """Return ``{"pred": ...}``; a size-1 output axis is squeezed away."""
        hidden = x
        for linear in self.linear_layers:
            hidden = self.activation(linear(hidden))
        prediction = self.out_proj(hidden)
        single_output = prediction.shape[-1] == 1
        return {"pred": prediction.squeeze(-1) if single_output else prediction}
class MVCDecoder(nn.Module):
    """Predict per-gene values from a cell embedding and gene embeddings.

    Only the ``"inner product"`` architecture is implemented: each gene
    embedding is turned into a query vector, linearly transformed by ``W`` and
    dotted with the cell embedding.

    Args:
        d_model: Width of the cell and gene embeddings.
        arch_style: Must be ``"inner product"``.
        query_activation: ``sigmoid``, ``relu`` or ``tanh``; any other value
            falls back to sigmoid.
        scaled_dot_product: If True, divide predictions by ``sqrt(d_model)``.

    Raises:
        ValueError: If ``arch_style`` is not ``"inner product"``.
    """

    def __init__(
        self,
        d_model: int,
        arch_style: str = "inner product",
        query_activation: str = "sigmoid",
        scaled_dot_product: bool = False,
    ) -> None:
        super().__init__()
        self.scaled_dot_product = scaled_dot_product
        if arch_style == "inner product":
            self.gene2query = nn.Linear(d_model, d_model)
            if query_activation == "sigmoid":
                self.query_activation = nn.Sigmoid()
            elif query_activation == "relu":
                self.query_activation = nn.ReLU()
            elif query_activation == "tanh":
                self.query_activation = nn.Tanh()
            else:
                # Unrecognised names fall back to sigmoid.
                self.query_activation = nn.Sigmoid()
            self.W = nn.Linear(d_model, d_model, bias=False)
        else:
            raise ValueError(f"Unknown arch_style: {arch_style}")
        self.arch_style = arch_style

    def forward(
        self,
        cell_emb: Tensor,
        gene_embs: Tensor,
    ) -> Dict[str, Tensor]:
        """Compute the predictions.

        Args:
            cell_emb: Cell embeddings, ``(batch, d_model)``.
            gene_embs: Gene embeddings, ``(batch, seq_len, d_model)``.

        Returns:
            ``{"pred": tensor}`` with ``tensor`` of shape ``(batch, seq_len)``.
        """
        if self.arch_style == "inner product":
            query_vecs = self.query_activation(
                self.gene2query(gene_embs)
            )
            inner_product_dimension = query_vecs.shape[-1]
            # (batch, d_model) -> (batch, d_model, 1) so bmm yields one value per gene.
            cell_emb = cell_emb.unsqueeze(2)
            pred_value = torch.bmm(self.W(query_vecs), cell_emb).squeeze(2)
            if self.scaled_dot_product:
                # A Python scalar avoids allocating a CPU tensor on every call
                # and works for any dtype/device of pred_value.
                pred_value = pred_value / math.sqrt(inner_product_dimension)
            return {"pred": pred_value}
        else:
            raise ValueError(f"Unknown arch_style: {self.arch_style}")

config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "model_type": "tx_model",
+  "architectures": [
+    "TXModelForHF"
+  ],
+  "vocab_size": 62720,
+  "d_model": 512,
+  "n_layers": 12,
+  "n_heads": 8,
+  "expansion_ratio": 4,
+  "norm_scheme": "pre",
+  "transformer_activation": "relu",
+  "use_glu": false,
+  "pad_token_id": 0,
+  "pad_value": -2,
+  "num_bins": 51,
+  "use_chem_token": false,
+  "keep_first_n_tokens": 1,
+  "cell_emb_style": "cls",
+  "return_gene_embeddings": false,
+  "standard_scale_outputs": false,
+  "attn_config": {
+    "attn_impl": "flash",
+    "use_attn_mask": false,
+    "attn_type": "grouped_query_attention",
+    "kv_nheads": 8,
+    "attn_pdrop": 0
+  },
+  "norm_config": {
+    "eps": 1e-05,
+    "norm_type": "layernorm"
+  },
+  "gene_encoder_config": {
+    "use_norm": true
+  },
+  "expression_encoder_config": {
+    "dropout": 0.1,
+    "use_norm": true,
+    "max_value": 512,
+    "activation": "relu",
+    "input_emb_style": "continuous"
+  },
+  "expression_decoder_config": {
+    "n_layers": 1,
+    "n_outputs": 1,
+    "activation": "leaky_relu"
+  },
+  "mvc_config": {
+    "arch_style": "inner product",
+    "query_activation": "sigmoid",
+    "scaled_dot_product": true
+  },
+  "chemical_encoder_config": null,
+  "auto_map": {
+    "AutoConfig": "configuration_tx.TXConfig",
+    "AutoModel": "modeling_tx_standalone.TXModelForHF",
+    "AutoModelForCausalLM": "modeling_tx_standalone.TXModelForHF"
+  },
+  "transformers_version": "4.35.0"
+}

configuration_tx.py ADDED Viewed

	@@ -0,0 +1,177 @@

+# Copyright (C) Tahoe Therapeutics 2025. All rights reserved.
+"""
+Configuration class for TXModel compatible with HuggingFace Transformers
+"""
+from transformers import PretrainedConfig
+from typing import Optional, Dict, Any
class TXConfig(PretrainedConfig):
    """
    Hugging Face configuration for TXModel.

    Holds the hyper-parameters of the transformer together with the
    sub-configuration dictionaries for its encoders and decoders.

    Args:
        vocab_size (int): Size of the vocabulary
        d_model (int): Dimensionality of the model embeddings
        n_layers (int): Number of transformer layers
        n_heads (int): Number of attention heads
        expansion_ratio (int): Expansion ratio for FFN
        norm_scheme (str): Normalization scheme ('pre' or 'post')
        transformer_activation (str): Activation function for transformer
        cell_emb_style (str): Cell embedding style (default 'cls')
        pad_token_id (int): ID of the padding token
        pad_value (float): Value for padding
        num_bins (int): Number of bins for expression values
        use_chem_token (bool): Whether to use chemical token encoder
        attn_config (Dict): Attention configuration
        norm_config (Dict): Normalization configuration
        init_config (Dict): Initialization configuration
        gene_encoder_config (Dict): Gene encoder configuration
        expression_encoder_config (Dict): Expression encoder configuration
        expression_decoder_config (Dict): Expression decoder configuration
        mvc_config (Optional[Dict]): MVC decoder configuration
        chemical_encoder_config (Optional[Dict]): Chemical encoder configuration
        use_glu (bool): Whether to use GLU in FFN
        return_gene_embeddings (bool): Whether to return gene embeddings
        standard_scale_outputs (bool): Whether to scale outputs
        keep_first_n_tokens (int): Stored on the config as given (default 1)

    Sub-configuration dicts that are omitted (or empty) are replaced by the
    defaults built in ``__init__``; ``mvc_config`` and
    ``chemical_encoder_config`` are stored as given.
    """
    model_type = "tx_model"

    def __init__(
        self,
        vocab_size: int = 30000,
        d_model: int = 512,
        n_layers: int = 12,
        n_heads: int = 8,
        expansion_ratio: int = 4,
        norm_scheme: str = "pre",
        transformer_activation: str = "gelu",
        cell_emb_style: str = "cls",
        pad_token_id: int = 0,
        pad_value: float = 0.0,
        num_bins: int = 51,
        use_chem_token: bool = False,
        attn_config: Optional[Dict] = None,
        norm_config: Optional[Dict] = None,
        init_config: Optional[Dict] = None,
        gene_encoder_config: Optional[Dict] = None,
        expression_encoder_config: Optional[Dict] = None,
        expression_decoder_config: Optional[Dict] = None,
        mvc_config: Optional[Dict] = None,
        chemical_encoder_config: Optional[Dict] = None,
        use_glu: bool = False,
        return_gene_embeddings: bool = False,
        standard_scale_outputs: bool = False,
        keep_first_n_tokens: int = 1,
        **kwargs
    ):
        # pad_token_id is stored by the base class.
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Scalar hyper-parameters
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.expansion_ratio = expansion_ratio
        self.norm_scheme = norm_scheme
        self.transformer_activation = transformer_activation
        self.cell_emb_style = cell_emb_style
        self.pad_value = pad_value
        self.num_bins = num_bins
        self.use_chem_token = use_chem_token
        self.keep_first_n_tokens = keep_first_n_tokens
        self.return_gene_embeddings = return_gene_embeddings
        self.standard_scale_outputs = standard_scale_outputs
        self.use_glu = use_glu

        # Sub-configurations: a falsy value (None or an empty dict) selects
        # the default dictionary.
        if not attn_config:
            attn_config = {
                "attn_type": "grouped_query_attention",
                "attn_pdrop": 0.0,
                "attn_impl": "flash",
                "use_attn_mask": False,
                "qk_ln": False,
                "qk_gn": False,
                "clip_qkv": None,
                "softmax_scale": None,
            }
        self.attn_config = attn_config

        if not norm_config:
            norm_config = {
                "norm_type": "low_precision_layernorm",
                "eps": 1e-5,
            }
        self.norm_config = norm_config

        if not init_config:
            init_config = {
                "name": "kaiming_normal_",
                "fan_mode": "fan_in",
                "init_nonlinearity": "relu",
                "init_div_is_residual": True,
                "emb_init_std": None,
                "emb_init_uniform_lim": None,
                "init_std": None,
                "init_gain": 0.0,
            }
        self.init_config = init_config

        if not gene_encoder_config:
            gene_encoder_config = {
                "use_norm": False,
            }
        self.gene_encoder_config = gene_encoder_config

        if not expression_encoder_config:
            expression_encoder_config = {
                "input_emb_style": "continuous",
                "dropout": 0.1,
                "max_value": 512,
                "activation": "relu",
                "use_norm": False,
            }
        self.expression_encoder_config = expression_encoder_config

        if not expression_decoder_config:
            expression_decoder_config = {
                "n_outputs": 1,
                "n_layers": 2,
                "activation": "leaky_relu",
            }
        self.expression_decoder_config = expression_decoder_config

        # Optional sub-configurations are stored untouched.
        self.mvc_config = mvc_config
        self.chemical_encoder_config = chemical_encoder_config

    @classmethod
    def from_yaml_configs(cls, model_config_dict: Dict, collator_config_dict: Dict) -> "TXConfig":
        """
        Build a TXConfig from the parsed model and collator YAML dictionaries.

        Keys absent from ``model_config_dict`` that have no fallback here
        (``vocab_size``, ``d_model``, ``n_layers``, ``n_heads``,
        ``expansion_ratio`` and the sub-configurations) are passed on as None.

        Args:
            model_config_dict: Dictionary from model_config.yml
            collator_config_dict: Dictionary from collator_config.yml
        Returns:
            TXConfig instance
        """
        model = model_config_dict
        collator = collator_config_dict
        init_kwargs = {
            # Architecture (model config)
            "vocab_size": model.get("vocab_size"),
            "d_model": model.get("d_model"),
            "n_layers": model.get("n_layers"),
            "n_heads": model.get("n_heads"),
            "expansion_ratio": model.get("expansion_ratio"),
            "norm_scheme": model.get("norm_scheme", "pre"),
            "transformer_activation": model.get("transformer_activation", "gelu"),
            "cell_emb_style": model.get("cell_emb_style", "cls"),
            "use_glu": model.get("use_glu", False),
            "return_gene_embeddings": model.get("return_gene_embeddings", False),
            "standard_scale_outputs": model.get("standard_scale_outputs", False),
            # Sub-configurations (model config)
            "attn_config": model.get("attn_config"),
            "norm_config": model.get("norm_config"),
            "init_config": model.get("init_config"),
            "gene_encoder_config": model.get("gene_encoder"),
            "expression_encoder_config": model.get("expression_encoder"),
            "expression_decoder_config": model.get("expression_decoder"),
            "mvc_config": model.get("mvc"),
            "chemical_encoder_config": model.get("chemical_encoder"),
            # Data handling (collator config)
            "pad_token_id": collator.get("pad_token_id", 0),
            "pad_value": collator.get("pad_value", 0.0),
            "num_bins": collator.get("num_bins", 51),
            "use_chem_token": collator.get("use_chem_token", False),
            "keep_first_n_tokens": collator.get("keep_first_n_tokens", 1),
        }
        return cls(**init_kwargs)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:217637af5a4d12f3fe2d2648fb9d4d1404b53eea587336c62cfcfbfb26088efd
+size 284008108

model_standalone.py ADDED Viewed

	@@ -0,0 +1,318 @@

+# Copyright (C) Tahoe Therapeutics 2025. All rights reserved.
+"""
+Standalone implementation of TXModel without external dependencies.
+Only requires: torch, transformers, safetensors
+"""
+from typing import Optional, Union, Tuple
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from blocks_standalone import (
+    ChemEncoder,
+    ContinuousValueEncoder,
+    ExprDecoder,
+    GeneEncoder,
+    MVCDecoder,
+    TXBlock,
+    TXEncoder,
+)
+class TXModel(nn.Module):
+    """Standalone Transformer model for genomic data"""
+    def __init__(
+        self,
+        vocab_size: int,
+        d_model: int,
+        n_layers: int,
+        n_heads: int,
+        expansion_ratio: int,
+        pad_token_id: int,
+        pad_value: float,
+        num_bins: int,
+        norm_scheme: str = "pre",
+        transformer_activation: str = "gelu",
+        cell_emb_style: str = "cls",
+        use_chem_token: bool = False,
+        attn_config: Optional[dict] = None,
+        norm_config: Optional[dict] = None,
+        gene_encoder_config: Optional[dict] = None,
+        expression_encoder_config: Optional[dict] = None,
+        expression_decoder_config: Optional[dict] = None,
+        mvc_config: Optional[dict] = None,
+        chemical_encoder_config: Optional[dict] = None,
+        use_glu: bool = False,
+        return_gene_embeddings: bool = False,
+        keep_first_n_tokens: int = 1,
+        device: Optional[str] = None,
+    ):
+        super().__init__()
+        self.model_type = "Transformer"
+        self.device = device
+        self.vocab_size = vocab_size
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.d_model = d_model
+        self.expansion_ratio = expansion_ratio
+        self.norm_scheme = norm_scheme
+        self.transformer_activation = transformer_activation
+        self.use_chem_token = use_chem_token
+        self.cell_emb_style = cell_emb_style
+        self.pad_token_id = pad_token_id
+        self.pad_value = pad_value
+        self.n_input_bins = num_bins
+        self.keep_first_n_tokens = keep_first_n_tokens
+        self.return_gene_embeddings = return_gene_embeddings
+        if attn_config is None:
+            attn_config = {}
+        if norm_config is None:
+            norm_config = {}
+        if gene_encoder_config is None:
+            gene_encoder_config = {"use_norm": False}
+        if expression_encoder_config is None:
+            expression_encoder_config = {}
+        if expression_decoder_config is None:
+            expression_decoder_config = {}
+        # Gene encoder
+        self.gene_encoder = GeneEncoder(
+            self.vocab_size,
+            self.d_model,
+            padding_idx=self.pad_token_id,
+            use_norm=gene_encoder_config.get("use_norm", False),
+            gene_encoder_cfg=gene_encoder_config,
+        )
+        # Flag encoder
+        self.flag_encoder = nn.Embedding(2, self.d_model)
+        # Expression encoder
+        self.expression_encoder = ContinuousValueEncoder(
+            d_model=self.d_model,
+            dropout=expression_encoder_config.get("dropout", 0.1),
+            max_value=expression_encoder_config.get("max_value", 512),
+            activation=expression_encoder_config.get("activation", "relu"),
+            use_norm=expression_encoder_config.get("use_norm", False),
+        )
+        # Chemical encoder (if needed)
+        if self.use_chem_token:
+            if chemical_encoder_config is None:
+                chemical_encoder_config = {}
+            self.chem_encoder = ChemEncoder(
+                d_out=self.d_model,
+                padding_idx=chemical_encoder_config.get("padding_idx", 0),
+                activation=chemical_encoder_config.get("activation", "leaky_relu"),
+                freeze=chemical_encoder_config.get("freeze", False),
+                num_drugs=chemical_encoder_config.get("num_drugs", 1000),
+                fp_dim=chemical_encoder_config.get("fp_dim", 2048),
+            )
+        # Transformer encoder
+        encoder_layer = TXBlock(
+            d_model=self.d_model,
+            n_heads=self.n_heads,
+            expansion_ratio=self.expansion_ratio,
+            attn_config=attn_config,
+            norm_config=norm_config,
+            activation=self.transformer_activation,
+            device=self.device,
+            norm_scheme=self.norm_scheme,
+            use_glu=use_glu,
+        )
+        self.transformer_encoder = TXEncoder(
+            encoder_layer,
+            self.n_layers,
+            use_norm=self.norm_scheme == "pre",
+            norm_config=norm_config,
+            attn_config=attn_config,
+        )
+        # Expression decoder
+        self.expression_decoder = ExprDecoder(
+            d_model=self.d_model,
+            n_outputs=expression_decoder_config.get("n_outputs", 1),
+            n_layers=expression_decoder_config.get("n_layers", 2),
+            activation=expression_decoder_config.get("activation", "leaky_relu"),
+        )
+        # MVC decoder (if configured)
+        if mvc_config is not None:
+            self.mvc_decoder = MVCDecoder(
+                d_model=self.d_model,
+                arch_style=mvc_config.get("arch_style", "inner product"),
+                query_activation=mvc_config.get("query_activation", "sigmoid"),
+                scaled_dot_product=mvc_config.get("scaled_dot_product", False),
+            )
+        else:
+            self.mvc_decoder = None
+    def transformer_generate(
+        self,
+        genes: Tensor,
+        values: Tensor,
+        gen_masks: Tensor,
+        key_padding_mask: Tensor,
+        drug_ids: Optional[Tensor] = None,
+        output_hidden_states: bool = False,
+    ) -> Union[Tensor, Tuple[Tensor, list]]:
+        # Encode genes
+        token_embs = self.gene_encoder(genes)
+        # Encode expression values
+        token_values = self.expression_encoder(values)
+        token_values = token_values.masked_fill(gen_masks.unsqueeze(-1), 0.0)
+        # Flag embeddings
+        flag = self.flag_encoder(
+            torch.tensor(1, device=token_embs.device)
+        ).reshape(1, 1, -1)
+        flag_embs = gen_masks.unsqueeze(-1).to(token_embs.dtype) * flag
+        # Combine embeddings
+        total_embs = token_embs + token_values + flag_embs
+        # Add chemical embedding if used
+        if self.use_chem_token and drug_ids is not None:
+            drug_embs = self.chem_encoder(drug_ids)
+            total_embs[:, 1, :] = drug_embs
+        # Store gene embeddings for MVC
+        self.cur_gene_token_embs = token_embs
+        # Pass through transformer
+        output, hidden_states = self.transformer_encoder(
+            total_embs=total_embs,
+            key_padding_mask=key_padding_mask,
+            output_hidden_states=output_hidden_states,
+        )
+        return output, hidden_states
+    def forward(
+        self,
+        genes: Tensor,
+        values: Tensor,
+        gen_masks: Tensor,
+        key_padding_mask: Tensor,
+        drug_ids: Optional[Tensor] = None,
+        skip_decoders: bool = False,
+        output_hidden_states: bool = False,
+    ) -> dict:
+        # Generate transformer output
+        transformer_output, hidden_states = self.transformer_generate(
+            genes, values, gen_masks, key_padding_mask,
+            drug_ids, output_hidden_states
+        )
+        # Prepare output dict
+        output = {
+            "transformer_output": transformer_output,
+        }
+        if output_hidden_states:
+            output["hidden_states"] = hidden_states
+        # Cell embedding (CLS token or pooling)
+        if self.cell_emb_style == "cls":
+            cell_emb = transformer_output[:, 0, :]
+        elif self.cell_emb_style == "avg-pool":
+            # Average over non-padding tokens
+            mask = key_padding_mask.unsqueeze(-1).float()
+            cell_emb = (transformer_output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+        elif self.cell_emb_style == "w-pool":
+            # Weighted pooling (not implemented, use avg)
+            mask = key_padding_mask.unsqueeze(-1).float()
+            cell_emb = (transformer_output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+        else:
+            cell_emb = transformer_output[:, 0, :]
+        output["cell_emb"] = cell_emb
+        # Return gene embeddings if requested
+        if self.return_gene_embeddings:
+            output["gene_embeddings"] = transformer_output
+        # Skip decoders if requested
+        if skip_decoders:
+            return output
+        # Expression decoder
+        expr_output = self.expression_decoder(transformer_output)
+        output["expr_preds"] = expr_output["pred"]
+        # MVC decoder (if available)
+        if self.mvc_decoder is not None:
+            mvc_output = self.mvc_decoder(
+                cell_emb,
+                self.cur_gene_token_embs,
+            )
+            output["mvc_output"] = mvc_output["pred"]
+        return output
+    @classmethod
+    def from_pretrained(cls, model_path: str, **kwargs):
+        """Load model from pretrained weights"""
+        from safetensors.torch import load_file
+        import json
+        from pathlib import Path
+        model_path = Path(model_path)
+        # Load config
+        with open(model_path / "config.json", "r") as f:
+            config = json.load(f)
+        # Create model
+        model = cls(
+            vocab_size=config["vocab_size"],
+            d_model=config["d_model"],
+            n_layers=config["n_layers"],
+            n_heads=config["n_heads"],
+            expansion_ratio=config["expansion_ratio"],
+            pad_token_id=config["pad_token_id"],
+            pad_value=config["pad_value"],
+            num_bins=config["num_bins"],
+            norm_scheme=config.get("norm_scheme", "pre"),
+            transformer_activation=config.get("transformer_activation", "gelu"),
+            cell_emb_style=config.get("cell_emb_style", "cls"),
+            use_chem_token=config.get("use_chem_token", False),
+            attn_config=config.get("attn_config"),
+            norm_config=config.get("norm_config"),
+            gene_encoder_config=config.get("gene_encoder_config"),
+            expression_encoder_config=config.get("expression_encoder_config"),
+            expression_decoder_config=config.get("expression_decoder_config"),
+            mvc_config=config.get("mvc_config"),
+            chemical_encoder_config=config.get("chemical_encoder_config"),
+            use_glu=config.get("use_glu", False),
+            return_gene_embeddings=config.get("return_gene_embeddings", False),
+            keep_first_n_tokens=config.get("keep_first_n_tokens", 1),
+        )
+        # Load weights
+        state_dict = load_file(model_path / "model.safetensors")
+        # Remove 'model.tx_model.' or 'tx_model.' prefix if present
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            new_key = k
+            if k.startswith('model.tx_model.'):
+                new_key = k[14:]  # Remove 'model.tx_model.'
+            elif k.startswith('tx_model.'):
+                new_key = k[9:]  # Remove 'tx_model.'
+            elif k.startswith('model.'):
+                new_key = k[6:]  # Remove 'model.'
+            new_state_dict[new_key] = v
+        model.load_state_dict(new_state_dict, strict=False)
+        return model

modeling_tx_standalone.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# Copyright (C) Tahoe Therapeutics 2025. All rights reserved.
+"""
+HuggingFace-compatible wrapper for TXModel (Standalone version)
+Only requires: transformers, torch, safetensors
+"""
+from typing import Optional, Union, Tuple
+import torch
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutput
+from configuration_tx import TXConfig
+from model_standalone import TXModel
+class TXPreTrainedModel(PreTrainedModel):
+    """
+    Base class for TXModel with HuggingFace integration
+    """
+    config_class = TXConfig
+    base_model_prefix = "tx_model"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["TXBlock"]
+    def _init_weights(self, module):
+        """Initialize weights"""
+        if isinstance(module, torch.nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, torch.nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, torch.nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+class TXModelForHF(TXPreTrainedModel):
+    """
+    HuggingFace-compatible TXModel
+    This model can be used directly with HuggingFace's from_pretrained()
+    and requires only: transformers, torch, safetensors
+    No dependencies on llmfoundry, composer, or other external libraries.
+    """
+    def __init__(self, config: TXConfig):
+        super().__init__(config)
+        # Initialize standalone model
+        self.tx_model = TXModel(
+            vocab_size=config.vocab_size,
+            d_model=config.d_model,
+            n_layers=config.n_layers,
+            n_heads=config.n_heads,
+            expansion_ratio=config.expansion_ratio,
+            pad_token_id=config.pad_token_id,
+            pad_value=config.pad_value,
+            num_bins=config.num_bins,
+            norm_scheme=config.norm_scheme,
+            transformer_activation=config.transformer_activation,
+            cell_emb_style=config.cell_emb_style,
+            use_chem_token=config.use_chem_token,
+            attn_config=config.attn_config,
+            norm_config=config.norm_config,
+            gene_encoder_config=config.gene_encoder_config,
+            expression_encoder_config=config.expression_encoder_config,
+            expression_decoder_config=config.expression_decoder_config,
+            mvc_config=config.mvc_config,
+            chemical_encoder_config=config.chemical_encoder_config,
+            use_glu=config.use_glu,
+            return_gene_embeddings=config.return_gene_embeddings,
+            keep_first_n_tokens=config.keep_first_n_tokens,
+        )
+        # Post init
+        self.post_init()
+    def forward(
+        self,
+        genes: torch.Tensor,
+        values: torch.Tensor,
+        gen_masks: torch.Tensor,
+        key_padding_mask: Optional[torch.Tensor] = None,
+        drug_ids: Optional[torch.Tensor] = None,
+        skip_decoders: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[Tuple, BaseModelOutput]:
+        """
+        Forward pass through the model.
+        Args:
+            genes: Gene token IDs [batch_size, seq_len]
+            values: Expression values [batch_size, seq_len]
+            gen_masks: Generation masks [batch_size, seq_len]
+            key_padding_mask: Padding mask [batch_size, seq_len]
+            drug_ids: Drug IDs [batch_size] (optional)
+            skip_decoders: Whether to skip decoder computation
+            output_hidden_states: Whether to return hidden states
+            return_dict: Whether to return a dict or tuple
+        Returns:
+            Model outputs
+        """
+        if key_padding_mask is None:
+            key_padding_mask = ~genes.eq(self.config.pad_token_id)
+        outputs = self.tx_model(
+            genes=genes,
+            values=values,
+            gen_masks=gen_masks,
+            key_padding_mask=key_padding_mask,
+            drug_ids=drug_ids,
+            skip_decoders=skip_decoders,
+            output_hidden_states=output_hidden_states,
+        )
+        if not return_dict:
+            return tuple(v for v in outputs.values())
+        # Convert to HuggingFace output format
+        return BaseModelOutput(
+            last_hidden_state=outputs.get("cell_emb"),
+            hidden_states=outputs.get("hidden_states") if output_hidden_states else None,
+        )
+    def get_input_embeddings(self):
+        """Get input embeddings"""
+        return self.tx_model.gene_encoder.embedding
+    def set_input_embeddings(self, value):
+        """Set input embeddings"""
+        self.tx_model.gene_encoder.embedding = value
+    def get_output_embeddings(self):
+        """Get output embeddings (not applicable)"""
+        return None
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Load model from pretrained weights.
+        Works with both local paths and HuggingFace Hub.
+        Requires only: transformers, torch, safetensors
+        """
+        # Let parent class handle config and weight loading
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+# Alias for easier importing: TXForCausalLM is the very same class object as
+# TXModelForHF, not a subclass and not a separate causal-LM head.
+TXForCausalLM = TXModelForHF

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+# Standalone version - ONLY these dependencies required!
+transformers>=4.35.0
+torch>=2.0.0
+safetensors>=0.4.0
+# Optional: for converting from original format
+# omegaconf>=2.3.0  # Only needed for conversion, not for using the model

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "model_max_length": 1000000000000000019884624838656,
+  "vocab_size": 62720
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff