ctheodoris
/

Geneformer

Model card · Files and versions

dictionaries from parent dir

#405

by madhavanvenkatesh - opened Sep 3, 2024

base: refs/heads/main

←

from: refs/pr/405

Discussion Files changed

Files changed (1) hide show

geneformer/mtl/collators.py +13 -10

geneformer/mtl/collators.py CHANGED Viewed

@@ -1,18 +1,24 @@
 # imports
 import torch
 from ..collator_for_classification import DataCollatorForGeneClassification
-from . import TOKEN_DICTIONARY  # import the token dictionary from the mtl module's init
-"""
-Geneformer collator for multi-task cell classification.
-"""
 class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassification):
     class_type = "cell"
     def __init__(self, *args, **kwargs) -> None:
-        # Use the loaded token dictionary from the mtl module's init
-        super().__init__(token_dictionary=TOKEN_DICTIONARY, *args, **kwargs)
     def _prepare_batch(self, features):
         # Process inputs as usual
@@ -29,7 +35,6 @@ class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassificati
         if "label" in features[0]:
             # Initialize labels dictionary for all tasks
             labels = {task: [] for task in features[0]["label"].keys()}
             # Populate labels for each task
             for feature in features:
                 for task, label in feature["label"].items():
@@ -57,7 +62,6 @@ class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassificati
     def __call__(self, features):
         batch = self._prepare_batch(features)
         for k, v in batch.items():
             if torch.is_tensor(v):
                 batch[k] = v.clone().detach()
@@ -69,5 +73,4 @@ class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassificati
                 }
             else:
                 batch[k] = torch.tensor(v, dtype=torch.int64)
-        return batch

 # imports
 import torch
+import pickle
 from ..collator_for_classification import DataCollatorForGeneClassification
+from .. import TOKEN_DICTIONARY_FILE
+"""Geneformer collator for multi-task cell classification."""
 class DataCollatorForMultitaskCellClassification(DataCollatorForGeneClassification):
     class_type = "cell"
+    @staticmethod
+    def load_token_dictionary():
+        with open(TOKEN_DICTIONARY_FILE, 'rb') as f:
+            return pickle.load(f)
     def __init__(self, *args, **kwargs) -> None:
+        # Load the token dictionary
+        token_dictionary = self.load_token_dictionary()
+        # Use the loaded token dictionary
+        super().__init__(token_dictionary=token_dictionary, *args, **kwargs)
     def _prepare_batch(self, features):
         # Process inputs as usual
         if "label" in features[0]:
             # Initialize labels dictionary for all tasks
             labels = {task: [] for task in features[0]["label"].keys()}
             # Populate labels for each task
             for feature in features:
                 for task, label in feature["label"].items():
     def __call__(self, features):
         batch = self._prepare_batch(features)
         for k, v in batch.items():
             if torch.is_tensor(v):
                 batch[k] = v.clone().detach()
                 }
             else:
                 batch[k] = torch.tensor(v, dtype=torch.int64)
+        return batch