Update geneformer/tokenizer.py (#450)
Browse files
- Update geneformer/tokenizer.py (889cadcf135b313171aadec4e879caae20fd0bc1)
- Update geneformer/tokenizer.py (b8808792d6e106aeec320447f4032b49533ba4de)
Co-authored-by: Han Chen <[email protected]>
- geneformer/tokenizer.py +42 -36
geneformer/tokenizer.py
CHANGED
|
@@ -103,33 +103,38 @@ def sum_ensembl_ids(
|
|
| 103 |
assert (
|
| 104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 107 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
if len(gene_ids_in_dict) == len(set(gene_ids_in_dict)):
|
| 114 |
return data_directory
|
| 115 |
else:
|
| 116 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
]
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
data.ra["ensembl_id_collapsed"] =
|
| 127 |
return data_directory
|
|
|
|
| 128 |
else:
|
| 129 |
dedup_filename = data_directory.with_name(
|
| 130 |
data_directory.stem + "__dedup.loom"
|
| 131 |
)
|
| 132 |
-
data.ra["
|
|
|
|
| 133 |
dup_genes = [
|
| 134 |
idx
|
| 135 |
for idx, count in Counter(data.ra["ensembl_id_collapsed"]).items()
|
|
@@ -204,32 +209,33 @@ def sum_ensembl_ids(
|
|
| 204 |
"ensembl_id_collapsed" not in data.var.columns
|
| 205 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
| 206 |
|
|
|
|
|
|
|
|
|
|
| 207 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 208 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
return data
|
| 216 |
else:
|
| 217 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
| 218 |
|
| 219 |
-
#
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
]
|
| 223 |
-
gene_ids_collapsed_in_dict = [
|
| 224 |
-
gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
|
| 225 |
-
]
|
| 226 |
-
if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
|
| 227 |
-
data.var["ensembl_id_collapsed"] = data.var.ensembl_id.map(gene_mapping_dict)
|
| 228 |
-
return data
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
else:
|
| 231 |
-
data.var["ensembl_id_collapsed"] =
|
| 232 |
-
data.var_names =
|
| 233 |
data = data[:, ~data.var.index.isna()]
|
| 234 |
dup_genes = [
|
| 235 |
idx for idx, count in Counter(data.var_names).items() if count > 1
|
|
|
|
| 103 |
assert (
|
| 104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
| 105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# Get the ensembl ids that exist in data
|
| 109 |
+
ensembl_ids = data.ra.ensembl_id
|
| 110 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 111 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 112 |
+
if not collapse_gene_ids:
|
| 113 |
+
ensembl_id_check = [
|
| 114 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
| 115 |
+
]
|
| 116 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
|
|
|
| 117 |
return data_directory
|
| 118 |
else:
|
| 119 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
| 120 |
+
|
| 121 |
+
# Get the genes that exist in the mapping dictionary and the value of those genes
|
| 122 |
+
genes_in_map_dict = [gene for gene in ensembl_ids if gene in gene_mapping_dict.keys()]
|
| 123 |
+
vals_from_map_dict = [gene_mapping_dict.get(gene) for gene in genes_in_map_dict]
|
| 124 |
+
|
| 125 |
+
# if the number of unique genes found in the mapping dict equals the number of unique values they map to (i.e. no two genes collapse onto the same ID),
|
| 126 |
+
# simply return the mapped values
|
| 127 |
+
if(len(set(genes_in_map_dict)) == len(set(vals_from_map_dict))):
|
| 128 |
+
mapped_vals = [gene_mapping_dict.get(gene.upper()) for gene in data.ra["ensembl_id"]]
|
| 129 |
+
data.ra["ensembl_id_collapsed"] = mapped_vals
|
| 130 |
return data_directory
|
| 131 |
+
# Genes need to be collapsed
|
| 132 |
else:
|
| 133 |
dedup_filename = data_directory.with_name(
|
| 134 |
data_directory.stem + "__dedup.loom"
|
| 135 |
)
|
| 136 |
+
mapped_vals = [gene_mapping_dict.get(gene.upper()) for gene in data.ra["ensembl_id"]]
|
| 137 |
+
data.ra["ensembl_id_collapsed"] = mapped_vals
|
| 138 |
dup_genes = [
|
| 139 |
idx
|
| 140 |
for idx, count in Counter(data.ra["ensembl_id_collapsed"]).items()
|
|
|
|
| 209 |
"ensembl_id_collapsed" not in data.var.columns
|
| 210 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
| 211 |
|
| 212 |
+
|
| 213 |
+
# Get the ensembl ids that exist in data
|
| 214 |
+
ensembl_ids = data.var.ensembl_id
|
| 215 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
| 216 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
| 217 |
+
if not collapse_gene_ids:
|
| 218 |
+
ensembl_id_check = [
|
| 219 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
| 220 |
+
]
|
| 221 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
| 222 |
+
return data_directory
|
|
|
|
| 223 |
else:
|
| 224 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
| 225 |
|
| 226 |
+
# Get the genes that exist in the mapping dictionary and the value of those genes
|
| 227 |
+
genes_in_map_dict = [gene for gene in ensembl_ids if gene in gene_mapping_dict.keys()]
|
| 228 |
+
vals_from_map_dict = [gene_mapping_dict.get(gene) for gene in genes_in_map_dict]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
+
# if the number of unique genes found in the mapping dict equals the number of unique values they map to (i.e. no two genes collapse onto the same ID),
|
| 231 |
+
# simply return the mapped values
|
| 232 |
+
if(len(set(genes_in_map_dict)) == len(set(vals_from_map_dict))):
|
| 233 |
+
data.var["ensembl_id_collapsed"] = data.var.ensembl_id.str.upper().map(gene_mapping_dict)
|
| 234 |
+
return data
|
| 235 |
+
# Genes need to be collapsed
|
| 236 |
else:
|
| 237 |
+
data.var["ensembl_id_collapsed"] = data.var.ensembl_id.str.upper().map(gene_mapping_dict)
|
| 238 |
+
data.var_names = data.var["ensembl_id_collapsed"]
|
| 239 |
data = data[:, ~data.var.index.isna()]
|
| 240 |
dup_genes = [
|
| 241 |
idx for idx, count in Counter(data.var_names).items() if count > 1
|