# coding: utf-8
# Distributed under the terms of the MIT License.

"""This module defines some remote datasets that can be downloaded into
the user's installation.

"""

import os
from collections import namedtuple
from enum import Enum, auto
from pathlib import Path
from typing import Union

from modnet.utils import LOG


class Usage(Enum):
    _MODData = auto()
    cross_nmi = auto()
    feature_db = auto()


Dataset = namedtuple("Dataset", ("url", "description", "filename", "md5", "usage"))

DATASETS = {
    "MP_2018.6": Dataset(
        url="https://ndownloader.figshare.com/files/24364571",
        description=(
            "A MODData that contains all inorganic compounds from the Materials Project (MP) "
            "as of June 2018, decorated with the DeBreuck2020 featurizer preset."
        ),
        filename="MP_2018.6.zip",
        md5="06280c4e539508bbcc5266f07698f8d1",
        usage=Usage["_MODData"],
    ),
    "MP_2018.6_CROSS_NMI": Dataset(
        url="https://ndownloader.figshare.com/files/25584803",
        description=(
            "Pickled dataframe containing the Normalized Mutual Information (NMI) between "
            "matminer features computed on the Materials Project."
        ),
        filename="features_cross",
        md5="b83e0bd43f71ec53c4d69ee0764acfbe",
        usage=Usage["cross_nmi"],
    ),
    "MP_210321": Dataset(
        url="https://figshare.com/ndownloader/files/33177986",
        description=(
            "A pickled dataframe containing featurized (DeBreuck2020) materials from the "
            "Materials Project as of 21 March 2021."
        ),
        filename="feature_database_v2",
        md5="8efaac9b64f11073577396b9fb353f89",
        usage=Usage["feature_db"],
    ),
}


def load_ext_dataset(dataset_name: str, expected_type: Union[Usage, str]):
    """Load one of the preset datasets from the `DATASETS` constant.

    Will not overwrite any existing local data with remote datasets.
    Checks hashes against what is expected and will not depickle if
    unrecognised.

    Parameters:
        dataset_name: The name (key) of the dataset in `DATASETS`.
        expected_type: A string representing the expected usage of the
            dataset, e.g. `'_MODData'`, `'cross_nmi'` or `'feature_db'`.

    Returns:
        The path to the downloaded or previously installed dataset.

    """
    import urllib.request
    import urllib.error

    if dataset_name not in DATASETS:
        raise ValueError(
            f"No dataset {dataset_name} found, must be one of {list(DATASETS.keys())}"
        )

    dataset = DATASETS[dataset_name]
    if isinstance(expected_type, str):
        # Accept the public alias "MODData" for the underscored enum member
        if expected_type == "MODData":
            expected_type = "_MODData"
        expected_type = Usage[expected_type]
    if dataset.usage != expected_type:
        raise ValueError(
            f"Cannot load {dataset_name} as it has the wrong type {dataset.usage}."
        )

    data_dir = Path(__file__).parent.joinpath("data")
    model_path = data_dir.joinpath(dataset.filename)
    if not model_path.is_file():
        LOG.info(
            f"Downloading featurized dataset {dataset_name} from {dataset.url} into {model_path}"
            "\nThis may take some time..."
        )
        if not data_dir.is_dir():
            os.makedirs(data_dir)

        try:
            # urlretrieve returns the local filename and the response headers
            zip_file, response = urllib.request.urlretrieve(dataset.url, model_path)
        except (urllib.error.URLError, urllib.error.HTTPError) as exc:
            raise ValueError(
                f"There was a problem downloading {dataset.url}: {exc.reason}"
            ) from exc

    if dataset.md5 is not None:
        from modnet.utils import get_hash_of_file

        file_md5 = get_hash_of_file(model_path, algo="md5")
        if file_md5 != dataset.md5:
            raise RuntimeError(
                f"Precomputed {dataset.usage.name.strip('_')} did not match expected MD5 from {dataset.url}, will not depickle."
                f"\nExpected: {dataset.md5}"
                f"\nReceived: {file_md5}"
            )

    return model_path
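
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module's public
# API): fetch the 2018.6 MODData and its cross-NMI dataframe using the keys
# and usage strings defined in DATASETS above. Running this as a script will
# trigger a download (and MD5 check) on first use.
if __name__ == "__main__":
    moddata_path = load_ext_dataset("MP_2018.6", "MODData")
    cross_nmi_path = load_ext_dataset("MP_2018.6_CROSS_NMI", "cross_nmi")
    LOG.info(
        f"MODData pickle at {moddata_path}; cross-NMI pickle at {cross_nmi_path}"
    )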