Source code for immuneML.IO.dataset_import.PickleImport

# quality: gold

import logging
import pickle
from pathlib import Path

import pandas as pd

from immuneML.IO.dataset_import.DataImport import DataImport
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.ElementDataset import ElementDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.environment.Constants import Constants


class PickleImport(DataImport):
    """
    Imports the dataset from the pickle files previously exported by immuneML. PickleImport can import any kind of
    dataset (RepertoireDataset, SequenceDataset, ReceptorDataset).

    Important note: Pickle files might not be compatible between different immuneML (sub)versions.

    Arguments:

        path (str): The path to the previously created dataset file. This file should have an '.iml_dataset'
        extension. If the path has not been specified, immuneML attempts to load the dataset from a specified
        metadata file (only for RepertoireDatasets).

        metadata_file (str): An optional metadata file for a RepertoireDataset. If specified, the RepertoireDataset
        metadata will be updated to the newly specified metadata without otherwise changing the Repertoire objects.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_pickle_dataset:
            format: Pickle
            params:
                path: path/to/dataset.iml_dataset
                metadata_file: path/to/metadata.csv

    """
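    # The YAML specification above arrives here as a plain dict; a minimal sketch of the
    # equivalent `params` argument for import_dataset() below (the paths are the
    # hypothetical placeholders from the docstring example, not real files):
    #
    #     params = {"path": "path/to/dataset.iml_dataset", "metadata_file": "path/to/metadata.csv"}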
    @staticmethod
    def import_dataset(params: dict, dataset_name: str) -> Dataset:
        pickle_params = DatasetImportParams.build_object(**params)

        if pickle_params.path is not None:
            dataset = PickleImport._import_from_path(pickle_params)
        elif pickle_params.metadata_file is not None:
            dataset = PickleImport._import_from_metadata(pickle_params, dataset_name)
        else:
            raise ValueError(f"PickleImport: neither a path nor a metadata file was defined under key {dataset_name}. "
                             f"At least one of these has to be specified to import the dataset.")

        if isinstance(dataset, RepertoireDataset):
            dataset = PickleImport._update_repertoire_paths(pickle_params, dataset)
        else:
            dataset = PickleImport._update_receptor_paths(pickle_params, dataset)

        return dataset
    @staticmethod
    def _import_from_path(pickle_params):
        with pickle_params.path.open("rb") as file:
            dataset = pickle.load(file)
        if hasattr(dataset, "metadata_file"):
            if pickle_params.metadata_file is not None:
                # a new metadata file was specified: replace the stored one and rewrite it without comment lines
                dataset.metadata_file = pickle_params.metadata_file
                metadata = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
                metadata.to_csv(dataset.metadata_file, index=False)
            else:
                if dataset.metadata_file is not None and not dataset.metadata_file.is_file():
                    # the stored metadata path is stale: fall back to a file with the same name in the working directory
                    new_metadata_file = Path(dataset.metadata_file.name)
                    if new_metadata_file.is_file():
                        logging.warning(f"PickleImport: metadata file could not be found at {dataset.metadata_file}, "
                                        f"using {new_metadata_file} instead.")
                        dataset.metadata_file = new_metadata_file
                    else:
                        raise FileNotFoundError(f"PickleImport: the metadata file could not be found at {dataset.metadata_file} "
                                                f"or at {new_metadata_file}. Please update the path to the metadata file.")
        return dataset

    @staticmethod
    def _import_from_metadata(pickle_params, dataset_name):
        # the first line of the metadata file is a comment holding the name of the dataset file
        with pickle_params.metadata_file.open("r") as file:
            dataset_filename = file.readline().replace(Constants.COMMENT_SIGN, "").replace("\n", "")
        pickle_params.path = pickle_params.metadata_file.parent / dataset_filename

        assert pickle_params.path.is_file(), f"PickleImport: dataset file {dataset_filename} specified in " \
                                             f"{pickle_params.metadata_file} could not be found ({pickle_params.path} is not a file), " \
                                             f"failed to import the dataset {dataset_name}."

        return PickleImport._import_from_path(pickle_params)

    @staticmethod
    def _update_repertoire_paths(pickle_params, dataset):
        # repoint the repertoire files to wherever they were found relative to the dataset file
        path = PickleImport._discover_repertoire_path(pickle_params, dataset)
        if path is not None:
            for repertoire in dataset.repertoires:
                repertoire.data_filename = path / repertoire.data_filename.name
                repertoire.metadata_filename = path / repertoire.metadata_filename.name
        return dataset

    @staticmethod
    def _discover_dataset_dir(pickle_params):
        return pickle_params.path.parent

    @staticmethod
    def _update_receptor_paths(pickle_params, dataset: ElementDataset):
        dataset_dir = PickleImport._discover_dataset_dir(pickle_params)

        # only update the element filenames if the dataset directory holds exactly one pickle file per element file
        if len(list(dataset_dir.glob("*.pickle"))) == len(dataset.get_filenames()):
            new_filenames = [dataset_dir / file.name for file in dataset.get_filenames()]
            dataset.set_filenames(new_filenames)

        return dataset

    @staticmethod
    def _discover_repertoire_path(pickle_params, dataset):
        dataset_dir = PickleImport._discover_dataset_dir(pickle_params)

        # repertoire files (.npy) may lie next to the dataset file or in a 'repertoires/' subdirectory
        if len(list(dataset_dir.glob("*.npy"))) == len(dataset.repertoires):
            path = dataset_dir
        elif len(list(dataset_dir.glob("repertoires/*.npy"))) == len(dataset.repertoires):
            path = dataset_dir / "repertoires/"
        else:
            path = None

        return path
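

# A minimal usage sketch, not part of the original module: it assumes a dataset was
# previously exported by immuneML to the hypothetical path below, and that
# DatasetImportParams.build_object accepts string paths as produced by YAML parsing.
# The params dict mirrors the YAML specification from the class docstring.
if __name__ == "__main__":
    example_params = {
        "path": "path/to/dataset.iml_dataset",    # hypothetical path to a previously exported dataset
        "metadata_file": "path/to/metadata.csv",  # optional; only used for RepertoireDatasets
    }
    imported = PickleImport.import_dataset(example_params, "my_pickle_dataset")
    print(type(imported).__name__)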