# quality: gold
from pathlib import Path
import yaml
from immuneML.IO.dataset_import.DataImport import DataImport
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.ElementDataset import ElementDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.environment.Constants import Constants
from immuneML.util.ReflectionHandler import ReflectionHandler
class ImmuneMLImport(DataImport):
"""
Imports the dataset from the files previously exported by immuneML. It closely resembles AIRR format but relies on binary
representations and is optimized for faster read-in at runtime.
ImmuneMLImport can import any kind of dataset (RepertoireDataset, SequenceDataset, ReceptorDataset).
This format includes:
1. a dataset file in yaml format with iml_dataset extension with parameters:
- name,
- identifier,
- metadata_file (for repertoire datasets),
- metadata_fields (for repertoire datasets),
- repertoire_ids (for repertoire datasets)
- element_ids (for receptor and sequence datasets),
- labels,
2. a csv metadata file (only for repertoire datasets, should be in the same folder as the iml_dataset file),
3. data files for different types of data. For repertoire datasets, data files include one binary numpy file per repertoire with sequences and associated information and one metadata yaml file per repertoire with details such as repertoire identifier, disease status, subject id and other similar available information. For sequence and receptor datasets, sequences or receptors respectively, are stored in batches in binary numpy files.
Arguments:
path (str): The path to the previously created dataset file. This file should have an '.iml_dataset' extension. If the path has not been specified, immuneML attempts to load the dataset from a specified metadata file (only for RepertoireDatasets).
metadata_file (str): An optional metadata file for a RepertoireDataset. If specified, the RepertoireDataset metadata will be updated to the newly specified metadata without otherwise changing the Repertoire objects
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_dataset:
format: ImmuneML
params:
path: path/to/dataset.iml_dataset
metadata_file: path/to/metadata.csv
"""
    @staticmethod
    def import_dataset(params: dict, dataset_name: str) -> Dataset:
        iml_params = DatasetImportParams.build_object(**params)

        if iml_params.path is not None:
            dataset = ImmuneMLImport._import_from_path(iml_params)
        elif iml_params.metadata_file is not None:
            dataset = ImmuneMLImport._import_from_metadata(iml_params, dataset_name)
        else:
            raise ValueError(f"{ImmuneMLImport.__name__}: neither a path nor a metadata file was defined under key {dataset_name}. "
                             f"At least one of these has to be specified to import the dataset.")

        if isinstance(dataset, RepertoireDataset):
            dataset = ImmuneMLImport._update_repertoire_paths(iml_params, dataset)
        else:
            dataset = ImmuneMLImport._update_receptor_paths(iml_params, dataset)

        return dataset

    @staticmethod
    def _import_from_path(iml_params):
        with iml_params.path.open("r") as file:
            dataset_dict = yaml.safe_load(file)

        assert 'dataset_class' in dataset_dict, f"{ImmuneMLImport.__name__}: 'dataset_class' parameter is missing from the dataset file " \
                                                f"{iml_params.path}."

        dataset_class = ReflectionHandler.get_class_by_name(dataset_dict['dataset_class'])
        del dataset_dict['dataset_class']

        # a metadata file passed in the import params overrides the one stored in the dataset file
        if iml_params.metadata_file is not None and iml_params.metadata_file != '':
            dataset_dict['metadata_file'] = iml_params.metadata_file

        # if the stored metadata file path resolves to the current working directory while the dataset file does not,
        # rebase the metadata file next to the dataset file
        cwd = Path.cwd()
        if 'metadata_file' in dataset_dict and Path(dataset_dict['metadata_file']).parent.samefile(cwd) and not iml_params.path.samefile(cwd):
            dataset_dict['metadata_file'] = iml_params.path.parent / Path(dataset_dict['metadata_file']).name

        dataset = dataset_class.build(**dataset_dict)

        return dataset

    @staticmethod
    def _import_from_metadata(iml_params, dataset_name):
        with iml_params.metadata_file.open("r") as file:
            # the first line of the metadata file is a comment holding the name of the corresponding .iml_dataset file
            dataset_filename = file.readline().replace(Constants.COMMENT_SIGN, "").replace("\n", "")
        iml_params.path = iml_params.metadata_file.parent / dataset_filename

        assert iml_params.path.is_file(), f"{ImmuneMLImport.__name__}: dataset file {dataset_filename} specified in " \
                                          f"{iml_params.metadata_file} could not be found ({iml_params.path} is not a file), " \
                                          f"failed to import the dataset {dataset_name}."

        return ImmuneMLImport._import_from_path(iml_params)

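    # Illustrative sketch only (hypothetical file contents): _import_from_metadata above expects the first line of the metadata csv to be
    # a comment naming the .iml_dataset file located next to it, i.e. something along these lines (assuming Constants.COMMENT_SIGN is '#'):
    #
    #   #my_dataset.iml_dataset
    #   filename,subject_id,disease
    #   rep_1.npy,subject_1,healthy
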
    @staticmethod
    def _update_repertoire_paths(iml_params, dataset):
        path = ImmuneMLImport._discover_repertoire_path(iml_params, dataset)
        if path is not None:
            for repertoire in dataset.repertoires:
                repertoire.data_filename = path / repertoire.data_filename.name
                repertoire.metadata_filename = path / repertoire.metadata_filename.name
        return dataset

    @staticmethod
    def _discover_dataset_dir(iml_params):
        return iml_params.path.parent

    @staticmethod
    def _update_receptor_paths(iml_params, dataset: ElementDataset):
        dataset_dir = ImmuneMLImport._discover_dataset_dir(iml_params)

        # only rewrite the element file paths if the dataset directory contains the expected number of binary batch files
        if len(list(dataset_dir.glob("*.npy"))) == len(dataset.get_filenames()):
            path = dataset_dir
            new_filenames = []
            for file in dataset.get_filenames():
                new_filenames.append(path / file.name)
            dataset.set_filenames(new_filenames)

        return dataset

    @staticmethod
    def _discover_repertoire_path(params, dataset):
        dataset_dir = ImmuneMLImport._discover_dataset_dir(params)

        if len(list(dataset_dir.glob("*.npy"))) == len(dataset.repertoires):
            path = dataset_dir
        elif len(list(dataset_dir.glob("repertoires/*.npy"))) == len(dataset.repertoires):
            path = dataset_dir / "repertoires/"
        else:
            path = None

        return path
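

# Minimal usage sketch (not part of the original module; hypothetical path): import_dataset is normally driven by the YAML specification
# shown in the class docstring, but it can also be called directly, assuming DatasetImportParams fills unspecified parameters with defaults:
if __name__ == "__main__":
    imported = ImmuneMLImport.import_dataset({"path": Path("path/to/dataset.iml_dataset")}, dataset_name="my_dataset")
    print(type(imported).__name__)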