# quality: gold
from pathlib import Path
import yaml
from immuneML.IO.dataset_import.DataImport import DataImport
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.ElementDataset import ElementDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.environment.Constants import Constants
from immuneML.util.ReflectionHandler import ReflectionHandler
class ImmuneMLImport(DataImport):
"""
Imports the dataset from the files previously exported by immuneML. It closely resembles AIRR format but relies on binary
representations and is optimized for faster read-in at runtime.
ImmuneMLImport can import any kind of dataset (RepertoireDataset, SequenceDataset, ReceptorDataset).
This format includes:
1. a dataset file in yaml format with iml_dataset extension with parameters:
- name,
- identifier,
- metadata_file (for repertoire datasets),
- metadata_fields (for repertoire datasets),
- repertoire_ids (for repertoire datasets)
- element_ids (for receptor and sequence datasets),
- labels,
2. a csv metadata file (only for repertoire datasets, should be in the same folder as the iml_dataset file),
3. data files for different types of data. For repertoire datasets, data files include one binary numpy file per repertoire with sequences and associated information and one metadata yaml file per repertoire with details such as repertoire identifier, disease status, subject id and other similar available information. For sequence and receptor datasets, sequences or receptors respectively, are stored in batches in binary numpy files.
Arguments:
path (str): The path to the previously created dataset file. This file should have an '.iml_dataset' extension. If the path has not been specified, immuneML attempts to load the dataset from a specified metadata file (only for RepertoireDatasets).
metadata_file (str): An optional metadata file for a RepertoireDataset. If specified, the RepertoireDataset metadata will be updated to the newly specified metadata without otherwise changing the Repertoire objects
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_dataset:
format: ImmuneML
params:
path: path/to/dataset.iml_dataset
metadata_file: path/to/metadata.csv
"""
    @staticmethod
    def import_dataset(params: dict, dataset_name: str) -> Dataset:
        iml_params = DatasetImportParams.build_object(**params)

        if iml_params.path is not None:
            dataset = ImmuneMLImport._import_from_path(iml_params)
        elif iml_params.metadata_file is not None:
            dataset = ImmuneMLImport._import_from_metadata(iml_params, dataset_name)
        else:
            raise ValueError(f"{ImmuneMLImport.__name__}: neither a path nor a metadata file was defined under key {dataset_name}. "
                             f"At least one of these has to be specified to import the dataset.")

        if isinstance(dataset, RepertoireDataset):
            dataset = ImmuneMLImport._update_repertoire_paths(iml_params, dataset)
        else:
            dataset = ImmuneMLImport._update_receptor_paths(iml_params, dataset)

        return dataset

    @staticmethod
    def _import_from_path(iml_params):
        with iml_params.path.open("r") as file:
            dataset_dict = yaml.safe_load(file)

        assert 'dataset_class' in dataset_dict, f"{ImmuneMLImport.__name__}: 'dataset_class' parameter is missing from the dataset file " \
                                                f"{iml_params.path}."

        dataset_class = ReflectionHandler.get_class_by_name(dataset_dict['dataset_class'])
        del dataset_dict['dataset_class']

        # a metadata file passed in the import params overrides the one stored in the dataset file
        if iml_params.metadata_file is not None and iml_params.metadata_file != '':
            dataset_dict['metadata_file'] = iml_params.metadata_file

        # if the stored metadata file path resolves to the current working directory while the dataset file does not,
        # rebase the metadata file next to the dataset file
        cwd = Path.cwd()
        if 'metadata_file' in dataset_dict and Path(dataset_dict['metadata_file']).parent.samefile(cwd) and not iml_params.path.samefile(cwd):
            dataset_dict['metadata_file'] = iml_params.path.parent / Path(dataset_dict['metadata_file']).name

        dataset = dataset_class.build(**dataset_dict)

        return dataset

    @staticmethod
    def _import_from_metadata(iml_params, dataset_name):
        with iml_params.metadata_file.open("r") as file:
            # the first line of the metadata file is a comment holding the name of the corresponding .iml_dataset file
            dataset_filename = file.readline().replace(Constants.COMMENT_SIGN, "").replace("\n", "")
        iml_params.path = iml_params.metadata_file.parent / dataset_filename

        assert iml_params.path.is_file(), f"{ImmuneMLImport.__name__}: dataset file {dataset_filename} specified in " \
                                          f"{iml_params.metadata_file} could not be found ({iml_params.path} is not a file), " \
                                          f"failed to import the dataset {dataset_name}."

        return ImmuneMLImport._import_from_path(iml_params)

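    # Illustrative sketch only (hypothetical file contents): _import_from_metadata above expects the first line of the metadata csv to be
    # a comment naming the .iml_dataset file located next to it, i.e. something along these lines (assuming Constants.COMMENT_SIGN is '#'):
    #
    #   #my_dataset.iml_dataset
    #   filename,subject_id,disease
    #   rep_1.npy,subject_1,healthy
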
    @staticmethod
    def _update_repertoire_paths(iml_params, dataset):
        path = ImmuneMLImport._discover_repertoire_path(iml_params, dataset)
        if path is not None:
            for repertoire in dataset.repertoires:
                repertoire.data_filename = path / repertoire.data_filename.name
                repertoire.metadata_filename = path / repertoire.metadata_filename.name
        return dataset

    @staticmethod
    def _discover_dataset_dir(iml_params):
        return iml_params.path.parent

    @staticmethod
    def _update_receptor_paths(iml_params, dataset: ElementDataset):
        dataset_dir = ImmuneMLImport._discover_dataset_dir(iml_params)

        # only rewrite the element file paths if the dataset directory contains the expected number of binary batch files
        if len(list(dataset_dir.glob("*.npy"))) == len(dataset.get_filenames()):
            path = dataset_dir
            new_filenames = []
            for file in dataset.get_filenames():
                new_filenames.append(path / file.name)
            dataset.set_filenames(new_filenames)

        return dataset

    @staticmethod
    def _discover_repertoire_path(params, dataset):
        dataset_dir = ImmuneMLImport._discover_dataset_dir(params)

        if len(list(dataset_dir.glob("*.npy"))) == len(dataset.repertoires):
            path = dataset_dir
        elif len(list(dataset_dir.glob("repertoires/*.npy"))) == len(dataset.repertoires):
            path = dataset_dir / "repertoires/"
        else:
            path = None

        return path
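

# Minimal usage sketch (not part of the original module; hypothetical path): import_dataset is normally driven by the YAML specification
# shown in the class docstring, but it can also be called directly, assuming DatasetImportParams fills unspecified parameters with defaults:
if __name__ == "__main__":
    imported = ImmuneMLImport.import_dataset({"path": Path("path/to/dataset.iml_dataset")}, dataset_name="my_dataset")
    print(type(imported).__name__)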