Source code for immuneML.encodings.reference_encoding.MatchedReferenceUtil

import logging
import os
from pathlib import Path

from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.dsl.DefaultParamsLoader import DefaultParamsLoader
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.ImportHelper import ImportHelper
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.ReflectionHandler import ReflectionHandler


[docs] class MatchedReferenceUtil: """ Utility class for MatchedSequencesEncoder and MatchedReceptorsEncoder """
[docs] @staticmethod def prepare_reference(reference_params: dict, location: str, paired: bool): ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location, "reference") seq_import_params = reference_params["params"] if "params" in reference_params else {} assert os.path.isfile(seq_import_params["path"]), f"{location}: the file {seq_import_params['path']} does not exist. " \ f"Specify the correct path under reference." if "is_repertoire" in seq_import_params: assert seq_import_params["is_repertoire"] is False, f"{location}: is_repertoire must be False for SequenceImport" else: seq_import_params["is_repertoire"] = False if "paired" in seq_import_params: assert seq_import_params["paired"] == paired, f"{location}: paired must be {paired} for SequenceImport" else: seq_import_params["paired"] = paired format_str = reference_params["format"] import_class = ReflectionHandler.get_class_by_name(f"{format_str}Import") default_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets", DefaultParamsLoader.convert_to_snake_case(format_str)) params = {**default_params, **seq_import_params} processed_params = DatasetImportParams.build_object(**params) path = Path(reference_params['params']['path']) processed_params.result_path = PathBuilder.build(path.parent / 'iml_imported' if path.is_file() else path / 'iml_imported') if format_str == "SingleLineReceptor": receptors = list(import_class.import_dataset(processed_params, 'tmp_receptor_dataset').get_data()) else: receptors = ImportHelper.import_items(import_class, reference_params["params"]["path"], processed_params) assert len(receptors) > 0, f"MatchedReferenceUtil: The total number of imported reference {'receptors' if paired else 'sequences'} is 0, please ensure that reference import is specified correctly." logging.info(f"MatchedReferenceUtil: successfully imported {len(receptors)} reference {'receptors' if paired else 'sequences'}.") return receptors