# Source code for immuneML.workflows.instructions.dataset_generation.DatasetExportInstruction

import datetime
from pathlib import Path
from typing import List

from immuneML.IO.dataset_export.DataExporter import DataExporter
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.util.ReflectionHandler import ReflectionHandler
from immuneML.workflows.instructions.Instruction import Instruction
from immuneML.workflows.instructions.dataset_generation.DatasetExportState import DatasetExportState
from scripts.specification_util import update_docs_per_mapping


class DatasetExportInstruction(Instruction):
    """
    DatasetExport instruction takes a list of datasets as input and outputs them in specified formats.

    Arguments:

        datasets (list): a list of datasets to export in all given formats

        export_formats (list): a list of formats in which to export the datasets. Valid formats are class names of any non-abstract class inheriting :py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`. Important note: Pickle files might not be compatible between different immuneML (sub)versions.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_dataset_export_instruction: # user-defined instruction name
            type: DatasetExport # which instruction to execute
            datasets: # list of datasets to export
                - my_generated_dataset
                - my_dataset_from_adaptive
            export_formats: # list of formats to export the datasets to
                - AIRR
                - Pickle

    """

    def __init__(self, datasets: List[Dataset], exporters: List[DataExporter], result_path: Path = None, name: str = None):
        self.datasets = datasets
        self.exporters = exporters
        self.result_path = result_path
        self.name = name

    @staticmethod
    def _get_format_name(exporter) -> str:
        """
        Returns the export format name for the given exporter class: its class name with the
        trailing "Exporter" removed. A class name without that suffix is returned unchanged
        instead of being truncated.
        """
        suffix = "Exporter"
        class_name = exporter.__name__
        return class_name[:-len(suffix)] if class_name.endswith(suffix) else class_name

    def run(self, result_path: Path) -> DatasetExportState:
        """
        Exports every dataset with every exporter under ``result_path / self.name``.

        Each export is written to ``<result_path>/<name>/<dataset name>/<format name>`` and a
        progress line is printed per exported dataset/format pair.

        Arguments:

            result_path (Path): parent directory of this instruction's output; ``self.name`` must
                be set, since it is joined onto this path.

        Returns:

            a DatasetExportState holding the datasets, the format names, the instruction's result
            path and name, and a nested dict ``paths[dataset_name][format_name]`` of export locations.
        """
        self.result_path = result_path / self.name

        # format names depend only on the exporters, so compute them once for both the loop and the state
        export_formats = [self._get_format_name(exporter) for exporter in self.exporters]
        paths = {}

        for dataset in self.datasets:
            dataset_name = dataset.name if dataset.name is not None else dataset.identifier
            # human-readable label derived from the dataset class name, e.g. "XDataset" -> "xs"
            contains = dataset.__class__.__name__.replace("Dataset", "s").lower()
            paths[dataset_name] = {}

            for exporter, export_format in zip(self.exporters, export_formats):
                path = self.result_path / dataset_name / export_format
                exporter.export(dataset, path)
                paths[dataset_name][export_format] = path
                print(f"{datetime.datetime.now()}: Exported dataset {dataset_name} containing {dataset.get_example_count()} {contains} in {export_format} format.", flush=True)

        return DatasetExportState(datasets=self.datasets, formats=export_formats,
                                  paths=paths, result_path=self.result_path, name=self.name)

    @staticmethod
    def get_documentation():
        """
        Returns the class docstring with the generic description of valid formats replaced by
        the list of exporter names obtained from ReflectionHandler.
        """
        doc = str(DatasetExportInstruction.__doc__)

        valid_strategy_values = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, "Exporter", "dataset_export/")
        # drop the first and last character of the string form and swap single quotes for backticks
        # (presumably the value is a list of names, so this yields `A`, `B`, ... — confirm against ReflectionHandler)
        valid_strategy_values = str(valid_strategy_values)[1:-1].replace("'", "`")
        # the key must match the sentence in the class docstring exactly for the replacement to apply
        mapping = {
            "Valid formats are class names of any non-abstract class inheriting "
            ":py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`.": f"Valid values are: {valid_strategy_values}."
        }
        doc = update_docs_per_mapping(doc, mapping)
        return doc