# Source code for immuneML.workflows.instructions.subsampling.SubsamplingInstruction

import random
import shutil
from pathlib import Path
from typing import List

from immuneML.IO.dataset_export.DataExporter import DataExporter
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.ReflectionHandler import ReflectionHandler
from immuneML.workflows.instructions.Instruction import Instruction
from immuneML.workflows.instructions.subsampling.SubsamplingState import SubsamplingState
from scripts.specification_util import update_docs_per_mapping


class SubsamplingInstruction(Instruction):
    """
    Subsampling is an instruction that subsamples a given dataset and creates multiple smaller datasets according to the parameters provided.

    Arguments:

        dataset (Dataset): original dataset which will be used as a basis for subsampling

        subsampled_dataset_sizes (list): a list of dataset sizes (number of examples) each subsampled dataset should have

        dataset_export_formats (list): in which formats to export the subsampled datasets. Valid formats are class names of any non-abstract class inheriting :py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_subsampling_instruction: # user-defined name of the instruction
            type: Subsampling # which instruction to execute
            dataset: my_dataset # original dataset to be subsampled, with e.g., 300 examples
            subsampled_dataset_sizes: # how large the subsampled datasets should be, one dataset will be created for each list item
                - 200 # one subsampled dataset with 200 examples (200 repertoires if my_dataset was repertoire dataset)
                - 100 # the other subsampled dataset will have 100 examples
            dataset_export_formats: # in which formats to export the subsampled datasets
                - ImmuneML
                - AIRR

    """

    # Suffix stripped from exporter class names to obtain the short format name used in output paths.
    _EXPORTER_SUFFIX = "Exporter"

    def __init__(self, dataset: Dataset, subsampled_dataset_sizes: List[int], dataset_export_formats: list, result_path: Path = None, name: str = None):
        # All inputs, and later all results of run(), are kept on the state object.
        self.state = SubsamplingState(dataset, subsampled_dataset_sizes, dataset_export_formats, result_path, name)

    def run(self, result_path: Path):
        """
        Creates one randomly subsampled dataset per entry in ``subsampled_dataset_sizes`` and exports each of them.

        Examples are drawn without replacement (``random.sample``) independently for each requested size.
        Every new dataset is named ``<original name>_<size>_subsampled_<1-based position in the size list>``,
        stored in a folder of the same name under ``result_path / <instruction name>``, appended to
        ``state.subsampled_datasets`` and exported through :py:meth:`export_dataset`.

        Arguments:

            result_path (Path): parent folder; the instruction writes into the subfolder named after the instruction

        Returns:

            the instruction state (``self.state``) with the subsampled datasets and their export paths filled in

        Raises:

            ValueError: if any requested size is negative or larger than the number of examples in the original dataset

        """
        example_count = self.state.dataset.get_example_count()

        # Validate every requested size before touching the file system: random.sample would raise a generic
        # "sample larger than population" ValueError only when reaching the offending size, after earlier
        # subsampled datasets had already been created and exported.
        for dataset_size in self.state.subsampled_dataset_sizes:
            if not 0 <= dataset_size <= example_count:
                raise ValueError(f"{SubsamplingInstruction.__name__}: cannot create a subsampled dataset with {dataset_size} examples "
                                 f"from dataset '{self.state.dataset.name}' which has {example_count} examples; "
                                 f"each subsampled dataset size has to be between 0 and {example_count}.")

        self.state.result_path = PathBuilder.build(result_path / self.state.name)

        example_indices = list(range(example_count))

        for index, dataset_size in enumerate(self.state.subsampled_dataset_sizes):

            # the position in the list is part of the name so that repeated sizes do not collide
            new_dataset_name = f"{self.state.dataset.name}_{dataset_size}_subsampled_{index+1}"
            new_dataset_path = PathBuilder.build(self.state.result_path / new_dataset_name)

            new_example_indices = random.sample(example_indices, k=dataset_size)
            new_dataset = self.state.dataset.make_subset(new_example_indices, new_dataset_path, Dataset.SUBSAMPLED)
            new_dataset.name = new_dataset_name

            self.state.subsampled_datasets.append(new_dataset)

            self.export_dataset(new_dataset, new_dataset_path)

        return self.state

    def export_dataset(self, new_dataset, new_dataset_path):
        """
        Exports ``new_dataset`` with every exporter listed in ``state.dataset_exporters`` and zips each export.

        For each exporter the dataset is written to ``new_dataset_path / exported / <format>`` and that folder is
        archived as ``new_dataset_path / exported_<format>_<dataset name>.zip``. The archive locations are recorded
        in ``state.subsampled_dataset_paths[<dataset name>][<format>]``.

        Arguments:

            new_dataset: the subsampled dataset to export; its ``name`` is used as the key in the state and in the archive name

            new_dataset_path (Path): folder of the subsampled dataset under which the exports and archives are created

        """
        paths_per_format = {}
        self.state.subsampled_dataset_paths[new_dataset.name] = paths_per_format

        for exporter in self.state.dataset_exporters:
            exporter_name = self._get_exporter_name(exporter)
            export_path = new_dataset_path / f"exported/{exporter_name}/"
            exporter.export(new_dataset, export_path)
            # make_archive returns the full file name of the created archive as a string
            zip_export_path = shutil.make_archive(new_dataset_path / f"exported_{exporter_name}_{new_dataset.name}", "zip", export_path)
            paths_per_format[exporter_name] = zip_export_path

    @staticmethod
    def _get_exporter_name(exporter) -> str:
        """
        Returns the lowercase format name of an exporter class, e.g. ``AIRRExporter`` -> ``airr``.

        The ``Exporter`` suffix is removed only when the class name actually ends with it, so a class that does not
        follow the naming convention keeps its full (lowercased) name instead of losing its last characters.
        """
        class_name = exporter.__name__
        suffix = SubsamplingInstruction._EXPORTER_SUFFIX
        if class_name.endswith(suffix) and len(class_name) > len(suffix):
            class_name = class_name[:-len(suffix)]
        return class_name.lower()

    @staticmethod
    def get_documentation():
        """
        Returns the class docstring with the generic description of valid export formats replaced by the list of
        exporter names obtained from ``ReflectionHandler``.
        """
        doc = str(SubsamplingInstruction.__doc__)

        valid_formats = ReflectionHandler.all_nonabstract_subclass_basic_names(DataExporter, "Exporter", "dataset_export/")
        # turn the list representation "['A', 'B']" into "`A`, `B`" for the rendered documentation
        valid_formats = str(valid_formats)[1:-1].replace("'", "`")
        # the key has to match the sentence in the class docstring exactly, otherwise nothing is replaced
        mapping = {
            "Valid formats are class names of any non-abstract class inheriting "
            ":py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`.": f"Valid values are: {valid_formats}."
        }
        doc = update_docs_per_mapping(doc, mapping)
        return doc
# Versions

# Source code for immuneML.workflows.instructions.subsampling.SubsamplingInstruction