Source code for immuneML.IO.dataset_import.RandomRepertoireDatasetImport

from immuneML.IO.dataset_import.DataImport import DataImport
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.simulation.dataset_generation.RandomDatasetGenerator import RandomDatasetGenerator
from immuneML.util.ParameterValidator import ParameterValidator


class RandomRepertoireDatasetImport(DataImport):
    """
    Returns a RepertoireDataset consisting of randomly generated sequences, which can be used for
    benchmarking purposes. The sequences consist of uniformly chosen amino acids or nucleotides.

    Arguments:

        repertoire_count (int): The number of repertoires the RepertoireDataset should contain.

        sequence_count_probabilities (dict): A mapping where the keys are the number of sequences per
        repertoire, and the values are the probabilities that any of the repertoires would have that
        number of sequences. For example, to create a random RepertoireDataset where 40% of the
        repertoires would have 1000 sequences, and the other 60% would have 1100 sequences, this
        mapping would need to be specified:

            .. indent with spaces
            .. code-block:: yaml

                1000: 0.4
                1100: 0.6

        sequence_length_probabilities (dict): A mapping where the keys correspond to different sequence
        lengths, and the values are the probabilities for choosing each sequence length. For example,
        to create a random RepertoireDataset where 40% of the sequences would be of length 10, and 60%
        of the sequences would have length 12, this mapping would need to be specified:

            .. indent with spaces
            .. code-block:: yaml

                10: 0.4
                12: 0.6

        labels (dict): A mapping that specifies randomly chosen labels to be assigned to the
        Repertoires. One or multiple labels can be specified here. The keys of this mapping are the
        labels, and the values consist of another mapping between label classes and their
        probabilities. For example, to create a random RepertoireDataset with the label CMV where 70%
        of the Repertoires have class cmv_positive and the remaining 30% have class cmv_negative, the
        following mapping should be specified:

            .. indent with spaces
            .. code-block:: yaml

                CMV:
                    cmv_positive: 0.7
                    cmv_negative: 0.3

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_random_dataset:
            format: RandomRepertoireDataset
            params:
                repertoire_count: 100 # number of random repertoires to generate
                sequence_count_probabilities:
                    10: 0.5 # probability that any of the repertoires would have 10 receptor sequences
                    20: 0.5
                sequence_length_probabilities:
                    10: 0.5 # probability that any of the receptor sequences would be 10 amino acids in length
                    12: 0.5
                labels: # randomly assigned labels (only useful for simple benchmarking)
                    cmv:
                        True: 0.5 # probability of value True for label cmv to be assigned to any repertoire
                        False: 0.5

    """

    @staticmethod
    def import_dataset(params: dict, dataset_name: str) -> RepertoireDataset:
        """
        Build a random RepertoireDataset from the given import parameters.

        The keys of ``params`` are first checked against the list of accepted keys via
        ``ParameterValidator.assert_all_in_valid_list``; the values are then handed to
        ``RandomDatasetGenerator.generate_repertoire_dataset``.

        Arguments:

            params (dict): Import parameters. The keys ``repertoire_count``,
            ``sequence_count_probabilities``, ``sequence_length_probabilities``, ``labels`` and
            ``result_path`` are all read by direct indexing, so each of them has to be present.

            dataset_name (str): Not used by this method.

        Returns:

            The object returned by ``RandomDatasetGenerator.generate_repertoire_dataset``.
        """
        accepted_keys = [
            "result_path",
            "repertoire_count",
            "sequence_count_probabilities",
            "sequence_length_probabilities",
            "labels",
        ]
        supplied_keys = list(params.keys())
        ParameterValidator.assert_all_in_valid_list(supplied_keys, accepted_keys,
                                                    "RandomRepertoireDatasetImport", "params")

        # Values are looked up in the same order as the generator's keyword arguments;
        # "result_path" is forwarded under the generator's keyword name "path".
        generator_arguments = {
            "repertoire_count": params["repertoire_count"],
            "sequence_count_probabilities": params["sequence_count_probabilities"],
            "sequence_length_probabilities": params["sequence_length_probabilities"],
            "labels": params["labels"],
            "path": params["result_path"],
        }
        return RandomDatasetGenerator.generate_repertoire_dataset(**generator_arguments)