Source code for immuneML.simulation.dataset_generation.RandomDatasetGenerator

import copy
import random
import uuid
from pathlib import Path

from immuneML.IO.dataset_export.AIRRExporter import AIRRExporter
from immuneML.data_model.SequenceParams import RegionType, ChainPair
from immuneML.data_model.SequenceSet import ReceptorSequence, Receptor
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.datasets.ElementDataset import ReceptorDataset, SequenceDataset
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.RepertoireBuilder import RepertoireBuilder



[docs]
class RandomDatasetGenerator:

    @staticmethod
    def _check_probabilities(probabilities_dict, key_type, dict_name):
        assert isinstance(probabilities_dict, dict) and all(
            isinstance(key, key_type) for key in probabilities_dict.keys()) \
               and all(isinstance(value, float) or value in {0, 1} for value in probabilities_dict.values()) \
               and 0.99 <= round(sum(probabilities_dict.values()), 5) <= 1, \
            f"RandomDatasetGenerator: {dict_name} are not specified correctly. They should be a dictionary with probabilities per count " \
            f"and sum to 1, but got {probabilities_dict} instead."

    @staticmethod
    def _check_labels(labels: dict):
        if labels is not None:
            assert isinstance(labels, dict)
            for label in labels:
                RandomDatasetGenerator._check_probabilities(labels[label], object, f"labels - {label}")

    @staticmethod
    def _check_example_count(count, name):
        assert isinstance(count, int) and count > 0, f"RandomDatasetGenerator: {name} is not specified properly. " \
                                                     f"It should be a positive integer, got {count} instead."

    @staticmethod
    def _check_path(path: Path):
        assert path is not None, "RandomDatasetGenerator: path cannot be None when generating datasets."

    @staticmethod
    def _check_rep_dataset_generation_params(repertoire_count: int, sequence_count_probabilities: dict,
                                             sequence_length_probabilities: dict,
                                             labels: dict, path: Path):

        RandomDatasetGenerator._check_example_count(repertoire_count, "repertoire_count")
        RandomDatasetGenerator._check_probabilities(sequence_count_probabilities, int, "sequence_count_probabilities")
        RandomDatasetGenerator._check_probabilities(sequence_length_probabilities, int, "sequence_length_probabilities")
        RandomDatasetGenerator._check_labels(labels)
        RandomDatasetGenerator._check_path(path)


[docs]
    @staticmethod
    def generate_repertoire_dataset(repertoire_count: int, sequence_count_probabilities: dict,
                                    sequence_length_probabilities: dict,
                                    labels: dict, path: Path, name="repertoire_dataset") -> RepertoireDataset:
        """
        Creates repertoire_count repertoires where the number of sequences per repertoire is sampled from the probability distribution given
        in sequence_count_probabilities. The length of sequences is sampled independently for each sequence from the
        sequence_length_probabilities distribution, and each sequence is a random string over the sequence alphabet. The labels are also
        randomly assigned to repertoires from the distribution given in labels. In this case, labels are multi-class, so each repertoire
        will get one class from each label. This means that negative classes for the labels should be included as well in the specification.

        The generated dataset is built under the given path and exported to that same path in AIRR format before it is returned.

        An example of input parameters is given below:
        repertoire_count: 100 # generate 100 repertoires
        sequence_count_probabilities:
            100: 0.5 # on average, half of the generated repertoires will have 100 sequences
            200: 0.5 # on average, the other half of the generated repertoires will have 200 sequences
        sequence_length_probabilities:
            14: 0.8 # on average, 80% of all generated sequences for all repertoires will have length 14
            15: 0.2 # on average, 20% of all generated sequences across all repertoires will have length 15
        labels:
            cmv: # label name
                True: 0.5 # on average, 50% of the repertoires will have class True
                False: 0.5 # on average, 50% of the repertoires will have class False
            coeliac: # next label with classes that will be assigned to repertoires independently of the previous label or any other parameter
                1: 0.3 # on average, 30% of the generated repertoires will have class 1
                0: 0.7 # on average, 70% of the generated repertoires will have class 0

        """
        RandomDatasetGenerator._check_rep_dataset_generation_params(repertoire_count, sequence_count_probabilities,
                                                                    sequence_length_probabilities,
                                                                    labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        possible_counts = list(sequence_count_probabilities.keys())
        count_weights = list(sequence_count_probabilities.values())
        possible_lengths = list(sequence_length_probabilities.keys())
        length_weights = list(sequence_length_probabilities.values())

        # one list of random sequences per repertoire; the sampling order (sequence count first, then for each
        # sequence its length followed by its letters) is kept fixed so that seeded runs stay reproducible
        sequences = []
        for _ in range(repertoire_count):
            sequence_count = random.choices(possible_counts, count_weights)[0]
            repertoire_sequences = []
            for _ in range(sequence_count):
                sequence_length = random.choices(possible_lengths, length_weights)[0]
                repertoire_sequences.append("".join(random.choices(alphabet, k=sequence_length)))
            sequences.append(repertoire_sequences)

        # only the sampled per-repertoire classes are needed here, not the list of possible classes
        processed_labels, _ = RandomDatasetGenerator._make_labels(labels, repertoire_count)

        dataset = RepertoireBuilder.build_dataset(sequences=sequences, path=path, labels=processed_labels, name=name)

        AIRRExporter.export(dataset, path)

        return dataset


    @staticmethod
    def _make_labels(labels: dict, element_count: int):
        if labels is not None:
            processed_labels = {
                label: random.choices(list(labels[label].keys()), labels[label].values(), k=element_count) for label
                in labels}
            dataset_params = {label: list(labels[label].keys()) for label in labels}
        else:
            processed_labels = None
            dataset_params = None

        return processed_labels, dataset_params

    @staticmethod
    def _check_receptor_dataset_generation_params(receptor_count: int, chain_1_length_probabilities: dict,
                                                  chain_2_length_probabilities: dict, labels: dict, path: Path):

        RandomDatasetGenerator._check_probabilities(chain_1_length_probabilities, int, 'chain_1_length_probabilities')
        RandomDatasetGenerator._check_probabilities(chain_2_length_probabilities, int, 'chain_2_length_probabilities')
        RandomDatasetGenerator._check_example_count(receptor_count, "receptor_count")
        RandomDatasetGenerator._check_labels(labels)
        RandomDatasetGenerator._check_path(path)


[docs]
    @staticmethod
    def generate_receptor_dataset(receptor_count: int, chain_1_length_probabilities: dict,
                                  chain_2_length_probabilities: dict, labels: dict,
                                  path: Path, name="receptor_dataset"):
        """
        Creates receptor_count receptors where the length of sequences in each chain is sampled independently for each sequence from
        chain_n_length_probabilities distribution. The labels are also randomly assigned to receptors from the distribution given in
        labels. In this case, labels are multi-class, so each receptor will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification. chain 1 and 2 in this case refer to alpha and beta
        chain of a T-cell receptor.

        labels may be None, in which case the receptors only get the automatically generated "subject" metadata field.
        Each chain additionally gets a random duplicate count between 1 and 10 and fixed V and J gene calls. The generated
        dataset is exported to the given path in AIRR format before it is returned.

        An example of input parameters is given below:

        receptor_count: 100 # generate 100 TRABReceptors
        chain_1_length_probabilities:
            14: 0.8 # on average, 80% of all generated sequences for all receptors (for chain 1) will have length 14
            15: 0.2 # on average, 20% of all generated sequences across all receptors (for chain 1) will have length 15
        chain_2_length_probabilities:
            14: 0.8 # on average, 80% of all generated sequences for all receptors (for chain 2) will have length 14
            15: 0.2 # on average, 20% of all generated sequences across all receptors (for chain 2) will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # on average, 50% of the receptors will have class True
                False: 0.5 # on average, 50% of the receptors will have class False
            epitope2: # next label with classes that will be assigned to receptors independently of the previous label or other parameters
                1: 0.3 # on average, 30% of the generated receptors will have class 1
                0: 0.7 # on average, 70% of the generated receptors will have class 0

        """
        RandomDatasetGenerator._check_receptor_dataset_generation_params(receptor_count, chain_1_length_probabilities,
                                                                         chain_2_length_probabilities, labels, path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        # the parameter check accepts labels=None, so it must not be dereferenced directly when sampling metadata
        label_distributions = labels if labels is not None else {}

        def make_random_chain(length_probabilities: dict, locus: str, cell_id: str) -> ReceptorSequence:
            # sampling order (length, letters, duplicate count) is kept fixed so that seeded runs stay reproducible
            length = random.choices(list(length_probabilities.keys()), list(length_probabilities.values()))[0]
            sequence_aa = "".join(random.choices(alphabet, k=length))
            return ReceptorSequence(sequence_aa=sequence_aa, duplicate_count=random.randint(1, 10),
                                    v_call=locus + "V1-1*01", j_call=locus + "J1-1*01", locus=locus,
                                    cell_id=str(cell_id))

        receptors = []
        for i in range(receptor_count):
            receptor_id = uuid.uuid4().hex
            chain_1 = make_random_chain(chain_1_length_probabilities, "TRA", receptor_id)
            chain_2 = make_random_chain(chain_2_length_probabilities, "TRB", receptor_id)

            metadata = {label: random.choices(list(class_probabilities.keys()), list(class_probabilities.values()), k=1)[0]
                        for label, class_probabilities in label_distributions.items()}
            metadata["subject"] = f"subj_{i + 1}"

            receptor = Receptor(chain_1=chain_1, chain_2=chain_2, chain_pair=ChainPair["TRA_TRB"],
                                cell_id=str(receptor_id), receptor_id=receptor_id, metadata=metadata)

            # every chain carries its own copy of the receptor-level metadata
            receptor.chain_1.metadata = copy.deepcopy(receptor.metadata)
            receptor.chain_2.metadata = copy.deepcopy(receptor.metadata)

            receptors.append(receptor)

        # only the possible classes per label are passed on (None when no labels were given); the call is kept as
        # is so that the state of the random generator after this method does not change for seeded callers
        # NOTE(review): assumes build_from_objects accepts labels=None - confirm against ReceptorDataset
        _, dataset_params = RandomDatasetGenerator._make_labels(labels, receptor_count)
        dataset = ReceptorDataset.build_from_objects(receptors=receptors, path=path, name=name,
                                                     labels=dataset_params)

        AIRRExporter.export(dataset, path)

        return dataset


    @staticmethod
    def _check_sequence_dataset_generation_params(receptor_count: int, length_probabilities: dict, labels: dict,
                                                  path: Path):
        RandomDatasetGenerator._check_probabilities(length_probabilities, int, 'length_probabilities')
        RandomDatasetGenerator._check_example_count(receptor_count, "receptor_count")
        RandomDatasetGenerator._check_labels(labels)
        RandomDatasetGenerator._check_path(path)


[docs]
    @staticmethod
    def generate_sequence_dataset(sequence_count: int, length_probabilities: dict, labels: dict, path: Path,
                                  region_type: str = RegionType.IMGT_CDR3.name, name="sequence_dataset"):
        """
        Creates sequence_count receptor sequences (single chain) where the length of each sequence is sampled independently from
        length_probabilities distribution. The labels are also randomly assigned to sequences from the distribution given in
        labels. In this case, labels are multi-class, so each sequence will get one class from each label. This means that negative
        classes for the labels should be included as well in the specification.

        labels may be None, in which case the sequences only get the automatically generated "subject" metadata field.
        Each sequence additionally gets a random duplicate count between 1 and 10 and fixed TRB V and J gene calls.
        region_type is the name of a RegionType member (case-insensitive). The generated dataset is exported to the
        given path in AIRR format before it is returned.

        An example of input parameters is given below:

        sequence_count: 100 # generate 100 TRB ReceptorSequences
        length_probabilities:
            14: 0.8 # on average, 80% of all generated sequences will have length 14
            15: 0.2 # on average, 20% of all generated sequences will have length 15
        labels:
            epitope1: # label name
                True: 0.5 # on average, 50% of the sequences will have class True
                False: 0.5 # on average, 50% of the sequences will have class False
            epitope2: # next label with classes that will be assigned to sequences independently of the previous label or other parameters
                1: 0.3 # on average, 30% of the generated sequences will have class 1
                0: 0.7 # on average, 70% of the generated sequences will have class 0

        """
        RandomDatasetGenerator._check_sequence_dataset_generation_params(sequence_count, length_probabilities, labels,
                                                                         path)

        alphabet = EnvironmentSettings.get_sequence_alphabet()
        PathBuilder.build(path)

        chain = "TRB"

        # the parameter check accepts labels=None, so it must not be dereferenced directly when sampling metadata
        label_distributions = labels if labels is not None else {}

        possible_lengths = list(length_probabilities.keys())
        length_weights = list(length_probabilities.values())

        sequences = []
        for i in range(sequence_count):
            # sampling order (length, letters, duplicate count, label classes) is kept fixed so that seeded runs
            # stay reproducible
            sequence_length = random.choices(possible_lengths, length_weights)[0]
            sequence_aa = "".join(random.choices(alphabet, k=sequence_length))
            duplicate_count = random.randint(1, 10)

            metadata = {label: random.choices(list(class_probabilities.keys()), list(class_probabilities.values()), k=1)[0]
                        for label, class_probabilities in label_distributions.items()}
            metadata["subject"] = f"subj_{i + 1}"

            sequences.append(ReceptorSequence(uuid.uuid4().hex, None, sequence_aa,
                                              locus=chain, v_call=chain + "V1-1*01", j_call=chain + "J1-1*01",
                                              duplicate_count=duplicate_count, metadata=metadata))

        # only the possible classes per label are passed on (None when no labels were given); the call is kept as
        # is so that the state of the random generator after this method does not change for seeded callers
        # NOTE(review): assumes build_from_objects accepts labels=None - confirm against SequenceDataset
        _, dataset_params = RandomDatasetGenerator._make_labels(labels, sequence_count)
        dataset = SequenceDataset.build_from_objects(sequences, path, name=name, labels=dataset_params,
                                                     region_type=RegionType[region_type.upper()])

        AIRRExporter.export(dataset, path)

        return dataset