Source code for immuneML.preprocessing.filters.ClonesPerRepertoireFilter

from pathlib import Path

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.preprocessing.filters.Filter import Filter


[docs]class ClonesPerRepertoireFilter(Filter):
    """
    Removes all repertoires from the RepertoireDataset, which contain fewer clonotypes than specified by the
    lower_limit, or more clonotypes than specified by the upper_limit.
    Note that this filter filters out repertoires, not individual sequences, and can thus only be applied to RepertoireDatasets.

    Since the filter removes repertoires from the dataset (examples in machine learning setting), it cannot be used with :ref:`TrainMLModel`
    instruction. If you want to use this filter, see :ref:`DatasetExport` instruction with preprocessing.

    Arguments:

        lower_limit (int): The minimal inclusive lower limit for the number of clonotypes allowed in a repertoire.

        upper_limit (int): The maximal inclusive upper limit for the number of clonotypes allowed in a repertoire.

    When no lower or upper limit is specified, or the value -1 is specified, the limit is ignored.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    ClonesPerRepertoireFilter:
                        lower_limit: 100
                        upper_limit: 100000

    """

    def __init__(self, result_path: Path = None, lower_limit: int = -1, upper_limit: int = -1):
        super().__init__(result_path)
        self.lower_limit = lower_limit
        self.upper_limit = upper_limit

[docs]    def keeps_example_count(self) -> bool:
        return False

[docs]    def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
        self.check_dataset_type(dataset, [RepertoireDataset], "ClonesPerRepertoireFilter")
        self.result_path = result_path if result_path is not None else self.result_path

        processed_dataset = dataset.clone()
        repertoires, indices = [], []

        for index, repertoire in enumerate(dataset.get_data()):
            if self.lower_limit != -1 and len(repertoire.sequences) < self.lower_limit:
                continue
            if self.upper_limit != -1 and len(repertoire.sequences) > self.upper_limit:
                continue
            repertoires.append(dataset.repertoires[index])
            indices.append(index)

        processed_dataset.repertoires = repertoires
        processed_dataset.metadata_file = self._build_new_metadata(dataset, indices)

        self.check_dataset_not_empty(processed_dataset, "ClonesPerRepertoireFilter")

        return processed_dataset