Source code for immuneML.preprocessing.filters.ChainRepertoireFilter

from pathlib import Path

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.preprocessing.filters.Filter import Filter


class ChainRepertoireFilter(Filter):
    """
    Removes all repertoires from the RepertoireDataset object which contain at least one sequence with chain different
    than "keep_chain" parameter. Note that this filter filters out repertoires, not individual sequences, and can thus
    only be applied to RepertoireDatasets.

    Arguments:

        keep_chain (:py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`): Which chain should be kept. The given value is resolved with ``Chain.get_chain`` when the filter is applied.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    ChainRepertoireFilter:
                        keep_chain: TRB

    """

    def __init__(self, keep_chain: Chain):
        # Stored as given; it is only resolved through Chain.get_chain inside process().
        self.keep_chain = keep_chain

    def process_dataset(self, dataset: RepertoireDataset, result_path: Path = None):
        """
        Applies the filter configured on this instance to the given dataset.

        Packs the instance's ``keep_chain`` and the given ``result_path`` into the params dict
        and delegates to :py:meth:`process`.

        Arguments:

            dataset: the dataset to filter.

            result_path: forwarded unchanged as ``params["result_path"]``; defaults to None.

        Returns:

            whatever :py:meth:`process` returns for these parameters.
        """
        return ChainRepertoireFilter.process(dataset=dataset,
                                             params={"keep_chain": self.keep_chain, "result_path": result_path})

    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        """
        Returns a clone of the dataset that keeps only repertoires whose sequences all have the requested chain.

        Arguments:

            dataset: the dataset to filter; its type is verified with ``check_dataset_type`` against
                RepertoireDataset before any filtering happens.

            params: dict that must contain the keys ``"keep_chain"`` (the chain to keep, resolved with
                ``Chain.get_chain``) and ``"result_path"`` (passed on to ``build_new_metadata``).

        Returns:

            the cloned dataset with its ``repertoires`` and ``metadata_file`` attributes replaced.
            The input dataset object itself is not reassigned by this method.
        """
        ChainRepertoireFilter.check_dataset_type(dataset, [RepertoireDataset], "ChainRepertoireFilter")
        processed_dataset = dataset.clone()

        # Resolve the requested chain once: it does not depend on the repertoire or the sequence,
        # so there is no need to recompute it for every single comparison.
        keep_chain = Chain.get_chain(params["keep_chain"])

        repertoires = []
        indices = []
        for index, repertoire in enumerate(dataset.get_data()):
            # all() is True for an empty iterable, so a repertoire without sequences is kept.
            if all(sequence.metadata.chain == keep_chain for sequence in repertoire.sequences):
                repertoires.append(repertoire)
                indices.append(index)

        processed_dataset.repertoires = repertoires
        # The positions of the kept repertoires are handed over so that the metadata can be rebuilt to match them.
        processed_dataset.metadata_file = ChainRepertoireFilter.build_new_metadata(processed_dataset, indices,
                                                                                   params["result_path"])

        # Runs only after repertoires and metadata have been replaced on the clone.
        ChainRepertoireFilter.check_dataset_not_empty(processed_dataset, "ChainRepertoireFilter")

        return processed_dataset