Source code for immuneML.preprocessing.filters.SequenceLengthFilter

from multiprocessing import Pool
from pathlib import Path

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.environment.SequenceType import SequenceType
from immuneML.preprocessing.filters.Filter import Filter
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class SequenceLengthFilter(Filter):
    """
    Removes sequences with length out of the predefined range.

    Arguments:

        sequence_type (:py:obj:`~immuneML.environment.SequenceType.SequenceType`): Whether the sequences should be
        filtered on the nucleotide or amino acid level. Valid options are defined by the SequenceType enum.

        min_len (int): minimum length of the sequence (sequences shorter than min_len will be removed); to not use
        min_len, set it to -1

        max_len (int): maximum length of the sequence (sequences longer than max_len will be removed); to not use
        max_len, set it to -1

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    SequenceLengthFilter:
                        sequence_type: AMINO_ACID
                        min_len: 3 # -> remove all sequences shorter than 3
                        max_len: -1 # -> no upper bound on the sequence length

    """

    def __init__(self, min_len: int, max_len: int, sequence_type: SequenceType, name: str = None):
        """Store the length bounds, the sequence type to filter on and the filter's name."""
        super().__init__()
        self._name = name
        self._sequence_type = sequence_type
        self._min_len = min_len
        self._max_len = max_len

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the keyword parameters and construct the filter from them.

        Required keys are 'min_len', 'max_len' and 'sequence_type'; 'name' is optional and defaults
        to the class name. A negative bound is treated as "not set", and at least one of the two
        bounds must be non-negative. When max_len is set, min_len must not exceed it.
        """
        class_name = SequenceLengthFilter.__name__

        ParameterValidator.assert_keys_present(list(kwargs), ['min_len', 'max_len', 'sequence_type'],
                                               class_name, class_name)

        lower, upper = kwargs['min_len'], kwargs['max_len']
        ParameterValidator.assert_all_type_and_value([lower, upper], int, class_name, 'length')

        # the ordering of the bounds is only meaningful when an upper bound is actually in use
        if upper >= 0:
            assert lower <= upper, f"{SequenceLengthFilter.__name__}: min_len must be less or equal to max_len."

        assert lower >= 0 or upper >= 0, f"{SequenceLengthFilter.__name__}: at least one of min_len and max_len has to be set."

        ParameterValidator.assert_sequence_type(kwargs, class_name)

        return cls(min_len=lower,
                   max_len=upper,
                   sequence_type=SequenceType[kwargs['sequence_type'].upper()],
                   name=kwargs.get('name', class_name))

    def process_dataset(self, dataset: RepertoireDataset, result_path: Path, number_of_processes: int = 1) -> RepertoireDataset:
        """
        Filter every repertoire of the dataset and build a new dataset from the results.

        The filtered repertoires are built under ``result_path / 'repertoires'``; the per-repertoire
        work is distributed over a pool of ``number_of_processes`` worker processes.

        Raises:
            NotImplementedError: if ``dataset`` is not a RepertoireDataset.
        """
        if not isinstance(dataset, RepertoireDataset):
            raise NotImplementedError

        repertoires_dir = PathBuilder.build(result_path / 'repertoires')
        jobs = [(rep, repertoires_dir) for rep in dataset.repertoires]

        with Pool(number_of_processes) as workers:
            filtered_repertoires = workers.starmap(self._process_repertoire, jobs)

        return RepertoireDataset.build_from_objects(repertoires=filtered_repertoires, path=result_path)

    def _process_repertoire(self, repertoire: Repertoire, result_path: Path) -> Repertoire:
        """Build a copy of the repertoire that keeps only the sequences within the length bounds."""
        if self._sequence_type == SequenceType.AMINO_ACID:
            sequences = repertoire.get_sequence_aas()
        else:
            sequences = repertoire.get_attribute('sequences')

        is_kept = self._get_keep_seq_func()
        # one boolean per sequence, in the original order
        keep_mask = list(map(is_kept, sequences))

        if 'subject_id' in repertoire.metadata:
            filename_base = repertoire.metadata['subject_id'] + '_filtered'
        else:
            filename_base = None

        return Repertoire.build_like(repertoire, keep_mask, result_path, filename_base=filename_base)

    def _get_keep_seq_func(self):
        """Return a predicate telling whether a sequence's length satisfies the configured bounds."""
        if self._max_len < 0:
            # no upper bound in use: only the lower bound applies
            def long_enough(seq):
                return len(seq) >= self._min_len

            return long_enough

        if self._min_len < 0:
            # no lower bound in use: only the upper bound applies
            def short_enough(seq):
                return len(seq) <= self._max_len

            return short_enough

        def within_bounds(seq):
            return self._max_len >= len(seq) >= self._min_len

        return within_bounds