Source code for immuneML.util.KmerHelper

# quality: peripheral
import itertools
import logging
import warnings

from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.SequenceSet import ReceptorSequence, Repertoire
from immuneML.environment.SequenceType import SequenceType
from immuneML.util.PositionHelper import PositionHelper


[docs] class KmerHelper:
@staticmethod
def create_kmers_from_sequence(sequence: ReceptorSequence, k: int, sequence_type: SequenceType,
                               overlap: bool = True):
    """
    Split a receptor sequence object into k-mers.

    :param sequence: receptor sequence; its string form is obtained with sequence.get_sequence(sequence_type)
    :param k: length of each k-mer (int)
    :param sequence_type: passed to sequence.get_sequence() to select which representation is used
    :param overlap: forwarded unchanged to create_kmers_from_string
    :return: the list returned by create_kmers_from_string for the extracted sequence
    """
    raw_sequence = sequence.get_sequence(sequence_type)
    return KmerHelper.create_kmers_from_string(raw_sequence, k, overlap)
[docs] @staticmethod def create_kmers_from_string(sequence, k: int, overlap: bool = True): kmers = [] step = 1 if overlap else k for i in range(0, len(sequence) - k + 1, step): kmers.append(sequence[i:i + k]) return kmers
@staticmethod
def create_IMGT_kmers_from_sequence(sequence: ReceptorSequence, k: int, sequence_type: SequenceType,
                                    region_type: RegionType = RegionType.IMGT_CDR3):
    """
    Build position-annotated k-mers for a receptor sequence object.

    :param sequence: receptor sequence; its string form is obtained with sequence.get_sequence(sequence_type)
    :param k: length of each k-mer (int)
    :param sequence_type: passed to sequence.get_sequence() to select which representation is used
    :param region_type: forwarded unchanged to create_IMGT_kmers_from_string
    :return: the value returned by create_IMGT_kmers_from_string for the extracted sequence
    """
    raw_sequence = sequence.get_sequence(sequence_type)
    return KmerHelper.create_IMGT_kmers_from_string(raw_sequence, k, region_type)
@staticmethod
def create_IMGT_kmers_from_string(sequence: str, k: int, region_type: RegionType):
    """
    Build overlapping k-mers from a string, each paired with the position of its first character.

    Positions are obtained from PositionHelper.gen_imgt_positions_from_length for the length
    of the sequence and the given region type.

    :param sequence: sequence string to split
    :param k: length of each k-mer (int)
    :param region_type: passed to PositionHelper.gen_imgt_positions_from_length
    :return: list of (k-mer string, position of the k-mer's first character) tuples; an empty
        list (after logging a warning) when no positions were returned
    """
    positions = PositionHelper.gen_imgt_positions_from_length(len(sequence), region_type)

    # Guard clause: without positions there is nothing to annotate the k-mers with.
    if positions is None or len(positions) == 0:
        logging.warning(f"{KmerHelper.__name__}: {sequence} could not be represented using IMGT {k}-mers, "
                        f"no IMGT positions were found. Returning empty list instead...")
        return []

    # Pair every character with its position so slicing keeps both together.
    annotated = list(zip(list(sequence), positions))
    windows = KmerHelper.create_kmers_from_string(annotated, k)
    return [(''.join([symbol for symbol, _ in window]), window[0][1]) for window in windows]
@staticmethod
def create_IMGT_gapped_kmers_from_sequence(sequence: ReceptorSequence, sequence_type: SequenceType,
                                           k_left: int, max_gap: int, k_right: int = None,
                                           min_gap: int = 0,
                                           region_type: RegionType = RegionType.IMGT_CDR3):
    """
    Build gapped k-mers for a receptor sequence, each paired with the position of its first character.

    Positions are obtained from PositionHelper.gen_imgt_positions_from_sequence and zipped with
    the characters of sequence.get_sequence(sequence_type) before the gapped k-mers are created.

    :param sequence: receptor sequence object
    :param sequence_type: selects the representation returned by sequence.get_sequence()
    :param k_left: number of elements before the gap (int)
    :param max_gap: largest gap length to generate (int)
    :param k_right: number of elements after the gap; forwarded as-is (None is resolved by
        create_gapped_kmers_from_string)
    :param min_gap: smallest gap length to generate (int)
    :param region_type: passed to PositionHelper.gen_imgt_positions_from_sequence
    :return: list of (gapped k-mer string, position of its first character) tuples, or None if
        create_gapped_kmers_from_string returned None
    """
    positions = PositionHelper.gen_imgt_positions_from_sequence(sequence, sequence_type, region_type)
    annotated = list(zip(list(sequence.get_sequence(sequence_type)), positions))

    gapped = KmerHelper.create_gapped_kmers_from_string(annotated, k_left=k_left, max_gap=max_gap,
                                                        k_right=k_right, min_gap=min_gap)
    if gapped is None:
        return None

    # Collapse each list of (symbol, position) pairs into (string, position of first symbol).
    return [(''.join([symbol for symbol, _ in kmer]), kmer[0][1]) for kmer in gapped]
[docs] @staticmethod def create_gapped_kmers_from_string(sequence, k_left: int, max_gap: int, k_right: int = None, min_gap: int = 0): length = len(sequence) k_right = k_left if k_right is None else k_right if length < k_left + k_right + max_gap: raise ValueError('Sequence length is less than k_left + k_right + max_gap. ' 'Filter sequences from each repertoire that are less than this length then rerun.') gapped_kmers = [] for i in range(min_gap, max_gap + 1): s = k_left + k_right + i kmers = [sequence[i: i + s] for i in range(length - s + 1)] if isinstance(sequence, str): gapped_kmers.extend([kmer[:k_left] + i * "." + kmer[k_left + i:] for kmer in kmers]) if isinstance(sequence, list): gapped_kmers.extend([kmer[:k_left] + [(".", el[1]) for el in kmer[k_left:k_left+i]] + kmer[k_left + i:] for kmer in kmers]) return gapped_kmers
@staticmethod
def create_gapped_kmers_from_sequence(sequence: ReceptorSequence, sequence_type: SequenceType, k_left: int,
                                      max_gap: int, k_right: int = None, min_gap: int = 0):
    """
    Create gapped k-mers from a receptor sequence object.

    :param sequence: receptor sequence; its string form is obtained with sequence.get_sequence(sequence_type)
    :param sequence_type: passed to sequence.get_sequence() to select which representation is used
    :param k_left: number of elements before the gap (int)
    :param max_gap: largest gap length to generate (int)
    :param k_right: number of elements after the gap; forwarded as-is
    :param min_gap: smallest gap length to generate (int)
    :return: the value returned by create_gapped_kmers_from_string for the extracted sequence
    """
    raw_sequence = sequence.get_sequence(sequence_type)
    return KmerHelper.create_gapped_kmers_from_string(raw_sequence, k_left, max_gap, k_right, min_gap)
[docs] @staticmethod def create_all_kmers(k: int, alphabet: list): """ creates all possible k-mers given a k-mer length and an alphabet :param k: length of k-mer (int) :param alphabet: list of characters from which to make all possible k-mers (list) :return: alphabetically sorted list of k-mers """ kmers = [''.join(x) for x in itertools.product(alphabet, repeat=k)] kmers.sort() return kmers
@staticmethod
def create_sentences_from_repertoire(repertoire: Repertoire, k: int, sequence_type: SequenceType,
                                     overlap: bool = True,
                                     region_type: RegionType = RegionType.IMGT_CDR3):
    """
    Turn each sequence of a repertoire into a "sentence", i.e. its list of k-mers.

    :param repertoire: repertoire whose sequences are iterated via repertoire.sequences(region_type)
    :param k: length of each k-mer (int)
    :param sequence_type: forwarded to create_kmers_from_sequence
    :param overlap: forwarded to create_kmers_from_sequence
    :param region_type: passed to repertoire.sequences()
    :return: list with one k-mer list per sequence, in iteration order
    """
    return [KmerHelper.create_kmers_from_sequence(sequence=seq, k=k, overlap=overlap,
                                                  sequence_type=sequence_type)
            for seq in repertoire.sequences(region_type)]
[docs] @staticmethod def create_kmers_within_HD(kmer: str, alphabet: list, distance: int = 1): assert distance < len(kmer) if distance > 1: warnings.warn("In create_kmers_within_HD distance larger than 1 is not yet implemented. " "Using default value 1...", Warning) pairs = [] for i in range(len(kmer)): for letter in alphabet: new_kmer = kmer[0:i] + letter + kmer[i + 1:] pairs.append([kmer, new_kmer]) return pairs