import warnings
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.kmer_frequency.sequence_encoding.SequenceEncodingStrategy import SequenceEncodingStrategy
from immuneML.environment.Constants import Constants
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.KmerHelper import KmerHelper
[docs]
class IMGTGappedKmerEncoder(SequenceEncodingStrategy):
[docs]
@staticmethod
def encode_sequence(sequence: ReceptorSequence, params: EncoderParams):
"""
creates all overlapping gapped k-mers and IMGT position pairs from a sequence as features for use in KmerFrequencyEncoder.
this gap length goes from min_gap to max_gap inclusive, and there is a k-mer of length k_left on the left
side of the gap and a k-mer of length k_right on the right side of the gap.
:param sequence: ReceptorSequence
:param params: EncoderParams (within the "model", the following keys are used: "k_left", "k_right", "max_gap",
"min_gap")
:return: SequenceEncodingResult
"""
k_left = params.model.get('k_left')
k_right = params.model.get('k_right', k_left)
max_gap = params.model.get('max_gap')
min_gap = params.model.get('min_gap', 0)
sequence_type = params.model.get('sequence_type', EnvironmentSettings.sequence_type)
length = len(sequence.get_sequence(sequence_type))
if length < k_left + k_right + max_gap:
warnings.warn('Sequence length is less than k_left + k_right + max_gap. Ignoring sequence')
return None
gapped_kmers = KmerHelper.create_IMGT_gapped_kmers_from_sequence(sequence, k_left=k_left, max_gap=max_gap,
min_gap=min_gap, k_right=k_right, sequence_type=sequence_type)
gapped_kmers = [Constants.FEATURE_DELIMITER.join([str(mer) for mer in kmer]) for kmer in gapped_kmers]
return gapped_kmers
[docs]
@staticmethod
def get_feature_names(params: EncoderParams):
return ["sequence", "imgt_position"]