# Source code for immuneML.util.PositionHelper

import logging

import numpy as np

from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.SequenceSet import ReceptorSequence
from immuneML.environment.SequenceType import SequenceType


class PositionHelper:
    """
    Static helpers that generate IMGT position labels for a sequence of a given length and
    derive per-position weights from them.

    Positions are returned as strings (e.g. '105', '111.1', '112.2'); integer labels cover
    105-117 for the CDR3 region and 104-118 for the junction region.
    """

    # Longest CDR3 length for which positions are generated.
    MAX_CDR3_LEN = 91
    # Shortest CDR3 length for which positions are generated (labels 105, 106, 107, 116, 117).
    MIN_CDR3_LEN = 5
    # Length at which every integer label 105..117 is used; longer sequences get 111.x / 112.x labels.
    MIDPOINT_CDR3_LEN = 13

    @staticmethod
    def get_imgt_position_weights_for_annotation(input_length: int, region_type: RegionType,
                                                 sequence_position_weights: dict):
        """
        Build a weight for every IMGT position of a sequence of the given length.

        Weights found in `sequence_position_weights` (looked up by the generated position label)
        are kept; the weight left over to reach 1 is split evenly between the positions that have
        no explicit weight. If the result does not sum to 1 it is rescaled so that it does.

        Args:
            input_length: length of the sequence the positions are generated for
            region_type: region type passed on to `gen_imgt_positions_from_length`
            sequence_position_weights: mapping from position label to weight; may be empty or None

        Returns:
            dict mapping each generated position (in generation order) to its weight; empty if no
            positions could be generated. If all weights are zero they are returned unscaled.
        """
        imgt_positions = PositionHelper.gen_imgt_positions_from_length(input_length, region_type)

        position_weights = {}
        if sequence_position_weights:
            position_weights = {position: sequence_position_weights[position]
                                for position in imgt_positions if position in sequence_position_weights}

        missing_count = len(imgt_positions) - len(position_weights)
        if missing_count > 0:
            # spread whatever is left to reach a total of 1 evenly over the unweighted positions
            remaining_weight_for_position = (1 - sum(position_weights.values())) / missing_count
            for position in imgt_positions:
                if position not in position_weights:
                    position_weights[position] = remaining_weight_for_position

        weights_sum = sum(position_weights.values())
        # A zero sum cannot be rescaled (it used to raise ZeroDivisionError); keep the zero weights so
        # that callers see "no allowed positions", as get_imgt_position_weights_for_implanting does.
        if weights_sum != 0 and not np.isclose(weights_sum, 1):
            position_weights = {position: weight / weights_sum for position, weight in position_weights.items()}

        return {position: position_weights[position] for position in imgt_positions}

    @staticmethod
    def get_allowed_positions_for_annotation(input_length: int, region_type: RegionType,
                                             sequence_position_weights: dict):
        """
        Return a 0/1 flag per IMGT position: 1 where the position weight is non-zero, 0 otherwise.

        The flags follow the order of the positions produced by
        `get_imgt_position_weights_for_annotation`.
        """
        position_weights = PositionHelper.get_imgt_position_weights_for_annotation(input_length, region_type,
                                                                                   sequence_position_weights)
        return [int(bool(weight)) for weight in position_weights.values()]

    @staticmethod
    def get_imgt_position_weights_for_implanting(aa_input_length: int, region_type: RegionType,
                                                 sequence_position_weights: dict, limit: int):
        """
        Build position weights for choosing where something of length `limit` can start.

        Starts from the annotation weights, sets the weight to zero for every position whose
        index is greater than `aa_input_length - limit`, and rescales the rest to sum to 1.

        Args:
            aa_input_length: length of the sequence
            region_type: region type passed on to the position generation
            sequence_position_weights: mapping from position label to weight; may be empty or None
            limit: length reported as "motif length" in the warning; positions with index above
                `aa_input_length - limit` are disabled

        Returns:
            dict mapping position to weight. If no position has a non-zero weight, a warning is
            logged and the (all-zero or empty) weights are returned without rescaling.
        """
        position_weights = PositionHelper.get_imgt_position_weights_for_annotation(aa_input_length, region_type,
                                                                                   sequence_position_weights)

        last_allowed_index = aa_input_length - limit
        for index, position in enumerate(position_weights):
            if index > last_allowed_index:
                position_weights[position] = 0.

        weights_sum = sum(position_weights.values())

        if weights_sum == 0:
            logging.warning(f"Sequence of length {aa_input_length} has no allowed positions for signal with sequence "
                            f"position weights {sequence_position_weights} and motif length {limit}, it will be discarded.")
            return position_weights

        # cast to float64 before dividing so the rescaled weights are numpy float64 values
        position_weights = {position: np.float64(weight) / weights_sum
                            for position, weight in position_weights.items()}

        assert np.isclose(sum(position_weights.values()), 1.), \
            (aa_input_length, region_type.name, position_weights, sum(position_weights.values()), limit)

        return position_weights

    @staticmethod
    def gen_imgt_positions_from_cdr3_length(input_length: int) -> list:
        """
        Generate IMGT position labels for a CDR3 sequence of the given length.

        For lengths from MIN_CDR3_LEN to MIDPOINT_CDR3_LEN the labels are 105-107, then as many of
        108.. upwards and ..115 downwards as needed (the extra one, if any, goes to the right-hand
        side), then 116 and 117. For longer sequences up to MAX_CDR3_LEN all of 105-117 are used,
        with 111.1, 111.2, ... and ..., 112.2, 112.1 inserted between 111 and 112 (the extra one,
        if any, goes to the 112.x side).

        Returns:
            list of position labels as strings; an empty list (with a logged warning) if the
            length is outside [MIN_CDR3_LEN, MAX_CDR3_LEN].
        """
        if PositionHelper.MIN_CDR3_LEN <= input_length <= PositionHelper.MIDPOINT_CDR3_LEN:
            extra = input_length - PositionHelper.MIN_CDR3_LEN
            pos_left_count = extra // 2
            pos_right_count = extra - pos_left_count
            return (['105', '106', '107']
                    + [str(i) for i in range(108, 108 + pos_left_count)]
                    + [str(i) for i in range(116 - pos_right_count, 116)]
                    + ['116', '117'])

        if PositionHelper.MIDPOINT_CDR3_LEN < input_length <= PositionHelper.MAX_CDR3_LEN:
            extra = input_length - PositionHelper.MIDPOINT_CDR3_LEN
            pos111_count = extra // 2
            pos112_count = extra - pos111_count
            return ([str(pos) for pos in range(105, 112)]
                    + [f'111.{i}' for i in range(1, pos111_count + 1)]
                    + [f'112.{i}' for i in range(pos112_count, 0, -1)]
                    + [str(pos) for pos in range(112, 118)])

        logging.warning(f"IMGT positions could not be generated for CDR3 sequence of length {input_length}.")
        return []

    @staticmethod
    def gen_imgt_positions_from_junction_length(input_length: int):
        """
        Generate IMGT position labels for a junction sequence of the given length.

        The junction is handled as the CDR3 positions for `input_length - 2` with '104' prepended
        and '118' appended.

        Returns:
            list of position labels as strings; an empty list (with a logged warning) if the
            length is outside [MIN_CDR3_LEN + 2, MAX_CDR3_LEN + 2].
        """
        if PositionHelper.MIN_CDR3_LEN + 2 <= input_length <= PositionHelper.MAX_CDR3_LEN + 2:
            return ['104'] + PositionHelper.gen_imgt_positions_from_cdr3_length(input_length - 2) + ['118']

        logging.warning(
            f"IMGT positions could not be generated for IMGT junction sequence of length {input_length}.")
        return []

    @staticmethod
    def gen_imgt_positions_from_sequence(sequence: ReceptorSequence,
                                         sequence_type: SequenceType = SequenceType.AMINO_ACID,
                                         region_type: RegionType = RegionType.IMGT_CDR3):
        """
        Generate IMGT position labels for a receptor sequence, based on the length of
        `sequence.get_sequence(sequence_type=sequence_type)`.

        Raises:
            NotImplementedError: if `sequence_type` is not SequenceType.AMINO_ACID, or (from
                `gen_imgt_positions_from_length`) if the region type is not supported.
        """
        if sequence_type != SequenceType.AMINO_ACID:
            raise NotImplementedError(f"{sequence_type.name} is currently not supported for obtaining IMGT positions")

        input_length = len(sequence.get_sequence(sequence_type=sequence_type))
        return PositionHelper.gen_imgt_positions_from_length(input_length, region_type)

    @staticmethod
    def gen_imgt_positions_from_length(input_length: int, region_type: RegionType):
        """
        Dispatch position generation on the region type.

        Raises:
            NotImplementedError: if `region_type` is neither RegionType.IMGT_CDR3 nor
                RegionType.IMGT_JUNCTION.
        """
        if region_type == RegionType.IMGT_CDR3:
            return PositionHelper.gen_imgt_positions_from_cdr3_length(input_length)
        if region_type == RegionType.IMGT_JUNCTION:
            return PositionHelper.gen_imgt_positions_from_junction_length(input_length)

        raise NotImplementedError(
            f"PositionHelper: IMGT positions are not implemented for region type {region_type}")