Source code for immuneML.analysis.SequenceMatcher

from multiprocessing.pool import Pool

import numpy as np
from editdistance import eval as edit_distance

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceSet import ReceptorSequence
from immuneML.data_model.SequenceSet import Repertoire
from immuneML.encodings.reference_encoding.SequenceMatchingSummaryType import SequenceMatchingSummaryType


[docs] class SequenceMatcher: """ Matches the sequences across the given list of reference sequences (a list of ReceptorSequence objects) and returns the following information: { "repertoires":[{ "sequences": [{ "sequence": "AAA", "matching_sequences": ["AAA", "AAC"], "v_gene": "V12", "j_gene": "J3", "locus": "A" }], # list of sequences for the repertoire with matched sequences for each original sequence "repertoire": "fdjshfk321231", # repertoire identifier "repertoire_index": 2, # the index of the repertoire in the dataset, "sequences_matched": 4, # number of sequences from the repertoire which are a match for at least one reference sequence "percentage_of_sequences_matched": 0.75, # percentage of sequences from the repertoire that have at least one match in the reference sequences "metadata": {"CD": True}, # dict with parameters that can be used for analysis on repertoire level and that serve as a starting point for label configurations "locus": ["A","B"] # list of chains in the repertoire }, ...] } """ CORES = 4
[docs] def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict: matched = {"repertoires": []} for index, repertoire in enumerate(dataset.get_data()): matched["repertoires"].append(self.match_repertoire(repertoire, index, reference_sequences, max_distance, summary_type)) return matched
[docs] def matches_gene(self, gene1, gene2): if gene1 == gene2 or all(gene in ['', None] for gene in [gene1, gene2]): return True else: return gene2.split("-", 1)[0] == gene1 or gene1.split("-", 1)[0] == gene2
[docs] def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance): """ :param original_sequence: ReceptorSequence :param reference_sequence: ReceptorSequence :param max_distance: max allowed Levenshtein distance between two sequences to be considered a match :return: True if chain, v_gene and j_gene are the same and sequences are within given Levenshtein distance """ return reference_sequence.locus == original_sequence.locus \ and self.matches_gene(reference_sequence.v_call, original_sequence.v_call) \ and self.matches_gene(reference_sequence.j_call, original_sequence.j_call) \ and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance
[docs] def match_repertoire(self, repertoire: Repertoire, index: int, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict: matched = {"sequences": [], "repertoire": repertoire.identifier, "repertoire_index": index} arguments = [(seq, reference_sequences, max_distance) for seq in repertoire.sequences()] with Pool(SequenceMatcher.CORES) as pool: matched["sequences"] = pool.starmap(self.match_sequence, arguments) if summary_type == SequenceMatchingSummaryType.CLONAL_PERCENTAGE: total_count = np.sum([sequence.duplicate_count for sequence in repertoire.sequences()]) matched["clonal_percentage"] = np.sum([sequence.duplicate_count for index, sequence in enumerate(repertoire.sequences()) if len(matched["sequences"][index]["matching_sequences"]) > 0]) / total_count else: matched["count"] = len([r for r in matched["sequences"] if len(r["matching_sequences"]) > 0]) matched["percentage"] = matched["count"] / len(matched["sequences"]) matched["metadata"] = repertoire.metadata matched["patient_id"] = repertoire.identifier matched["locus"] = list(set([sequence.locus for sequence in repertoire.sequences()])) return matched
[docs] def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict: matching_sequences = [seq.get_sequence() for seq in reference_sequences if self.matches_sequence(sequence, seq, max_distance)] return { "matching_sequences": matching_sequences, "sequence": sequence.get_sequence(), "v_call": sequence.v_call, "j_call": sequence.j_call, "locus": sequence.locus }