Source code for immuneML.analysis.SequenceMatcher

from multiprocessing.pool import Pool

import numpy as np
from editdistance import eval as edit_distance

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceSet import ReceptorSequence
from immuneML.data_model.SequenceSet import Repertoire
from immuneML.encodings.reference_encoding.SequenceMatchingSummaryType import SequenceMatchingSummaryType



[docs]
class SequenceMatcher:
    """
    Matches the sequences across the given list of reference sequences (a list of ReceptorSequence objects) and returns the following information:
    {
        "repertoires":[{
            "sequences": [{
                "sequence": "AAA",
                "matching_sequences": ["AAA", "AAC"],
                "v_gene": "V12",
                "j_gene": "J3",
                "locus": "A"
            }], # list of sequences for the repertoire with matched sequences for each original sequence
            "repertoire": "fdjshfk321231", # repertoire identifier
            "repertoire_index": 2,  # the index of the repertoire in the dataset,
            "sequences_matched": 4,  # number of sequences from the repertoire which are a match for at least one reference sequence
            "percentage_of_sequences_matched": 0.75,  # percentage of sequences from the repertoire that have at least one match in the reference sequences
            "metadata": {"CD": True},  # dict with parameters that can be used for analysis on repertoire level and that serve as a starting point for label configurations
            "locus": ["A","B"] # list of chains in the repertoire
        }, ...]
    }
    """

    CORES = 4


[docs]
    def match(self, dataset: RepertoireDataset, reference_sequences: list, max_distance: int, summary_type: SequenceMatchingSummaryType) -> dict:

        matched = {"repertoires": []}

        for index, repertoire in enumerate(dataset.get_data()):
            matched["repertoires"].append(self.match_repertoire(repertoire, index,
                                                                reference_sequences, max_distance, summary_type))

        return matched



[docs]
    def matches_gene(self, gene1, gene2):
        """for two genes (e.g. "TRBV12-2,TRBV12-4" and "TRBV12-2") returns True if they share at least one gene
        (here TRBV12-2)"""
        genes1 = [g.split("*")[0] for g in gene1.split(",")] if gene1 not in ['', None] else []
        genes2 = [g.split("*")[0] for g in gene2.split(",")] if gene2 not in ['', None] else []

        return len(set(genes1).intersection(set(genes2))) > 0 or (len(genes1) == len(genes2) and len(genes1) == 0)



[docs]
    def matches_sequence(self, original_sequence: ReceptorSequence, reference_sequence: ReceptorSequence, max_distance):
        """
        :param original_sequence: ReceptorSequence
        :param reference_sequence: ReceptorSequence
        :param max_distance: max allowed Levenshtein distance between two sequences to be considered a match
        :return: True if chain, v_gene and j_gene are the same and sequences are within given Levenshtein distance
        """
        return reference_sequence.locus == original_sequence.locus \
            and self.matches_gene(reference_sequence.v_call, original_sequence.v_call) \
            and self.matches_gene(reference_sequence.j_call, original_sequence.j_call) \
            and edit_distance(original_sequence.get_sequence(), reference_sequence.get_sequence()) <= max_distance



[docs]
    def match_repertoire(self, repertoire: Repertoire, index: int, reference_sequences: list, max_distance: int,
                         summary_type: SequenceMatchingSummaryType, num_processes: int = 1) -> dict:

        matched = {"sequences": [], "repertoire": repertoire.identifier, "repertoire_index": index}
        arguments = [(seq, reference_sequences, max_distance) for seq in repertoire.sequences()]

        with Pool(num_processes) as pool:
            matched["sequences"] = pool.starmap(self.match_sequence, arguments)

        if summary_type == SequenceMatchingSummaryType.CLONAL_PERCENTAGE:
            total_count = np.sum([sequence.duplicate_count for sequence in repertoire.sequences()])
            matched["clonal_percentage"] = np.sum([sequence.duplicate_count for index, sequence in enumerate(repertoire.sequences()) if len(matched["sequences"][index]["matching_sequences"]) > 0]) / total_count
        else:
            matched["count"] = len([r for r in matched["sequences"] if len(r["matching_sequences"]) > 0])
            matched["percentage"] = matched["count"] / len(matched["sequences"])
        matched["metadata"] = repertoire.metadata
        matched["patient_id"] = repertoire.identifier
        matched["locus"] = list(set([sequence.locus for sequence in repertoire.sequences()]))

        return matched



[docs]
    def match_sequence(self, sequence: ReceptorSequence, reference_sequences: list, max_distance: int) -> dict:
        matching_sequences = [seq.get_sequence() for seq in reference_sequences
                              if self.matches_sequence(sequence, seq, max_distance)]

        return {
            "matching_sequences": matching_sequences,
            "sequence": sequence.get_sequence(),
            "v_call": sequence.v_call,
            "j_call": sequence.j_call,
            "locus": sequence.locus
        }