Source code for immuneML.reports.ml_reports.TCRdistMotifDiscovery

import logging
from pathlib import Path
from typing import List, Tuple

from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.MLMethod import MLMethod
from immuneML.ml_methods.TCRdistClassifier import TCRdistClassifier
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.PathBuilder import PathBuilder



[docs]
class TCRdistMotifDiscovery(MLReport):
    """
    The report for discovering motifs in paired immune receptor data of given specificity based on TCRdist3. The receptors are hierarchically
    clustered based on the tcrdist distance and then motifs are discovered for each cluster. The report outputs logo plots for the motifs along with
    the raw data used for plotting in csv format.

    For the implementation, `TCRdist3 <https://tcrdist3.readthedocs.io/en/latest/>`_ library was used (source code available
    `here <https://github.com/kmayerb/tcrdist3>`_). More details on the functionality used for this report are available
    `here <https://tcrdist3.readthedocs.io/en/latest/motif_gallery.html>`_.

    Original publications:

    Dash P, Fiore-Gartland AJ, Hertz T, et al. Quantifiable predictive features define epitope-specific T cell receptor repertoires.
    Nature. 2017; 547(7661):89-93. `doi:10.1038/nature22383 <https://www.nature.com/articles/nature22383>`_

    Mayer-Blackwell K, Schattgen S, Cohen-Lavi L, et al. TCR meta-clonotypes for biomarker discovery with tcrdist3: quantification of public,
    HLA-restricted TCR biomarkers of SARS-CoV-2 infection. bioRxiv. Published online December 26, 2020:2020.12.24.424260.
    `doi:10.1101/2020.12.24.424260 <https://www.biorxiv.org/content/10.1101/2020.12.24.424260v1>`_


    Arguments:

        positive_class_name (str): the class value (e.g., epitope) used to select only the receptors that are specific to the given epitope so that
        only those sequences are used to infer motifs; the reference receptors as required by TCRdist will be the ones from the dataset that have
        different or no epitope specified in their metadata; if the labels are available only on the epitope level (e.g., label is "AVFDRKSDAK" and
        classes are True and False), then here it should be specified that only the receptors with value "True" for label "AVFDRKSDAK" should be used;
        there is no default value for this argument

        cores (int): number of processes to use for the computation of the distance and motifs

        min_cluster_size (int): the minimum size of the cluster to discover the motifs for

        use_reference_sequences (bool): when showing motifs, this parameter defines if reference sequences should be provided as well as a background

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_tcr_dist_report: # user-defined name
            TCRdistMotifDiscovery:
                positive_class_name: True # class name, could also be epitope name, depending on how it's defined in the dataset
                cores: 4
                min_cluster_size: 30
                use_reference_sequences: False

    """


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        return TCRdistMotifDiscovery(**kwargs)


    def __init__(self, train_dataset: ReceptorDataset = None, test_dataset: ReceptorDataset = None, method: MLMethod = None, result_path: Path = None,
                 name: str = None, cores: int = None, context: dict = None, positive_class_name=None, min_cluster_size: int = None,
                 use_reference_sequences: bool = None, hp_setting: HPSetting = None, label=None, number_of_processes: int = 1):
        super().__init__(train_dataset=train_dataset, test_dataset=test_dataset, method=method, result_path=result_path,
                         name=name, hp_setting=hp_setting, label=label, number_of_processes=number_of_processes)
        self.cores = cores
        self.positive_class_name = positive_class_name
        self.min_cluster_size = min_cluster_size
        self.use_reference_sequences = use_reference_sequences
        self.context = context

    def _generate(self) -> ReportResult:
        from immuneML.util.TCRdistHelper import TCRdistHelper
        from tcrdist.rep_diff import hcluster_diff
        from tcrdist.summarize import member_summ

        PathBuilder.build(self.result_path)

        subsampled_dataset = self._extract_positive_example_dataset()
        reference_sequences = self._extract_reference_sequences()
        tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores)
        tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df, pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta, x_cols=["epitope"],
                                                       count_col='count')

        figures, tables = [], []

        logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.')

        for index, row in tcr_rep.hcluster_df.iterrows():
            if len(row['neighbors_i']) >= self.min_cluster_size:
                figure_outputs, table_outputs = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences)
                figures.extend(figure_outputs)
                tables.extend(table_outputs)

        res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope'])
        res_summary.to_csv(self.result_path / "tcrdist_summary.csv")

        tables.append(ReportOutput(path=self.result_path / "tcrdist_summary.csv", name="TCRdist summary (csv)"))

        return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables)

    def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]:
        from tcrdist.adpt_funcs import get_centroid_seq
        from tcrdist.summarize import _select

        from palmotif import compute_pal_motif
        from palmotif import svg_logo

        dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'],]
        figure_outputs, table_outputs = [], []

        logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors.")

        for chain in ['a', 'b']:

            if dfnode.shape[0] > 2:
                centroid, *_ = get_centroid_seq(df=dfnode)
            else:
                centroid = dfnode[f'cdr3_{chain}_aa'].to_list()[0]

            motif, stat = compute_pal_motif(seqs=_select(df=tcr_rep.clone_df, iloc_rows=row['neighbors_i'], col=f'cdr3_{chain}_aa'),
                                            centroid=centroid, refs=negative_examples[chain] if self.use_reference_sequences else None)

            figure_path = self.result_path / f"motif_{chain}_{index + 1}.svg"
            svg_logo(motif, filename=figure_path)

            motif_data_path = self.result_path / f"motif_{chain}_{index + 1}.csv"
            motif.to_csv(motif_data_path)

            figure_outputs.append(ReportOutput(figure_path, f'Motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain)'))
            table_outputs.append(ReportOutput(motif_data_path, f'motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain) csv data'))

        return figure_outputs, table_outputs


[docs]
    def set_context(self, context: dict):
        self.context = context
        return self



[docs]
    def check_prerequisites(self):
        if isinstance(self.train_dataset, ReceptorDataset) and isinstance(self.method, TCRdistClassifier):
            return True
        else:
            return False


    def _extract_positive_example_dataset(self) -> ReceptorDataset:
        positive_example_indices = []
        for index, receptor in enumerate(self.train_dataset.get_data()):
            if str(receptor.metadata[self.label.name]) == str(self.positive_class_name):
                positive_example_indices.append(index)

        subsampled_dataset = self.train_dataset.make_subset(example_indices=positive_example_indices, path=self.result_path,
                                                            dataset_type=ReceptorDataset.SUBSAMPLED)

        logging.info(f"{TCRdistMotifDiscovery.__name__}: extracted only positive examples from the training dataset (examples with class = "
                     f"{self.positive_class_name}) for motif discovery. Example count in the new dataset: {subsampled_dataset.get_example_count()}.")

        return subsampled_dataset

    def _extract_reference_sequences(self) -> dict:

        reference_sequences = {'a': [], 'b': []}

        if self.use_reference_sequences:
            for index, receptor in enumerate(self.train_dataset.get_data()):
                if str(receptor.metadata[self.label.name]) != str(self.positive_class_name):
                    reference_sequences['a'].append(receptor.alpha.amino_acid_sequence)
                    reference_sequences['b'].append(receptor.beta.amino_acid_sequence)

        return reference_sequences