Source code for immuneML.reports.ml_reports.TCRdistMotifDiscovery

import logging
from pathlib import Path
from typing import List, Tuple

from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.MLMethod import MLMethod
from immuneML.ml_methods.TCRdistClassifier import TCRdistClassifier
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.PathBuilder import PathBuilder


[docs] class TCRdistMotifDiscovery(MLReport): """ The report for discovering motifs in paired immune receptor data of given specificity based on TCRdist3. The receptors are hierarchically clustered based on the tcrdist distance and then motifs are discovered for each cluster. The report outputs logo plots for the motifs along with the raw data used for plotting in csv format. For the implementation, `TCRdist3 <https://tcrdist3.readthedocs.io/en/latest/>`_ library was used (source code available `here <https://github.com/kmayerb/tcrdist3>`_). More details on the functionality used for this report are available `here <https://tcrdist3.readthedocs.io/en/latest/motif_gallery.html>`_. Original publications: Dash P, Fiore-Gartland AJ, Hertz T, et al. Quantifiable predictive features define epitope-specific T cell receptor repertoires. Nature. 2017; 547(7661):89-93. `doi:10.1038/nature22383 <https://www.nature.com/articles/nature22383>`_ Mayer-Blackwell K, Schattgen S, Cohen-Lavi L, et al. TCR meta-clonotypes for biomarker discovery with tcrdist3: quantification of public, HLA-restricted TCR biomarkers of SARS-CoV-2 infection. bioRxiv. Published online December 26, 2020:2020.12.24.424260. `doi:10.1101/2020.12.24.424260 <https://www.biorxiv.org/content/10.1101/2020.12.24.424260v1>`_ Arguments: positive_class_name (str): the class value (e.g., epitope) used to select only the receptors that are specific to the given epitope so that only those sequences are used to infer motifs; the reference receptors as required by TCRdist will be the ones from the dataset that have different or no epitope specified in their metadata; if the labels are available only on the epitope level (e.g., label is "AVFDRKSDAK" and classes are True and False), then here it should be specified that only the receptors with value "True" for label "AVFDRKSDAK" should be used; there is no default value for this argument cores (int): number of processes to use for the computation of the distance and motifs min_cluster_size (int): the minimum size of the cluster to discover the motifs for use_reference_sequences (bool): when showing motifs, this parameter defines if reference sequences should be provided as well as a background YAML specification: .. indent with spaces .. code-block:: yaml my_tcr_dist_report: # user-defined name TCRdistMotifDiscovery: positive_class_name: True # class name, could also be epitope name, depending on how it's defined in the dataset cores: 4 min_cluster_size: 30 use_reference_sequences: False """
[docs] @classmethod def build_object(cls, **kwargs): return TCRdistMotifDiscovery(**kwargs)
def __init__(self, train_dataset: ReceptorDataset = None, test_dataset: ReceptorDataset = None, method: MLMethod = None, result_path: Path = None, name: str = None, cores: int = None, context: dict = None, positive_class_name=None, min_cluster_size: int = None, use_reference_sequences: bool = None, hp_setting: HPSetting = None, label=None, number_of_processes: int = 1): super().__init__(train_dataset=train_dataset, test_dataset=test_dataset, method=method, result_path=result_path, name=name, hp_setting=hp_setting, label=label, number_of_processes=number_of_processes) self.cores = cores self.positive_class_name = positive_class_name self.min_cluster_size = min_cluster_size self.use_reference_sequences = use_reference_sequences self.context = context def _generate(self) -> ReportResult: from immuneML.util.TCRdistHelper import TCRdistHelper from tcrdist.rep_diff import hcluster_diff from tcrdist.summarize import member_summ PathBuilder.build(self.result_path) subsampled_dataset = self._extract_positive_example_dataset() reference_sequences = self._extract_reference_sequences() tcr_rep = TCRdistHelper.compute_tcr_dist(subsampled_dataset, [self.label.name], self.cores) tcr_rep.hcluster_df, tcr_rep.Z = hcluster_diff(clone_df=tcr_rep.clone_df, pwmat=tcr_rep.pw_alpha + tcr_rep.pw_beta, x_cols=["epitope"], count_col='count') figures, tables = [], [] logging.info(f'{TCRdistMotifDiscovery.__name__}: created {tcr_rep.hcluster_df.shape[0]} clusters, now discovering motifs in clusters.') for index, row in tcr_rep.hcluster_df.iterrows(): if len(row['neighbors_i']) >= self.min_cluster_size: figure_outputs, table_outputs = self._discover_motif_in_cluster(tcr_rep, index, row, reference_sequences) figures.extend(figure_outputs) tables.extend(table_outputs) res_summary = member_summ(res_df=tcr_rep.hcluster_df, clone_df=tcr_rep.clone_df, addl_cols=['epitope']) res_summary.to_csv(self.result_path / "tcrdist_summary.csv") tables.append(ReportOutput(path=self.result_path / "tcrdist_summary.csv", name="TCRdist summary (csv)")) return ReportResult(name=self.name, info="TCRdist motif discovery", output_figures=figures, output_tables=tables) def _discover_motif_in_cluster(self, tcr_rep, index, row, negative_examples=None) -> Tuple[List[ReportOutput], List[ReportOutput]]: from tcrdist.adpt_funcs import get_centroid_seq from tcrdist.summarize import _select from palmotif import compute_pal_motif from palmotif import svg_logo dfnode = tcr_rep.clone_df.iloc[row['neighbors_i'],] figure_outputs, table_outputs = [], [] logging.info(f"{TCRdistMotifDiscovery.__name__}: in cluster {index+1}, there are {dfnode.shape[0]} neighbors.") for chain in ['a', 'b']: if dfnode.shape[0] > 2: centroid, *_ = get_centroid_seq(df=dfnode) else: centroid = dfnode[f'cdr3_{chain}_aa'].to_list()[0] motif, stat = compute_pal_motif(seqs=_select(df=tcr_rep.clone_df, iloc_rows=row['neighbors_i'], col=f'cdr3_{chain}_aa'), centroid=centroid, refs=negative_examples[chain] if self.use_reference_sequences else None) figure_path = self.result_path / f"motif_{chain}_{index + 1}.svg" svg_logo(motif, filename=figure_path) motif_data_path = self.result_path / f"motif_{chain}_{index + 1}.csv" motif.to_csv(motif_data_path) figure_outputs.append(ReportOutput(figure_path, f'Motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain)')) table_outputs.append(ReportOutput(motif_data_path, f'motif {index + 1} ({Chain.get_chain(chain.upper()).name.lower()} chain) csv data')) return figure_outputs, table_outputs
[docs] def set_context(self, context: dict): self.context = context return self
[docs] def check_prerequisites(self): if isinstance(self.train_dataset, ReceptorDataset) and isinstance(self.method, TCRdistClassifier): return True else: return False
def _extract_positive_example_dataset(self) -> ReceptorDataset: positive_example_indices = [] for index, receptor in enumerate(self.train_dataset.get_data()): if str(receptor.metadata[self.label.name]) == str(self.positive_class_name): positive_example_indices.append(index) subsampled_dataset = self.train_dataset.make_subset(example_indices=positive_example_indices, path=self.result_path, dataset_type=ReceptorDataset.SUBSAMPLED) logging.info(f"{TCRdistMotifDiscovery.__name__}: extracted only positive examples from the training dataset (examples with class = " f"{self.positive_class_name}) for motif discovery. Example count in the new dataset: {subsampled_dataset.get_example_count()}.") return subsampled_dataset def _extract_reference_sequences(self) -> dict: reference_sequences = {'a': [], 'b': []} if self.use_reference_sequences: for index, receptor in enumerate(self.train_dataset.get_data()): if str(receptor.metadata[self.label.name]) != str(self.positive_class_name): reference_sequences['a'].append(receptor.alpha.amino_acid_sequence) reference_sequences['b'].append(receptor.beta.amino_acid_sequence) return reference_sequences