Source code for immuneML.reports.train_ml_model_reports.DiseaseAssociatedSequenceCVOverlap

import logging
from pathlib import Path
from typing import List, Tuple

import pandas as pd
import plotly.express as px

from immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder import CompAIRRSequenceAbundanceEncoder
from immuneML.encodings.abundance_encoding.KmerAbundanceEncoder import KmerAbundanceEncoder
from immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder import SequenceAbundanceEncoder
from immuneML.environment.Label import Label
from immuneML.hyperparameter_optimization.states.HPItem import HPItem
from immuneML.hyperparameter_optimization.states.TrainMLModelState import TrainMLModelState
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.train_ml_model_reports.TrainMLModelReport import TrainMLModelReport
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SequenceAnalysisHelper import SequenceAnalysisHelper


class DiseaseAssociatedSequenceCVOverlap(TrainMLModelReport):
    """
    DiseaseAssociatedSequenceCVOverlap report makes one heatmap per label showing the overlap of disease-associated
    sequences (or k-mers) produced by the
    :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder`,
    :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder` or
    :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`
    between folds of cross-validation (either inner or outer loop of the nested CV).
    The overlap is computed by the following equation:

    .. math::

        overlap(X,Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)} \\times 100

    For details, see Greiff V, Menzel U, Miho E, et al. Systems Analysis Reveals High Genetic and Antigen-Driven
    Predetermination of Antibody Repertoires throughout B Cell Development. Cell Reports. 2017;19(7):1467-1478.
    doi:10.1016/j.celrep.2017.04.054.

    Arguments:

        compare_in_selection (bool): whether to compute the overlap over the inner loop of the nested CV - the sequence
        overlap is shown across CV folds for the model chosen as optimal within that selection; False by default

        compare_in_assessment (bool): whether to compute the overlap over the optimal models in the outer loop of the
        nested CV; False by default

    If both arguments are False, no overlap is computed and the report contains no tables or figures.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        reports: # the report is defined with all other reports under definitions/reports
            my_overlap_report:
                DiseaseAssociatedSequenceCVOverlap:
                    compare_in_selection: True
                    compare_in_assessment: True

    """

    # Encoder classes whose output this report is applied to; matched by exact class (see _has_compatible_encoder).
    COMPATIBLE_ENCODERS = (SequenceAbundanceEncoder, CompAIRRSequenceAbundanceEncoder, KmerAbundanceEncoder)

    @classmethod
    def build_object(cls, **kwargs):
        """Create the report from keyword arguments, which are forwarded unchanged to the constructor."""
        return DiseaseAssociatedSequenceCVOverlap(**kwargs)

    def __init__(self, state: TrainMLModelState = None, result_path: Path = None, name: str = None,
                 compare_in_selection: bool = False, compare_in_assessment: bool = False, label: Label = None,
                 number_of_processes: int = 1):
        super().__init__(name=name, state=state, label=label, result_path=result_path,
                         number_of_processes=number_of_processes)
        self.compare_in_selection = compare_in_selection
        self.compare_in_assessment = compare_in_assessment

    @staticmethod
    def _has_compatible_encoder(item) -> bool:
        """Return True if ``item.encoder`` is exactly one of COMPATIBLE_ENCODERS (subclasses are not matched)."""
        return item.encoder.__class__ in DiseaseAssociatedSequenceCVOverlap.COMPATIBLE_ENCODERS

    def _generate(self) -> ReportResult:
        """
        Build the result directory and compute the requested overlaps for every label in the state's label
        configuration. Outputs that could not be produced (None) are left out of the returned ReportResult.
        """
        PathBuilder.build(self.result_path)

        if not (self.compare_in_assessment or self.compare_in_selection):
            # Both flags default to False; without this message the report would silently come out empty.
            logging.warning(f"{DiseaseAssociatedSequenceCVOverlap.__name__}: both compare_in_selection and "
                            f"compare_in_assessment are False, so no overlap will be computed and the report "
                            f"will be empty.")

        tables, figures = [], []

        for label_name in self.state.label_configuration.get_labels_by_name():
            if self.compare_in_assessment:
                table, figure = self._generate_for_assessment(label_name)
                tables.append(table)
                figures.append(figure)
            if self.compare_in_selection:
                tmp_tables, tmp_figures = self._generate_for_selection(label_name)
                tables += tmp_tables
                figures += tmp_figures

        return ReportResult(self.name,
                            info="One heatmap per label showing the overlap of disease-associated sequences produced by the SequenceAbundance encoder between folds of cross-validation (either inner or outer loop of the nested CV)",
                            output_figures=[fig for fig in figures if fig is not None],
                            output_tables=[tab for tab in tables if tab is not None])

    def _generate_for_assessment(self, label_name: str):
        """
        Compute the overlap between the optimal assessment items of all assessment states for the given label,
        keeping only the items whose encoder is one of COMPATIBLE_ENCODERS.

        Returns a (table, figure) pair; both are None when no overlap could be computed.
        """
        optimal_items = (assessment_state.label_states[label_name].optimal_assessment_item
                         for assessment_state in self.state.assessment_states)
        hp_items = [item for item in optimal_items if self._has_compatible_encoder(item)]

        return self._compute_overlap(hp_items, f'sequence_overlap_{label_name}_assessment')

    def _generate_for_selection(self, label_name: str):
        """
        For each assessment state whose optimal hyperparameter setting uses a compatible encoder, compute the overlap
        between the selection items stored under that optimal setting.

        Returns two lists (tables, figures) with one entry per processed assessment state; entries may be None.
        """
        tables, figures = [], []

        for assessment_index, assessment_state in enumerate(self.state.assessment_states):
            selection_state = assessment_state.label_states[label_name].selection_state
            optimal_hp_setting = selection_state.optimal_hp_setting

            if not self._has_compatible_encoder(optimal_hp_setting):
                continue

            hp_items = selection_state.hp_items[optimal_hp_setting.get_key()]
            table, figure = self._compute_overlap(
                hp_items, f'sequence_overlap_{label_name}_selection_{assessment_index + 1}_split')
            tables.append(table)
            figures.append(figure)

        return tables, figures

    def _compute_overlap(self, hp_items: List[HPItem], filename: str) -> Tuple[ReportOutput, ReportOutput]:
        """
        Compute the overlap matrix for the given items and export it as a CSV table and an HTML heatmap, both named
        after ``filename``.

        Returns (table_output, figure_output), or (None, None) with a logged warning when ``hp_items`` is empty or
        when the helper returns no matrix.
        """
        if not hp_items:
            # Happens e.g. when none of the optimal items uses a compatible encoder; there is nothing to compare,
            # so skip the helper and avoid writing an empty table and figure.
            logging.warning(f"{DiseaseAssociatedSequenceCVOverlap.__name__}: no items with a compatible encoder were "
                            f"found for '{filename}', no report will be made.")
            return None, None

        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

        if overlap_matrix is None:
            logging.warning(f'{DiseaseAssociatedSequenceCVOverlap.__name__}: overlap matrix is None, some of the '
                            f'relevant sequence sets were empty, no report will be made.')
            return None, None

        # The same labels are used for rows and columns of both the table and the heatmap.
        row_col_names = [f"{item.hp_setting}_split_{item.split_index+1}" for item in hp_items]
        table_output = self._export_matrix(overlap_matrix, filename, row_col_names)
        figure_output = self._make_figure(overlap_matrix, filename, row_col_names)
        return table_output, figure_output

    def _export_matrix(self, overlap_matrix, filename, row_col_names) -> ReportOutput:
        """Write the overlap matrix to ``<result_path>/<filename>.csv`` with the given row/column labels."""
        data_path = self.result_path / f"{filename}.csv"
        pd.DataFrame(overlap_matrix, columns=row_col_names, index=row_col_names).to_csv(data_path)
        # The output name is the file name with underscores turned into spaces, followed by "data".
        return ReportOutput(data_path, " ".join(filename.split('_') + ['data']))

    def _make_figure(self, overlap_matrix, filename, row_col_names) -> ReportOutput:
        """Plot the overlap matrix as a heatmap on a fixed 0-100 colour scale and write it to ``<filename>.html``."""
        figure = px.imshow(overlap_matrix, x=row_col_names, y=row_col_names, zmin=0, zmax=100,
                           color_continuous_scale=px.colors.sequential.Teal, template='plotly_white')
        figure.update_traces(hovertemplate="Overlap of disease-associated<br>sequences between<br>%{x} and %{y}:<br>%{z}%<extra></extra>")

        figure_path = self.result_path / f"{filename}.html"
        figure.write_html(str(figure_path))
        return ReportOutput(figure_path, " ".join(filename.split('_')))