# Source code for immuneML.reports.train_ml_model_reports.ReferenceSequenceOverlap

import logging
from collections import Counter
from pathlib import Path
from typing import Tuple

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn2

from immuneML.encodings.filtered_sequence_encoding.SequenceAbundanceEncoder import SequenceAbundanceEncoder
from immuneML.hyperparameter_optimization.states.TrainMLModelState import TrainMLModelState
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.train_ml_model_reports.TrainMLModelReport import TrainMLModelReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder

class ReferenceSequenceOverlap(TrainMLModelReport):
    """
    The ReferenceSequenceOverlap report compares a list of disease-associated sequences produced by the
    :ref:`SequenceAbundance` encoder to a list of reference receptor sequences. It outputs a Venn diagram and a list of
    receptor sequences found both in the encoder and reference.

    The report compares the sequences by their sequence content and the additional comparison_attributes (such as V or
    J gene), as specified by the user.

    Arguments:

        reference_path (str): path to the reference file in csv format which contains one entry per row and has columns
        that correspond to the attributes listed under comparison_attributes argument

        comparison_attributes (list): list of attributes to use for comparison; all of them have to be present in the
        reference file where they should be the names of the columns

        label (str): name of the label for which the reference sequences should be compared to the model; if none, it
        takes the one label from the instruction; if it is none and multiple labels were specified for the instruction,
        the report will not be generated

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        reports: # the report is defined with all other reports under definitions/reports
            my_reference_overlap_report:
                ReferenceSequenceOverlap:
                    reference_path: reference.csv # a reference file with columns listed under comparison_attributes
                    comparison_attributes:
                        - sequence_aas
                        - v_genes
                        - j_genes

    """

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the user-supplied parameters and construct the report.

        Checks the parameter keys, converts 'reference_path' to a Path, verifies that it points to an existing file and
        that every comparison attribute is a column of that file.

        Raises:
            AssertionError: if 'reference_path' is not an existing file (other validation failures are raised by
            ParameterValidator).
        """
        ParameterValidator.assert_keys(kwargs.keys(), ['reference_path', 'comparison_attributes', 'name', 'label'],
                                       ReferenceSequenceOverlap.__name__,
                                       f"reports: {kwargs['name'] if 'name' in kwargs else ''}")

        kwargs['reference_path'] = Path(kwargs['reference_path'])

        # .get() so that a missing name cannot mask the real problem with a KeyError while building the message
        assert kwargs['reference_path'].is_file(), \
            f"{ReferenceSequenceOverlap.__name__}: 'reference_path' for report {kwargs.get('name')} is not " \
            f"a valid file path."

        reference_sequences_df = pd.read_csv(kwargs['reference_path'])
        attributes = reference_sequences_df.columns.tolist()

        ParameterValidator.assert_keys_present(expected_values=kwargs['comparison_attributes'], values=attributes,
                                               location=ReferenceSequenceOverlap.__name__,
                                               parameter_name='columns in file under reference_path')

        return ReferenceSequenceOverlap(**kwargs)

    def __init__(self, reference_path: Path = None, comparison_attributes: list = None, name: str = None,
                 state: TrainMLModelState = None, result_path: Path = None, label: str = None):
        """Store the report configuration; the report name is handed to the base class initializer."""
        super().__init__(name)
        self.reference_path = reference_path
        self.comparison_attributes = comparison_attributes
        self.state = state
        self.result_path = result_path
        self.label = label

    def _generate(self) -> ReportResult:
        """
        Produce one Venn diagram and one overlap table for the optimal model and for the optimal model of each
        assessment split, skipping every model whose encoder is not a SequenceAbundanceEncoder.
        """
        figures, tables = [], []

        if ReferenceSequenceOverlap._check_encoder_class(self.state.optimal_hp_items[self.label].encoder):
            figure, data = self._compute_optimal_model_overlap()
            figures.append(figure)
            tables.append(data)

        for assessment_state in self.state.assessment_states:
            encoder = assessment_state.label_states[self.label].optimal_assessment_item.encoder
            if not ReferenceSequenceOverlap._check_encoder_class(encoder):
                continue

            split_number = assessment_state.split_index + 1
            figure_filename = self.result_path / f"assessment_split_{split_number}_model_vs_reference_overlap_{self.label}.pdf"
            # '.csv' extension added so that the per-split table is named consistently with the optimal-model table
            df_filename = self.result_path / f"assessment_split_{split_number}_overlap_sequences_{self.label}.csv"

            figure, data = self._compute_model_overlap(figure_filename, df_filename, encoder,
                                                       f"overlap sequences between the model for assessment split "
                                                       f"{split_number} and reference list")
            figures.append(figure)
            tables.append(data)

        # NOTE(review): the first positional argument was lost in the scraped source; the report name is assumed
        # here (self.name is presumably set by the base class initializer) - confirm against ReportResult.
        return ReportResult(self.name, output_figures=figures, output_tables=tables)

    @staticmethod
    def _check_encoder_class(encoder):
        """Return True if the encoder is of a type this report can extract sequences from."""
        return any(isinstance(encoder, cls) for cls in [SequenceAbundanceEncoder])

    def check_prerequisites(self):
        """
        Return True if the report can be generated.

        When no label was given, the label is inferred if exactly one label is defined in the state's label
        configuration; otherwise a warning is logged and False is returned.
        """
        if self.label is not None:
            return True

        labels = self.state.label_configuration.get_labels_by_name()
        if len(labels) != 1:
            # NOTE(review): the placeholder after "report" was empty in the scraped source; the report name is
            # assumed (self.name is presumably set by the base class initializer).
            logging.warning(f"{ReferenceSequenceOverlap.__name__}: label parameter for report {self.name} is None and it could not be inferred "
                            f"from other information available in the report. It can be inferred automatically if there is only one label "
                            f"specified in the analysis, but got {labels} instead. Skipping this "
                            f"report...")
            return False

        self.label = labels[0]
        return True

    def _compute_optimal_model_overlap(self) -> Tuple[ReportOutput, ReportOutput]:
        """Compute the overlap between the reference and the optimal model for the report's label."""
        filename = self.result_path / f"optimal_model_vs_reference_overlap_{self.label}.pdf"
        df_filename = self.result_path / f"overlap_sequences_{self.label}.csv"
        encoder = self.state.optimal_hp_items[self.label].encoder

        return self._compute_model_overlap(filename, df_filename, encoder,
                                           f"overlap sequences between the reference and the optimal model for label {self.label}")

    def _compute_model_overlap(self, figure_filename, df_filename, encoder, name):
        """
        Compare the sequences of one encoder with the reference sequences.

        Writes a Venn diagram to figure_filename and the overlapping sequences to df_filename (csv), and returns the
        two corresponding ReportOutput objects (figure, table), both described by name.
        """
        attributes = list(self.comparison_attributes)

        # read_csv(usecols=...) keeps the column order of the file, not of usecols; reindex so that reference rows
        # have the same field order as the model rows (which are selected in comparison_attributes order)
        reference_sequences_df = pd.read_csv(self.reference_path, usecols=attributes)[attributes]
        reference_sequences = self._rows_as_tuples(reference_sequences_df)
        model_sequences = self._extract_from_model(encoder)

        # plain tuples are hashable, so membership tests are O(1) instead of a scan over the other list
        reference_lookup = set(reference_sequences)
        model_lookup = set(model_sequences)

        overlap_sequences = [sequence for sequence in model_sequences if sequence in reference_lookup]
        count_overlap = len(overlap_sequences)
        count_ref_only = len([sequence for sequence in reference_sequences if sequence not in model_lookup])
        count_model_only = len([sequence for sequence in model_sequences if sequence not in reference_lookup])

        self._make_venn_diagram(count_ref_only, count_overlap, count_model_only, 'reference', 'model', figure_filename)
        figure = ReportOutput(figure_filename, name)

        pd.DataFrame.from_records(overlap_sequences, columns=attributes).to_csv(df_filename, index=False)
        data = ReportOutput(df_filename, name)

        return figure, data

    @staticmethod
    def _rows_as_tuples(df: pd.DataFrame) -> list:
        """Return the rows of df as a list of plain tuples, in row order and without the index."""
        return list(df.itertuples(index=False, name=None))

    def _extract_from_model(self, encoder):
        """
        Read the relevant sequences stored by the encoder (csv under encoder.relevant_sequence_csv_path) and return
        them as a list of tuples ordered by comparison_attributes.

        Raises:
            AssertionError: if the columns of the encoder's file are not exactly the comparison attributes.
        """
        model_sequences_df = pd.read_csv(getattr(encoder, "relevant_sequence_csv_path"))
        model_attributes = model_sequences_df.columns.tolist()

        # Both directions are checked: an extra model column made the original assertion fail, while a missing one
        # surfaced as a bare KeyError on the column selection below.
        # NOTE(review): both name placeholders were empty in the scraped source; report name and encoder name are
        # assumed, with the encoder's class name as a fallback.
        encoder_name = getattr(encoder, "name", type(encoder).__name__)
        assert set(model_attributes) == set(self.comparison_attributes), \
            f"{ReferenceSequenceOverlap.__name__}: comparison attributes from the report {self.name} ({self.comparison_attributes}) and from the optimal " \
            f"encoding {encoder_name} ({model_attributes}) do not match."

        return self._rows_as_tuples(model_sequences_df[list(self.comparison_attributes)])

    def _make_venn_diagram(self, count_ref_only: int, count_overlap: int, count_model_only: int, label_reference: str,
                           label_model: str, filename):
        """
        Draw a two-set Venn diagram (reference vs model) with the given region sizes and save it to filename.

        Labels of empty regions are blanked, and the current matplotlib figure is cleared after saving.
        """
        # venn2 region ids: "10" = reference only, "01" = model only, "11" = both
        subsets = Counter({"01": count_model_only, "10": count_ref_only, "11": count_overlap})
        diagram = venn2(subsets=subsets, set_labels=(label_reference, label_model), set_colors=('#72AAA1', '#E5B9AD'),
                        alpha=0.8)

        for region_id, region_size in subsets.items():
            region_label = diagram.get_label_by_id(region_id)
            if region_size == 0 and region_label is not None:
                region_label.set_text("")

        plt.savefig(filename)
        plt.clf()