Source code for immuneML.reports.data_reports.RepertoireClonotypeSummary

import logging
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import plotly.express as px

from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder



[docs]
class RepertoireClonotypeSummary(DataReport):
    """
    Shows the number of distinct clonotypes per repertoire in a given dataset as a bar plot.

    Arguments:
        color_by_label (str): name of the label to use to color the plot, e.g., could be disease label, or None

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_clonotype_summary_rep:
          RepertoireClonotypeSummary:
            color_by_label: celiac

    """

    def __init__(
        self,
        dataset: Dataset = None,
        result_path: Path = None,
        name: str = None,
        number_of_processes: int = 1,
        color_by_label: str = None,
    ):
        """
        Set up the report and remember which label (if any) is used for coloring.

        Arguments:
            dataset (Dataset): passed on unchanged to the parent constructor
            result_path (Path): passed on unchanged to the parent constructor
            name (str): passed on unchanged to the parent constructor
            number_of_processes (int): passed on unchanged to the parent constructor
            color_by_label (str): name of the label used to color the bars of the plot, or None
        """
        super().__init__(dataset, result_path, name, number_of_processes)
        self.color_by_label = color_by_label


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the supplied parameters and create the report object from them.

        `color_by_label` is optional: it is checked with ParameterValidator only when it is
        present and not None. All keyword arguments are then passed to the constructor as given.
        """
        label = kwargs.get('color_by_label')
        if label is not None:
            ParameterValidator.assert_type_and_value(label, str, RepertoireClonotypeSummary.__name__, 'color_by_label')

        return RepertoireClonotypeSummary(**kwargs)


    def _generate(self) -> ReportResult:
        """
        Check that the report was given a repertoire dataset, prepare the result path and produce the plot.

        Returns:
            ReportResult: whatever `self._safe_plot()` returns (that helper is not defined in this class;
            presumably it wraps `_plot` - confirm in the parent class)

        Raises:
            AssertionError: if `self.dataset` is not a RepertoireDataset
        """
        # Raised explicitly rather than via `assert`, so the check still runs when Python is started with -O;
        # the exception type and message are the same as before.
        if not isinstance(self.dataset, RepertoireDataset):
            raise AssertionError(f"{RepertoireClonotypeSummary.__name__}: expected repertoire dataset, but got {type(self.dataset)}.")

        PathBuilder.build(self.result_path)
        return self._safe_plot()

    def _plot(self) -> ReportResult:
        """
        Build the table of clonotype counts per repertoire, plot it as a bar chart and store both.

        Repertoires are sorted by increasing clonotype count and numbered from 1 in that order
        (column 'repertoire'). The table is written to 'clonotype_count_per_repertoire.csv' and the
        figure to 'clonotype_count_per_repertoire.html', both under `self.result_path`.

        Returns:
            ReportResult: references the written HTML figure and CSV table
        """
        # An unset label is normalised to None so that it is never used as a column name below.
        label = self.color_by_label or None

        records = sorted((self._get_clonotype_count_with_label(repertoire) for repertoire in self.dataset.get_data()),
                         key=lambda record: record[0])

        if label is not None:
            clonotypes = pd.DataFrame.from_records(records, columns=['clonotype_count', label])
        else:
            # Without a label there is nothing to store next to the counts; creating a column named
            # None would only add an unnamed, empty column to the exported CSV.
            clonotypes = pd.DataFrame({'clonotype_count': [count for count, _ in records]})

        # The rank is derived from the rows actually collected, so it always matches the table length.
        clonotypes['repertoire'] = list(range(1, len(records) + 1))

        table_path = self.result_path / 'clonotype_count_per_repertoire.csv'
        figure_path = self.result_path / 'clonotype_count_per_repertoire.html'

        fig = px.bar(clonotypes, x='repertoire', y='clonotype_count', color=label, title='Clonotype count per repertoire',
                     color_discrete_sequence=px.colors.qualitative.Pastel2)
        fig.update_layout(template="plotly_white", yaxis_title='clonotype count', xaxis_title='repertoires sorted by clonotype count')
        clonotypes.to_csv(table_path)
        fig.write_html(str(figure_path))

        return ReportResult(name=self.name, info="Clonotype count per repertoire",
                            output_figures=[ReportOutput(figure_path)],
                            output_tables=[ReportOutput(table_path)])

    def _get_clonotype_count_with_label(self, repertoire: Repertoire) -> Tuple[int, str]:
        """
        Count the distinct sequences of one repertoire and look up its value for the coloring label.

        The 'sequences' attribute of the repertoire is used; when it is None, the result of
        `repertoire.get_sequence_aas()` is used instead. A warning is logged when the repertoire
        contains more sequences than distinct sequences.

        Returns:
            a tuple (number of distinct sequences, `repertoire.metadata[self.color_by_label]`),
            where the second element is None when `self.color_by_label` is not set
        """
        sequences = repertoire.get_attribute('sequences')
        if sequences is None:
            sequences = repertoire.get_sequence_aas()

        unique_sequence_count = np.unique(sequences).shape[0]
        sequence_count = sequences.shape[0]

        if unique_sequence_count != sequence_count:
            logging.warning(f"{RepertoireClonotypeSummary.__name__}: for repertoire {repertoire.identifier}, there are {sequence_count} sequences, "
                            f"but {unique_sequence_count} unique sequences.")

        # The metadata is only consulted when a label was configured.
        if self.color_by_label:
            label_value = repertoire.metadata[self.color_by_label]
        else:
            label_value = None

        return unique_sequence_count, label_value