# Source code for immuneML.reports.data_reports.LabelDist

from pathlib import Path

import math
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder



class LabelDist(DataReport):
    """
    LabelDist report plots the distribution of label values for all labels provided as
    input to the report. Every label is drawn in its own subplot: labels with a numeric
    dtype are shown as histograms, all other labels as bar charts of value counts.

    Specification arguments:

    - labels (list): list of label names as they appear in the metadata file (RepertoireDataset)
      or in data files (Receptor/SequenceDataset).

    YAML specification:

    .. code-block:: yaml

        reports:
            label_dist_report:
                LabelDist:
                    labels: ['diagnosis', 'age_group', 'batch']
    """

    def __init__(self, dataset: Dataset = None, result_path: Path = None, name: str = None,
                 number_of_processes: int = 1, labels: list = None):
        """
        Store the labels to plot; all other arguments are forwarded unchanged to DataReport.

        Args:
            dataset: dataset whose label metadata is plotted.
            result_path: directory in which the figure is written.
            name: report name, also used as the prefix of the output file name.
            number_of_processes: forwarded to the parent class; not used by this report itself.
            labels: names of the labels whose distributions are plotted.
        """
        super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)
        self.labels = labels

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the specification arguments and create the report.

        The expected keys are 'labels' and 'name', and 'labels' is expected to hold strings;
        both checks are delegated to ParameterValidator.

        Returns:
            a LabelDist instance with only name and labels set.
        """
        ParameterValidator.assert_keys(list(kwargs.keys()), ["labels", 'name'], "LabelDist report", "LabelDist")
        ParameterValidator.assert_all_type_and_value(kwargs["labels"], str, "LabelDist", "labels")
        return cls(name=kwargs["name"], labels=kwargs["labels"])

    @staticmethod
    def _build_trace(values, title: str, color: str):
        """Return a histogram for a numeric column and a bar chart of value counts otherwise."""
        if pd.api.types.is_numeric_dtype(values):
            return go.Histogram(x=values, name=title, marker_color=color)

        # value_counts() leaves out missing values by default, so they get no bar
        counts = values.value_counts()
        return go.Bar(x=counts.index.astype(str), y=counts.values, name=title, marker_color=color)

    def _generate(self) -> ReportResult:
        """
        Plot one subplot per label in a two-column grid and write the figure to an HTML file.

        Returns:
            ReportResult with a single output figure located at
            <result_path>/<name>_label_distributions.html.

        Raises:
            ValueError: if the metadata retrieved for the requested labels has no columns.
        """
        df = self.dataset.get_metadata(self.labels, return_df=True)

        # without columns the grid would have zero rows, which make_subplots does not accept;
        # fail here with a message that points at the labels instead
        if len(df.columns) == 0:
            raise ValueError(f"LabelDist report ({self.name}): no label columns are available to plot "
                             f"(labels: {self.labels}).")

        # subplot_titles is documented as a list of str, so the pandas Index is converted
        titles = [str(label) for label in df.columns]

        n_cols = 2
        n_rows = math.ceil(len(titles) / n_cols)

        fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=titles)

        # the palette is reused cyclically when there are more labels than colors
        palette = px.colors.qualitative.Vivid

        for i, (label, title) in enumerate(zip(df.columns, titles)):
            # subplots are filled row by row; plotly grid positions are 1-based
            grid_row, grid_col = divmod(i, n_cols)
            trace = self._build_trace(df[label], title, palette[i % len(palette)])
            fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

        # 300 px of height per grid row; legend is off because each subplot title names its label
        fig.update_layout(template='plotly_white', height=300 * n_rows, showlegend=False,
                          title_text="Label distributions")

        path = PathBuilder.build(self.result_path) / f"{self.name}_label_distributions.html"
        PlotlyUtil.write_image_to_file(fig, path, df.shape[0])

        return ReportResult(name=self.name, info='Label distributions',
                            output_figures=[ReportOutput(name='Label distributions', path=path)])