Source code for immuneML.reports.data_reports.SequenceLengthDistribution

import warnings
from collections import Counter
from pathlib import Path

import pandas as pd
import plotly.express as px

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.PathBuilder import PathBuilder


class SequenceLengthDistribution(DataReport):
    """
    Generates a histogram of the lengths of the sequences in a RepertoireDataset.

    The lengths are tallied over every repertoire of the dataset and drawn as a bar chart
    (sequence length on the x-axis, number of sequences on the y-axis), which is written
    to an HTML file in the result path of the report.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_sld_report: SequenceLengthDistribution

    """

    @classmethod
    def build_object(cls, **kwargs):
        """Create the report, forwarding all keyword arguments unchanged to the constructor."""
        return SequenceLengthDistribution(**kwargs)

    def __init__(self, dataset: RepertoireDataset = None, batch_size: int = 1, result_path: Path = None,
                 number_of_processes: int = 1, name: str = None):
        """
        Store the report configuration.

        Args:
            dataset: the dataset to report on; only a RepertoireDataset passes check_prerequisites().
            batch_size: value handed to ``dataset.get_data()`` when iterating over repertoires.
            result_path: directory in which the HTML figure is written.
            number_of_processes: forwarded to the parent class constructor.
            name: name of the report, also used as the name of the returned ReportResult.
        """
        super().__init__(dataset=dataset, result_path=result_path,
                         number_of_processes=number_of_processes, name=name)
        self.batch_size = batch_size

    def check_prerequisites(self):
        """
        Tell whether the report can be generated for the configured dataset.

        Returns:
            True if the dataset is a RepertoireDataset; otherwise a warning is issued and False is returned.
        """
        if not isinstance(self.dataset, RepertoireDataset):
            warnings.warn("SequenceLengthDistribution: report can be generated only from RepertoireDataset. Skipping this report...")
            return False

        return True

    def _generate(self) -> ReportResult:
        """Count the sequence lengths, plot them, and wrap the resulting figure in a ReportResult."""
        length_counts = self._get_sequence_lengths()
        figure_output = self._plot(sequence_lengths=length_counts)

        # output_figures stays None (instead of an empty list) when no figure was produced
        if figure_output is None:
            figures = None
        else:
            figures = [figure_output]

        return ReportResult(name=self.name,
                            info="A histogram of the lengths of the sequences in a RepertoireDataset.",
                            output_figures=figures)

    def _get_sequence_lengths(self) -> Counter:
        """
        Accumulate the per-repertoire length counts over the whole dataset.

        Returns:
            a Counter mapping sequence length to the number of sequences of that length.
        """
        totals = Counter()

        for repertoire in self.dataset.get_data(self.batch_size):
            totals += self._count_in_repertoire(repertoire)

        return totals

    def _count_in_repertoire(self, repertoire: Repertoire) -> Counter:
        """Return a Counter of ``len(sequence.get_sequence())`` over ``repertoire.sequences``."""
        lengths = (len(sequence.get_sequence()) for sequence in repertoire.sequences)
        return Counter(lengths)

    def _plot(self, sequence_lengths: Counter):
        """
        Draw the length counts as a bar chart and store it as an HTML file.

        Args:
            sequence_lengths: Counter mapping sequence length to the number of sequences of that length.

        Returns:
            a ReportOutput pointing to ``sequence_length_distribution.html`` inside the result path.
        """
        counts = list(sequence_lengths.values())
        lengths = list(sequence_lengths.keys())
        df = pd.DataFrame({"counts": counts, 'sequence_lengths': lengths})

        figure = px.bar(df, x="sequence_lengths", y="counts")

        # ticks are placed exactly at the observed lengths / counts rather than chosen automatically
        x_axis = dict(tickmode='array', tickvals=df["sequence_lengths"])
        y_axis = dict(tickmode='array', tickvals=df["counts"])
        figure.update_layout(xaxis=x_axis, yaxis=y_axis,
                             title="Sequence length distribution", template="plotly_white")
        figure.update_traces(marker_color=px.colors.diverging.Tealrose[0])

        # PathBuilder.build is called on the result path before the figure is written into it
        PathBuilder.build(self.result_path)
        file_path = self.result_path / "sequence_length_distribution.html"
        figure.write_html(str(file_path))

        return ReportOutput(path=file_path, name="sequence length distribution plot")