import warnings
from collections import Counter
from pathlib import Path
from typing import Union
import pandas as pd
import plotly.express as px
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.dataset.SequenceDataset import SequenceDataset
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
[docs]
class SequenceLengthDistribution(DataReport):
"""
Generates a histogram of the lengths of the sequences in a repertoire or sequence dataset.
Arguments:
sequence_type (str): whether to check the length of amino acid or nucletoide sequences; default value is 'amino_acid'
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_sld_report:
SequenceLengthDistribution:
sequence_type: amino_acid
"""
[docs]
@classmethod
def build_object(cls, **kwargs):
ParameterValidator.assert_sequence_type(kwargs)
return SequenceLengthDistribution(**{**kwargs, 'sequence_type': SequenceType[kwargs['sequence_type'].upper()]})
def __init__(self, dataset: Union[RepertoireDataset, SequenceDataset] = None, batch_size: int = 1, result_path: Path = None, number_of_processes: int = 1,
sequence_type: SequenceType = SequenceType.AMINO_ACID, name: str = None):
super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
self.batch_size = batch_size
self.sequence_type = sequence_type
[docs]
def check_prerequisites(self):
if isinstance(self.dataset, RepertoireDataset) or isinstance(self.dataset, SequenceDataset):
return True
else:
warnings.warn("SequenceLengthDistribution: report can be generated only from repertoire and sequence datasets. Skipping this report...")
return False
def _generate(self) -> ReportResult:
sequence_lengths = self._get_sequence_lengths()
PathBuilder.build(self.result_path)
df = pd.DataFrame({"counts": list(sequence_lengths.values()), 'sequence_lengths': list(sequence_lengths.keys())})
df.to_csv(self.result_path / 'sequence_length_distribution.csv', index=False)
report_output_fig = self._safe_plot(df=df, output_written=False)
output_figures = None if report_output_fig is None else [report_output_fig]
return ReportResult(name=self.name,
info="A histogram of the lengths of the sequences in a dataset.",
output_figures=output_figures, output_tables=[ReportOutput(self.result_path / 'sequence_length_distribution.csv',
'lengths of sequences in the dataset')])
def _get_sequence_lengths(self) -> Counter:
if isinstance(self.dataset, RepertoireDataset):
sequence_lengths = Counter()
for repertoire in self.dataset.get_data(self.batch_size):
seq_lengths = self._count_in_repertoire(repertoire)
sequence_lengths += seq_lengths
else:
sequence_lengths = {}
for sequence in self.dataset.get_data():
l = len(sequence.get_sequence(self.sequence_type))
if l in sequence_lengths:
sequence_lengths[l] += 1
else:
sequence_lengths[l] = 1
return sequence_lengths
def _count_in_repertoire(self, repertoire: Repertoire) -> Counter:
c = Counter([len(sequence.get_sequence(self.sequence_type)) for sequence in repertoire.sequences])
return c
def _plot(self, df: pd.DataFrame) -> ReportOutput:
figure = px.bar(df, x="sequence_lengths", y="counts")
figure.update_layout(xaxis=dict(tickmode='array', tickvals=df["sequence_lengths"]), yaxis=dict(tickmode='array', tickvals=df["counts"]),
title="Sequence length distribution", template="plotly_white")
figure.update_traces(marker_color=px.colors.diverging.Tealrose[0])
PathBuilder.build(self.result_path)
file_path = self.result_path / "sequence_length_distribution.html"
figure.write_html(str(file_path))
return ReportOutput(path=file_path, name="sequence length distribution plot")