Source code for immuneML.reports.encoding_reports.PositionalMotifFrequencies

from pathlib import Path

import logging
import plotly.express as px
import pandas as pd
from typing import List

from immuneML.data_model import bnp_util
from immuneML.data_model.datasets.ElementDataset import SequenceDataset
from immuneML.encodings.motif_encoding.PositionalMotifHelper import (
    PositionalMotifHelper,
)
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.encodings.motif_encoding.MotifEncoder import MotifEncoder
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder



[docs]
class PositionalMotifFrequencies(EncodingReport):
    """
    This report must be used in combination with the :py:obj:`~immuneML.encodings.motif_encoding.MotifEncoder.MotifEncoder`.
    Plots a stacked bar plot of amino acid occurrence at different indices in any given dataset, along with a plot
    investigating motif continuity which displays a bar plot of the gap sizes between the amino acids in the motifs in
    the given dataset. Note that a distance of 1 means that the amino acids are continuous (next to each other).

    **Specification arguments:**

    - motif_color_map (dict): Optional mapping between motif lengths and specific colors to be used. Example:

        motif_color_map:
            1: #66C5CC
            2: #F6CF71
            3: #F89C74


    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_pos_motif_report:
                    PositionalMotifFrequencies:
                        motif_color_map:

    """


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        if "motif_color_map" in kwargs:
            ParameterValidator.assert_type_and_value(kwargs["motif_color_map"], dict,
                                                     PositionalMotifFrequencies.__name__, "motif_color_map")
            kwargs["motif_color_map"] = {str(key): value for key, value in kwargs["motif_color_map"].items()}

        return PositionalMotifFrequencies(**kwargs)


    def __init__(self, dataset: SequenceDataset = None, result_path: Path = None,
                 motif_color_map: dict = None, name: str = None, number_of_processes: int = 1):
        super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)
        self.motif_color_map = motif_color_map
        self.region_type = None
        self.sequence_type = None


[docs]
    def get_sequence_length(self):
        if self.sequence_type is None or self.region_type is None:
            self.sequence_type = self.dataset.encoded_data.info['sequence_type']
            self.region_type = self.dataset.encoded_data.info['region_type']

        my_sequences = getattr(self.dataset.data, bnp_util.get_sequence_field_name(self.region_type, self.sequence_type))
        return my_sequences.lengths[0]


    def _generate(self):
        PathBuilder.build(self.result_path)

        motifs = self.dataset.encoded_data.feature_names
        positional_aa_counts_df = self._get_positional_aa_counts(motifs)
        max_gap_size_df = self._get_max_gap_sizes(motifs)
        total_gap_size_df = self._get_total_gap_sizes(motifs)

        positional_aa_counts_table = self._write_output_table(positional_aa_counts_df,
                                                              file_path=self.result_path / f"positional_aa_counts.csv",
                                                              name="Frequencies of amino acids found in the high-precision high-recall motifs")

        max_gap_size_table = self._write_output_table(max_gap_size_df,
                                                      file_path=self.result_path / f"max_gap_size_table.csv",
                                                      name="Maximum gap sizes within motifs")

        total_gap_size_table = self._write_output_table(total_gap_size_df,
                                                        file_path=self.result_path / f"total_gap_size_table.csv",
                                                        name="Total (summed) gap sizes within motifs")

        output_figure1 = self._safe_plot(positional_aa_counts_df=positional_aa_counts_df,
                                         plot_callable="_plot_positional_aa_counts")
        output_figure2 = self._safe_plot(gap_size_df=max_gap_size_df, x="max_gap_size", plot_callable="_plot_gap_sizes")
        output_figure3 = self._safe_plot(gap_size_df=total_gap_size_df, x="total_gap_size",
                                         plot_callable="_plot_gap_sizes")

        return ReportResult(
            name=self.name,
            output_figures=[fig for fig in [output_figure1, output_figure2, output_figure3] if fig is not None],
            output_tables=[max_gap_size_table, total_gap_size_table, positional_aa_counts_table],
        )

    def _get_total_gap_sizes(self, motifs):
        data = {"motif_size": [],
                "total_gap_size": [],
                "occurrence": []}

        gap_size_count = {}

        for motif in motifs:
            motif_indices, amino_acids = PositionalMotifHelper.string_to_motif(motif, "&", "-")
            total_gap_size = (max(motif_indices) - min(motif_indices)) - len(motif_indices) + 1
            motif_size = len(motif_indices)

            if motif_size not in gap_size_count:
                gap_size_count[motif_size] = {total_gap_size: 1}
            else:
                if total_gap_size not in gap_size_count[motif_size]:
                    gap_size_count[motif_size][total_gap_size] = 1
                else:
                    gap_size_count[motif_size][total_gap_size] += 1

        for motif_size, counts in gap_size_count.items():
            for total_gap_size, occurrence in counts.items():
                data["motif_size"].append(motif_size)
                data["total_gap_size"].append(total_gap_size)
                data["occurrence"].append(occurrence)

        return pd.DataFrame(data)

    def _get_max_gap_sizes(self, motifs):
        gap_size_count = {
            motif_size: {
                gap_size: 0 for gap_size in range(0, self.get_sequence_length() - 1)
            }
            for motif_size in range(self.get_sequence_length())
        }

        for motif in motifs:
            indices, amino_acids = PositionalMotifHelper.string_to_motif(motif, "&", "-")
            motif_size = len(indices)

            if motif_size > 1:
                gap_size = max([indices[i + 1] - indices[i] - 1 for i in range(motif_size - 1)])
                gap_size_count[motif_size][gap_size] += 1

        motif_sizes = list()
        max_gap_sizes = list()
        occurrence = list()
        for motif_size in gap_size_count:
            if sum(gap_size_count[motif_size].values()) != 0:
                for gap_size in gap_size_count[motif_size]:
                    motif_sizes.append(str(motif_size))
                    max_gap_sizes.append(gap_size)
                    occurrence.append(gap_size_count[motif_size][gap_size])

        gap_size_df = pd.DataFrame()
        gap_size_df["motif_size"] = motif_sizes
        gap_size_df["max_gap_size"] = max_gap_sizes
        gap_size_df["occurrence"] = occurrence

        return gap_size_df

    def _get_positional_aa_counts(self, motifs):
        positional_aa_counts = {
            aa: [0 for i in range(self.get_sequence_length())]
            for aa in EnvironmentSettings.get_sequence_alphabet(SequenceType.AMINO_ACID)
        }

        for motif in motifs:
            indices, amino_acids = PositionalMotifHelper.string_to_motif(
                motif, "&", "-"
            )

            for index, amino_acid in zip(indices, amino_acids):
                positional_aa_counts[amino_acid][index] += 1

        positional_aa_counts_df = pd.DataFrame(positional_aa_counts)
        positional_aa_counts_df = positional_aa_counts_df.loc[
                                  :, (positional_aa_counts_df != 0).any(axis=0)
                                  ]

        # start counting positions at 1
        positional_aa_counts_df.index = [idx + 1 for idx in list(positional_aa_counts_df.index)]

        return positional_aa_counts_df

    def _plot_gap_sizes(self, gap_size_df, x):
        file_path = self.result_path / f"{x}.html"

        gap_size_df["occurrence_total"] = gap_size_df.groupby("motif_size")["occurrence"].transform(sum)
        gap_size_df["occurrence_percentage"] = gap_size_df["occurrence"] / gap_size_df["occurrence_total"]

        x_label = x.replace("_", " ").capitalize()

        if self.motif_color_map is not None:
            color_discrete_map = self.motif_color_map
            color_discrete_sequence = None
        else:
            color_discrete_map = None
            color_discrete_sequence = self._get_color_discrete_sequence()

        gap_size_df = gap_size_df.sort_values(by=x)
        gap_size_df["motif_size"] = gap_size_df["motif_size"].astype(str)

        gap_size_fig = px.line(
            gap_size_df,
            x=x,
            y="occurrence_percentage",
            color="motif_size",
            color_discrete_map=color_discrete_map,
            color_discrete_sequence=color_discrete_sequence,
            category_orders=dict(motif_size=sorted([int(rate) for rate in gap_size_df["motif_size"].unique()])),
            template="plotly_white",
            markers=True,
            labels={
                x: x_label,
                "occurrence_percentage": "Percentage of motifs",
                "motif_size": "Motif size",
            },
        )
        gap_size_fig.layout.yaxis.tickformat = ',.0%'

        gap_size_fig.update_layout(font={"size": 14}, xaxis={"tickmode": "linear"})
        file_path = PlotlyUtil.write_image_to_file(gap_size_fig, file_path)

        return ReportOutput(
            path=file_path,
            name=f"Gap size between amino acids in high-precision high-recall motifs",
        )

    def _plot_positional_aa_counts(self, positional_aa_counts_df):
        file_path = self.result_path / f"positional_motif_frequencies.html"

        # reverse sort column names makes amino acids stack alphabetically in bar chart
        positional_aa_counts_df = positional_aa_counts_df[sorted(positional_aa_counts_df.columns)[::-1]]

        positional_aa_counts_fig = px.bar(
            positional_aa_counts_df,
            labels={
                "index": "Amino acid position",
                "value": "Frequency across motifs",
            },
            text="variable",
            color_discrete_map=PlotlyUtil.get_amino_acid_color_map(),
            template="plotly_white",
        )
        positional_aa_counts_fig.update_layout(
            showlegend=False, font={"size": 14}, xaxis={"tickmode": "linear"}
        )
        file_path = PlotlyUtil.write_image_to_file(positional_aa_counts_fig, file_path)
        return ReportOutput(
            path=file_path,
            name=f"Frequencies of amino acids found in the high-precision high-recall motifs",
        )

    def _get_color_discrete_sequence(self):
        return px.colors.qualitative.Pastel[:-1] + px.colors.qualitative.Set3


[docs]
    def check_prerequisites(self):
        valid_encodings = [MotifEncoder.__name__]

        if self.dataset.encoded_data is None or self.dataset.encoded_data.info is None:
            logging.warning(
                "PositonalMotifFrequencies: the dataset is not encoded, skipping this report..."
            )
            return False
        elif self.dataset.encoded_data.encoding not in valid_encodings:
            logging.warning(
                f"PositonalMotifFrequencies: the dataset encoding ({self.dataset.encoded_data.encoding}) was not in the list of valid "
                f"encodings ({valid_encodings}), skipping this report..."
            )
            return False

        return True