Source code for immuneML.reports.encoding_reports.NonMotifSequenceSimilarity

import logging
from pathlib import Path

import pandas as pd
import plotly.express as px
from immuneML.data_model.datasets.ElementDataset import SequenceDataset

from immuneML.encodings.motif_encoding.MotifEncoder import MotifEncoder
from immuneML.encodings.motif_encoding.PositionalMotifHelper import PositionalMotifHelper
from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder



[docs]
class NonMotifSequenceSimilarity(EncodingReport):
    """
    Plots the similarity of positions outside the motifs of interest. This report can be used to investigate if the
    motifs of interest as determined by the :py:obj:`~immuneML.encodings.motif_encoding.MotifEncoder.MotifEncoder`
    have a tendency occur in sequences that are naturally very similar or dissimilar.

    For each motif, the subset of sequences containing the motif is selected, and the hamming distances are computed
    between all sequences in this subset. Finally, a plot is created showing the distribution of hamming distances
    between the sequences containing the motif. For motifs occurring in sets of very similar sequences, this distribution
    will lean towards small hamming distances. Likewise, for motifs occurring in a very diverse set of sequences, the
    distribution will lean towards containing more large hamming distances.


    **Specification arguments:**

    - motif_color_map (dict): An optional mapping between motif sizes and colors. If no mapping is given, default colors will be chosen.


    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_motif_sim:
                    NonMotifSimilarity:
                        motif_color_map:
                            3: "#66C5CC"
                            4: "#F6CF71"
                            5: "#F89C74"

    """


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        if "motif_color_map" in kwargs:
            ParameterValidator.assert_type_and_value(kwargs["motif_color_map"], dict, NonMotifSequenceSimilarity.__name__, "motif_color_map")
            kwargs["motif_color_map"] = {str(key): value for key, value in kwargs["motif_color_map"].items()}

        return NonMotifSequenceSimilarity(**kwargs)


    def __init__(self, dataset: SequenceDataset = None, result_path: Path = None,
                 motif_color_map: dict = None, name: str = None, number_of_processes: int = 1):
        super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)
        self.sequence_length = 0
        self.motif_color_map = motif_color_map

    def _generate(self):
        PathBuilder.build(self.result_path)

        raw_counts = self.get_hamming_distance_counts()
        plotting_data = self.get_plotting_data(raw_counts)

        table_1 = self._write_output_table(raw_counts, self.result_path / "sequence_hamming_distances_raw.tsv",
                                                "Hamming distances between sequences sharing the same motif, raw counts.")
        table_2 = self._write_output_table(plotting_data, self.result_path / "sequence_hamming_distances_percentage.tsv",
                                                "Hamming distances between sequences sharing the same motif, expressed as percentage.")

        output_figure = self._safe_plot(plotting_data=plotting_data)

        return ReportResult(
            name=self.name,
            output_figures=[output_figure],
            output_tables=[table_1, table_2],
        )


[docs]
    def get_hamming_distance_counts(self):
        np_sequences = PositionalMotifHelper.get_numpy_sequence_representation(self.dataset)
        self.sequence_length = len(np_sequences[0])

        raw_counts = pd.DataFrame([self._make_hamming_distance_hist_for_motif(motif_presence, np_sequences)
                                   for motif_presence in self.dataset.encoded_data.examples.T])

        ### Original code with multiprocessing (fails with bionumpy + pickle error?)
        # with Pool(processes=self.number_of_processes) as pool:
        #     partial_func = partial(self._make_hamming_distance_hist_for_motif,  np_sequences=np_sequences)
        #     raw_counts = pd.DataFrame(pool.map(partial_func, self.dataset.encoded_data.examples.T))

        raw_counts["motif"] = self.dataset.encoded_data.feature_names

        return raw_counts

    def _make_hamming_distance_hist_for_motif(self, motif_presence, np_sequences):
        positive_sequences = np_sequences[motif_presence]
        return self._make_hamming_distance_hist(positive_sequences)
    def _make_hamming_distance_hist(self, sequences):
        counts = {i: 0 for i in range(self.sequence_length)}
        for dist in self._calculate_all_hamming_distances(sequences):
            counts[dist] += 1

        return counts
    def _calculate_all_hamming_distances(self, sequences):
        for i in range(len(sequences)):
            for j in range(i + 1, len(sequences)):
                yield sum(sequences[i] != sequences[j])


[docs]
    def get_plotting_data(self, raw_counts):
        motif_col = raw_counts.loc[:,"motif"]
        counts = raw_counts.loc[:, raw_counts.columns != "motif"]

        total = counts.apply(sum, axis=1)
        plotting_data = counts.div(total, axis=0)
        plotting_data["motif"] = motif_col
        plotting_data = plotting_data[total > 0]
        plotting_data = plotting_data.loc[::-1]
        plotting_data = pd.melt(plotting_data, id_vars=["motif"], var_name="Hamming", value_name="Percentage")

        plotting_data["Hamming"] = plotting_data["Hamming"].astype(str)
        plotting_data["motif_size"] = plotting_data["motif"].apply(lambda motif: len(motif.split("-")[0].split("&")))

        return plotting_data


    def _plot(self, plotting_data) -> ReportOutput:
        if self.motif_color_map is not None:
            color_discrete_map = self.motif_color_map
            color_discrete_sequence = None
        else:
            color_discrete_map = None
            color_discrete_sequence = px.colors.sequential.Sunsetdark

        plotting_data["motif_size"] = plotting_data["motif_size"].astype(str)

        fig = px.line(plotting_data, x='Hamming', y='Percentage', markers=True, line_group='motif', color="motif_size",
                      template='plotly_white', color_discrete_sequence=color_discrete_sequence, color_discrete_map=color_discrete_map,
                      labels={"Percentage": "Percentage of sequence pairs containing motif",
                              "Hamming": "Hamming distance between sequences",
                              "motif_size": "Motif length<br>(number of<br>amino acids)"})

        fig.layout.yaxis.tickformat = ',.0%'

        fig.update_traces(opacity=0.7)

        fig.update_layout(
            font=dict(
                size=14,
            )
        )

        output_path = self.result_path / "sequence_hamming_distances.html"

        output_path = PlotlyUtil.write_image_to_file(fig, output_path)

        return ReportOutput(path=output_path, name="Hamming distances between sequences sharing the same motif")


[docs]
    def check_prerequisites(self):
        valid_encodings = [MotifEncoder.__name__]

        if self.dataset.encoded_data is None or self.dataset.encoded_data.info is None:
            logging.warning(f"{self.__class__.__name__}: the dataset is not encoded, skipping this report...")
            return False
        elif self.dataset.encoded_data.encoding not in valid_encodings:
            logging.warning(f"{self.__class__.__name__}: the dataset encoding ({self.dataset.encoded_data.encoding}) was not in the list of valid "
                            f"encodings ({valid_encodings}), skipping this report...")
            return False
        else:
            return True