# Source code for immuneML.reports.encoding_reports.GroundTruthMotifOverlap

from pathlib import Path

import logging
import plotly.express as px
import pandas as pd

from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.encodings.motif_encoding.PositionalMotifHelper import PositionalMotifHelper
from immuneML.reports.PlotlyUtil import PlotlyUtil

from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.encodings.motif_encoding.MotifEncoder import MotifEncoder
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.PathBuilder import PathBuilder



# [docs] -- Sphinx source-link marker left over from the HTML page, not part of the code
class GroundTruthMotifOverlap(EncodingReport):
    """
    Creates report displaying overlap between learned motifs and groundtruth motifs implanted in a given sequence dataset.
    This report must be used in combination with the MotifEncoder.

    **Specification arguments:**

    - groundtruth_motifs_path (str): Path to a .tsv file containing groundtruth position-specific motifs.
      The file should specify the motifs as position-specific amino acids, one column representing the positions
      concatenated with an '&' symbol, the next column specifying the amino acids concatenated with '&' symbol,
      and the last column (n_sequences) specifying the implant rate, i.e. the number of sequences
      in which the motif was implanted.

      Example:

      =======  ===========  ===========
      indices  amino_acids  n_sequences
      =======  ===========  ===========
      0        A            4
      4&8&9    G&A&C        30
      =======  ===========  ===========

      This file shows a motif 'A' at position 0 implanted in 4 sequences, and a motif G---AC
      (G at position 4, A at position 8, C at position 9) implanted in 30 sequences.


    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_ground_truth_motif_report:
                    GroundTruthMotifOverlap:
                        groundtruth_motifs_path: path/to/file.tsv
            """

    def __init__(self, dataset: Dataset = None, result_path: Path = None, name: str = None,
                 number_of_processes: int = 1, groundtruth_motifs_path: str = None):
        """
        Set up the report.

        dataset, result_path, name and number_of_processes are forwarded as keyword arguments to the
        EncodingReport constructor; groundtruth_motifs_path is the only attribute stored by this class itself.
        """
        parent_kwargs = {
            "dataset": dataset,
            "result_path": result_path,
            "name": name,
            "number_of_processes": number_of_processes,
        }
        super().__init__(**parent_kwargs)

        self.groundtruth_motifs_path = groundtruth_motifs_path


    # [docs] -- Sphinx source-link marker left over from the HTML page, not part of the code
    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the report parameters and construct the report object.

        If a groundtruth_motifs_path other than None is given, it is handed to
        PositionalMotifHelper.check_motif_filepath together with the expected tab-separated header
        before the report is instantiated with the unchanged keyword arguments.
        """
        motifs_path = kwargs.get("groundtruth_motifs_path")

        if motifs_path is not None:
            PositionalMotifHelper.check_motif_filepath(
                motifs_path,
                GroundTruthMotifOverlap.__name__,
                "groundtruth_motifs_path",
                expected_header="indices\tamino_acids\tn_sequences\n",
            )

        return GroundTruthMotifOverlap(**kwargs)


    def _generate(self):
        """
        Produce the report: one table with all overlapping (learned, ground truth) motif pairs
        and, if plotting succeeds, one figure summarising them.
        """
        PathBuilder.build(self.result_path)

        motifs, rates_by_motif = self._read_groundtruth_motifs(self.groundtruth_motifs_path)
        feature_names = self.dataset.encoded_data.feature_names
        overlap_df = self._generate_overlap(feature_names, motifs, rates_by_motif)

        table_path = self.result_path / "ground_truth_motif_overlap.tsv"
        table = self._write_output_table(overlap_df, table_path, name=None)

        # _safe_plot may hand back None; in that case the report simply has no figures
        figure = self._safe_plot(overlap_df=overlap_df)
        figures = [] if figure is None else [figure]

        return ReportResult(name=self.name, output_figures=figures, output_tables=[table])

    def _read_groundtruth_motifs(self, filepath):
        """
        Read the ground truth motifs and their implant rates from a tab-separated file.

        The first line is checked against the expected header via PositionalMotifHelper.check_file_header.
        Every following non-blank line is parsed with _get_motif_and_implant_rate; blank lines
        (e.g. a trailing empty line at the end of the file) are skipped instead of failing to parse.

        Returns:
            a tuple (groundtruth_motifs, implant_rate_dict): the list of motif strings in file order
            (duplicates kept), and a dict mapping each motif string to its implant rate as read from the
            file (a string; for duplicated motifs the last occurrence wins).
        """
        groundtruth_motifs = []
        implant_rate_dict = {}

        with open(filepath) as file:
            PositionalMotifHelper.check_file_header(file.readline(), filepath, expected_header="indices\tamino_acids\tn_sequences\n")

            # iterate lazily over the remaining lines instead of loading them all with readlines()
            for line in file:
                if not line.strip():
                    continue

                motif, implant_rate = self._get_motif_and_implant_rate(line, motif_sep="\t")
                groundtruth_motifs.append(motif)
                implant_rate_dict[motif] = implant_rate

        return groundtruth_motifs, implant_rate_dict

    def _get_motif_and_implant_rate(self, string, motif_sep):
        indices_str, amino_acids_str, implant_rate = string.strip().split(motif_sep)
        motif = indices_str + "-" + amino_acids_str
        return motif, implant_rate

    def _generate_overlap(self, learned_motifs, groundtruth_motifs, implant_rate_dict):
        motif_size_list = list()
        implant_rate_list = list()
        max_overlap_list = list()
        learned_motif_list = list()
        gt_motif_list = list()

        for learned_motif in learned_motifs:
            motif_size = len(learned_motif.split("-")[0].replace("&", ""))
            for groundtruth_motif in groundtruth_motifs:
                max_overlap = self._get_max_overlap(learned_motif, groundtruth_motif)

                if max_overlap != 0:
                    motif_size_list.append(motif_size)
                    implant_rate_list.append(implant_rate_dict[groundtruth_motif])
                    max_overlap_list.append(max_overlap)
                    learned_motif_list.append(learned_motif)
                    gt_motif_list.append(groundtruth_motif)

        df = pd.DataFrame()
        df["learned_motif"] = learned_motif_list
        df["ground_truth_motif"] = gt_motif_list
        df["implant_rate"] = implant_rate_list
        df["max_overlap"] = max_overlap_list
        df["motif_size"] = motif_size_list

        return df

    def _get_max_overlap(self, learned_motif, groundtruth_motif):
        # assumes no duplicates will occur as is the case with motifs

        split_learned = learned_motif.replace("&", "").split("-")
        split_groundtruth = groundtruth_motif.replace("&", "").split("-")

        learned_aa = split_learned[0]
        learned_indices = split_learned[1]
        groundtruth_aa = split_groundtruth[0]
        groundtruth_indices = split_groundtruth[1]

        learned_pairs = [learned_aa[i] + learned_indices[i] for i in range(len(learned_aa))]
        groundtruth_pairs = [groundtruth_aa[i] + groundtruth_indices[i] for i in range(len(groundtruth_aa))]

        score = 0
        for pair in learned_pairs:
            if pair in groundtruth_pairs:
                score += 1

        return score

    def _get_color_discrete_sequence(self):
        """Return the colors used in the plot: plotly's Pastel palette without its last entry, followed by Set3."""
        palettes = px.colors.qualitative
        pastel_without_last = palettes.Pastel[:-1]
        return pastel_without_last + palettes.Set3

    def _plot(self, overlap_df) -> ReportOutput:
        """
        Draw a histogram of implant rates, with one facet column per overlap value and
        bars colored by motif size, and write it to motif_overlap.html in the result path.

        Arguments:
            overlap_df: DataFrame with at least the columns implant_rate, max_overlap and motif_size

        Returns:
            a ReportOutput holding the path returned by PlotlyUtil.write_image_to_file.
        """
        def sorted_int_values(column):
            # unique values of the column, converted to int and sorted ascending
            return sorted([int(value) for value in overlap_df[column].unique()])

        plot_title = "Amount of overlapping motifs per implant rate"
        file_path = self.result_path / "motif_overlap.html"

        axis_labels = {
            "implant_rate": "Number of implanted ground truth motifs",
            "max_overlap": "Ground truth motif overlap",
            "motif_size": "Motif size",
        }
        colors = self._get_color_discrete_sequence()
        orders = {column: sorted_int_values(column) for column in ("implant_rate", "motif_size", "max_overlap")}

        figure = px.histogram(
            overlap_df,
            x="implant_rate",
            labels=axis_labels,
            facet_col="max_overlap",
            color_discrete_sequence=colors,
            category_orders=orders,
            facet_col_spacing=0.05,
            color="motif_size",
            title=plot_title,
            template="plotly_white",
        )
        # matches=None: the y-axes of the facets are not linked to each other
        figure.update_yaxes(matches=None, showticklabels=True)
        figure.update_layout(yaxis_title="Number of overlapping learned motifs")

        file_path = PlotlyUtil.write_image_to_file(figure, file_path)

        return ReportOutput(path=file_path, name=plot_title)


    # [docs] -- Sphinx source-link marker left over from the HTML page, not part of the code
    def check_prerequisites(self):
        """
        Check whether the report can be run on the current dataset.

        Returns:
            True if the dataset has encoded data (including its info) and the encoding name is
            MotifEncoder; otherwise a warning is logged and False is returned.
        """
        valid_encodings = [MotifEncoder.__name__]
        encoded_data = self.dataset.encoded_data

        if encoded_data is None or encoded_data.info is None:
            logging.warning("GroundTruthMotifOverlap: the dataset is not encoded, skipping this report...")
            return False

        if encoded_data.encoding not in valid_encodings:
            logging.warning(
                f"GroundTruthMotifOverlap: the dataset encoding ({encoded_data.encoding}) was not in the list of valid "
                f"encodings ({valid_encodings}), skipping this report..."
            )
            return False

        return True