Source code for immuneML.reports.encoding_reports.Matches

import itertools
import warnings
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.PathBuilder import PathBuilder


[docs]class Matches(EncodingReport):
    """
    Reports the number of matches that were found when using one of the following encoders:

    * :ref:`MatchedSequences` encoder
    * :ref:`MatchedReceptors` encoder
    * :ref:`MatchedRegex` encoder


    Report results are:

    * A table containing all matches, where the rows correspond to the Repertoires, and the
      columns correspond to the objects to match (regular expressions or receptor sequences).
    * The repertoire sizes (read frequencies and the number of unique sequences per repertoire), for each of the chains.
      This can be used to calculate the percentage of matched sequences in a repertoire.
    * When using :ref:`MatchedSequences` encoder or
      :ref:`MatchedReceptors` encoder, tables describing
      the chains and receptors (ids, chains, V and J genes and sequences).
    * When using :ref:`MatchedReceptors` encoder or using
      :ref:`MatchedRegex` encoder with chain pairs, tables describing
      the paired matches (where a match was found in both chains) per repertoire.


    YAML Specification:

    .. indent with spaces
    .. code-block:: yaml

        my_match_report: Matches
    """

[docs]    @classmethod
    def build_object(cls, **kwargs):
        return Matches(**kwargs)

    def __init__(self, dataset: RepertoireDataset = None, result_path: Path = None, name: str = None, number_of_processes: int = 1):
        super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)

    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        return self._write_reports()

    def _write_reports(self) -> ReportResult:
        all_matches_table = self._write_match_table()
        repertoire_sizes = self._write_repertoire_sizes()

        output_tables = [all_matches_table, repertoire_sizes]

        if self.dataset.encoded_data.encoding == "MatchedSequencesEncoder":
            output_tables += self._write_sequence_info(self.result_path / "sequence_info")
        else:
            if len(self.dataset.encoded_data.feature_annotations["chain"].unique()) == 2:
                output_tables += self._write_paired_matches(self.result_path / "paired_matches")

            if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder":
                output_tables += self._write_receptor_info(self.result_path / "receptor_info")

        return ReportResult(self.name,
                            info=f"Reports the number of matches that were found when using {self.dataset.encoded_data.encoding}",
                            output_tables=output_tables)

    def _write_match_table(self):
        id_df = pd.DataFrame({"repertoire_id": self.dataset.encoded_data.example_ids})
        label_df = pd.DataFrame(self.dataset.encoded_data.labels)
        matches_df = pd.DataFrame(self.dataset.encoded_data.examples, columns=self.dataset.encoded_data.feature_names)

        result_path = self.result_path / "complete_match_count_table.csv"
        id_df.join(label_df).join(matches_df).to_csv(result_path, index=False)

        return ReportOutput(result_path, "All matches")

    def _write_paired_matches(self, paired_matches_path: Path) -> List[ReportOutput]:
        PathBuilder.build(paired_matches_path)

        report_outputs = []
        for i in range(0, len(self.dataset.encoded_data.example_ids)): # todo don't mention subject in the name twice
            file_name = "example_{}_".format(self.dataset.encoded_data.example_ids[i])
            file_name += "_".join(["{label}_{value}".format(label=label, value=values[i]) for
                                  label, values in self.dataset.encoded_data.labels.items()])
            file_name += ".csv"
            file_path = paired_matches_path / file_name

            if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder":
                self._write_paired_receptor_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path)
            elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder":
                self._write_paired_regex_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path)

            report_outputs.append(ReportOutput(file_path, f"Example {self.dataset.encoded_data.example_ids[i]} paired matches"))

        return report_outputs

    def _write_paired_receptor_matches_for_repertoire(self, matches, filename):
        match_identifiers = []
        match_values = []

        for i in range(0, int(len(matches) / 2)):
            first_match_idx = i * 2
            second_match_idx = i * 2 + 1

            if matches[first_match_idx] > 0 and matches[second_match_idx] > 0:
                match_identifiers.append(self.dataset.encoded_data.feature_names[first_match_idx])
                match_identifiers.append(self.dataset.encoded_data.feature_names[second_match_idx])
                match_values.append(matches[first_match_idx])
                match_values.append(matches[second_match_idx])

        results_df = pd.DataFrame([match_values], columns=match_identifiers)
        results_df.to_csv(filename, index=False)

    def _write_paired_regex_matches_for_repertoire(self, matches, filename):
        match_identifiers = []
        match_values = []

        annotation_df = self.dataset.encoded_data.feature_annotations

        for receptor_id in sorted(set(annotation_df["receptor_id"])):
            chain_ids = list(annotation_df.loc[annotation_df["receptor_id"] == receptor_id]["chain_id"])

            if len(chain_ids) == 2:
                first_match_idx = self.dataset.encoded_data.feature_names.index(chain_ids[0])
                second_match_idx = self.dataset.encoded_data.feature_names.index(chain_ids[1])

                if matches[first_match_idx] > 0 and matches[second_match_idx] > 0:
                    match_identifiers.append(chain_ids[0])
                    match_identifiers.append(chain_ids[1])
                    match_values.append(matches[first_match_idx])
                    match_values.append(matches[second_match_idx])

        results_df = pd.DataFrame([match_values], columns=match_identifiers)
        results_df.to_csv(filename, index=False)

    def _write_repertoire_sizes(self):
        """
        Writes the repertoire sizes (# clones & # reads) per subject, per chain.
        """
        all_subjects = self.dataset.encoded_data.example_ids
        all_chains = sorted(set(self.dataset.encoded_data.feature_annotations["chain"]))

        results_df = pd.DataFrame(list(itertools.product(all_subjects, all_chains)),
                                  columns=["subject_id", "chain"])
        results_df["n_reads"] = 0
        results_df["n_clones"] = 0

        for repertoire in self.dataset.repertoires:
            rep_counts = repertoire.get_counts()
            rep_chains = repertoire.get_chains()

            for chain in all_chains:
                indices = rep_chains == Chain.get_chain(chain.upper())
                results_df.loc[(results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain),
                               'n_reads'] += np.sum(rep_counts[indices])
                results_df.loc[(results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain),
                               'n_clones'] += len(rep_counts[indices])

        results_path = self.result_path / "repertoire_sizes.csv"
        results_df.to_csv(results_path, index=False)

        return ReportOutput(results_path, "Repertoire sizes")

    def _write_receptor_info(self, receptor_info_path) -> List[ReportOutput]:
        PathBuilder.build(receptor_info_path)

        receptor_chains = self.dataset.encoded_data.feature_annotations
        chain_types = receptor_chains["chain"].unique()

        first_chains = receptor_chains.loc[receptor_chains.chain == chain_types[0]]
        second_chains = receptor_chains.loc[receptor_chains.chain == chain_types[1]]

        first_chains.drop(columns=["chain"], inplace=True)
        second_chains.drop(columns=["chain"], inplace=True)

        on_cols = ["receptor_id"]
        if "clonotype_id" in second_chains.columns and first_chains.columns:
            on_cols += ["clonotype_id"]

        receptors = pd.merge(first_chains, second_chains,
                             on=on_cols,
                             suffixes=(f"_{chain_types[0]}", f"_{chain_types[1]}"))

        unique_alpha_chains = first_chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"])
        unique_beta_chains = second_chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"])
        unique_receptors = receptors.drop_duplicates(subset=[f"sequence_{chain_types[0]}", f"v_gene_{chain_types[0]}", f"j_gene_{chain_types[0]}",
                                                             f"sequence_{chain_types[1]}", f"v_gene_{chain_types[1]}", f"j_gene_{chain_types[1]}"])

        receptor_chains_path = receptor_info_path / "all_chains.csv"
        receptor_chains.to_csv(receptor_chains_path, index=False)
        receptors_path = receptor_info_path / "all_receptors.csv"
        receptors.to_csv(receptors_path, index=False)
        unique_chain1_path = receptor_info_path / f"unique_{chain_types[0]}_chains.csv"
        unique_alpha_chains.to_csv(unique_chain1_path, index=False)
        unique_chain2_path = receptor_info_path / f"unique_{chain_types[1]}_chains.csv"
        unique_beta_chains.to_csv(unique_chain2_path, index=False)
        unique_receptors_path = receptor_info_path / "unique_receptors.csv"
        unique_receptors.to_csv(unique_receptors_path, index=False)

        return [ReportOutput(path=path, name=name) for path, name in [(receptors_path, "All receptors info"),
                                                                      (receptor_chains_path, "All receptor chains info"),
                                                                      (unique_receptors_path, "Unique receptors info"),
                                                                      (unique_chain1_path, "Unique chain 1 info"),
                                                                      (unique_chain2_path, "Unique chain 2 info")]]

    def _write_sequence_info(self, sequence_info_path) -> List[ReportOutput]:
        PathBuilder.build(sequence_info_path)

        chains = self.dataset.encoded_data.feature_annotations
        unique_chains = chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"])

        chains_path = sequence_info_path / "all_chains.csv"
        chains.to_csv(chains_path, index=False)
        unique_chains_path = sequence_info_path / "unique_chains.csv"
        unique_chains.to_csv(unique_chains_path, index=False)

        return [ReportOutput(path=path, name=name) for path, name in [(chains_path, "All chains info"), (unique_chains_path, "Unique chains info")]]

[docs]    def check_prerequisites(self):
        if self.dataset.encoded_data is None or self.dataset.encoded_data.examples is None:
            warnings.warn(f"No encoding was specified for dataset {self.dataset.identifier}. Please use one of the following encodings: MatchedReceptorsEncoder, MatchedSequencesEncoder, MatchedRegexEncoder. Matches report will not be created.")
            return False

        if self.dataset.encoded_data.encoding not in ("MatchedReceptorsEncoder", "MatchedSequencesEncoder", "MatchedRegexEncoder"):
            warnings.warn(f"Encoding {self.dataset.encoded_data.encoding} is not compatible with this report type. Matches report will not be created.")
            return False
        else:
            return True