Source code for immuneML.reports.encoding_reports.Matches

import itertools
import warnings
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.PathBuilder import PathBuilder


[docs]class Matches(EncodingReport): """ Reports the number of matches that were found when using one of the following encoders: * :ref:`MatchedSequences` encoder * :ref:`MatchedReceptors` encoder * :ref:`MatchedRegex` encoder Report results are: * A table containing all matches, where the rows correspond to the Repertoires, and the columns correspond to the objects to match (regular expressions or receptor sequences). * The repertoire sizes (read frequencies and the number of unique sequences per repertoire), for each of the chains. This can be used to calculate the percentage of matched sequences in a repertoire. * When using :ref:`MatchedSequences` encoder or :ref:`MatchedReceptors` encoder, tables describing the chains and receptors (ids, chains, V and J genes and sequences). * When using :ref:`MatchedReceptors` encoder or using :ref:`MatchedRegex` encoder with chain pairs, tables describing the paired matches (where a match was found in both chains) per repertoire. YAML Specification: .. indent with spaces .. code-block:: yaml my_match_report: Matches """
[docs] @classmethod def build_object(cls, **kwargs): return Matches(**kwargs)
def __init__(self, dataset: RepertoireDataset = None, result_path: Path = None, name: str = None): super().__init__(name) self.dataset = dataset self.result_path = result_path self.name = name def _generate(self) -> ReportResult: PathBuilder.build(self.result_path) return self._write_reports() def _write_reports(self) -> ReportResult: all_matches_table = self._write_match_table() repertoire_sizes = self._write_repertoire_sizes() output_tables = [all_matches_table, repertoire_sizes] if self.dataset.encoded_data.encoding == "MatchedSequencesEncoder": output_tables += self._write_sequence_info(self.result_path / "sequence_info") else: if len(self.dataset.encoded_data.feature_annotations["chain"].unique()) == 2: output_tables += self._write_paired_matches(self.result_path / "paired_matches") if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder": output_tables += self._write_receptor_info(self.result_path / "receptor_info") return ReportResult(self.name, output_tables=output_tables) def _write_match_table(self): id_df = pd.DataFrame({"repertoire_id": self.dataset.encoded_data.example_ids}) label_df = pd.DataFrame(self.dataset.encoded_data.labels) matches_df = pd.DataFrame(self.dataset.encoded_data.examples, columns=self.dataset.encoded_data.feature_names) result_path = self.result_path / "complete_match_count_table.csv" id_df.join(label_df).join(matches_df).to_csv(result_path, index=False) return ReportOutput(result_path, "All matches") def _write_paired_matches(self, paired_matches_path: Path) -> List[ReportOutput]: PathBuilder.build(paired_matches_path) report_outputs = [] for i in range(0, len(self.dataset.encoded_data.example_ids)): # todo don't mention subject in the name twice file_name = "example_{}_".format(self.dataset.encoded_data.example_ids[i]) file_name += "_".join(["{label}_{value}".format(label=label, value=values[i]) for label, values in self.dataset.encoded_data.labels.items()]) file_name += ".csv" file_path = paired_matches_path / file_name if self.dataset.encoded_data.encoding == "MatchedReceptorsEncoder": self._write_paired_receptor_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path) elif self.dataset.encoded_data.encoding == "MatchedRegexEncoder": self._write_paired_regex_matches_for_repertoire(self.dataset.encoded_data.examples[i], file_path) report_outputs.append(ReportOutput(file_path, f"Example {self.dataset.encoded_data.example_ids[i]} paired matches")) return report_outputs def _write_paired_receptor_matches_for_repertoire(self, matches, filename): match_identifiers = [] match_values = [] for i in range(0, int(len(matches) / 2)): first_match_idx = i * 2 second_match_idx = i * 2 + 1 if matches[first_match_idx] > 0 and matches[second_match_idx] > 0: match_identifiers.append(self.dataset.encoded_data.feature_names[first_match_idx]) match_identifiers.append(self.dataset.encoded_data.feature_names[second_match_idx]) match_values.append(matches[first_match_idx]) match_values.append(matches[second_match_idx]) results_df = pd.DataFrame([match_values], columns=match_identifiers) results_df.to_csv(filename, index=False) def _write_paired_regex_matches_for_repertoire(self, matches, filename): match_identifiers = [] match_values = [] annotation_df = self.dataset.encoded_data.feature_annotations for receptor_id in sorted(set(annotation_df["receptor_id"])): chain_ids = list(annotation_df.loc[annotation_df["receptor_id"] == receptor_id]["chain_id"]) if len(chain_ids) == 2: first_match_idx = self.dataset.encoded_data.feature_names.index(chain_ids[0]) second_match_idx = self.dataset.encoded_data.feature_names.index(chain_ids[1]) if matches[first_match_idx] > 0 and matches[second_match_idx] > 0: match_identifiers.append(chain_ids[0]) match_identifiers.append(chain_ids[1]) match_values.append(matches[first_match_idx]) match_values.append(matches[second_match_idx]) results_df = pd.DataFrame([match_values], columns=match_identifiers) results_df.to_csv(filename, index=False) def _write_repertoire_sizes(self): """ Writes the repertoire sizes (# clones & # reads) per subject, per chain. """ all_subjects = self.dataset.encoded_data.example_ids all_chains = sorted(set(self.dataset.encoded_data.feature_annotations["chain"])) results_df = pd.DataFrame(list(itertools.product(all_subjects, all_chains)), columns=["subject_id", "chain"]) results_df["n_reads"] = 0 results_df["n_clones"] = 0 for repertoire in self.dataset.repertoires: rep_counts = repertoire.get_counts() rep_chains = repertoire.get_chains() for chain in all_chains: indices = rep_chains == Chain.get_chain(chain.upper()) results_df.loc[(results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain), 'n_reads'] += np.sum(rep_counts[indices]) results_df.loc[(results_df.subject_id == repertoire.metadata["subject_id"]) & (results_df.chain == chain), 'n_clones'] += len(rep_counts[indices]) results_path = self.result_path / "repertoire_sizes.csv" results_df.to_csv(results_path, index=False) return ReportOutput(results_path, "Repertoire sizes") def _write_receptor_info(self, receptor_info_path) -> List[ReportOutput]: PathBuilder.build(receptor_info_path) receptor_chains = self.dataset.encoded_data.feature_annotations chain_types = receptor_chains["chain"].unique() first_chains = receptor_chains.loc[receptor_chains.chain == chain_types[0]] second_chains = receptor_chains.loc[receptor_chains.chain == chain_types[1]] first_chains.drop(columns=["chain"], inplace=True) second_chains.drop(columns=["chain"], inplace=True) on_cols = ["receptor_id"] if "clonotype_id" in second_chains.columns and first_chains.columns: on_cols += ["clonotype_id"] receptors = pd.merge(first_chains, second_chains, on=on_cols, suffixes=(f"_{chain_types[0]}", f"_{chain_types[1]}")) unique_alpha_chains = first_chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"]) unique_beta_chains = second_chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"]) unique_receptors = receptors.drop_duplicates(subset=[f"sequence_{chain_types[0]}", f"v_gene_{chain_types[0]}", f"j_gene_{chain_types[0]}", f"sequence_{chain_types[1]}", f"v_gene_{chain_types[1]}", f"j_gene_{chain_types[1]}"]) receptor_chains_path = receptor_info_path / "all_chains.csv" receptor_chains.to_csv(receptor_chains_path, index=False) receptors_path = receptor_info_path / "all_receptors.csv" receptors.to_csv(receptors_path, index=False) unique_chain1_path = receptor_info_path / f"unique_{chain_types[0]}_chains.csv" unique_alpha_chains.to_csv(unique_chain1_path, index=False) unique_chain2_path = receptor_info_path / f"unique_{chain_types[1]}_chains.csv" unique_beta_chains.to_csv(unique_chain2_path, index=False) unique_receptors_path = receptor_info_path / "unique_receptors.csv" unique_receptors.to_csv(unique_receptors_path, index=False) return [ReportOutput(path=path, name=name) for path, name in [(receptors_path, "All receptors info"), (receptor_chains_path, "All receptor chains info"), (unique_receptors_path, "Unique receptors info"), (unique_chain1_path, "Unique chain 1 info"), (unique_chain2_path, "Unique chain 2 info")]] def _write_sequence_info(self, sequence_info_path) -> List[ReportOutput]: PathBuilder.build(sequence_info_path) chains = self.dataset.encoded_data.feature_annotations unique_chains = chains.drop_duplicates(subset=["sequence", "v_gene", "j_gene"]) chains_path = sequence_info_path / "all_chains.csv" chains.to_csv(chains_path, index=False) unique_chains_path = sequence_info_path / "unique_chains.csv" unique_chains.to_csv(unique_chains_path, index=False) return [ReportOutput(path=path, name=name) for path, name in [(chains_path, "All chains info"), (unique_chains_path, "Unique chains info")]]
[docs] def check_prerequisites(self): if self.dataset.encoded_data is None or self.dataset.encoded_data.examples is None: warnings.warn(f"No encoding was specified for dataset {self.dataset.identifier}. Please use one of the following encodings: MatchedReceptorsEncoder, MatchedSequencesEncoder, MatchedRegexEncoder. Matches report will not be created.") return False if self.dataset.encoded_data.encoding not in ("MatchedReceptorsEncoder", "MatchedSequencesEncoder", "MatchedRegexEncoder"): warnings.warn(f"Encoding {self.dataset.encoded_data.encoding} is not compatible with this report type. Matches report will not be created.") return False else: return True