Source code for immuneML.reports.data_reports.SignificantKmerPositions

import logging
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceParams import RegionType
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.KmerHelper import KmerHelper
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper


[docs] class SignificantKmerPositions(DataReport): """ Plots the number of significant k-mers (as computed by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` using Fisher's exact test) observed at each IMGT position of a given list of reference sequences. This report creates a stacked bar chart, where each bar represents an IMGT position, and each segment of the stack represents the observed frequency of one 'significant' k-mer at that position. **Specification arguments:** - reference_sequences_path (str): Path to a file containing the reference sequences, The file should contain one sequence per line, without a header, and without V or J genes. - p_values (list): The p value thresholds to be used by Fisher's exact test. Each p-value specified here will become one panel in the output figure. - k_values (list): Length of the k-mers (number of amino acids) created by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`. Each k-mer length will become one panel in the output figure. - label (dict): A label configuration. One label should be specified, and the positive_class for this label should be defined. See the YAML specification below for an example. - sequence_type (str): nucleotide or amino_acid - region_type (str): which AIRR field to consider, e.g., IMGT_CDR3 or IMGT_JUNCTION **YAML specification:** .. indent with spaces .. code-block:: yaml definitions: reports: my_significant_kmer_positions_report: SignificantKmerPositions: reference_sequences_path: path/to/reference/sequences.txt p_values: - 0.1 - 0.01 - 0.001 - 0.0001 k_values: - 3 - 4 - 5 label: # Define a label, and the positive class for that given label CMV: positive_class: + """
[docs] @classmethod def build_object(cls, **kwargs): location = SignificantKmerPositions.__name__ kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, location) kwargs = SignificantFeaturesHelper.parse_sequences_path(kwargs, "reference_sequences_path", location) ParameterValidator.assert_all_type_and_value(kwargs["k_values"], int, location, "k_values") return SignificantKmerPositions(**kwargs)
def __init__(self, dataset: RepertoireDataset = None, reference_sequences_path: Path = None, p_values: List[float] = None, k_values: List[int] = None, label: dict = None, compairr_path: Path = None, result_path: Path = None, name: str = None, number_of_processes: int = 1, region_type: RegionType = None, sequence_type: SequenceType = None): super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name) self.reference_sequences_path = reference_sequences_path self.reference_sequences = SignificantFeaturesHelper.load_sequences(reference_sequences_path) self.p_values = p_values self.k_values = k_values self.label = label self.compairr_path = compairr_path self.label_config = None self.region_type = RegionType[region_type.upper()] if isinstance(region_type, str) else region_type self.sequence_type = SequenceType[sequence_type.upper()] if isinstance(sequence_type, str) else sequence_type
[docs] def check_prerequisites(self): if isinstance(self.dataset, RepertoireDataset): return True else: logging.warning(f"{SignificantKmerPositions.__name__}: report can be generated only from RepertoireDataset. " f"Skipping this report...") return False
def _generate(self) -> ReportResult: self.label_config = LabelHelper.create_label_config([self.label], self.dataset, SignificantKmerPositions.__name__, f"{SignificantKmerPositions.__name__}/label") plotting_data = self._compute_plotting_data() table_result = self._write_results_table(plotting_data) report_output_fig = self._safe_plot(plotting_data=plotting_data) output_figures = None if report_output_fig is None else [report_output_fig] return ReportResult(name=self.name, info="The number of significant k-mers observed at each IMGT position of a given list " "of reference sequences.", output_figures=output_figures, output_tables=[table_result]) def _compute_plotting_data(self): result = {"encoding": [], "p-value": [], "imgt_position": [], "k-mer": [], "count": []} for k in self.k_values: for p_value in self.p_values: significant_kmer_positions = self._compute_significant_kmer_positions(k, p_value) for imgt_pos, kmer_dict in significant_kmer_positions.items(): for kmer, count in kmer_dict.items(): result["encoding"].append(f"{k}-mer") result["p-value"].append(p_value) result["imgt_position"].append(str(imgt_pos)) result["k-mer"].append(kmer) result["count"].append(count) return pd.DataFrame(result).astype({'imgt_position': str}) def _get_encoder_result_path(self, k, p_value): result_path = self.result_path / f"{k}-mer_{p_value}" PathBuilder.build(result_path) return result_path def _write_results_table(self, data) -> ReportOutput: table_path = self.result_path / f"significant_kmer_positions_report.csv" data.to_csv(table_path, index=False) return ReportOutput(table_path, "Number of significant k-mers found at each position in a set of reference sequences") def _plot(self, plotting_data): figure = px.bar(plotting_data, x="imgt_position", y="count", color="k-mer", facet_row="encoding", facet_col="p-value", labels={ "encoding": "Encoding", "imgt_position": "sequence position (IMGT scheme)", "count": "Number of significant k-mers observed" }, template="plotly_white", category_orders={ "imgt_position": self._get_imgt_position_order(set(plotting_data["imgt_position"])) }, barmode="stack", color_discrete_sequence=px.colors.diverging.Tealrose) file_path = self.result_path / f"significant_kmer_positions.html" figure.write_html(str(file_path)) return ReportOutput(file_path, name="Significant k-mers observed at each position in the reference sequences") def _get_imgt_position_order(self, imgt_positions): sorted_positions = sorted([float(pos) for pos in imgt_positions]) return [str(pos_float) if int(pos_float) != pos_float else str(int(pos_float)) for pos_float in sorted_positions] def _compute_significant_kmer_positions(self, k, p_value): significant_kmers = self._compute_significant_kmers(k, p_value) results = {} for sequence in self.reference_sequences: reference_imgt_kmers = KmerHelper.create_IMGT_kmers_from_string(sequence, k, region_type=RegionType.IMGT_CDR3) for kmer, imgt_pos in reference_imgt_kmers: if imgt_pos not in results: results[imgt_pos] = {} if kmer in significant_kmers: if kmer in results[imgt_pos]: results[imgt_pos][kmer] += 1 else: results[imgt_pos][kmer] = 1 return results def _compute_significant_kmers(self, k, p_value): encoder_result_path = self._get_encoder_result_path(k, p_value) encoder_params = SignificantFeaturesHelper._build_encoder_params(self.label_config, encoder_result_path, self.region_type, self.sequence_type) encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params) sequences = pd.read_csv(encoder.relevant_sequence_path) return list(sequences["k-mer"])