Source code for immuneML.reports.data_reports.SequencesWithSignificantKmers

import warnings
from pathlib import Path
from typing import List

import pandas as pd

from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper


[docs] class SequencesWithSignificantKmers(DataReport): """ Given a list of reference sequences, this report writes out the subsets of reference sequences containing significant k-mers (as computed by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` using Fisher's exact test). For each combination of p-value and k-mer size given, a file is written containing all sequences containing a significant k-mer of the given size at the given p-value. **Specification arguments:** - reference_sequences_path (str): Path to a file containing the reference sequences, The file should contain one sequence per line, without a header, and without V or J genes. - p_values (list): The p value thresholds to be used by Fisher's exact test. Each p-value specified here will become one panel in the output figure. - k_values (list): Length of the k-mers (number of amino acids) created by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`. Each k-mer length will become one panel in the output figure. - label (dict): A label configuration. One label should be specified, and the positive_class for this label should be defined. See the YAML specification below for an example. **YAML specification:** .. indent with spaces .. code-block:: yaml definitions: reports: my_sequences_with_significant_kmers: SequencesWithSignificantKmers: reference_sequences_path: path/to/reference/sequences.txt p_values: - 0.1 - 0.01 - 0.001 - 0.0001 k_values: - 3 - 4 - 5 label: # Define a label, and the positive class for that given label CMV: positive_class: + """
[docs] @classmethod def build_object(cls, **kwargs): location = SequencesWithSignificantKmers.__name__ kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, location) kwargs = SignificantFeaturesHelper.parse_sequences_path(kwargs, "reference_sequences_path", location) ParameterValidator.assert_all_type_and_value(kwargs["k_values"], int, location, "k_values") return SequencesWithSignificantKmers(**kwargs)
def __init__(self, dataset: RepertoireDataset = None, reference_sequences_path: Path = None, p_values: List[float] = None, k_values: List[int] = None, label: dict = None, result_path: Path = None, name: str = None, number_of_processes: int = 1, sequence_type: SequenceType = None, region_type: RegionType = None): super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name) self.reference_sequences_path = reference_sequences_path self.reference_sequences = SignificantFeaturesHelper.load_sequences(reference_sequences_path) self.p_values = p_values self.k_values = k_values self.label = label self.sequence_type = sequence_type self.region_type = region_type
[docs] def check_prerequisites(self): if isinstance(self.dataset, RepertoireDataset): return True else: warnings.warn(f"{SequencesWithSignificantKmers.__name__}: report can be generated only from RepertoireDataset. Skipping this report...") return False
def _generate(self) -> ReportResult: self.label_config = LabelHelper.create_label_config([self.label], self.dataset, SequencesWithSignificantKmers.__name__, f"{SequencesWithSignificantKmers.__name__}/label") report_outputs = self._write_output_files() return ReportResult(name=self.name, info="Given a list of reference sequences, this report writes out the subsets of reference " "sequences containing significant k-mers.", output_tables=report_outputs) def _write_output_files(self): report_outputs = [] for k in self.k_values: for p_value in self.p_values: significant_kmers = self._compute_significant_kmers(k, p_value) output_file_path = self._get_output_file_path(k, p_value) self._write_sequences_containing_significant_kmers(significant_kmers, output_file_path) report_outputs.append(ReportOutput(output_file_path, f"Sequences containing significant {k}-mers with p-value {p_value}")) return report_outputs def _get_encoder_result_path(self, k, p_value): result_path = self.result_path / f"{k}-mer_{p_value}" PathBuilder.build(result_path) return result_path def _get_output_file_path(self, k, p_value): return self.result_path / f"sequences_with_significant_{k}-mers_at_p={p_value}.txt" def _write_sequences_containing_significant_kmers(self, significant_kmers, output_file): with open(output_file, "w") as f: for sequence in self.reference_sequences: for kmer in significant_kmers: if kmer in sequence: f.write(sequence) f.write("\n") break f.close() def _compute_significant_kmers(self, k, p_value): encoder_result_path = self._get_encoder_result_path(k, p_value) encoder_params = SignificantFeaturesHelper._build_encoder_params(self.label_config, encoder_result_path, self.region_type, self.sequence_type) encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params) sequences = pd.read_csv(encoder.relevant_sequence_path) return list(sequences["k-mer"])