Source code for immuneML.encodings.reference_encoding.MatchedRegexEncoder

import abc
from pathlib import Path

import numpy as np
import pandas as pd

from immuneML.caching.CacheHandler import CacheHandler
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.ReflectionHandler import ReflectionHandler
from scripts.specification_util import update_docs_per_mapping

class MatchedRegexEncoder(DatasetEncoder):
    """
    Encodes the dataset based on the matches between a RepertoireDataset and a collection of regular expressions.
    For each regular expression, the number of sequences in the RepertoireDataset containing the expression is counted.
    This can also be used to count how often a subsequence occurs in a RepertoireDataset.

    The regular expressions are defined per chain, and it is possible to require a V gene match in addition to the
    CDR3 sequence containing the regular expression.

    This encoding should be used in combination with the :ref:`Matches` report.


    Arguments:

        match_v_genes (bool): Whether V gene matches are required. If this is True, a match is only counted if the
        V gene matches the gene specified in the motif input file. By default match_v_genes is False.

        sum_counts (bool): When counting the number of matches, one can choose to count the number of matching
        sequences or sum the frequencies of those sequences. If sum_counts is True, the sequence frequencies are
        summed. Otherwise, if sum_counts is False, the number of matching unique sequences is counted.
        By default sum_counts is False.

        motif_filepath (str): The path to the motif input file. This should be a tab separated file containing a
        column named 'id' and for every chain that should be matched a column containing the regex (<chain>_regex)
        and a column containing the V gene (<chain>V) if match_v_genes is True.
        The chains are specified by their three letter code, see :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.

    In the simplest case, when counting the number of occurrences of a given list of k-mers in TRB sequences,
    the contents of the motif file could look like this:

        ====  ==========
        id    TRB_regex
        ====  ==========
        1     ACG
        2     EDNA
        3     DFWG
        ====  ==========

    It is also possible to test whether paired regular expressions occur in the dataset (for example: regular
    expressions matching both a TRA chain and a TRB chain) by specifying them on the same line.
    In a more complex case where both paired and unpaired regular expressions are specified, in addition to
    matching the V genes, the contents of the motif file could look like this:

        ====  ==========  =======  ==========  ========
        id    TRA_regex   TRAV     TRB_regex   TRBV
        ====  ==========  =======  ==========  ========
        1     AGQ.GSS     TRAV35   S[APL]GQY   TRBV29-1
        2                          ASS.R.*     TRBV7-3
        ====  ==========  =======  ==========  ========


    YAML Specification:

    .. indent with spaces
    .. code-block:: yaml

        my_mr_encoding:
            MatchedRegex:
                motif_filepath: path/to/file.txt
                match_v_genes: True
                sum_counts: False

    """

    # Maps the class name of a supported dataset to the name of the concrete encoder class
    # that build_object resolves through ReflectionHandler.
    dataset_mapping = {
        "RepertoireDataset": "MatchedRegexRepertoireEncoder"
    }

    def __init__(self, motif_filepath: Path, match_v_genes: bool, sum_counts: bool, chains: list, name: str = None):
        """
        Store the encoder configuration.

        Arguments:
            motif_filepath (Path): path to the tab separated motif file.
            match_v_genes (bool): whether V gene matches are required.
            sum_counts (bool): whether sequence frequencies are summed instead of counting sequences.
            chains (list): chain codes, as derived from the ``<chain>_regex`` columns by _prepare_parameters.
            name (str): optional name of this encoder.
        """
        self.motif_filepath = motif_filepath
        self.match_v_genes = match_v_genes
        self.sum_counts = sum_counts
        self.chains = chains
        # Both are filled in by _load_regex_df.
        self.regex_df = None
        self.feature_count = None
        self.name = name

    @staticmethod
    def _prepare_parameters(motif_filepath: str, match_v_genes: bool, sum_counts: bool, name: str = None):
        """
        Validate the user supplied parameters and derive the constructor arguments from them.

        Only the header of the motif file is read here (``nrows=0``) in order to check the column names and
        to derive the list of chains from the ``<chain>_regex`` columns.

        Returns:
            dict with the keys motif_filepath (as Path), match_v_genes, sum_counts, chains and name.

        Raises:
            AssertionError: if the motif file does not exist, or if match_v_genes is True and a ``<chain>V``
                column is missing for one of the chains that has a ``<chain>_regex`` column.
        """
        ParameterValidator.assert_type_and_value(match_v_genes, bool, "MatchedRegexEncoder", "match_v_genes")
        ParameterValidator.assert_type_and_value(sum_counts, bool, "MatchedRegexEncoder", "sum_counts")

        motif_filepath = Path(motif_filepath)
        assert motif_filepath.is_file(), f"MatchedRegexEncoder: the file {motif_filepath} does not exist. " \
                                         f"Specify the correct path under motif_filepath."

        file_columns = list(pd.read_csv(motif_filepath, sep="\t", iterator=False, dtype=str, nrows=0).columns)

        ParameterValidator.assert_all_in_valid_list(file_columns,
                                                    ["id"] + [f"{c.value}V" for c in Chain] + [f"{c.value}_regex" for c in Chain],
                                                    "MatchedRegexEncoder", "motif_filepath (column names)")

        chains = [colname.split("_")[0] for colname in file_columns if colname.endswith("_regex")]

        if match_v_genes:
            for chain in chains:
                assert f"{chain}V" in file_columns, f"MatchedRegexEncoder: expected column {chain}V to be present in the columns of motif_filepath. " \
                                                    f"Remove {chain}_regex from columns, or set match_v_genes to False."

        return {
            "motif_filepath": motif_filepath,
            "match_v_genes": match_v_genes,
            "sum_counts": sum_counts,
            "chains": chains,
            "name": name
        }

    @staticmethod
    def build_object(dataset=None, **params):
        """
        Create the concrete encoder registered in dataset_mapping for the type of the given dataset.

        Arguments:
            dataset: the dataset to be encoded; only its class name is used here.
            **params: the user supplied parameters, passed on to _prepare_parameters.

        Returns:
            an instance of the encoder class registered for the dataset type.

        Raises:
            ValueError: if no encoder is registered for the dataset type, or if a ValueError is raised while
                preparing the parameters or instantiating the encoder.
        """
        dataset_type = dataset.__class__.__name__
        not_defined_message = "{} is not defined for dataset of type {}.".format(MatchedRegexEncoder.__name__,
                                                                                 dataset_type)

        # A plain dict lookup raises KeyError (not ValueError) for an unsupported dataset type, so the
        # membership is checked explicitly to make sure the descriptive error is what the caller sees.
        if dataset_type not in MatchedRegexEncoder.dataset_mapping:
            raise ValueError(not_defined_message)

        try:
            prepared_params = MatchedRegexEncoder._prepare_parameters(**params)
            encoder_class = ReflectionHandler.get_class_by_name(MatchedRegexEncoder.dataset_mapping[dataset_type],
                                                                "reference_encoding/")
            encoder = encoder_class(**prepared_params)
        except ValueError as err:
            # Chain the original error so that its cause is not lost behind the generic message.
            raise ValueError(not_defined_message) from err

        return encoder

    def encode(self, dataset, params: EncoderParams):
        """
        Return the encoded dataset.

        A cache key is generated from _prepare_caching_params and handed to CacheHandler.memo together with
        a callable that encodes the dataset through _encode_new_dataset.
        """
        cache_key = CacheHandler.generate_cache_key(self._prepare_caching_params(dataset, params))
        encoded_dataset = CacheHandler.memo(cache_key,
                                            lambda: self._encode_new_dataset(dataset, params))

        return encoded_dataset

    def _prepare_caching_params(self, dataset, params: EncoderParams):
        """
        Collect everything the cache key is generated from: the example ids, metadata file and type of the
        dataset, the label names, the encoder name, the learn_model flag and all attributes of this encoder.
        """
        return (("dataset_identifiers", tuple(dataset.get_example_ids())),
                ("dataset_metadata", dataset.metadata_file),
                ("dataset_type", dataset.__class__.__name__),
                ("labels", tuple(params.label_config.get_labels_by_name())),
                ("encoding", MatchedRegexEncoder.__name__),
                ("learn_model", params.learn_model),
                ("encoding_params", tuple(vars(self).items())))

    @abc.abstractmethod
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Encode the dataset without consulting the cache; implemented by the dataset specific subclasses."""
        pass

    def _load_regex_df(self):
        """
        Read the motif file and store it in self.regex_df, and store the number of regular expressions
        (non-empty cells over all ``<chain>_regex`` columns) in self.feature_count.

        When match_v_genes is False the ``<chain>V`` columns are dropped. Rows that are identical in all
        columns except 'id' are removed, and missing values are replaced by None.
        """
        df = pd.read_csv(self.motif_filepath, sep="\t", iterator=False, dtype=str)

        if not self.match_v_genes:
            v_gene_columns = [f"{chain.value}V" for chain in Chain]
            df.drop(columns=[colname for colname in v_gene_columns if colname in df.columns], inplace=True)

        # Duplicates are determined on everything but the 'id' column.
        colnames_subset = list(df.columns)
        colnames_subset.remove("id")
        df.drop_duplicates(subset=colnames_subset, inplace=True)

        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
        df.replace({np.nan: None}, inplace=True)

        self.feature_count = 0
        for chain in Chain:
            regex_colname = f"{chain.value}_regex"
            if regex_colname in df.columns:
                self.feature_count += df[regex_colname].count()

        self.regex_df = df

    @staticmethod
    def get_documentation():
        """
        Return the class docstring in which the reference to the Chain class is replaced by the list of
        valid chain values.
        """
        doc = str(MatchedRegexEncoder.__doc__)

        chain_values = str([region_type.value for region_type in Chain])[1:-1].replace("'", "`")
        # NOTE: the key has to match the sentence in the class docstring verbatim (on a single line).
        mapping = {
            "The chains are specified by their three letter code, see :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.":
                f"The chains are specified by their three letter code, valid values are: {chain_values}.",
        }
        doc = update_docs_per_mapping(doc, mapping)
        return doc