import abc
from pathlib import Path
import numpy as np
import pandas as pd
from immuneML.caching.CacheHandler import CacheHandler
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.ReflectionHandler import ReflectionHandler
from scripts.specification_util import update_docs_per_mapping


class MatchedRegexEncoder(DatasetEncoder):
"""
Encodes the dataset based on the matches between a RepertoireDataset and a collection of regular expressions.
For each regular expression, the number of sequences in the RepertoireDataset containing the expression is counted.
This can also be used to count how often a subsequence occurs in a RepertoireDataset.
The regular expressions are defined per chain, and it is possible to require a V gene match in addition to the
CDR3 sequence containing the regular expression.
This encoding should be used in combination with the :ref:`Matches`
report.
Arguments:
match_v_genes (bool): Whether V gene matches are required. If this is True, a match is only counted if the
V gene matches the gene specified in the motif input file. By default match_v_genes is False.
sum_counts (bool): When counting the number of matches, one can choose to count the number of matching sequences
or sum the frequencies of those sequences. If sum_counts is True, the sequence frequencies are summed. Otherwise,
if sum_counts is False, the number of matching unique sequences is counted. By default sum_counts is False.
motif_filepath (str): The path to the motif input file. This should be a tab separated file containing a
column named 'id' and for every chain that should be matched a column containing the regex (<chain>_regex) and a column containing
the V gene (<chain>V) if match_v_genes is True.
The chains are specified by their three letter code, see :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.
In the simplest case, when counting the number of occurrences of a given list of k-mers in TRB sequences, the contents of the motif file could look like this:
==== ==========
id TRB_regex
==== ==========
1 ACG
2 EDNA
3 DFWG
==== ==========
It is also possible to test whether paired regular expressions occur in the dataset (for example: regular expressions
matching both a TRA chain and a TRB chain) by specifying them on the same line.
In a more complex case where both paired and unpaired regular expressions are specified, in addition to matching the V
genes, the contents of the motif file could look like this:
==== ========== ======= ========== ========
id TRA_regex TRAV TRB_regex TRBV
==== ========== ======= ========== ========
1 AGQ.GSS TRAV35 S[APL]GQY TRBV29-1
2 ASS.R.* TRBV7-3
==== ========== ======= ========== ========
YAML Specification:
.. indent with spaces
.. code-block:: yaml
my_mr_encoding:
MatchedRegex:
motif_filepath: path/to/file.txt
match_v_genes: True
sum_counts: False
"""

    dataset_mapping = {
        "RepertoireDataset": "MatchedRegexRepertoireEncoder"
    }

    def __init__(self, motif_filepath: Path, match_v_genes: bool, sum_counts: bool, chains: list, name: str = None):
        self.motif_filepath = motif_filepath
        self.match_v_genes = match_v_genes
        self.sum_counts = sum_counts
        self.chains = chains
        self.regex_df = None
        self.feature_count = None
        self.name = name

    @staticmethod
    def _prepare_parameters(motif_filepath: str, match_v_genes: bool, sum_counts: bool, name: str = None):
        ParameterValidator.assert_type_and_value(match_v_genes, bool, "MatchedRegexEncoder", "match_v_genes")
        ParameterValidator.assert_type_and_value(sum_counts, bool, "MatchedRegexEncoder", "sum_counts")

        motif_filepath = Path(motif_filepath)
        assert motif_filepath.is_file(), f"MatchedRegexEncoder: the file {motif_filepath} does not exist. " \
                                         f"Specify the correct path under motif_filepath."

        file_columns = list(pd.read_csv(motif_filepath, sep="\t", iterator=False, dtype=str, nrows=0).columns)

        ParameterValidator.assert_all_in_valid_list(file_columns,
                                                    ["id"] + [f"{c.value}V" for c in Chain] + [f"{c.value}_regex" for c in Chain],
                                                    "MatchedRegexEncoder", "motif_filepath (column names)")

        # the chains to match are inferred from the <chain>_regex columns present in the motif file
        chains = [colname.split("_")[0] for colname in file_columns if colname.endswith("_regex")]

        if match_v_genes:
            for chain in chains:
                assert f"{chain}V" in file_columns, f"MatchedRegexEncoder: expected column {chain}V to be present in the columns of motif_filepath. " \
                                                    f"Remove {chain}_regex from columns, or set match_v_genes to False."

        return {
            "motif_filepath": motif_filepath,
            "match_v_genes": match_v_genes,
            "sum_counts": sum_counts,
            "chains": chains,
            "name": name
        }
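
    # Illustrative sketch (not part of the original class): for the two-chain motif file shown in the
    # docstring above, with match_v_genes=True and sum_counts=False, _prepare_parameters would return
    # something like:
    #
    #     {"motif_filepath": Path("path/to/file.txt"),
    #      "match_v_genes": True,
    #      "sum_counts": False,
    #      "chains": ["TRA", "TRB"],   # inferred from the TRA_regex and TRB_regex columns
    #      "name": None}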

    @staticmethod
    def build_object(dataset=None, **params):
        try:
            prepared_params = MatchedRegexEncoder._prepare_parameters(**params)
            encoder = ReflectionHandler.get_class_by_name(
                MatchedRegexEncoder.dataset_mapping[dataset.__class__.__name__], "reference_encoding/")(**prepared_params)
        except (KeyError, ValueError):
            # a dataset type without an entry in dataset_mapping raises KeyError, which is reported as an unsupported dataset type
            raise ValueError("{} is not defined for dataset of type {}.".format(MatchedRegexEncoder.__name__,
                                                                                dataset.__class__.__name__))
        return encoder

    def encode(self, dataset, params: EncoderParams):
        cache_key = CacheHandler.generate_cache_key(self._prepare_caching_params(dataset, params))
        encoded_dataset = CacheHandler.memo(cache_key,
                                            lambda: self._encode_new_dataset(dataset, params))

        return encoded_dataset

    def _prepare_caching_params(self, dataset, params: EncoderParams):
        return (("dataset_identifiers", tuple(dataset.get_example_ids())),
                ("dataset_metadata", dataset.metadata_file),
                ("dataset_type", dataset.__class__.__name__),
                ("labels", tuple(params.label_config.get_labels_by_name())),
                ("encoding", MatchedRegexEncoder.__name__),
                ("learn_model", params.learn_model),
                ("encoding_params", tuple(vars(self).items())))

    @abc.abstractmethod
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        pass

    def _load_regex_df(self):
        df = pd.read_csv(self.motif_filepath, sep="\t", iterator=False, dtype=str)

        if not self.match_v_genes:
            # V gene columns are irrelevant when V gene matching is disabled
            for v_gene in [f"{c.value}V" for c in Chain]:
                if v_gene in df.columns:
                    df.drop(v_gene, axis=1, inplace=True)

        colnames_subset = list(df.columns)
        colnames_subset.remove("id")
        df.drop_duplicates(subset=colnames_subset, inplace=True)

        df.replace({np.nan: None}, inplace=True)

        # the number of features equals the number of non-empty regex cells across all chains
        self.feature_count = 0
        for chain in Chain:
            regex_colname = f"{chain.value}_regex"
            if regex_colname in df.columns:
                self.feature_count += df[regex_colname].count()

        self.regex_df = df
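
    # Illustrative sketch (not part of the original class): after _load_regex_df() runs on the two-chain
    # motif file from the docstring with match_v_genes=False, the V gene columns are dropped and
    # self.regex_df would hold roughly:
    #
    #     id  TRA_regex  TRB_regex
    #     1   AGQ.GSS    S[APL]GQY
    #     2   None       ASS.R.*
    #
    # and self.feature_count would be 3 (one feature per non-empty regex cell).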

    @staticmethod
    def get_documentation():
        doc = str(MatchedRegexEncoder.__doc__)

        chain_values = str([region_type.value for region_type in Chain])[1:-1].replace("'", "`")
        mapping = {
            "The chains are specified by their three letter code, see :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.": f"The chains are specified by their three letter code, valid values are: {chain_values}.",
        }

        doc = update_docs_per_mapping(doc, mapping)
        return doc
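

# Minimal usage sketch (an addition for illustration, not part of the original module), assuming immuneML
# is installed: it writes the simple single-chain motif file from the docstring to a temporary location and
# checks how _prepare_parameters parses it. The actual encoding is performed by the dataset-specific subclass
# (MatchedRegexRepertoireEncoder) obtained through build_object and called via encode.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        motif_file = Path(tmp_dir) / "motifs.tsv"
        motif_file.write_text("id\tTRB_regex\n1\tACG\n2\tEDNA\n3\tDFWG\n")

        params = MatchedRegexEncoder._prepare_parameters(motif_filepath=str(motif_file),
                                                         match_v_genes=False,
                                                         sum_counts=False,
                                                         name="my_mr_encoding")

        # expected: chains inferred from the column names, here ["TRB"]
        print(params["chains"])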