"""Source code for immuneML.preprocessing.filters.MetadataFilter."""
from pathlib import Path
import numpy as np
from immuneML.analysis.criteria_matches.CriteriaMatcher import CriteriaMatcher
from immuneML.analysis.criteria_matches.CriteriaTypeInstantiator import CriteriaTypeInstantiator
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.preprocessing.filters.Filter import Filter
from immuneML.util.PathBuilder import PathBuilder
class MetadataFilter(Filter):
    """
    Removes examples from a dataset based on the examples' metadata. It works for any dataset type. Note that
    for repertoire datasets, this means that repertoires will be filtered out, and for sequence datasets - sequences.

    Since this filter changes the number of examples, it cannot be used with
    :ref:`TrainMLModel` instruction. Use with DatasetExport instruction instead.

    **Specification arguments:**

    - criteria (dict): a nested dictionary that specifies the criteria for keeping the dataset examples based on the
      column values; it contains the type of evaluation, name of the column, and additional parameters depending on
      evaluation; alternatively, it can contain a combination of multiple (evaluation, column, parameters) groups;
      evaluation_types: IN, NOT_IN, NOT_NA, GREATER_THAN, LESS_THAN, TOP_N, RANDOM_N; for IN, NOT_IN the parameter name
      is 'values', for GREATER_THAN, LESS_THAN the parameter name is 'threshold' and for TOP_N, RANDOM_N the parameter
      name is 'number'; supported boolean combinations of groups are AND and OR with (evaluation, column, parameter)
      groups specified under 'operands' key; see the YAML below for example.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    # Example filter that keeps e.g., repertoires with values greater than 1 in the "my_column_name"
                    # column of the metadata_file
                    MetadataFilter:
                        type: GREATER_THAN
                        column: my_column_name
                        threshold: 1
            my_second_preprocessing:
                - my_filter2: # only examples which in column "label" have values 'label_val1' or 'label_val2' are kept
                    MetadataFilter:
                        type: IN
                        values: [label_val1, label_val2]
                        column: label
            my_third_preprocessing_example:
                - my_combined_filter:
                    MetadataFilter:
                        # keeps examples with that have label_val1 or label_val2 in the column label and
                        # that at the same time have a value larger than 1.3 in another_metadata_column
                        type: AND
                        operands:
                            - type: IN
                              values: [label_val1, label_val2]
                              column: label
                            - type: GREATER_THAN
                              column: another_metadata_column
                              threshold: 1.3

    """

    def __init__(self, criteria: dict, result_path: Path = None):
        super().__init__(result_path)
        # Instantiate the typed criteria objects from the raw YAML dict once, at construction time.
        self.criteria = CriteriaTypeInstantiator.instantiate(criteria)
        # Collect every 'column' referenced anywhere in the (possibly nested) criteria, so that
        # only those metadata columns are fetched when the filter runs.
        self.column_names = extract_column_names(self.criteria)

    @classmethod
    def build_object(cls, **kwargs):
        """Alternate constructor used by the specification parser: all keyword arguments except
        the optional 'result_path' form the criteria dictionary."""
        # pop() removes 'result_path' from kwargs (if present) and returns it in one step,
        # leaving kwargs containing only the criteria definition.
        result_path = kwargs.pop("result_path", None)
        return cls(kwargs, result_path)

    def process_dataset(self, dataset: Dataset, result_path: Path, number_of_processes=1):
        """Return a new dataset containing only the examples whose metadata matches the criteria.

        Args:
            dataset: dataset of any type whose examples expose the referenced metadata columns
            result_path: where the filtered dataset is stored; falls back to the path given at construction
            number_of_processes: accepted for interface compatibility; not used by this filter

        Returns:
            a subset of the input dataset (same type) named "filtered"
        """
        self.result_path = result_path if result_path is not None else self.result_path
        PathBuilder.build(self.result_path)
        indices = self._get_matching_indices(dataset)
        return dataset.make_subset(indices, self.result_path, "filtered")

    def _get_matching_indices(self, dataset: Dataset):
        """Evaluate the criteria against the dataset metadata and return the indices of matching examples."""
        # Only the columns actually referenced by the criteria are loaded from the metadata.
        metadata = dataset.get_metadata(self.column_names, True)
        matches = CriteriaMatcher().match(self.criteria, metadata)
        # np.where on the boolean match vector yields the positional indices of kept examples.
        indices = np.where(matches)[0]
        return indices
def extract_column_names(d):
    """Recursively gather all values stored under a 'column' key in a nested criteria structure.

    Traverses dicts and lists depth-first; any value whose key is exactly 'column' is collected,
    every other value is descended into. Non-container leaves contribute nothing.

    Args:
        d: a nested structure of dicts and lists (typically a criteria specification)

    Returns:
        list of column names in depth-first encounter order
    """

    def _walk(node):
        # Yield column names lazily while walking the structure in the same
        # depth-first, insertion order as a plain recursive accumulation would.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "column":
                    yield value
                else:
                    yield from _walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from _walk(element)

    return list(_walk(d))