"""Source code for immuneML.preprocessing.filters.MetadataFilter."""
from pathlib import Path
import numpy as np
from immuneML.analysis.criteria_matches.CriteriaMatcher import CriteriaMatcher
from immuneML.analysis.criteria_matches.CriteriaTypeInstantiator import CriteriaTypeInstantiator
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.preprocessing.filters.Filter import Filter
from immuneML.util.PathBuilder import PathBuilder
class MetadataFilter(Filter):
    """
    Removes examples from a dataset based on the examples' metadata. It works for any dataset type. Note that
    for repertoire datasets, this means that repertoires will be filtered out, and for sequence datasets - sequences.

    Since this filter changes the number of examples, it cannot be used with
    :ref:`TrainMLModel` instruction. Use with DatasetExport instruction instead.

    **Specification arguments:**

    - criteria (dict): a nested dictionary that specifies the criteria for keeping the dataset examples based on the
      column values; it contains the type of evaluation, name of the column, and additional parameters depending on
      evaluation; alternatively, it can contain a combination of multiple (evaluation, column, parameters) groups;
      evaluation_types: IN, NOT_IN, NOT_NA, GREATER_THAN, LESS_THAN, TOP_N, RANDOM_N; for IN, NOT_IN the parameter name
      is 'values', for GREATER_THAN, LESS_THAN the parameter name is 'threshold' and for TOP_N, RANDOM_N the parameter
      name is 'number'; supported boolean combinations of groups are AND and OR with (evaluation, column, parameter)
      groups specified under 'operands' key; see the YAML below for example.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    # Example filter that keeps e.g., repertoires with values greater than 1 in the "my_column_name"
                    # column of the metadata_file
                    MetadataFilter:
                        type: GREATER_THAN
                        column: my_column_name
                        threshold: 1
            my_second_preprocessing:
                - my_filter2: # only examples which in column "label" have values 'label_val1' or 'label_val2' are kept
                    MetadataFilter:
                        type: IN
                        values: [label_val1, label_val2]
                        column: label
            my_third_preprocessing_example:
                - my_combined_filter:
                    MetadataFilter:
                        # keeps examples with that have label_val1 or label_val2 in the column label and
                        # that at the same time have a value larger than 1.3 in another_metadata_column
                        type: AND
                        operands:
                            - type: IN
                              values: [label_val1, label_val2]
                              column: label
                            - type: GREATER_THAN
                              column: another_metadata_column
                              threshold: 1.3

    """

    def __init__(self, criteria: dict, result_path: Path = None):
        super().__init__(result_path)
        # Instantiate the typed criteria objects from the raw YAML dict once, at construction time.
        self.criteria = CriteriaTypeInstantiator.instantiate(criteria)
        # Collect every 'column' referenced anywhere in the (possibly nested) criteria, so that
        # only those metadata columns are fetched when the filter runs.
        self.column_names = extract_column_names(self.criteria)

    @classmethod
    def build_object(cls, **kwargs):
        """Alternate constructor used by the specification parser: all keyword arguments except
        the optional 'result_path' form the criteria dictionary."""
        # pop() removes 'result_path' from kwargs (if present) and returns it in one step,
        # leaving kwargs containing only the criteria definition.
        result_path = kwargs.pop("result_path", None)
        return cls(kwargs, result_path)

    def process_dataset(self, dataset: Dataset, result_path: Path, number_of_processes=1):
        """Return a new dataset containing only the examples whose metadata matches the criteria.

        Args:
            dataset: dataset of any type whose examples expose the referenced metadata columns
            result_path: where the filtered dataset is stored; falls back to the path given at construction
            number_of_processes: accepted for interface compatibility; not used by this filter

        Returns:
            a subset of the input dataset (same type) named "filtered"
        """
        self.result_path = result_path if result_path is not None else self.result_path
        PathBuilder.build(self.result_path)
        indices = self._get_matching_indices(dataset)
        return dataset.make_subset(indices, self.result_path, "filtered")

    def _get_matching_indices(self, dataset: Dataset):
        """Evaluate the criteria against the dataset metadata and return the indices of matching examples."""
        # Only the columns actually referenced by the criteria are loaded from the metadata.
        metadata = dataset.get_metadata(self.column_names, True)
        matches = CriteriaMatcher().match(self.criteria, metadata)
        # np.where on the boolean match vector yields the positional indices of kept examples.
        indices = np.where(matches)[0]
        return indices
def extract_column_names(d):
    """Recursively gather all values stored under a 'column' key in a nested criteria structure.

    Traverses dicts and lists depth-first; any value whose key is exactly 'column' is collected,
    every other value is descended into. Non-container leaves contribute nothing.

    Args:
        d: a nested structure of dicts and lists (typically a criteria specification)

    Returns:
        list of column names in depth-first encounter order
    """

    def _walk(node):
        # Yield column names lazily while walking the structure in the same
        # depth-first, insertion order as a plain recursive accumulation would.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "column":
                    yield value
                else:
                    yield from _walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from _walk(element)

    return list(_walk(d))