Source code for immuneML.preprocessing.filters.MetadataRepertoireFilter

from pathlib import Path

import numpy as np
import pandas as pd

from immuneML.analysis.criteria_matches.CriteriaMatcher import CriteriaMatcher
from immuneML.analysis.criteria_matches.CriteriaTypeInstantiator import CriteriaTypeInstantiator
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.preprocessing.filters.Filter import Filter


class MetadataRepertoireFilter(Filter):
    """
    Removes repertoires from a RepertoireDataset based on information stored in the metadata_file.
    This filter drops whole repertoires rather than individual sequences, so it can only be applied
    to RepertoireDatasets.

    Arguments:

        criteria (dict): a nested dictionary that specifies the criteria a repertoire's metadata has to match
        for the repertoire to be kept. See :py:obj:`~immuneML.analysis.criteria_matches.CriteriaMatcher.CriteriaMatcher`
        for a more detailed explanation.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        preprocessing_sequences:
            my_preprocessing:
                - my_filter:
                    # Example filter that keeps repertoires with values greater than 1 in the "my_column_name" column of the metadata_file
                    MetadataRepertoireFilter:
                        type: GREATER_THAN
                        value:
                            type: COLUMN
                            name: my_column_name
                        threshold: 1

    """

    def __init__(self, criteria: dict):
        # The raw criteria dict is converted once, at construction time, by CriteriaTypeInstantiator.
        self.criteria = CriteriaTypeInstantiator.instantiate(criteria)

    def process_dataset(self, dataset: RepertoireDataset, result_path: Path):
        """
        Applies this filter's criteria to the given dataset.

        Bundles result_path and the instantiated criteria into a params dict and delegates to
        :py:meth:`process`, returning whatever that call returns.
        """
        return MetadataRepertoireFilter.process(dataset, {"result_path": result_path, "criteria": self.criteria})

    @staticmethod
    def process(dataset: RepertoireDataset, params: dict) -> RepertoireDataset:
        """
        Returns a clone of the dataset that contains only the repertoires matching the criteria.

        Arguments:

            dataset (RepertoireDataset): the dataset to filter

            params (dict): must contain the keys "criteria" (the criteria to match against the metadata)
            and "result_path" (forwarded to build_new_metadata)

        Returns:

            the cloned dataset with its repertoires and metadata_file replaced by the filtered versions
        """
        # check_dataset_type, build_new_metadata and check_dataset_not_empty are not defined in this class;
        # presumably they are inherited from Filter.
        MetadataRepertoireFilter.check_dataset_type(dataset, [RepertoireDataset], "MetadataRepertoireFilter")

        filtered_dataset = dataset.clone()
        all_repertoires = filtered_dataset.get_data()
        kept_indices = MetadataRepertoireFilter.get_matching_indices(filtered_dataset, params["criteria"])

        filtered_dataset.repertoires = [all_repertoires[index] for index in kept_indices]
        # The new metadata file is built from the original dataset, restricted to the kept indices.
        filtered_dataset.metadata_file = MetadataRepertoireFilter.build_new_metadata(dataset, kept_indices,
                                                                                     params["result_path"])

        MetadataRepertoireFilter.check_dataset_not_empty(filtered_dataset, "MetadataRepertoireFilter")

        return filtered_dataset

    @staticmethod
    def get_matching_indices(dataset: RepertoireDataset, criteria):
        """
        Returns the positions of the metadata entries that satisfy the criteria.

        The result of dataset.get_metadata(None) is loaded into a DataFrame, matched against the
        criteria with CriteriaMatcher, and the indices where the match result is truthy are returned
        as a numpy array.
        """
        metadata_table = pd.DataFrame(dataset.get_metadata(None))
        match_mask = CriteriaMatcher().match(criteria, metadata_table)
        return np.where(match_mask)[0]