Source code for immuneML.encodings.baseline_encoding.MetadataEncoder
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from immuneML.data_model.EncodedData import EncodedData
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.util.ParameterValidator import ParameterValidator
[docs]
class MetadataEncoder(DatasetEncoder):
    """
    Encoder that uses metadata fields as features, such as HLA.

    Each selected metadata field is split on commas and one-hot encoded, so every distinct value of a field
    becomes one binary feature named ``<field>_<value>``.

    **Dataset type:**

    - RepertoireDatasets

    - SequenceDatasets

    - ReceptorDatasets

    **Specification arguments:**

    - metadata_fields (list): List of metadata fields to use as features.

    **YAML specification:**

    .. code-block:: yaml

        encodings:
            metadata_encoding:
                Metadata:
                    metadata_fields: [HLA, sex]

    """

    def __init__(self, metadata_fields: list, name: str = None):
        super().__init__(name=name)
        self.metadata_fields = metadata_fields
        # Maps each metadata field to its fitted binarizer; stays None until encode() runs with learn_model=True.
        self.mlbs = None

    @staticmethod
    def build_object(dataset: Dataset, **params):
        """
        Validate the specification parameters and build the encoder.

        ``metadata_fields`` must be a list of strings, each of which is one of ``dataset.get_label_names()``.
        The encoder name defaults to ``'metadata_encoding'`` when ``name`` is not given.
        """
        assert 'metadata_fields' in params, "Parameter 'metadata_fields' is required for MetadataEncoder."

        fields = params['metadata_fields']
        ParameterValidator.assert_type_and_value(fields, list, 'MetadataEncoder', 'metadata_fields')
        ParameterValidator.assert_all_type_and_value(fields, str, 'MetadataEncoder', 'metadata_fields')
        ParameterValidator.assert_all_in_valid_list(fields, dataset.get_label_names(),
                                                    'MetadataEncoder', 'metadata_fields')

        name = params.get('name', 'metadata_encoding')
        return MetadataEncoder(metadata_fields=fields, name=name)

    def encode(self, dataset, params: EncoderParams) -> Dataset:
        """
        One-hot encode the configured metadata fields of ``dataset``.

        When ``params.learn_model`` is true, a new binarizer is fitted per field and stored in ``self.mlbs``;
        otherwise the previously stored binarizers are reused, so the feature columns match the fitted ones.

        Returns:
            A clone of ``dataset`` whose ``encoded_data`` holds the sparse indicator matrix, the feature names,
            and (if ``params.encode_labels``) the label values.

        Raises:
            ValueError: if no metadata fields are configured.
            RuntimeError: if ``params.learn_model`` is false but the encoder has never been fitted.
        """
        if not self.metadata_fields:
            raise ValueError("MetadataEncoder: 'metadata_fields' is empty, there is nothing to encode.")

        if params.learn_model:
            # Reset so that every field gets a freshly fitted binarizer below.
            self.mlbs = {feature: None for feature in self.metadata_fields}
        elif self.mlbs is None:
            # Without this check the lookup below fails with an opaque "'NoneType' is not subscriptable".
            raise RuntimeError("MetadataEncoder: encode() was called with learn_model=False before the encoder "
                               "was fitted; encode a dataset with learn_model=True first.")

        metadata = dataset.get_metadata(self.metadata_fields, return_df=True)

        feature_frames = []
        feature_names = []
        for feature in self.metadata_fields:
            flattened, mlb = flatten_comma_separated_mlb(metadata, feature, self.mlbs[feature])
            self.mlbs[feature] = mlb
            feature_names.extend(f"{feature}_{c}" for c in mlb.classes_.tolist())
            feature_frames.append(flattened)

        # Concatenate once instead of re-copying the growing frame on every iteration.
        features = pd.concat(feature_frames, axis=1)

        # Convert to sparse matrix
        onehot_sparse = csr_matrix(features)

        labels = None
        if params.encode_labels:
            labels = {label: dataset.get_metadata([label])[label]
                      for label in params.label_config.get_labels_by_name()}

        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = EncodedData(
            examples=onehot_sparse,
            feature_names=feature_names,
            feature_annotations=pd.DataFrame({"feature": feature_names}),
            labels=labels,
            info={'metadata_fields': self.metadata_fields}
        )
        return encoded_dataset
[docs]
def _split_metadata_value(value):
    """Split one metadata cell on commas into a list of stripped, non-empty items."""
    if isinstance(value, str):
        text = value
    elif pd.api.types.is_scalar(value) and pd.isna(value):
        # Missing value (None / NaN / NA): the example simply has no labels for this field.
        return []
    else:
        # Numeric or boolean metadata: encode its string representation.
        text = str(value)
    return [item.strip() for item in text.split(',') if item.strip()]


def flatten_comma_separated_mlb(df, column_name, mlb=None):
    """
    Flatten comma-separated values using MultiLabelBinarizer.

    Parameters:
    -----------
    df : pandas DataFrame
        Input dataframe
    column_name : str
        Name of column containing comma-separated values. Missing values are treated as
        an empty set of labels; non-string values are converted to strings first.
    mlb : MultiLabelBinarizer, optional
        Already fitted binarizer to reuse. If None, a new one is created and fitted on this column.

    Returns:
    --------
    tuple of (pandas DataFrame, MultiLabelBinarizer)
        The DataFrame contains only the one-hot encoded columns, named ``<column_name>_<label>``
        and indexed like ``df``; the binarizer is the one that produced them.
    """
    # Split each cell into a clean list of labels (whitespace stripped, empty items dropped)
    split_values = [_split_metadata_value(value) for value in df[column_name].tolist()]

    # Create binary matrix
    if mlb is None:
        mlb = MultiLabelBinarizer()
        binary_matrix = mlb.fit_transform(split_values)
    else:
        binary_matrix = mlb.transform(split_values)

    # Create new column names
    new_columns = [f"{column_name}_{label}" for label in mlb.classes_]

    # Create DataFrame with binary columns
    binary_df = pd.DataFrame(binary_matrix, columns=new_columns, index=df.index)
    return binary_df, mlb