from collections import Counter
from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.kmer_frequency.KmerFrequencyEncoder import KmerFrequencyEncoder
class KmerFreqReceptorEncoder(KmerFrequencyEncoder):
    """
    Receptor-level variant of KmerFrequencyEncoder.

    Every chain of a receptor is encoded separately through the inherited ``_encode_sequence``; the resulting
    feature keys are prefixed with the chain name and merged into a single Counter per receptor.
    """

    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """
        Encode ``dataset`` and return a clone of it carrying the encoding.

        The result of the inherited ``_encode_data`` is attached to ``dataset.clone()`` as ``encoded_data``
        rather than to the dataset that was passed in.
        """
        encoded_data = self._encode_data(dataset, params)

        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = encoded_data

        return encoded_dataset

    def _encode_examples(self, dataset: ReceptorDataset, params: EncoderParams):
        """
        Build one feature Counter per receptor in ``dataset``.

        Args:
            dataset: dataset whose ``get_data(params.pool_size)`` yields the receptors to encode
            params: encoding parameters; ``label_config``, ``encode_labels`` and ``pool_size`` are read here

        Returns:
            tuple ``(encoded_receptors, receptor_ids, labels, feature_names)`` where ``encoded_receptors`` is a
            list of Counters with chain-prefixed keys, ``receptor_ids`` holds the receptor identifiers in the
            same order, ``labels`` maps each label name to the list of per-receptor values (``None`` when
            ``params.encode_labels`` is falsy) and ``feature_names`` is whatever the sequence encoder reports.
        """
        encoded_receptors = []
        receptor_ids = []
        encode_labels = params.encode_labels
        label_config = params.label_config

        # Label names are only requested when labels are encoded, exactly as before; they are fetched once
        # instead of once per receptor.
        label_names = list(label_config.get_labels_by_name()) if encode_labels else []
        labels = {label_name: [] for label_name in label_names} if encode_labels else None

        sequence_encoder = self._prepare_sequence_encoder()
        feature_names = sequence_encoder.get_feature_names(params)

        for receptor in dataset.get_data(params.pool_size):
            # Merge the chains of *this* receptor. Previously the chain list of the last receptor was reused
            # for all receptors and exactly two chains were assumed, which raised KeyError/IndexError (or
            # silently dropped chains) for any other input.
            merged_counts = Counter()
            for chain in receptor.get_chains():
                chain_counts = self._encode_sequence(receptor.get_chain(chain), params, sequence_encoder, Counter())
                # Counter addition keeps only positive counts, the same semantics as the former
                # ``counts[0] + counts[1]``.
                merged_counts = merged_counts + self._add_chain_to_name(chain_counts, chain)

            encoded_receptors.append(merged_counts)
            receptor_ids.append(receptor.identifier)

            for label_name in label_names:
                labels[label_name].append(receptor.metadata[label_name])

        return encoded_receptors, receptor_ids, labels, feature_names

    def _add_chain_to_name(self, count: Counter, chain: str) -> Counter:
        """Return a new Counter with every key of ``count`` renamed to ``"<chain>_<key>"``; values are kept."""
        return Counter({f"{chain}_{key}": value for key, value in count.items()})