# Source code for immuneML.encodings.reference_encoding.MatchedSequencesRepertoireEncoder

import numpy as np
import pandas as pd

from immuneML.analysis.SequenceMatcher import SequenceMatcher
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.reference_encoding.MatchedSequencesEncoder import MatchedSequencesEncoder


class MatchedSequencesRepertoireEncoder(MatchedSequencesEncoder):
    """Repertoire-level variant of the matched-sequences encoding.

    Every repertoire becomes one row of the encoded matrix and every entry of
    ``self.reference_sequences`` becomes one column. A cell accumulates the
    ``metadata.count`` of each repertoire sequence that matches the
    corresponding reference sequence within ``self.max_edit_distance``.
    """

    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Build a new RepertoireDataset carrying the encoded data for ``dataset``.

        Args:
            dataset: dataset providing ``repertoires``, ``labels``,
                ``metadata_file`` and ``get_data()``.
            params: encoder parameters, forwarded to ``_encode_repertoires``.

        Returns:
            A RepertoireDataset built from the same repertoires, labels and
            metadata file, with an EncodedData object attached.
        """
        encoded_dataset = RepertoireDataset(repertoires=dataset.repertoires,
                                            labels=dataset.labels,
                                            metadata_file=dataset.metadata_file)

        encoded_repertoires, labels = self._encode_repertoires(dataset, params)
        feature_annotations = self._get_feature_info()

        encoded_dataset.add_encoded_data(EncodedData(
            examples=encoded_repertoires,
            labels=labels,
            feature_names=list(feature_annotations["sequence_id"]),
            feature_annotations=feature_annotations,
            example_ids=[repertoire.identifier for repertoire in dataset.get_data()],
            encoding=MatchedSequencesEncoder.__name__
        ))

        return encoded_dataset

    def _get_feature_info(self):
        """Describe the reference sequences used as features.

        Returns:
            A pandas DataFrame with one row per reference sequence and the
            columns:
             - sequence_id: the sequence identifier
             - chain: lower-cased name of the "chain" attribute
             - sequence: the value returned by ``get_sequence()``
             - v_gene: the "v_gene" attribute
             - j_gene: the "j_gene" attribute
        """
        columns = ["sequence_id", "chain", "sequence", "v_gene", "j_gene"]
        rows = [[sequence.identifier,
                 sequence.get_attribute("chain").name.lower(),
                 sequence.get_sequence(),
                 sequence.get_attribute("v_gene"),
                 sequence.get_attribute("j_gene")]
                for sequence in self.reference_sequences]

        return pd.DataFrame(rows, columns=columns)

    def _encode_repertoires(self, dataset: RepertoireDataset, params):
        """Encode every repertoire of ``dataset`` and optionally collect its labels.

        Args:
            dataset: dataset whose repertoires are encoded.
            params: encoder parameters; ``encode_labels`` decides whether label
                values are collected, ``label_config`` supplies the label names.

        Returns:
            A tuple ``(encoded_repertoires, labels)``. ``encoded_repertoires``
            is an integer array of shape (number of examples, number of
            reference sequences). ``labels`` maps each label name to the list
            of per-repertoire values, or is None when ``params.encode_labels``
            is falsy.
        """
        # Rows = repertoires, Columns = reference sequences
        encoded_repertoires = np.zeros((dataset.get_example_count(), len(self.reference_sequences)),
                                       dtype=int)

        # Label names are only looked up when labels are requested; with label
        # encoding disabled, `labels` stays None and nothing is appended to it.
        label_names = list(params.label_config.get_labels_by_name()) if params.encode_labels else []
        labels = {name: [] for name in label_names} if params.encode_labels else None

        for i, repertoire in enumerate(dataset.get_data()):
            encoded_repertoires[i] = self._match_repertoire_to_reference(repertoire)
            for name in label_names:
                labels[name].append(repertoire.metadata[name])

        return encoded_repertoires, labels

    def _match_repertoire_to_reference(self, repertoire: Repertoire):
        """Sum the counts of repertoire sequences matching each reference sequence.

        Args:
            repertoire: repertoire whose ``sequences`` are compared against
                ``self.reference_sequences``.

        Returns:
            A 1-D integer array with one entry per reference sequence, holding
            the summed ``metadata.count`` of all repertoire sequences that
            match it with ``max_distance=self.max_edit_distance``.
        """
        matcher = SequenceMatcher()
        matches = np.zeros(len(self.reference_sequences), dtype=int)

        # Read the sequences once so the attribute is not re-evaluated for
        # every reference sequence.
        rep_seqs = repertoire.sequences

        for i, reference_seq in enumerate(self.reference_sequences):
            for repertoire_seq in rep_seqs:
                if matcher.matches_sequence(reference_seq, repertoire_seq,
                                            max_distance=self.max_edit_distance):
                    matches[i] += repertoire_seq.metadata.count

        return matches