class MatchedRegexRepertoireEncoder(MatchedRegexEncoder):
    """Encode repertoires as counts of sequences matching the regexes in ``self.regex_df``.

    Each (row of ``self.regex_df``, chain in ``self.chains``) pair that has a
    non-None regex becomes one feature column; each repertoire becomes one row.
    """

    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Return a clone of ``dataset`` whose ``encoded_data`` holds the regex match counts.

        The regex table is (re)loaded via ``self._load_regex_df()`` before encoding.
        """
        self._load_regex_df()

        feature_annotations = self._get_feature_info()
        encoded_repertoires, labels = self._encode_repertoires(dataset, params)

        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = EncodedData(
            examples=encoded_repertoires,
            example_ids=dataset.get_example_ids(),
            feature_names=list(feature_annotations["locus_id"]),
            feature_annotations=feature_annotations,
            labels=labels,
            encoding=MatchedRegexEncoder.__name__,
            info={'sequence_type': params.sequence_type, 'region_type': params.region_type},
        )

        return encoded_dataset

    def _get_feature_info(self):
        """
        Returns a pandas dataframe with one row per feature, containing:
         - receptor_id (the ``id`` of the regex row)
         - locus_id (feature id, formatted as id_CHAIN)
         - locus (lower-cased name of the chain)
         - regex
         - v_call (only if match_v_genes is True; None when the row has no
           ``<CHAIN>V`` entry)
        Only motifs for which a regex was specified (regex is not None) are included.
        """
        features = {"receptor_id": [], "locus_id": [], "locus": [], "regex": []}
        if self.match_v_genes:
            features["v_call"] = []

        for _, row in self.regex_df.iterrows():
            for chain_type in self.chains:
                regex = row[f"{chain_type}_regex"]
                if regex is None:
                    continue

                features["receptor_id"].append(f"{row['id']}")
                features["locus_id"].append(f"{row['id']}_{chain_type}")
                features["locus"].append(Chain.get_chain(chain_type).name.lower())
                features["regex"].append(regex)

                if self.match_v_genes:
                    v_column = f"{chain_type}V"
                    features["v_call"].append(row[v_column] if v_column in row else None)

        return pd.DataFrame(features)

    def _encode_repertoires(self, dataset: RepertoireDataset, params: EncoderParams):
        """Build the match-count matrix and, if requested, the per-label value lists.

        Returns:
            A tuple ``(encoded_repertoires, labels)`` where ``encoded_repertoires``
            is an integer array of shape (number of examples, ``self.feature_count``)
            and ``labels`` maps each label name to a list of per-repertoire metadata
            values, or is None when ``params.encode_labels`` is falsy.
        """
        # Rows = repertoires, Columns = regex matches (one chain per column)
        n_repertoires = dataset.get_example_count()
        encoded_repertoires = np.zeros((n_repertoires, self.feature_count), dtype=int)

        if params.encode_labels:
            labels = {label: [] for label in params.label_config.get_labels_by_name()}
        else:
            labels = None

        for i, repertoire in enumerate(dataset.get_data()):
            print_log(f"Encoding repertoire {i+1}/{n_repertoires}", include_datetime=True)
            encoded_repertoires[i] = self._match_repertoire_to_regexes(repertoire, params)

            if labels is not None:
                for label_name in params.label_config.get_labels_by_name():
                    labels[label_name].append(repertoire.metadata[label_name])

        return encoded_repertoires, labels

    def _match_repertoire_to_regexes(self, repertoire: Repertoire, params: EncoderParams):
        """Count, per feature, the sequences of ``repertoire`` that match the feature's regex.

        A sequence contributes 1 when ``self.reads`` is ``ReadsType.UNIQUE`` and its
        ``duplicate_count`` otherwise; a missing (None) count contributes 0 and
        triggers a warning. Sequences whose ``locus`` is None are skipped with a warning.

        Returns:
            An integer array of length ``self.feature_count``.
        """
        matches = np.zeros(self.feature_count, dtype=int)
        rep_seqs = repertoire.sequences(params.region_type)

        # Feature columns are numbered in iteration order, skipping (row, chain)
        # pairs without a regex -- the same order used by _get_feature_info.
        match_idx = 0

        for _, row in self.regex_df.iterrows():
            for chain_type in self.chains:
                regex = row[f"{chain_type}_regex"]
                if regex is None:
                    continue

                v_column = f"{chain_type}V"
                v_gene = row[v_column] if v_column in row else None

                for rep_seq in rep_seqs:
                    if rep_seq.locus is None:
                        # Use the class's own __name__; `.__class__.__name__` on a
                        # class object would report the metaclass ("type") instead.
                        warnings.warn(f"{MatchedRegexRepertoireEncoder.__name__}: chain was not set for sequence {rep_seq.sequence_id}, skipping the sequence for matching...")
                        continue

                    if rep_seq.locus != chain_type or not self._matches(rep_seq, regex, v_gene):
                        continue

                    n_matches = 1 if self.reads == ReadsType.UNIQUE else rep_seq.duplicate_count
                    if n_matches is None:
                        warnings.warn(f"MatchedRegexRepertoireEncoder: count not defined for sequence with id {rep_seq.sequence_id} in repertoire {repertoire.identifier}, ignoring sequence...")
                        n_matches = 0

                    matches[match_idx] += n_matches

                match_idx += 1

        return matches

    def _matches(self, receptor_sequence, regex, v_gene=None):
        """Return True if ``regex`` is found in the sequence's ``sequence_aa``.

        When ``v_gene`` is not None, the sequence's ``v_call`` must also equal it;
        otherwise the result is False without evaluating the regex.
        """
        if v_gene is not None and receptor_sequence.v_call != v_gene:
            return False

        return bool(re.search(regex, receptor_sequence.sequence_aa))