class MatchedReferenceUtil:
    """
    Helper used by MatchedSequencesEncoder and MatchedReceptorsEncoder to load reference data.
    """

    @staticmethod
    def prepare_reference(reference_params: dict, location: str, paired: bool):
        """
        Validate the reference specification, import the reference and return it as a list.

        Args:
            reference_params: dict with the keys "format" and "params"; "params" must contain "path"
                pointing to an existing file. Note that the "params" dict is updated in place:
                "is_repertoire" and "paired" are filled in when they are missing.
            location: name of the calling component, used as a prefix in error messages.
            paired: whether the reference is expected to hold paired receptors (True) or single
                sequences (False).

        Returns:
            list of the imported reference objects (never empty).

        Raises:
            Any exception raised during validation or import (AssertionError for the explicit
            checks below); it is reported through print_log before being re-raised.
        """
        try:
            ParameterValidator.assert_keys(list(reference_params.keys()), ["format", "params"], location, "reference")

            seq_import_params = reference_params.get("params", {})
            reference_file = seq_import_params["path"]

            assert os.path.isfile(reference_file), \
                f"{location}: the file {seq_import_params['path']} does not exist. " \
                f"Specify the correct path under reference."

            # fill in the defaults first, then verify: a user-supplied value must agree with what is expected
            declared_is_repertoire = seq_import_params.setdefault("is_repertoire", False)
            assert declared_is_repertoire is False, f"{location}: is_repertoire must be False for SequenceImport"

            declared_paired = seq_import_params.setdefault("paired", paired)
            assert declared_paired == paired, f"{location}: paired must be {paired} for SequenceImport"

            format_str = reference_params["format"]

            import_class = ReflectionHandler.get_class_by_name(f"{format_str}Import")
            assert import_class is not None, (f"{MatchedReferenceUtil.__name__}: {format_str} could not be imported. "
                                              f"Check if the format name has been written correctly.")

            default_params = DefaultParamsLoader.load(EnvironmentSettings.default_params_path / "datasets",
                                                      DefaultParamsLoader.convert_to_snake_case(format_str))

            # user-provided import parameters take precedence over the format defaults
            params = {**default_params, **seq_import_params}

            # imported data is stored in an 'iml_imported' folder next to the reference file
            path = Path(reference_file)
            import_root = path.parent if path.is_file() else path
            params['result_path'] = PathBuilder.build(import_root / 'iml_imported')

            if format_str == "SingleLineReceptor":
                importer = import_class(params, 'tmp_receptor_dataset')
            else:
                importer = import_class(params=params, dataset_name="tmp_dataset")

            receptors = list(importer.import_dataset().get_data())
            reference_kind = 'receptors' if paired else 'sequences'

            assert len(receptors) > 0, f"MatchedReferenceUtil: The total number of imported reference {reference_kind} is 0, please ensure that reference import is specified correctly."

            check_imported_references(paired, receptors, seq_import_params)

            logging.info(f"MatchedReferenceUtil: successfully imported {len(receptors)} reference {reference_kind}.")

            return receptors
        except Exception as e:
            print_log(f"MatchedReferenceUtil: Error while preparing reference: {e}", logging.ERROR)
            raise e
def check_genes(paired, receptors, seq_import_params):
    """
    Assert that all V and J gene names of the imported reference look like IMGT gene names.

    Accepted names consist of letters followed by digits (e.g. TRBV5), optionally followed by a
    '-<number>' part (TRBV5-1) and a '*<number>' allele (TRBV5-1*01). A second gene designation may
    follow after a slash, and it may itself carry a '-<number>' part and an allele, so that names
    such as TRAV14/DV4*01 or TRAV15-1/DV6-1*01 are accepted as well.

    Args:
        paired: if True, each element of receptors is expected to expose chain_1 and chain_2, and
            the gene calls of both chains are checked; otherwise v_call and j_call are read from
            the elements directly.
        receptors: the imported reference objects.
        seq_import_params: import parameters; only "path" is used, for the error message.

    Raises:
        AssertionError: if any non-empty V or J gene name does not match the expected format.
    """
    import re

    # The allele suffix is allowed after the '/XXn' part as well: IMGT writes TRAV14/DV4*01,
    # not TRAV14*01/DV4 (the latter is still accepted so that nothing that passed before is rejected).
    pattern = re.compile(
        r'^[A-Za-z]+[0-9]+(?:-[0-9]+)?(?:\*[0-9]+)?'
        r'(?:/[A-Za-z]+[0-9]+(?:-[0-9]+)?(?:\*[0-9]+)?)?$'
    )

    # flatten receptors into their chains so that paired and unpaired data share the same code path
    if paired:
        chains = [chain for receptor in receptors for chain in (receptor.chain_1, receptor.chain_2)]
    else:
        chains = receptors

    # missing gene calls (None or empty string) are skipped, not reported
    all_v_genes = {chain.v_call for chain in chains if chain.v_call is not None and chain.v_call != ''}
    all_j_genes = {chain.j_call for chain in chains if chain.j_call is not None and chain.j_call != ''}

    for gene_name, gene_list in [('V', all_v_genes), ('J', all_j_genes)]:
        assert all(pattern.match(gene) for gene in gene_list), \
            (f"{MatchedReferenceUtil.__name__}: The {gene_name} gene names in the reference sequences "
             f"({seq_import_params['path']}) do not follow the IMGT nomenclature. Please ensure that the "
             f"{gene_name} gene names are in the correct format (e.g., TRBV5-1*01 for V genes, TRBJ2-7*01 "
             f"for J genes). Found {gene_name} genes: {gene_list}")
def check_for_duplicates(paired, receptors, seq_import_params):
    """
    Log a warning if the unpaired reference contains the same sequence more than once.

    Two reference sequences count as duplicates when their V call, sequence (as returned by
    get_sequence()) and J call are all identical. Nothing is checked when paired is True.

    Args:
        paired: whether the reference consists of paired receptors; if True the function returns
            without inspecting the data.
        receptors: the imported reference sequences.
        seq_import_params: import parameters; only "path" is used, for the warning message.
    """
    if paired:
        return

    sequence_keys = [f'{seq.v_call}_{seq.get_sequence()}_{seq.j_call}' for seq in receptors]
    distinct_count = len(set(sequence_keys))

    if distinct_count >= len(receptors):
        return

    logging.warning(f"MatchedReferenceUtil: The reference sequences ({seq_import_params['path']}) "
                    f"contain duplicates: {len(sequence_keys) - distinct_count} sequences are "
                    f"not unique. This will result in duplicate features.")