class DiseaseAssociatedSequenceOverlap(MultiDatasetReport):
    """
    DiseaseAssociatedSequenceOverlap report makes a heatmap showing the overlap of disease-associated sequences (or k-mers)
    produced by the :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder`,
    :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder` or
    :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`
    between multiple datasets of different sizes (different number of repertoires per dataset).

    This plot can be used only with MultiDatasetBenchmarkTool.

    The overlap is computed by the following equation:

    .. math::

        overlap(X,Y) = \\frac{|X \\cap Y|}{min(|X|, |Y|)} * 100

    For details, see: Greiff V, Menzel U, Miho E, et al. Systems Analysis Reveals High Genetic and Antigen-Driven
    Predetermination of Antibody Repertoires throughout B Cell Development. Cell Reports. 2017;19(7):1467-1478.
    doi:10.1016/j.celrep.2017.04.054.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_overlap_report: DiseaseAssociatedSequenceOverlap # report has no parameters

    """

    def __init__(self, instruction_states: List[TrainMLModelState] = None, name: str = None, result_path: Path = None,
                 number_of_processes: int = 1):
        """
        Forward all arguments to the parent report and initialise the label placeholder.

        Args:
            instruction_states: one state per dataset whose disease-associated sequences are compared.
            name: report name; also used as the name of the output sub-directory in ``_generate``.
            result_path: parent directory under which the report output is written.
            number_of_processes: passed through to the parent class unchanged.
        """
        super().__init__(instruction_states=instruction_states, name=name, result_path=result_path,
                         number_of_processes=number_of_processes)
        # Filled in by _extract_label() at the start of _generate().
        self.label = None

    def _generate(self) -> ReportResult:
        """
        Compute the pairwise overlap matrix across all instruction states and export it as a figure and a csv table.

        Returns:
            ReportResult referencing the heatmap (html) and the overlap matrix (csv).

        Raises:
            AssertionError: if the instruction states do not share exactly one label (see ``_extract_label``).
        """
        self.result_path = PathBuilder.build(self.result_path / self.name)

        self._extract_label()

        # One optimal hyperparameter item per dataset, all for the same (single) label.
        hp_items = [state.optimal_hp_items[self.label.name] for state in self.instruction_states]
        overlap_matrix = SequenceAnalysisHelper.compute_overlap_matrix(hp_items)

        # Dataset names are used as both row and column labels of the matrix.
        labels = [state.dataset.name for state in self.instruction_states]
        figure_path = self._make_figure(overlap_matrix, labels)
        data_path = self._export_matrix(overlap_matrix, labels)

        return ReportResult(name=self.name,
                            info="A heatmap showing the overlap of disease-associated sequences produced by SequenceAbundance encoders between multiple datasets of different sizes.",
                            output_figures=[ReportOutput(figure_path, 'sequence overlap across datasets')],
                            output_tables=[ReportOutput(data_path, 'sequence overlap across datasets (csv)')])

    def _extract_label(self):
        """
        Collect the label objects of all instruction states and store the single shared one in ``self.label``.

        Raises:
            AssertionError: if no label is found, or if the states use more than one distinct label name.
        """
        all_labels = [label
                      for state in self.instruction_states
                      for label in state.label_configuration.get_label_objects()]
        label_names = {label.name for label in all_labels}

        # Raised explicitly (instead of using an `assert` statement) so that the validation is not stripped when
        # Python runs with -O; the exception type is kept as AssertionError for backward compatibility.
        if not label_names:
            raise AssertionError(f"{DiseaseAssociatedSequenceOverlap.__name__}: no labels were found in the instruction "
                                 f"states, but this report requires exactly one label.")
        if len(label_names) != 1:
            raise AssertionError(f"{DiseaseAssociatedSequenceOverlap.__name__}: multiple labels were specified {label_names}, but this report accepts only one label.")

        # All collected label objects share the same name at this point, so the first one is used.
        self.label = all_labels[0]

    def _export_matrix(self, overlap_matrix, labels):
        """
        Write the overlap matrix to ``sequence_overlap.csv`` in the result directory.

        Args:
            overlap_matrix: square matrix of pairwise overlaps between datasets.
            labels: dataset names used as both the column header and the row index.

        Returns:
            Path to the written csv file.
        """
        data_path = self.result_path / "sequence_overlap.csv"
        pd.DataFrame(overlap_matrix, columns=labels, index=labels).to_csv(data_path)
        return data_path

    def _make_figure(self, overlap_matrix, labels):
        """
        Render the overlap matrix as a heatmap and write it to ``sequence_overlap.html`` in the result directory.

        Args:
            overlap_matrix: square matrix of pairwise overlaps between datasets.
            labels: dataset names used for both heatmap axes.

        Returns:
            Path to the written html file.
        """
        # The colour scale is fixed to 0-100 because the overlap is expressed as a percentage.
        figure = px.imshow(overlap_matrix, x=labels, y=labels, zmin=0, zmax=100,
                           color_continuous_scale=px.colors.sequential.Teal, template='plotly_white')
        figure.update_traces(hovertemplate="Overlap of disease-associated<br>sequences between datasets<br>%{x} and %{y}:<br>%{z}%<extra></extra>")

        figure_path = self.result_path / "sequence_overlap.html"
        figure.write_html(str(figure_path))
        return figure_path