class SequenceCountDistribution(DataReport):
    """
    Generates a histogram of the duplicate counts of the sequences in a dataset.

    **Specification arguments:**

    - split_by_label (bool): Whether to split the plots by a label. If set to true, the Dataset must either contain a single label, or alternatively the label of interest can be specified under 'label'. By default, split_by_label is False.

    - label (str): Optional label for separating the results by color/creating separate plots. Note that this should be the name of a valid dataset label.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        my_sld_report:
            SequenceCountDistribution:
                label: disease

    """

    def _generate(self) -> ReportResult:
        """
        Build the duplicate-count table, write it to disk, plot it and wrap everything in a ReportResult.

        The table is written to ``sequence_count_distribution.tsv`` under ``self.result_path``.
        The figure is optional: when ``_safe_plot`` returns None, ``output_figures`` is set to None.

        Returns:
            ReportResult holding the output table and, when plotting succeeded, the figure.
        """
        self._set_label_name()

        df = self._get_sequence_counts_df()
        PathBuilder.build(self.result_path)

        output_table = self._write_output_table(df, self.result_path / "sequence_count_distribution.tsv",
                                                name="Duplicate counts of sequences in the dataset")

        # NOTE(review): _safe_plot is defined outside this block; presumably it delegates to _plot
        # and returns None when plotting fails -- confirm against the base class.
        report_output_fig = self._safe_plot(df=df, output_written=False)
        output_figures = None if report_output_fig is None else [report_output_fig]

        return ReportResult(name=self.name,
                            info="The sequence count distribution of the dataset.",
                            output_figures=output_figures,
                            output_tables=[output_table])

    def _get_sequence_counts_df(self):
        """
        Dispatch to the table builder that matches the type of ``self.dataset``.

        Returns:
            pd.DataFrame with one row per distinct duplicate count (per locus and/or label class).

        Raises:
            TypeError: if the dataset is not a repertoire, receptor or sequence dataset. Previously this
                case silently returned None.
        """
        if isinstance(self.dataset, RepertoireDataset):
            return self._get_repertoire_df()

        if isinstance(self.dataset, (ReceptorDataset, SequenceDataset)):
            return self._get_sequence_receptor_df()

        raise TypeError(f"{SequenceCountDistribution.__name__}: unsupported dataset type "
                        f"'{type(self.dataset).__name__}'. This report can only be run on repertoire, "
                        f"receptor or sequence datasets.")

    def _get_repertoire_df(self):
        """
        Count, across all repertoires, how often each duplicate count occurs.

        When ``self.split_by_label`` is set, counts are kept separately per value of
        ``repertoire.metadata[self.label_name]`` and an extra column named after the label is added.

        Returns:
            pd.DataFrame with columns ``n_observations``, ``duplicate_count`` and optionally the label column.
        """
        sequence_counts = Counter()

        for repertoire in self.dataset.get_data():
            label_class = repertoire.metadata[self.label_name] if self.split_by_label else None

            repertoire_counter = Counter(repertoire.data.duplicate_count)

            # update() adds the counts in place; unlike `+=` it does not rescan the whole accumulated
            # counter to drop non-positive entries (there are none: every count here is >= 1).
            sequence_counts.update({(duplicate_count, label_class): n_observations
                                    for duplicate_count, n_observations in repertoire_counter.items()})

        keys = list(sequence_counts)

        df = pd.DataFrame({"n_observations": list(sequence_counts.values()),
                           "duplicate_count": [key[0] for key in keys]})

        if self.split_by_label:
            df[self.label_name] = [key[1] for key in keys]

        return df

    def _get_sequence_receptor_df(self):
        """
        Count how often each (duplicate count, locus[, label class]) combination occurs in ``self.dataset.data``.

        Returns:
            pd.DataFrame with columns ``duplicate_count``, ``locus``, ``n_observations`` and optionally
            the label column.

        Raises:
            AttributeError: if the dataset's data has no ``duplicate_count`` attribute.
        """
        data = self.dataset.data

        try:
            counts = data.duplicate_count
        except AttributeError as e:
            # Chain explicitly so the original lookup failure stays visible as the cause.
            raise AttributeError(f"{SequenceCountDistribution.__name__}: SequenceDataset does not contain attribute 'duplicate_count'. "
                                 f"This report can only be run when sequence counts are available.") from e

        chains = data.locus.tolist()

        if self.split_by_label:
            label_classes = getattr(data, self.label_name).tolist()
            counter = Counter(zip(counts, chains, label_classes))
        else:
            counter = Counter(zip(counts, chains))

        keys = list(counter)

        # Materialise the values as a list, consistent with _get_repertoire_df.
        df = pd.DataFrame({"duplicate_count": [key[0] for key in keys],
                           "locus": [key[1] for key in keys],
                           "n_observations": list(counter.values())})

        if self.split_by_label:
            df[self.label_name] = [key[2] for key in keys]

        return df

    def _plot(self, df: pd.DataFrame) -> ReportOutput:
        """
        Draw the duplicate-count distribution as a grouped bar chart and write it as HTML.

        Bars are colored by the label when ``self.split_by_label`` is set. The plot is faceted by locus
        only when ``df`` has a ``locus`` column with more than one distinct value.

        Args:
            df: table with at least the columns ``duplicate_count`` and ``n_observations``.

        Returns:
            ReportOutput pointing to ``sequence_count_distribution.html`` under ``self.result_path``.
        """
        color = self.label_name if self.split_by_label else None
        facet_col = "locus" if "locus" in df.columns and len(set(df["locus"])) > 1 else None

        figure = px.bar(df, x="duplicate_count", y="n_observations", barmode="group",
                        color=color, facet_col=facet_col,
                        color_discrete_sequence=px.colors.diverging.Tealrose,
                        labels={"n_observations": "Number of observations",
                                "duplicate_count": "Sequence duplicate count"})
        figure.update_layout(template="plotly_white")

        PathBuilder.build(self.result_path)

        file_path = self.result_path / "sequence_count_distribution.html"
        figure.write_html(str(file_path))

        return ReportOutput(path=file_path, name="Sequence duplicate count distribution")