class SequenceLengthDistribution(DataReport):
    """
    Generates a histogram of the lengths of the sequences in a dataset.

    **Specification arguments:**

    - sequence_type (str): whether to check the length of amino acid or nucleotide sequences; default value is 'amino_acid'

    - region_type (str): which part of the sequence to examine; e.g., IMGT_CDR3

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_sld_report:
                    SequenceLengthDistribution:
                        sequence_type: amino_acid
                        region_type: IMGT_CDR3

    """

    def _generate(self) -> ReportResult:
        """Write the length table to CSV, try to plot it, and wrap both outputs in a ReportResult."""
        df = self._get_sequence_lengths_df()

        PathBuilder.build(self.result_path)
        table_path = self.result_path / 'sequence_length_distribution.csv'
        df.to_csv(table_path, index=False)

        # `_safe_plot` is defined outside this class; a None result means no figure is reported.
        report_output_fig = self._safe_plot(df=df, output_written=False)
        output_figures = None if report_output_fig is None else [report_output_fig]

        return ReportResult(name=self.name,
                            info="A histogram of the lengths of the sequences in a dataset.",
                            output_figures=output_figures,
                            output_tables=[ReportOutput(table_path, 'lengths of sequences in the dataset')])

    def _get_sequence_lengths_df(self) -> pd.DataFrame:
        """
        Dispatch to the length-counting routine that matches the type of ``self.dataset``.

        Raises:
            TypeError: if the dataset is not a RepertoireDataset, SequenceDataset or ReceptorDataset.
        """
        if isinstance(self.dataset, RepertoireDataset):
            return self._get_sequence_lengths_df_repertoire_dataset()
        elif isinstance(self.dataset, SequenceDataset):
            return self._get_sequence_lengths_df_sequence_dataset()
        elif isinstance(self.dataset, ReceptorDataset):
            return self._get_sequence_lengths_df_receptor_dataset()

        # Fail here with a clear message instead of returning None and crashing later on `.to_csv`.
        raise TypeError(f"{type(self).__name__}: unsupported dataset type {type(self.dataset).__name__}; "
                        f"expected RepertoireDataset, SequenceDataset or ReceptorDataset.")

    def _get_sequence_lengths_df_repertoire_dataset(self) -> pd.DataFrame:
        """Accumulate length counts over all repertoires, fetched with ``self.batch_size``."""
        sequence_lengths = Counter()
        for repertoire in self.dataset.get_data(self.batch_size):
            sequence_lengths.update(self._count_in_repertoire(repertoire))
        return self._lengths_to_df(sequence_lengths)

    def _get_sequence_lengths_df_sequence_dataset(self) -> pd.DataFrame:
        """Count lengths of the configured sequence field over ``self.dataset.data``."""
        return self._lengths_to_df(self._count_lengths(self.dataset.data))

    def _get_dataset_chains(self):
        """Return the chains reported by the first element yielded by ``self.dataset.get_data()``."""
        return next(self.dataset.get_data()).get_chains()

    def _get_sequence_lengths_df_receptor_dataset(self) -> pd.DataFrame:
        """
        Count sequence lengths separately for each locus found in ``self.dataset.data``.

        Returns:
            a data frame with columns 'counts', 'sequence_lengths' and 'chain'; it has no rows
            when the dataset contains no entries.
        """
        data = self.dataset.data
        loci = data.locus.tolist()  # materialized once and reused for every chain mask

        # dict.fromkeys keeps first-seen order, so row/facet order is reproducible across runs
        # (a plain set of strings iterates in hash-randomized order).
        chains = list(dict.fromkeys(loci))

        dfs = []
        for chain in chains:
            chain_data = data[[locus == chain for locus in loci]]
            chain_df = self._lengths_to_df(self._count_lengths(chain_data))
            chain_df['chain'] = chain
            dfs.append(chain_df)

        if not dfs:
            # pd.concat raises ValueError on an empty list; mirror the other dataset types,
            # which yield an empty frame for empty data.
            return pd.DataFrame({'counts': [], 'sequence_lengths': [], 'chain': []})

        return pd.concat(dfs)

    def _count_in_repertoire(self, repertoire: Repertoire) -> Counter:
        """Count how often each sequence length occurs in ``repertoire.data``."""
        return self._count_lengths(repertoire.data)

    def _count_lengths(self, data) -> Counter:
        """Count the values of ``.lengths`` on the field selected by region_type and sequence_type."""
        field_name = bnp_util.get_sequence_field_name(self.region_type, self.sequence_type)
        return Counter(getattr(data, field_name).lengths.tolist())

    @staticmethod
    def _lengths_to_df(length_counts: Counter) -> pd.DataFrame:
        """Turn a ``{length: count}`` counter into a frame with 'counts' and 'sequence_lengths' columns."""
        return pd.DataFrame({"counts": list(length_counts.values()),
                             'sequence_lengths': list(length_counts.keys())})

    def _plot(self, df: pd.DataFrame) -> ReportOutput:
        """
        Draw the length histogram as a bar chart and save it as HTML under ``self.result_path``.

        For a ReceptorDataset the chart is faceted by the 'chain' column.
        """
        figure = px.bar(df, x="sequence_lengths", y="counts",
                        facet_col="chain" if isinstance(self.dataset, ReceptorDataset) else None)
        # Ticks are placed at exactly the observed lengths and counts.
        figure.update_layout(xaxis=dict(tickmode='array', tickvals=df["sequence_lengths"]),
                             yaxis=dict(tickmode='array', tickvals=df["counts"]),
                             template="plotly_white")
        figure.update_traces(marker_color=px.colors.diverging.Tealrose[0])

        PathBuilder.build(self.result_path)
        file_path = self.result_path / "sequence_length_distribution.html"
        figure.write_html(str(file_path))

        return ReportOutput(path=file_path, name="Sequence length distribution plot")