# Source code for immuneML.analysis.similarities.RepertoireSimilarityComputer

import numpy as np
from scipy import sparse
from sklearn import preprocessing


class RepertoireSimilarityComputer:
    """Pairwise similarity measures between repertoires.

    Every method takes a (repertoire x feature) count matrix ``a`` — a
    scipy sparse matrix (Pearson additionally accepts a dense ndarray) —
    and returns a (repertoire x repertoire) similarity matrix.
    """

    @staticmethod
    def compute_pearson(a):
        """Return the matrix of pairwise Pearson correlation coefficients
        between the rows of ``a``.

        Works for both sparse and dense input; the previous implementation
        relied on ``a.sum(1)`` returning a column ``np.matrix`` and silently
        produced wrong results for plain 2-D ndarrays, where the row-sum
        product collapsed to a scalar instead of an outer product.
        """
        a = a.astype(np.float64)
        n = a.shape[1]
        # Row sums as a flat vector — uniform for ndarray and sparse input.
        rowsum = np.asarray(a.sum(axis=1)).ravel()
        # Covariance matrix: C = (a a^H - outer(rowsum, rowsum)/n) / (n - 1)
        centering = np.outer(rowsum, rowsum) / n
        gram = a.dot(a.T.conjugate())
        if sparse.issparse(gram):
            gram = gram.toarray()
        C = (np.asarray(gram) - centering) / (n - 1)
        # Correlation coefficients: C_ij / sqrt(C_ii * C_jj)
        d = np.diag(C)
        return C / np.sqrt(np.outer(d, d))

    @staticmethod
    def compute_morisita(a):
        """Return the pairwise Morisita-Horn overlap between the rows of ``a``.

        Works on unnormalized counts or relative-frequency-normalized values
        only — not e.g. L2-normalized values.
        """
        # Pairwise dot products of the raw count vectors.
        xy = a * a.T
        repertoire_totals = np.asarray(a.sum(axis=1)).ravel()
        pairwise_mult_repertoire_totals = repertoire_totals[:, None] * repertoire_totals[None, :]
        # Simpson index per repertoire: sum of squared relative frequencies.
        repertoire_frequency = sparse.diags(1 / repertoire_totals) @ a
        repertoire_frequency.data **= 2
        simpson_diversity = np.asarray(repertoire_frequency.sum(axis=1)).ravel()
        pairwise_sum_simpson = simpson_diversity[:, None] + simpson_diversity[None, :]
        return 2 * (xy / pairwise_sum_simpson) / pairwise_mult_repertoire_totals

    @staticmethod
    def compute_jaccard(a):
        """Return the sparse matrix of pairwise Jaccard similarities
        (intersection over union of nonzero positions) between the rows of ``a``.

        Operates on a binarized COPY of the input; the previous implementation
        wrote ``a.data[:] = 1`` through a transpose view and clobbered the
        caller's matrix in place.
        """
        a = a.T.copy()   # copy so the caller's matrix is left untouched
        a.data[:] = 1    # binarize: presence/absence only
        cols_sum = a.getnnz(axis=0)  # per-repertoire nonzero counts |A|
        ab = a.T * a     # pairwise intersection sizes |A ∩ B|
        # |A| per stored entry, row-wise; ab has a symmetric sparsity pattern,
        # so per-column nnz counts equal the per-row counts used here.
        aa = np.repeat(cols_sum, ab.getnnz(axis=0))
        # |B| per stored entry, via the column index of each entry.
        bb = cols_sum[ab.indices]
        similarities = ab.copy()
        # Jaccard: |A ∩ B| / |A ∪ B| = |A ∩ B| / (|A| + |B| - |A ∩ B|)
        similarities.data = similarities.data / (aa + bb - ab.data)
        return similarities

    @staticmethod
    def compute_cosine(a):
        """Return the dense matrix of pairwise cosine similarities between
        the rows of ``a``.

        Implemented with numpy/scipy only; equivalent to the former
        ``sklearn.preprocessing.normalize``-based version, including leaving
        all-zero rows with zero similarity (zero norms are treated as 1).
        """
        at = a.T.tocsc()
        norms = np.sqrt(np.asarray(at.multiply(at).sum(axis=0)).ravel())
        # Matches sklearn.preprocessing.normalize: zero vectors stay zero.
        norms[norms == 0] = 1.0
        col_normed_mat = at @ sparse.diags(1.0 / norms)
        return (col_normed_mat.T @ col_normed_mat).toarray()