I've been working on building a gpu accelerated package based on Scanpy using the CUDA toolkit ( cudf=23.02, cuml=23.02 ,cugraph=23.02 cudatoolkit=11.8). I'm currently implementing the highly variable genes function but I'm running into some strange behavior where the dispersion_norm
is not properly calculated, there are to many features that will get the same score and that doesn't allow me to set the n_top_genes
parameter using the cut-off value. The cause of this I've tracked back to due to cudf.cut
, which doesn't set the bins properly or the way the means
and variance
.
import cudf
import cupy as cp
import cudf
import cupy as cp
import pandas as pd
def seurat_highly_variable_genes(cp_matrix: cp.ndarray,n_bins = 20, n_top_genes: int = 2450) -> cudf.DataFrame:
"""
Identifies highly variable genes in a gene expression matrix using the Seurat algorithm,
implemented in cuDF and cuPy for GPU acceleration.
Args:
cp_matrix (cupy.ndarray): A 2D numpy array or cupy ndarray of shape (n_samples, n_features)
containing the expression values of the genes across the samples.
n_top_genes (int): The number of highly variable genes to retain. If None, the top
10% of variable genes are retained
Returns:
cudf.DataFrame: A DataFrame with the following columns:
- gene_id: The gene identifier
- means: The mean expression of the gene across all samples
- var_sq: The variance of the gene expression across all samples
- variance: The normalized variance of the gene expression across all samples
- dispersion: The dispersion of the gene expression across all samples
- mean_bin: The mean expression bin for the gene
- dispersion_median: The median dispersion for genes in the same mean expression bin
- dispersions_norm: The normalized dispersion of the gene expression across all samples
- highly_variable: A boolean indicating whether the gene is highly variable or not
"""
hvg = cudf.DataFrame()
hvg['means'] = (cp_matrix.sum(axis=0) / cp_matrix.shape[0])[0]
cp_matrix_mul = (cp_matrix.multiply(cp_matrix))[0]
hvg['mean_sq'] = (cp_matrix_mul.sum(axis=0) / cp_matrix_mul.shape[0])
hvg['variance'] = hvg['mean_sq'] - cp.square(hvg['means'])
hvg['variance'] *= cp_matrix.shape[1] / (cp_matrix.shape[0] - 1)
hvg['dispersion'] = hvg['variance'] / hvg['means']
# here we shoudl add cuda cut
# for `cell_ranger` method make this return bin groups, does not return the number of genes specified
# mean_bin = cudf.Series(cp.digitize(hvg['means'].values, cp.r_[-cp.inf, cp.percentile(hvg['means'].values, cp.arange(10, 105, 5)), cp.inf]))
# hvg['mean_bin'] = mean_bin
# hvg = hvg.merge(disp_median_bin.reset_index(name='dispersion_median'), on="mean_bin")
# hvg['mean_bin']= cudf.cut(hvg['means'], bins= n_bins, include_lowest=True, labels=False)
bin_edges = cp.percentile(hvg['means'].values, cp.linspace(0, 100, n_bins + 1 ))
right = np.array(bin_edges.get()) # create an array of right sides of intervals
left = np.insert(right[:-1], 0, 0) # shift right edges and add leftmost value
hvg['mean_bin'] = cudf.cut(hvg['means'], bins= pd.IntervalIndex.from_arrays(left, right), include_lowest=True, labels=False)
disp_grouped = hvg.groupby("mean_bin")["dispersion"]
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# for `cell_ranger` method make this return bin groups
#disp_mad_bin = disp_grouped.apply(lambda x: cudf.Series(robust.mad(x.to_numpy())))
# retrieve those genes that have nan std, these are the ones where
# only a single gene fell in the bin and implicitly set them to have
# a normalized disperion of 1
one_gene_per_bin = disp_std_bin.isnull()
gen_indices = cp.where(one_gene_per_bin[hvg['mean_bin'].values].values)
disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values
disp_mean_bin[one_gene_per_bin] = 0
# actually do the normalization
hvg['dispersions_norm'] = (
hvg['dispersion'].values - disp_mean_bin[hvg['mean_bin']]
) / disp_std_bin[hvg['mean_bin']]
dispersion_norm = hvg["dispersions_norm"].sort_values(ascending=False)
#if n_top_genes is None:
# n_top_genes = hvg.shape[0] // 10
# print(n_top_genes)
n_top_genes = min(n_top_genes, hvg.shape[0])
disp_cut_off = dispersion_norm[int(n_top_genes - 1)]
print(hvg["dispersions_norm"].value_counts())
hvg['highly_variable'] = cp.nan_to_num(hvg["dispersions_norm"].values) >= disp_cut_off
return hvg
Thanks for taking the time to read the whole thing! I look forward to any feedback :).
I've wrote some tests to check whether the results are expected
def test_hvg_seurat():
"""testing variable genes method, currently, currently there are to many genes returend by seurat method """
adata = scanpy.datasets.pbmc3k()
adata.obs['unique_id'] = adata.obs.index.values
adata = adata[:,(np.array(adata.X.sum(axis=0)) > 0).ravel()]
adata.X= adata.X.log1p()
hvg_scanpy = scanpy.pp.highly_variable_genes(adata, inplace = False, flavor= 'seurat', n_top_genes= 200)
hvg_seurat = seurat_highly_variable_genes(cp_matrix = cp.sparse.csr_matrix(adata.X), n_top_genes= 200 )
assert ( hvg_scanpy.highly_variable.sum() == hvg_seurat.highly_variable.sum() ) , f'{hvg_scanpy.highly_variable.sum() } \
variable genes are calculated by scanpy hvg seurat method and {hvg_seurat.highly_variable.sum()} by cudf seurat method '
assert all(hvg_scanpy.highly_variable & hvg_seurat.highly_variable.values.get()), f'{np.sum(hvg_scanpy.highly_variable & hvg_seurat.highly_variable.values.get())}, are overlapping between the scanpy seurat method and cudf seurat method'
test_hvg_seurat()