calculating dispersion_norm using CUDF

Question

I've been working on building a gpu accelerated package based on Scanpy using the CUDA toolkit ( cudf=23.02, cuml=23.02 ,cugraph=23.02 cudatoolkit=11.8). I'm currently implementing the highly variable genes function but I'm running into some strange behavior where the dispersion_norm is not properly calculated, there are to many features that will get the same score and that doesn't allow me to set the n_top_genes parameter using the cut-off value. The cause of this I've tracked back to due to cudf.cut, which doesn't set the bins properly or the way the means and variance.

import cudf
import cupy as cp


import cudf
import cupy as cp
import pandas as pd


def seurat_highly_variable_genes(cp_matrix: cp.ndarray,n_bins = 20, n_top_genes: int = 2450) -> cudf.DataFrame:
    """
    Identifies highly variable genes in a gene expression matrix using the Seurat algorithm, 
    implemented in cuDF and cuPy for GPU acceleration.
    
    Args:
        cp_matrix (cupy.ndarray): A 2D numpy array or cupy ndarray of shape (n_samples, n_features)
            containing the expression values of the genes across the samples.
        n_top_genes (int): The number of highly variable genes to retain. If None, the top 
            10% of variable genes are retained 
            
    Returns:
        cudf.DataFrame: A DataFrame with the following columns:
            - gene_id: The gene identifier
            - means: The mean expression of the gene across all samples
            - var_sq: The variance of the gene expression across all samples
            - variance: The normalized variance of the gene expression across all samples
            - dispersion: The dispersion of the gene expression across all samples
            - mean_bin: The mean expression bin for the gene
            - dispersion_median: The median dispersion for genes in the same mean expression bin
            - dispersions_norm: The normalized dispersion of the gene expression across all samples
            - highly_variable: A boolean indicating whether the gene is highly variable or not
    
    """
    hvg = cudf.DataFrame()
    hvg['means'] = (cp_matrix.sum(axis=0) / cp_matrix.shape[0])[0]
    cp_matrix_mul = (cp_matrix.multiply(cp_matrix))[0]
    hvg['mean_sq'] = (cp_matrix_mul.sum(axis=0) / cp_matrix_mul.shape[0])
    hvg['variance'] = hvg['mean_sq'] - cp.square(hvg['means'])
    hvg['variance'] *= cp_matrix.shape[1] / (cp_matrix.shape[0] - 1)
    hvg['dispersion'] = hvg['variance'] / hvg['means']
    
    # here we shoudl add cuda cut
    
    # for `cell_ranger` method make this return bin groups, does not return the number of genes specified
    # mean_bin = cudf.Series(cp.digitize(hvg['means'].values, cp.r_[-cp.inf, cp.percentile(hvg['means'].values, cp.arange(10, 105, 5)), cp.inf]))
    # hvg['mean_bin'] = mean_bin
    # hvg = hvg.merge(disp_median_bin.reset_index(name='dispersion_median'), on="mean_bin")
    # hvg['mean_bin']= cudf.cut(hvg['means'], bins= n_bins, include_lowest=True, labels=False)
    
    bin_edges = cp.percentile(hvg['means'].values, cp.linspace(0, 100, n_bins + 1 ))
    right = np.array(bin_edges.get())  # create an array of right sides of intervals
    left = np.insert(right[:-1], 0, 0)  # shift right edges and add leftmost value
    hvg['mean_bin'] = cudf.cut(hvg['means'], bins=  pd.IntervalIndex.from_arrays(left, right), include_lowest=True, labels=False)
    
    disp_grouped = hvg.groupby("mean_bin")["dispersion"]

    disp_mean_bin = disp_grouped.mean()
    disp_std_bin = disp_grouped.std(ddof=1)
    
    # for `cell_ranger` method make this return bin groups
    #disp_mad_bin = disp_grouped.apply(lambda x: cudf.Series(robust.mad(x.to_numpy())))
    
    # retrieve those genes that have nan std, these are the ones where
    # only a single gene fell in the bin and implicitly set them to have
    # a normalized disperion of 1
    one_gene_per_bin = disp_std_bin.isnull()
    gen_indices = cp.where(one_gene_per_bin[hvg['mean_bin'].values].values)

    
    disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values
    disp_mean_bin[one_gene_per_bin] = 0
    
    # actually do the normalization
    hvg['dispersions_norm'] = (
      hvg['dispersion'].values - disp_mean_bin[hvg['mean_bin']]
            ) / disp_std_bin[hvg['mean_bin']]

    dispersion_norm = hvg["dispersions_norm"].sort_values(ascending=False)

    #if n_top_genes is None:
    # n_top_genes = hvg.shape[0] // 10
    # print(n_top_genes)

    n_top_genes = min(n_top_genes, hvg.shape[0])
    disp_cut_off = dispersion_norm[int(n_top_genes - 1)]
    print(hvg["dispersions_norm"].value_counts())
    hvg['highly_variable'] = cp.nan_to_num(hvg["dispersions_norm"].values) >= disp_cut_off
    return hvg

Thanks for taking the time to read the whole thing! I look forward to any feedback :).

I've wrote some tests to check whether the results are expected


def test_hvg_seurat():
  """testing variable genes method, currently, currently there are to many genes returend by seurat method """
  adata = scanpy.datasets.pbmc3k()
  adata.obs['unique_id'] = adata.obs.index.values
  adata = adata[:,(np.array(adata.X.sum(axis=0)) > 0).ravel()]
  adata.X= adata.X.log1p()
  hvg_scanpy = scanpy.pp.highly_variable_genes(adata, inplace  = False, flavor= 'seurat', n_top_genes=  200)
  hvg_seurat = seurat_highly_variable_genes(cp_matrix = cp.sparse.csr_matrix(adata.X), n_top_genes=  200 )
  assert ( hvg_scanpy.highly_variable.sum()  ==  hvg_seurat.highly_variable.sum() ) , f'{hvg_scanpy.highly_variable.sum() } \
  variable genes are calculated by scanpy hvg seurat method and {hvg_seurat.highly_variable.sum()} by cudf seurat method '
  assert all(hvg_scanpy.highly_variable  &   hvg_seurat.highly_variable.values.get()), f'{np.sum(hvg_scanpy.highly_variable  & hvg_seurat.highly_variable.values.get())}, are overlapping between the scanpy seurat method  and cudf seurat method'


test_hvg_seurat()

calculating dispersion_norm using CUDF

0 Answers0