Source code for DOLPHIN.graph_generation.process_feature_hvg

import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import math
import os
from scipy import sparse

[docs]def run_feature_hvg( out_name: str, out_directory="./"): """ Filters the feature matrix to retain only highly variable genes (HVGs), normalizes the data, and prepares it as input for the DOLPHIN model. Parameters ---------- out_name : str Output filename for the feature matrix CSV. out_directory : str Output directory to save the combined feature matrix, default save to ./data/ folder. Returns ------- None Saves the final feature matrix as `FeatureCompHvg_<out_name>.h5ad` in the specified output directory. """ final_out_dir = os.path.join(out_directory, "data") os.makedirs(final_out_dir, exist_ok=True) hvg_path = os.path.join(final_out_dir, "ExonGene_hvg_"+out_name+".h5ad") feature_anndata = os.path.join(final_out_dir, "FeatureComp_"+out_name+".h5ad") ## load feature matrix adata = anndata.read_h5ad(feature_anndata) hvg_adata = anndata.read_h5ad(hvg_path) cell_keep = list(hvg_adata.obs.index) hvg_list = set(hvg_adata.var["Geneid"]) adata = adata[adata.obs.index.isin(cell_keep), :] #normalize the feature matrix and round to nearest integer sc.pp.normalize_total(adata) #only keep highly varaible genes adata_hvg = adata[:,adata.var["gene_id"].isin(hvg_list)].copy() # if sparse.issparse(adata_hvg.X): # X_rounded = sparse.csr_matrix(np.round(adata_hvg.X.toarray()).astype(np.float32)) # .A converts to dense # else: # X_rounded = np.round(adata_hvg.X).astype(np.float32) # X_rounded_sparse = sparse.csr_matrix(np.round(X_rounded).astype(np.float32)) # adata_final = anndata.AnnData(X=X_rounded_sparse, obs=adata_hvg.obs.copy(), var=adata_hvg.var.copy()) if sparse.issparse(adata_hvg.X): X_dense = adata_hvg.X.toarray() else: X_dense = adata_hvg.X X_rounded_sparse = sparse.csr_matrix(np.round(X_dense).astype(np.float32)) adata_final = anndata.AnnData(X=X_rounded_sparse, obs=adata_hvg.obs.copy(), var=adata_hvg.var.copy()) print("Keep "+ str(len(set(adata_final.var["gene_id"]))) + " genes") print("The Final Feature Matrix Size is " + str(adata_final.shape[0]) + " Cells and " +str(adata_final.shape[1])+ " exons") #remove unneccary var names adata_final.write(os.path.join(final_out_dir, "FeatureCompHvg_"+out_name+".h5ad"))