Source code for DOLPHIN.graph_generation.process_adjacency_matrix_final

import pandas as pd
import numpy as np
import anndata
import os
from tqdm import tqdm
from scipy.sparse import csr_matrix

[docs]def run_adjacency_matrix_final( out_name: str, out_directory: str = "./", batch_size=1000): """ Generates the final adjacency matrix by filtering out invalid edges based on FeatureComp data. This function processes a compressed adjacency matrix and removes edges corresponding to exons that are consistently zero across all cells. Parameters ---------- out_name : str Output name prefix. out_directory : str, optional Output folder to save results. Returns ------- None Saves the final adjacency matrix to the output directory as `AdjacencyCompRe_<out_name>.h5ad`. """ print("Start Generating Final Adjacency Matrix...") final_out_dir = os.path.join(out_directory, "data") # temp_out_dir = os.path.join(final_out_dir, "temp") # os.makedirs(temp_out_dir, exist_ok=True) # === Step 1: Load adjacency and feature matrices adata_adj = anndata.read_h5ad(os.path.join(final_out_dir, f"AdjacencyComp_{out_name}.h5ad")) adata_adj.X = adata_adj.X.toarray() adata_fea_gtf = anndata.read_h5ad(os.path.join(final_out_dir, f"Feature_{out_name}.h5ad")) sample_list = list(adata_fea_gtf.obs_names) # === Step 2: Identify expressed exons df_fea_orig_var = pd.DataFrame(adata_fea_gtf.var) df_fea_gtf = pd.merge(adata_fea_gtf.to_df().T, df_fea_orig_var, how="left",left_index=True, right_index=True) df_fea_gtf["gene_order"] = df_fea_gtf["gene_id"].apply(lambda x: int(x[4:])) df_fea_gtf["gtf_index"] = df_fea_gtf.index df_fea_gtf["exon_order"] = df_fea_gtf["gtf_index"].apply(lambda x: int(x.split("-")[-1])) df_fea_gtf_order = df_fea_gtf.sort_values(by=["gene_order", "exon_order"]) df_fea_gtf_order = df_fea_gtf_order.reset_index(drop=True) df_fea_gtf_order["new_gtf_exon_index"] = df_fea_gtf_order.groupby(["gene_id"]).cumcount()+1 df_fea_gtf_order["new_gtf_index"] = df_fea_gtf_order['gene_id'].astype(str) +"-"+ df_fea_gtf_order["new_gtf_exon_index"].astype(str) df_fea_gtf_new_index = df_fea_gtf_order.set_index("new_gtf_index") df_fea_gtf_new_index = df_fea_gtf_new_index[sample_list] df_fea_gtf_new_index = df_fea_gtf_new_index.T df_fea_exon_keep = df_fea_gtf_new_index.loc[:, (df_fea_gtf_new_index > 0).any()] expressed_exons = df_fea_exon_keep.columns exon_keep_dict = { gene: set(exon.split("-")[-1] for exon in expressed_exons if exon.startswith(gene)) for gene in set(x.rsplit("-", 1)[0] for x in expressed_exons) } # === Step 3: Prepare adjacency DataFrame df_adj = adata_adj.to_df().T df_adj["_ck_empty"] = np.amax(df_adj.to_numpy(), axis=1) df_adj["_idx"] = df_adj.index var_df = pd.DataFrame(adata_adj.var) gene_name_to_id = dict(zip(var_df["gene_name"], var_df["gene_id"])) df_adj["gene_name"] = df_adj["_idx"].apply(lambda x: x[:x.rfind("-")]) df_adj["gene_id"] = df_adj["gene_name"].map(gene_name_to_id) df_adj["adj_vec_size"] = df_adj.groupby("gene_id")["gene_id"].transform("count") df_adj["gene_order"] = df_adj["gene_id"].apply(lambda x: int(x[4:])) df_adj["exon_order"] = df_adj["_idx"].apply(lambda x: int(x.split("-")[-1])) df_adj["exon_name"] = df_adj["gene_id"].astype(str) + "-" + df_adj["exon_order"].astype(str) # === Step 4: Flag zero edges def flag_zero(group): gene_id = group["gene_id"].iloc[0] adj_size = int(group["adj_vec_size"].iloc[0]) n = int(adj_size ** 0.5) matrix = group["_ck_empty"].to_numpy().reshape((n, n)) mask = np.zeros((n, n), dtype=np.uint8) keep_ids = exon_keep_dict.get(gene_id, set()) for i in range(n): if np.all(matrix[i, :] == 0) and np.all(matrix[:, i] == 0) and str(i + 1) not in keep_ids: mask[i, :] = 1 mask[:, i] = 1 return pd.Series(mask.flatten()) # === Step 5: Process in gene_id batches all_flags = [] ordered_gene_ids = df_adj["gene_id"].drop_duplicates().tolist() gene_batches = [ordered_gene_ids[i:i + batch_size] for i in range(0, len(ordered_gene_ids), batch_size)] for batch in tqdm(gene_batches, desc="Processing gene batches"): temp_df = df_adj[df_adj["gene_id"].isin(batch)].copy() temp_df = temp_df.sort_index() flags = temp_df.groupby("gene_id").apply(flag_zero).explode().astype(np.uint8).reset_index(level=0) flags.columns = ["gene_id", "_flag"] flags["exon_idx"] = flags.groupby("gene_id").cumcount() + 1 flags["exon_name"] = flags["gene_id"] + "-" + flags["exon_idx"].astype(str) all_flags.append(flags) df_flags = pd.concat(all_flags, ignore_index=True) # === Step 6: Merge flags with original matrix df_flags = df_flags.set_index("exon_name").reindex(df_adj["exon_name"]).reset_index() df_adj["_flag"] = df_flags["_flag"].values # === Step 7: Filter flagged entries and save df_filtered = df_adj[df_adj["_flag"] == 0].copy() # === Step 8: Convert to h5ad df_filtered["new_index"] = df_filtered.groupby(["gene_id"]).cumcount()+1 df_filtered["var_new_index"] = df_filtered['gene_id'].astype(str) +"-"+ df_filtered["new_index"].astype(str) df_filtered = df_filtered.reset_index(drop=True) ## == step6: adata for adjacency matrix adata_adj_comp = anndata.read_h5ad(os.path.join(final_out_dir, "AdjacencyComp_"+out_name+".h5ad")) obs = pd.DataFrame(adata_adj_comp.obs) ## dataframe for annotating the variables = geneid var_names = df_filtered["var_new_index"].values var = pd.DataFrame(index=var_names) var["gene_id"] = df_filtered["gene_id"].values var["gene_name"] = df_filtered["gene_name"].values # # ##the data matrix X = df_filtered.drop(columns=["gene_id", "gene_name","_ck_empty","_idx","adj_vec_size","_flag", "gene_order", "exon_order","exon_name","new_index","var_new_index"]).T.iloc[:,:].values adata = anndata.AnnData(X=X, obs=obs, var=var) adata.X = csr_matrix(adata.X) adata.write(os.path.join(final_out_dir, "AdjacencyCompRe_" + out_name +".h5ad"))