Source code for DOLPHIN.preprocess.generate_adj_index

import pickle
import pandas as pd
import os

def generate_adj_index_table(exon_pkl_path: str, output_dir: str = "./dolphin_exon_gtf/") -> pd.DataFrame:
    """
    Generate and save an adjacency index table for gene-level exon graphs.

    Reads a pickled pandas DataFrame of exon annotations, counts the exons
    of every gene (genes are kept in order of first appearance) and derives,
    for each gene, the slice it occupies in the concatenation of all
    flattened adjacency matrices::

        ind    = exon_count ** 2
        ind_st = sum of ``ind`` over all preceding genes

    The table is written to ``dolphin_adj_index.csv`` inside ``output_dir``
    (the directory is created if needed) and a confirmation line is printed.

    Parameters
    ----------
    exon_pkl_path : str
        Path to the pickle file (.pkl) holding the exon DataFrame. The
        DataFrame must include a 'gene_id' column.
    output_dir : str, optional
        Directory where ``dolphin_adj_index.csv`` is saved.
        Default is './dolphin_exon_gtf/'.

    Returns
    -------
    adj_df : pandas.DataFrame
        One row per gene with the columns:

        - 'geneid': gene ID
        - 'ind_st': starting index in the concatenated adjacency vector (float)
        - 'ind': length of the gene's flattened adjacency matrix,
          i.e. ``exon_count ** 2`` (float)

        An input without any exon rows yields an empty table that still
        carries these three columns.

    Raises
    ------
    AssertionError
        If the gene order of the result does not match the order in which
        genes first appear in the input (e.g. because some 'gene_id' values
        are missing and were dropped).
    KeyError
        If the DataFrame has no 'gene_id' column.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only pass pickle files that come from a trusted source.
    with open(exon_pkl_path, "rb") as f:
        exon_df = pickle.load(f)

    # Genes in order of first appearance; reindex pins the counts to that
    # order and dropna discards ids for which no group exists.
    gene_ordered = exon_df["gene_id"].drop_duplicates()
    exon_counts = (
        exon_df.groupby("gene_id", sort=False)
        .size()
        .reindex(gene_ordered)
        .dropna()
    )

    # Exclusive cumulative sum: each gene starts where the previous one ended.
    block_sizes = exon_counts * exon_counts
    block_starts = block_sizes.cumsum() - block_sizes

    # Explicit columns keep the schema intact even when there are no genes.
    adj_df = pd.DataFrame(
        {
            "geneid": exon_counts.index.tolist(),
            "ind_st": block_starts.to_numpy(dtype=float),
            "ind": block_sizes.to_numpy(dtype=float),
        },
        columns=["geneid", "ind_st", "ind"],
    )

    # Raised explicitly (instead of a bare ``assert``) so the consistency
    # check is still enforced when Python runs with -O.
    if adj_df["geneid"].tolist() != gene_ordered.tolist():
        raise AssertionError("Gene order mismatch!")

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "dolphin_adj_index.csv")
    adj_df.to_csv(output_path, index=False)
    print(f"[Saved] Adjacency index table saved to: {output_path}")

    return adj_df
def generate_adj_metadata_table(exon_pkl_path: str, output_dir: str = "./dolphin_exon_gtf/") -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Generate metadata tables for the flattened exon adjacency matrices.

    For every gene (in order of first appearance in the exon table) the
    function emits ``exon_count ** 2`` rows, one per entry of the gene's
    flattened adjacency matrix, named ``<gene_name>-1`` ... ``<gene_name>-N``.

    Gene names are made unique and non-missing first:

    - If gene_name is missing or empty, fall back to gene_id.
    - If a gene_name is shared by several gene_ids, disambiguate every
      such gene as ``<gene_name>-<gene_id>``.

    Two CSV files are written to ``output_dir`` (created if needed):
    ``dolphin_adj_metadata_table.csv`` (the per-entry table) and
    ``dolphin_gene_meta.csv`` (the gene_id -> gene_name mapping).

    Parameters
    ----------
    exon_pkl_path : str
        Path to the exon .pkl file. The pickled DataFrame must include
        'gene_id' and 'gene_name' columns.
    output_dir : str, optional
        Output directory for the CSV files. Default is './dolphin_exon_gtf/'.

    Returns
    -------
    meta_df : pandas.DataFrame
        Columns 'Geneid', 'GeneName', 'Gene_Junc_name'; one row per
        adjacency-matrix entry.
    gene_mapping_df : pandas.DataFrame
        Columns 'gene_id', 'gene_name'; one row per gene.

    Both tables keep their columns when the input has no exon rows.

    Raises
    ------
    KeyError
        If a required column is absent, or if a 'gene_id' value has no exon
        count (e.g. a missing gene_id).
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only pass pickle files that come from a trusted source.
    with open(exon_pkl_path, "rb") as f:
        exon_df = pickle.load(f)

    # Exon count per gene, in order of first appearance.
    gene_ordered = exon_df["gene_id"].drop_duplicates()
    exon_counts = (
        exon_df.groupby("gene_id", sort=False)
        .size()
        .reindex(gene_ordered)
        .dropna()
    )
    # Plain dict: O(1) lookups in the loop below instead of Series indexing.
    count_by_gene = exon_counts.to_dict()

    # First gene_name seen for every gene_id (same order as gene_ordered).
    gene_info = exon_df.drop_duplicates("gene_id")[["gene_id", "gene_name"]]
    gene_ids = gene_info["gene_id"].tolist()
    raw_names = gene_info["gene_name"].tolist()

    # Missing or empty names fall back to the gene id.
    filled_names = [
        gene_id if pd.isna(name) or name == "" else name
        for gene_id, name in zip(gene_ids, raw_names)
    ]

    # Names shared by several gene ids get the gene id appended.
    name_counts = Counter(filled_names)
    gene_names = [
        f"{name}-{gene_id}" if name_counts[name] > 1 else name
        for gene_id, name in zip(gene_ids, filled_names)
    ]

    # One row per entry of each gene's flattened (exon x exon) matrix.
    rows = []
    for gene_id, gene_name in zip(gene_ids, gene_names):
        n_junc = int(count_by_gene[gene_id]) ** 2
        rows.extend(
            (gene_id, gene_name, f"{gene_name}-{i}")
            for i in range(1, n_junc + 1)
        )

    # Explicit columns keep the schema intact even when there are no rows.
    meta_df = pd.DataFrame(rows, columns=["Geneid", "GeneName", "Gene_Junc_name"])

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "dolphin_adj_metadata_table.csv")
    meta_df.to_csv(output_path, index=False)
    print(f"[Saved] Adjacency metadata table saved to: {output_path}")

    # Every gene contributes at least one row to meta_df, so the mapping can
    # be built straight from the per-gene lists rather than by de-duplicating
    # the much larger per-entry table.
    gene_mapping_df = pd.DataFrame(
        {"gene_id": gene_ids, "gene_name": gene_names},
        columns=["gene_id", "gene_name"],
    )
    output_path_gene = os.path.join(output_dir, "dolphin_gene_meta.csv")
    gene_mapping_df.to_csv(output_path_gene, index=False)
    print(f"[Saved] Gene metadata table saved to: {output_path_gene}")

    return meta_df, gene_mapping_df