# Source code for DOLPHIN.preprocess.generate_adj_index
import pickle
import pandas as pd
import os
def generate_adj_index_table(
    exon_pkl_path: str,
    output_dir: str = "./dolphin_exon_gtf/",
) -> pd.DataFrame:
    """
    Generate and save an adjacency index table for gene-level exon graphs.

    The function reads a pickled pandas DataFrame of exon annotations, counts
    the exons of every gene (genes are kept in order of first appearance in
    the ``gene_id`` column) and derives, for each gene, the slice it occupies
    in the concatenation of all flattened adjacency matrices::

        ind    = exon_count ** 2
        ind_st = sum of ``ind`` over all preceding genes

    The table is written to ``dolphin_adj_index.csv`` inside ``output_dir``
    (the directory is created if it does not exist).

    Parameters
    ----------
    exon_pkl_path : str
        Path to the pickle file (.pkl) containing the exon DataFrame. The
        DataFrame must include a 'gene_id' column.
    output_dir : str, optional
        Directory where ``dolphin_adj_index.csv`` is saved.
        Default is './dolphin_exon_gtf/'.

    Returns
    -------
    adj_df : pandas.DataFrame
        One row per gene with the columns:

        - 'geneid': gene ID
        - 'ind_st': starting index in the concatenated adjacency vector (float)
        - 'ind': length of the gene's flattened adjacency matrix,
          i.e. ``exon_count ** 2`` (float)

        An input without any exon rows yields an empty table that still has
        these three columns.

    Raises
    ------
    KeyError
        If the DataFrame has no 'gene_id' column.
    AssertionError
        If the gene order of the output does not match the order in which
        genes first appear in the input. This happens, for example, when
        'gene_id' contains missing values, because such rows do not end up
        in any group.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only pass files that come from a trusted source.
    with open(exon_pkl_path, "rb") as f:
        exon_df = pickle.load(f)

    # Genes in order of first appearance; reindexing puts the per-gene exon
    # counts into that same order, and dropna() discards ids without a group.
    gene_ordered = exon_df["gene_id"].drop_duplicates()
    exon_counts = (
        exon_df.groupby("gene_id", sort=False)
        .size()
        .reindex(gene_ordered)
        .dropna()
    )

    # Exclusive prefix sum: the start offset of a gene is the running total
    # of all block sizes up to and including it, minus its own block size.
    ind = exon_counts * exon_counts
    ind_st = ind.cumsum() - ind

    # Building from named columns keeps the schema intact for empty input.
    adj_df = pd.DataFrame(
        {
            "geneid": exon_counts.index.tolist(),
            "ind_st": ind_st.to_numpy(dtype=float),
            "ind": ind.to_numpy(dtype=float),
        }
    )

    # Explicit raise instead of `assert` so the check survives `python -O`.
    if adj_df["geneid"].tolist() != gene_ordered.tolist():
        raise AssertionError("Gene order mismatch!")

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "dolphin_adj_index.csv")
    adj_df.to_csv(output_path, index=False)
    print(f"[Saved] Adjacency index table saved to: {output_path}")
    return adj_df
def generate_adj_metadata_table(
    exon_pkl_path: str,
    output_dir: str = "./dolphin_exon_gtf/",
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Generate metadata tables for the flattened exon adjacency matrices.

    For every gene (in order of first appearance in the 'gene_id' column)
    the function emits ``exon_count ** 2`` rows, one per entry of the gene's
    flattened adjacency matrix, named ``<gene_name>-1`` ... ``<gene_name>-N``.

    Gene names are made non-missing and unique first:

    - If gene_name is missing or an empty string, gene_id is used instead.
    - If a gene_name is shared by several gene_ids, each of them is renamed
      to ``<gene_name>-<gene_id>``.

    Two CSV files are written to ``output_dir`` (created if necessary):
    ``dolphin_adj_metadata_table.csv`` and ``dolphin_gene_meta.csv``.

    Parameters
    ----------
    exon_pkl_path : str
        Path to the exon .pkl file. The pickled DataFrame must contain the
        columns 'gene_id' and 'gene_name'.
    output_dir : str, optional
        Output directory for the CSV files. Default is './dolphin_exon_gtf/'.

    Returns
    -------
    meta_df : pandas.DataFrame
        One row per adjacency entry with the columns 'Geneid', 'GeneName'
        and 'Gene_Junc_name'.
    gene_mapping_df : pandas.DataFrame
        One row per gene with the columns 'gene_id' and 'gene_name'
        (the cleaned, unique names).

    Raises
    ------
    KeyError
        If the DataFrame lacks the 'gene_id' or 'gene_name' column.
    ValueError
        If 'gene_id' contains missing values; such rows cannot be assigned
        to a gene, so no consistent table can be built.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only pass files that come from a trusted source.
    with open(exon_pkl_path, "rb") as f:
        exon_df = pickle.load(f)

    gene_ordered = exon_df["gene_id"].drop_duplicates()
    if gene_ordered.isna().any():
        # A missing id has no group, which would turn the counts into floats
        # containing NaN and fail later with an opaque error.
        raise ValueError("The 'gene_id' column contains missing values.")

    # Per-gene exon counts, aligned with the first-appearance gene order.
    exon_counts = (
        exon_df.groupby("gene_id", sort=False).size().reindex(gene_ordered)
    )

    # First row of every gene, in the same first-appearance order as above.
    gene_info = exon_df.drop_duplicates("gene_id")[["gene_id", "gene_name"]]
    gene_ids = gene_info["gene_id"].tolist()

    # Fall back to the gene id where the name is missing or empty.
    gene_names = [
        gid if (pd.isna(name) or name == "") else name
        for gid, name in zip(gene_ids, gene_info["gene_name"].tolist())
    ]

    # Disambiguate names that are shared by more than one gene id.
    name_counts = pd.Series(gene_names, dtype=object).value_counts()
    duplicated_names = set(name_counts[name_counts > 1].index)
    gene_names = [
        f"{name}-{gid}" if name in duplicated_names else name
        for gid, name in zip(gene_ids, gene_names)
    ]

    # Column-wise construction: avoids one dict per output row and keeps the
    # column schema even when there are no genes at all.
    geneid_col = []
    gene_name_col = []
    junc_name_col = []
    for geneid, gene_name, exon_count in zip(
        gene_ids, gene_names, exon_counts.tolist()
    ):
        n_junc = exon_count * exon_count
        geneid_col.extend([geneid] * n_junc)
        gene_name_col.extend([gene_name] * n_junc)
        junc_name_col.extend(f"{gene_name}-{i}" for i in range(1, n_junc + 1))

    meta_df = pd.DataFrame(
        {
            "Geneid": geneid_col,
            "GeneName": gene_name_col,
            "Gene_Junc_name": junc_name_col,
        }
    )
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "dolphin_adj_metadata_table.csv")
    meta_df.to_csv(output_path, index=False)
    print(f"[Saved] Adjacency metadata table saved to: {output_path}")

    # The gene id -> gene name mapping is already known per gene, so it is
    # built directly rather than by de-duplicating the per-junction table.
    gene_mapping_df = pd.DataFrame(
        {"gene_id": gene_ids, "gene_name": gene_names}
    )
    output_path_gene = os.path.join(output_dir, "dolphin_gene_meta.csv")
    gene_mapping_df.to_csv(output_path_gene, index=False)
    print(f"[Saved] Gene metadata table saved to: {output_path_gene}")
    return meta_df, gene_mapping_df