Source code for DOLPHIN.EDEG.generate_EDEG

import pandas as pd
import numpy as np
import statsmodels.stats.multitest as smm
from DOLPHIN.preprocess import gtfpy
import scanpy as sc
from scipy.stats import norm

def weighted_stouffer_method(p_values, weights):
    non_nan_pvals = p_values[~np.isnan(p_values)]
    non_nan_weights = weights[~np.isnan(weights)]
    
    if (len(non_nan_pvals) == 0):
        return np.nan
    
    if (len(non_nan_pvals) != len(non_nan_weights)):
        return np.nan
    
    # Convert p-values to z-scores
    z_scores = norm.isf(non_nan_pvals)  # Inverse survival function (1 - CDF)
    
    # Calculate the weighted z-score
    combined_z = np.sum(non_nan_weights * z_scores) / np.sqrt(np.sum(non_nan_weights**2))
    
    # Convert the combined z-score back to a p-value
    combined_p_value = norm.sf(combined_z)  # Survival function (1 - CDF)
    return combined_p_value

def weighted_avg_log2fc(log2fc, weights):
    non_nan_log2fc = log2fc[~np.isnan(log2fc)]
    non_nan_weights = weights[~np.isnan(weights)]
    
    if (len(non_nan_log2fc) == 0):
        return np.nan
    if (len(non_nan_log2fc) != len(non_nan_weights)):
        return np.nan
    
    return np.sum(np.abs(non_nan_log2fc) * non_nan_weights)

[docs]def run_edeg(seurat_output, adata_input, gtf_path, output):
    """
    Aggregate exon-level marker results into gene-level statistics using weighted methods.

    This function converts exon-level marker results from Seurat using MAST into gene-level statistical
    insights. It applies the Stouffer method to combine exon-level p-values, weighted by
    exon length, and computes average absolute log2 fold changes, also weighted by exon length.

    Parameters
    ----------
    seurat_output : str
        Path to the CSV file containing exon-level differential expression results from Seurat.
        
    adata_input : str
        Path to the .h5ad file used for generating exon-level data.
    
    gtf_path : str
        Path to the GTF-derived annotation file generated by DOLPHIN.
        
    output : str
        Path where the final gene-level results will be saved as a CSV file.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing gene-level statistics including:
        - MAST_weighted_abs_avg_log2FC: Weighted average of absolute log2 fold changes per gene.
        - MAST_weighted_stouffer_pval: Combined p-value using the weighted Stouffer method.
        - MAST_weighted_stouffer_pval_adj_bonf: Bonferroni-adjusted Stouffer p-value.
        
    """
    
    pd_MAST = pd.read_csv(seurat_output)
    pd_MAST = pd_MAST.rename(columns={"Unnamed: 0":"Exon_names"})
    pd_MAST['Exon_names'] = pd_MAST['Exon_names'].str.replace('/', '_')
    pd_MAST['Gene_names'] = pd_MAST['Exon_names'].apply(lambda x: x[:x.rfind('-')] if '-' in x else x)

    df_gtf_orig = gtfpy.readGTF(gtf_path)
    attribute_split = df_gtf_orig['attribute'].str.split(';', expand=True)
    df_gtf = pd.concat([df_gtf_orig, attribute_split], axis=1)[[0, 2, 5, "start", "end"]]
    df_gtf["gene_id"] = df_gtf[0].str.extract(r'"(.*?)"')
    df_gtf["exon_num"] = df_gtf[5].str.extract(r'"(.*?)"')
    df_gtf["_new"] = df_gtf["gene_id"] + "-" + df_gtf["exon_num"]

    adata = sc.read_h5ad(adata_input)
    adata.var["_new"]= adata.var.groupby('gene_id').cumcount() + 1
    adata.var['_new'] = adata.var['gene_id'].astype("str") + '-' + adata.var['_new'].astype(str)

    dict_start = dict(zip(df_gtf['_new'], df_gtf['start']))
    dict_end = dict(zip(df_gtf['_new'], df_gtf['end']))

    adata.var['start'] = adata.var['_new'].map(dict_start)
    adata.var['end'] = adata.var['_new'].map(dict_end)

    dict_map_start = dict(zip(adata.var.index, adata.var["start"]))
    dict_map_end = dict(zip(adata.var.index, adata.var["end"]))
    
    pd_MAST["start"] = pd_MAST["Exon_names"].map(dict_map_start)
    pd_MAST["end"] = pd_MAST["Exon_names"].map(dict_map_end)
    pd_MAST['start'] = pd.to_numeric(pd_MAST['start'], errors='coerce')
    pd_MAST['end'] = pd.to_numeric(pd_MAST['end'], errors='coerce')
    pd_MAST['exon_length'] = pd_MAST['end'] - pd_MAST['start'] +1 # Calculate exon length
    
    pd_MAST['MAST_exon_length'] = np.where(pd_MAST["p_val"].isna(), np.nan, pd_MAST['exon_length'])
    pd_MAST['MAST_exon_weight'] = pd_MAST.groupby('Gene_names')['MAST_exon_length'].transform(lambda x: x / x.sum())
    
    temp_weighted_log2fc = pd_MAST.groupby('Gene_names').apply(
                lambda x: weighted_avg_log2fc(x['avg_log2FC'].values, x['MAST_exon_weight'].values)
            ).reset_index(name='temp_log2fc')
    
    dict_temp_log2fc = dict(zip(temp_weighted_log2fc['Gene_names'], temp_weighted_log2fc['temp_log2fc']))
    pd_MAST['MAST_weighted_abs_avg_log2FC'] = pd_MAST['Gene_names'].map(dict_temp_log2fc)
    
    temp_weighted_stouffer_p = pd_MAST.groupby('Gene_names').apply(
        lambda x: weighted_stouffer_method(x['p_val'].values, x['MAST_exon_weight'].values)
    ).reset_index(name='temp_p_value')

    dict_temp_p_value = dict(zip(temp_weighted_stouffer_p['Gene_names'], temp_weighted_stouffer_p['temp_p_value']))

    pd_MAST['MAST_weighted_stouffer_pval'] = pd_MAST['Gene_names'].map(dict_temp_p_value)

    ## adjusted weighted fisher's p-value    
    df_sub_temp = pd_MAST.dropna(subset=['MAST_weighted_stouffer_pval']).copy()
    df_sub_temp = df_sub_temp.drop_duplicates(subset=['Gene_names'])
    _, adjusted_pvals_bonf, _, _ = smm.multipletests(df_sub_temp['MAST_weighted_stouffer_pval'], method='bonferroni')

    df_sub_temp["temp_adj_p_bonf"] = adjusted_pvals_bonf

    dict_p_adj_bonf = dict(zip(df_sub_temp["Gene_names"], df_sub_temp["temp_adj_p_bonf"]))

    pd_MAST["MAST_weighted_stouffer_pval_adj_bonf"] = pd_MAST["Gene_names"].map(dict_p_adj_bonf)
    
    pd_MAST = pd_MAST.drop(columns=["start","end","exon_length","MAST_exon_length"])
                 
    pd_MAST.to_csv(output, index=False)
    
    return pd_MAST