Source code for DOLPHIN.AS.convert_random_psi

import os
import pandas as pd
import numpy as np
import anndata

def run_psi_random(
    outrigger_psi_data: str,
    out_name: str,
    out_directory: str = "./",
    seed_num: int = 0,
) -> "anndata.AnnData":
    """
    Randomly impute missing PSI values for downstream clustering analysis.

    Reads an AnnData ``.h5ad`` file of PSI values (as produced by Outrigger),
    replaces every missing entry (NaN) of the data matrix with a value drawn
    uniformly from the half-open interval [0, 1), and writes the result to a
    new ``.h5ad`` file. Non-missing entries are left untouched. Filling the
    gaps makes the matrix usable by dimensionality-reduction and clustering
    methods (e.g., PCA, Leiden) that cannot handle NaNs.

    Only the data matrix, ``obs`` and ``var`` of the input are carried over
    to the output object; other AnnData slots are not copied.

    Parameters
    ----------
    outrigger_psi_data : str
        Path to the input ``<out_name>_PSI.h5ad`` file containing PSI values
        with NaNs.
    out_name : str
        Output filename prefix (without extension). The result is saved as
        ``<out_name>_PSI_random.h5ad``.
    out_directory : str, optional
        Base directory for the output. The file is written into an
        ``alternative_splicing`` subdirectory, which is created if needed.
        Default is the current directory ("./").
    seed_num : int, optional
        Random seed for reproducibility. Default is 0.

    Returns
    -------
    adata : anndata.AnnData
        An AnnData object with random-imputed PSI values. The same object is
        saved to
        ``<out_directory>/alternative_splicing/<out_name>_PSI_random.h5ad``.

    Raises
    ------
    FileNotFoundError
        If ``outrigger_psi_data`` does not exist. Nothing is created on disk
        in that case.
    """
    # Validate the input first so a bad path does not leave an empty output
    # directory behind.
    if not os.path.exists(outrigger_psi_data):
        raise FileNotFoundError(f"Input file not found: {outrigger_psi_data}")

    final_out_dir = os.path.join(out_directory, "alternative_splicing")
    os.makedirs(final_out_dir, exist_ok=True)

    # A private RandomState yields the same stream as seeding the global
    # NumPy RNG with `seed_num`, but does not clobber the caller's RNG state.
    rng = np.random.RandomState(seed_num)

    source_adata = anndata.read_h5ad(outrigger_psi_data)
    psi_df = source_adata.to_df()

    # Draw a full-size random matrix and take from it only where PSI is NaN;
    # drawing the full shape keeps results identical for a given seed.
    nan_mask = psi_df.isna()
    random_vals = pd.DataFrame(
        rng.uniform(0, 1, psi_df.shape),
        index=psi_df.index,
        columns=psi_df.columns,
    )
    psi_filled = psi_df.where(~nan_mask, random_vals)

    adata = anndata.AnnData(
        psi_filled.values, obs=source_adata.obs, var=source_adata.var
    )
    adata.write(os.path.join(final_out_dir, f"{out_name}_PSI_random.h5ad"))

    return adata