Source code for DOLPHIN.model.run_model

import os
from .train import run_train
import numpy as np
import torch
import pickle
import random

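"""
Main entry point for DOLPHIN: selects the default hyperparameters for the
given data type, applies any user-supplied overrides, and launches training.
"""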

def run_DOLPHIN(data_type, graph_in, fea_in, current_out_path='./', params=None, device='auto', seed_num=0):
    """
    Run the DOLPHIN model on single-cell RNA-seq data to obtain latent cell embeddings.

    Parameters
    ----------
    data_type : str
        Specifies the type of input single-cell RNA-seq data.

        - "full-length": For full-length RNA-seq data.
        - "10x": For 10x Genomics RNA-seq data.

        Any other value falls back to the "10x" defaults and emits a warning.
    graph_in : object
        The input graph structure (precomputed from exon-level data).
    fea_in : anndata.AnnData
        The input feature matrix, provided as an AnnData object.
    current_out_path : str, optional
        Output directory where the resulting cell embeddings will be saved.
        The embeddings will be written to `DOLPHIN_Z.h5ad`. Default is `'./'`.
    params : dict, optional
        A dictionary of model hyperparameters. If not provided, default
        parameters will be used depending on `data_type`. Keys supplied here
        override the matching defaults; keys that are omitted keep their
        default values. Customizable parameters include:

        - "gat_channel" : Number of GAT output channels per head.
        - "nhead" : Number of GAT attention heads.
        - "gat_dropout" : Dropout rate in the GAT layer.
        - "concat" : Boolean flag, `False` in both default sets
          (presumably whether GAT head outputs are concatenated; confirm
          against the model definition).
        - "list_gra_enc_hid" : Encoder MLP hidden layer sizes.
        - "gra_p_dropout" : Dropout rate in the encoder.
        - "z_dim" : Dimensionality of the latent space.
        - "list_fea_dec_hid" : Feature decoder MLP hidden layer sizes.
        - "list_adj_dec_hid" : Adjacency decoder MLP hidden layer sizes.
        - "lr" : Learning rate.
        - "batch" : Mini-batch size.
        - "epochs" : Number of training epochs.
        - "kl_beta" : KL divergence loss weight.
        - "fea_lambda" : Feature reconstruction loss weight.
        - "adj_lambda" : Adjacency reconstruction loss weight.
    device : str, optional
        Computational device to run the model on. Options are:

        - `'auto'` (default): Automatically selects `'cuda'` if a GPU is
          available, otherwise falls back to `'cpu'`.
        - `'cuda'` or `'cuda:0'`: Use the first available GPU.
        - `'cpu'`: Run on CPU only.

        GPU acceleration is recommended for large datasets or faster training.
    seed_num : int, optional
        Random seed for reproducibility. Default is `0`.

    Returns
    -------
    None
        Saves the latent cell embedding matrix to `DOLPHIN_Z.h5ad` under
        `current_out_path`.
    """
    import warnings

    # Seed every random number generator in play so repeated runs match.
    random.seed(seed_num)
    os.environ['PYTHONHASHSEED'] = str(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    torch.cuda.manual_seed(seed_num)
    torch.cuda.manual_seed_all(seed_num)
    torch.backends.cudnn.deterministic = True
    # Autotuning (benchmark=True) may select different kernels from run to
    # run, which defeats the seeding above; keep it off for reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True

    default_params_full_length = {
        "gat_channel": [2],
        "nhead": 9,
        "gat_dropout": 0.3,
        "concat": False,
        "list_gra_enc_hid": [256],
        "gra_p_dropout": 0.2,
        "z_dim": 50,
        "list_fea_dec_hid": [256],
        "list_adj_dec_hid": [128],
        "lr": 1.0e-3,
        "batch": 20,
        "epochs": 200,
        "kl_beta": 0.7,
        "fea_lambda": 0.5,
        "adj_lambda": 0.5,
    }

    default_params_10x = {
        "gat_channel": [9],
        "nhead": 1,
        "gat_dropout": 0.1,
        "concat": False,
        "list_gra_enc_hid": [512],
        "gra_p_dropout": 0.3,
        "z_dim": 35,
        "list_fea_dec_hid": [512],
        "list_adj_dec_hid": [256],
        "lr": 1.0e-3,
        "batch": 20,
        "epochs": 200,
        "kl_beta": 0.7,
        "fea_lambda": 0.5,
        "adj_lambda": 0.5,
    }

    # Select the default parameters based on data_type.
    if data_type == "full-length":
        default_params = default_params_full_length
    else:
        if data_type != "10x":
            # Keep the historical fallback, but make a likely typo visible.
            warnings.warn(
                f"Unrecognized data_type {data_type!r}; expected 'full-length' "
                "or '10x'. Falling back to the '10x' default parameters.",
                stacklevel=2,
            )
        default_params = default_params_10x

    # Start from the defaults and let user-provided values override them.
    merged_params = dict(default_params)
    if params is not None:
        merged_params.update(params)
    params = merged_params

    if device == 'auto':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # str() keeps the log line working if a torch.device object is passed.
    device_name = str(device)
    print(f"[DOLPHIN] Training on device: {device_name.upper()} ({'GPU' if 'cuda' in device_name else 'CPU'})")

    run_train(graph_in, fea_in, current_out_path, params, device=device)