Source code for DOLPHIN.graph_generation.process_feature_matrix

from tqdm import tqdm
import math
import anndata
import os
import pandas as pd
import torch
from .func_step01_fea_mat_main_part1 import combine_fea
from .func_step01_fea_mat_main_part2 import fea_comp

def run_feature_combination(
    metadata_path: str,
    graph_directory: str,
    gene_annotation,
    gtf_pkl_path: str,
    out_name: str,
    out_directory: str = "./",
    fea_run_num: int = 100,
    clean_temp: bool = True,
) -> None:
    """
    Build the feature matrix in batches and merge the batches into one AnnData file.

    The metadata table is read, ``combine_fea`` is called once per batch of
    ``fea_run_num`` rows, and the per-batch ``.h5ad`` files found in the
    temporary folder are concatenated into ``Feature_<out_name>.h5ad``.
    ``fea_comp`` is then run on the output folder; per the project
    documentation it removes exons that are zero across all cells.

    Parameters
    ----------
    metadata_path : str
        Path to the cell metadata table. It is read as a tab-separated file,
        one row per cell.
    graph_directory : str
        Path to the directory containing graph input files (forwarded to
        ``combine_fea``).
    gene_annotation : Any
        Gene annotation data, forwarded unchanged to ``combine_fea``.
    gtf_pkl_path : str
        Path to the GTF pickle file (forwarded to ``combine_fea``).
    out_name : str
        Name stem used in the output file names, e.g. ``Feature_<out_name>.h5ad``.
    out_directory : str
        Base output directory. Results are written to ``<out_directory>/data``
        and batch files to ``<out_directory>/data/temp``. Default is ``"./"``.
    fea_run_num : int
        Number of cells to process per batch, default is 100. Must be >= 1.
    clean_temp : bool
        Whether to delete the temporary folder (and therefore the batch-wise
        files) after processing. Default is True.

    Returns
    -------
    None
        Output is written to disk:
        - Batch-wise files ``temp/Feature_<out_name>_<k>.h5ad`` (removed when
          ``clean_temp`` is True).
        - The merged file ``Feature_<out_name>.h5ad``.
        - Whatever ``fea_comp`` writes; the project documentation names it
          ``FeatureComp_<out_name>.h5ad``.

    Raises
    ------
    ValueError
        If ``fea_run_num`` is smaller than 1, or if the metadata table has no
        rows. Both are checked before any directory is created.
    """
    # Validate up front: a non-positive batch size or an empty metadata table
    # would otherwise leave the merged object unbound and fail much later with
    # an opaque UnboundLocalError, after directories were already created.
    if fea_run_num < 1:
        raise ValueError(
            f"fea_run_num must be a positive integer, got {fea_run_num!r}"
        )

    df_label = pd.read_csv(metadata_path, sep='\t')
    total_sample_size = len(df_label)
    if total_sample_size == 0:
        raise ValueError(
            f"No cells found in metadata file {metadata_path!r}; nothing to combine."
        )

    # Set output and temp directories
    final_out_dir = os.path.join(out_directory, "data")
    os.makedirs(final_out_dir, exist_ok=True)
    temp_out_dir = os.path.join(final_out_dir, "temp")
    os.makedirs(temp_out_dir, exist_ok=True)

    print("Start Combining Feature Matrix...")
    with tqdm(total=total_sample_size, desc="Combining Features") as pbar_fea:
        for start_idx in range(0, total_sample_size, fea_run_num):
            # combine_fea hands the progress bar back; keep using the returned one.
            pbar_fea = combine_fea(
                pbar_fea,
                df_label,
                graph_directory,
                gene_annotation,
                gtf_pkl_path,
                start_idx=start_idx,
                sample_num=fea_run_num,
                output_path=temp_out_dir,
                output_name=out_name
            )

    # Merge the per-batch .h5ad files, in batch order, into one AnnData object.
    total_number_fea_anndata = math.ceil(total_sample_size / fea_run_num)
    combine_anndata_fea = None
    for batch_idx in range(total_number_fea_anndata):
        batch_file = os.path.join(
            temp_out_dir, f"Feature_{out_name}_{batch_idx}.h5ad"
        )
        batch_ad = anndata.read_h5ad(batch_file)
        if combine_anndata_fea is None:
            combine_anndata_fea = batch_ad
        else:
            # NOTE(review): AnnData.concatenate is reported as deprecated in
            # newer anndata releases in favour of anndata.concat. It is kept
            # here because the two may merge .var differently - confirm
            # before migrating.
            combine_anndata_fea = combine_anndata_fea.concatenate(
                batch_ad, index_unique=None, batch_key=None
            )
    combine_anndata_fea.write(
        os.path.join(final_out_dir, f"Feature_{out_name}.h5ad")
    )

    fea_comp(final_out_dir, out_name)

    if clean_temp:
        import shutil
        print("Cleaning up temporary files...")
        shutil.rmtree(temp_out_dir)