"""
|
|
Common Functions for FScanpy Models
|
|
"""
|
|
import os
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pickle
|
|
import json
|
|
from sklearn.metrics import (
|
|
accuracy_score,
|
|
recall_score,
|
|
precision_score,
|
|
roc_auc_score,
|
|
log_loss,
|
|
f1_score
|
|
)
|
|
from utils.config import BaseConfig
|
|
|
|
def select_low_confidence_samples_cnn(model, X_unlabeled, unlabeled_data, confidence_threshold=0.5):
    """
    Select low-confidence samples and assign pseudo labels.

    Args:
        model: Current model
        X_unlabeled: Unlabeled data features
        unlabeled_data: Unlabeled data DataFrame with a 'final_prob' column used as confidence
        confidence_threshold: Not used, kept for compatibility

    Returns:
        selected: Selected samples with pseudo labels
    """
    # Check that the final_prob column exists
    if 'final_prob' not in unlabeled_data.columns:
        return pd.DataFrame()

    # Predict probabilities for the unlabeled data
    probs = model.predict(X_unlabeled)

    # For binary classification, Keras outputs the positive-class probability;
    # build the complete probability distribution [1 - p, p]
    probs_full = np.column_stack([1 - probs, probs])

    # Calculate entropy (prediction uncertainty)
    epsilon = 1e-15
    probs_full_safe = np.clip(probs_full, epsilon, 1 - epsilon)
    entropy = -np.sum(probs_full_safe * np.log(probs_full_safe), axis=1)

    # Get predicted labels
    preds = (probs > 0.5).astype(int).flatten()

    # Create the result DataFrame
    result_df = pd.DataFrame({
        'entropy': entropy,
        'pseudo_label': preds,
        'prob': probs.flatten()
    }, index=unlabeled_data.index)

    # Get final_prob as confidence from the original data
    final_probs = unlabeled_data['final_prob'].values

    # Select samples where entropy < confidence (final_prob) and prediction prob > 0.5
    selected_mask = (result_df['entropy'] < final_probs) & (result_df['prob'] > 0.5)

    # Select qualifying samples
    selected = unlabeled_data.loc[result_df[selected_mask].index].copy()

    # Add pseudo labels to the selected samples
    if not selected.empty:
        selected['label'] = result_df.loc[result_df[selected_mask].index, 'pseudo_label'].values

    return selected
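
# Example usage of select_low_confidence_samples_cnn (illustrative sketch only;
# `cnn_model`, `X_unlab`, and `unlab_df` are hypothetical objects, not defined here):
#
#     pseudo_labeled = select_low_confidence_samples_cnn(cnn_model, X_unlab, unlab_df)
#     print(len(pseudo_labeled), "samples received pseudo labels")
#
# Note: with two classes the prediction entropy lies in [0, ln 2 ≈ 0.693], so a row
# is kept only when its 'final_prob' confidence exceeds that entropy and the model
# predicts the positive class with probability > 0.5.
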

def convert_numpy_types(obj):
    """
    Recursively convert NumPy data types to Python native types for JSON serialization.

    Args:
        obj: Any object that may contain NumPy data types

    Returns:
        Converted object with all NumPy types replaced by Python native types
    """
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    else:
        return obj
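
# Example usage of convert_numpy_types (illustrative sketch; the dictionary below is
# made up for demonstration):
#
#     info = {'auc': np.float64(0.93), 'epochs': np.int32(20), 'history': np.array([0.1, 0.2])}
#     json.dumps(convert_numpy_types(info))
#     # works, whereas json.dumps(info) would fail on the NumPy integer and array values
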

def save_training_info(model, training_info, save_dir, model_type="best", is_final_model=False):
    """
    Save the model and its training information.

    Args:
        model: Trained model
        training_info: Training information dictionary
        save_dir: Save directory
        model_type: Model type, "best" for the best model, "final" for the final model
        is_final_model: Whether this is the final model from self-training
    """
    # Create the save directory
    os.makedirs(save_dir, exist_ok=True)

    try:
        # Save the model
        model_filename = f"{model_type}_model.h5"
        model_path = os.path.join(save_dir, model_filename)
        model.save(model_path)

        # Prepare training info for JSON serialization
        serializable_info = convert_numpy_types(training_info)

        # Save training info
        info_filename = f"{model_type}_training_info.json"
        info_path = os.path.join(save_dir, info_filename)

        with open(info_path, 'w') as f:
            json.dump(serializable_info, f, indent=2)

        # Save model weights separately
        weights_filename = f"{model_type}_weights.pkl"
        weights_path = os.path.join(save_dir, weights_filename)

        with open(weights_path, 'wb') as f:
            pickle.dump(model.get_weights(), f)

    except Exception:
        # Saving is best-effort: any failure is silently ignored
        pass
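
# Example usage of save_training_info (illustrative sketch; `cnn_model` and the
# training_info contents are hypothetical):
#
#     save_training_info(
#         cnn_model,
#         {'epochs': 20, 'val_auc': 0.93},
#         save_dir='results/cnn',
#         model_type='best'
#     )
#     # writes best_model.h5, best_training_info.json and best_weights.pkl into results/cnn
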

def load_data(neg_samples=20000):
    """
    Load the training, test, and validation data.

    Args:
        neg_samples: Number of randomly selected EUPLOTES negative samples;
            if None, use all negative samples

    Returns:
        train_data: Training data
        test_data: Test data
        low_conf_data: Low confidence data
        xu_data: Xu dataset used as an additional validation set
        atkins_data: Atkins dataset used as an additional validation set
    """
    try:
        # Load merged data files
        train_data = pd.read_csv(BaseConfig.TRAIN_DATA)
        test_data = pd.read_csv(BaseConfig.TEST_DATA)
        validation_data = pd.read_csv(BaseConfig.VALIDATION_DATA)

        # Ensure required columns exist
        required_columns = ['full_seq', 'label', 'source']

        for df in [train_data, test_data, validation_data]:
            for col in required_columns:
                if col not in df.columns:
                    if col == 'label':
                        df[col] = 0
                    elif col == 'source':
                        df[col] = 'unknown'
                    else:
                        df[col] = ''

        # Separate the validation datasets by source
        xu_data = validation_data[validation_data['source'] == 'Xu'].copy()
        atkins_data = validation_data[validation_data['source'] == 'Atkins'].copy()

        # Create low confidence data (placeholder)
        low_conf_data = pd.DataFrame()

        # Subsample negative samples if requested
        if neg_samples is not None:
            # Sample from the EUPLOTES negative samples in the training data
            euplotes_neg = train_data[
                (train_data['source'] == 'EUPLOTES') &
                (train_data['label'] == 0)
            ]

            if len(euplotes_neg) > neg_samples:
                sampled_neg = euplotes_neg.sample(n=neg_samples, random_state=42)
                # Keep positive samples and other sources, replace the EUPLOTES negatives
                train_data = pd.concat([
                    train_data[~((train_data['source'] == 'EUPLOTES') & (train_data['label'] == 0))],
                    sampled_neg
                ], ignore_index=True)

        return train_data, test_data, low_conf_data, xu_data, atkins_data

    except Exception:
        # Return empty DataFrames on error
        empty_df = pd.DataFrame()
        return empty_df, empty_df, empty_df, empty_df, empty_df
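
# Example usage of load_data (illustrative sketch; the CSV paths come from BaseConfig):
#
#     train_df, test_df, low_conf_df, xu_df, atkins_df = load_data(neg_samples=20000)
#     print(train_df['label'].value_counts())
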

def select_low_confidence_samples_gb(model, X_unlabeled, unlabeled_data, confidence_threshold=0.5):
    """
    Select low-confidence samples and assign pseudo labels for the GB model.

    Args:
        model: Current model
        X_unlabeled: Unlabeled data features
        unlabeled_data: Unlabeled data DataFrame with a 'final_prob' column used as confidence
        confidence_threshold: Not used, kept for compatibility

    Returns:
        selected: Low confidence samples with pseudo labels and sequence information
    """
    # Check that the final_prob column exists
    if 'final_prob' not in unlabeled_data.columns:
        return pd.DataFrame()

    try:
        # Predict probabilities
        probs = model.predict_proba(X_unlabeled)

        # For binary classification, take the positive-class probability
        if probs.shape[1] == 2:
            pos_probs = probs[:, 1]
        else:
            pos_probs = probs.flatten()

        # Calculate entropy
        epsilon = 1e-15
        probs_safe = np.clip(probs, epsilon, 1 - epsilon)
        entropy = -np.sum(probs_safe * np.log(probs_safe), axis=1)

        # Get predicted labels
        preds = model.predict(X_unlabeled)

        # Create the result DataFrame
        result_df = pd.DataFrame({
            'entropy': entropy,
            'pseudo_label': preds,
            'prob': pos_probs
        }, index=unlabeled_data.index)

        # Get confidence from the original data
        final_probs = unlabeled_data['final_prob'].values

        # Select samples where entropy < confidence (final_prob) and prediction prob > 0.5
        selected_mask = (result_df['entropy'] < final_probs) & (result_df['prob'] > 0.5)

        # Select qualifying samples
        selected = unlabeled_data.loc[result_df[selected_mask].index].copy()

        # Add pseudo labels
        if not selected.empty:
            selected['label'] = result_df.loc[result_df[selected_mask].index, 'pseudo_label'].values

        return selected

    except Exception:
        return pd.DataFrame()
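
# Example usage of select_low_confidence_samples_gb (illustrative sketch; any
# scikit-learn-style classifier exposing predict/predict_proba fits this interface,
# e.g. a hypothetical fitted GradientBoostingClassifier `gb_model`):
#
#     pseudo_labeled = select_low_confidence_samples_gb(gb_model, X_unlab, unlab_df)
#     train_aug = pd.concat([train_df, pseudo_labeled], ignore_index=True)
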

def evaluate_model_gb(model, X, y):
    """
    Evaluate model performance for the GB model.

    Args:
        model: Trained model
        X: Feature matrix
        y: Labels

    Returns:
        metrics: Performance metrics dictionary
    """
    default_metrics = {
        'accuracy': 0.0, 'auc': 0.0, 'f1': 0.0,
        'precision': 0.0, 'recall': 0.0, 'loss': float('inf')
    }

    try:
        # Get predictions
        y_pred = model.predict(X)
        y_pred_proba = model.predict_proba(X)

        # Get positive-class probabilities
        if y_pred_proba.shape[1] == 2:
            y_pred_prob = y_pred_proba[:, 1]
        else:
            y_pred_prob = y_pred_proba.flatten()

        metrics = default_metrics.copy()

        # Calculate each metric independently so one failure does not void the rest
        try:
            metrics['accuracy'] = accuracy_score(y, y_pred)
        except Exception:
            pass

        try:
            if len(np.unique(y_pred_prob)) > 1:
                metrics['auc'] = roc_auc_score(y, y_pred_prob)
            else:
                metrics['auc'] = 0.5
        except Exception:
            pass

        try:
            metrics['f1'] = f1_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            metrics['precision'] = precision_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            metrics['recall'] = recall_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            y_pred_prob_safe = np.clip(y_pred_prob, 1e-15, 1 - 1e-15)
            metrics['loss'] = log_loss(y, y_pred_prob_safe)
        except Exception:
            pass

        return metrics

    except Exception:
        return default_metrics
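
# Example usage of evaluate_model_gb (illustrative sketch; `gb_model`, `X_test_feat`
# and `y_test` are hypothetical):
#
#     metrics = evaluate_model_gb(gb_model, X_test_feat, y_test)
#     print(f"AUC={metrics['auc']:.3f}  F1={metrics['f1']:.3f}")
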

def evaluate_model_cnn(model, X_test, y_test):
    """Evaluate CNN model performance."""
    default_metrics = {
        'accuracy': 0.0, 'auc': 0.0, 'f1': 0.0,
        'precision': 0.0, 'recall': 0.0, 'loss': float('inf')
    }

    try:
        # Predict in batches to avoid memory issues
        batch_size = 128
        n_samples = len(X_test)
        n_batches = (n_samples + batch_size - 1) // batch_size

        y_pred = np.zeros(n_samples)
        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, n_samples)
            batch_preds = model.predict(X_test[start_idx:end_idx], verbose=0)

            # Handle multi-output models
            if isinstance(batch_preds, list):
                batch_preds = batch_preds[0]

            # Ensure predictions are 1D
            if len(batch_preds.shape) > 1 and batch_preds.shape[1] > 1:
                batch_preds = batch_preds[:, 1]
            elif len(batch_preds.shape) > 1:
                batch_preds = batch_preds.flatten()

            y_pred[start_idx:end_idx] = batch_preds

        # Convert probabilities to binary predictions
        y_pred_binary = (y_pred > 0.5).astype(int)

        # Ensure labels are 1D
        if len(y_test.shape) > 1:
            y_test = y_test.flatten()

        metrics = default_metrics.copy()

        # Calculate each metric individually
        try:
            metrics['accuracy'] = accuracy_score(y_test, y_pred_binary)
        except Exception:
            pass

        try:
            if len(np.unique(y_pred)) > 1:
                metrics['auc'] = roc_auc_score(y_test, y_pred)
            else:
                # All predictions identical: assign a nominal AUC depending on whether
                # the constant prediction agrees with the majority label
                if (np.mean(y_pred) > 0.5 and np.mean(y_test) > 0.5) or \
                   (np.mean(y_pred) <= 0.5 and np.mean(y_test) < 0.5):
                    metrics['auc'] = 0.55
                else:
                    metrics['auc'] = 0.45
        except Exception:
            pass

        try:
            metrics['f1'] = f1_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        try:
            metrics['precision'] = precision_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        try:
            metrics['recall'] = recall_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        # Calculate loss
        try:
            y_pred_prob = np.clip(y_pred, 1e-15, 1 - 1e-15)
            metrics['loss'] = log_loss(y_test, y_pred_prob)
        except Exception:
            pass

        return metrics

    except Exception:
        return default_metrics
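
# Example usage of evaluate_model_cnn (illustrative sketch; `cnn_model`, `X_test_enc`
# and `y_test` are hypothetical):
#
#     metrics = evaluate_model_cnn(cnn_model, X_test_enc, y_test)
#     print(f"accuracy={metrics['accuracy']:.3f}  loss={metrics['loss']:.3f}")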