Upload files to utils
This commit is contained in:
parent aa520b6238
commit 8026ee538e
Binary file not shown.
@@ -0,0 +1,46 @@
"""
FScanpy Configuration Module
"""
import os


class BaseConfig:
    """Base configuration class with virtual paths"""

    # Virtual data paths
    DATA_DIR = "/path/to/data"
    TRAIN_DATA = "/path/to/data/merged_train_data.csv"
    TEST_DATA = "/path/to/data/merged_test_data.csv"
    VALIDATION_DATA = "/path/to/data/merged_validation_data.csv"

    # Virtual model paths
    MODEL_DIR = "/path/to/models"
    BILSTM_MODEL_DIR = "/path/to/models/bilstm"
    GB_MODEL_DIR = "/path/to/models/gradient_boosting"

    # Virtual result paths
    RESULT_DIR = "/path/to/results"
    BILSTM_DIR = "/path/to/results/bilstm"
    GB_DIR = "/path/to/results/gradient_boosting"
    MFEGB_DIR = "/path/to/results/mfe_gb"

    # Virtual log paths (for minimal logging if needed)
    LOG_DIR = "/path/to/logs"
    BILSTM_LOG_DIR = "/path/to/logs/bilstm"
    MFEGB_LOG_DIR = "/path/to/logs/mfe_gb"

    # Virtual plot paths (not used in sanitized version)
    PLOT_DIR = "/path/to/plots"
    BILSTM_PLOT_DIR = "/path/to/plots/bilstm"

    @classmethod
    def create_directories(cls):
        """Create necessary directories (virtual implementation)"""
        # In actual implementation, this would create the directories
        # For publication, this is a placeholder
        directories = [
            cls.DATA_DIR, cls.MODEL_DIR, cls.RESULT_DIR, cls.LOG_DIR,
            cls.BILSTM_MODEL_DIR, cls.GB_MODEL_DIR, cls.BILSTM_DIR,
            cls.GB_DIR, cls.MFEGB_DIR, cls.BILSTM_LOG_DIR, cls.MFEGB_LOG_DIR
        ]
        for directory in directories:
            os.makedirs(directory, exist_ok=True)
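
# Usage sketch (illustrative, editor's assumption): the "/path/to/..." values
# above are virtual placeholders, so directory creation is guarded here; with
# real paths a pipeline would simply call BaseConfig.create_directories() once
# at startup.
if __name__ == "__main__":
    if not BaseConfig.DATA_DIR.startswith("/path/to"):
        BaseConfig.create_directories()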
@@ -0,0 +1,418 @@
"""
Common Functions for FScanpy Models
"""
import os
import numpy as np
import pandas as pd
import pickle
import json
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
    log_loss,
    f1_score
)
from utils.config import BaseConfig


def select_low_confidence_samples_cnn(model, X_unlabeled, unlabeled_data, confidence_threshold=0.5):
    """
    Select low confidence samples and assign pseudo labels

    Args:
        model: Current model
        X_unlabeled: Unlabeled data features
        unlabeled_data: Unlabeled data DataFrame with final_prob column as confidence
        confidence_threshold: Not used, kept for compatibility

    Returns:
        selected: Selected samples with pseudo labels
    """
    # Check if final_prob column exists
    if 'final_prob' not in unlabeled_data.columns:
        return pd.DataFrame()

    # Predict probabilities for unlabeled data
    probs = model.predict(X_unlabeled)

    # For binary classification, Keras outputs positive class probability
    # Build complete probability distribution [1-p, p]
    probs_full = np.column_stack([1 - probs, probs])

    # Calculate entropy (prediction uncertainty)
    epsilon = 1e-15
    probs_full_safe = np.clip(probs_full, epsilon, 1 - epsilon)
    entropy = -np.sum(probs_full_safe * np.log(probs_full_safe), axis=1)

    # Get predicted labels
    preds = (probs > 0.5).astype(int).flatten()

    # Create result DataFrame
    result_df = pd.DataFrame({
        'entropy': entropy,
        'pseudo_label': preds,
        'prob': probs.flatten()
    }, index=unlabeled_data.index)

    # Get final_prob as confidence from original data
    final_probs = unlabeled_data['final_prob'].values

    # Select samples based on: entropy < confidence(final_prob) & prediction_prob > 0.5
    selected_mask = (result_df['entropy'] < final_probs) & (result_df['prob'] > 0.5)

    # Select qualifying samples
    selected = unlabeled_data.loc[result_df[selected_mask].index].copy()

    # Add pseudo labels to selected samples
    if not selected.empty:
        selected['label'] = result_df.loc[result_df[selected_mask].index, 'pseudo_label'].values

    return selected
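
# Illustrative sketch only (not part of the training pipeline): a stand-in
# object with a Keras-style predict() lets the entropy-based selection run
# without TensorFlow; the pool DataFrame and its final_prob confidences are
# made up for the example.
def _demo_select_low_confidence_cnn():
    class StubModel:
        def predict(self, X):
            return np.full((len(X), 1), 0.9)  # confident positive predictions

    pool_df = pd.DataFrame({'full_seq': ['ATGTAA', 'ATGTGA'],
                            'final_prob': [0.8, 0.1]})
    # Only the first row is selected: its entropy (~0.33) is below final_prob=0.8
    return select_low_confidence_samples_cnn(StubModel(), np.zeros((2, 4)), pool_df)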

def convert_numpy_types(obj):
    """
    Recursively convert NumPy data types to Python native types for JSON serialization

    Args:
        obj: Any object that may contain NumPy data types

    Returns:
        Converted object with all NumPy types converted to Python native types
    """
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    elif isinstance(obj, tuple):
        return tuple(convert_numpy_types(item) for item in obj)
    else:
        return obj
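
# Illustrative sketch: NumPy scalars and arrays are not directly JSON
# serializable, so metrics dictionaries are converted to native Python types
# before json.dumps; the values below are made up for the example.
def _demo_convert_numpy_types():
    metrics = {'auc': np.float64(0.93), 'epochs': np.int32(10),
               'history': np.array([0.6, 0.7, 0.8])}
    return json.dumps(convert_numpy_types(metrics))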

def save_training_info(model, training_info, save_dir, model_type="best", is_final_model=False):
    """
    Save model and training information

    Args:
        model: Trained model
        training_info: Training information dictionary
        save_dir: Save directory
        model_type: Model type, "best" for best model, "final" for final model
        is_final_model: Whether this is the final model from self-training
    """
    # Create save directory
    os.makedirs(save_dir, exist_ok=True)

    try:
        # Save model
        model_filename = f"{model_type}_model.h5"
        model_path = os.path.join(save_dir, model_filename)
        model.save(model_path)

        # Prepare training info for JSON serialization
        serializable_info = convert_numpy_types(training_info)

        # Save training info
        info_filename = f"{model_type}_training_info.json"
        info_path = os.path.join(save_dir, info_filename)

        with open(info_path, 'w') as f:
            json.dump(serializable_info, f, indent=2)

        # Save model weights separately
        weights_filename = f"{model_type}_weights.pkl"
        weights_path = os.path.join(save_dir, weights_filename)

        with open(weights_path, 'wb') as f:
            pickle.dump(model.get_weights(), f)

    except Exception:
        # Errors are silently ignored in this sanitized version
        pass
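
# Illustrative sketch: a stand-in object exposing Keras-style save() and
# get_weights() shows the interface save_training_info() expects; the
# temporary directory and dummy training-info values are made up.
def _demo_save_training_info():
    import tempfile

    class StubModel:
        def save(self, path):
            with open(path, 'wb') as f:
                f.write(b'stub')

        def get_weights(self):
            return [np.zeros((2, 2))]

    info = {'epochs': np.int64(5), 'val_auc': np.float64(0.91)}
    save_training_info(StubModel(), info, tempfile.mkdtemp(), model_type="best")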

def load_data(neg_samples=20000):
    """
    Load data

    Args:
        neg_samples: Number of randomly selected EUPLOTES negative samples;
            if None, use all negative samples

    Returns:
        train_data: Training data
        test_data: Test data
        low_conf_data: Low confidence data
        xu_data: Xu dataset as additional validation set
        atkins_data: Atkins dataset as additional validation set
    """
    try:
        # Load merged data files
        train_data = pd.read_csv(BaseConfig.TRAIN_DATA)
        test_data = pd.read_csv(BaseConfig.TEST_DATA)
        validation_data = pd.read_csv(BaseConfig.VALIDATION_DATA)

        # Ensure required columns exist
        required_columns = ['full_seq', 'label', 'source']

        for df in [train_data, test_data, validation_data]:
            for col in required_columns:
                if col not in df.columns:
                    if col == 'label':
                        df[col] = 0
                    elif col == 'source':
                        df[col] = 'unknown'
                    else:
                        df[col] = ''

        # Separate validation datasets
        xu_data = validation_data[validation_data['source'] == 'Xu'].copy()
        atkins_data = validation_data[validation_data['source'] == 'Atkins'].copy()

        # Create low confidence data (placeholder)
        low_conf_data = pd.DataFrame()

        # Sample negative samples if specified
        if neg_samples is not None:
            # Sample from EUPLOTES negative samples in training data
            euplotes_neg = train_data[
                (train_data['source'] == 'EUPLOTES') &
                (train_data['label'] == 0)
            ]

            if len(euplotes_neg) > neg_samples:
                sampled_neg = euplotes_neg.sample(n=neg_samples, random_state=42)
                # Keep positive samples and other sources, replace EUPLOTES negatives
                train_data = pd.concat([
                    train_data[~((train_data['source'] == 'EUPLOTES') & (train_data['label'] == 0))],
                    sampled_neg
                ], ignore_index=True)

        return train_data, test_data, low_conf_data, xu_data, atkins_data

    except Exception:
        # Return empty DataFrames on error
        empty_df = pd.DataFrame()
        return empty_df, empty_df, empty_df, empty_df, empty_df
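
# Illustrative sketch: with BaseConfig's virtual "/path/to/..." placeholders
# the CSV reads fail and load_data() falls back to empty DataFrames; with real
# paths it returns the five datasets described in the docstring.
def _demo_load_data():
    train_data, test_data, low_conf_data, xu_data, atkins_data = load_data(neg_samples=20000)
    return train_data.shape, test_data.shape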

def select_low_confidence_samples_gb(model, X_unlabeled, unlabeled_data, confidence_threshold=0.5):
    """
    Select low confidence samples and assign pseudo labels for GB model

    Args:
        model: Current model
        X_unlabeled: Unlabeled data features
        unlabeled_data: Unlabeled data DataFrame with final_prob column as confidence
        confidence_threshold: Not used, kept for compatibility

    Returns:
        selected: Low confidence samples with pseudo labels and sequence information
    """
    # Check if final_prob column exists
    if 'final_prob' not in unlabeled_data.columns:
        return pd.DataFrame()

    try:
        # Predict probabilities
        probs = model.predict_proba(X_unlabeled)

        # For binary classification, get positive class probability
        if probs.shape[1] == 2:
            pos_probs = probs[:, 1]
        else:
            pos_probs = probs.flatten()

        # Calculate entropy
        epsilon = 1e-15
        probs_safe = np.clip(probs, epsilon, 1 - epsilon)
        entropy = -np.sum(probs_safe * np.log(probs_safe), axis=1)

        # Get predicted labels
        preds = model.predict(X_unlabeled)

        # Create result DataFrame
        result_df = pd.DataFrame({
            'entropy': entropy,
            'pseudo_label': preds,
            'prob': pos_probs
        }, index=unlabeled_data.index)

        # Get confidence from original data
        final_probs = unlabeled_data['final_prob'].values

        # Select samples: entropy < confidence & prediction_prob > 0.5
        selected_mask = (result_df['entropy'] < final_probs) & (result_df['prob'] > 0.5)

        # Select qualifying samples
        selected = unlabeled_data.loc[result_df[selected_mask].index].copy()

        # Add pseudo labels
        if not selected.empty:
            selected['label'] = result_df.loc[result_df[selected_mask].index, 'pseudo_label'].values

        return selected

    except Exception:
        return pd.DataFrame()
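
# Illustrative sketch: a tiny GradientBoostingClassifier stands in for the
# real FScanpy GB model so the selection logic can be exercised; the feature
# matrix and final_prob confidences are randomly generated for the example.
def _demo_select_low_confidence_gb():
    from sklearn.ensemble import GradientBoostingClassifier

    rng = np.random.RandomState(42)
    X_train = rng.rand(40, 4)
    y_train = (X_train[:, 0] > 0.5).astype(int)
    gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

    X_pool = rng.rand(10, 4)
    pool_df = pd.DataFrame({'full_seq': ['ATGTAA'] * 10,
                            'final_prob': rng.rand(10)})
    return select_low_confidence_samples_gb(gb, X_pool, pool_df)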

def evaluate_model_gb(model, X, y):
    """
    Evaluate model performance for GB model

    Args:
        model: Trained model
        X: Feature matrix
        y: Labels

    Returns:
        metrics: Performance metrics dictionary
    """
    default_metrics = {
        'accuracy': 0.0, 'auc': 0.0, 'f1': 0.0,
        'precision': 0.0, 'recall': 0.0, 'loss': float('inf')
    }

    try:
        # Get predictions
        y_pred = model.predict(X)
        y_pred_proba = model.predict_proba(X)

        # Get positive class probabilities
        if y_pred_proba.shape[1] == 2:
            y_pred_prob = y_pred_proba[:, 1]
        else:
            y_pred_prob = y_pred_proba.flatten()

        metrics = default_metrics.copy()

        # Calculate metrics
        try:
            metrics['accuracy'] = accuracy_score(y, y_pred)
        except Exception:
            pass

        try:
            if len(np.unique(y_pred_prob)) > 1:
                metrics['auc'] = roc_auc_score(y, y_pred_prob)
            else:
                metrics['auc'] = 0.5
        except Exception:
            pass

        try:
            metrics['f1'] = f1_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            metrics['precision'] = precision_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            metrics['recall'] = recall_score(y, y_pred, zero_division=0)
        except Exception:
            pass

        try:
            y_pred_prob_safe = np.clip(y_pred_prob, 1e-15, 1 - 1e-15)
            metrics['loss'] = log_loss(y, y_pred_prob_safe)
        except Exception:
            pass

        return metrics

    except Exception:
        return default_metrics
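
# Illustrative sketch: evaluating a toy GradientBoostingClassifier on its own
# training data; real usage passes the held-out test features and labels.
def _demo_evaluate_gb():
    from sklearn.ensemble import GradientBoostingClassifier

    rng = np.random.RandomState(0)
    X = rng.rand(60, 4)
    y = (X[:, 0] > 0.5).astype(int)
    gb = GradientBoostingClassifier(random_state=0).fit(X, y)
    return evaluate_model_gb(gb, X, y)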

def evaluate_model_cnn(model, X_test, y_test):
    """Evaluate CNN model performance"""
    default_metrics = {
        'accuracy': 0.0, 'auc': 0.0, 'f1': 0.0,
        'precision': 0.0, 'recall': 0.0, 'loss': float('inf')
    }

    try:
        # Batch prediction to avoid memory issues
        batch_size = 128
        n_samples = len(X_test)
        n_batches = (n_samples + batch_size - 1) // batch_size

        y_pred = np.zeros(n_samples)
        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, n_samples)
            batch_preds = model.predict(X_test[start_idx:end_idx], verbose=0)

            # Handle multi-output models
            if isinstance(batch_preds, list):
                batch_preds = batch_preds[0]

            # Ensure predictions are 1D
            if len(batch_preds.shape) > 1 and batch_preds.shape[1] > 1:
                batch_preds = batch_preds[:, 1]
            elif len(batch_preds.shape) > 1:
                batch_preds = batch_preds.flatten()

            y_pred[start_idx:end_idx] = batch_preds

        # Convert probabilities to binary predictions
        y_pred_binary = (y_pred > 0.5).astype(int)

        # Ensure labels are 1D
        if len(y_test.shape) > 1:
            y_test = y_test.flatten()

        metrics = default_metrics.copy()

        # Calculate metrics individually
        try:
            metrics['accuracy'] = accuracy_score(y_test, y_pred_binary)
        except Exception:
            pass

        try:
            if len(np.unique(y_pred)) > 1:
                metrics['auc'] = roc_auc_score(y_test, y_pred)
            else:
                # Handle case where all predictions are the same
                if (np.mean(y_pred) > 0.5 and np.mean(y_test) > 0.5) or \
                   (np.mean(y_pred) <= 0.5 and np.mean(y_test) < 0.5):
                    metrics['auc'] = 0.55
                else:
                    metrics['auc'] = 0.45
        except Exception:
            pass

        try:
            metrics['f1'] = f1_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        try:
            metrics['precision'] = precision_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        try:
            metrics['recall'] = recall_score(y_test, y_pred_binary, zero_division=0)
        except Exception:
            pass

        # Calculate loss
        try:
            y_pred_prob = np.clip(y_pred, 1e-15, 1 - 1e-15)
            metrics['loss'] = log_loss(y_test, y_pred_prob)
        except Exception:
            pass

        return metrics

    except Exception:
        return default_metrics
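
# Illustrative sketch: a stand-in object with a Keras-style
# predict(..., verbose=0) lets the batched evaluation run without TensorFlow;
# in practice `model` is the trained network and X_test/y_test the test split.
def _demo_evaluate_cnn():
    class StubModel:
        def predict(self, X, verbose=0):
            return np.full((len(X), 1), 0.9)

    X_test = np.zeros((10, 4))
    y_test = np.array([0, 1] * 5)
    return evaluate_model_cnn(StubModel(), X_test, y_test)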