""" HistGradientBoosting Model with MFE Features """ import os import numpy as np import pandas as pd import itertools from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.metrics import ( roc_auc_score, roc_curve, confusion_matrix, precision_recall_curve, average_precision_score ) from utils.function import evaluate_model_gb from utils.config import BaseConfig class GBConfig: """HistGradientBoostingClassifier model configuration""" # Model training parameters MAX_ITER = 10000 LEARNING_RATE = 0.4 MAX_DEPTH = 5 RANDOM_STATE = 42 # Early stopping parameters EARLY_STOPPING = True N_ITER_NO_CHANGE = 10 SCORING = 'loss' # Sequence parameters SEQUENCE_LENGTH = 33 # Must be multiple of 3 (codon length) # Validation parameters VALIDATION_FRACTION = 0.2 SMALL_VALIDATION_FRACTION = 0.1 def load_data(neg_samples=20000): """Load training and validation data""" try: train_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_train_data.csv")) test_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_test_data.csv")) validation_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_validation_data.csv")) required_columns = ['full_seq', 'label'] for df in [train_data, test_data, validation_data]: for col in required_columns: if col not in df.columns: if col == 'label': df[col] = 0 else: df[col] = '' xu_data = validation_data[validation_data['source'] == 'Xu'].copy() atkins_data = validation_data[validation_data['source'] == 'Atkins'].copy() for df in [xu_data, atkins_data]: for col in required_columns: if col not in df.columns: df[col] = validation_data[col] if col in validation_data.columns else ( 0.0 if col in ['mfe_40bp', 'mfe_120bp'] else ( 0 if col == 'label' else '' ) ) return train_data, test_data, validation_data, xu_data, atkins_data except Exception as e: return None, None, None, None, None def train_hist_model(X_train, y_train, X_test, y_test, sample_weights=None, X_xu=None, y_xu=None, X_atkins=None, y_atkins=None): """Train HistGradientBoostingClassifier model""" # Determine validation fraction validation_fraction = GBConfig.VALIDATION_FRACTION if X_xu is not None or X_atkins is not None: validation_fraction = GBConfig.SMALL_VALIDATION_FRACTION # Create and train model model = HistGradientBoostingClassifier( max_iter=GBConfig.MAX_ITER, learning_rate=GBConfig.LEARNING_RATE, max_depth=GBConfig.MAX_DEPTH, random_state=GBConfig.RANDOM_STATE, early_stopping=GBConfig.EARLY_STOPPING, n_iter_no_change=GBConfig.N_ITER_NO_CHANGE, scoring=GBConfig.SCORING, validation_fraction=validation_fraction ) # Train model model.fit(X_train, y_train, sample_weight=sample_weights) # Evaluate on test set test_metrics = evaluate_model_gb(model, X_test, y_test) # Evaluate on external validation sets xu_metrics = None if X_xu is not None and y_xu is not None: xu_metrics = evaluate_model_gb(model, X_xu, y_xu) atkins_metrics = None if X_atkins is not None and y_atkins is not None: atkins_metrics = evaluate_model_gb(model, X_atkins, y_atkins) # Prepare training info training_info = { 'n_iter': model.n_iter_, 'train_score': model.train_score_, 'validation_scores': model.validation_scores_ if hasattr(model, 'validation_scores_') else None, 'final_metrics': { 'test': test_metrics, 'xu': xu_metrics, 'atkins': atkins_metrics } } return model, test_metrics, training_info def get_feature_names(seq_length=33): """Return feature names including all possible base features and MFE features""" features = [] # Single nucleotide features bases = ['A', 'T', 'C', 'G'] for i in range(seq_length): 
def get_feature_names(seq_length=33):
    """Return feature names including all possible base features and MFE features"""
    features = []

    # Single nucleotide features
    bases = ['A', 'T', 'C', 'G']
    for i in range(seq_length):
        for base in bases:
            features.append(f'pos_{i+1}_{base}')

    # Dinucleotide features
    dinucleotides = [''.join(pair) for pair in itertools.product(bases, repeat=2)]
    for i in range(seq_length - 1):
        for dinuc in dinucleotides:
            features.append(f'dinuc_{i+1}_{dinuc}')

    # Trinucleotide (codon) features
    trinucleotides = [''.join(triplet) for triplet in itertools.product(bases, repeat=3)]
    for i in range(seq_length - 2):
        for trinuc in trinucleotides:
            features.append(f'codon_{i+1}_{trinuc}')

    # MFE features
    features.extend(['mfe_40bp', 'mfe_120bp'])

    return features


def trim_sequence(seq, target_length):
    """Trim sequence from both ends to reach target length, keeping the center position"""
    if len(seq) <= target_length:
        return seq
    excess = len(seq) - target_length
    left_trim = excess // 2
    right_trim = excess - left_trim
    return seq[left_trim:len(seq) - right_trim]


def sequence_to_features(sequence, seq_length=33, mfe_values=None):
    """Convert a DNA sequence to a feature vector including MFE features"""
    # Trim sequence to target length
    trimmed_seq = trim_sequence(sequence.upper(), seq_length)

    # Initialize feature vector
    feature_vector = []

    # Single nucleotide features (one-hot encoding)
    bases = ['A', 'T', 'C', 'G']
    for i in range(seq_length):
        for base in bases:
            if i < len(trimmed_seq) and trimmed_seq[i] == base:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Dinucleotide features
    dinucleotides = [''.join(pair) for pair in itertools.product(bases, repeat=2)]
    for i in range(seq_length - 1):
        for dinuc in dinucleotides:
            if i + 1 < len(trimmed_seq) and trimmed_seq[i:i+2] == dinuc:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Trinucleotide (codon) features
    trinucleotides = [''.join(triplet) for triplet in itertools.product(bases, repeat=3)]
    for i in range(seq_length - 2):
        for trinuc in trinucleotides:
            if i + 2 < len(trimmed_seq) and trimmed_seq[i:i+3] == trinuc:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Add MFE features
    if mfe_values is not None:
        if isinstance(mfe_values, dict):
            feature_vector.append(mfe_values.get('mfe_40bp', 0.0))
            feature_vector.append(mfe_values.get('mfe_120bp', 0.0))
        elif isinstance(mfe_values, (list, tuple)) and len(mfe_values) >= 2:
            feature_vector.extend(mfe_values[:2])
        else:
            feature_vector.extend([0.0, 0.0])
    else:
        feature_vector.extend([0.0, 0.0])

    return np.array(feature_vector)
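
# --- Illustrative sanity check (not part of the original module) ------------------
# The layout above yields 33*4 + 32*16 + 31*64 + 2 = 2630 features for the default
# 33 nt window. This helper only confirms that get_feature_names() and
# sequence_to_features() stay in sync; the example sequence and MFE values are
# arbitrary assumptions.
def _check_feature_layout(seq_length=33):
    """Sketch: verify the feature-name count matches the encoded vector length."""
    names = get_feature_names(seq_length)
    vector = sequence_to_features(
        "ATG" * (seq_length // 3), seq_length,
        {'mfe_40bp': -7.5, 'mfe_120bp': -20.1}  # placeholder MFE values
    )
    assert len(names) == vector.shape[0], (len(names), vector.shape[0])
    return len(names)
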
def prepare_data(train_data, test_data, seq_length=33):
    """Prepare training and test data including MFE features"""
    # Process training data
    X_train = []
    y_train = []
    sample_weights = []

    for _, row in train_data.iterrows():
        sequence = row['full_seq']
        label = row['label']

        # Get MFE values
        mfe_values = {}
        if 'mfe_40bp' in row:
            mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0
        if 'mfe_120bp' in row:
            mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0

        # Convert to features
        features = sequence_to_features(sequence, seq_length, mfe_values)
        X_train.append(features)
        y_train.append(label)

        # Sample weight
        weight = 1.0
        if 'sample_weight' in row and pd.notna(row['sample_weight']):
            weight = row['sample_weight']
        sample_weights.append(weight)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    sample_weights = np.array(sample_weights)

    # Process test data
    X_test = []
    y_test = []

    if test_data is not None and not test_data.empty:
        for _, row in test_data.iterrows():
            sequence = row['full_seq']
            label = row['label']

            # Get MFE values
            mfe_values = {}
            if 'mfe_40bp' in row:
                mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0
            if 'mfe_120bp' in row:
                mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0

            # Convert to features
            features = sequence_to_features(sequence, seq_length, mfe_values)
            X_test.append(features)
            y_test.append(label)

    X_test = np.array(X_test) if X_test else None
    y_test = np.array(y_test) if y_test else None

    return X_train, y_train, X_test, y_test, sample_weights, train_data, test_data


def analyze_feature_importance(model, X_test, y_test, test_data):
    """Analyze feature importance (simplified version)"""
    try:
        # Get feature names
        feature_names = get_feature_names(GBConfig.SEQUENCE_LENGTH)

        # Built-in feature importance (HistGradientBoostingClassifier does not
        # expose feature_importances_, so this branch is skipped for it)
        if hasattr(model, 'feature_importances_'):
            importance_scores = model.feature_importances_

            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance_scores
            }).sort_values('importance', ascending=False)

            # Save results
            importance_path = os.path.join(BaseConfig.GB_DIR, 'feature_importance.csv')
            importance_df.to_csv(importance_path, index=False)

            return {'built_in_importance': importance_df}

        return None
    except Exception:
        return None


def main():
    """Main training function"""
    try:
        # Set sequence length
        sequence_length = GBConfig.SEQUENCE_LENGTH

        # Load data
        train_data, test_data, _, xu_data, atkins_data = load_data()

        # Prepare data
        X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
            train_data, test_data, seq_length=sequence_length
        )

        # Prepare external validation data
        X_xu = y_xu = X_atkins = y_atkins = None
        if xu_data is not None and not xu_data.empty:
            try:
                empty_test = pd.DataFrame(columns=xu_data.columns)
                X_xu, y_xu, _, _, _, _, _ = prepare_data(xu_data, empty_test, seq_length=sequence_length)
            except Exception:
                X_xu = y_xu = None

        if atkins_data is not None and not atkins_data.empty:
            try:
                empty_test = pd.DataFrame(columns=atkins_data.columns)
                X_atkins, y_atkins, _, _, _, _, _ = prepare_data(atkins_data, empty_test, seq_length=sequence_length)
            except Exception:
                X_atkins = y_atkins = None

        # Train model
        model, _, training_info = train_hist_model(
            X_train, y_train, X_test, y_test, sample_weights,
            X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
        )

        # Feature importance analysis
        analyze_feature_importance(model, X_test, y_test, test_data)

        return model, training_info['final_metrics']['test']
    except Exception:
        return None, None


if __name__ == "__main__":
    BaseConfig.create_directories()
    main()
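
# --- Illustrative sketch (not part of the original module) ------------------------
# Because HistGradientBoostingClassifier lacks feature_importances_, the built-in
# branch in analyze_feature_importance() never fires for it. Permutation importance
# from sklearn.inspection is a model-agnostic alternative; the n_repeats, scoring
# and top_n choices below are arbitrary assumptions, not part of the original code.
def permutation_importance_sketch(model, X_test, y_test, top_n=20):
    """Sketch: rank features by mean permutation importance on the test set."""
    from sklearn.inspection import permutation_importance

    result = permutation_importance(
        model, X_test, y_test,
        n_repeats=5,
        random_state=GBConfig.RANDOM_STATE,
        scoring='roc_auc',
    )
    feature_names = get_feature_names(GBConfig.SEQUENCE_LENGTH)
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std,
    }).sort_values('importance_mean', ascending=False)
    return importance_df.head(top_n)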