Upload files to train_models
This commit is contained in: commit aa520b6238
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,507 @@
"""
BiLSTM-CNN Model for Sequence Classification
"""
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, log_loss
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from utils.function import load_data, save_training_info, select_low_confidence_samples_cnn, evaluate_model_cnn
from utils.config import BaseConfig

# Set random seeds
np.random.seed(42)
tf.random.set_seed(42)

class MetricsCallback(tf.keras.callbacks.Callback):
    """Callback for recording training metrics"""

    def __init__(self):
        super().__init__()
        self.training_metrics = {
            'train_loss': [], 'train_auc': [], 'train_accuracy': [],
            'train_recall': [], 'train_precision': [], 'train_f1': [],
            'test_loss': [], 'test_auc': [], 'test_accuracy': [],
            'test_recall': [], 'test_precision': [], 'test_f1': []
        }

        self.iteration_metrics = {
            'samples_added': [0],
            'total_samples': []
        }

        self.best_model = None
        self.best_test_loss = float('inf')
        self.best_epoch = -1
        self.best_predictions = None

        self.xu_metrics_history = {
            'loss': [], 'auc': [], 'accuracy': [],
            'recall': [], 'precision': [], 'f1': []
        }
        self.atkins_metrics_history = {
            'loss': [], 'auc': [], 'accuracy': [],
            'recall': [], 'precision': [], 'f1': []
        }

        self.self_training_best_model = None
        self.self_training_best_loss = float('inf')
        self.self_training_best_metrics = None

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        try:
            train_loss = logs.get('loss', 0.0)
            val_loss = logs.get('val_loss', 0.0)

            train_metrics = {
                'loss': train_loss,
                'auc': logs.get('auc', 0.0),
                'accuracy': logs.get('accuracy', 0.0),
                'recall': logs.get('recall', 0.0),
                'precision': 0.0,
                'f1': 0.0
            }

            # Calculate test metrics using batch processing
            batch_size = 128
            n_test_samples = len(self.model.X_test)
            n_test_batches = (n_test_samples + batch_size - 1) // batch_size

            test_probs = np.zeros(n_test_samples)
            try:
                for i in range(n_test_batches):
                    start_idx = i * batch_size
                    end_idx = min((i + 1) * batch_size, n_test_samples)
                    batch_probs = self.model.predict(self.model.X_test[start_idx:end_idx], verbose=0)
                    if isinstance(batch_probs, list):
                        batch_probs = batch_probs[0]
                    if len(batch_probs.shape) > 1:
                        batch_probs = batch_probs.flatten()
                    test_probs[start_idx:end_idx] = batch_probs

                test_preds = (test_probs > 0.5).astype(int)

                test_metrics = {
                    'loss': log_loss(self.model.y_test, np.clip(test_probs, 1e-15, 1 - 1e-15)),
                    'auc': roc_auc_score(self.model.y_test, test_probs) if len(np.unique(test_probs)) > 1 else 0.5,
                    'accuracy': accuracy_score(self.model.y_test, test_preds),
                    'recall': recall_score(self.model.y_test, test_preds, zero_division=0),
                    'precision': precision_score(self.model.y_test, test_preds, zero_division=0),
                    'f1': f1_score(self.model.y_test, test_preds, zero_division=0)
                }

            except Exception as e:
                test_metrics = {
                    'loss': float('inf'), 'auc': 0.0, 'accuracy': 0.0,
                    'recall': 0.0, 'precision': 0.0, 'f1': 0.0
                }

            # Record metrics
            for key in self.training_metrics:
                if key.startswith('train_'):
                    metric_name = key[6:]
                    self.training_metrics[key].append(train_metrics.get(metric_name, 0.0))
                elif key.startswith('test_'):
                    metric_name = key[5:]
                    self.training_metrics[key].append(test_metrics.get(metric_name, 0.0))

            # Update best model based on test loss
            if test_metrics['loss'] < self.best_test_loss:
                self.best_test_loss = test_metrics['loss']
                self.best_epoch = epoch
                self.best_model = tf.keras.models.clone_model(self.model)
                self.best_model.set_weights(self.model.get_weights())
                self.best_predictions = test_probs.copy()

            # Evaluate external validation sets if available
            if hasattr(self.model, 'X_xu') and self.model.X_xu is not None:
                xu_metrics = evaluate_model_cnn(self.model, self.model.X_xu, self.model.y_xu)
                for key in self.xu_metrics_history:
                    self.xu_metrics_history[key].append(xu_metrics.get(key, 0.0))

            if hasattr(self.model, 'X_atkins') and self.model.X_atkins is not None:
                atkins_metrics = evaluate_model_cnn(self.model, self.model.X_atkins, self.model.y_atkins)
                for key in self.atkins_metrics_history:
                    self.atkins_metrics_history[key].append(atkins_metrics.get(key, 0.0))

        except Exception as e:
            pass

    def on_train_end(self, logs=None):
        if self.best_model is not None:
            self.model.set_weights(self.best_model.get_weights())

class Config:
    """Model configuration parameters"""
    NEG_SAMPLES = 20000
    CONFIDENCE_THRESHOLD = 0.5
    EMBEDDING_DIM = 64
    LSTM_UNITS = 64
    CNN_FILTERS = 64
    CNN_KERNEL_SIZES = [3, 5, 7]
    DROPOUT_RATE = 0.5
    LEARNING_RATE = 1e-4
    BATCH_SIZE = 1024
    EPOCHS = 5
    INITIAL_EPOCHS = 5
    SELF_TRAINING_EPOCHS = 1
    MAX_ITERATIONS = 20
    EARLY_STOPPING_PATIENCE = 5
    Sequence_len = 399

def process_sequence(seq, max_length=399):
    """Process single sequence"""
    return seq[:max_length] if len(seq) > max_length else seq

def encode_sequence(seq, max_length=399):
    """Encode single sequence"""
    mapping = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
    encoded = [mapping.get(base, 0) for base in seq.upper()]
    if len(encoded) < max_length:
        encoded.extend([0] * (max_length - len(encoded)))
    return encoded[:max_length]

def trim_sequence(seq, target_length):
    """Trim sequence from both ends to reach target length"""
    if len(seq) <= target_length:
        return seq

    excess = len(seq) - target_length
    left_trim = excess // 2
    right_trim = excess - left_trim

    return seq[left_trim:len(seq) - right_trim]

def prepare_data(train_data, test_data=None, low_conf_data=None, max_length=399):
    """Prepare training and test data"""
    # Process training data
    train_sequences = []
    train_labels = []
    sample_weights = []

    for _, row in train_data.iterrows():
        seq = process_sequence(row['full_seq'], max_length)
        encoded_seq = encode_sequence(seq, max_length)
        train_sequences.append(encoded_seq)
        train_labels.append(row['label'])

        weight = 1.0
        if 'sample_weight' in row and pd.notna(row['sample_weight']):
            weight = row['sample_weight']
        sample_weights.append(weight)

    X_train = np.array(train_sequences)
    y_train = np.array(train_labels)
    sample_weights = np.array(sample_weights)

    # Process test data
    X_test = y_test = None
    if test_data is not None and not test_data.empty:
        test_sequences = []
        test_labels = []

        for _, row in test_data.iterrows():
            seq = process_sequence(row['full_seq'], max_length)
            encoded_seq = encode_sequence(seq, max_length)
            test_sequences.append(encoded_seq)
            test_labels.append(row['label'])

        X_test = np.array(test_sequences)
        y_test = np.array(test_labels)

    # Process low confidence data
    X_low_conf = y_low_conf = None
    if low_conf_data is not None and not low_conf_data.empty:
        low_conf_sequences = []
        low_conf_labels = []

        for _, row in low_conf_data.iterrows():
            seq = process_sequence(row['full_seq'], max_length)
            encoded_seq = encode_sequence(seq, max_length)
            low_conf_sequences.append(encoded_seq)
            low_conf_labels.append(row['label'])

        X_low_conf = np.array(low_conf_sequences)
        y_low_conf = np.array(low_conf_labels)

    return X_train, y_train, X_test, y_test, sample_weights, X_low_conf, y_low_conf

def create_bilstm_cnn_model(input_shape):
    """Create BiLSTM-CNN model"""
    input_layer = layers.Input(shape=input_shape)

    # Embedding layer
    embedding = layers.Embedding(
        input_dim=5,
        output_dim=Config.EMBEDDING_DIM,
        input_length=input_shape[0]
    )(input_layer)

    # BiLSTM layers
    lstm_out = layers.Bidirectional(
        layers.LSTM(Config.LSTM_UNITS, return_sequences=True, dropout=Config.DROPOUT_RATE)
    )(embedding)

    # CNN branches
    cnn_outputs = []
    for kernel_size in Config.CNN_KERNEL_SIZES:
        cnn = layers.Conv1D(
            filters=Config.CNN_FILTERS,
            kernel_size=kernel_size,
            activation='relu',
            padding='same'
        )(lstm_out)
        cnn = layers.GlobalMaxPooling1D()(cnn)
        cnn_outputs.append(cnn)

    # Concatenate CNN outputs
    if len(cnn_outputs) > 1:
        concat = layers.Concatenate()(cnn_outputs)
    else:
        concat = cnn_outputs[0]

    # Dense layers
    dense = layers.Dense(128, activation='relu')(concat)
    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)
    dense = layers.Dense(64, activation='relu')(dense)
    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)

    # Output layer
    output = layers.Dense(1, activation='sigmoid')(dense)

    model = models.Model(inputs=input_layer, outputs=output)

    # Compile model; explicit metric names keep the 'auc'/'recall' keys that MetricsCallback reads from logs
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.Recall(name='recall')]
    )

    return model

def train_bilstm_cnn_model(X_train, y_train, X_test, y_test, sample_weights=None,
                           X_xu=None, y_xu=None, X_atkins=None, y_atkins=None):
    """Train BiLSTM-CNN model with self-training"""

    # Create model
    input_shape = (X_train.shape[1],)
    model = create_bilstm_cnn_model(input_shape)

    # Store validation data in model for callback access
    model.X_test = X_test
    model.y_test = y_test
    model.X_xu = X_xu
    model.y_xu = y_xu
    model.X_atkins = X_atkins
    model.y_atkins = y_atkins

    # Initial training
    metrics_callback = MetricsCallback()

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=Config.EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
        verbose=0
    )

    # Split training data for validation
    val_split = 0.2
    n_val = int(len(X_train) * val_split)
    indices = np.random.permutation(len(X_train))
    train_indices = indices[n_val:]
    val_indices = indices[:n_val]

    X_train_split = X_train[train_indices]
    y_train_split = y_train[train_indices]
    X_val_split = X_train[val_indices]
    y_val_split = y_train[val_indices]

    if sample_weights is not None:
        sample_weights_split = sample_weights[train_indices]
    else:
        sample_weights_split = None

    # Initial training
    model.fit(
        X_train_split, y_train_split,
        validation_data=(X_val_split, y_val_split),
        epochs=Config.INITIAL_EPOCHS,
        batch_size=Config.BATCH_SIZE,
        sample_weight=sample_weights_split,
        callbacks=[metrics_callback, early_stopping],
        verbose=0
    )

    # Store initial training info
    initial_info = {
        'best_test_loss': metrics_callback.best_test_loss,
        'best_epoch': metrics_callback.best_epoch,
        'training_metrics': metrics_callback.training_metrics.copy()
    }

    # Self-training iterations
    current_X_train = X_train.copy()
    current_y_train = y_train.copy()
    current_weights = sample_weights.copy() if sample_weights is not None else None

    iteration_metrics = {
        'iteration': [0],
        'train_loss': [metrics_callback.training_metrics['train_loss'][-1]],
        'test_loss': [metrics_callback.training_metrics['test_loss'][-1]],
        'samples_added': [0],
        'total_samples': [len(current_X_train)]
    }

    if X_xu is not None:
        xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
        iteration_metrics['xu_loss'] = [xu_metrics['loss']]

    if X_atkins is not None:
        atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
        iteration_metrics['atkins_loss'] = [atkins_metrics['loss']]

    best_model = tf.keras.models.clone_model(model)
    best_model.set_weights(model.get_weights())
    best_loss = metrics_callback.best_test_loss
    best_iteration = 0

    # Load low confidence data for self-training
    _, _, low_conf_data, _, _ = load_data()

    if low_conf_data is not None and not low_conf_data.empty:
        X_unlabeled, _, _, _, _, _, _ = prepare_data(
            low_conf_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

        for iteration in range(1, Config.MAX_ITERATIONS + 1):
            # Select low confidence samples
            selected_samples = select_low_confidence_samples_cnn(
                model, X_unlabeled, low_conf_data
            )

            if selected_samples.empty:
                break

            # Prepare selected samples
            X_selected, y_selected, _, _, weights_selected, _, _ = prepare_data(
                selected_samples, pd.DataFrame(), max_length=Config.Sequence_len
            )

            if len(X_selected) == 0:
                break

            # Add to training set
            current_X_train = np.vstack([current_X_train, X_selected])
            current_y_train = np.hstack([current_y_train, y_selected])

            if current_weights is not None:
                current_weights = np.hstack([current_weights, weights_selected])

            # Retrain model
            metrics_callback = MetricsCallback()

            # Split updated training data
            n_val = int(len(current_X_train) * val_split)
            indices = np.random.permutation(len(current_X_train))
            train_indices = indices[n_val:]
            val_indices = indices[:n_val]

            X_train_split = current_X_train[train_indices]
            y_train_split = current_y_train[train_indices]
            X_val_split = current_X_train[val_indices]
            y_val_split = current_y_train[val_indices]

            if current_weights is not None:
                sample_weights_split = current_weights[train_indices]
            else:
                sample_weights_split = None

            model.fit(
                X_train_split, y_train_split,
                validation_data=(X_val_split, y_val_split),
                epochs=Config.SELF_TRAINING_EPOCHS,
                batch_size=Config.BATCH_SIZE,
                sample_weight=sample_weights_split,
                callbacks=[metrics_callback, early_stopping],
                verbose=0
            )

            # Record iteration metrics
            iteration_metrics['iteration'].append(iteration)
            iteration_metrics['train_loss'].append(metrics_callback.training_metrics['train_loss'][-1])
            iteration_metrics['test_loss'].append(metrics_callback.training_metrics['test_loss'][-1])
            iteration_metrics['samples_added'].append(len(X_selected))
            iteration_metrics['total_samples'].append(len(current_X_train))

            if X_xu is not None:
                xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
                iteration_metrics['xu_loss'].append(xu_metrics['loss'])

            if X_atkins is not None:
                atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
                iteration_metrics['atkins_loss'].append(atkins_metrics['loss'])

            # Update best model
            current_loss = metrics_callback.training_metrics['test_loss'][-1]
            if current_loss < best_loss:
                best_model = tf.keras.models.clone_model(model)
                best_model.set_weights(model.get_weights())
                best_loss = current_loss
                best_iteration = iteration

    # Final evaluation
    final_metrics = evaluate_model_cnn(best_model, X_test, y_test)

    training_info = {
        'initial_info': initial_info,
        'iteration_metrics': iteration_metrics,
        'best_iteration': best_iteration,
        'final_metrics': final_metrics
    }

    return best_model, model, training_info

def main():
    """Main training function"""
    # Load data
    train_data, test_data, low_conf_data, xu_data, atkins_data = load_data()

    # Prepare data
    X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
        train_data, test_data, max_length=Config.Sequence_len
    )

    # Prepare validation data
    X_xu = y_xu = X_atkins = y_atkins = None

    if xu_data is not None and not xu_data.empty:
        X_xu, y_xu, _, _, _, _, _ = prepare_data(
            xu_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

    if atkins_data is not None and not atkins_data.empty:
        X_atkins, y_atkins, _, _, _, _, _ = prepare_data(
            atkins_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

    # Train model
    best_model, final_model, training_info = train_bilstm_cnn_model(
        X_train, y_train, X_test, y_test, sample_weights,
        X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
    )

    # Save results
    save_training_info(best_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "best")
    save_training_info(final_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "final", is_final_model=True)

    return best_model, final_model, training_info

if __name__ == "__main__":
    BaseConfig.create_directories()
    main()
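A minimal standalone sketch of the integer encoding used by encode_sequence in the file above (A/T/C/G mapped to 1-4, anything else to 0, zero-padded on the right); the toy sequence and padded length here are illustrative only, not from the commit's data:

import numpy as np

# Hypothetical toy example of the encoding scheme: A=1, T=2, C=3, G=4, unknown=0, zero-padded.
mapping = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
seq = "ATGCNNA"          # toy sequence (not from the dataset)
max_length = 12          # illustrative; the model itself uses Config.Sequence_len = 399
encoded = [mapping.get(base, 0) for base in seq.upper()]
encoded = (encoded + [0] * max_length)[:max_length]
print(np.array(encoded))  # [1 2 4 3 0 0 1 0 0 0 0 0]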
Binary file not shown.
@@ -0,0 +1,344 @@
"""
HistGradientBoosting Model with MFE Features
"""
import os
import numpy as np
import pandas as pd
import itertools
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score
)
from utils.function import evaluate_model_gb
from utils.config import BaseConfig

class GBConfig:
    """HistGradientBoostingClassifier model configuration"""
    # Model training parameters
    MAX_ITER = 10000
    LEARNING_RATE = 0.4
    MAX_DEPTH = 5
    RANDOM_STATE = 42

    # Early stopping parameters
    EARLY_STOPPING = True
    N_ITER_NO_CHANGE = 10
    SCORING = 'loss'

    # Sequence parameters
    SEQUENCE_LENGTH = 33  # Must be a multiple of 3 (codon length)

    # Validation parameters
    VALIDATION_FRACTION = 0.2
    SMALL_VALIDATION_FRACTION = 0.1

def load_data(neg_samples=20000):
    """Load training and validation data"""
    try:
        train_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_train_data.csv"))
        test_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_test_data.csv"))
        validation_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_validation_data.csv"))

        required_columns = ['full_seq', 'label']

        for df in [train_data, test_data, validation_data]:
            for col in required_columns:
                if col not in df.columns:
                    if col == 'label':
                        df[col] = 0
                    else:
                        df[col] = ''

        xu_data = validation_data[validation_data['source'] == 'Xu'].copy()
        atkins_data = validation_data[validation_data['source'] == 'Atkins'].copy()

        for df in [xu_data, atkins_data]:
            for col in required_columns:
                if col not in df.columns:
                    df[col] = validation_data[col] if col in validation_data.columns else (
                        0.0 if col in ['mfe_40bp', 'mfe_120bp'] else (
                            0 if col == 'label' else ''
                        )
                    )

        return train_data, test_data, validation_data, xu_data, atkins_data

    except Exception as e:
        return None, None, None, None, None

def train_hist_model(X_train, y_train, X_test, y_test, sample_weights=None,
                     X_xu=None, y_xu=None, X_atkins=None, y_atkins=None):
    """Train HistGradientBoostingClassifier model"""

    # Determine validation fraction
    validation_fraction = GBConfig.VALIDATION_FRACTION
    if X_xu is not None or X_atkins is not None:
        validation_fraction = GBConfig.SMALL_VALIDATION_FRACTION

    # Create and train model
    model = HistGradientBoostingClassifier(
        max_iter=GBConfig.MAX_ITER,
        learning_rate=GBConfig.LEARNING_RATE,
        max_depth=GBConfig.MAX_DEPTH,
        random_state=GBConfig.RANDOM_STATE,
        early_stopping=GBConfig.EARLY_STOPPING,
        n_iter_no_change=GBConfig.N_ITER_NO_CHANGE,
        scoring=GBConfig.SCORING,
        validation_fraction=validation_fraction
    )

    # Train model
    model.fit(X_train, y_train, sample_weight=sample_weights)

    # Evaluate on test set
    test_metrics = evaluate_model_gb(model, X_test, y_test)

    # Evaluate on external validation sets
    xu_metrics = None
    if X_xu is not None and y_xu is not None:
        xu_metrics = evaluate_model_gb(model, X_xu, y_xu)

    atkins_metrics = None
    if X_atkins is not None and y_atkins is not None:
        atkins_metrics = evaluate_model_gb(model, X_atkins, y_atkins)

    # Prepare training info
    training_info = {
        'n_iter': model.n_iter_,
        'train_score': model.train_score_,
        'validation_scores': model.validation_scores_ if hasattr(model, 'validation_scores_') else None,
        'final_metrics': {
            'test': test_metrics,
            'xu': xu_metrics,
            'atkins': atkins_metrics
        }
    }

    return model, test_metrics, training_info

def get_feature_names(seq_length=33):
    """Return feature names including all possible base features and MFE features"""
    features = []

    # Single nucleotide features
    bases = ['A', 'T', 'C', 'G']
    for i in range(seq_length):
        for base in bases:
            features.append(f'pos_{i+1}_{base}')

    # Dinucleotide features
    dinucleotides = [''.join(pair) for pair in itertools.product(bases, repeat=2)]
    for i in range(seq_length - 1):
        for dinuc in dinucleotides:
            features.append(f'dinuc_{i+1}_{dinuc}')

    # Trinucleotide (codon) features
    trinucleotides = [''.join(triplet) for triplet in itertools.product(bases, repeat=3)]
    for i in range(seq_length - 2):
        for trinuc in trinucleotides:
            features.append(f'codon_{i+1}_{trinuc}')

    # MFE features
    features.extend(['mfe_40bp', 'mfe_120bp'])

    return features

def trim_sequence(seq, target_length):
    """Trim sequence from both ends to reach target length, keeping center position"""
    if len(seq) <= target_length:
        return seq

    excess = len(seq) - target_length
    left_trim = excess // 2
    right_trim = excess - left_trim

    return seq[left_trim:len(seq) - right_trim]

def sequence_to_features(sequence, seq_length=33, mfe_values=None):
    """Convert DNA sequence to feature vector including MFE features"""

    # Trim sequence to target length
    trimmed_seq = trim_sequence(sequence.upper(), seq_length)

    # Initialize feature vector
    feature_vector = []

    # Single nucleotide features (one-hot encoding)
    bases = ['A', 'T', 'C', 'G']
    for i in range(seq_length):
        for base in bases:
            if i < len(trimmed_seq) and trimmed_seq[i] == base:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Dinucleotide features
    dinucleotides = [''.join(pair) for pair in itertools.product(bases, repeat=2)]
    for i in range(seq_length - 1):
        for dinuc in dinucleotides:
            if i + 1 < len(trimmed_seq) and trimmed_seq[i:i+2] == dinuc:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Trinucleotide (codon) features
    trinucleotides = [''.join(triplet) for triplet in itertools.product(bases, repeat=3)]
    for i in range(seq_length - 2):
        for trinuc in trinucleotides:
            if i + 2 < len(trimmed_seq) and trimmed_seq[i:i+3] == trinuc:
                feature_vector.append(1)
            else:
                feature_vector.append(0)

    # Add MFE features
    if mfe_values is not None:
        if isinstance(mfe_values, dict):
            feature_vector.append(mfe_values.get('mfe_40bp', 0.0))
            feature_vector.append(mfe_values.get('mfe_120bp', 0.0))
        elif isinstance(mfe_values, (list, tuple)) and len(mfe_values) >= 2:
            feature_vector.extend(mfe_values[:2])
        else:
            feature_vector.extend([0.0, 0.0])
    else:
        feature_vector.extend([0.0, 0.0])

    return np.array(feature_vector)

def prepare_data(train_data, test_data, seq_length=33):
    """Prepare training and test data including MFE features"""

    # Process training data
    X_train = []
    y_train = []
    sample_weights = []

    for _, row in train_data.iterrows():
        sequence = row['full_seq']
        label = row['label']

        # Get MFE values
        mfe_values = {}
        if 'mfe_40bp' in row:
            mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0
        if 'mfe_120bp' in row:
            mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0

        # Convert to features
        features = sequence_to_features(sequence, seq_length, mfe_values)
        X_train.append(features)
        y_train.append(label)

        # Sample weight
        weight = 1.0
        if 'sample_weight' in row and pd.notna(row['sample_weight']):
            weight = row['sample_weight']
        sample_weights.append(weight)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    sample_weights = np.array(sample_weights)

    # Process test data
    X_test = []
    y_test = []

    if test_data is not None and not test_data.empty:
        for _, row in test_data.iterrows():
            sequence = row['full_seq']
            label = row['label']

            # Get MFE values
            mfe_values = {}
            if 'mfe_40bp' in row:
                mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0
            if 'mfe_120bp' in row:
                mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0

            # Convert to features
            features = sequence_to_features(sequence, seq_length, mfe_values)
            X_test.append(features)
            y_test.append(label)

    X_test = np.array(X_test) if X_test else None
    y_test = np.array(y_test) if y_test else None

    return X_train, y_train, X_test, y_test, sample_weights, train_data, test_data

def analyze_feature_importance(model, X_test, y_test, test_data):
    """Analyze feature importance (simplified version)"""
    try:
        # Get feature names
        feature_names = get_feature_names(GBConfig.SEQUENCE_LENGTH)

        # Built-in feature importance
        # Note: HistGradientBoostingClassifier does not expose feature_importances_,
        # so in practice this branch is skipped and the function returns None.
        if hasattr(model, 'feature_importances_'):
            importance_scores = model.feature_importances_

            # Create importance DataFrame
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importance_scores
            }).sort_values('importance', ascending=False)

            # Save results
            importance_path = os.path.join(BaseConfig.GB_DIR, 'feature_importance.csv')
            importance_df.to_csv(importance_path, index=False)

            return {'built_in_importance': importance_df}

        return None

    except Exception as e:
        return None

def main():
    """Main training function"""
    try:
        # Set sequence length
        sequence_length = GBConfig.SEQUENCE_LENGTH

        # Load data
        train_data, test_data, _, xu_data, atkins_data = load_data()

        # Prepare data
        X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
            train_data, test_data, seq_length=sequence_length
        )

        # Prepare validation data
        X_xu = y_xu = X_atkins = y_atkins = None
        if xu_data is not None and not xu_data.empty:
            try:
                empty_test = pd.DataFrame(columns=xu_data.columns)
                X_xu, y_xu, _, _, _, _, _ = prepare_data(xu_data, empty_test, seq_length=sequence_length)
            except Exception as e:
                X_xu = y_xu = None

        if atkins_data is not None and not atkins_data.empty:
            try:
                empty_test = pd.DataFrame(columns=atkins_data.columns)
                X_atkins, y_atkins, _, _, _, _, _ = prepare_data(atkins_data, empty_test, seq_length=sequence_length)
            except Exception as e:
                X_atkins = y_atkins = None

        # Train model
        model, _, training_info = train_hist_model(
            X_train, y_train, X_test, y_test, sample_weights,
            X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
        )

        # Feature importance analysis
        source_results = analyze_feature_importance(model, X_test, y_test, test_data)

        return model, training_info['final_metrics']['test']

    except Exception as e:
        return None, None

if __name__ == "__main__":
    BaseConfig.create_directories()
    main()
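A quick standalone sanity check of the feature layout produced by sequence_to_features and get_feature_names above; the length formula follows directly from the loops in this file (plain arithmetic, not a measured result):

# Expected feature-vector length for a given number of positions:
# seq_length*4 one-hot bases + (seq_length-1)*16 dinucleotides + (seq_length-2)*64 codons + 2 MFE values.
seq_length = 33
n_features = seq_length * 4 + (seq_length - 1) * 16 + (seq_length - 2) * 64 + 2
print(n_features)  # 2630 for the default SEQUENCE_LENGTH of 33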