# --- train_models/bilstm_cnn.py (reconstructed from collapsed patch) ---
"""
BiLSTM-CNN Model for Sequence Classification
"""
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, log_loss
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from utils.function import load_data, save_training_info, select_low_confidence_samples_cnn, evaluate_model_cnn
from utils.config import BaseConfig

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)


class MetricsCallback(tf.keras.callbacks.Callback):
    """Record per-epoch train/test metrics and keep the best weight snapshot.

    "Best" is the epoch with the lowest test log-loss; `on_train_end`
    restores that snapshot into the live model. Expects the trainer to have
    attached `X_test`/`y_test` (and optionally `X_xu`/`y_xu`,
    `X_atkins`/`y_atkins`) as attributes on the model.
    """

    def __init__(self):
        super().__init__()
        # Per-epoch metric history for the train and held-out test splits.
        self.training_metrics = {
            'train_loss': [], 'train_auc': [], 'train_accuracy': [],
            'train_recall': [], 'train_precision': [], 'train_f1': [],
            'test_loss': [], 'test_auc': [], 'test_accuracy': [],
            'test_recall': [], 'test_precision': [], 'test_f1': []
        }

        self.iteration_metrics = {
            'samples_added': [0],
            'total_samples': []
        }

        # Best-so-far snapshot (lowest test loss).
        self.best_model = None
        self.best_test_loss = float('inf')
        self.best_epoch = -1
        self.best_predictions = None

        # External validation histories, filled only when the trainer
        # attached the corresponding data to the model.
        self.xu_metrics_history = {
            'loss': [], 'auc': [], 'accuracy': [],
            'recall': [], 'precision': [], 'f1': []
        }
        self.atkins_metrics_history = {
            'loss': [], 'auc': [], 'accuracy': [],
            'recall': [], 'precision': [], 'f1': []
        }

        self.self_training_best_model = None
        self.self_training_best_loss = float('inf')
        self.self_training_best_metrics = None

    def on_epoch_end(self, epoch, logs=None):
        # FIX: original used a mutable default argument (logs={}); use the
        # None sentinel per Keras convention.
        logs = logs or {}
        try:
            train_loss = logs.get('loss', 0.0)

            train_metrics = {
                'loss': train_loss,
                'auc': logs.get('auc', 0.0),
                'accuracy': logs.get('accuracy', 0.0),
                'recall': logs.get('recall', 0.0),
                # Precision/F1 are not tracked by Keras here; recorded as 0.0.
                'precision': 0.0,
                'f1': 0.0
            }

            # Predict the test split in fixed-size batches to bound memory.
            batch_size = 128
            n_test_samples = len(self.model.X_test)
            n_test_batches = (n_test_samples + batch_size - 1) // batch_size

            test_probs = np.zeros(n_test_samples)
            try:
                for i in range(n_test_batches):
                    start_idx = i * batch_size
                    end_idx = min((i + 1) * batch_size, n_test_samples)
                    batch_probs = self.model.predict(self.model.X_test[start_idx:end_idx], verbose=0)
                    if isinstance(batch_probs, list):
                        batch_probs = batch_probs[0]
                    if len(batch_probs.shape) > 1:
                        batch_probs = batch_probs.flatten()
                    test_probs[start_idx:end_idx] = batch_probs

                test_preds = (test_probs > 0.5).astype(int)

                test_metrics = {
                    # Clip to avoid log(0) in log_loss.
                    'loss': log_loss(self.model.y_test, np.clip(test_probs, 1e-15, 1 - 1e-15)),
                    # AUC is undefined for constant predictions; report 0.5.
                    'auc': roc_auc_score(self.model.y_test, test_probs) if len(np.unique(test_probs)) > 1 else 0.5,
                    'accuracy': accuracy_score(self.model.y_test, test_preds),
                    'recall': recall_score(self.model.y_test, test_preds, zero_division=0),
                    'precision': precision_score(self.model.y_test, test_preds, zero_division=0),
                    'f1': f1_score(self.model.y_test, test_preds, zero_division=0)
                }

            except Exception:
                # Best-effort: fall back to sentinel metrics when prediction fails.
                test_metrics = {
                    'loss': float('inf'), 'auc': 0.0, 'accuracy': 0.0,
                    'recall': 0.0, 'precision': 0.0, 'f1': 0.0
                }

            # Append this epoch's train/test metrics to the history.
            for key in self.training_metrics:
                if key.startswith('train_'):
                    metric_name = key[6:]
                    self.training_metrics[key].append(train_metrics.get(metric_name, 0.0))
                elif key.startswith('test_'):
                    metric_name = key[5:]
                    self.training_metrics[key].append(test_metrics.get(metric_name, 0.0))

            # Snapshot the weights whenever test loss improves.
            if test_metrics['loss'] < self.best_test_loss:
                self.best_test_loss = test_metrics['loss']
                self.best_epoch = epoch
                self.best_model = tf.keras.models.clone_model(self.model)
                self.best_model.set_weights(self.model.get_weights())
                self.best_predictions = test_probs.copy()

            # Track the external validation sets when the trainer attached them.
            if hasattr(self.model, 'X_xu') and self.model.X_xu is not None:
                xu_metrics = evaluate_model_cnn(self.model, self.model.X_xu, self.model.y_xu)
                for key in self.xu_metrics_history:
                    self.xu_metrics_history[key].append(xu_metrics.get(key, 0.0))

            if hasattr(self.model, 'X_atkins') and self.model.X_atkins is not None:
                atkins_metrics = evaluate_model_cnn(self.model, self.model.X_atkins, self.model.y_atkins)
                for key in self.atkins_metrics_history:
                    self.atkins_metrics_history[key].append(atkins_metrics.get(key, 0.0))

        except Exception:
            # Metrics collection must never abort training.
            pass

    def on_train_end(self, logs=None):
        # Restore the best snapshot's weights into the live model.
        if self.best_model is not None:
            self.model.set_weights(self.best_model.get_weights())


class Config:
    """Model configuration parameters for the BiLSTM-CNN."""
    NEG_SAMPLES = 20000
    CONFIDENCE_THRESHOLD = 0.5
    EMBEDDING_DIM = 64
    LSTM_UNITS = 64
    CNN_FILTERS = 64
    CNN_KERNEL_SIZES = [3, 5, 7]
    DROPOUT_RATE = 0.5
    LEARNING_RATE = 1e-4
    BATCH_SIZE = 1024
    EPOCHS = 5
    INITIAL_EPOCHS = 5
    SELF_TRAINING_EPOCHS = 1
    MAX_ITERATIONS = 20
    EARLY_STOPPING_PATIENCE = 5
    Sequence_len = 399


def process_sequence(seq, max_length=399):
    """Truncate *seq* to at most *max_length* characters."""
    return seq[:max_length] if len(seq) > max_length else seq


def encode_sequence(seq, max_length=399):
    """Integer-encode a DNA string (A=1, T=2, C=3, G=4, other=0).

    The result is zero-padded or truncated to exactly *max_length* codes.
    """
    mapping = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
    encoded = [mapping.get(base, 0) for base in seq.upper()]
    if len(encoded) < max_length:
        encoded.extend([0] * (max_length - len(encoded)))
    return encoded[:max_length]
def trim_sequence(seq, target_length):
    """Symmetrically trim *seq* to *target_length*, keeping the center.

    For an odd excess the extra character is removed from the right end.
    Sequences already short enough are returned unchanged.
    """
    if len(seq) <= target_length:
        return seq

    excess = len(seq) - target_length
    left_trim = excess // 2
    right_trim = excess - left_trim

    return seq[left_trim:len(seq) - right_trim]


def _encode_frame(frame, max_length):
    """Encode the 'full_seq' column of *frame* into an integer matrix.

    Returns (X, y) where X is an (n, max_length) array of base codes and y
    the 'label' column as a NumPy array. Factored out because the original
    prepare_data duplicated this loop three times verbatim.
    """
    sequences = []
    labels = []
    for _, row in frame.iterrows():
        seq = process_sequence(row['full_seq'], max_length)
        sequences.append(encode_sequence(seq, max_length))
        labels.append(row['label'])
    return np.array(sequences), np.array(labels)


def prepare_data(train_data, test_data=None, low_conf_data=None, max_length=399):
    """Prepare training/test/low-confidence arrays for the BiLSTM-CNN.

    Returns (X_train, y_train, X_test, y_test, sample_weights,
    X_low_conf, y_low_conf); the test / low-confidence entries are None
    when the corresponding frame is absent or empty.
    """
    X_train, y_train = _encode_frame(train_data, max_length)

    # Per-row sample weights default to 1.0 when the column is missing/NaN.
    sample_weights = []
    for _, row in train_data.iterrows():
        weight = 1.0
        if 'sample_weight' in row and pd.notna(row['sample_weight']):
            weight = row['sample_weight']
        sample_weights.append(weight)
    sample_weights = np.array(sample_weights)

    X_test = y_test = None
    if test_data is not None and not test_data.empty:
        X_test, y_test = _encode_frame(test_data, max_length)

    X_low_conf = y_low_conf = None
    if low_conf_data is not None and not low_conf_data.empty:
        X_low_conf, y_low_conf = _encode_frame(low_conf_data, max_length)

    return X_train, y_train, X_test, y_test, sample_weights, X_low_conf, y_low_conf


def create_bilstm_cnn_model(input_shape):
    """Build and compile the BiLSTM + multi-kernel-CNN binary classifier.

    Architecture: Embedding -> BiLSTM -> parallel Conv1D branches (kernel
    sizes from Config.CNN_KERNEL_SIZES) -> global max pooling -> concat ->
    two dense+dropout stages -> sigmoid output.
    """
    input_layer = layers.Input(shape=input_shape)

    # 5 symbols: A/T/C/G codes 1-4 plus 0 for padding/unknown.
    embedding = layers.Embedding(
        input_dim=5,
        output_dim=Config.EMBEDDING_DIM,
        input_length=input_shape[0]
    )(input_layer)

    lstm_out = layers.Bidirectional(
        layers.LSTM(Config.LSTM_UNITS, return_sequences=True, dropout=Config.DROPOUT_RATE)
    )(embedding)

    # Parallel CNN branches over the BiLSTM feature sequence.
    cnn_outputs = []
    for kernel_size in Config.CNN_KERNEL_SIZES:
        cnn = layers.Conv1D(
            filters=Config.CNN_FILTERS,
            kernel_size=kernel_size,
            activation='relu',
            padding='same'
        )(lstm_out)
        cnn = layers.GlobalMaxPooling1D()(cnn)
        cnn_outputs.append(cnn)

    if len(cnn_outputs) > 1:
        concat = layers.Concatenate()(cnn_outputs)
    else:
        concat = cnn_outputs[0]

    dense = layers.Dense(128, activation='relu')(concat)
    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)
    dense = layers.Dense(64, activation='relu')(dense)
    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)

    output = layers.Dense(1, activation='sigmoid')(dense)

    model = models.Model(inputs=input_layer, outputs=output)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE),
        loss='binary_crossentropy',
        metrics=['accuracy', 'auc', 'recall']
    )

    return model


def train_bilstm_cnn_model(X_train, y_train, X_test, y_test, sample_weights=None,
                           X_xu=None, y_xu=None, X_atkins=None, y_atkins=None):
    """Train the BiLSTM-CNN: one supervised fit plus self-training rounds.

    Each round pseudo-labels low-confidence samples, grows the training
    pool and retrains. Returns (best_model, final_model, training_info),
    where best_model is the snapshot with the lowest test loss seen.
    """
    input_shape = (X_train.shape[1],)
    model = create_bilstm_cnn_model(input_shape)

    # Attach evaluation data to the model so MetricsCallback can reach it.
    model.X_test = X_test
    model.y_test = y_test
    model.X_xu = X_xu
    model.y_xu = y_xu
    model.X_atkins = X_atkins
    model.y_atkins = y_atkins

    metrics_callback = MetricsCallback()

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=Config.EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
        verbose=0
    )

    # Random hold-out split for validation.
    val_split = 0.2
    n_val = int(len(X_train) * val_split)
    indices = np.random.permutation(len(X_train))
    train_indices = indices[n_val:]
    val_indices = indices[:n_val]

    X_train_split = X_train[train_indices]
    y_train_split = y_train[train_indices]
    X_val_split = X_train[val_indices]
    y_val_split = y_train[val_indices]

    if sample_weights is not None:
        sample_weights_split = sample_weights[train_indices]
    else:
        sample_weights_split = None

    # Initial supervised training.
    model.fit(
        X_train_split, y_train_split,
        validation_data=(X_val_split, y_val_split),
        epochs=Config.INITIAL_EPOCHS,
        batch_size=Config.BATCH_SIZE,
        sample_weight=sample_weights_split,
        callbacks=[metrics_callback, early_stopping],
        verbose=0
    )

    initial_info = {
        'best_test_loss': metrics_callback.best_test_loss,
        'best_epoch': metrics_callback.best_epoch,
        'training_metrics': metrics_callback.training_metrics.copy()
    }

    # Self-training state: growing copies of the labeled pool.
    current_X_train = X_train.copy()
    current_y_train = y_train.copy()
    current_weights = sample_weights.copy() if sample_weights is not None else None

    iteration_metrics = {
        'iteration': [0],
        'train_loss': [metrics_callback.training_metrics['train_loss'][-1]],
        'test_loss': [metrics_callback.training_metrics['test_loss'][-1]],
        'samples_added': [0],
        'total_samples': [len(current_X_train)]
    }

    if X_xu is not None:
        xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
        iteration_metrics['xu_loss'] = [xu_metrics['loss']]

    if X_atkins is not None:
        atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
        iteration_metrics['atkins_loss'] = [atkins_metrics['loss']]

    best_model = tf.keras.models.clone_model(model)
    best_model.set_weights(model.get_weights())
    best_loss = metrics_callback.best_test_loss
    best_iteration = 0

    # Unlabeled pool for self-training.
    _, _, low_conf_data, _, _ = load_data()

    if low_conf_data is not None and not low_conf_data.empty:
        X_unlabeled, _, _, _, _, _, _ = prepare_data(
            low_conf_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

        for iteration in range(1, Config.MAX_ITERATIONS + 1):
            # Pseudo-label low-confidence samples; stop when none remain.
            selected_samples = select_low_confidence_samples_cnn(
                model, X_unlabeled, low_conf_data
            )
            if selected_samples.empty:
                break

            X_selected, y_selected, _, _, weights_selected, _, _ = prepare_data(
                selected_samples, pd.DataFrame(), max_length=Config.Sequence_len
            )
            if len(X_selected) == 0:
                break

            # Grow the training pool with the newly pseudo-labeled samples.
            current_X_train = np.vstack([current_X_train, X_selected])
            current_y_train = np.hstack([current_y_train, y_selected])
            if current_weights is not None:
                current_weights = np.hstack([current_weights, weights_selected])

            metrics_callback = MetricsCallback()

            # Fresh validation split over the enlarged pool.
            n_val = int(len(current_X_train) * val_split)
            indices = np.random.permutation(len(current_X_train))
            train_indices = indices[n_val:]
            val_indices = indices[:n_val]

            X_train_split = current_X_train[train_indices]
            y_train_split = current_y_train[train_indices]
            X_val_split = current_X_train[val_indices]
            y_val_split = current_y_train[val_indices]

            if current_weights is not None:
                sample_weights_split = current_weights[train_indices]
            else:
                sample_weights_split = None

            model.fit(
                X_train_split, y_train_split,
                validation_data=(X_val_split, y_val_split),
                epochs=Config.SELF_TRAINING_EPOCHS,
                batch_size=Config.BATCH_SIZE,
                sample_weight=sample_weights_split,
                callbacks=[metrics_callback, early_stopping],
                verbose=0
            )

            # Record this iteration's metrics.
            iteration_metrics['iteration'].append(iteration)
            iteration_metrics['train_loss'].append(metrics_callback.training_metrics['train_loss'][-1])
            iteration_metrics['test_loss'].append(metrics_callback.training_metrics['test_loss'][-1])
            iteration_metrics['samples_added'].append(len(X_selected))
            iteration_metrics['total_samples'].append(len(current_X_train))

            if X_xu is not None:
                xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
                iteration_metrics['xu_loss'].append(xu_metrics['loss'])

            if X_atkins is not None:
                atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
                iteration_metrics['atkins_loss'].append(atkins_metrics['loss'])

            # Keep the best snapshot by test loss.
            current_loss = metrics_callback.training_metrics['test_loss'][-1]
            if current_loss < best_loss:
                best_model = tf.keras.models.clone_model(model)
                best_model.set_weights(model.get_weights())
                best_loss = current_loss
                best_iteration = iteration

    final_metrics = evaluate_model_cnn(best_model, X_test, y_test)

    training_info = {
        'initial_info': initial_info,
        'iteration_metrics': iteration_metrics,
        'best_iteration': best_iteration,
        'final_metrics': final_metrics
    }

    return best_model, model, training_info
def main():
    """End-to-end driver: load data, fit the BiLSTM-CNN, persist results."""
    train_data, test_data, low_conf_data, xu_data, atkins_data = load_data()

    X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
        train_data, test_data, max_length=Config.Sequence_len
    )

    # External validation sets are optional.
    X_xu = y_xu = X_atkins = y_atkins = None

    if xu_data is not None and not xu_data.empty:
        X_xu, y_xu, _, _, _, _, _ = prepare_data(
            xu_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

    if atkins_data is not None and not atkins_data.empty:
        X_atkins, y_atkins, _, _, _, _, _ = prepare_data(
            atkins_data, pd.DataFrame(), max_length=Config.Sequence_len
        )

    best_model, final_model, training_info = train_bilstm_cnn_model(
        X_train, y_train, X_test, y_test, sample_weights,
        X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
    )

    # Persist both the best-loss snapshot and the last-iteration model.
    save_training_info(best_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "best")
    save_training_info(final_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "final", is_final_model=True)

    return best_model, final_model, training_info


if __name__ == "__main__":
    BaseConfig.create_directories()
    main()


# --- train_models/hist_gb.py (reconstructed from collapsed patch) ---
"""
HistGradientBoosting Model with MFE Features
"""
import os
import numpy as np
import pandas as pd
import itertools
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score
)
from utils.function import evaluate_model_gb
from utils.config import BaseConfig


class GBConfig:
    """HistGradientBoostingClassifier model configuration."""
    # Model training parameters
    MAX_ITER = 10000
    LEARNING_RATE = 0.4
    MAX_DEPTH = 5
    RANDOM_STATE = 42

    # Early stopping parameters
    EARLY_STOPPING = True
    N_ITER_NO_CHANGE = 10
    SCORING = 'loss'

    # Sequence parameters
    SEQUENCE_LENGTH = 33  # Must be multiple of 3 (codon length)

    # Validation parameters
    VALIDATION_FRACTION = 0.2
    SMALL_VALIDATION_FRACTION = 0.1


def load_data(neg_samples=20000):
    """Load train/test/validation CSVs; return five frames, or all Nones on failure."""
    try:
        train_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_train_data.csv"))
        test_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_test_data.csv"))
        validation_data = pd.read_csv(os.path.join(BaseConfig.DATA_DIR, "merged_validation_data.csv"))

        required_columns = ['full_seq', 'label']

        # Guarantee the required columns exist (label -> 0, sequence -> '').
        for df in [train_data, test_data, validation_data]:
            for col in required_columns:
                if col not in df.columns:
                    df[col] = 0 if col == 'label' else ''

        # External validation subsets, selected by their 'source' tag.
        xu_data = validation_data[validation_data['source'] == 'Xu'].copy()
        atkins_data = validation_data[validation_data['source'] == 'Atkins'].copy()

        for df in [xu_data, atkins_data]:
            for col in required_columns:
                if col not in df.columns:
                    df[col] = validation_data[col] if col in validation_data.columns else (
                        0.0 if col in ['mfe_40bp', 'mfe_120bp'] else (
                            0 if col == 'label' else ''
                        )
                    )

        return train_data, test_data, validation_data, xu_data, atkins_data

    except Exception:
        # Best-effort loader: callers must handle the all-None result.
        return None, None, None, None, None
def train_hist_model(X_train, y_train, X_test, y_test, sample_weights=None,
                     X_xu=None, y_xu=None, X_atkins=None, y_atkins=None):
    """Fit a HistGradientBoostingClassifier and evaluate it.

    Evaluates on the test split and, when provided, the Xu/Atkins external
    sets. Returns (model, test_metrics, training_info).
    """
    # Use the smaller internal validation split when external sets exist.
    if X_xu is not None or X_atkins is not None:
        validation_fraction = GBConfig.SMALL_VALIDATION_FRACTION
    else:
        validation_fraction = GBConfig.VALIDATION_FRACTION

    model = HistGradientBoostingClassifier(
        max_iter=GBConfig.MAX_ITER,
        learning_rate=GBConfig.LEARNING_RATE,
        max_depth=GBConfig.MAX_DEPTH,
        random_state=GBConfig.RANDOM_STATE,
        early_stopping=GBConfig.EARLY_STOPPING,
        n_iter_no_change=GBConfig.N_ITER_NO_CHANGE,
        scoring=GBConfig.SCORING,
        validation_fraction=validation_fraction
    )
    model.fit(X_train, y_train, sample_weight=sample_weights)

    test_metrics = evaluate_model_gb(model, X_test, y_test)

    # External validation sets are optional.
    xu_metrics = None
    if X_xu is not None and y_xu is not None:
        xu_metrics = evaluate_model_gb(model, X_xu, y_xu)

    atkins_metrics = None
    if X_atkins is not None and y_atkins is not None:
        atkins_metrics = evaluate_model_gb(model, X_atkins, y_atkins)

    training_info = {
        'n_iter': model.n_iter_,
        'train_score': model.train_score_,
        # validation_scores_ only exists when an internal split was used.
        'validation_scores': getattr(model, 'validation_scores_', None),
        'final_metrics': {
            'test': test_metrics,
            'xu': xu_metrics,
            'atkins': atkins_metrics
        }
    }

    return model, test_metrics, training_info
def get_feature_names(seq_length=33):
    """Return feature names: positional k-mer indicators (k=1..3) plus MFE.

    Order matches the vectors produced by `sequence_to_features`: all
    single-base names first, then dinucleotides, then codons, then the two
    MFE features.
    """
    bases = ['A', 'T', 'C', 'G']
    names = []

    # Position-major, k-mer-minor, for k = 1 (pos), 2 (dinuc), 3 (codon).
    for k, prefix in ((1, 'pos'), (2, 'dinuc'), (3, 'codon')):
        kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]
        for pos in range(seq_length - k + 1):
            for kmer in kmers:
                names.append(f'{prefix}_{pos + 1}_{kmer}')

    names.extend(['mfe_40bp', 'mfe_120bp'])
    return names


def trim_sequence(seq, target_length):
    """Trim *seq* symmetrically to *target_length*, keeping the center.

    An odd excess drops one extra character from the right end.
    """
    overflow = len(seq) - target_length
    if overflow <= 0:
        return seq
    start = overflow // 2
    return seq[start:start + target_length]


def sequence_to_features(sequence, seq_length=33, mfe_values=None):
    """Vectorize a DNA sequence into one-hot k-mer indicators plus MFE.

    Layout matches `get_feature_names(seq_length)`. Characters outside
    A/T/C/G yield all-zero indicators; missing MFE values default to 0.0.
    """
    seq = trim_sequence(sequence.upper(), seq_length)
    bases = ['A', 'T', 'C', 'G']
    vec = []

    # One-hot indicator blocks for k = 1, 2, 3; a dict lookup replaces the
    # original scan over every candidate k-mer (same output, single pass).
    for k in (1, 2, 3):
        kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]
        index_of = {kmer: j for j, kmer in enumerate(kmers)}
        width = len(kmers)
        for pos in range(seq_length - k + 1):
            block = [0] * width
            # Only mark when the window lies fully inside the sequence.
            if pos + k - 1 < len(seq):
                hit = index_of.get(seq[pos:pos + k])
                if hit is not None:
                    block[hit] = 1
            vec.extend(block)

    # Two trailing MFE features (dict, 2-sequence, or defaults).
    if mfe_values is None:
        vec.extend([0.0, 0.0])
    elif isinstance(mfe_values, dict):
        vec.append(mfe_values.get('mfe_40bp', 0.0))
        vec.append(mfe_values.get('mfe_120bp', 0.0))
    elif isinstance(mfe_values, (list, tuple)) and len(mfe_values) >= 2:
        vec.extend(mfe_values[:2])
    else:
        vec.extend([0.0, 0.0])

    return np.array(vec)
def _row_mfe(row):
    """Extract the two MFE values from a DataFrame row (0.0 when NaN)."""
    mfe_values = {}
    if 'mfe_40bp' in row:
        mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0
    if 'mfe_120bp' in row:
        mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0
    return mfe_values


def prepare_data(train_data, test_data, seq_length=33):
    """Vectorize train/test frames into k-mer + MFE feature matrices.

    Returns (X_train, y_train, X_test, y_test, sample_weights,
    train_data, test_data); X_test/y_test are None for an absent or empty
    test frame. The duplicated MFE-extraction logic of the original is
    factored into `_row_mfe`.
    """
    X_train = []
    y_train = []
    sample_weights = []

    for _, row in train_data.iterrows():
        X_train.append(sequence_to_features(row['full_seq'], seq_length, _row_mfe(row)))
        y_train.append(row['label'])

        # Optional per-row weight, defaulting to 1.0.
        weight = 1.0
        if 'sample_weight' in row and pd.notna(row['sample_weight']):
            weight = row['sample_weight']
        sample_weights.append(weight)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    sample_weights = np.array(sample_weights)

    X_test = []
    y_test = []
    if test_data is not None and not test_data.empty:
        for _, row in test_data.iterrows():
            X_test.append(sequence_to_features(row['full_seq'], seq_length, _row_mfe(row)))
            y_test.append(row['label'])

    X_test = np.array(X_test) if X_test else None
    y_test = np.array(y_test) if y_test else None

    return X_train, y_train, X_test, y_test, sample_weights, train_data, test_data


def analyze_feature_importance(model, X_test, y_test, test_data):
    """Rank features by importance, save them to CSV, and return them.

    FIX: HistGradientBoostingClassifier does NOT expose
    `feature_importances_`, so the original hasattr check always failed and
    the function unconditionally returned None. Fall back to sklearn's
    model-agnostic permutation importance when the attribute is missing.
    Returns {'built_in_importance': DataFrame} or None on failure.
    """
    try:
        feature_names = get_feature_names(GBConfig.SEQUENCE_LENGTH)

        if hasattr(model, 'feature_importances_'):
            importance_scores = model.feature_importances_
        else:
            # Model-agnostic fallback; few repeats to keep it cheap.
            from sklearn.inspection import permutation_importance
            result = permutation_importance(
                model, X_test, y_test,
                n_repeats=5,
                random_state=GBConfig.RANDOM_STATE
            )
            importance_scores = result.importances_mean

        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance_scores
        }).sort_values('importance', ascending=False)

        importance_path = os.path.join(BaseConfig.GB_DIR, 'feature_importance.csv')
        importance_df.to_csv(importance_path, index=False)

        return {'built_in_importance': importance_df}

    except Exception:
        # Best-effort analysis: failures are non-fatal for training.
        return None
def main():
    """Train the HistGradientBoosting model end to end.

    Returns (model, test_metrics), or (None, None) on any failure.
    """
    try:
        sequence_length = GBConfig.SEQUENCE_LENGTH

        train_data, test_data, _, xu_data, atkins_data = load_data()

        X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
            train_data, test_data, seq_length=sequence_length
        )

        # External validation sets are optional and prepared best-effort.
        X_xu = y_xu = X_atkins = y_atkins = None

        if xu_data is not None and not xu_data.empty:
            try:
                X_xu, y_xu, _, _, _, _, _ = prepare_data(
                    xu_data, pd.DataFrame(columns=xu_data.columns),
                    seq_length=sequence_length
                )
            except Exception:
                X_xu = y_xu = None

        if atkins_data is not None and not atkins_data.empty:
            try:
                X_atkins, y_atkins, _, _, _, _, _ = prepare_data(
                    atkins_data, pd.DataFrame(columns=atkins_data.columns),
                    seq_length=sequence_length
                )
            except Exception:
                X_atkins = y_atkins = None

        model, _, training_info = train_hist_model(
            X_train, y_train, X_test, y_test, sample_weights,
            X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
        )

        # Side effect only: writes the importance CSV.
        analyze_feature_importance(model, X_test, y_test, test_data)

        return model, training_info['final_metrics']['test']

    except Exception:
        return None, None


if __name__ == "__main__":
    BaseConfig.create_directories()
    main()