112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
| """
 | |
| RNA Structure Prediction and MFE Calculation
 | |
| """
 | |
| import csv
 | |
| import RNA
 | |
| import pandas as pd
 | |
| from utils.config import BaseConfig
 | |
| 
 | |
def predict_rna_structure_multiple(csv_file, predictions):
    """
    Predict RNA MFE for multiple regions and write results back to CSV file

    Each row's sequence is read from the ``full_seq`` column. For every
    prediction configuration the requested window is sliced out of the
    sequence, folded with ``RNA.fold_compound(...).mfe()``, and the minimum
    free energy is stored in the configured column. The file at ``csv_file``
    is then overwritten with the augmented table (no index column).

    A result cell is left empty when the row's ``full_seq`` value is not a
    string (e.g. a blank CSV cell) or when slicing/folding raises; every such
    case is logged as a warning instead of being silently discarded. Result
    columns are created before any row is processed, so the output always
    contains them, even when the input has no rows.

    Args:
        csv_file: CSV file path (read, then overwritten in place)
        predictions: Iterable of prediction configurations, each containing:
            - start: Starting position (1-based)
            - length: Prediction length
            - column_name: Column name for results in CSV

    Raises:
        KeyError: If a prediction configuration lacks one of the keys above,
            or if a row is processed and the CSV has no ``full_seq`` column.
    """
    # Imported locally so this function does not depend on a module-level
    # logging import being present.
    import logging

    logger = logging.getLogger(__name__)
    missing = float("nan")  # Written by to_csv as an empty cell

    # Read CSV file
    df = pd.read_csv(csv_file)

    # Parse the configurations once; they do not change from row to row.
    regions = [
        (pred['start'] - 1, pred['length'], pred['column_name'])  # 0-based start
        for pred in predictions
    ]

    # Make sure every result column exists regardless of how many rows follow.
    for _, _, column_name in regions:
        if column_name not in df.columns:
            df[column_name] = missing

    # Process each sequence
    for index, row in df.iterrows():
        sequence = row['full_seq']

        if not isinstance(sequence, str):
            # Blank cells are read by pandas as NaN (a float), which cannot be
            # sliced; clear any stale values and move on.
            logger.warning(
                "Row %s: 'full_seq' is not a string (%r); MFE columns left empty",
                index, sequence,
            )
            for _, _, column_name in regions:
                df.at[index, column_name] = missing
            continue

        # Process each prediction configuration
        for start, length, column_name in regions:
            try:
                # Extract subsequence from specified region and fold it
                sub_seq = sequence[start:start + length]
                _structure, mfe = RNA.fold_compound(sub_seq).mfe()
            except Exception:
                # Best effort: one bad region must not abort the whole file,
                # but the failure is recorded so it can be investigated.
                logger.warning(
                    "Row %s: MFE prediction for column %r failed",
                    index, column_name, exc_info=True,
                )
                mfe = missing

            # Store result in DataFrame
            df.at[index, column_name] = mfe

    # Save results to original CSV file
    df.to_csv(csv_file, index=False)
 | |
| 
 | |
def predict_rna_mfe_batch(sequences, start_pos=198, length=40):
    """
    Compute the MFE of a fixed window for each sequence in a batch

    For every sequence the window beginning at ``start_pos`` and spanning
    ``length`` characters is sliced out and folded with
    ``RNA.fold_compound(...).mfe()``.

    Args:
        sequences: Iterable of RNA sequences
        start_pos: Starting position of the window (1-based)
        length: Length of the window used for the MFE calculation

    Returns:
        List with one MFE value per input sequence, in input order. A
        sequence whose slicing or folding raises contributes 0.0 instead.
    """
    offset = start_pos - 1  # 1-based position -> 0-based index
    energies = []

    for seq in sequences:
        try:
            # Slice the window and fold it; only the energy is kept
            window = seq[offset:offset + length]
            _structure, energy = RNA.fold_compound(window).mfe()
        except Exception:
            energy = 0.0  # Default value on error
        energies.append(energy)

    return energies
 | |
| 
 | |
def calculate_mfe_features(data_file, output_file=None):
    """
    Calculate MFE features for sequences in a data file

    Adds two columns, ``mfe_40bp`` and ``mfe_120bp``, holding the MFE of the
    40- and 120-character windows that start at 1-based position 198 of each
    sequence.

    Args:
        data_file: Input CSV file path
        output_file: Output CSV file path (optional, defaults to input file).
            When given, the input is copied there first and the copy is
            annotated, leaving ``data_file`` unchanged.
    """
    # Imported locally so this function does not depend on a module-level
    # shutil import being present.
    import shutil

    if output_file is None:
        output_file = data_file
    else:
        # The prediction step rewrites the file it reads, so work on a copy
        # to honour the requested output path.
        try:
            shutil.copyfile(data_file, output_file)
        except shutil.SameFileError:
            # Both paths name the same file: annotate it in place.
            pass

    # Standard MFE prediction configurations
    predictions = [
        {
            'start': 198,
            'length': 40,
            'column_name': 'mfe_40bp',
        },
        {
            'start': 198,
            'length': 120,
            'column_name': 'mfe_120bp',
        },
    ]

    predict_rna_structure_multiple(output_file, predictions)
 | |
| 
 | |
if __name__ == "__main__":
    # Script entry point: compute the MFE feature columns for the CSV file
    # configured as BaseConfig.VALIDATION_DATA.
    calculate_mfe_features(BaseConfig.VALIDATION_DATA)
 |