112 lines
3.2 KiB
Python
112 lines
3.2 KiB
Python
"""
RNA Structure Prediction and MFE Calculation
"""
|
|
import csv
import logging
import shutil

import pandas as pd
import RNA

from utils.config import BaseConfig
|
|
|
|
def predict_rna_structure_multiple(csv_file, predictions):
    """
    Predict RNA MFE for multiple regions and write results back to CSV file

    The CSV is read with pandas, each row's ``full_seq`` value is sliced once
    per prediction configuration, the slice is folded with
    ``RNA.fold_compound(...).mfe()`` and the resulting energy is stored in the
    configured column. The file at ``csv_file`` is then overwritten (without
    the index column).

    A cell is set to None (and a warning is logged) when no value can be
    computed: the sequence is not a string, the requested region is invalid
    or lies outside the sequence, or the folding call raises.

    Args:
        csv_file: CSV file path; the file must contain a ``full_seq`` column
        predictions: List of prediction configurations, each containing:
            - start: Starting position (1-based)
            - length: Prediction length
            - column_name: Column name for results in CSV

    Raises:
        KeyError: If ``full_seq`` is missing from the CSV or a configuration
            lacks one of its keys.
    """
    log = logging.getLogger(__name__)

    # Read CSV file
    df = pd.read_csv(csv_file)

    # Process each sequence
    for index, row in df.iterrows():
        sequence = row['full_seq']

        # Process each prediction configuration
        for pred in predictions:
            start = pred['start'] - 1  # Convert to 0-based index
            length = pred['length']
            column_name = pred['column_name']

            # Extract subsequence from specified region. A negative start
            # would silently wrap around to the end of the sequence, and a
            # missing value is read by pandas as a float NaN, so both are
            # treated as "no region" instead of being sliced.
            sub_seq = ''
            if isinstance(sequence, str) and start >= 0 and length > 0:
                sub_seq = sequence[start:start + length]

            mfe = None
            if not sub_seq:
                # Never hand an empty string to the folding library
                log.warning(
                    "Row %s: no subsequence at start=%s length=%s; "
                    "column %r left empty",
                    index, pred['start'], length, column_name,
                )
            else:
                try:
                    # Create fold_compound object and predict; mfe() yields a
                    # (structure, energy) pair and only the energy is kept
                    fc = RNA.fold_compound(sub_seq)
                    _, mfe = fc.mfe()
                except Exception as exc:
                    # Best effort: one failing row must not abort the batch,
                    # but the failure is reported instead of being hidden
                    log.warning(
                        "Row %s: MFE prediction for column %r failed: %s",
                        index, column_name, exc,
                    )
                    mfe = None

            # Store result in DataFrame
            df.at[index, column_name] = mfe

    # Save results to original CSV file
    df.to_csv(csv_file, index=False)
|
|
|
|
def predict_rna_mfe_batch(sequences, start_pos=198, length=40):
    """
    Batch prediction of RNA MFE for multiple sequences

    For every sequence the region ``[start_pos, start_pos + length)``
    (1-based start) is sliced out and folded with
    ``RNA.fold_compound(...).mfe()``.

    Args:
        sequences: List of RNA sequences
        start_pos: Starting position (1-based)
        length: Length of subsequence for MFE calculation

    Returns:
        List of MFE values, one per input sequence and in input order.
        An entry is 0.0 (and a warning is logged) when the sequence is not
        a string, the requested region is invalid or lies outside the
        sequence, or the folding call raises.
    """
    log = logging.getLogger(__name__)
    mfe_values = []
    start_idx = start_pos - 1  # Convert to 0-based

    for position, sequence in enumerate(sequences):
        # Extract subsequence. A negative start index would wrap around to
        # the end of the sequence, so it is rejected rather than sliced.
        sub_seq = ''
        if isinstance(sequence, str) and start_idx >= 0 and length > 0:
            sub_seq = sequence[start_idx:start_idx + length]

        if not sub_seq:
            # Never hand an empty string to the folding library
            log.warning(
                "Sequence %d: no subsequence at start_pos=%s length=%s; "
                "using default MFE 0.0",
                position, start_pos, length,
            )
            mfe_values.append(0.0)  # Default value on error
            continue

        try:
            # Calculate MFE; mfe() yields a (structure, energy) pair and
            # only the energy is kept
            fc = RNA.fold_compound(sub_seq)
            _, mfe = fc.mfe()
        except Exception as exc:
            # Best effort: keep the batch going, but report the failure
            log.warning(
                "Sequence %d: MFE prediction failed: %s; "
                "using default MFE 0.0",
                position, exc,
            )
            mfe = 0.0  # Default value on error

        mfe_values.append(mfe)

    return mfe_values
|
|
|
|
def calculate_mfe_features(data_file, output_file=None):
    """
    Calculate MFE features for sequences in a data file

    Two columns are produced by ``predict_rna_structure_multiple``:
    ``mfe_40bp`` and ``mfe_120bp``, both starting at 1-based position 198
    with region lengths 40 and 120 respectively.

    Args:
        data_file: Input CSV file path
        output_file: Output CSV file path (optional, defaults to input file).
            When it names a different file, the input is copied there first
            and the results are written into the copy, leaving ``data_file``
            untouched.
    """
    if output_file is None:
        output_file = data_file

    # Standard MFE prediction configurations
    predictions = [
        {
            'start': 198,
            'length': 40,
            'column_name': 'mfe_40bp',
        },
        {
            'start': 198,
            'length': 120,
            'column_name': 'mfe_120bp',
        },
    ]

    # predict_rna_structure_multiple rewrites the file it reads, so to honour
    # output_file the input is duplicated and the prediction runs on the copy.
    if output_file is not data_file:
        try:
            shutil.copyfile(data_file, output_file)
        except shutil.SameFileError:
            # Both paths name the same file: fall back to in-place update
            pass

    predict_rna_structure_multiple(output_file, predictions)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: compute the MFE feature columns for the data set
    # that the project configuration exposes as BaseConfig.VALIDATION_DATA.
    data_file = BaseConfig.VALIDATION_DATA
    calculate_mfe_features(data_file=data_file)
|