# File: FScanpy-commit-code/data_pre/mfe.py
# Repository listing metadata: 112 lines, 3.2 KiB, Python

"""
RNA Structure Prediction and MFE Calculation
"""
import csv
import RNA
import pandas as pd
from utils.config import BaseConfig
def predict_rna_structure_multiple(csv_file, predictions):
    """
    Compute MFE values for one or more sequence windows and store them in the CSV.

    The CSV is read with pandas, one result column is added (or overwritten)
    per prediction configuration, and the table is written back to the same
    path without the index column.

    Args:
        csv_file: CSV file path; the file must contain a 'full_seq' column and
            is overwritten in place.
        predictions: Iterable of prediction configurations, each containing:
            - start: Starting position of the window (1-based)
            - length: Window length
            - column_name: Column name for the results in the CSV

    Raises:
        KeyError: If the CSV has no 'full_seq' column or a configuration lacks
            one of the keys listed above.

    Notes:
        A row whose window cannot be folded (for example a missing sequence)
        receives NaN, which pandas writes as an empty CSV cell. Failures are
        counted and reported once per column through the ``logging`` module
        instead of being dropped silently. Windows that run past the end of a
        sequence are truncated by slicing, not rejected.
    """
    import logging  # function-scope import keeps this block self-contained

    logger = logging.getLogger(__name__)

    df = pd.read_csv(csv_file)
    sequences = df['full_seq'].tolist()

    for pred in predictions:
        # The window bounds are the same for every row, so resolve them once.
        start = pred['start'] - 1  # Convert to 0-based index
        stop = start + pred['length']
        column_name = pred['column_name']

        mfe_values = []
        failed = 0
        first_error = None
        for sequence in sequences:
            try:
                fc = RNA.fold_compound(sequence[start:stop])
                # fc.mfe() yields a (structure, energy) pair; only the energy is kept.
                _structure, mfe = fc.mfe()
            except Exception as exc:
                # Best effort: one bad row must not abort the whole file.
                mfe = float('nan')
                failed += 1
                if first_error is None:
                    first_error = exc
            mfe_values.append(mfe)

        if failed:
            logger.warning(
                "MFE prediction failed for %d of %d rows in column %r "
                "(first error: %r)",
                failed, len(sequences), column_name, first_error,
            )

        # Assign the whole column at once rather than cell by cell.
        df[column_name] = mfe_values

    # Save results to the original CSV file
    df.to_csv(csv_file, index=False)
def predict_rna_mfe_batch(sequences, start_pos=198, length=40):
    """
    Compute the MFE of a fixed window for every sequence in a batch.

    Args:
        sequences: Iterable of RNA sequences
        start_pos: Starting position of the window (1-based)
        length: Number of positions in the window

    Returns:
        List with one MFE value per input sequence, in input order. A
        sequence whose window cannot be folded contributes 0.0.
    """
    window_start = start_pos - 1  # 1-based position -> 0-based slice offset

    def _window_mfe(seq):
        # Fold one window; any failure maps to the 0.0 default.
        try:
            window = seq[window_start:window_start + length]
            compound = RNA.fold_compound(window)
            _structure, energy = compound.mfe()
        except Exception:
            return 0.0
        return energy

    return [_window_mfe(seq) for seq in sequences]
def calculate_mfe_features(data_file, output_file=None):
    """
    Calculate the standard MFE features for the sequences in a data file.

    Two result columns are produced, both for windows starting at position
    198 (1-based): 'mfe_40bp' (length 40) and 'mfe_120bp' (length 120).

    Args:
        data_file: Input CSV file path
        output_file: Output CSV file path (optional, defaults to input file).
            When given, the input is first copied to this path and the copy
            is annotated, so the input file itself is left unchanged.
    """
    import shutil  # function-scope import keeps this block self-contained

    if output_file is None:
        output_file = data_file
    else:
        # predict_rna_structure_multiple rewrites the file it is given in
        # place, so honour a separate output path by working on a copy.
        try:
            shutil.copyfile(data_file, output_file)
        except shutil.SameFileError:
            # Both paths name the same file: just update it in place.
            pass

    # Standard MFE prediction configurations
    predictions = [
        {
            'start': 198,
            'length': 40,
            'column_name': 'mfe_40bp'
        },
        {
            'start': 198,
            'length': 120,
            'column_name': 'mfe_120bp'
        }
    ]
    predict_rna_structure_multiple(output_file, predictions)
if __name__ == "__main__":
    # Example usage: annotate the data set whose path is configured as
    # BaseConfig.VALIDATION_DATA.
    calculate_mfe_features(BaseConfig.VALIDATION_DATA)