203 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			203 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
| import numpy as np
 | ||
| import pandas as pd
 | ||
| from typing import Tuple, Optional
 | ||
| from Bio import SeqIO
 | ||
| from Bio.Seq import Seq
 | ||
| 
 | ||
| def fscanr(blastx_output: pd.DataFrame, 
 | ||
|            mismatch_cutoff: float = 10,
 | ||
|            evalue_cutoff: float = 1e-5,
 | ||
|            frameDist_cutoff: float = 10) -> pd.DataFrame:
 | ||
|     """
 | ||
|     identify PRF sites from BLASTX output
 | ||
|     
 | ||
|     Args:
 | ||
|         blastx_output: BLASTX output DataFrame
 | ||
|         mismatch_cutoff: mismatch threshold
 | ||
|         evalue_cutoff: E-value threshold 
 | ||
|         frameDist_cutoff: frame distance threshold
 | ||
|         
 | ||
|     Returns:
 | ||
|         pd.DataFrame: DataFrame containing PRF site information
 | ||
|     """
 | ||
|     blastx = blastx_output.copy()
 | ||
|     
 | ||
|     blastx.columns = ["qseqid", "sseqid", "pident", "length", "mismatch", 
 | ||
|                      "gapopen", "qstart", "qend", "sstart", "send", 
 | ||
|                      "evalue", "bitscore", "qframe", "sframe"]
 | ||
|     
 | ||
|     blastx = blastx[
 | ||
|         (blastx['evalue'] <= evalue_cutoff) & 
 | ||
|         (blastx['mismatch'] <= mismatch_cutoff)
 | ||
|     ].dropna()
 | ||
| 
 | ||
|     freq = blastx['qseqid'].value_counts()
 | ||
|     multi_hits = freq[freq > 1].index
 | ||
|     blastx = blastx[blastx['qseqid'].isin(multi_hits)]
 | ||
|     
 | ||
|     blastx = blastx.sort_values(['qseqid', 'sseqid', 'qstart'])
 | ||
|     
 | ||
|     prf_list = []
 | ||
|     for i in range(1, len(blastx)):
 | ||
|         curr = blastx.iloc[i]
 | ||
|         prev = blastx.iloc[i-1]
 | ||
|         
 | ||
|         if (curr['qseqid'] == prev['qseqid'] and 
 | ||
|             curr['sseqid'] == prev['sseqid'] and
 | ||
|             curr['qframe'] != prev['qframe'] and 
 | ||
|             curr['qframe'] * prev['qframe'] > 0):
 | ||
|             
 | ||
|             if curr['qframe'] > 0 and prev['qframe'] > 0:
 | ||
|                 frame_start = prev['qend'] 
 | ||
|                 frame_end = curr['qstart']
 | ||
|                 pep_start = prev['send']
 | ||
|                 pep_end = curr['sstart']
 | ||
|                 strand = "+"
 | ||
|             elif curr['qframe'] < 0 and prev['qframe'] < 0:
 | ||
|                 frame_start = prev['qstart']
 | ||
|                 frame_end = curr['qend'] 
 | ||
|                 pep_start = curr['send']
 | ||
|                 pep_end = prev['sstart']
 | ||
|                 strand = "-"
 | ||
|             else:
 | ||
|                 continue
 | ||
|                 
 | ||
|             q_dist = frame_end - frame_start - 1
 | ||
|             s_dist = pep_end - pep_start
 | ||
|             fs_type = q_dist + (1 - s_dist) * 3
 | ||
|             
 | ||
|             if (abs(q_dist) <= frameDist_cutoff and 
 | ||
|                 abs(s_dist) <= frameDist_cutoff // 3 and
 | ||
|                 -3 < fs_type < 3):
 | ||
|                 
 | ||
|                 prf_list.append({
 | ||
|                     'DNA_seqid': curr['qseqid'],
 | ||
|                     'FS_start': frame_start,
 | ||
|                     'FS_end': frame_end,
 | ||
|                     'Pep_seqid': curr['sseqid'],
 | ||
|                     'Pep_FS_start': prev['send'] + 1,
 | ||
|                     'Pep_FS_end': curr['sstart'],
 | ||
|                     'FS_type': fs_type,
 | ||
|                     'Strand': strand
 | ||
|                 })
 | ||
|     
 | ||
|     if not prf_list:
 | ||
|         print("No PRF events detected!")
 | ||
|         return pd.DataFrame()
 | ||
|         
 | ||
|     prf = pd.DataFrame(prf_list)
 | ||
|     
 | ||
|     for col in ['DNA_seqid', 'Pep_seqid']:
 | ||
|         for pos in ['FS_start', 'FS_end']:
 | ||
|             loci = prf[col] + '_' + prf[pos].astype(str)
 | ||
|             prf = prf[~loci.duplicated()]
 | ||
|             
 | ||
|     return prf
 | ||
| 
 | ||
| def extract_prf_regions(mrna_file: str, prf_data: pd.DataFrame) -> pd.DataFrame:
 | ||
|     """
 | ||
|     从mRNA序列中提取PRF位点周围的序列
 | ||
|     
 | ||
|     Args:
 | ||
|         mrna_file: mRNA序列文件路径 (FASTA格式)
 | ||
|         prf_data: FScanR输出的PRF位点数据
 | ||
|         
 | ||
|     Returns:
 | ||
|         pd.DataFrame: 包含399bp序列的DataFrame
 | ||
|     """
 | ||
|     mrna_dict = {rec.id: str(rec.seq) 
 | ||
|                  for rec in SeqIO.parse(mrna_file, "fasta")}
 | ||
|     
 | ||
|     results = []
 | ||
|     for _, row in prf_data.iterrows():
 | ||
|         seq_id = row['DNA_seqid']
 | ||
|         if seq_id not in mrna_dict:
 | ||
|             print(f"警告: {seq_id} 未在mRNA文件中找到")
 | ||
|             continue
 | ||
|             
 | ||
|         sequence = mrna_dict[seq_id]
 | ||
|         strand = row['Strand']
 | ||
|         fs_start = int(row['FS_start'])
 | ||
|         
 | ||
|         try:
 | ||
|             if strand == '-':
 | ||
|                 sequence = str(Seq(sequence).reverse_complement())
 | ||
|             
 | ||
|             # 只提取399bp序列,33bp由predictor内部截取
 | ||
|             full_seq = extract_window_sequences(sequence, fs_start)[1]
 | ||
|             
 | ||
|             results.append({
 | ||
|                 'DNA_seqid': seq_id,
 | ||
|                 'FS_start': fs_start,
 | ||
|                 'FS_end': int(row['FS_end']),
 | ||
|                 'Strand': strand,
 | ||
|                 '399bp': full_seq,
 | ||
|                 'FS_type': row['FS_type']
 | ||
|             })
 | ||
|             
 | ||
|         except Exception as e:
 | ||
|             print(f"处理 {seq_id} 时出错: {str(e)}")
 | ||
|             continue
 | ||
|             
 | ||
|     return pd.DataFrame(results)
 | ||
| 
 | ||
| def extract_window_sequences(seq: str, position: int) -> Tuple[Optional[str], Optional[str]]:
 | ||
|     """
 | ||
|     从指定位置提取分析窗口序列
 | ||
|     
 | ||
|     Args:
 | ||
|         seq: 输入DNA序列
 | ||
|         position: 当前分析位置 (FS_start)
 | ||
|     
 | ||
|     Returns:
 | ||
|         Tuple[str, str]: (33bp序列, 399bp序列) - 已调整为与训练模型匹配的长度
 | ||
|     """
 | ||
|     # 确保位置在密码子边界上(整数倍的3)
 | ||
|     frame_position = position - (position % 3)
 | ||
| 
 | ||
|     # 计算33bp窗口的起止位置 (GB模型)
 | ||
|     half_size_small = 33 // 2
 | ||
|     start_small = frame_position - half_size_small
 | ||
|     end_small = frame_position + half_size_small + (33 % 2)  # 添加余数以处理奇数长度
 | ||
|     
 | ||
|     # 计算399bp窗口的起止位置 (CNN模型)
 | ||
|     half_size_large = 399 // 2
 | ||
|     start_large = frame_position - half_size_large
 | ||
|     end_large = frame_position + half_size_large + (399 % 2)  # 添加余数以处理奇数长度
 | ||
| 
 | ||
|     # 提取序列并填充
 | ||
|     seq_small = _extract_and_pad(seq, start_small, end_small, 33)
 | ||
|     seq_large = _extract_and_pad(seq, start_large, end_large, 399)
 | ||
|     
 | ||
|     return seq_small, seq_large
 | ||
| 
 | ||
| def _extract_and_pad(seq: str, start: int, end: int, target_length: int) -> str:
 | ||
|     """提取序列并用N填充"""
 | ||
|     if start < 0:
 | ||
|         prefix = 'N' * abs(start)
 | ||
|         extracted = prefix + seq[:end]
 | ||
|     elif end > len(seq):
 | ||
|         suffix = 'N' * (end - len(seq))
 | ||
|         extracted = seq[start:] + suffix
 | ||
|     else:
 | ||
|         extracted = seq[start:end]
 | ||
|     
 | ||
|     # 确保序列长度正确
 | ||
|     if len(extracted) < target_length:
 | ||
|         # 从中心填充
 | ||
|         pad_left = (target_length - len(extracted)) // 2
 | ||
|         pad_right = target_length - len(extracted) - pad_left
 | ||
|         extracted = 'N' * pad_left + extracted + 'N' * pad_right
 | ||
|     elif len(extracted) > target_length:
 | ||
|         # 从序列两端等量截取
 | ||
|         excess = len(extracted) - target_length
 | ||
|         trim_each_side = excess // 2
 | ||
|         extracted = extracted[trim_each_side:len(extracted)-trim_each_side]
 | ||
|     
 | ||
|     return extracted
 | ||
| 
 | ||
| def prepare_cnn_input(sequence: str) -> np.ndarray:
 | ||
|     """prepare CNN model input"""
 | ||
|     base_to_num = {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'N': 0}
 | ||
|     seq_numeric = [base_to_num.get(base, 0) for base in sequence.upper()]
 | ||
|     return np.array(seq_numeric).reshape(1, len(sequence), 1)  |