first commit
commit 321c41687e
@ -0,0 +1,109 @@
from .predictor import PRFPredictor
import pandas as pd
import numpy as np
from typing import Union, List, Dict

__version__ = '0.3.0'
__author__ = ''
__email__ = ''

__all__ = ['PRFPredictor', 'predict_prf', '__version__', '__author__', '__email__']


def predict_prf(
    sequence: Union[str, List[str], None] = None,
    data: Union[pd.DataFrame, None] = None,
    window_size: int = 3,
    gb_threshold: float = 0.1,
    model_dir: str = None
) -> pd.DataFrame:
    """
    Predict PRF sites.

    Args:
        sequence: one or more DNA sequences, for sliding-window prediction
        data: DataFrame that must contain a '399bp' column, for region prediction
        window_size: sliding-window step size (default 3)
        gb_threshold: GB model probability threshold (default 0.1)
        model_dir: path to the model directory (optional)

    Returns:
        pandas.DataFrame: prediction results

    Examples:
        # 1. Sliding-window prediction on a single sequence
        >>> from FScanpy import predict_prf
        >>> sequence = "ATGCGTACGT..."
        >>> results = predict_prf(sequence=sequence)

        # 2. Sliding-window prediction on multiple sequences
        >>> sequences = ["ATGCGTACGT...", "GCTATAGCAT..."]
        >>> results = predict_prf(sequence=sequences)

        # 3. Region prediction from a DataFrame
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        ...     '399bp': ['ATGCGT...', 'GCTATAG...']
        ... })
        >>> results = predict_prf(data=data)
    """
    predictor = PRFPredictor(model_dir=model_dir)

    # Validate the input arguments.
    if sequence is None and data is None:
        raise ValueError("Either the 'sequence' or the 'data' argument must be provided")
    if sequence is not None and data is not None:
        raise ValueError("The 'sequence' and 'data' arguments are mutually exclusive")

    # Sliding-window prediction mode.
    if sequence is not None:
        if isinstance(sequence, str):
            # Single-sequence prediction.
            return predictor.predict_full(
                sequence, window_size, gb_threshold)
        elif isinstance(sequence, (list, tuple)):
            # Multi-sequence prediction.
            results = []
            for i, seq in enumerate(sequence, 1):
                try:
                    result = predictor.predict_full(
                        seq, window_size, gb_threshold)
                    result['Sequence_ID'] = f'seq_{i}'
                    results.append(result)
                except Exception as e:
                    print(f"Warning: prediction failed for sequence {i} - {str(e)}")
            return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    # Region prediction mode.
    else:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("The 'data' argument must be a pandas DataFrame")

        if '399bp' not in data.columns:
            raise ValueError("The DataFrame must contain a '399bp' column")

        # Call the region prediction function.
        try:
            results = predictor.predict_region(
                data['399bp'], gb_threshold)

            # Carry over the remaining columns of the input data.
            for col in data.columns:
                if col not in ['399bp', '33bp']:
                    results[col] = data[col].values

            return results

        except Exception as e:
            print(f"Warning: region prediction failed - {str(e)}")
            # Create an empty result.
            results = pd.DataFrame({
                'GB_Probability': [0.0] * len(data),
                'CNN_Probability': [0.0] * len(data),
                'Voting_Probability': [0.0] * len(data)
            })

            # Carry over the original data columns.
            for col in data.columns:
                results[col] = data[col].values

            return results
@ -0,0 +1,9 @@
import os
import pkg_resources


def get_test_data_path(filename: str) -> str:
    return pkg_resources.resource_filename('FScanpy', f'data/test_data/{filename}')


def list_test_data() -> list:
    data_dir = pkg_resources.resource_filename('FScanpy', 'data/test_data')
    return os.listdir(data_dir)
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,8 @@
FS_period,399bp,fs_position,DNA_seqid,label,source,FS_type,dataset
gtgtgaacacaatagtgagtgacatactaaacg,ggatatgtaacatggacaagtcattgtgtaggtatccaagaccaatagcctttactttcaaagggaaagaagaattcaggaccttgtttggaggactcatctcgatgtcgattcaggtggtcattgtgctctatgcttatattatgctaaagataatgatagaacgtaatgacacatcaaaaagtgtgaacacaatagtgagtgacatactaaacgacaaatctccagtatctctcaatacaacagatttctcgtttgcatttgatgcttttattcttggcgatgataatttcgatttcaacaataaccaatacttcggaattgagctacttcaatggattaagcagccagatactggagaactatcatccactaatattccatatgaaagatgtggaa,16.0,MSTRG.18491.1,0,EUPLOTES,negative,EUPLOTES
gtctcagaagagtctgaggaatatctccaagga,caaattaataacaaatatgaattccatcaacaacttttatggagacgagaacttatcagatgaacttctgagtgaagatgtcgtgtcttgagaagtaagaggatcagaaaagatcttgcataacatggggagaaagtctctcagtaataagaagcctttaagcggagtggagttggactgagagtctcagaagagtctgaggaatatctccaaggataaaatttgttcgcaaggaagatctatctttaggcagaagaagtcaaaatcttgtgatcaagtagaagaacctcttagtagtcttaaagataacatgagtcactttaatgacatagacttgcaagctagtaagcctctaaaatcagagattagcaatctttttgggtactcaactcagcccaa,16.0,MSTRG.4662.1,0,EUPLOTES,negative,EUPLOTES
cttacttgcaaacatgaatctaataaattagag,ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc,16.0,MSTRG.14742.1,0,EUPLOTES,negative,EUPLOTES
atagtagaagatacagtctccatggccagtaca,ttgttgataaaaatacaggattttatctccaattaaagtcgagaaaataagccaagtagcagtaaacgtagataaacgaataagcatggaaaacaataatagaatagaccaaattgaagagatttctcacaattcgtatttgaattactgttagatttaggaataacaaatcgaactcgacacatagtagaagatacagtctccatggccagtacaaaagccaccatgaagaagaggaaaaagaacagataaatgctatcaaagaaaataaagttaacgatcaatatacaagcatgattgttattaatttaaccaactttagataatcagattataatcagcgacgatcaaagcaaaaaggtaaagaacaaccaattcaaattgaacatcagaaaggga,16.0,CUFF.17967.2,0,EUPLOTES,negative,EUPLOTES
cctcgtctttgtctccagaaaataagaaaacaa,catcaataaatagagtcaatgttagaagtatgtctaaattcaaaccaaacgaaattctaaataaagcaagaatgccaacataattaatttagattaagcttgatagttcattagtactcaacaataaaaatatttcaaagggtaatattccagaatcaaaattaagaaataaaattattcctacctcgtctttgtctccagaaaataagaaaacaaataaatcagttatgttcgaaaatgttaaagagatggaaagccaggacaagtcgcaaaatacactaacacatttgaaagaaagcaataatggtagtccttccaaattttaaaactgaaaataatcttgcagatgtagttcgatctagagataataaagcttataacagtactctaaacttaaaa,16.0,CUFF.22392.1,0,EUPLOTES,negative,EUPLOTES
aaaaatgacaaagatctgaacattagttctttc,ttaattttgttctgatcacctaattgtaagcccaaaaacgatactcaaaagatgaggaaactttattggaacattaaaagtaatctcttgagtttatttatgctaactatttacatacgaagcttttacgaaacattccaattcttggctcttgctggcttatcagctacttggaacaacgacaaaaatgacaaagatctgaacattagttctttcatttttgccattgttttgttgtttttatgcacaggtttcttcttatggtcactctaccattactttggatcccgctctgacaatcctcgaaatctcaaaatctctcaggagtttacgaatggagcaaaggagaataatagcggtaaactatatccagtgcttggattgctgagaagaggtctc,16.0,MSTRG.9455.1,0,EUPLOTES,negative,EUPLOTES
agaagactgggagaactctcagatactatatct,agaagaagagaaggccaggagtagctcgaaagaggaggaatttaaggtttacccaaagaaccctatgactgactctaaagatgatcagtcggacactctccctccgaaatcttacagtgtaaagaaagccaatgtaggagaactaaacaagtacgattttgagatctcttattccaaataatgagaagactgggagaactctcagatactatatctgcaagtatgatgaatgaaggcgtaaatttaacaagacttggaactttattgatcacgctaggatacacacaggagagaagccttacaaatgtgagctgtgtggcaaagagtttgctcagaaggggaactacaacaaacacaggaatacccaccagcatagtgccaagaagacctcagtaatga,16.0,MSTRG.26803.1,0,EUPLOTES,negative,EUPLOTES
@ -0,0 +1,4 @@
from .sequence import SequenceFeatureExtractor
from .cnn_input import CNNInputProcessor

__all__ = ['SequenceFeatureExtractor', 'CNNInputProcessor']
@ -0,0 +1,80 @@
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from typing import List, Union


class CNNInputProcessor:
    """Input-data processor for the CNN model."""

    def __init__(self, max_length: int = 399):
        self.max_length = max_length
        self.base_to_num = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}

    def trim_sequence(self, seq, target_length):
        """
        Trim equal amounts from both ends of a sequence so that it reaches
        the target length while keeping the centre position unchanged.

        Args:
            seq: original sequence
            target_length: target length

        Returns:
            the trimmed sequence
        """
        if len(seq) <= target_length:
            return seq

        # Length that has to be removed from each end.
        excess = len(seq) - target_length
        trim_each_side = excess // 2

        # Trim equally from both ends, keeping the centre fixed; slice exactly
        # target_length characters so an odd excess cannot leave one extra base.
        return seq[trim_each_side:trim_each_side + target_length]

    def prepare_sequence(self, sequence: str) -> np.ndarray:
        """
        Process a single sequence.

        Args:
            sequence: DNA sequence

        Returns:
            np.ndarray: processed sequence array
        """
        try:
            # Sequence validation and preprocessing.
            if not isinstance(sequence, str):
                sequence = str(sequence)

            sequence = sequence.upper().replace('U', 'T')

            # Trim the sequence if it is longer than the target length.
            if len(sequence) > self.max_length:
                sequence = self.trim_sequence(sequence, self.max_length)

            # Use the same encoding as during training.
            self.base_to_num = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}  # consistent with SemiBilstmCnn.py

            # Convert the sequence to numbers.
            seq_numeric = []
            for base in sequence:
                seq_numeric.append(self.base_to_num.get(base, 4))  # unknown bases map to 4

            # Pad the sequence.
            if len(seq_numeric) < self.max_length:
                seq_numeric.extend([4] * (self.max_length - len(seq_numeric)))

            # Reshape the data into a 3-D array (samples, timesteps, features).
            result = np.array(seq_numeric).reshape(1, self.max_length, 1)

            # Check the result dimensions.
            if result.ndim != 3:
                print(f"Warning: unexpected CNN input dimensionality - {result.ndim}, expected 3")
                # Force the correct shape.
                result = result.reshape(1, self.max_length, 1)

            return result

        except Exception as e:
            print(f"CNN sequence processing failed: {str(e)}")
            # On error, return an all-zero 3-D array.
            return np.zeros((1, self.max_length, 1))
@ -0,0 +1,283 @@
import numpy as np
import pandas as pd
import itertools
from typing import List, Dict, Union


class SequenceFeatureExtractor:
    """DNA sequence feature extractor."""

    def __init__(self, seq_length=33):
        """Initialise the feature extractor."""
        self.bases = ['A', 'T', 'G', 'C']
        self.valid_bases = set('ATGCN')
        self.seq_length = seq_length  # configurable sequence length
        self.feature_names = self._get_feature_names()

    def _get_feature_names(self) -> List[str]:
        """
        Return the list of feature names, covering every possible base feature.

        Returns:
            features: list of feature names
        """
        features = []

        # Base features (including N).
        bases = ['A', 'T', 'G', 'C', 'N']
        features.extend(bases)

        # 3-mer features.
        kmers_3 = [''.join(p) for p in itertools.product(bases, repeat=3)]  # 125 features
        features.extend(kmers_3)

        # Codon features.
        codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]  # 64 codons
        n_codons = self.seq_length // 3  # number of complete codons in the sequence
        for i in range(n_codons):
            for codon in codons:
                features.append(f'codon_pos_{i}_{codon}')

        # GC-content feature.
        features.append('gc_content')

        # Sequence-complexity feature.
        features.append('sequence_complexity')

        return features

    def trim_sequence(self, seq, target_length):
        """
        Trim equal amounts from both ends of a sequence so that it reaches
        the target length.

        Args:
            seq: original sequence
            target_length: target length

        Returns:
            the trimmed sequence
        """
        if len(seq) <= target_length:
            return seq

        # Length that has to be removed from each end.
        excess = len(seq) - target_length
        trim_each_side = excess // 2

        # Trim equally from both ends, keeping the centre fixed; slice exactly
        # target_length characters so an odd excess cannot leave one extra base.
        return seq[trim_each_side:trim_each_side + target_length]

    def _preprocess_sequence(self, sequence):
        """
        Convert a DNA sequence into a feature vector.

        Args:
            sequence: DNA sequence

        Returns:
            feature_vector: feature vector
        """
        try:
            feature_names = self.feature_names

            if pd.isna(sequence) or not isinstance(sequence, str):
                sequence = str(sequence)
            sequence = sequence.upper().replace('U', 'T')  # normalise to upper case

            # Trim or pad the sequence to the target length.
            if len(sequence) > self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)
            else:
                sequence = sequence[:self.seq_length].ljust(self.seq_length, 'N')

            # Initialise the feature dictionary.
            features = {
                'A': 0,
                'T': 0,
                'G': 0,
                'C': 0,
                'N': 0
            }
            kmer_features = {}

            # Base composition.
            for base in ['A', 'T', 'G', 'C', 'N']:
                features[base] = sequence.count(base) / self.seq_length

            # 3-mer features.
            for kmer in [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)]:
                kmer_count = 0
                for i in range(self.seq_length - 2):
                    if sequence[i:i+3] == kmer:
                        kmer_count += 1
                kmer_features[kmer] = kmer_count / max(1, self.seq_length - 2)

            # Codon features.
            codon_features = {}
            codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]  # 64 codons
            n_codons = self.seq_length // 3  # number of complete codons in the sequence
            for i in range(n_codons):
                pos_start = i * 3
                current_codon = sequence[pos_start:pos_start+3]
                for codon in codons:
                    codon_features[f'codon_pos_{i}_{codon}'] = 1 if current_codon == codon and 'N' not in current_codon else 0

            # GC content.
            valid_bases = [b for b in sequence if b != 'N']
            gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0

            # Sequence complexity (Shannon entropy).
            from collections import Counter
            valid_counts = Counter(valid_bases)
            total_valid = sum(valid_counts.values())
            entropy = 0
            for cnt in valid_counts.values():
                p = cnt / total_valid
                entropy += -p * np.log2(p)
            entropy /= np.log2(4)  # normalise to 0-1

            # Merge all features.
            all_features = {**features, **kmer_features, **codon_features}
            all_features['gc_content'] = gc_content
            all_features['sequence_complexity'] = entropy

            # Keep the feature order consistent.
            feature_vector = [all_features.get(f, 0.0) for f in feature_names]

            return feature_vector
        except Exception as e:
            raise ValueError(f"Feature extraction failed: {str(e)}")

    def extract_features_batch(self, sequences: List[Union[str, float]]) -> np.ndarray:
        """
        Extract features in batch.

        Args:
            sequences: list of DNA sequences

        Returns:
            np.ndarray: feature matrix
        """
        try:
            return np.array([self.extract_features(seq) for seq in sequences])
        except Exception as e:
            raise ValueError(f"Batch feature extraction failed: {str(e)}")

    def predict_region_batch(self, data: pd.DataFrame, gb_threshold: float = 0.1) -> pd.DataFrame:
        """
        Predict region sequences in batch.

        Args:
            data: DataFrame containing '33bp' and '399bp' columns
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: prediction results
        """
        results = []
        for idx, row in data.iterrows():
            try:
                # Make sure the sequences are strings.
                seq_33bp = str(row['33bp'])
                seq = str(row['399bp'])

                # Make sure the sequence lengths are correct.
                seq_33bp = self._preprocess_sequence(seq_33bp)
                seq = self._preprocess_sequence(seq)

                # Predict.
                result = self.predict_region(seq_33bp, seq, gb_threshold)

                # Carry over the remaining columns of the input data.
                for col in data.columns:
                    if col not in ['33bp', '399bp']:
                        result[col] = row[col]

                results.append(result)

            except Exception as e:
                print(f"Error while processing the sequence at index {idx}: {str(e)}")
                continue

        return pd.DataFrame(results)

    def extract_features(self, sequence: str) -> list:
        """
        Extract sequence features.

        Args:
            sequence: DNA sequence

        Returns:
            list: feature vector
        """
        try:
            # Make sure the input is a string.
            if not isinstance(sequence, str):
                sequence = str(sequence)

            # Upper-case and replace U with T.
            sequence = sequence.upper().replace('U', 'T')

            # Trim the sequence if its length differs from the target length.
            if len(sequence) != self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)

            # Initialise the feature list.
            features = []

            try:
                # Base features (base frequencies).
                for base in ['A', 'T', 'G', 'C', 'N']:
                    features.append(sequence.count(base) / len(sequence))

                # 3-mer features.
                for kmer in [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)]:
                    count = 0
                    for i in range(len(sequence) - 2):
                        if sequence[i:i+3] == kmer:
                            count += 1
                    features.append(count / max(1, len(sequence) - 2))

                # Codon features.
                codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]
                n_codons = len(sequence) // 3
                for i in range(n_codons):
                    pos_start = i * 3
                    current_codon = sequence[pos_start:pos_start+3]
                    for codon in codons:
                        features.append(1 if current_codon == codon and 'N' not in current_codon else 0)

                # GC content.
                valid_bases = [b for b in sequence if b != 'N']
                gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0
                features.append(gc_content)

                # Sequence complexity.
                from collections import Counter
                valid_counts = Counter(valid_bases)
                total_valid = sum(valid_counts.values())
                entropy = 0
                if total_valid > 0:  # avoid division by zero
                    for cnt in valid_counts.values():
                        if cnt > 0:  # avoid log(0)
                            p = cnt / total_valid
                            entropy += -p * np.log2(p)
                    entropy /= np.log2(4) if len(valid_counts) > 0 else 1  # normalise to 0-1, avoiding division by zero
                features.append(entropy)

                # Make sure a flat list/array is returned.
                if isinstance(features, np.ndarray) and features.ndim > 1:
                    features = features.flatten()

                return features

            except Exception as e:
                print(f"Error during feature computation: {str(e)}")
                # On error, return an all-zero feature vector of the correct length.
                expected_length = 5 + 125 + (len(sequence) // 3) * 64 + 2  # derived from the feature-extraction logic above
                return [0.0] * expected_length

        except Exception as e:
            print(f"Feature extraction failed: {str(e)}")
            # Return an empty list; callers must handle this case.
            return []
@ -0,0 +1,434 @@
import os
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from .features.sequence import SequenceFeatureExtractor
from .features.cnn_input import CNNInputProcessor
from .utils import extract_window_sequences
import matplotlib.pyplot as plt
import joblib


class PRFPredictor:

    def __init__(self, model_dir=None):
        """
        Initialise the PRF predictor.

        Args:
            model_dir: path to the model directory (optional)
        """
        if model_dir is None:
            from pkg_resources import resource_filename
            model_dir = resource_filename('FScanpy', 'pretrained')

        try:
            # Load the models.
            self.gb_model = self._load_pickle(os.path.join(model_dir, 'GradientBoosting_all.pkl'))
            self.cnn_model = self._load_pickle(os.path.join(model_dir, 'BiLSTM-CNN_all.pkl'))
            self.voting_model = self._load_pickle(os.path.join(model_dir, 'Voting_all.pkl'))

            # Use the same sequence lengths as during training.
            self.gb_seq_length = 33    # sequence length used by HistGradientBoosting
            self.cnn_seq_length = 399  # sequence length used by BiLSTM-CNN

            # Initialise the feature extractor and the CNN input processor.
            self.feature_extractor = SequenceFeatureExtractor(seq_length=self.gb_seq_length)
            self.cnn_processor = CNNInputProcessor(max_length=self.cnn_seq_length)

        except FileNotFoundError as e:
            raise FileNotFoundError(f"Model file not found: {str(e)}")
        except Exception as e:
            raise Exception(f"Error while loading the models: {str(e)}")

    def _load_pickle(self, path):
        return joblib.load(path)

    def predict_single_position(self, fs_period, full_seq, gb_threshold=0.1):
        '''
        Predict the PRF state at a single position.

        Args:
            fs_period: 33bp sequence (processed according to gb_seq_length)
            full_seq: full sequence (processed according to cnn_seq_length)
            gb_threshold: probability threshold of the GB model (default 0.1)
        Returns:
            dict: the prediction probabilities
        '''
        try:
            # Adjust the sequence length.
            if len(fs_period) > self.gb_seq_length:
                fs_period = self.feature_extractor.trim_sequence(fs_period, self.gb_seq_length)

            # GB model prediction - the input must be a 2-D array.
            try:
                gb_features = self.feature_extractor.extract_features(fs_period)

                # Check the feature structure and make sure it is one-dimensional.
                if isinstance(gb_features, np.ndarray):
                    # Flatten multi-dimensional arrays.
                    if gb_features.ndim > 1:
                        print(f"Warning: features form a {gb_features.ndim}-D array, flattening")
                        gb_features = gb_features.flatten()

                # Explicitly convert the features to a 2-D array of shape (1, n_features).
                gb_features_2d = np.array([gb_features])

                # Check the dimensions once more.
                if gb_features_2d.ndim != 2:
                    raise ValueError(f"Features are still {gb_features_2d.ndim}-D after processing; a 2-D array is required")

                gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
            except Exception as e:
                print(f"Error during GB model prediction: {str(e)}")
                # Fall back to a probability of 0.
                gb_prob = 0.0

            # Skip the CNN model when the GB probability is below the threshold.
            if gb_prob < gb_threshold:
                return {
                    'GB_Probability': gb_prob,
                    'CNN_Probability': 0.0,
                    'Voting_Probability': 0.0
                }

            # CNN model prediction.
            try:
                # First determine the CNN model's type.
                is_sklearn_model = False

                # Heuristic for detecting the model type.
                if hasattr(self.cnn_model, 'predict_proba'):
                    # This is probably a scikit-learn model.
                    is_sklearn_model = True

                if is_sklearn_model:
                    # For a sklearn model (e.g. HistGradientBoostingClassifier), use the same
                    # feature extraction as for GB, but on the 399bp sequence.
                    cnn_features = self.feature_extractor.extract_features(full_seq)
                    if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                        cnn_features = cnn_features.flatten()
                    # Convert to a 2-D array.
                    cnn_features_2d = np.array([cnn_features])
                    cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                    cnn_prob = cnn_pred[0][1]
                else:
                    # Assume a deep-learning model that expects 3-D input.
                    cnn_input = self.cnn_processor.prepare_sequence(full_seq)
                    # Try different prediction call signatures.
                    try:
                        # First without extra arguments.
                        cnn_pred = self.cnn_model.predict(cnn_input)
                    except TypeError:
                        try:
                            # Then with the verbose argument.
                            cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                        except Exception:
                            # Finally reshape the input to 2-D.
                            reshaped_input = cnn_input.reshape(1, -1)
                            cnn_pred = self.cnn_model.predict(reshaped_input)

                    # Post-process the prediction.
                    if isinstance(cnn_pred, list):
                        cnn_pred = cnn_pred[0]

                    # Extract the probability value.
                    if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                        cnn_prob = cnn_pred[0][1]
                    else:
                        cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
            except Exception as e:
                print(f"Error during CNN model prediction: {str(e)}")
                # Fall back to a probability of 0.
                cnn_prob = 0.0

            # Voting model prediction.
            try:
                # The voting-model input must be a 2-D array (1, n_features).
                voting_input = np.array([[gb_prob, cnn_prob]])
                voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
            except Exception as e:
                print(f"Error during voting model prediction: {str(e)}")
                # Fall back to the mean of the two model probabilities.
                voting_prob = (gb_prob + cnn_prob) / 2

            return {
                'GB_Probability': gb_prob,
                'CNN_Probability': cnn_prob,
                'Voting_Probability': voting_prob
            }

        except Exception as e:
            raise Exception(f"Error during prediction: {str(e)}")

    def predict_full(self, sequence, window_size=3, gb_threshold=0.1, plot=False):
        """
        Predict PRF sites in a full sequence.

        Args:
            sequence: input DNA sequence
            window_size: sliding-window step size (default 3)
            gb_threshold: GB model probability threshold (default 0.1)
            plot: whether to plot the prediction results (default False)

        Returns:
            if plot=False:
                pd.DataFrame: prediction results
            if plot=True:
                tuple: (pd.DataFrame, matplotlib.figure.Figure)
        """
        if window_size < 1:
            raise ValueError("The window size must be at least 1")
        if gb_threshold < 0:
            raise ValueError("The GB threshold must be non-negative")

        results = []

        try:
            # Make sure the sequence is an upper-case string.
            sequence = str(sequence).upper()

            # Sliding-window prediction.
            for pos in range(0, len(sequence) - 2, window_size):
                # Extract the window sequences - the same window sizes as during training.
                fs_period, full_seq = extract_window_sequences(sequence, pos)

                if fs_period is None or full_seq is None:
                    continue

                # Predict and record the result.
                pred = self.predict_single_position(fs_period, full_seq, gb_threshold)
                pred.update({
                    'Position': pos,
                    'Codon': sequence[pos:pos+3],
                    '33bp': fs_period,
                    '399bp': full_seq
                })
                results.append(pred)

            # Build the result DataFrame.
            results_df = pd.DataFrame(results)

            # Plot if requested.
            if plot:
                # Create the figure.
                fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), height_ratios=[2, 1])

                # Line plots.
                ax1.plot(results_df['Position'], results_df['GB_Probability'],
                         label='GB model', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['CNN_Probability'],
                         label='CNN model', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['Voting_Probability'],
                         label='Voting model', linewidth=2, color='red')

                ax1.set_xlabel('Sequence position')
                ax1.set_ylabel('Frameshift probability')
                ax1.set_title('Frameshift prediction probabilities')
                ax1.legend()
                ax1.grid(True, alpha=0.3)

                # Prepare the heat-map data.
                positions = results_df['Position'].values
                probabilities = results_df['Voting_Probability'].values

                # Build the heat-map matrix.
                heatmap_matrix = np.zeros((1, len(positions)))
                heatmap_matrix[0, :] = probabilities

                # Plot the heat map.
                im = ax2.imshow(heatmap_matrix, aspect='auto', cmap='YlOrRd',
                                extent=[min(positions), max(positions), 0, 1])

                # Add a colour bar.
                cbar = plt.colorbar(im, ax=ax2)
                cbar.set_label('Frameshift probability')

                # Label the heat-map axes.
                ax2.set_xlabel('Sequence position')
                ax2.set_title('Frameshift probability heat map')
                ax2.set_yticks([])

                # Adjust the layout.
                plt.tight_layout()

                return results_df, fig

            return results_df

        except Exception as e:
            raise Exception(f"Error during sequence prediction: {str(e)}")

    def predict_region(self, seq, gb_threshold=0.1):
        '''
        Predict region sequences.

        Args:
            seq: a 399bp sequence, or a DataFrame/Series containing 399bp sequences
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: prediction probabilities for all sequences
        '''
        try:
            # Convert a DataFrame or Series input to a list.
            if isinstance(seq, (pd.DataFrame, pd.Series)):
                seq = seq.tolist()

            # Convert a single-string input to a list.
            if isinstance(seq, str):
                seq = [seq]

            results = []
            for i, seq399 in enumerate(seq):
                try:
                    # Cut the central 33bp out of the 399bp sequence (used by the GB model).
                    seq33 = self._extract_center_sequence(seq399, target_length=self.gb_seq_length)

                    # GB model prediction - the input must be a 2-D array.
                    try:
                        gb_features = self.feature_extractor.extract_features(seq33)

                        # Check the feature structure and make sure it is one-dimensional.
                        if isinstance(gb_features, np.ndarray):
                            # Flatten multi-dimensional arrays.
                            if gb_features.ndim > 1:
                                print(f"Warning: features of sequence {i+1} form a {gb_features.ndim}-D array, flattening")
                                gb_features = gb_features.flatten()

                        # Explicitly convert the features to a 2-D array of shape (1, n_features).
                        gb_features_2d = np.array([gb_features])

                        # Check the dimensions once more.
                        if gb_features_2d.ndim != 2:
                            raise ValueError(f"Features are still {gb_features_2d.ndim}-D after processing; a 2-D array is required")

                        gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
                    except Exception as e:
                        print(f"Error during GB model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to a probability of 0.
                        gb_prob = 0.0

                    # Record a low-probability result when the GB probability is below the threshold.
                    if gb_prob < gb_threshold:
                        results.append({
                            'GB_Probability': gb_prob,
                            'CNN_Probability': 0.0,
                            'Voting_Probability': 0.0,
                            '33bp': seq33,
                            '399bp': seq399
                        })
                        continue

                    # CNN model prediction.
                    try:
                        # First determine the CNN model's type.
                        is_sklearn_model = False

                        # Heuristic for detecting the model type.
                        if hasattr(self.cnn_model, 'predict_proba'):
                            # This is probably a scikit-learn model.
                            is_sklearn_model = True

                        if is_sklearn_model:
                            # For a sklearn model (e.g. HistGradientBoostingClassifier), use the same
                            # feature extraction as for GB, but on the 399bp sequence.
                            cnn_features = self.feature_extractor.extract_features(seq399)
                            if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                                cnn_features = cnn_features.flatten()
                            # Convert to a 2-D array.
                            cnn_features_2d = np.array([cnn_features])
                            cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                            cnn_prob = cnn_pred[0][1]
                        else:
                            # Assume a deep-learning model that expects 3-D input.
                            cnn_input = self.cnn_processor.prepare_sequence(seq399)
                            # Try different prediction call signatures.
                            try:
                                # First without extra arguments.
                                cnn_pred = self.cnn_model.predict(cnn_input)
                            except TypeError:
                                try:
                                    # Then with the verbose argument.
                                    cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                                except Exception:
                                    # Finally reshape the input to 2-D.
                                    reshaped_input = cnn_input.reshape(1, -1)
                                    cnn_pred = self.cnn_model.predict(reshaped_input)

                            # Post-process the prediction.
                            if isinstance(cnn_pred, list):
                                cnn_pred = cnn_pred[0]

                            # Extract the probability value.
                            if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                                cnn_prob = cnn_pred[0][1]
                            else:
                                cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
                    except Exception as e:
                        print(f"Error during CNN model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to a probability of 0.
                        cnn_prob = 0.0

                    # Voting model prediction.
                    try:
                        # The voting-model input must be a 2-D array (1, n_features).
                        voting_input = np.array([[gb_prob, cnn_prob]])
                        voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
                    except Exception as e:
                        print(f"Error during voting model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to the mean of the two model probabilities.
                        voting_prob = (gb_prob + cnn_prob) / 2

                    results.append({
                        'GB_Probability': gb_prob,
                        'CNN_Probability': cnn_prob,
                        'Voting_Probability': voting_prob,
                        '33bp': seq33,
                        '399bp': seq399
                    })

                except Exception as e:
                    print(f"Error while processing sequence {i+1}: {str(e)}")
                    results.append({
                        'GB_Probability': 0.0,
                        'CNN_Probability': 0.0,
                        'Voting_Probability': 0.0,
                        '33bp': self._extract_center_sequence(seq399, target_length=self.gb_seq_length) if len(seq399) >= self.gb_seq_length else seq399,
                        '399bp': seq399
                    })

            return pd.DataFrame(results)

        except Exception as e:
            raise Exception(f"Error during region prediction: {str(e)}")

    def _extract_center_sequence(self, sequence, target_length=33):
        """Extract a subsequence of the given length from the centre of a sequence."""
        # Make sure the sequence is an upper-case string.
        sequence = str(sequence).upper()

        # Return the sequence unchanged if it is not longer than the target length.
        if len(sequence) <= target_length:
            return sequence

        # Locate the centre.
        center = len(sequence) // 2
        half_target = target_length // 2

        # Extract the central subsequence.
        start = center - half_target
        end = start + target_length

        # Boundary checks.
        if start < 0:
            start = 0
            end = target_length
        elif end > len(sequence):
            end = len(sequence)
            start = end - target_length

        return sequence[start:end]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,203 @@
import numpy as np
import pandas as pd
from typing import Tuple, Optional
from Bio import SeqIO
from Bio.Seq import Seq


def fscanr(blastx_output: pd.DataFrame,
           mismatch_cutoff: float = 10,
           evalue_cutoff: float = 1e-5,
           frameDist_cutoff: float = 10) -> pd.DataFrame:
    """
    Identify PRF sites from BLASTX output.

    Args:
        blastx_output: BLASTX output DataFrame
        mismatch_cutoff: mismatch threshold
        evalue_cutoff: E-value threshold
        frameDist_cutoff: frame distance threshold

    Returns:
        pd.DataFrame: DataFrame containing PRF site information
    """
    blastx = blastx_output.copy()

    blastx.columns = ["qseqid", "sseqid", "pident", "length", "mismatch",
                      "gapopen", "qstart", "qend", "sstart", "send",
                      "evalue", "bitscore", "qframe", "sframe"]

    # Keep only significant, low-mismatch hits.
    blastx = blastx[
        (blastx['evalue'] <= evalue_cutoff) &
        (blastx['mismatch'] <= mismatch_cutoff)
    ].dropna()

    # Keep only query sequences with more than one hit.
    freq = blastx['qseqid'].value_counts()
    multi_hits = freq[freq > 1].index
    blastx = blastx[blastx['qseqid'].isin(multi_hits)]

    blastx = blastx.sort_values(['qseqid', 'sseqid', 'qstart'])

    prf_list = []
    for i in range(1, len(blastx)):
        curr = blastx.iloc[i]
        prev = blastx.iloc[i-1]

        # Adjacent hits of the same query/subject pair in different but same-strand frames.
        if (curr['qseqid'] == prev['qseqid'] and
            curr['sseqid'] == prev['sseqid'] and
            curr['qframe'] != prev['qframe'] and
            curr['qframe'] * prev['qframe'] > 0):

            if curr['qframe'] > 0 and prev['qframe'] > 0:
                frame_start = prev['qend']
                frame_end = curr['qstart']
                pep_start = prev['send']
                pep_end = curr['sstart']
                strand = "+"
            elif curr['qframe'] < 0 and prev['qframe'] < 0:
                frame_start = prev['qstart']
                frame_end = curr['qend']
                pep_start = curr['send']
                pep_end = prev['sstart']
                strand = "-"
            else:
                continue

            q_dist = frame_end - frame_start - 1
            s_dist = pep_end - pep_start
            fs_type = q_dist + (1 - s_dist) * 3

            if (abs(q_dist) <= frameDist_cutoff and
                abs(s_dist) <= frameDist_cutoff // 3 and
                -3 < fs_type < 3):

                prf_list.append({
                    'DNA_seqid': curr['qseqid'],
                    'FS_start': frame_start,
                    'FS_end': frame_end,
                    'Pep_seqid': curr['sseqid'],
                    'Pep_FS_start': prev['send'] + 1,
                    'Pep_FS_end': curr['sstart'],
                    'FS_type': fs_type,
                    'Strand': strand
                })

    if not prf_list:
        print("No PRF events detected!")
        return pd.DataFrame()

    prf = pd.DataFrame(prf_list)

    # Drop duplicated loci.
    for col in ['DNA_seqid', 'Pep_seqid']:
        for pos in ['FS_start', 'FS_end']:
            loci = prf[col] + '_' + prf[pos].astype(str)
            prf = prf[~loci.duplicated()]

    return prf


def extract_prf_regions(mrna_file: str, prf_data: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the sequences around PRF sites from mRNA sequences.

    Args:
        mrna_file: path to the mRNA sequence file (FASTA format)
        prf_data: PRF site data produced by FScanR

    Returns:
        pd.DataFrame: DataFrame containing the 399bp sequences
    """
    mrna_dict = {rec.id: str(rec.seq)
                 for rec in SeqIO.parse(mrna_file, "fasta")}

    results = []
    for _, row in prf_data.iterrows():
        seq_id = row['DNA_seqid']
        if seq_id not in mrna_dict:
            print(f"Warning: {seq_id} not found in the mRNA file")
            continue

        sequence = mrna_dict[seq_id]
        strand = row['Strand']
        fs_start = int(row['FS_start'])

        try:
            if strand == '-':
                sequence = str(Seq(sequence).reverse_complement())

            # Only extract the 399bp sequence; the 33bp is cut out inside the predictor.
            full_seq = extract_window_sequences(sequence, fs_start)[1]

            results.append({
                'DNA_seqid': seq_id,
                'FS_start': fs_start,
                'FS_end': int(row['FS_end']),
                'Strand': strand,
                '399bp': full_seq,
                'FS_type': row['FS_type']
            })

        except Exception as e:
            print(f"Error while processing {seq_id}: {str(e)}")
            continue

    return pd.DataFrame(results)


def extract_window_sequences(seq: str, position: int) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract the analysis-window sequences at a given position.

    Args:
        seq: input DNA sequence
        position: current analysis position (FS_start)

    Returns:
        Tuple[str, str]: (33bp sequence, 399bp sequence) - already adjusted to the lengths the trained models expect
    """
    # Snap the position onto a codon boundary (a multiple of 3).
    frame_position = position - (position % 3)

    # Start and end of the 33bp window (GB model).
    half_size_small = 33 // 2
    start_small = frame_position - half_size_small
    end_small = frame_position + half_size_small + (33 % 2)  # add the remainder to handle the odd length

    # Start and end of the 399bp window (CNN model).
    half_size_large = 399 // 2
    start_large = frame_position - half_size_large
    end_large = frame_position + half_size_large + (399 % 2)  # add the remainder to handle the odd length

    # Extract and pad the sequences.
    seq_small = _extract_and_pad(seq, start_small, end_small, 33)
    seq_large = _extract_and_pad(seq, start_large, end_large, 399)

    return seq_small, seq_large


def _extract_and_pad(seq: str, start: int, end: int, target_length: int) -> str:
    """Extract a subsequence and pad it with N."""
    if start < 0:
        prefix = 'N' * abs(start)
        extracted = prefix + seq[:end]
    elif end > len(seq):
        suffix = 'N' * (end - len(seq))
        extracted = seq[start:] + suffix
    else:
        extracted = seq[start:end]

    # Make sure the length is correct.
    if len(extracted) < target_length:
        # Pad around the centre.
        pad_left = (target_length - len(extracted)) // 2
        pad_right = target_length - len(extracted) - pad_left
        extracted = 'N' * pad_left + extracted + 'N' * pad_right
    elif len(extracted) > target_length:
        # Trim equal amounts from both ends; slice exactly target_length
        # characters so an odd excess cannot leave one extra base.
        trim_each_side = (len(extracted) - target_length) // 2
        extracted = extracted[trim_each_side:trim_each_side + target_length]

    return extracted


def prepare_cnn_input(sequence: str) -> np.ndarray:
    """Prepare CNN model input."""
    base_to_num = {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'N': 0}
    seq_numeric = [base_to_num.get(base, 0) for base in sequence.upper()]
    return np.array(seq_numeric).reshape(1, len(sequence), 1)
@ -0,0 +1,34 @@
# FScanpy
## A Machine Learning-Based Framework for Programmed Ribosomal Frameshifting Prediction

FScanpy is a comprehensive Python package designed for the prediction of [Programmed Ribosomal Frameshifting (PRF)](https://en.wikipedia.org/wiki/Ribosomal_frameshift) sites in nucleotide sequences. By integrating advanced machine learning approaches (Gradient Boosting and BiLSTM-CNN) with the established [FScanR](https://github.com/seanchen607/FScanR.git) framework, FScanpy provides robust and accurate PRF site predictions. The package requires input sequences to be in the positive (5' to 3') orientation.

For detailed documentation and usage examples, please refer to our [tutorial](tutorial/tutorial.md).

## Installation Requirements
- Python ≥ 3.7
- Dependencies are automatically handled during installation

### Option 1: Install via pip
```bash
pip install FScanpy
```

### Option 2: Install from source
```bash
git clone https://github.com/seanchen607/FScanpy-package.git
cd FScanpy-package
pip install -e .
```
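
### Quick start
A minimal sketch of the public API (the 30-bp sequence below is a placeholder, not real data; see the tutorial for complete workflows):

```python
from FScanpy import predict_prf

# Sliding-window prediction over a single sequence. The result is a DataFrame
# with one row per scanned position and the GB, CNN and voting probabilities.
results = predict_prf(sequence="ATGCGTACGTATGCGTACGTATGCGTACGT")
print(results[['Position', 'GB_Probability', 'Voting_Probability']].head())
```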

## Authors

## Citation
If you utilize FScanpy in your research, please cite our work:

```bibtex
[Citation details will be added upon publication]
```
@ -0,0 +1,24 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "FScanpy"
version = "1.0.0"
description = "PRF prediction tool"
authors = [{name = "FScanpy Developer", email = "example@example.com"}]
dependencies = [
    "numpy",
    "pandas",
    "tensorflow",
    "scikit-learn",
    "wrapt>=1.10.11"
]
requires-python = ">=3.7"

[tool.setuptools]
packages = ["FScanpy", "FScanpy.features"]
include-package-data = true

[tool.setuptools.package-data]
"FScanpy.data" = ["test_data/*"]
@ -0,0 +1,20 @@
from setuptools import setup, find_packages

setup(
    name="FScanpy",
    version="1.0.0",
    description="PRF prediction tool",
    author="FScanpy Developer",
    author_email="example@example.com",
    packages=find_packages(),
    install_requires=[
        "numpy",
        "pandas",
        "tensorflow",
        "scikit-learn",
        "wrapt>=1.10.11",
        "biopython"
    ],
    include_package_data=True,
    python_requires=">=3.7",
)
Binary file not shown. After Width: | Height: | Size: 170 KiB
Binary file not shown. After Width: | Height: | Size: 148 KiB
@ -0,0 +1,164 @@
## Abstract
FScanpy is a Python package designed to predict Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates advanced machine learning models, including Gradient Boosting and BiLSTM-CNN, to provide accurate predictions. This tool is essential for understanding gene expression regulation in various organisms, including eukaryotes and viruses, and offers a robust solution for PRF prediction challenges.

## Introduction

FScanpy is a Python package dedicated to predicting Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates machine learning models (Gradient Boosting and BiLSTM-CNN) along with the FScanR package to furnish precise PRF predictions. Users can supply three types of data as input: the entire cDNA/mRNA sequence to be predicted, the nucleotide sequence in the vicinity of a suspected frameshift site, and the peptide-library blastx results of the species or related species. FScanpy expects the input sequence to be on the + strand and can be integrated with FScanR to improve accuracy.

For the prediction of an entire sequence, FScanpy adopts a sliding-window approach to scan the sequence and predict the PRF sites. Regional prediction is based on the 33-bp and 399-bp sequences in the 0 reading frame around the suspected frameshift site. First, the Gradient Boosting model predicts potential PRF sites within the scanning window. If the predicted probability exceeds the threshold, the BiLSTM-CNN model predicts the PRF sites in the 399bp sequence. A VotingClassifier then combines the two models to make the final prediction (a minimal sketch of this cascade follows below).
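
The cascade can be summarised in a few lines of Python. This is an illustrative sketch only, not the packaged implementation (see `PRFPredictor.predict_single_position` for the real logic); `gb`, `cnn` and `voting` stand for the three pretrained models, and `gb_features`/`cnn_input` for the package's feature extractors:

```python
def cascade(seq33, seq399, gb, cnn, voting,
            gb_features, cnn_input, gb_threshold=0.1):
    """Two-stage PRF cascade: GB screen, then CNN, then voting.

    gb/cnn/voting are the pretrained models; gb_features and cnn_input
    are callables that build the inputs each model expects.
    """
    # Stage 1: cheap Gradient Boosting screen on the 33bp window.
    gb_prob = gb.predict_proba([gb_features(seq33)])[0][1]
    if gb_prob < gb_threshold:
        # Below the threshold the expensive CNN stage is skipped entirely.
        return {'GB': gb_prob, 'CNN': 0.0, 'Voting': 0.0}
    # Stage 2: BiLSTM-CNN on the 399bp window.
    cnn_prob = float(cnn.predict(cnn_input(seq399))[0][0])
    # Stage 3: the voting classifier combines the two probabilities.
    voting_prob = voting.predict_proba([[gb_prob, cnn_prob]])[0][1]
    return {'GB': gb_prob, 'CNN': cnn_prob, 'Voting': voting_prob}
```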

For PRF detection from BLASTX output, FScanpy pairs the two hits of the same query sequence in the BLASTX alignment results, filters the pairs using frameDist_cutoff, mismatch_cutoff and evalue_cutoff, and finally applies the [FScanR](https://github.com/seanchen607/FScanR.git) logic to identify the PRF sites (a worked example of the frameshift-type arithmetic follows below).
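
The frameshift type is derived from the spacing of the two hits. A worked example of the arithmetic used in `fscanr`, under the sign convention of that function (a sketch; in the real code `q_dist` and `s_dist` come from the BLASTX hit coordinates):

```python
def fs_type(q_dist: int, s_dist: int) -> int:
    # q_dist: nucleotide gap between the two hits on the query
    # s_dist: residue gap between the same hits on the subject peptide
    return q_dist + (1 - s_dist) * 3

# Hits contiguous on the peptide (s_dist = 1) whose query coordinates
# overlap by one nucleotide (q_dist = -1) give fs_type = -1 (a -1 PRF):
assert fs_type(-1, 1) == -1
# A one-nucleotide gap on the query with contiguous peptide hits gives +1:
assert fs_type(1, 1) == 1
# Only events with -3 < fs_type < 3 pass the fscanr filter.
```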

### Background
[Ribosomal frameshifting](https://en.wikipedia.org/wiki/Ribosomal_frameshift), also known as translational frameshifting or translational recoding, is a biological phenomenon that occurs during translation and results in the production of multiple, unique proteins from a single mRNA. The process can be programmed by the nucleotide sequence of the mRNA and is sometimes affected by the secondary, 3-dimensional mRNA structure. It has been described mainly in viruses (especially retroviruses), retrotransposons and bacterial insertion elements, and also in some cellular genes.

### Key features of FScanpy include:

- Integration of two predictive models:
  - [Gradient Boosting](https://tensorflow.google.cn/tutorials/estimator/boosted_trees?hl=en): Analyzes local sequence features centered around potential frameshift sites (10 codons).
  - [BiLSTM-CNN](https://paperswithcode.com/method/cnn-bilstm): Analyzes broader sequence features (100 codons).
- Supports PRF prediction across various species.
- Can be combined with [FScanR](https://github.com/seanchen607/FScanR.git) for enhanced accuracy.

## Installation (python>=3.7)

### 1. Use pip
```bash
pip install FScanpy
```

### 2. Clone from [GitHub](https://github.com/.../FScanpy.git)
```bash
git clone https://github.com/.../FScanpy.git
cd your_project_directory
pip install -e .
```

## Methods and Usage

### 1. Load model and test data
Test data can be found in `FScanpy/data/test_data`. You can use the `list_test_data()` method to list all the test data and the `get_test_data_path()` method to get the path of a test file:
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor()  # load the models
list_test_data()  # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')
```

### 2. Predict PRF Sites in a Full Sequence
Use the `predict_full()` method to scan an entire sequence. You can use the `window_size` parameter to adjust the scanning window step (default is 3) and the `gb_threshold` parameter to adjust the Gradient Boosting model threshold (default is 0.1) for faster or more accurate prediction:
```python
'''
Args:
    sequence: mRNA sequence
    window_size: scanning window size (default is 3)
    gb_threshold: Gradient Boosting model threshold (default is 0.1)
Returns:
    results: DataFrame containing prediction probabilities
    fig: matplotlib figure (returned only when plot=True)
'''
results, fig = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
                                      window_size=3,     # scanning window step
                                      gb_threshold=0.1,  # Gradient Boosting model threshold
                                      plot=True)         # whether to plot the prediction results
fig.savefig('predict_full.png')
```

### 3. Predict PRF in Specific Regions
Use the `predict_region()` method to predict PRF in known regions of interest:
```python
'''
Args:
    seq: 399bp sequence
    gb_threshold: GB model probability threshold (default is 0.1)
Returns:
    DataFrame: prediction probabilities for all sequences
'''
import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])
```

### 4. Identify PRF Sites from BLASTX Output
The BLASTX output should contain the following columns: `qseqid`, `sseqid`, `pident`, `length`, `mismatch`, `gapopen`, `qstart`, `qend`, `sstart`, `send`, `evalue`, `bitscore`, `qframe`, `sframe`.

The FScanR result contains the `DNA_seqid`, `FS_start`, `FS_end`, `FS_type`, `Pep_seqid`, `Pep_FS_start`, `Pep_FS_end` and `Strand` columns.
Use the `fscanr` function to identify potential PRF sites from BLASTX alignment results:
```python
"""
identify PRF sites from BLASTX output

Args:
    blastx_output: BLASTX output DataFrame
    mismatch_cutoff: mismatch threshold
    evalue_cutoff: E-value threshold
    frameDist_cutoff: frame distance threshold

Returns:
    pd.DataFrame: DataFrame containing PRF site information
"""
from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
                       mismatch_cutoff=10,   # allowed mismatches
                       evalue_cutoff=1e-5,   # E-value threshold
                       frameDist_cutoff=10)  # frame distance threshold
```

### 5. Extract PRF Sites from BLASTX Output or your Sequence Data and evaluate them with FScanpy
Use the `extract_prf_regions()` method to extract PRF site sequences from mRNA sequences. It matches the `DNA_seqid` column of the FScanR output against the input mRNA sequence file and uses the `FS_start` column to extract the 399bp sequence around each PRF site in the 0 reading frame (the 33bp window is cut out inside the predictor):
```python
"""
extract PRF site sequences from mRNA sequences

Args:
    mrna_file: mRNA sequence file path (FASTA format)
    prf_data: FScanR output PRF site data, or your own suspected PRF site data, which must at least contain the `DNA_seqid`, `FS_start` and `Strand` columns

Returns:
    pd.DataFrame: DataFrame containing the 399bp sequences
"""
from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
                                  prf_data=fscanr_result)
prf_results = predictor.predict_region(prf_regions['399bp'])
```

## Total Test
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor()  # load the models
list_test_data()  # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')

results, fig = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
                                      window_size=3,     # scanning window step
                                      gb_threshold=0.1,  # Gradient Boosting model threshold
                                      plot=True)

import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])

from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
                       mismatch_cutoff=10,   # allowed mismatches
                       evalue_cutoff=1e-5,   # E-value threshold
                       frameDist_cutoff=10)

from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
                                  prf_data=fscanr_result)
prf_results = predictor.predict_region(prf_regions['399bp'])
```

## Citation
If you use FScanpy, please cite our paper: [Paper Link]