first commit

This commit is contained in:
Chenlab 2025-03-18 11:21:54 +08:00
commit 321c41687e
20 changed files with 3956 additions and 0 deletions

109
FScanpy/__init__.py Normal file
View File

@ -0,0 +1,109 @@
from .predictor import PRFPredictor
import pandas as pd
import numpy as np
from typing import Union, List, Dict
__version__ = '0.3.0'
__author__ = ''
__email__ = ''
__all__ = ['PRFPredictor', 'predict_prf', '__version__', '__author__', '__email__']
def predict_prf(
    sequence: Union[str, List[str], None] = None,
    data: Union[pd.DataFrame, None] = None,
    window_size: int = 3,
    gb_threshold: float = 0.1,
    model_dir: str = None
) -> pd.DataFrame:
    """
    Predict PRF (programmed ribosomal frameshifting) sites.

    Args:
        sequence: a single DNA sequence, or a list of sequences, scanned
            with a sliding window
        data: DataFrame input for region prediction; must contain a
            '399bp' column
        window_size: sliding-window step size (default 3)
        gb_threshold: GB model probability threshold (default 0.1)
        model_dir: path to the model directory (optional)

    Returns:
        pandas.DataFrame: prediction results

    Raises:
        ValueError: if neither or both of ``sequence`` and ``data`` are
            given, or if ``data`` is not a DataFrame with a '399bp' column

    Examples:
        # 1. Sliding-window prediction for a single sequence
        >>> from FScanpy import predict_prf
        >>> sequence = "ATGCGTACGT..."
        >>> results = predict_prf(sequence=sequence)

        # 2. Sliding-window prediction for several sequences
        >>> sequences = ["ATGCGTACGT...", "GCTATAGCAT..."]
        >>> results = predict_prf(sequence=sequences)

        # 3. Region prediction from a DataFrame
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        ...     '399bp': ['ATGCGT...', 'GCTATAG...']
        ... })
        >>> results = predict_prf(data=data)
    """
    # Validate the arguments BEFORE constructing the predictor:
    # PRFPredictor loads three model files, which is wasted work when the
    # call is malformed (the original built the predictor first).
    if sequence is None and data is None:
        raise ValueError("必须提供sequence或data参数之一")
    if sequence is not None and data is not None:
        raise ValueError("sequence和data参数不能同时提供")
    if data is not None:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("data参数必须是pandas DataFrame类型")
        if '399bp' not in data.columns:
            raise ValueError("DataFrame必须包含'399bp'")
    predictor = PRFPredictor(model_dir=model_dir)
    # Sliding-window prediction mode
    if sequence is not None:
        if isinstance(sequence, str):
            # Single sequence.
            return predictor.predict_full(
                sequence, window_size, gb_threshold)
        elif isinstance(sequence, (list, tuple)):
            # Several sequences; individual failures are reported but do
            # not abort the batch.
            results = []
            for i, seq in enumerate(sequence, 1):
                try:
                    result = predictor.predict_full(
                        seq, window_size, gb_threshold)
                    result['Sequence_ID'] = f'seq_{i}'
                    results.append(result)
                except Exception as e:
                    print(f"警告:序列 {i} 预测失败 - {str(e)}")
            return pd.concat(results, ignore_index=True) if results else pd.DataFrame()
    # Region prediction mode
    else:
        try:
            results = predictor.predict_region(
                data['399bp'], gb_threshold)
            # Carry over the remaining input columns.
            for col in data.columns:
                if col not in ['399bp', '33bp']:
                    results[col] = data[col].values
            return results
        except Exception as e:
            print(f"警告:区域预测失败 - {str(e)}")
            # Best-effort fallback: all-zero probabilities.
            results = pd.DataFrame({
                'GB_Probability': [0.0] * len(data),
                'CNN_Probability': [0.0] * len(data),
                'Voting_Probability': [0.0] * len(data)
            })
            # Carry over all original columns (including '399bp' here,
            # since the fallback frame has no sequence columns of its own).
            for col in data.columns:
                results[col] = data[col].values
            return results

9
FScanpy/data/__init__.py Normal file
View File

@ -0,0 +1,9 @@
import os
import pkg_resources
def get_test_data_path(filename: str) -> str:
    """Return the path of a bundled test-data file.

    Args:
        filename: name of a file under ``FScanpy/data/test_data``.

    Returns:
        Filesystem path to the requested file.
    """
    # Interpolate the requested file name — the original hard-coded a
    # literal placeholder and silently ignored ``filename``.
    return pkg_resources.resource_filename('FScanpy', f'data/test_data/{filename}')
def list_test_data() -> list:
    """Return the names of the files bundled under ``FScanpy/data/test_data``."""
    test_data_dir = pkg_resources.resource_filename('FScanpy', 'data/test_data')
    return os.listdir(test_data_dir)

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,8 @@
FS_period,399bp,fs_position,DNA_seqid,label,source,FS_type,dataset
gtgtgaacacaatagtgagtgacatactaaacg,ggatatgtaacatggacaagtcattgtgtaggtatccaagaccaatagcctttactttcaaagggaaagaagaattcaggaccttgtttggaggactcatctcgatgtcgattcaggtggtcattgtgctctatgcttatattatgctaaagataatgatagaacgtaatgacacatcaaaaagtgtgaacacaatagtgagtgacatactaaacgacaaatctccagtatctctcaatacaacagatttctcgtttgcatttgatgcttttattcttggcgatgataatttcgatttcaacaataaccaatacttcggaattgagctacttcaatggattaagcagccagatactggagaactatcatccactaatattccatatgaaagatgtggaa,16.0,MSTRG.18491.1,0,EUPLOTES,negative,EUPLOTES
gtctcagaagagtctgaggaatatctccaagga,caaattaataacaaatatgaattccatcaacaacttttatggagacgagaacttatcagatgaacttctgagtgaagatgtcgtgtcttgagaagtaagaggatcagaaaagatcttgcataacatggggagaaagtctctcagtaataagaagcctttaagcggagtggagttggactgagagtctcagaagagtctgaggaatatctccaaggataaaatttgttcgcaaggaagatctatctttaggcagaagaagtcaaaatcttgtgatcaagtagaagaacctcttagtagtcttaaagataacatgagtcactttaatgacatagacttgcaagctagtaagcctctaaaatcagagattagcaatctttttgggtactcaactcagcccaa,16.0,MSTRG.4662.1,0,EUPLOTES,negative,EUPLOTES
cttacttgcaaacatgaatctaataaattagag,ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc,16.0,MSTRG.14742.1,0,EUPLOTES,negative,EUPLOTES
atagtagaagatacagtctccatggccagtaca,ttgttgataaaaatacaggattttatctccaattaaagtcgagaaaataagccaagtagcagtaaacgtagataaacgaataagcatggaaaacaataatagaatagaccaaattgaagagatttctcacaattcgtatttgaattactgttagatttaggaataacaaatcgaactcgacacatagtagaagatacagtctccatggccagtacaaaagccaccatgaagaagaggaaaaagaacagataaatgctatcaaagaaaataaagttaacgatcaatatacaagcatgattgttattaatttaaccaactttagataatcagattataatcagcgacgatcaaagcaaaaaggtaaagaacaaccaattcaaattgaacatcagaaaggga,16.0,CUFF.17967.2,0,EUPLOTES,negative,EUPLOTES
cctcgtctttgtctccagaaaataagaaaacaa,catcaataaatagagtcaatgttagaagtatgtctaaattcaaaccaaacgaaattctaaataaagcaagaatgccaacataattaatttagattaagcttgatagttcattagtactcaacaataaaaatatttcaaagggtaatattccagaatcaaaattaagaaataaaattattcctacctcgtctttgtctccagaaaataagaaaacaaataaatcagttatgttcgaaaatgttaaagagatggaaagccaggacaagtcgcaaaatacactaacacatttgaaagaaagcaataatggtagtccttccaaattttaaaactgaaaataatcttgcagatgtagttcgatctagagataataaagcttataacagtactctaaacttaaaa,16.0,CUFF.22392.1,0,EUPLOTES,negative,EUPLOTES
aaaaatgacaaagatctgaacattagttctttc,ttaattttgttctgatcacctaattgtaagcccaaaaacgatactcaaaagatgaggaaactttattggaacattaaaagtaatctcttgagtttatttatgctaactatttacatacgaagcttttacgaaacattccaattcttggctcttgctggcttatcagctacttggaacaacgacaaaaatgacaaagatctgaacattagttctttcatttttgccattgttttgttgtttttatgcacaggtttcttcttatggtcactctaccattactttggatcccgctctgacaatcctcgaaatctcaaaatctctcaggagtttacgaatggagcaaaggagaataatagcggtaaactatatccagtgcttggattgctgagaagaggtctc,16.0,MSTRG.9455.1,0,EUPLOTES,negative,EUPLOTES
agaagactgggagaactctcagatactatatct,agaagaagagaaggccaggagtagctcgaaagaggaggaatttaaggtttacccaaagaaccctatgactgactctaaagatgatcagtcggacactctccctccgaaatcttacagtgtaaagaaagccaatgtaggagaactaaacaagtacgattttgagatctcttattccaaataatgagaagactgggagaactctcagatactatatctgcaagtatgatgaatgaaggcgtaaatttaacaagacttggaactttattgatcacgctaggatacacacaggagagaagccttacaaatgtgagctgtgtggcaaagagtttgctcagaaggggaactacaacaaacacaggaatacccaccagcatagtgccaagaagacctcagtaatga,16.0,MSTRG.26803.1,0,EUPLOTES,negative,EUPLOTES
1 FS_period 399bp fs_position DNA_seqid label source FS_type dataset
2 gtgtgaacacaatagtgagtgacatactaaacg ggatatgtaacatggacaagtcattgtgtaggtatccaagaccaatagcctttactttcaaagggaaagaagaattcaggaccttgtttggaggactcatctcgatgtcgattcaggtggtcattgtgctctatgcttatattatgctaaagataatgatagaacgtaatgacacatcaaaaagtgtgaacacaatagtgagtgacatactaaacgacaaatctccagtatctctcaatacaacagatttctcgtttgcatttgatgcttttattcttggcgatgataatttcgatttcaacaataaccaatacttcggaattgagctacttcaatggattaagcagccagatactggagaactatcatccactaatattccatatgaaagatgtggaa 16.0 MSTRG.18491.1 0 EUPLOTES negative EUPLOTES
3 gtctcagaagagtctgaggaatatctccaagga caaattaataacaaatatgaattccatcaacaacttttatggagacgagaacttatcagatgaacttctgagtgaagatgtcgtgtcttgagaagtaagaggatcagaaaagatcttgcataacatggggagaaagtctctcagtaataagaagcctttaagcggagtggagttggactgagagtctcagaagagtctgaggaatatctccaaggataaaatttgttcgcaaggaagatctatctttaggcagaagaagtcaaaatcttgtgatcaagtagaagaacctcttagtagtcttaaagataacatgagtcactttaatgacatagacttgcaagctagtaagcctctaaaatcagagattagcaatctttttgggtactcaactcagcccaa 16.0 MSTRG.4662.1 0 EUPLOTES negative EUPLOTES
4 cttacttgcaaacatgaatctaataaattagag ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc 16.0 MSTRG.14742.1 0 EUPLOTES negative EUPLOTES
5 atagtagaagatacagtctccatggccagtaca ttgttgataaaaatacaggattttatctccaattaaagtcgagaaaataagccaagtagcagtaaacgtagataaacgaataagcatggaaaacaataatagaatagaccaaattgaagagatttctcacaattcgtatttgaattactgttagatttaggaataacaaatcgaactcgacacatagtagaagatacagtctccatggccagtacaaaagccaccatgaagaagaggaaaaagaacagataaatgctatcaaagaaaataaagttaacgatcaatatacaagcatgattgttattaatttaaccaactttagataatcagattataatcagcgacgatcaaagcaaaaaggtaaagaacaaccaattcaaattgaacatcagaaaggga 16.0 CUFF.17967.2 0 EUPLOTES negative EUPLOTES
6 cctcgtctttgtctccagaaaataagaaaacaa catcaataaatagagtcaatgttagaagtatgtctaaattcaaaccaaacgaaattctaaataaagcaagaatgccaacataattaatttagattaagcttgatagttcattagtactcaacaataaaaatatttcaaagggtaatattccagaatcaaaattaagaaataaaattattcctacctcgtctttgtctccagaaaataagaaaacaaataaatcagttatgttcgaaaatgttaaagagatggaaagccaggacaagtcgcaaaatacactaacacatttgaaagaaagcaataatggtagtccttccaaattttaaaactgaaaataatcttgcagatgtagttcgatctagagataataaagcttataacagtactctaaacttaaaa 16.0 CUFF.22392.1 0 EUPLOTES negative EUPLOTES
7 aaaaatgacaaagatctgaacattagttctttc ttaattttgttctgatcacctaattgtaagcccaaaaacgatactcaaaagatgaggaaactttattggaacattaaaagtaatctcttgagtttatttatgctaactatttacatacgaagcttttacgaaacattccaattcttggctcttgctggcttatcagctacttggaacaacgacaaaaatgacaaagatctgaacattagttctttcatttttgccattgttttgttgtttttatgcacaggtttcttcttatggtcactctaccattactttggatcccgctctgacaatcctcgaaatctcaaaatctctcaggagtttacgaatggagcaaaggagaataatagcggtaaactatatccagtgcttggattgctgagaagaggtctc 16.0 MSTRG.9455.1 0 EUPLOTES negative EUPLOTES
8 agaagactgggagaactctcagatactatatct agaagaagagaaggccaggagtagctcgaaagaggaggaatttaaggtttacccaaagaaccctatgactgactctaaagatgatcagtcggacactctccctccgaaatcttacagtgtaaagaaagccaatgtaggagaactaaacaagtacgattttgagatctcttattccaaataatgagaagactgggagaactctcagatactatatctgcaagtatgatgaatgaaggcgtaaatttaacaagacttggaactttattgatcacgctaggatacacacaggagagaagccttacaaatgtgagctgtgtggcaaagagtttgctcagaaggggaactacaacaaacacaggaatacccaccagcatagtgccaagaagacctcagtaatga 16.0 MSTRG.26803.1 0 EUPLOTES negative EUPLOTES

View File

@ -0,0 +1,4 @@
from .sequence import SequenceFeatureExtractor
from .cnn_input import CNNInputProcessor
__all__ = ['SequenceFeatureExtractor', 'CNNInputProcessor']

View File

@ -0,0 +1,80 @@
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from typing import List, Union
class CNNInputProcessor:
    """Prepare DNA sequences as numeric input arrays for the CNN model."""

    def __init__(self, max_length: int = 399):
        # Window length the network was trained on.
        self.max_length = max_length
        # Encoding used at training time; 4 doubles as the unknown base.
        self.base_to_num = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}

    def trim_sequence(self, seq, target_length):
        """
        Trim ``seq`` symmetrically so it is exactly ``target_length`` long,
        keeping the centre position fixed.

        Args:
            seq: original sequence
            target_length: desired length

        Returns:
            The trimmed sequence (unchanged if already short enough).
        """
        if len(seq) <= target_length:
            return seq
        excess = len(seq) - target_length
        trim_each_side = excess // 2
        # Slice to the exact target length: the original symmetric slice
        # left one extra base behind whenever the excess was odd, which
        # later broke the fixed-shape reshape in prepare_sequence.
        return seq[trim_each_side:trim_each_side + target_length]

    def prepare_sequence(self, sequence: str) -> np.ndarray:
        """
        Encode a single sequence for the CNN.

        Args:
            sequence: DNA sequence

        Returns:
            np.ndarray: array of shape (1, max_length, 1); all zeros if
            processing fails.
        """
        try:
            # Coerce to an upper-case DNA string (RNA 'U' -> 'T').
            if not isinstance(sequence, str):
                sequence = str(sequence)
            sequence = sequence.upper().replace('U', 'T')
            # Trim over-long sequences around their centre.
            if len(sequence) > self.max_length:
                sequence = self.trim_sequence(sequence, self.max_length)
            # Encode bases; any unknown character maps to 4 (same as 'N').
            seq_numeric = [self.base_to_num.get(base, 4) for base in sequence]
            # Right-pad short sequences with the unknown-base code.
            if len(seq_numeric) < self.max_length:
                seq_numeric.extend([4] * (self.max_length - len(seq_numeric)))
            # (samples, timesteps, features) as the network expects.
            return np.array(seq_numeric).reshape(1, self.max_length, 1)
        except Exception as e:
            print(f"CNN序列处理失败: {str(e)}")
            # Fall back to an all-zero input of the correct shape.
            return np.zeros((1, self.max_length, 1))

View File

@ -0,0 +1,283 @@
import numpy as np
import pandas as pd
import itertools
from typing import List, Dict, Union
class SequenceFeatureExtractor:
    """Turn a DNA window into the numeric feature vector used by the GB model."""

    def __init__(self, seq_length=33):
        """Initialise the extractor for windows of ``seq_length`` bases."""
        self.bases = ['A', 'T', 'G', 'C']
        self.valid_bases = set('ATGCN')
        self.seq_length = seq_length  # expected window length (bases)
        self.feature_names = self._get_feature_names()

    def _get_feature_names(self) -> List[str]:
        """
        Return the ordered list of all feature names.

        Returns:
            features: base frequencies (5, including N), 3-mer frequencies
            (125), positional codon indicators (64 per complete codon),
            GC content and Shannon-entropy complexity.
        """
        features = []
        # Base-frequency features (including N).
        bases = ['A', 'T', 'G', 'C', 'N']
        features.extend(bases)
        # 3-mer features: 5 ** 3 = 125 names.
        features.extend(''.join(p) for p in itertools.product(bases, repeat=3))
        # One indicator per codon per codon position.
        codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]
        n_codons = self.seq_length // 3  # complete codons in the window
        for i in range(n_codons):
            for codon in codons:
                features.append(f'codon_pos_{i}_{codon}')
        # GC-content feature.
        features.append('gc_content')
        # Sequence-complexity (Shannon entropy) feature.
        features.append('sequence_complexity')
        return features

    def trim_sequence(self, seq, target_length):
        """
        Trim ``seq`` symmetrically so it is exactly ``target_length`` long,
        keeping the centre position fixed.

        Args:
            seq: original sequence
            target_length: desired length

        Returns:
            The trimmed sequence (unchanged if already short enough).
        """
        if len(seq) <= target_length:
            return seq
        excess = len(seq) - target_length
        trim_each_side = excess // 2
        # Slice to the exact target length; the original symmetric slice
        # left one extra base behind whenever the excess was odd.
        return seq[trim_each_side:trim_each_side + target_length]

    def _preprocess_sequence(self, sequence):
        """
        Convert a DNA sequence into its full feature vector.

        Args:
            sequence: DNA sequence

        Returns:
            feature_vector: list of values ordered like ``self.feature_names``

        Raises:
            ValueError: if feature extraction fails
        """
        try:
            feature_names = self.feature_names
            if pd.isna(sequence) or not isinstance(sequence, str):
                sequence = str(sequence)
            # Normalise case and convert RNA to DNA.
            sequence = sequence.upper().replace('U', 'T')
            # Trim long windows; pad short ones with N.
            if len(sequence) > self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)
            else:
                sequence = sequence[:self.seq_length].ljust(self.seq_length, 'N')
            from collections import Counter
            # Base composition.
            features = {base: sequence.count(base) / self.seq_length
                        for base in ['A', 'T', 'G', 'C', 'N']}
            # 3-mer frequencies, counted in a single pass over the window
            # (the original re-scanned the window once per k-mer name).
            window_counts = Counter(sequence[i:i + 3]
                                    for i in range(self.seq_length - 2))
            denom = max(1, self.seq_length - 2)
            kmer_features = {}
            for kmer in (''.join(p) for p in
                         itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)):
                kmer_features[kmer] = window_counts.get(kmer, 0) / denom
            # Positional codon indicators (codons containing N never match).
            codon_features = {}
            codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]
            n_codons = self.seq_length // 3
            for i in range(n_codons):
                current_codon = sequence[i * 3:i * 3 + 3]
                for codon in codons:
                    codon_features[f'codon_pos_{i}_{codon}'] = 1 if current_codon == codon and 'N' not in current_codon else 0
            # GC content over the unambiguous bases only.
            valid_bases = [b for b in sequence if b != 'N']
            gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0
            # Shannon entropy of the base distribution, normalised to [0, 1].
            valid_counts = Counter(valid_bases)
            total_valid = sum(valid_counts.values())
            entropy = 0
            for cnt in valid_counts.values():
                p = cnt / total_valid
                entropy += -p * np.log2(p)
            entropy /= np.log2(4)
            # Merge and order according to feature_names.
            all_features = {**features, **kmer_features, **codon_features}
            all_features['gc_content'] = gc_content
            all_features['sequence_complexity'] = entropy
            return [all_features.get(f, 0.0) for f in feature_names]
        except Exception as e:
            raise ValueError(f"特征提取失败: {str(e)}")

    def extract_features_batch(self, sequences: List[Union[str, float]]) -> np.ndarray:
        """
        Extract features for a batch of sequences.

        Args:
            sequences: list of DNA sequences

        Returns:
            np.ndarray: feature matrix, one row per sequence

        Raises:
            ValueError: if extraction fails for the batch
        """
        try:
            return np.array([self.extract_features(seq) for seq in sequences])
        except Exception as e:
            raise ValueError(f"批量特征提取失败: {str(e)}")

    def predict_region_batch(self, data: pd.DataFrame, gb_threshold: float = 0.1) -> pd.DataFrame:
        """
        Predict every region in a DataFrame of windows.

        NOTE(review): this method calls ``self.predict_region``, which is
        not defined on this class (a method of that name exists on
        ``PRFPredictor``) — it looks copied from the predictor; confirm
        before relying on it.

        Args:
            data: DataFrame with '33bp' and '399bp' columns
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: one prediction row per successfully processed input row
        """
        results = []
        for idx, row in data.iterrows():
            try:
                # Coerce both windows to strings.
                seq_33bp = str(row['33bp'])
                seq = str(row['399bp'])
                # Normalise window lengths / convert to feature form.
                seq_33bp = self._preprocess_sequence(seq_33bp)
                seq = self._preprocess_sequence(seq)
                # Predict this region.
                result = self.predict_region(seq_33bp, seq, gb_threshold)
                # Carry over the remaining input columns.
                for col in data.columns:
                    if col not in ['33bp', '399bp']:
                        result[col] = row[col]
                results.append(result)
            except Exception as e:
                print(f"处理索引 {idx} 的序列时出错: {str(e)}")
                continue
        return pd.DataFrame(results)

    def extract_features(self, sequence: str) -> list:
        """
        Extract the feature vector for a single sequence.

        Args:
            sequence: DNA sequence

        Returns:
            list: feature values ordered like ``self.feature_names``;
            a zero vector of the expected length if the computation fails,
            or an empty list if even preprocessing fails (callers must
            handle that case).
        """
        try:
            if not isinstance(sequence, str):
                sequence = str(sequence)
            # Normalise case and convert RNA to DNA.
            sequence = sequence.upper().replace('U', 'T')
            # Trim long windows; pad short ones with N so the feature
            # vector always matches ``feature_names`` (the original left
            # short sequences untouched, yielding a shorter vector than
            # the model expects).
            if len(sequence) > self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)
            elif len(sequence) < self.seq_length:
                sequence = sequence.ljust(self.seq_length, 'N')
            features = []
            try:
                from collections import Counter
                # Base frequencies.
                for base in ['A', 'T', 'G', 'C', 'N']:
                    features.append(sequence.count(base) / len(sequence))
                # 3-mer frequencies, counted in a single pass.
                window_counts = Counter(sequence[i:i + 3]
                                        for i in range(len(sequence) - 2))
                denom = max(1, len(sequence) - 2)
                for kmer in (''.join(p) for p in
                             itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)):
                    features.append(window_counts.get(kmer, 0) / denom)
                # Positional codon indicators (codons containing N never match).
                codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]
                n_codons = len(sequence) // 3
                for i in range(n_codons):
                    current_codon = sequence[i * 3:i * 3 + 3]
                    for codon in codons:
                        features.append(1 if current_codon == codon and 'N' not in current_codon else 0)
                # GC content over the unambiguous bases only.
                valid_bases = [b for b in sequence if b != 'N']
                gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0
                features.append(gc_content)
                # Shannon entropy, normalised to [0, 1].
                valid_counts = Counter(valid_bases)
                total_valid = sum(valid_counts.values())
                entropy = 0
                if total_valid > 0:  # guard against an all-N window
                    for cnt in valid_counts.values():
                        if cnt > 0:
                            p = cnt / total_valid
                            entropy += -p * np.log2(p)
                entropy /= np.log2(4) if len(valid_counts) > 0 else 1
                features.append(entropy)
                return features
            except Exception as e:
                print(f"特征计算过程出错: {str(e)}")
                # Zero vector of the expected length on computation failure.
                expected_length = 5 + 125 + (len(sequence) // 3) * 64 + 2
                return [0.0] * expected_length
        except Exception as e:
            print(f"特征提取失败: {str(e)}")
            # Empty-list failure marker; callers must handle it.
            return []

434
FScanpy/predictor.py Normal file
View File

@ -0,0 +1,434 @@
import os
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from .features.sequence import SequenceFeatureExtractor
from .features.cnn_input import CNNInputProcessor
from .utils import extract_window_sequences
import matplotlib.pyplot as plt
import joblib
class PRFPredictor:
    """PRF site predictor combining a GB model, a CNN model and a voting model."""

    def __init__(self, model_dir=None):
        """
        Initialize the PRF predictor.

        Args:
            model_dir: path to the model directory (optional; defaults to
                the package's bundled ``pretrained`` directory)

        Raises:
            FileNotFoundError: if a model file is missing
            Exception: if model loading fails for any other reason
        """
        if model_dir is None:
            from pkg_resources import resource_filename
            model_dir = resource_filename('FScanpy', 'pretrained')
        try:
            # Load the three pretrained models.
            self.gb_model = self._load_pickle(os.path.join(model_dir, 'GradientBoosting_all.pkl'))
            self.cnn_model = self._load_pickle(os.path.join(model_dir, 'BiLSTM-CNN_all.pkl'))
            self.voting_model = self._load_pickle(os.path.join(model_dir, 'Voting_all.pkl'))
            # Window lengths must match the ones used at training time.
            self.gb_seq_length = 33  # window length consumed by the gradient-boosting model
            self.cnn_seq_length = 399  # window length consumed by the BiLSTM-CNN model
            # Feature extractor and CNN input processor share those lengths.
            self.feature_extractor = SequenceFeatureExtractor(seq_length=self.gb_seq_length)
            self.cnn_processor = CNNInputProcessor(max_length=self.cnn_seq_length)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"无法找到模型文件: {str(e)}")
        except Exception as e:
            raise Exception(f"加载模型出错: {str(e)}")

    def _load_pickle(self, path):
        # joblib handles both plain pickles and joblib dumps.
        return joblib.load(path)

    def predict_single_position(self, fs_period, full_seq, gb_threshold=0.1):
        '''
        Predict the PRF state at a single position.

        Args:
            fs_period: 33bp window (trimmed to ``gb_seq_length`` if longer)
            full_seq: full window (processed to ``cnn_seq_length``)
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            dict: probabilities from the GB, CNN and voting models
        '''
        try:
            # Trim the short window to the GB model's training length.
            if len(fs_period) > self.gb_seq_length:
                fs_period = self.feature_extractor.trim_sequence(fs_period, self.gb_seq_length)
            # GB model prediction - the estimator expects a 2-D array.
            try:
                gb_features = self.feature_extractor.extract_features(fs_period)
                # Make sure the feature vector is one-dimensional.
                if isinstance(gb_features, np.ndarray):
                    # Flatten any accidental multi-dimensional output.
                    if gb_features.ndim > 1:
                        print(f"警告: 特征是{gb_features.ndim}维数组,进行扁平化处理")
                        gb_features = gb_features.flatten()
                # Promote to shape (1, n_features) for predict_proba.
                gb_features_2d = np.array([gb_features])
                # Defensive re-check of the dimensionality.
                if gb_features_2d.ndim != 2:
                    raise ValueError(f"处理后特征仍为{gb_features_2d.ndim}维,需要二维数组")
                gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
            except Exception as e:
                print(f"GB模型预测时出错: {str(e)}")
                # Fall back to probability 0 on failure.
                gb_prob = 0.0
            # Skip the (expensive) CNN model when GB is below the threshold.
            if gb_prob < gb_threshold:
                return {
                    'GB_Probability': gb_prob,
                    'CNN_Probability': 0.0,
                    'Voting_Probability': 0.0
                }
            # CNN model prediction
            try:
                # Sniff the model type: sklearn estimators expose
                # predict_proba, Keras models generally do not.
                is_sklearn_model = False
                if hasattr(self.cnn_model, 'predict_proba'):
                    # Probably a scikit-learn model.
                    is_sklearn_model = True
                if is_sklearn_model:
                    # sklearn model (e.g. HistGradientBoostingClassifier):
                    # reuse the GB feature extraction, but on the 399bp window.
                    cnn_features = self.feature_extractor.extract_features(full_seq)
                    if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                        cnn_features = cnn_features.flatten()
                    # Promote to a 2-D array.
                    cnn_features_2d = np.array([cnn_features])
                    cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                    cnn_prob = cnn_pred[0][1]
                else:
                    # Assume a deep-learning model expecting 3-D input.
                    cnn_input = self.cnn_processor.prepare_sequence(full_seq)
                    # Try the prediction call signatures in turn.
                    try:
                        # Plain call first.
                        cnn_pred = self.cnn_model.predict(cnn_input)
                    except TypeError:
                        try:
                            # Then with the Keras verbose flag.
                            cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                        except Exception:
                            # Finally retry with the input flattened to 2-D.
                            reshaped_input = cnn_input.reshape(1, -1)
                            cnn_pred = self.cnn_model.predict(reshaped_input)
                # Normalise the prediction container.
                if isinstance(cnn_pred, list):
                    cnn_pred = cnn_pred[0]
                # Pull out the positive-class probability.
                if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                    cnn_prob = cnn_pred[0][1]
                else:
                    cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
            except Exception as e:
                print(f"CNN模型预测时出错: {str(e)}")
                # Fall back to probability 0 on failure.
                cnn_prob = 0.0
            # Voting model prediction
            try:
                # The voting model takes the two probabilities as shape (1, 2).
                voting_input = np.array([[gb_prob, cnn_prob]])
                voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
            except Exception as e:
                print(f"投票模型预测时出错: {str(e)}")
                # Fall back to the mean of the two model probabilities.
                voting_prob = (gb_prob + cnn_prob) / 2
            return {
                'GB_Probability': gb_prob,
                'CNN_Probability': cnn_prob,
                'Voting_Probability': voting_prob
            }
        except Exception as e:
            raise Exception(f"预测过程出错: {str(e)}")

    def predict_full(self, sequence, window_size=3, gb_threshold=0.1, plot=False):
        """
        Predict PRF sites across a complete sequence.

        Args:
            sequence: input DNA sequence
            window_size: sliding-window step (default 3)
            gb_threshold: GB model probability threshold (default 0.1)
            plot: also return a matplotlib figure (default False)

        Returns:
            if plot=False:
                pd.DataFrame with the prediction results
            if plot=True:
                tuple: (pd.DataFrame, matplotlib.figure.Figure)

        Raises:
            ValueError: for invalid window_size / gb_threshold
            Exception: if the prediction loop fails
        """
        if window_size < 1:
            raise ValueError("窗口大小必须大于等于1")
        if gb_threshold < 0:
            raise ValueError("GB阈值必须大于等于0")
        results = []
        try:
            # Normalise to an upper-case string.
            sequence = str(sequence).upper()
            # Sliding-window scan.
            for pos in range(0, len(sequence) - 2, window_size):
                # Extract the two windows (same sizes as at training time).
                fs_period, full_seq = extract_window_sequences(sequence, pos)
                if fs_period is None or full_seq is None:
                    continue
                # Predict this position and record the result.
                pred = self.predict_single_position(fs_period, full_seq, gb_threshold)
                pred.update({
                    'Position': pos,
                    'Codon': sequence[pos:pos+3],
                    '33bp': fs_period,
                    '399bp': full_seq
                })
                results.append(pred)
            # Collect into a DataFrame.
            results_df = pd.DataFrame(results)
            # Optional plotting.
            if plot:
                # Two panels: probability traces on top, a heatmap below.
                fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), height_ratios=[2, 1])
                # Probability traces for the three models.
                ax1.plot(results_df['Position'], results_df['GB_Probability'],
                        label='GB模型', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['CNN_Probability'],
                        label='CNN模型', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['Voting_Probability'],
                        label='投票模型', linewidth=2, color='red')
                ax1.set_xlabel('序列位置')
                ax1.set_ylabel('移码概率')
                ax1.set_title('移码预测概率')
                ax1.legend()
                ax1.grid(True, alpha=0.3)
                # Heatmap data: voting probability per position.
                positions = results_df['Position'].values
                probabilities = results_df['Voting_Probability'].values
                # Single-row heatmap matrix.
                heatmap_matrix = np.zeros((1, len(positions)))
                heatmap_matrix[0, :] = probabilities
                # Draw the heatmap.
                im = ax2.imshow(heatmap_matrix, aspect='auto', cmap='YlOrRd',
                               extent=[min(positions), max(positions), 0, 1])
                # Colour bar.
                cbar = plt.colorbar(im, ax=ax2)
                cbar.set_label('移码概率')
                # Heatmap axis labels.
                ax2.set_xlabel('序列位置')
                ax2.set_title('移码概率热图')
                ax2.set_yticks([])
                # Final layout pass.
                plt.tight_layout()
                return results_df, fig
            return results_df
        except Exception as e:
            raise Exception(f"序列预测过程出错: {str(e)}")

    def predict_region(self, seq, gb_threshold=0.1):
        '''
        Predict pre-extracted region sequences.

        Args:
            seq: a 399bp sequence, or a DataFrame/Series of 399bp sequences
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: prediction probabilities for every input sequence
        '''
        try:
            # Normalise DataFrame/Series input to a plain list.
            if isinstance(seq, (pd.DataFrame, pd.Series)):
                seq = seq.tolist()
            # Normalise a single string to a one-element list.
            if isinstance(seq, str):
                seq = [seq]
            results = []
            for i, seq399 in enumerate(seq):
                try:
                    # Take the central 33bp for the GB model.
                    seq33 = self._extract_center_sequence(seq399, target_length=self.gb_seq_length)
                    # GB model prediction - the estimator expects a 2-D array.
                    try:
                        gb_features = self.feature_extractor.extract_features(seq33)
                        # Make sure the feature vector is one-dimensional.
                        if isinstance(gb_features, np.ndarray):
                            # Flatten any accidental multi-dimensional output.
                            if gb_features.ndim > 1:
                                print(f"警告: 序列 {i+1} 的特征是{gb_features.ndim}维数组,进行扁平化处理")
                                gb_features = gb_features.flatten()
                        # Promote to shape (1, n_features) for predict_proba.
                        gb_features_2d = np.array([gb_features])
                        # Defensive re-check of the dimensionality.
                        if gb_features_2d.ndim != 2:
                            raise ValueError(f"处理后特征仍为{gb_features_2d.ndim}维,需要二维数组")
                        gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
                    except Exception as e:
                        print(f"GB模型预测序列 {i+1} 时出错: {str(e)}")
                        # Fall back to probability 0 on failure.
                        gb_prob = 0.0
                    # Below the GB threshold: record a low-probability row
                    # and skip the CNN/voting models.
                    if gb_prob < gb_threshold:
                        results.append({
                            'GB_Probability': gb_prob,
                            'CNN_Probability': 0.0,
                            'Voting_Probability': 0.0,
                            '33bp': seq33,
                            '399bp': seq399
                        })
                        continue
                    # CNN model prediction
                    try:
                        # Sniff the model type: sklearn estimators expose
                        # predict_proba, Keras models generally do not.
                        is_sklearn_model = False
                        if hasattr(self.cnn_model, 'predict_proba'):
                            # Probably a scikit-learn model.
                            is_sklearn_model = True
                        if is_sklearn_model:
                            # sklearn model (e.g. HistGradientBoostingClassifier):
                            # reuse the GB feature extraction on the 399bp window.
                            cnn_features = self.feature_extractor.extract_features(seq399)
                            if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                                cnn_features = cnn_features.flatten()
                            # Promote to a 2-D array.
                            cnn_features_2d = np.array([cnn_features])
                            cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                            cnn_prob = cnn_pred[0][1]
                        else:
                            # Assume a deep-learning model expecting 3-D input.
                            cnn_input = self.cnn_processor.prepare_sequence(seq399)
                            # Try the prediction call signatures in turn.
                            try:
                                # Plain call first.
                                cnn_pred = self.cnn_model.predict(cnn_input)
                            except TypeError:
                                try:
                                    # Then with the Keras verbose flag.
                                    cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                                except Exception:
                                    # Finally retry with the input flattened to 2-D.
                                    reshaped_input = cnn_input.reshape(1, -1)
                                    cnn_pred = self.cnn_model.predict(reshaped_input)
                        # Normalise the prediction container.
                        if isinstance(cnn_pred, list):
                            cnn_pred = cnn_pred[0]
                        # Pull out the positive-class probability.
                        if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                            cnn_prob = cnn_pred[0][1]
                        else:
                            cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
                    except Exception as e:
                        print(f"CNN模型预测序列 {i+1} 时出错: {str(e)}")
                        # Fall back to probability 0 on failure.
                        cnn_prob = 0.0
                    # Voting model prediction
                    try:
                        # The voting model takes the two probabilities as shape (1, 2).
                        voting_input = np.array([[gb_prob, cnn_prob]])
                        voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
                    except Exception as e:
                        print(f"投票模型预测序列 {i+1} 时出错: {str(e)}")
                        # Fall back to the mean of the two model probabilities.
                        voting_prob = (gb_prob + cnn_prob) / 2
                    results.append({
                        'GB_Probability': gb_prob,
                        'CNN_Probability': cnn_prob,
                        'Voting_Probability': voting_prob,
                        '33bp': seq33,
                        '399bp': seq399
                    })
                except Exception as e:
                    print(f"处理第 {i+1} 个序列时出错: {str(e)}")
                    results.append({
                        'GB_Probability': 0.0,
                        'CNN_Probability': 0.0,
                        'Voting_Probability': 0.0,
                        '33bp': self._extract_center_sequence(seq399, target_length=self.gb_seq_length) if len(seq399) >= self.gb_seq_length else seq399,
                        '399bp': seq399
                    })
            return pd.DataFrame(results)
        except Exception as e:
            raise Exception(f"区域预测过程出错: {str(e)}")

    def _extract_center_sequence(self, sequence, target_length=33):
        """Extract a centred subsequence of ``target_length`` bases."""
        # Normalise to an upper-case string.
        sequence = str(sequence).upper()
        # Shorter sequences are returned unchanged.
        if len(sequence) <= target_length:
            return sequence
        # Locate the centre of the sequence.
        center = len(sequence) // 2
        half_target = target_length // 2
        # Slice the central window.
        start = center - half_target
        end = start + target_length
        # Clamp to the sequence boundaries.
        if start < 0:
            start = 0
            end = target_length
        elif end > len(sequence):
            end = len(sequence)
            start = end - target_length
        return sequence[start:end]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

203
FScanpy/utils.py Normal file
View File

@ -0,0 +1,203 @@
import numpy as np
import pandas as pd
from typing import Tuple, Optional
from Bio import SeqIO
from Bio.Seq import Seq
def fscanr(blastx_output: pd.DataFrame,
           mismatch_cutoff: float = 10,
           evalue_cutoff: float = 1e-5,
           frameDist_cutoff: float = 10) -> pd.DataFrame:
    """
    Identify PRF sites from BLASTX output.

    Two consecutive hits of the same query against the same subject, in
    different same-sign reading frames and within the distance cutoffs,
    are interpreted as a candidate frameshift event.

    Args:
        blastx_output: BLASTX output DataFrame (14 tabular columns)
        mismatch_cutoff: mismatch threshold
        evalue_cutoff: E-value threshold
        frameDist_cutoff: frame distance threshold

    Returns:
        pd.DataFrame: DataFrame containing PRF site information
        (empty if no PRF event is detected)
    """
    blastx = blastx_output.copy()
    # Standard BLASTX tabular columns plus query/subject frames.
    blastx.columns = ["qseqid", "sseqid", "pident", "length", "mismatch",
                      "gapopen", "qstart", "qend", "sstart", "send",
                      "evalue", "bitscore", "qframe", "sframe"]
    # Keep confident hits only.
    blastx = blastx[
        (blastx['evalue'] <= evalue_cutoff) &
        (blastx['mismatch'] <= mismatch_cutoff)
    ].dropna()
    # A frameshift needs at least two hits for the same query.
    freq = blastx['qseqid'].value_counts()
    multi_hits = freq[freq > 1].index
    blastx = blastx[blastx['qseqid'].isin(multi_hits)]
    # Order hits so consecutive rows are adjacent alignments of one query.
    blastx = blastx.sort_values(['qseqid', 'sseqid', 'qstart'])
    prf_list = []
    for i in range(1, len(blastx)):
        curr = blastx.iloc[i]
        prev = blastx.iloc[i-1]
        # Same query and subject, different frames of the same sign.
        if (curr['qseqid'] == prev['qseqid'] and
            curr['sseqid'] == prev['sseqid'] and
            curr['qframe'] != prev['qframe'] and
            curr['qframe'] * prev['qframe'] > 0):
            if curr['qframe'] > 0 and prev['qframe'] > 0:
                # Plus strand: the shift lies between prev's end and curr's start.
                frame_start = prev['qend']
                frame_end = curr['qstart']
                pep_start = prev['send']
                pep_end = curr['sstart']
                strand = "+"
            elif curr['qframe'] < 0 and prev['qframe'] < 0:
                # Minus strand: query coordinates are mirrored.
                frame_start = prev['qstart']
                frame_end = curr['qend']
                pep_start = curr['send']
                pep_end = prev['sstart']
                strand = "-"
            else:
                continue
            # Gap on the query (nt) and subject (aa); their combination
            # modulo a codon gives the frameshift type.
            q_dist = frame_end - frame_start - 1
            s_dist = pep_end - pep_start
            fs_type = q_dist + (1 - s_dist) * 3
            if (abs(q_dist) <= frameDist_cutoff and
                abs(s_dist) <= frameDist_cutoff // 3 and
                -3 < fs_type < 3):
                prf_list.append({
                    'DNA_seqid': curr['qseqid'],
                    'FS_start': frame_start,
                    'FS_end': frame_end,
                    'Pep_seqid': curr['sseqid'],
                    'Pep_FS_start': prev['send'] + 1,
                    'Pep_FS_end': curr['sstart'],
                    'FS_type': fs_type,
                    'Strand': strand
                })
    if not prf_list:
        print("No PRF events detected!")
        return pd.DataFrame()
    prf = pd.DataFrame(prf_list)
    # Drop duplicated loci.
    # NOTE(review): for 'Pep_seqid' this pairs the id with the DNA
    # coordinates 'FS_start'/'FS_end' rather than 'Pep_FS_start'/
    # 'Pep_FS_end' — confirm this is the intended dedup key.
    for col in ['DNA_seqid', 'Pep_seqid']:
        for pos in ['FS_start', 'FS_end']:
            loci = prf[col] + '_' + prf[pos].astype(str)
            prf = prf[~loci.duplicated()]
    return prf
def extract_prf_regions(mrna_file: str, prf_data: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the sequence window around each PRF site from an mRNA file.

    Args:
        mrna_file: path to the mRNA sequences (FASTA format)
        prf_data: PRF site table produced by ``fscanr``

    Returns:
        pd.DataFrame: one row per site with its 399bp window
    """
    seq_lookup = {record.id: str(record.seq)
                  for record in SeqIO.parse(mrna_file, "fasta")}
    records = []
    for _, site in prf_data.iterrows():
        seq_id = site['DNA_seqid']
        if seq_id not in seq_lookup:
            print(f"警告: {seq_id} 未在mRNA文件中找到")
            continue
        nucleotides = seq_lookup[seq_id]
        strand = site['Strand']
        fs_start = int(site['FS_start'])
        try:
            # Minus-strand hits are analysed on the reverse complement.
            if strand == '-':
                nucleotides = str(Seq(nucleotides).reverse_complement())
            # Only the 399bp window is kept; the 33bp window is re-derived
            # inside the predictor.
            _, full_window = extract_window_sequences(nucleotides, fs_start)
            records.append({
                'DNA_seqid': seq_id,
                'FS_start': fs_start,
                'FS_end': int(site['FS_end']),
                'Strand': strand,
                '399bp': full_window,
                'FS_type': site['FS_type']
            })
        except Exception as e:
            print(f"处理 {seq_id} 时出错: {str(e)}")
            continue
    return pd.DataFrame(records)
def extract_window_sequences(seq: str, position: int,
                             small_window: int = 33,
                             large_window: int = 399) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract the two analysis windows centred on a codon-aligned position.

    Args:
        seq: input DNA sequence
        position: current analysis position (FS_start)
        small_window: length of the small window (GB model), default 33
        large_window: length of the large window (CNN model), default 399

    Returns:
        Tuple[str, str]: (small-window sequence, large-window sequence),
        N-padded to exactly the requested lengths to match the trained models.
    """
    # Snap the position down to the nearest codon boundary (multiple of 3).
    frame_position = position - (position % 3)

    def _bounds(width: int) -> Tuple[int, int]:
        # Half-open [start, end) window of `width` centred on frame_position;
        # the `width % 2` term keeps odd widths exact.
        half = width // 2
        return frame_position - half, frame_position + half + (width % 2)

    start_small, end_small = _bounds(small_window)
    start_large, end_large = _bounds(large_window)

    seq_small = _extract_and_pad(seq, start_small, end_small, small_window)
    seq_large = _extract_and_pad(seq, start_large, end_large, large_window)
    return seq_small, seq_large
def _extract_and_pad(seq: str, start: int, end: int, target_length: int) -> str:
"""提取序列并用N填充"""
if start < 0:
prefix = 'N' * abs(start)
extracted = prefix + seq[:end]
elif end > len(seq):
suffix = 'N' * (end - len(seq))
extracted = seq[start:] + suffix
else:
extracted = seq[start:end]
# 确保序列长度正确
if len(extracted) < target_length:
# 从中心填充
pad_left = (target_length - len(extracted)) // 2
pad_right = target_length - len(extracted) - pad_left
extracted = 'N' * pad_left + extracted + 'N' * pad_right
elif len(extracted) > target_length:
# 从序列两端等量截取
excess = len(extracted) - target_length
trim_each_side = excess // 2
extracted = extracted[trim_each_side:len(extracted)-trim_each_side]
return extracted
def prepare_cnn_input(sequence: str) -> np.ndarray:
    """Encode a DNA sequence as a numeric array shaped (1, len, 1) for the CNN model."""
    encoding = {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'N': 0}
    # Unknown characters (anything outside ATGCN) map to 0, same as 'N'.
    encoded = np.array([encoding.get(base, 0) for base in sequence.upper()])
    return encoded.reshape(1, len(sequence), 1)

34
README.md Normal file
View File

@ -0,0 +1,34 @@
# FScanpy
## A Machine Learning-Based Framework for Programmed Ribosomal Frameshifting Prediction
FScanpy is a comprehensive Python package designed for the prediction of [Programmed Ribosomal Frameshifting (PRF)](https://en.wikipedia.org/wiki/Ribosomal_frameshift) sites in nucleotide sequences. By integrating advanced machine learning approaches (Gradient Boosting and BiLSTM-CNN) with the established [FScanR](https://github.com/seanchen607/FScanR.git) framework, FScanpy provides robust and accurate PRF site predictions. The package requires input sequences to be in the positive (5' to 3') orientation.
![FScanpy Architecture](/tutorial/image/structure.jpeg)
For detailed documentation and usage examples, please refer to our [tutorial](tutorial/tutorial.md).
## Installation Requirements
- Python ≥ 3.7
- Dependencies are automatically handled during installation
### Option 1: Install via pip
```bash
pip install FScanpy
```
### Option 2: Install from source
```bash
git clone https://github.com/seanchen607/FScanpy-package.git
cd FScanpy-package
pip install -e .
```
## Authors
## Citation
If you utilize FScanpy in your research, please cite our work:
```bibtex
[Citation details will be added upon publication]
```

24
pyproject.toml Normal file
View File

@ -0,0 +1,24 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "FScanpy"
# NOTE(review): version here (1.0.0) disagrees with FScanpy.__version__
# ('0.3.0' in __init__.py) — confirm which is authoritative.
version = "1.0.0"
description = "PRF prediction tool"
authors = [{name = "FScanpy Developer", email = "example@example.com"}]
dependencies = [
    "numpy",
    "pandas",
    "tensorflow",
    "scikit-learn",
    "wrapt>=1.10.11",
    # biopython (Bio.SeqIO / Bio.Seq) is imported by FScanpy.utils and is
    # already listed in setup.py's install_requires; it was missing here.
    "biopython"
]
requires-python = ">=3.7"

[tool.setuptools]
packages = ["FScanpy", "FScanpy.features"]
include-package-data = true

[tool.setuptools.package-data]
"FScanpy.data" = ["test_data/*"]

20
setup.py Normal file
View File

@ -0,0 +1,20 @@
from setuptools import setup, find_packages

# Build/packaging configuration for FScanpy (PRF prediction tool).
# NOTE(review): duplicates metadata in pyproject.toml — keep the two in sync.
setup(
    name="FScanpy",
    version="1.0.0",
    description="PRF prediction tool",
    author="FScanpy Developer",
    author_email="example@example.com",
    packages=find_packages(),
    # Runtime dependencies; biopython is needed for Bio.SeqIO / Bio.Seq.
    install_requires=[
        "numpy",
        "pandas",
        "tensorflow",
        "scikit-learn",
        "wrapt>=1.10.11",
        "biopython"
    ],
    # Ship non-code package data (e.g. bundled test data / model files).
    include_package_data=True,
    python_requires=">=3.7",
)

BIN
tutorial/image/ML.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 170 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 148 KiB

164
tutorial/tutorial.md Normal file
View File

@ -0,0 +1,164 @@
## Abstract
FScanpy is a Python package designed to predict Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates advanced machine learning models, including Gradient Boosting and BiLSTM-CNN, to provide accurate predictions. This tool is essential for understanding gene expression regulation in various organisms, including eukaryotes and viruses, and offers a robust solution for PRF prediction challenges.
## Introduction
![FScanpy structure](/tutorial/image/structure.jpeg)
FScanpy is a Python package dedicated to predicting Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates machine learning models (Gradient Boosting and BiLSTM-CNN) along with the FScanR package to furnish precise PRF predictions. Users are capable of employing three types of data as input: the entire cDNA/mRNA sequence that requires prediction, the nucleotide sequence in the vicinity of the suspected frameshift site, and the peptide library blastx results of the species or related species. It anticipates the input sequence to be in the + strand and can be integrated with FScanR to augment the accuracy.
![Machine learning models](/tutorial/image/ML.png)
For the prediction of the entire sequence, FScanpy adopts a sliding window approach to scan the entire sequence and predict the PRF sites. For regional prediction, it is based on the 33-bp and 399-bp sequences in the 0 reading frame around the suspected frameshift site. Initially, the Gradient Boosting model will predict the potential PRF sites within the scanning window. If the predicted probability exceeds the threshold, the BiLSTM-CNN model will predict the PRF sites in the 399-bp sequence. Then, the VotingClassifier will combine the two models to make the final prediction.
For PRF detection from BLASTX output, FScanpy identifies potential PRF sites from BLASTX alignment results, acquires the two hits of the same query sequence, and then utilizes frameDist_cutoff, mismatch_cutoff, and evalue_cutoff to filter the hits. Finally, it employs [FScanR](https://github.com/seanchen607/FScanR.git) to identify the PRF sites.
### Background
[Ribosomal frameshifting](https://en.wikipedia.org/wiki/Ribosomal_frameshift), also known as translational frameshifting or translational recoding, is a biological phenomenon that occurs during translation that results in the production of multiple, unique proteins from a single mRNA. The process can be programmed by the nucleotide sequence of the mRNA and is sometimes affected by the secondary, 3-dimensional mRNA structure. It has been described mainly in viruses (especially retroviruses), retrotransposons and bacterial insertion elements, and also in some cellular genes.
### Key features of FScanpy include:
- Integration of two predictive models:
- [Gradient Boosting](https://tensorflow.google.cn/tutorials/estimator/boosted_trees?hl=en): Analyzes local sequence features centered around potential frameshift sites (10 codons).
- [BiLSTM-CNN](https://paperswithcode.com/method/cnn-bilstm): Analyzes broader sequence features (100 codons).
- Supports PRF prediction across various species.
- Can be combined with [FScanR](https://github.com/seanchen607/FScanR.git) for enhanced accuracy.
## Installation (python>=3.7)
### 1. Use pip
```bash
pip install FScanpy
```
### 2. Clone from [GitHub](https://github.com/.../FScanpy.git)
```bash
git clone https://github.com/.../FScanpy.git
cd your_project_directory
pip install -e .
```
## Methods and Usage
### 1. Load model and test data
Test data can be found in `FScanpy/data/test_data`. You can use the `list_test_data()` method to list all the test data and the `get_test_data_path()` method to get the path of a test data file:
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor() # load model
list_test_data() # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')
```
### 2. Predict PRF Sites in a Full Sequence
Use the `predict_full()` method to scan the entire sequence. You can use the `window_size` parameter to adjust the scanning window size (default is 3) and the `gb_threshold` parameter to adjust the Gradient Boosting model fitting threshold (default is 0.1) for faster or more accurate prediction:
```python
'''
Args:
sequence: mRNA sequence
window_size: scanning window size (default is 3)
gb_threshold: Gradient Boosting model threshold (default is 0.1)
Returns:
results: DataFrame containing prediction probabilities
'''
results = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
window_size=3, # Scanning window size
gb_threshold=0.1, # Gradient Boosting model threshold
plot=True) # Whether to plot the prediction results
fig.savefig('predict_full.png')
```
### 3. Predict PRF in Specific Regions
Use the `predict_region()` method to predict PRF in known regions of interest:
```python
'''
Args:
seq: 399bp sequence
gb_threshold: GB model probability threshold (default is 0.1)
Returns:
DataFrame: DataFrame containing the prediction probabilities for all sequences
'''
import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])
```
### 4. Identify PRF Sites from BLASTX Output
BLASTX Output should contain the following columns: `qseqid`, `sseqid`, `pident`, `length`, `mismatch`, `gapopen`, `qstart`, `qend`, `sstart`, `send`, `evalue`, `bitscore`, `qframe`, `sframe`.
FScanR result contains `DNA_seqid`, `FS_start`, `FS_end`, `FS_type`,`Pep_seqid`, `Pep_FS_start`, `Pep_FS_end`, `Strand` columns.
Use the FScanR function to identify potential PRF sites from BLASTX alignment results:
```python
"""
identify PRF sites from BLASTX output
Args:
blastx_output: BLASTX output DataFrame
mismatch_cutoff: mismatch threshold
evalue_cutoff: E-value threshold
frameDist_cutoff: frame distance threshold
Returns:
pd.DataFrame: DataFrame containing PRF site information
"""
from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
mismatch_cutoff=10, # Allowed mismatches
evalue_cutoff=1e-5, # E-value threshold
frameDist_cutoff=10) # Frame distance threshold
```
### 5. Extract PRF Sites from BLASTX Output or your Sequence Data and evaluate it by FScanpy
Use the `extract_prf_regions()` method to extract PRF site sequences from mRNA sequences. It uses the `FS_start` column of the FScanR output together with the `DNA_seqid` column of the input mRNA sequence file to extract the 33bp and 399bp sequences around the PRF sites in the 0 reading frame:
```python
"""
extract PRF site sequences from mRNA sequences
Args:
mrna_file: mRNA sequence file path (FASTA format)
prf_data: FScanR output PRF site data or your suspected PRF site data which at least contains `DNA_seqid` `FS_start` `strand` columns
Returns:
pd.DataFrame: DataFrame containing 33bp and 399bp sequences
"""
from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
prf_data=fscanr_result)
prf_results = predictor.predict_region (prf_regions['399bp'])
```
## Total Test
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor() # load model
list_test_data() # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')
results = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
window_size=3, # Scanning window size
gb_threshold=0.1, # Gradient Boosting model threshold
plot=True)
import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])
from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
mismatch_cutoff=10, # Allowed mismatches
evalue_cutoff=1e-5, # E-value threshold
frameDist_cutoff=10)
from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
prf_data=fscanr_result)
prf_results = predictor.predict_region (prf_regions['399bp'])
## Citation
If you use FScanpy, please cite our paper: [Paper Link]