FScanpy-package/FScanpy/__init__.py

109 lines
3.8 KiB
Python
Raw Normal View History

2025-03-18 11:21:54 +08:00
from .predictor import PRFPredictor
import pandas as pd
import numpy as np
from typing import Union, List, Dict
# Package metadata. Author and e-mail are intentionally left blank here.
__version__ = "0.3.0"
__author__ = ""
__email__ = ""

# Public names exposed by a star-import of this package.
__all__ = [
    "PRFPredictor",
    "predict_prf",
    "__version__",
    "__author__",
    "__email__",
]
def predict_prf(
    sequence: Union[str, List[str], None] = None,
    data: Union[pd.DataFrame, None] = None,
    window_size: int = 3,
    gb_threshold: float = 0.1,
    model_dir: Union[str, None] = None
) -> pd.DataFrame:
    """
    Predict PRF sites, either by sliding window or by region.

    Exactly one of ``sequence`` and ``data`` must be supplied. All argument
    validation happens before the predictor is constructed, so an invalid
    call fails immediately with ``ValueError``.

    Args:
        sequence: A single sequence string, or a list/tuple of sequence
            strings, passed one by one to ``PRFPredictor.predict_full``
            (sliding-window mode).
        data: A DataFrame that must contain a ``'399bp'`` column; that
            column is passed to ``PRFPredictor.predict_region`` (region
            mode).
        window_size: Sliding-window size forwarded to ``predict_full``.
            Defaults to 3. Ignored in region mode.
        gb_threshold: GB model probability threshold forwarded to the
            predictor. Defaults to 0.1.
        model_dir: Optional model directory forwarded to ``PRFPredictor``.

    Returns:
        pandas.DataFrame: The prediction results.

        * Single sequence: whatever ``predict_full`` returns.
        * Several sequences: the per-sequence results concatenated, each
          tagged with a ``Sequence_ID`` column (``seq_1``, ``seq_2``, ...).
          Sequences whose prediction raises are reported with a printed
          warning and skipped; if none succeed an empty DataFrame is
          returned.
        * Region mode: the result of ``predict_region`` with every input
          column except ``'399bp'`` and ``'33bp'`` copied onto it. If that
          fails, a warning is printed and a frame of zero probabilities
          (``GB_Probability``, ``CNN_Probability``, ``Voting_Probability``)
          plus all input columns is returned instead.

    Raises:
        ValueError: If neither or both of ``sequence`` and ``data`` are
            given, if ``sequence`` is not a str/list/tuple, if ``data`` is
            not a DataFrame, or if ``data`` lacks the ``'399bp'`` column.

    Examples:
        # 1. Sliding-window prediction for one sequence
        >>> from FScanpy import predict_prf
        >>> sequence = "ATGCGTACGT..."
        >>> results = predict_prf(sequence=sequence)

        # 2. Sliding-window prediction for several sequences
        >>> sequences = ["ATGCGTACGT...", "GCTATAGCAT..."]
        >>> results = predict_prf(sequence=sequences)

        # 3. Region prediction from a DataFrame
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        ...     '399bp': ['ATGCGT...', 'GCTATAG...']
        ... })
        >>> results = predict_prf(data=data)
    """
    # Validate the arguments first so bad calls never construct a predictor.
    if sequence is None and data is None:
        raise ValueError("必须提供sequence或data参数之一")
    if sequence is not None and data is not None:
        raise ValueError("sequence和data参数不能同时提供")

    if sequence is not None:
        # Previously an unsupported type fell through and returned None.
        if not isinstance(sequence, (str, list, tuple)):
            raise ValueError("sequence参数必须是字符串或字符串列表/元组")
    else:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("data参数必须是pandas DataFrame类型")
        if '399bp' not in data.columns:
            raise ValueError("DataFrame必须包含'399bp'")

    predictor = PRFPredictor(model_dir=model_dir)

    # Sliding-window mode.
    if sequence is not None:
        if isinstance(sequence, str):
            # Single sequence: return the predictor's result unchanged.
            return predictor.predict_full(
                sequence, window_size, gb_threshold)

        # Several sequences: best effort, one failure must not abort the rest.
        results = []
        for i, seq in enumerate(sequence, 1):
            try:
                result = predictor.predict_full(
                    seq, window_size, gb_threshold)
                result['Sequence_ID'] = f'seq_{i}'
                results.append(result)
            except Exception as e:
                print(f"警告:序列 {i} 预测失败 - {str(e)}")
        return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    # Region mode.
    try:
        results = predictor.predict_region(
            data['399bp'], gb_threshold)
        # Carry the caller's other columns over to the result.
        for col in data.columns:
            if col not in ['399bp', '33bp']:
                results[col] = data[col].values
        return results
    except Exception as e:
        print(f"警告:区域预测失败 - {str(e)}")
        # Fall back to an all-zero result with one row per input row.
        results = pd.DataFrame({
            'GB_Probability': [0.0] * len(data),
            'CNN_Probability': [0.0] * len(data),
            'Voting_Probability': [0.0] * len(data)
        })
        # Attach every original column, including '399bp'.
        for col in data.columns:
            results[col] = data[col].values
        return results