first commit
commit 321c41687e
@ -0,0 +1,109 @@
from .predictor import PRFPredictor
import pandas as pd
import numpy as np
from typing import Union, List, Dict

__version__ = '0.3.0'
__author__ = ''
__email__ = ''

__all__ = ['PRFPredictor', 'predict_prf', '__version__', '__author__', '__email__']


def predict_prf(
    sequence: Union[str, List[str], None] = None,
    data: Union[pd.DataFrame, None] = None,
    window_size: int = 3,
    gb_threshold: float = 0.1,
    model_dir: str = None
) -> pd.DataFrame:
    """
    Predict PRF sites.

    Args:
        sequence: one or more DNA sequences, for sliding-window prediction
        data: DataFrame that must contain a '399bp' column, for region prediction
        window_size: sliding-window step size (default 3)
        gb_threshold: GB model probability threshold (default 0.1)
        model_dir: path to the model directory (optional)

    Returns:
        pandas.DataFrame: prediction results

    Examples:
        # 1. Sliding-window prediction on a single sequence
        >>> from FScanpy import predict_prf
        >>> sequence = "ATGCGTACGT..."
        >>> results = predict_prf(sequence=sequence)

        # 2. Sliding-window prediction on multiple sequences
        >>> sequences = ["ATGCGTACGT...", "GCTATAGCAT..."]
        >>> results = predict_prf(sequence=sequences)

        # 3. Region prediction from a DataFrame
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        ...     '399bp': ['ATGCGT...', 'GCTATAG...']
        ... })
        >>> results = predict_prf(data=data)
    """
    predictor = PRFPredictor(model_dir=model_dir)

    # Validate the input arguments.
    if sequence is None and data is None:
        raise ValueError("Either the 'sequence' or the 'data' argument must be provided")
    if sequence is not None and data is not None:
        raise ValueError("The 'sequence' and 'data' arguments are mutually exclusive")

    # Sliding-window prediction mode.
    if sequence is not None:
        if isinstance(sequence, str):
            # Single-sequence prediction.
            return predictor.predict_full(
                sequence, window_size, gb_threshold)
        elif isinstance(sequence, (list, tuple)):
            # Multi-sequence prediction.
            results = []
            for i, seq in enumerate(sequence, 1):
                try:
                    result = predictor.predict_full(
                        seq, window_size, gb_threshold)
                    result['Sequence_ID'] = f'seq_{i}'
                    results.append(result)
                except Exception as e:
                    print(f"Warning: prediction failed for sequence {i} - {str(e)}")
            return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

    # Region prediction mode.
    else:
        if not isinstance(data, pd.DataFrame):
            raise ValueError("The 'data' argument must be a pandas DataFrame")

        if '399bp' not in data.columns:
            raise ValueError("The DataFrame must contain a '399bp' column")

        # Call the region prediction function.
        try:
            results = predictor.predict_region(
                data['399bp'], gb_threshold)

            # Carry over the remaining columns of the input data.
            for col in data.columns:
                if col not in ['399bp', '33bp']:
                    results[col] = data[col].values

            return results

        except Exception as e:
            print(f"Warning: region prediction failed - {str(e)}")
            # Create an empty result.
            results = pd.DataFrame({
                'GB_Probability': [0.0] * len(data),
                'CNN_Probability': [0.0] * len(data),
                'Voting_Probability': [0.0] * len(data)
            })

            # Carry over the original data columns.
            for col in data.columns:
                results[col] = data[col].values

            return results
@ -0,0 +1,9 @@
import os
import pkg_resources


def get_test_data_path(filename: str) -> str:
    return pkg_resources.resource_filename('FScanpy', f'data/test_data/{filename}')


def list_test_data() -> list:
    data_dir = pkg_resources.resource_filename('FScanpy', 'data/test_data')
    return os.listdir(data_dir)
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,8 @@
FS_period,399bp,fs_position,DNA_seqid,label,source,FS_type,dataset
gtgtgaacacaatagtgagtgacatactaaacg,ggatatgtaacatggacaagtcattgtgtaggtatccaagaccaatagcctttactttcaaagggaaagaagaattcaggaccttgtttggaggactcatctcgatgtcgattcaggtggtcattgtgctctatgcttatattatgctaaagataatgatagaacgtaatgacacatcaaaaagtgtgaacacaatagtgagtgacatactaaacgacaaatctccagtatctctcaatacaacagatttctcgtttgcatttgatgcttttattcttggcgatgataatttcgatttcaacaataaccaatacttcggaattgagctacttcaatggattaagcagccagatactggagaactatcatccactaatattccatatgaaagatgtggaa,16.0,MSTRG.18491.1,0,EUPLOTES,negative,EUPLOTES
gtctcagaagagtctgaggaatatctccaagga,caaattaataacaaatatgaattccatcaacaacttttatggagacgagaacttatcagatgaacttctgagtgaagatgtcgtgtcttgagaagtaagaggatcagaaaagatcttgcataacatggggagaaagtctctcagtaataagaagcctttaagcggagtggagttggactgagagtctcagaagagtctgaggaatatctccaaggataaaatttgttcgcaaggaagatctatctttaggcagaagaagtcaaaatcttgtgatcaagtagaagaacctcttagtagtcttaaagataacatgagtcactttaatgacatagacttgcaagctagtaagcctctaaaatcagagattagcaatctttttgggtactcaactcagcccaa,16.0,MSTRG.4662.1,0,EUPLOTES,negative,EUPLOTES
cttacttgcaaacatgaatctaataaattagag,ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc,16.0,MSTRG.14742.1,0,EUPLOTES,negative,EUPLOTES
atagtagaagatacagtctccatggccagtaca,ttgttgataaaaatacaggattttatctccaattaaagtcgagaaaataagccaagtagcagtaaacgtagataaacgaataagcatggaaaacaataatagaatagaccaaattgaagagatttctcacaattcgtatttgaattactgttagatttaggaataacaaatcgaactcgacacatagtagaagatacagtctccatggccagtacaaaagccaccatgaagaagaggaaaaagaacagataaatgctatcaaagaaaataaagttaacgatcaatatacaagcatgattgttattaatttaaccaactttagataatcagattataatcagcgacgatcaaagcaaaaaggtaaagaacaaccaattcaaattgaacatcagaaaggga,16.0,CUFF.17967.2,0,EUPLOTES,negative,EUPLOTES
cctcgtctttgtctccagaaaataagaaaacaa,catcaataaatagagtcaatgttagaagtatgtctaaattcaaaccaaacgaaattctaaataaagcaagaatgccaacataattaatttagattaagcttgatagttcattagtactcaacaataaaaatatttcaaagggtaatattccagaatcaaaattaagaaataaaattattcctacctcgtctttgtctccagaaaataagaaaacaaataaatcagttatgttcgaaaatgttaaagagatggaaagccaggacaagtcgcaaaatacactaacacatttgaaagaaagcaataatggtagtccttccaaattttaaaactgaaaataatcttgcagatgtagttcgatctagagataataaagcttataacagtactctaaacttaaaa,16.0,CUFF.22392.1,0,EUPLOTES,negative,EUPLOTES
aaaaatgacaaagatctgaacattagttctttc,ttaattttgttctgatcacctaattgtaagcccaaaaacgatactcaaaagatgaggaaactttattggaacattaaaagtaatctcttgagtttatttatgctaactatttacatacgaagcttttacgaaacattccaattcttggctcttgctggcttatcagctacttggaacaacgacaaaaatgacaaagatctgaacattagttctttcatttttgccattgttttgttgtttttatgcacaggtttcttcttatggtcactctaccattactttggatcccgctctgacaatcctcgaaatctcaaaatctctcaggagtttacgaatggagcaaaggagaataatagcggtaaactatatccagtgcttggattgctgagaagaggtctc,16.0,MSTRG.9455.1,0,EUPLOTES,negative,EUPLOTES
agaagactgggagaactctcagatactatatct,agaagaagagaaggccaggagtagctcgaaagaggaggaatttaaggtttacccaaagaaccctatgactgactctaaagatgatcagtcggacactctccctccgaaatcttacagtgtaaagaaagccaatgtaggagaactaaacaagtacgattttgagatctcttattccaaataatgagaagactgggagaactctcagatactatatctgcaagtatgatgaatgaaggcgtaaatttaacaagacttggaactttattgatcacgctaggatacacacaggagagaagccttacaaatgtgagctgtgtggcaaagagtttgctcagaaggggaactacaacaaacacaggaatacccaccagcatagtgccaagaagacctcagtaatga,16.0,MSTRG.26803.1,0,EUPLOTES,negative,EUPLOTES
@ -0,0 +1,4 @@
from .sequence import SequenceFeatureExtractor
from .cnn_input import CNNInputProcessor

__all__ = ['SequenceFeatureExtractor', 'CNNInputProcessor']
@ -0,0 +1,80 @@
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from typing import List, Union


class CNNInputProcessor:
    """Input-data processor for the CNN model."""

    def __init__(self, max_length: int = 399):
        self.max_length = max_length
        self.base_to_num = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}

    def trim_sequence(self, seq, target_length):
        """
        Trim equal amounts from both ends of a sequence so that it reaches
        the target length while keeping the centre position unchanged.

        Args:
            seq: original sequence
            target_length: target length

        Returns:
            the trimmed sequence
        """
        if len(seq) <= target_length:
            return seq

        # Length that has to be removed from each end.
        excess = len(seq) - target_length
        trim_each_side = excess // 2

        # Trim equally from both ends, keeping the centre fixed; slice exactly
        # target_length characters so an odd excess cannot leave one extra base.
        return seq[trim_each_side:trim_each_side + target_length]

    def prepare_sequence(self, sequence: str) -> np.ndarray:
        """
        Process a single sequence.

        Args:
            sequence: DNA sequence

        Returns:
            np.ndarray: processed sequence array
        """
        try:
            # Sequence validation and preprocessing.
            if not isinstance(sequence, str):
                sequence = str(sequence)

            sequence = sequence.upper().replace('U', 'T')

            # Trim the sequence if it is longer than the target length.
            if len(sequence) > self.max_length:
                sequence = self.trim_sequence(sequence, self.max_length)

            # Use the same encoding as during training.
            self.base_to_num = {'A': 0, 'T': 1, 'C': 2, 'G': 3, 'N': 4}  # consistent with SemiBilstmCnn.py

            # Convert the sequence to numbers.
            seq_numeric = []
            for base in sequence:
                seq_numeric.append(self.base_to_num.get(base, 4))  # unknown bases map to 4

            # Pad the sequence.
            if len(seq_numeric) < self.max_length:
                seq_numeric.extend([4] * (self.max_length - len(seq_numeric)))

            # Reshape the data into a 3-D array (samples, timesteps, features).
            result = np.array(seq_numeric).reshape(1, self.max_length, 1)

            # Check the result dimensions.
            if result.ndim != 3:
                print(f"Warning: unexpected CNN input dimensionality - {result.ndim}, expected 3")
                # Force the correct shape.
                result = result.reshape(1, self.max_length, 1)

            return result

        except Exception as e:
            print(f"CNN sequence processing failed: {str(e)}")
            # On error, return an all-zero 3-D array.
            return np.zeros((1, self.max_length, 1))
@ -0,0 +1,283 @@
import numpy as np
import pandas as pd
import itertools
from typing import List, Dict, Union


class SequenceFeatureExtractor:
    """DNA sequence feature extractor."""

    def __init__(self, seq_length=33):
        """Initialise the feature extractor."""
        self.bases = ['A', 'T', 'G', 'C']
        self.valid_bases = set('ATGCN')
        self.seq_length = seq_length  # configurable sequence length
        self.feature_names = self._get_feature_names()

    def _get_feature_names(self) -> List[str]:
        """
        Return the list of feature names, covering every possible base feature.

        Returns:
            features: list of feature names
        """
        features = []

        # Base features (including N).
        bases = ['A', 'T', 'G', 'C', 'N']
        features.extend(bases)

        # 3-mer features.
        kmers_3 = [''.join(p) for p in itertools.product(bases, repeat=3)]  # 125 features
        features.extend(kmers_3)

        # Codon features.
        codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]  # 64 codons
        n_codons = self.seq_length // 3  # number of complete codons in the sequence
        for i in range(n_codons):
            for codon in codons:
                features.append(f'codon_pos_{i}_{codon}')

        # GC-content feature.
        features.append('gc_content')

        # Sequence-complexity feature.
        features.append('sequence_complexity')

        return features

    def trim_sequence(self, seq, target_length):
        """
        Trim equal amounts from both ends of a sequence so that it reaches
        the target length.

        Args:
            seq: original sequence
            target_length: target length

        Returns:
            the trimmed sequence
        """
        if len(seq) <= target_length:
            return seq

        # Length that has to be removed from each end.
        excess = len(seq) - target_length
        trim_each_side = excess // 2

        # Trim equally from both ends, keeping the centre fixed; slice exactly
        # target_length characters so an odd excess cannot leave one extra base.
        return seq[trim_each_side:trim_each_side + target_length]

    def _preprocess_sequence(self, sequence):
        """
        Convert a DNA sequence into a feature vector.

        Args:
            sequence: DNA sequence

        Returns:
            feature_vector: feature vector
        """
        try:
            feature_names = self.feature_names

            if pd.isna(sequence) or not isinstance(sequence, str):
                sequence = str(sequence)
            sequence = sequence.upper().replace('U', 'T')  # normalise to upper case

            # Trim or pad the sequence to the target length.
            if len(sequence) > self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)
            else:
                sequence = sequence[:self.seq_length].ljust(self.seq_length, 'N')

            # Initialise the feature dictionary.
            features = {
                'A': 0,
                'T': 0,
                'G': 0,
                'C': 0,
                'N': 0
            }
            kmer_features = {}

            # Base composition.
            for base in ['A', 'T', 'G', 'C', 'N']:
                features[base] = sequence.count(base) / self.seq_length

            # 3-mer features.
            for kmer in [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)]:
                kmer_count = 0
                for i in range(self.seq_length - 2):
                    if sequence[i:i+3] == kmer:
                        kmer_count += 1
                kmer_features[kmer] = kmer_count / max(1, self.seq_length - 2)

            # Codon features.
            codon_features = {}
            codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]  # 64 codons
            n_codons = self.seq_length // 3  # number of complete codons in the sequence
            for i in range(n_codons):
                pos_start = i * 3
                current_codon = sequence[pos_start:pos_start+3]
                for codon in codons:
                    codon_features[f'codon_pos_{i}_{codon}'] = 1 if current_codon == codon and 'N' not in current_codon else 0

            # GC content.
            valid_bases = [b for b in sequence if b != 'N']
            gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0

            # Sequence complexity (Shannon entropy).
            from collections import Counter
            valid_counts = Counter(valid_bases)
            total_valid = sum(valid_counts.values())
            entropy = 0
            for cnt in valid_counts.values():
                p = cnt / total_valid
                entropy += -p * np.log2(p)
            entropy /= np.log2(4)  # normalise to 0-1

            # Merge all features.
            all_features = {**features, **kmer_features, **codon_features}
            all_features['gc_content'] = gc_content
            all_features['sequence_complexity'] = entropy

            # Keep the feature order consistent.
            feature_vector = [all_features.get(f, 0.0) for f in feature_names]

            return feature_vector
        except Exception as e:
            raise ValueError(f"Feature extraction failed: {str(e)}")

    def extract_features_batch(self, sequences: List[Union[str, float]]) -> np.ndarray:
        """
        Extract features in batch.

        Args:
            sequences: list of DNA sequences

        Returns:
            np.ndarray: feature matrix
        """
        try:
            return np.array([self.extract_features(seq) for seq in sequences])
        except Exception as e:
            raise ValueError(f"Batch feature extraction failed: {str(e)}")

    def predict_region_batch(self, data: pd.DataFrame, gb_threshold: float = 0.1) -> pd.DataFrame:
        """
        Predict region sequences in batch.

        Args:
            data: DataFrame containing '33bp' and '399bp' columns
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: prediction results
        """
        results = []
        for idx, row in data.iterrows():
            try:
                # Make sure the sequences are strings.
                seq_33bp = str(row['33bp'])
                seq = str(row['399bp'])

                # Make sure the sequence lengths are correct.
                seq_33bp = self._preprocess_sequence(seq_33bp)
                seq = self._preprocess_sequence(seq)

                # Predict.
                result = self.predict_region(seq_33bp, seq, gb_threshold)

                # Carry over the remaining columns of the input data.
                for col in data.columns:
                    if col not in ['33bp', '399bp']:
                        result[col] = row[col]

                results.append(result)

            except Exception as e:
                print(f"Error while processing the sequence at index {idx}: {str(e)}")
                continue

        return pd.DataFrame(results)

    def extract_features(self, sequence: str) -> list:
        """
        Extract sequence features.

        Args:
            sequence: DNA sequence

        Returns:
            list: feature vector
        """
        try:
            # Make sure the input is a string.
            if not isinstance(sequence, str):
                sequence = str(sequence)

            # Upper-case and replace U with T.
            sequence = sequence.upper().replace('U', 'T')

            # Trim the sequence if its length differs from the target length.
            if len(sequence) != self.seq_length:
                sequence = self.trim_sequence(sequence, self.seq_length)

            # Initialise the feature list.
            features = []

            try:
                # Base features (base frequencies).
                for base in ['A', 'T', 'G', 'C', 'N']:
                    features.append(sequence.count(base) / len(sequence))

                # 3-mer features.
                for kmer in [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C', 'N'], repeat=3)]:
                    count = 0
                    for i in range(len(sequence) - 2):
                        if sequence[i:i+3] == kmer:
                            count += 1
                    features.append(count / max(1, len(sequence) - 2))

                # Codon features.
                codons = [''.join(p) for p in itertools.product(['A', 'T', 'G', 'C'], repeat=3)]
                n_codons = len(sequence) // 3
                for i in range(n_codons):
                    pos_start = i * 3
                    current_codon = sequence[pos_start:pos_start+3]
                    for codon in codons:
                        features.append(1 if current_codon == codon and 'N' not in current_codon else 0)

                # GC content.
                valid_bases = [b for b in sequence if b != 'N']
                gc_content = (valid_bases.count('G') + valid_bases.count('C')) / len(valid_bases) if valid_bases else 0
                features.append(gc_content)

                # Sequence complexity.
                from collections import Counter
                valid_counts = Counter(valid_bases)
                total_valid = sum(valid_counts.values())
                entropy = 0
                if total_valid > 0:  # avoid division by zero
                    for cnt in valid_counts.values():
                        if cnt > 0:  # avoid log(0)
                            p = cnt / total_valid
                            entropy += -p * np.log2(p)
                    entropy /= np.log2(4) if len(valid_counts) > 0 else 1  # normalise to 0-1, avoiding division by zero
                features.append(entropy)

                # Make sure a flat list/array is returned.
                if isinstance(features, np.ndarray) and features.ndim > 1:
                    features = features.flatten()

                return features

            except Exception as e:
                print(f"Error during feature computation: {str(e)}")
                # On error, return an all-zero feature vector of the correct length.
                expected_length = 5 + 125 + (len(sequence) // 3) * 64 + 2  # derived from the feature-extraction logic above
                return [0.0] * expected_length

        except Exception as e:
            print(f"Feature extraction failed: {str(e)}")
            # Return an empty list; callers must handle this case.
            return []
@ -0,0 +1,434 @@
import os
import pickle
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from .features.sequence import SequenceFeatureExtractor
from .features.cnn_input import CNNInputProcessor
from .utils import extract_window_sequences
import matplotlib.pyplot as plt
import joblib


class PRFPredictor:

    def __init__(self, model_dir=None):
        """
        Initialise the PRF predictor.

        Args:
            model_dir: path to the model directory (optional)
        """
        if model_dir is None:
            from pkg_resources import resource_filename
            model_dir = resource_filename('FScanpy', 'pretrained')

        try:
            # Load the models.
            self.gb_model = self._load_pickle(os.path.join(model_dir, 'GradientBoosting_all.pkl'))
            self.cnn_model = self._load_pickle(os.path.join(model_dir, 'BiLSTM-CNN_all.pkl'))
            self.voting_model = self._load_pickle(os.path.join(model_dir, 'Voting_all.pkl'))

            # Use the same sequence lengths as during training.
            self.gb_seq_length = 33    # sequence length used by HistGradientBoosting
            self.cnn_seq_length = 399  # sequence length used by BiLSTM-CNN

            # Initialise the feature extractor and the CNN input processor.
            self.feature_extractor = SequenceFeatureExtractor(seq_length=self.gb_seq_length)
            self.cnn_processor = CNNInputProcessor(max_length=self.cnn_seq_length)

        except FileNotFoundError as e:
            raise FileNotFoundError(f"Model file not found: {str(e)}")
        except Exception as e:
            raise Exception(f"Error while loading the models: {str(e)}")

    def _load_pickle(self, path):
        return joblib.load(path)

    def predict_single_position(self, fs_period, full_seq, gb_threshold=0.1):
        '''
        Predict the PRF state at a single position.

        Args:
            fs_period: 33bp sequence (processed according to gb_seq_length)
            full_seq: full sequence (processed according to cnn_seq_length)
            gb_threshold: probability threshold of the GB model (default 0.1)
        Returns:
            dict: the prediction probabilities
        '''
        try:
            # Adjust the sequence length.
            if len(fs_period) > self.gb_seq_length:
                fs_period = self.feature_extractor.trim_sequence(fs_period, self.gb_seq_length)

            # GB model prediction - the input must be a 2-D array.
            try:
                gb_features = self.feature_extractor.extract_features(fs_period)

                # Check the feature structure and make sure it is one-dimensional.
                if isinstance(gb_features, np.ndarray):
                    # Flatten multi-dimensional arrays.
                    if gb_features.ndim > 1:
                        print(f"Warning: features form a {gb_features.ndim}-D array, flattening")
                        gb_features = gb_features.flatten()

                # Explicitly convert the features to a 2-D array of shape (1, n_features).
                gb_features_2d = np.array([gb_features])

                # Check the dimensions once more.
                if gb_features_2d.ndim != 2:
                    raise ValueError(f"Features are still {gb_features_2d.ndim}-D after processing; a 2-D array is required")

                gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
            except Exception as e:
                print(f"Error during GB model prediction: {str(e)}")
                # Fall back to a probability of 0.
                gb_prob = 0.0

            # Skip the CNN model when the GB probability is below the threshold.
            if gb_prob < gb_threshold:
                return {
                    'GB_Probability': gb_prob,
                    'CNN_Probability': 0.0,
                    'Voting_Probability': 0.0
                }

            # CNN model prediction.
            try:
                # First determine the CNN model's type.
                is_sklearn_model = False

                # Heuristic for detecting the model type.
                if hasattr(self.cnn_model, 'predict_proba'):
                    # This is probably a scikit-learn model.
                    is_sklearn_model = True

                if is_sklearn_model:
                    # For a sklearn model (e.g. HistGradientBoostingClassifier), use the same
                    # feature extraction as for GB, but on the 399bp sequence.
                    cnn_features = self.feature_extractor.extract_features(full_seq)
                    if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                        cnn_features = cnn_features.flatten()
                    # Convert to a 2-D array.
                    cnn_features_2d = np.array([cnn_features])
                    cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                    cnn_prob = cnn_pred[0][1]
                else:
                    # Assume a deep-learning model that expects 3-D input.
                    cnn_input = self.cnn_processor.prepare_sequence(full_seq)
                    # Try different prediction call signatures.
                    try:
                        # First without extra arguments.
                        cnn_pred = self.cnn_model.predict(cnn_input)
                    except TypeError:
                        try:
                            # Then with the verbose argument.
                            cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                        except Exception:
                            # Finally reshape the input to 2-D.
                            reshaped_input = cnn_input.reshape(1, -1)
                            cnn_pred = self.cnn_model.predict(reshaped_input)

                    # Post-process the prediction.
                    if isinstance(cnn_pred, list):
                        cnn_pred = cnn_pred[0]

                    # Extract the probability value.
                    if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                        cnn_prob = cnn_pred[0][1]
                    else:
                        cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
            except Exception as e:
                print(f"Error during CNN model prediction: {str(e)}")
                # Fall back to a probability of 0.
                cnn_prob = 0.0

            # Voting model prediction.
            try:
                # The voting-model input must be a 2-D array (1, n_features).
                voting_input = np.array([[gb_prob, cnn_prob]])
                voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
            except Exception as e:
                print(f"Error during voting model prediction: {str(e)}")
                # Fall back to the mean of the two model probabilities.
                voting_prob = (gb_prob + cnn_prob) / 2

            return {
                'GB_Probability': gb_prob,
                'CNN_Probability': cnn_prob,
                'Voting_Probability': voting_prob
            }

        except Exception as e:
            raise Exception(f"Error during prediction: {str(e)}")

    def predict_full(self, sequence, window_size=3, gb_threshold=0.1, plot=False):
        """
        Predict PRF sites in a full sequence.

        Args:
            sequence: input DNA sequence
            window_size: sliding-window step size (default 3)
            gb_threshold: GB model probability threshold (default 0.1)
            plot: whether to plot the prediction results (default False)

        Returns:
            if plot=False:
                pd.DataFrame: prediction results
            if plot=True:
                tuple: (pd.DataFrame, matplotlib.figure.Figure)
        """
        if window_size < 1:
            raise ValueError("The window size must be at least 1")
        if gb_threshold < 0:
            raise ValueError("The GB threshold must be non-negative")

        results = []

        try:
            # Make sure the sequence is an upper-case string.
            sequence = str(sequence).upper()

            # Sliding-window prediction.
            for pos in range(0, len(sequence) - 2, window_size):
                # Extract the window sequences - the same window sizes as during training.
                fs_period, full_seq = extract_window_sequences(sequence, pos)

                if fs_period is None or full_seq is None:
                    continue

                # Predict and record the result.
                pred = self.predict_single_position(fs_period, full_seq, gb_threshold)
                pred.update({
                    'Position': pos,
                    'Codon': sequence[pos:pos+3],
                    '33bp': fs_period,
                    '399bp': full_seq
                })
                results.append(pred)

            # Build the result DataFrame.
            results_df = pd.DataFrame(results)

            # Plot if requested.
            if plot:
                # Create the figure.
                fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), height_ratios=[2, 1])

                # Line plots.
                ax1.plot(results_df['Position'], results_df['GB_Probability'],
                         label='GB model', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['CNN_Probability'],
                         label='CNN model', alpha=0.7, linewidth=1.5)
                ax1.plot(results_df['Position'], results_df['Voting_Probability'],
                         label='Voting model', linewidth=2, color='red')

                ax1.set_xlabel('Sequence position')
                ax1.set_ylabel('Frameshift probability')
                ax1.set_title('Frameshift prediction probabilities')
                ax1.legend()
                ax1.grid(True, alpha=0.3)

                # Prepare the heat-map data.
                positions = results_df['Position'].values
                probabilities = results_df['Voting_Probability'].values

                # Build the heat-map matrix.
                heatmap_matrix = np.zeros((1, len(positions)))
                heatmap_matrix[0, :] = probabilities

                # Plot the heat map.
                im = ax2.imshow(heatmap_matrix, aspect='auto', cmap='YlOrRd',
                                extent=[min(positions), max(positions), 0, 1])

                # Add a colour bar.
                cbar = plt.colorbar(im, ax=ax2)
                cbar.set_label('Frameshift probability')

                # Label the heat-map axes.
                ax2.set_xlabel('Sequence position')
                ax2.set_title('Frameshift probability heat map')
                ax2.set_yticks([])

                # Adjust the layout.
                plt.tight_layout()

                return results_df, fig

            return results_df

        except Exception as e:
            raise Exception(f"Error during sequence prediction: {str(e)}")

    def predict_region(self, seq, gb_threshold=0.1):
        '''
        Predict region sequences.

        Args:
            seq: a 399bp sequence, or a DataFrame/Series containing 399bp sequences
            gb_threshold: GB model probability threshold (default 0.1)

        Returns:
            DataFrame: prediction probabilities for all sequences
        '''
        try:
            # Convert a DataFrame or Series input to a list.
            if isinstance(seq, (pd.DataFrame, pd.Series)):
                seq = seq.tolist()

            # Convert a single-string input to a list.
            if isinstance(seq, str):
                seq = [seq]

            results = []
            for i, seq399 in enumerate(seq):
                try:
                    # Cut the central 33bp out of the 399bp sequence (used by the GB model).
                    seq33 = self._extract_center_sequence(seq399, target_length=self.gb_seq_length)

                    # GB model prediction - the input must be a 2-D array.
                    try:
                        gb_features = self.feature_extractor.extract_features(seq33)

                        # Check the feature structure and make sure it is one-dimensional.
                        if isinstance(gb_features, np.ndarray):
                            # Flatten multi-dimensional arrays.
                            if gb_features.ndim > 1:
                                print(f"Warning: features of sequence {i+1} form a {gb_features.ndim}-D array, flattening")
                                gb_features = gb_features.flatten()

                        # Explicitly convert the features to a 2-D array of shape (1, n_features).
                        gb_features_2d = np.array([gb_features])

                        # Check the dimensions once more.
                        if gb_features_2d.ndim != 2:
                            raise ValueError(f"Features are still {gb_features_2d.ndim}-D after processing; a 2-D array is required")

                        gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1]
                    except Exception as e:
                        print(f"Error during GB model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to a probability of 0.
                        gb_prob = 0.0

                    # Record a low-probability result when the GB probability is below the threshold.
                    if gb_prob < gb_threshold:
                        results.append({
                            'GB_Probability': gb_prob,
                            'CNN_Probability': 0.0,
                            'Voting_Probability': 0.0,
                            '33bp': seq33,
                            '399bp': seq399
                        })
                        continue

                    # CNN model prediction.
                    try:
                        # First determine the CNN model's type.
                        is_sklearn_model = False

                        # Heuristic for detecting the model type.
                        if hasattr(self.cnn_model, 'predict_proba'):
                            # This is probably a scikit-learn model.
                            is_sklearn_model = True

                        if is_sklearn_model:
                            # For a sklearn model (e.g. HistGradientBoostingClassifier), use the same
                            # feature extraction as for GB, but on the 399bp sequence.
                            cnn_features = self.feature_extractor.extract_features(seq399)
                            if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1:
                                cnn_features = cnn_features.flatten()
                            # Convert to a 2-D array.
                            cnn_features_2d = np.array([cnn_features])
                            cnn_pred = self.cnn_model.predict_proba(cnn_features_2d)
                            cnn_prob = cnn_pred[0][1]
                        else:
                            # Assume a deep-learning model that expects 3-D input.
                            cnn_input = self.cnn_processor.prepare_sequence(seq399)
                            # Try different prediction call signatures.
                            try:
                                # First without extra arguments.
                                cnn_pred = self.cnn_model.predict(cnn_input)
                            except TypeError:
                                try:
                                    # Then with the verbose argument.
                                    cnn_pred = self.cnn_model.predict(cnn_input, verbose=0)
                                except Exception:
                                    # Finally reshape the input to 2-D.
                                    reshaped_input = cnn_input.reshape(1, -1)
                                    cnn_pred = self.cnn_model.predict(reshaped_input)

                            # Post-process the prediction.
                            if isinstance(cnn_pred, list):
                                cnn_pred = cnn_pred[0]

                            # Extract the probability value.
                            if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1:
                                cnn_prob = cnn_pred[0][1]
                            else:
                                cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0]
                    except Exception as e:
                        print(f"Error during CNN model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to a probability of 0.
                        cnn_prob = 0.0

                    # Voting model prediction.
                    try:
                        # The voting-model input must be a 2-D array (1, n_features).
                        voting_input = np.array([[gb_prob, cnn_prob]])
                        voting_prob = self.voting_model.predict_proba(voting_input)[0][1]
                    except Exception as e:
                        print(f"Error during voting model prediction for sequence {i+1}: {str(e)}")
                        # Fall back to the mean of the two model probabilities.
                        voting_prob = (gb_prob + cnn_prob) / 2

                    results.append({
                        'GB_Probability': gb_prob,
                        'CNN_Probability': cnn_prob,
                        'Voting_Probability': voting_prob,
                        '33bp': seq33,
                        '399bp': seq399
                    })

                except Exception as e:
                    print(f"Error while processing sequence {i+1}: {str(e)}")
                    results.append({
                        'GB_Probability': 0.0,
                        'CNN_Probability': 0.0,
                        'Voting_Probability': 0.0,
                        '33bp': self._extract_center_sequence(seq399, target_length=self.gb_seq_length) if len(seq399) >= self.gb_seq_length else seq399,
                        '399bp': seq399
                    })

            return pd.DataFrame(results)

        except Exception as e:
            raise Exception(f"Error during region prediction: {str(e)}")

    def _extract_center_sequence(self, sequence, target_length=33):
        """Extract a subsequence of the given length from the centre of a sequence."""
        # Make sure the sequence is an upper-case string.
        sequence = str(sequence).upper()

        # Return the sequence unchanged if it is not longer than the target length.
        if len(sequence) <= target_length:
            return sequence

        # Locate the centre.
        center = len(sequence) // 2
        half_target = target_length // 2

        # Extract the central subsequence.
        start = center - half_target
        end = start + target_length

        # Boundary checks.
        if start < 0:
            start = 0
            end = target_length
        elif end > len(sequence):
            end = len(sequence)
            start = end - target_length

        return sequence[start:end]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,203 @@
import numpy as np
import pandas as pd
from typing import Tuple, Optional
from Bio import SeqIO
from Bio.Seq import Seq


def fscanr(blastx_output: pd.DataFrame,
           mismatch_cutoff: float = 10,
           evalue_cutoff: float = 1e-5,
           frameDist_cutoff: float = 10) -> pd.DataFrame:
    """
    Identify PRF sites from BLASTX output.

    Args:
        blastx_output: BLASTX output DataFrame
        mismatch_cutoff: mismatch threshold
        evalue_cutoff: E-value threshold
        frameDist_cutoff: frame distance threshold

    Returns:
        pd.DataFrame: DataFrame containing PRF site information
    """
    blastx = blastx_output.copy()

    blastx.columns = ["qseqid", "sseqid", "pident", "length", "mismatch",
                      "gapopen", "qstart", "qend", "sstart", "send",
                      "evalue", "bitscore", "qframe", "sframe"]

    # Keep only significant, low-mismatch hits.
    blastx = blastx[
        (blastx['evalue'] <= evalue_cutoff) &
        (blastx['mismatch'] <= mismatch_cutoff)
    ].dropna()

    # Keep only query sequences with more than one hit.
    freq = blastx['qseqid'].value_counts()
    multi_hits = freq[freq > 1].index
    blastx = blastx[blastx['qseqid'].isin(multi_hits)]

    blastx = blastx.sort_values(['qseqid', 'sseqid', 'qstart'])

    prf_list = []
    for i in range(1, len(blastx)):
        curr = blastx.iloc[i]
        prev = blastx.iloc[i-1]

        # Adjacent hits of the same query/subject pair in different but same-strand frames.
        if (curr['qseqid'] == prev['qseqid'] and
            curr['sseqid'] == prev['sseqid'] and
            curr['qframe'] != prev['qframe'] and
            curr['qframe'] * prev['qframe'] > 0):

            if curr['qframe'] > 0 and prev['qframe'] > 0:
                frame_start = prev['qend']
                frame_end = curr['qstart']
                pep_start = prev['send']
                pep_end = curr['sstart']
                strand = "+"
            elif curr['qframe'] < 0 and prev['qframe'] < 0:
                frame_start = prev['qstart']
                frame_end = curr['qend']
                pep_start = curr['send']
                pep_end = prev['sstart']
                strand = "-"
            else:
                continue

            q_dist = frame_end - frame_start - 1
            s_dist = pep_end - pep_start
            fs_type = q_dist + (1 - s_dist) * 3

            if (abs(q_dist) <= frameDist_cutoff and
                abs(s_dist) <= frameDist_cutoff // 3 and
                -3 < fs_type < 3):

                prf_list.append({
                    'DNA_seqid': curr['qseqid'],
                    'FS_start': frame_start,
                    'FS_end': frame_end,
                    'Pep_seqid': curr['sseqid'],
                    'Pep_FS_start': prev['send'] + 1,
                    'Pep_FS_end': curr['sstart'],
                    'FS_type': fs_type,
                    'Strand': strand
                })

    if not prf_list:
        print("No PRF events detected!")
        return pd.DataFrame()

    prf = pd.DataFrame(prf_list)

    # Drop duplicated loci.
    for col in ['DNA_seqid', 'Pep_seqid']:
        for pos in ['FS_start', 'FS_end']:
            loci = prf[col] + '_' + prf[pos].astype(str)
            prf = prf[~loci.duplicated()]

    return prf


def extract_prf_regions(mrna_file: str, prf_data: pd.DataFrame) -> pd.DataFrame:
    """
    Extract the sequences around PRF sites from mRNA sequences.

    Args:
        mrna_file: path to the mRNA sequence file (FASTA format)
        prf_data: PRF site data produced by FScanR

    Returns:
        pd.DataFrame: DataFrame containing the 399bp sequences
    """
    mrna_dict = {rec.id: str(rec.seq)
                 for rec in SeqIO.parse(mrna_file, "fasta")}

    results = []
    for _, row in prf_data.iterrows():
        seq_id = row['DNA_seqid']
        if seq_id not in mrna_dict:
            print(f"Warning: {seq_id} not found in the mRNA file")
            continue

        sequence = mrna_dict[seq_id]
        strand = row['Strand']
        fs_start = int(row['FS_start'])

        try:
            if strand == '-':
                sequence = str(Seq(sequence).reverse_complement())

            # Only extract the 399bp sequence; the 33bp is cut out inside the predictor.
            full_seq = extract_window_sequences(sequence, fs_start)[1]

            results.append({
                'DNA_seqid': seq_id,
                'FS_start': fs_start,
                'FS_end': int(row['FS_end']),
                'Strand': strand,
                '399bp': full_seq,
                'FS_type': row['FS_type']
            })

        except Exception as e:
            print(f"Error while processing {seq_id}: {str(e)}")
            continue

    return pd.DataFrame(results)


def extract_window_sequences(seq: str, position: int) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract the analysis-window sequences at a given position.

    Args:
        seq: input DNA sequence
        position: current analysis position (FS_start)

    Returns:
        Tuple[str, str]: (33bp sequence, 399bp sequence) - already adjusted to the lengths the trained models expect
    """
    # Snap the position onto a codon boundary (a multiple of 3).
    frame_position = position - (position % 3)

    # Start and end of the 33bp window (GB model).
    half_size_small = 33 // 2
    start_small = frame_position - half_size_small
    end_small = frame_position + half_size_small + (33 % 2)  # add the remainder to handle the odd length

    # Start and end of the 399bp window (CNN model).
    half_size_large = 399 // 2
    start_large = frame_position - half_size_large
    end_large = frame_position + half_size_large + (399 % 2)  # add the remainder to handle the odd length

    # Extract and pad the sequences.
    seq_small = _extract_and_pad(seq, start_small, end_small, 33)
    seq_large = _extract_and_pad(seq, start_large, end_large, 399)

    return seq_small, seq_large


def _extract_and_pad(seq: str, start: int, end: int, target_length: int) -> str:
    """Extract a subsequence and pad it with N."""
    if start < 0:
        prefix = 'N' * abs(start)
        extracted = prefix + seq[:end]
    elif end > len(seq):
        suffix = 'N' * (end - len(seq))
        extracted = seq[start:] + suffix
    else:
        extracted = seq[start:end]

    # Make sure the length is correct.
    if len(extracted) < target_length:
        # Pad around the centre.
        pad_left = (target_length - len(extracted)) // 2
        pad_right = target_length - len(extracted) - pad_left
        extracted = 'N' * pad_left + extracted + 'N' * pad_right
    elif len(extracted) > target_length:
        # Trim equal amounts from both ends; slice exactly target_length
        # characters so an odd excess cannot leave one extra base.
        trim_each_side = (len(extracted) - target_length) // 2
        extracted = extracted[trim_each_side:trim_each_side + target_length]

    return extracted


def prepare_cnn_input(sequence: str) -> np.ndarray:
    """Prepare CNN model input."""
    base_to_num = {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'N': 0}
    seq_numeric = [base_to_num.get(base, 0) for base in sequence.upper()]
    return np.array(seq_numeric).reshape(1, len(sequence), 1)
@ -0,0 +1,34 @@
# FScanpy
## A Machine Learning-Based Framework for Programmed Ribosomal Frameshifting Prediction

FScanpy is a comprehensive Python package designed for the prediction of [Programmed Ribosomal Frameshifting (PRF)](https://en.wikipedia.org/wiki/Ribosomal_frameshift) sites in nucleotide sequences. By integrating advanced machine learning approaches (Gradient Boosting and BiLSTM-CNN) with the established [FScanR](https://github.com/seanchen607/FScanR.git) framework, FScanpy provides robust and accurate PRF site predictions. The package requires input sequences to be in the positive (5' to 3') orientation.

For detailed documentation and usage examples, please refer to our [tutorial](tutorial/tutorial.md).

## Installation Requirements
- Python ≥ 3.7
- Dependencies are automatically handled during installation

### Option 1: Install via pip
```bash
pip install FScanpy
```

### Option 2: Install from source
```bash
git clone https://github.com/seanchen607/FScanpy-package.git
cd FScanpy-package
pip install -e .
```
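
### Quick start
A minimal sketch of the public API (the 30-bp sequence below is a placeholder, not real data; see the tutorial for complete workflows):

```python
from FScanpy import predict_prf

# Sliding-window prediction over a single sequence. The result is a DataFrame
# with one row per scanned position and the GB, CNN and voting probabilities.
results = predict_prf(sequence="ATGCGTACGTATGCGTACGTATGCGTACGT")
print(results[['Position', 'GB_Probability', 'Voting_Probability']].head())
```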

## Authors

## Citation
If you utilize FScanpy in your research, please cite our work:

```bibtex
[Citation details will be added upon publication]
```
@ -0,0 +1,24 @@
[build-system]
requires = ["setuptools>=42", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "FScanpy"
version = "1.0.0"
description = "PRF prediction tool"
authors = [{name = "FScanpy Developer", email = "example@example.com"}]
dependencies = [
    "numpy",
    "pandas",
    "tensorflow",
    "scikit-learn",
    "wrapt>=1.10.11"
]
requires-python = ">=3.7"

[tool.setuptools]
packages = ["FScanpy", "FScanpy.features"]
include-package-data = true

[tool.setuptools.package-data]
"FScanpy.data" = ["test_data/*"]
@ -0,0 +1,20 @@
from setuptools import setup, find_packages

setup(
    name="FScanpy",
    version="1.0.0",
    description="PRF prediction tool",
    author="FScanpy Developer",
    author_email="example@example.com",
    packages=find_packages(),
    install_requires=[
        "numpy",
        "pandas",
        "tensorflow",
        "scikit-learn",
        "wrapt>=1.10.11",
        "biopython"
    ],
    include_package_data=True,
    python_requires=">=3.7",
)
Binary file not shown. After Width: | Height: | Size: 170 KiB
Binary file not shown. After Width: | Height: | Size: 148 KiB
@ -0,0 +1,164 @@
## Abstract
FScanpy is a Python package designed to predict Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates advanced machine learning models, including Gradient Boosting and BiLSTM-CNN, to provide accurate predictions. This tool is essential for understanding gene expression regulation in various organisms, including eukaryotes and viruses, and offers a robust solution for PRF prediction challenges.

## Introduction

FScanpy is a Python package dedicated to predicting Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates machine learning models (Gradient Boosting and BiLSTM-CNN) along with the FScanR package to furnish precise PRF predictions. Users can supply three types of data as input: the entire cDNA/mRNA sequence to be predicted, the nucleotide sequence in the vicinity of a suspected frameshift site, and the peptide-library blastx results of the species or related species. FScanpy expects the input sequence to be on the + strand and can be integrated with FScanR to improve accuracy.

For the prediction of an entire sequence, FScanpy adopts a sliding-window approach to scan the sequence and predict the PRF sites. Regional prediction is based on the 33-bp and 399-bp sequences in the 0 reading frame around the suspected frameshift site. First, the Gradient Boosting model predicts potential PRF sites within the scanning window. If the predicted probability exceeds the threshold, the BiLSTM-CNN model predicts the PRF sites in the 399bp sequence. A VotingClassifier then combines the two models to make the final prediction (a minimal sketch of this cascade follows below).
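
The cascade can be summarised in a few lines of Python. This is an illustrative sketch only, not the packaged implementation (see `PRFPredictor.predict_single_position` for the real logic); `gb`, `cnn` and `voting` stand for the three pretrained models, and `gb_features`/`cnn_input` for the package's feature extractors:

```python
def cascade(seq33, seq399, gb, cnn, voting,
            gb_features, cnn_input, gb_threshold=0.1):
    """Two-stage PRF cascade: GB screen, then CNN, then voting.

    gb/cnn/voting are the pretrained models; gb_features and cnn_input
    are callables that build the inputs each model expects.
    """
    # Stage 1: cheap Gradient Boosting screen on the 33bp window.
    gb_prob = gb.predict_proba([gb_features(seq33)])[0][1]
    if gb_prob < gb_threshold:
        # Below the threshold the expensive CNN stage is skipped entirely.
        return {'GB': gb_prob, 'CNN': 0.0, 'Voting': 0.0}
    # Stage 2: BiLSTM-CNN on the 399bp window.
    cnn_prob = float(cnn.predict(cnn_input(seq399))[0][0])
    # Stage 3: the voting classifier combines the two probabilities.
    voting_prob = voting.predict_proba([[gb_prob, cnn_prob]])[0][1]
    return {'GB': gb_prob, 'CNN': cnn_prob, 'Voting': voting_prob}
```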

For PRF detection from BLASTX output, FScanpy pairs the two hits of the same query sequence in the BLASTX alignment results, filters the pairs using frameDist_cutoff, mismatch_cutoff and evalue_cutoff, and finally applies the [FScanR](https://github.com/seanchen607/FScanR.git) logic to identify the PRF sites (a worked example of the frameshift-type arithmetic follows below).
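
The frameshift type is derived from the spacing of the two hits. A worked example of the arithmetic used in `fscanr`, under the sign convention of that function (a sketch; in the real code `q_dist` and `s_dist` come from the BLASTX hit coordinates):

```python
def fs_type(q_dist: int, s_dist: int) -> int:
    # q_dist: nucleotide gap between the two hits on the query
    # s_dist: residue gap between the same hits on the subject peptide
    return q_dist + (1 - s_dist) * 3

# Hits contiguous on the peptide (s_dist = 1) whose query coordinates
# overlap by one nucleotide (q_dist = -1) give fs_type = -1 (a -1 PRF):
assert fs_type(-1, 1) == -1
# A one-nucleotide gap on the query with contiguous peptide hits gives +1:
assert fs_type(1, 1) == 1
# Only events with -3 < fs_type < 3 pass the fscanr filter.
```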

### Background
[Ribosomal frameshifting](https://en.wikipedia.org/wiki/Ribosomal_frameshift), also known as translational frameshifting or translational recoding, is a biological phenomenon that occurs during translation and results in the production of multiple, unique proteins from a single mRNA. The process can be programmed by the nucleotide sequence of the mRNA and is sometimes affected by the secondary, 3-dimensional mRNA structure. It has been described mainly in viruses (especially retroviruses), retrotransposons and bacterial insertion elements, and also in some cellular genes.

### Key features of FScanpy include:

- Integration of two predictive models:
  - [Gradient Boosting](https://tensorflow.google.cn/tutorials/estimator/boosted_trees?hl=en): Analyzes local sequence features centered around potential frameshift sites (10 codons).
  - [BiLSTM-CNN](https://paperswithcode.com/method/cnn-bilstm): Analyzes broader sequence features (100 codons).
- Supports PRF prediction across various species.
- Can be combined with [FScanR](https://github.com/seanchen607/FScanR.git) for enhanced accuracy.

## Installation (python>=3.7)

### 1. Use pip
```bash
pip install FScanpy
```

### 2. Clone from [GitHub](https://github.com/.../FScanpy.git)
```bash
git clone https://github.com/.../FScanpy.git
cd your_project_directory
pip install -e .
```

## Methods and Usage

### 1. Load model and test data
Test data can be found in `FScanpy/data/test_data`. You can use the `list_test_data()` method to list all the test data and the `get_test_data_path()` method to get the path of a test file:
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor()  # load the models
list_test_data()  # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')
```

### 2. Predict PRF Sites in a Full Sequence
Use the `predict_full()` method to scan an entire sequence. You can use the `window_size` parameter to adjust the scanning window step (default is 3) and the `gb_threshold` parameter to adjust the Gradient Boosting model threshold (default is 0.1) for faster or more accurate prediction:
```python
'''
Args:
    sequence: mRNA sequence
    window_size: scanning window size (default is 3)
    gb_threshold: Gradient Boosting model threshold (default is 0.1)
Returns:
    results: DataFrame containing prediction probabilities
    fig: matplotlib figure (returned only when plot=True)
'''
results, fig = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
                                      window_size=3,     # scanning window step
                                      gb_threshold=0.1,  # Gradient Boosting model threshold
                                      plot=True)         # whether to plot the prediction results
fig.savefig('predict_full.png')
```

### 3. Predict PRF in Specific Regions
Use the `predict_region()` method to predict PRF in known regions of interest:
```python
'''
Args:
    seq: 399bp sequence
    gb_threshold: GB model probability threshold (default is 0.1)
Returns:
    DataFrame: prediction probabilities for all sequences
'''
import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])
```

### 4. Identify PRF Sites from BLASTX Output
The BLASTX output should contain the following columns: `qseqid`, `sseqid`, `pident`, `length`, `mismatch`, `gapopen`, `qstart`, `qend`, `sstart`, `send`, `evalue`, `bitscore`, `qframe`, `sframe`.

The FScanR result contains the `DNA_seqid`, `FS_start`, `FS_end`, `FS_type`, `Pep_seqid`, `Pep_FS_start`, `Pep_FS_end` and `Strand` columns.
Use the `fscanr` function to identify potential PRF sites from BLASTX alignment results:
```python
"""
identify PRF sites from BLASTX output

Args:
    blastx_output: BLASTX output DataFrame
    mismatch_cutoff: mismatch threshold
    evalue_cutoff: E-value threshold
    frameDist_cutoff: frame distance threshold

Returns:
    pd.DataFrame: DataFrame containing PRF site information
"""
from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
                       mismatch_cutoff=10,   # allowed mismatches
                       evalue_cutoff=1e-5,   # E-value threshold
                       frameDist_cutoff=10)  # frame distance threshold
```

### 5. Extract PRF Sites from BLASTX Output or your Sequence Data and evaluate them with FScanpy
Use the `extract_prf_regions()` method to extract PRF site sequences from mRNA sequences. It matches the `DNA_seqid` column of the FScanR output against the input mRNA sequence file and uses the `FS_start` column to extract the 399bp sequence around each PRF site in the 0 reading frame (the 33bp window is cut out inside the predictor):
```python
"""
extract PRF site sequences from mRNA sequences

Args:
    mrna_file: mRNA sequence file path (FASTA format)
    prf_data: FScanR output PRF site data, or your own suspected PRF site data, which must at least contain the `DNA_seqid`, `FS_start` and `Strand` columns

Returns:
    pd.DataFrame: DataFrame containing the 399bp sequences
"""
from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
                                  prf_data=fscanr_result)
prf_results = predictor.predict_region(prf_regions['399bp'])
```

## Total Test
```python
from FScanpy import PRFPredictor
from FScanpy.data import get_test_data_path, list_test_data
predictor = PRFPredictor()  # load the models
list_test_data()  # list all the test data
blastx_file = get_test_data_path('blastx_example.xlsx')
mrna_file = get_test_data_path('mrna_example.fasta')
region_example = get_test_data_path('region_example.xlsx')

results, fig = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT',
                                      window_size=3,     # scanning window step
                                      gb_threshold=0.1,  # Gradient Boosting model threshold
                                      plot=True)

import pandas as pd
region_example = pd.read_excel(get_test_data_path('region_example.xlsx'))
results = predictor.predict_region(seq=region_example['399bp'])

from FScanpy.utils import fscanr
blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx'))
fscanr_result = fscanr(blastx_output,
                       mismatch_cutoff=10,   # allowed mismatches
                       evalue_cutoff=1e-5,   # E-value threshold
                       frameDist_cutoff=10)

from FScanpy.utils import extract_prf_regions
prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'),
                                  prf_data=fscanr_result)
prf_results = predictor.predict_region(prf_regions['399bp'])
```

## Citation
If you use FScanpy, please cite our paper: [Paper Link]