diff --git a/FScanpy.egg-info/PKG-INFO b/FScanpy.egg-info/PKG-INFO deleted file mode 100644 index 31947af..0000000 --- a/FScanpy.egg-info/PKG-INFO +++ /dev/null @@ -1,14 +0,0 @@ -Metadata-Version: 2.4 -Name: FScanpy -Version: 1.0.0 -Summary: PRF prediction tool -Author: FScanpy Developer -Author-email: FScanpy Developer -Requires-Python: >=3.7 -Requires-Dist: numpy -Requires-Dist: pandas -Requires-Dist: tensorflow -Requires-Dist: scikit-learn -Requires-Dist: wrapt>=1.10.11 -Dynamic: author -Dynamic: requires-python diff --git a/FScanpy.egg-info/SOURCES.txt b/FScanpy.egg-info/SOURCES.txt deleted file mode 100644 index 1ca5e1d..0000000 --- a/FScanpy.egg-info/SOURCES.txt +++ /dev/null @@ -1,14 +0,0 @@ -README.md -pyproject.toml -setup.py -FScanpy/__init__.py -FScanpy/predictor.py -FScanpy/utils.py -FScanpy.egg-info/PKG-INFO -FScanpy.egg-info/SOURCES.txt -FScanpy.egg-info/dependency_links.txt -FScanpy.egg-info/requires.txt -FScanpy.egg-info/top_level.txt -FScanpy/features/__init__.py -FScanpy/features/cnn_input.py -FScanpy/features/sequence.py \ No newline at end of file diff --git a/FScanpy.egg-info/dependency_links.txt b/FScanpy.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/FScanpy.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/FScanpy.egg-info/requires.txt b/FScanpy.egg-info/requires.txt deleted file mode 100644 index 8351bce..0000000 --- a/FScanpy.egg-info/requires.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy -pandas -tensorflow -scikit-learn -wrapt>=1.10.11 diff --git a/FScanpy.egg-info/top_level.txt b/FScanpy.egg-info/top_level.txt deleted file mode 100644 index 729703a..0000000 --- a/FScanpy.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -FScanpy diff --git a/FScanpy/__init__.py b/FScanpy/__init__.py index f0a6751..a30b0ad 100644 --- a/FScanpy/__init__.py +++ b/FScanpy/__init__.py @@ -1,4 +1,5 @@ from .predictor import PRFPredictor +from . import data import pandas as pd import numpy as np from typing import Union, List, Dict @@ -7,13 +8,14 @@ __version__ = '0.3.0' __author__ = '' __email__ = '' -__all__ = ['PRFPredictor', 'predict_prf', '__version__', '__author__', '__email__'] +__all__ = ['PRFPredictor', 'predict_prf', 'plot_prf_prediction', 'data', '__version__', '__author__', '__email__'] def predict_prf( sequence: Union[str, List[str], None] = None, data: Union[pd.DataFrame, None] = None, window_size: int = 3, - gb_threshold: float = 0.1, + short_threshold: float = 0.1, + ensemble_weight: float = 0.4, model_dir: str = None ) -> pd.DataFrame: """ @@ -21,13 +23,18 @@ def predict_prf( Args: sequence: 单个或多个DNA序列,用于滑动窗口预测 - data: DataFrame数据,必须包含'399bp'列,用于区域预测 + data: DataFrame数据,必须包含'Long_Sequence'或'399bp'列,用于区域预测 window_size: 滑动窗口大小(默认为3) - gb_threshold: GB模型概率阈值(默认为0.1) + short_threshold: Short模型(HistGB)概率阈值(默认为0.1) + ensemble_weight: Short模型在集成中的权重(默认为0.4,Long权重为0.6) model_dir: 模型文件目录路径(可选) Returns: - pandas.DataFrame: 预测结果 + pandas.DataFrame: 预测结果,包含以下主要字段: + - Short_Probability: Short模型预测概率 + - Long_Probability: Long模型预测概率 + - Ensemble_Probability: 集成预测概率(主要结果) + - Ensemble_Weights: 权重配置信息 Examples: # 1. 单条序列滑动窗口预测 @@ -39,10 +46,13 @@ def predict_prf( >>> sequences = ["ATGCGTACGT...", "GCTATAGCAT..."] >>> results = predict_prf(sequence=sequences) - # 3. DataFrame区域预测 + # 3. 自定义集成权重比例 + >>> results = predict_prf(sequence=sequence, ensemble_weight=0.3) # 3:7 比例 + + # 4. DataFrame区域预测 >>> import pandas as pd >>> data = pd.DataFrame({ - ... '399bp': ['ATGCGT...', 'GCTATAG...'] + ... 'Long_Sequence': ['ATGCGT...', 'GCTATAG...'] # 或使用 '399bp' ... }) >>> results = predict_prf(data=data) """ @@ -53,20 +63,22 @@ def predict_prf( raise ValueError("必须提供sequence或data参数之一") if sequence is not None and data is not None: raise ValueError("sequence和data参数不能同时提供") + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") # 滑动窗口预测模式 if sequence is not None: if isinstance(sequence, str): # 单条序列预测 - return predictor.predict_full( - sequence, window_size, gb_threshold) + return predictor.predict_sequence( + sequence, window_size, short_threshold, ensemble_weight) elif isinstance(sequence, (list, tuple)): # 多条序列预测 results = [] for i, seq in enumerate(sequence, 1): try: - result = predictor.predict_full( - seq, window_size, gb_threshold) + result = predictor.predict_sequence( + seq, window_size, short_threshold, ensemble_weight) result['Sequence_ID'] = f'seq_{i}' results.append(result) except Exception as e: @@ -78,17 +90,23 @@ def predict_prf( if not isinstance(data, pd.DataFrame): raise ValueError("data参数必须是pandas DataFrame类型") - if '399bp' not in data.columns: - raise ValueError("DataFrame必须包含'399bp'列") + # 检查列名(支持新旧两种命名) + seq_column = None + if 'Long_Sequence' in data.columns: + seq_column = 'Long_Sequence' + elif '399bp' in data.columns: + seq_column = '399bp' + else: + raise ValueError("DataFrame必须包含'Long_Sequence'或'399bp'列") # 调用区域预测函数 try: - results = predictor.predict_region( - data['399bp'], gb_threshold) + results = predictor.predict_regions( + data[seq_column], short_threshold, ensemble_weight) # 添加原始数据的其他列 for col in data.columns: - if col not in ['399bp', '33bp']: + if col not in ['Long_Sequence', '399bp', 'Short_Sequence', '33bp']: results[col] = data[col].values return results @@ -96,14 +114,92 @@ def predict_prf( except Exception as e: print(f"警告:区域预测失败 - {str(e)}") # 创建空结果 + long_weight = 1.0 - ensemble_weight results = pd.DataFrame({ - 'GB_Probability': [0.0] * len(data), - 'CNN_Probability': [0.0] * len(data), - 'Voting_Probability': [0.0] * len(data) + 'Short_Probability': [0.0] * len(data), + 'Long_Probability': [0.0] * len(data), + 'Ensemble_Probability': [0.0] * len(data), + 'Ensemble_Weights': [f'Short:{ensemble_weight:.1f}, Long:{long_weight:.1f}'] * len(data) }) # 添加原始数据列 for col in data.columns: results[col] = data[col].values - return results \ No newline at end of file + return results + +def plot_prf_prediction( + sequence: str, + window_size: int = 3, + short_threshold: float = 0.65, + long_threshold: float = 0.8, + ensemble_weight: float = 0.4, + title: str = None, + save_path: str = None, + figsize: tuple = (12, 6), + dpi: int = 300, + model_dir: str = None +) -> tuple: + """ + 绘制序列PRF预测结果的移码概率图 + + Args: + sequence: 输入DNA序列 + window_size: 滑动窗口大小(默认为3) + short_threshold: Short模型(HistGB)过滤阈值(默认为0.65) + long_threshold: Long模型(BiLSTM-CNN)过滤阈值(默认为0.8) + ensemble_weight: Short模型在集成中的权重(默认为0.4,Long权重为0.6) + title: 图片标题(可选) + save_path: 保存路径(可选,如果提供则保存图片) + figsize: 图片尺寸(默认为(12, 6)) + dpi: 图片分辨率(默认为300) + model_dir: 模型文件目录路径(可选) + + Returns: + tuple: (pd.DataFrame, matplotlib.figure.Figure) 预测结果和图形对象 + + Examples: + # 1. 简单绘图 + >>> from FScanpy import plot_prf_prediction + >>> sequence = "ATGCGTACGT..." + >>> results, fig = plot_prf_prediction(sequence) + >>> plt.show() + + # 2. 自定义阈值和集成权重 + >>> results, fig = plot_prf_prediction( + ... sequence, + ... short_threshold=0.7, + ... long_threshold=0.85, + ... ensemble_weight=0.3, # 3:7 权重比例 + ... title="自定义权重的预测结果", + ... save_path="prediction_result.png" + ... ) + + # 3. 等权重组合 + >>> results, fig = plot_prf_prediction( + ... sequence, + ... ensemble_weight=0.5 # 5:5 等权重 + ... ) + + # 4. Long模型主导 + >>> results, fig = plot_prf_prediction( + ... sequence, + ... ensemble_weight=0.2 # 2:8 权重,Long模型主导 + ... ) + """ + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") + + predictor = PRFPredictor(model_dir=model_dir) + + return predictor.plot_sequence_prediction( + sequence=sequence, + window_size=window_size, + short_threshold=short_threshold, + long_threshold=long_threshold, + ensemble_weight=ensemble_weight, + title=title, + save_path=save_path, + figsize=figsize, + dpi=dpi + ) \ No newline at end of file diff --git a/FScanpy/data/__init__.py b/FScanpy/data/__init__.py index 23c5bdb..ef84f00 100644 --- a/FScanpy/data/__init__.py +++ b/FScanpy/data/__init__.py @@ -1,9 +1,120 @@ +""" +FScanpy数据模块 +提供测试数据访问和处理功能 +""" + import os -import pkg_resources +from pathlib import Path +from typing import List def get_test_data_path(filename: str) -> str: - return pkg_resources.resource_filename('FScanpy', f'data/test_data/{filename}') + """ + 获取测试数据文件的完整路径 + + Args: + filename: 测试数据文件名 + + Returns: + str: 文件的完整路径 + + Examples: + >>> from FScanpy.data import get_test_data_path + >>> blastx_file = get_test_data_path('blastx_example.xlsx') + >>> mrna_file = get_test_data_path('mrna_example.fasta') + >>> region_file = get_test_data_path('region_example.csv') + """ + current_dir = Path(__file__).parent + test_data_dir = current_dir / "test_data" + file_path = test_data_dir / filename + + if not file_path.exists(): + available_files = list_test_data() + raise FileNotFoundError( + f"测试数据文件不存在: {filename}\n" + f"可用的测试数据文件: {available_files}" + ) + + return str(file_path) -def list_test_data() -> list: - data_dir = pkg_resources.resource_filename('FScanpy', 'data/test_data') - return os.listdir(data_dir) \ No newline at end of file +def list_test_data() -> List[str]: + """ + 列出所有可用的测试数据文件 + + Returns: + List[str]: 测试数据文件名列表 + + Examples: + >>> from FScanpy.data import list_test_data + >>> files = list_test_data() + >>> print(files) + ['blastx_example.xlsx', 'mrna_example.fasta', 'region_example.csv'] + """ + try: + current_dir = Path(__file__).parent + test_data_dir = current_dir / "test_data" + + if not test_data_dir.exists(): + return [] + + files = [] + for file_path in test_data_dir.iterdir(): + if file_path.is_file() and not file_path.name.startswith('.'): + files.append(file_path.name) + + return sorted(files) + + except Exception: + return [] + +def print_test_data_info(): + """ + 打印测试数据的详细信息 + """ + print("📋 FScanpy 测试数据信息:") + print("=" * 50) + + try: + current_dir = Path(__file__).parent + test_data_dir = current_dir / "test_data" + + if not test_data_dir.exists(): + print("❌ 测试数据目录不存在") + return + + files = list_test_data() + if not files: + print("❌ 没有找到测试数据文件") + return + + print(f"📁 数据目录: {test_data_dir}") + print(f"📊 文件数量: {len(files)}") + print() + + file_descriptions = { + 'blastx_example.xlsx': '🧬 BLASTX比对结果示例 (1000条记录)', + 'mrna_example.fasta': '🧬 mRNA序列示例数据', + 'region_example.csv': '🎯 PRF区域验证数据 (含标签)' + } + + for filename in files: + file_path = test_data_dir / filename + size_mb = file_path.stat().st_size / (1024 * 1024) + + description = file_descriptions.get(filename, '📄 数据文件') + print(f" {description}") + print(f" 文件名: {filename}") + print(f" 大小: {size_mb:.2f} MB") + print(f" 路径: {file_path}") + print() + + print("🚀 使用示例:") + print(" from FScanpy.data import get_test_data_path") + print(" blastx_file = get_test_data_path('blastx_example.xlsx')") + print(" mrna_file = get_test_data_path('mrna_example.fasta')") + print(" region_file = get_test_data_path('region_example.csv')") + + except Exception as e: + print(f"❌ 获取数据信息时出错: {e}") + +# 导出主要函数 +__all__ = ['get_test_data_path', 'list_test_data', 'print_test_data_info'] \ No newline at end of file diff --git a/FScanpy/data/test_data/region_example.csv b/FScanpy/data/test_data/region_example.csv index a46f895..f86fb3d 100644 --- a/FScanpy/data/test_data/region_example.csv +++ b/FScanpy/data/test_data/region_example.csv @@ -1,8 +1,4 @@ FS_period,399bp,fs_position,DNA_seqid,label,source,FS_type,dataset gtgtgaacacaatagtgagtgacatactaaacg,ggatatgtaacatggacaagtcattgtgtaggtatccaagaccaatagcctttactttcaaagggaaagaagaattcaggaccttgtttggaggactcatctcgatgtcgattcaggtggtcattgtgctctatgcttatattatgctaaagataatgatagaacgtaatgacacatcaaaaagtgtgaacacaatagtgagtgacatactaaacgacaaatctccagtatctctcaatacaacagatttctcgtttgcatttgatgcttttattcttggcgatgataatttcgatttcaacaataaccaatacttcggaattgagctacttcaatggattaagcagccagatactggagaactatcatccactaatattccatatgaaagatgtggaa,16.0,MSTRG.18491.1,0,EUPLOTES,negative,EUPLOTES gtctcagaagagtctgaggaatatctccaagga,caaattaataacaaatatgaattccatcaacaacttttatggagacgagaacttatcagatgaacttctgagtgaagatgtcgtgtcttgagaagtaagaggatcagaaaagatcttgcataacatggggagaaagtctctcagtaataagaagcctttaagcggagtggagttggactgagagtctcagaagagtctgaggaatatctccaaggataaaatttgttcgcaaggaagatctatctttaggcagaagaagtcaaaatcttgtgatcaagtagaagaacctcttagtagtcttaaagataacatgagtcactttaatgacatagacttgcaagctagtaagcctctaaaatcagagattagcaatctttttgggtactcaactcagcccaa,16.0,MSTRG.4662.1,0,EUPLOTES,negative,EUPLOTES -cttacttgcaaacatgaatctaataaattagag,ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc,16.0,MSTRG.14742.1,0,EUPLOTES,negative,EUPLOTES -atagtagaagatacagtctccatggccagtaca,ttgttgataaaaatacaggattttatctccaattaaagtcgagaaaataagccaagtagcagtaaacgtagataaacgaataagcatggaaaacaataatagaatagaccaaattgaagagatttctcacaattcgtatttgaattactgttagatttaggaataacaaatcgaactcgacacatagtagaagatacagtctccatggccagtacaaaagccaccatgaagaagaggaaaaagaacagataaatgctatcaaagaaaataaagttaacgatcaatatacaagcatgattgttattaatttaaccaactttagataatcagattataatcagcgacgatcaaagcaaaaaggtaaagaacaaccaattcaaattgaacatcagaaaggga,16.0,CUFF.17967.2,0,EUPLOTES,negative,EUPLOTES -cctcgtctttgtctccagaaaataagaaaacaa,catcaataaatagagtcaatgttagaagtatgtctaaattcaaaccaaacgaaattctaaataaagcaagaatgccaacataattaatttagattaagcttgatagttcattagtactcaacaataaaaatatttcaaagggtaatattccagaatcaaaattaagaaataaaattattcctacctcgtctttgtctccagaaaataagaaaacaaataaatcagttatgttcgaaaatgttaaagagatggaaagccaggacaagtcgcaaaatacactaacacatttgaaagaaagcaataatggtagtccttccaaattttaaaactgaaaataatcttgcagatgtagttcgatctagagataataaagcttataacagtactctaaacttaaaa,16.0,CUFF.22392.1,0,EUPLOTES,negative,EUPLOTES -aaaaatgacaaagatctgaacattagttctttc,ttaattttgttctgatcacctaattgtaagcccaaaaacgatactcaaaagatgaggaaactttattggaacattaaaagtaatctcttgagtttatttatgctaactatttacatacgaagcttttacgaaacattccaattcttggctcttgctggcttatcagctacttggaacaacgacaaaaatgacaaagatctgaacattagttctttcatttttgccattgttttgttgtttttatgcacaggtttcttcttatggtcactctaccattactttggatcccgctctgacaatcctcgaaatctcaaaatctctcaggagtttacgaatggagcaaaggagaataatagcggtaaactatatccagtgcttggattgctgagaagaggtctc,16.0,MSTRG.9455.1,0,EUPLOTES,negative,EUPLOTES -agaagactgggagaactctcagatactatatct,agaagaagagaaggccaggagtagctcgaaagaggaggaatttaaggtttacccaaagaaccctatgactgactctaaagatgatcagtcggacactctccctccgaaatcttacagtgtaaagaaagccaatgtaggagaactaaacaagtacgattttgagatctcttattccaaataatgagaagactgggagaactctcagatactatatctgcaagtatgatgaatgaaggcgtaaatttaacaagacttggaactttattgatcacgctaggatacacacaggagagaagccttacaaatgtgagctgtgtggcaaagagtttgctcagaaggggaactacaacaaacacaggaatacccaccagcatagtgccaagaagacctcagtaatga,16.0,MSTRG.26803.1,0,EUPLOTES,negative,EUPLOTES \ No newline at end of file +cttacttgcaaacatgaatctaataaattagag,ttaagaaggcataagagttttgctaaaaataaagatttgaagaatattactactaagtttggcaagagtaaacagagaagaagtaccatttctggctctccgacaaaatcagtcagatgcccttctgcaaaaaagagcctaacagatagaccaagaagaggaggtatccttgccaggaagaatcttacttgcaaacatgaatctaataaattagagatgctgatgaacctcatctatcgtacaccgaatgtagacctgattgaaaataggatcgatggactgataagaagtaactctatattgaacaaagtcgagaagagagtagctcactccggcattaagacttacaggttttctcctaatttactgaagaagataattccaaagaagataaaattc,16.0,MSTRG.14742.1,0,EUPLOTES,negative,EUPLOTES \ No newline at end of file diff --git a/FScanpy/predictor.py b/FScanpy/predictor.py index b57542e..27a352b 100644 --- a/FScanpy/predictor.py +++ b/FScanpy/predictor.py @@ -24,163 +24,170 @@ class PRFPredictor: model_dir = resource_filename('FScanpy', 'pretrained') try: - # 加载模型 - self.gb_model = self._load_pickle(os.path.join(model_dir, 'GradientBoosting_all.pkl')) - self.cnn_model = self._load_pickle(os.path.join(model_dir, 'BiLSTM-CNN_all.pkl')) + # 加载模型 - 使用新的命名约定 + self.short_model = self._load_pickle(os.path.join(model_dir, 'short.pkl')) # HistGB模型 + self.long_model = self._load_pickle(os.path.join(model_dir, 'long.pkl')) # BiLSTM-CNN模型 # 初始化特征提取器和CNN处理器,使用与训练时相同的序列长度 - self.gb_seq_length = 33 # HistGradientBoosting使用的序列长度 - self.cnn_seq_length = 399 # BiLSTM-CNN使用的序列长度 + self.short_seq_length = 33 # HistGB使用的序列长度 + self.long_seq_length = 399 # BiLSTM-CNN使用的序列长度 # 初始化特征提取器和CNN输入处理器 - self.feature_extractor = SequenceFeatureExtractor(seq_length=self.gb_seq_length) - self.cnn_processor = CNNInputProcessor(max_length=self.cnn_seq_length) + self.feature_extractor = SequenceFeatureExtractor(seq_length=self.short_seq_length) + self.cnn_processor = CNNInputProcessor(max_length=self.long_seq_length) + + # 检测模型类型以优化预测性能 + self._detect_model_types() except FileNotFoundError as e: - raise FileNotFoundError(f"无法找到模型文件: {str(e)}") + raise FileNotFoundError(f"无法找到模型文件: {str(e)}。请确保模型文件 'short.pkl' 和 'long.pkl' 存在于 {model_dir}") except Exception as e: raise Exception(f"加载模型出错: {str(e)}") def _load_pickle(self, path): - return joblib.load(path) + """安全加载pickle文件""" + try: + return joblib.load(path) + except Exception as e: + raise FileNotFoundError(f"无法加载模型文件 {path}: {str(e)}") - def predict_single_position(self, fs_period, full_seq, gb_threshold=0.1): + def _detect_model_types(self): + """检测模型类型以优化预测性能""" + self.short_is_sklearn = hasattr(self.short_model, 'predict_proba') + self.long_is_sklearn = hasattr(self.long_model, 'predict_proba') + + def _predict_model(self, model, features, is_sklearn, seq_length): + """统一的模型预测方法""" + try: + if is_sklearn: + # sklearn模型使用特征向量 + if isinstance(features, np.ndarray) and features.ndim > 1: + features = features.flatten() + features_2d = np.array([features]) + pred = model.predict_proba(features_2d) + return pred[0][1] + else: + # 深度学习模型 + if seq_length == self.long_seq_length: + # 对于长序列,使用CNN处理器 + model_input = self.cnn_processor.prepare_sequence(features) + else: + # 对于短序列,转换为数值编码 + base_to_num = {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'N': 0} + seq_numeric = [base_to_num.get(base, 0) for base in features.upper()] + model_input = np.array(seq_numeric).reshape(1, len(seq_numeric), 1) + + # 统一的预测调用 + try: + pred = model.predict(model_input, verbose=0) + except TypeError: + pred = model.predict(model_input) + + # 处理预测结果 + if isinstance(pred, list): + pred = pred[0] + + if hasattr(pred, 'shape') and len(pred.shape) > 1 and pred.shape[1] > 1: + return pred[0][1] + else: + return pred[0][0] if hasattr(pred[0], '__getitem__') else pred[0] + + except Exception as e: + raise Exception(f"模型预测失败: {str(e)}") + + def predict_single_position(self, fs_period, full_seq, short_threshold=0.1, ensemble_weight=0.4): ''' 预测单个位置的PRF状态 Args: - fs_period: 33bp序列 (将根据gb_seq_length处理) - full_seq: 完整序列 (将根据cnn_seq_length处理) - gb_threshold: GB模型的概率阈值 (默认为0.1) + fs_period: 33bp序列 (short模型使用) + full_seq: 完整序列 (long模型使用) + short_threshold: short模型的概率阈值 (默认为0.1) + ensemble_weight: short模型在集成中的权重 (默认为0.4,long权重为0.6) Returns: dict: 包含预测概率的字典 ''' try: - # 处理序列长度 - if len(fs_period) > self.gb_seq_length: - fs_period = self.feature_extractor.trim_sequence(fs_period, self.gb_seq_length) - - # GB模型预测 - 确保输入是二维数组 - try: - gb_features = self.feature_extractor.extract_features(fs_period) - - # 检查特征结构并确保是一维数组 - if isinstance(gb_features, np.ndarray): - # 如果是多维数组,进行扁平化处理 - if gb_features.ndim > 1: - print(f"警告: 特征是{gb_features.ndim}维数组,进行扁平化处理") - gb_features = gb_features.flatten() - - # 明确将特征转换为二维数组,正确形状为(1, n_features) - gb_features_2d = np.array([gb_features]) - - # 再次检查维度 - if gb_features_2d.ndim != 2: - raise ValueError(f"处理后特征仍为{gb_features_2d.ndim}维,需要二维数组") - - gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1] - except Exception as e: - print(f"GB模型预测时出错: {str(e)}") - # 出错时设置概率为0 - gb_prob = 0.0 + # 验证权重参数 + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") - # 如果GB概率低于阈值,则跳过CNN模型 - if gb_prob < gb_threshold: + long_weight = 1.0 - ensemble_weight + + # 处理序列长度 + if len(fs_period) > self.short_seq_length: + fs_period = self.feature_extractor.trim_sequence(fs_period, self.short_seq_length) + + # Short模型预测 (HistGB) + try: + if self.short_is_sklearn: + short_features = self.feature_extractor.extract_features(fs_period) + short_prob = self._predict_model(self.short_model, short_features, True, self.short_seq_length) + else: + short_prob = self._predict_model(self.short_model, fs_period, False, self.short_seq_length) + except Exception as e: + print(f"Short模型预测时出错: {str(e)}") + short_prob = 0.0 + + # 如果short概率低于阈值,则跳过long模型 + if short_prob < short_threshold: return { - 'GB_Probability': gb_prob, - 'CNN_Probability': 0.0, - 'Voting_Probability': 0.0 + 'Short_Probability': short_prob, + 'Long_Probability': 0.0, + 'Ensemble_Probability': 0.0, + 'Ensemble_Weights': f'Short:{ensemble_weight:.1f}, Long:{long_weight:.1f}' } - # CNN模型预测 + # Long模型预测 (BiLSTM-CNN) try: - # 首先检查CNN模型的类型 - 通过尝试识别模型类型 - is_sklearn_model = False - - # 检测模型类型的方法 - if hasattr(self.cnn_model, 'predict_proba'): - # 这可能是一个scikit-learn模型 - is_sklearn_model = True - - if is_sklearn_model: - # 如果是sklearn模型 (如HistGradientBoostingClassifier),使用与GB相同的特征提取 - # 为CNN模型使用相同的特征提取方法,但从399bp序列中提取 - cnn_features = self.feature_extractor.extract_features(full_seq) - if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1: - cnn_features = cnn_features.flatten() - # 转为二维数组 - cnn_features_2d = np.array([cnn_features]) - cnn_pred = self.cnn_model.predict_proba(cnn_features_2d) - cnn_prob = cnn_pred[0][1] + if self.long_is_sklearn: + long_features = self.feature_extractor.extract_features(full_seq) + long_prob = self._predict_model(self.long_model, long_features, True, self.long_seq_length) else: - # 假设是深度学习模型,需要三维输入 - cnn_input = self.cnn_processor.prepare_sequence(full_seq) - # 尝试不同的预测方法 - try: - # 先尝试不带参数 - cnn_pred = self.cnn_model.predict(cnn_input) - except TypeError: - try: - # 再尝试带verbose参数 - cnn_pred = self.cnn_model.predict(cnn_input, verbose=0) - except Exception: - # 最后尝试将输入重塑为2D - reshaped_input = cnn_input.reshape(1, -1) - cnn_pred = self.cnn_model.predict(reshaped_input) - - # 处理预测结果 - if isinstance(cnn_pred, list): - cnn_pred = cnn_pred[0] - - # 提取概率值 - if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1: - cnn_prob = cnn_pred[0][1] - else: - cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0] + long_prob = self._predict_model(self.long_model, full_seq, False, self.long_seq_length) except Exception as e: - print(f"CNN模型预测时出错: {str(e)}") - # 出错时设置概率为0 - cnn_prob = 0.0 + print(f"Long模型预测时出错: {str(e)}") + long_prob = 0.0 - # 使用4:6的加权平均替代投票模型 + # 计算集成概率 try: - voting_prob = 0.4 * gb_prob + 0.6 * cnn_prob + ensemble_prob = ensemble_weight * short_prob + long_weight * long_prob except Exception as e: - print(f"计算加权平均时出错: {str(e)}") - # 出错时使用简单平均 - voting_prob = (gb_prob + cnn_prob) / 2 + print(f"计算集成概率时出错: {str(e)}") + ensemble_prob = (short_prob + long_prob) / 2 return { - 'GB_Probability': gb_prob, - 'CNN_Probability': cnn_prob, - 'Voting_Probability': voting_prob + 'Short_Probability': short_prob, + 'Long_Probability': long_prob, + 'Ensemble_Probability': ensemble_prob, + 'Ensemble_Weights': f'Short:{ensemble_weight:.1f}, Long:{long_weight:.1f}' } except Exception as e: raise Exception(f"预测过程出错: {str(e)}") - def predict_full(self, sequence, window_size=3, gb_threshold=0.1, plot=False): + def predict_sequence(self, sequence, window_size=3, short_threshold=0.1, ensemble_weight=0.4): """ - 预测完整序列中的PRF位点 + 预测完整序列中的PRF位点(滑动窗口方法) Args: sequence: 输入DNA序列 window_size: 滑动窗口大小 (默认为3) - gb_threshold: GB模型概率阈值 (默认为0.1) - plot: 是否绘制预测结果图表 (默认为False) + short_threshold: short模型概率阈值 (默认为0.1) + ensemble_weight: short模型在集成中的权重 (默认为0.4) Returns: - if plot=False: - pd.DataFrame: 包含预测结果的DataFrame - if plot=True: - tuple: (pd.DataFrame, matplotlib.figure.Figure) + pd.DataFrame: 包含预测结果的DataFrame """ if window_size < 1: raise ValueError("窗口大小必须大于等于1") - if gb_threshold < 0: - raise ValueError("GB阈值必须大于等于0") + if short_threshold < 0: + raise ValueError("short模型阈值必须大于等于0") + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") results = [] + long_weight = 1.0 - ensemble_weight try: # 确保序列为字符串并转换为大写 @@ -188,210 +195,207 @@ class PRFPredictor: # 滑动窗口预测 for pos in range(0, len(sequence) - 2, window_size): - # 提取窗口序列 - 使用与训练时相同的窗口大小 + # 提取窗口序列 fs_period, full_seq = extract_window_sequences(sequence, pos) if fs_period is None or full_seq is None: continue # 预测并记录结果 - pred = self.predict_single_position(fs_period, full_seq, gb_threshold) + pred = self.predict_single_position(fs_period, full_seq, short_threshold, ensemble_weight) pred.update({ 'Position': pos, 'Codon': sequence[pos:pos+3], - '33bp': fs_period, - '399bp': full_seq + 'Short_Sequence': fs_period, # 更清晰的命名 + 'Long_Sequence': full_seq # 更清晰的命名 }) results.append(pred) # 创建结果DataFrame results_df = pd.DataFrame(results) - # 如需绘图 - if plot: - # 创建图形 - fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), height_ratios=[2, 1]) - - # 绘制折线图 - ax1.plot(results_df['Position'], results_df['GB_Probability'], - label='GB模型', alpha=0.7, linewidth=1.5) - ax1.plot(results_df['Position'], results_df['CNN_Probability'], - label='CNN模型', alpha=0.7, linewidth=1.5) - ax1.plot(results_df['Position'], results_df['Voting_Probability'], - label='投票模型', linewidth=2, color='red') - - ax1.set_xlabel('序列位置') - ax1.set_ylabel('移码概率') - ax1.set_title('移码预测概率') - ax1.legend() - ax1.grid(True, alpha=0.3) - - # 准备热图数据 - positions = results_df['Position'].values - probabilities = results_df['Voting_Probability'].values - - # 创建热图矩阵 - heatmap_matrix = np.zeros((1, len(positions))) - heatmap_matrix[0, :] = probabilities - - # 绘制热图 - im = ax2.imshow(heatmap_matrix, aspect='auto', cmap='YlOrRd', - extent=[min(positions), max(positions), 0, 1]) - - # 添加颜色条 - cbar = plt.colorbar(im, ax=ax2) - cbar.set_label('移码概率') - - # 设置热图轴标签 - ax2.set_xlabel('序列位置') - ax2.set_title('移码概率热图') - ax2.set_yticks([]) - - # 调整布局 - plt.tight_layout() - - return results_df, fig - return results_df except Exception as e: raise Exception(f"序列预测过程出错: {str(e)}") - def predict_region(self, seq, gb_threshold=0.1): - ''' - 预测区域序列 + def plot_sequence_prediction(self, sequence, window_size=3, short_threshold=0.65, + long_threshold=0.8, ensemble_weight=0.4, title=None, save_path=None, + figsize=(12, 6), dpi=300): + """ + 绘制序列预测结果的移码概率图 Args: - seq: 399bp序列或包含399bp序列的DataFrame/Series - gb_threshold: GB模型概率阈值 (默认为0.1) + sequence: 输入DNA序列 + window_size: 滑动窗口大小 (默认为3) + short_threshold: Short模型(HistGB)过滤阈值 (默认为0.65) + long_threshold: Long模型(BiLSTM-CNN)过滤阈值 (默认为0.8) + ensemble_weight: Short模型在集成中的权重 (默认为0.4) + title: 图片标题 (可选) + save_path: 保存路径 (可选,如果提供则保存图片) + figsize: 图片尺寸 (默认为(12, 6)) + dpi: 图片分辨率 (默认为300) + + Returns: + tuple: (pd.DataFrame, matplotlib.figure.Figure) 预测结果和图形对象 + """ + try: + # 验证权重参数 + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") + + long_weight = 1.0 - ensemble_weight + + # 获取预测结果 - 使用新的方法名 + results_df = self.predict_sequence(sequence, window_size=window_size, + short_threshold=0.1, ensemble_weight=ensemble_weight) + + if results_df.empty: + raise ValueError("预测结果为空,请检查输入序列") + + # 获取序列长度 + seq_length = len(sequence) + + # 计算显示宽度 + prob_width = max(1, seq_length // 300) # 概率标记的宽度 + + # 创建图形,包含两个子图 + fig = plt.figure(figsize=figsize) + gs = fig.add_gridspec(2, 1, height_ratios=[0.15, 1], hspace=0.3) + + # 设置标题 + if title: + fig.suptitle(title, y=0.95, fontsize=12) + else: + fig.suptitle(f'序列移码概率预测结果 (权重 {ensemble_weight:.1f}:{long_weight:.1f})', y=0.95, fontsize=12) + + # 预测概率热图 + ax0 = fig.add_subplot(gs[0]) + prob_data = np.zeros((1, seq_length)) + + # 应用双重阈值过滤 + for _, row in results_df.iterrows(): + pos = int(row['Position']) + if (row['Short_Probability'] >= short_threshold and + row['Long_Probability'] >= long_threshold): + # 为每个满足阈值的位置设置概率值 + start = max(0, pos - prob_width//2) + end = min(seq_length, pos + prob_width//2 + 1) + prob_data[0, start:end] = row['Ensemble_Probability'] + + im = ax0.imshow(prob_data, cmap='Reds', aspect='auto', vmin=0, vmax=1, + interpolation='nearest') + ax0.set_xticks([]) + ax0.set_yticks([]) + ax0.set_title(f'预测概率热图 (Short≥{short_threshold}, Long≥{long_threshold})', + pad=5, fontsize=10) + + # 主图(条形图) + ax1 = fig.add_subplot(gs[1]) + + # 应用过滤阈值 + filtered_probs = results_df['Ensemble_Probability'].copy() + mask = ((results_df['Short_Probability'] < short_threshold) | + (results_df['Long_Probability'] < long_threshold)) + filtered_probs[mask] = 0 + + # 绘制条形图 + bars = ax1.bar(results_df['Position'], filtered_probs, + alpha=0.7, color='darkred', width=max(1, window_size)) + + # 设置x轴刻度 + step = max(seq_length // 10, 50) + x_ticks = np.arange(0, seq_length, step) + ax1.set_xticks(x_ticks) + ax1.tick_params(axis='x', rotation=45) + + # 设置标签和标题 + ax1.set_xlabel('序列位置 (bp)', fontsize=10) + ax1.set_ylabel('移码概率', fontsize=10) + ax1.set_title(f'移码概率分布 (集成权重 {ensemble_weight:.1f}:{long_weight:.1f})', fontsize=11) + + # 设置y轴范围 + ax1.set_ylim(0, 1) + + # 添加网格 + ax1.grid(True, alpha=0.3) + + # 添加阈值和权重说明 + info_text = (f'过滤阈值: Short≥{short_threshold}, Long≥{long_threshold}\n' + f'集成权重: Short:{ensemble_weight:.1f}, Long:{long_weight:.1f}') + ax1.text(0.02, 0.95, info_text, transform=ax1.transAxes, + fontsize=9, verticalalignment='top', + bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8)) + + # 确保所有子图的x轴范围一致 + for ax in [ax0, ax1]: + ax.set_xlim(-1, seq_length) + + # 调整布局 + plt.tight_layout() + + # 如果提供了保存路径,则保存图片 + if save_path: + plt.savefig(save_path, dpi=dpi, bbox_inches='tight') + # 同时保存PDF版本 + if save_path.endswith('.png'): + pdf_path = save_path.replace('.png', '.pdf') + plt.savefig(pdf_path, bbox_inches='tight') + print(f"图片已保存至: {save_path}") + + return results_df, fig + + except Exception as e: + raise Exception(f"绘制序列预测图时出错: {str(e)}") + + def predict_regions(self, sequences, short_threshold=0.1, ensemble_weight=0.4): + ''' + 预测区域序列(批量预测已知的399bp序列) + + Args: + sequences: 399bp序列或包含399bp序列的DataFrame/Series/list + short_threshold: short模型概率阈值 (默认为0.1) + ensemble_weight: short模型在集成中的权重 (默认为0.4) Returns: DataFrame: 包含所有序列预测概率的DataFrame ''' try: - # 如果输入是DataFrame或Series,转换为列表 - if isinstance(seq, (pd.DataFrame, pd.Series)): - seq = seq.tolist() + # 验证权重参数 + if not (0.0 <= ensemble_weight <= 1.0): + raise ValueError("ensemble_weight 必须在 0.0 到 1.0 之间") - # 如果输入是单个字符串,转换为列表 - if isinstance(seq, str): - seq = [seq] + # 统一输入格式 + if isinstance(sequences, (pd.DataFrame, pd.Series)): + sequences = sequences.tolist() + elif isinstance(sequences, str): + sequences = [sequences] results = [] - for i, seq399 in enumerate(seq): + for i, seq399 in enumerate(sequences): try: - # 从399bp序列中截取中心的33bp (GB模型使用) - seq33 = self._extract_center_sequence(seq399, target_length=self.gb_seq_length) + # 从399bp序列中截取中心的33bp (short模型使用) + seq33 = self._extract_center_sequence(seq399, target_length=self.short_seq_length) - # GB模型预测 - 确保输入是二维数组 - try: - gb_features = self.feature_extractor.extract_features(seq33) - - # 检查特征结构并确保是一维数组 - if isinstance(gb_features, np.ndarray): - # 如果是多维数组,进行扁平化处理 - if gb_features.ndim > 1: - print(f"警告: 序列 {i+1} 的特征是{gb_features.ndim}维数组,进行扁平化处理") - gb_features = gb_features.flatten() - - # 明确将特征转换为二维数组,正确形状为(1, n_features) - gb_features_2d = np.array([gb_features]) - - # 再次检查维度 - if gb_features_2d.ndim != 2: - raise ValueError(f"处理后特征仍为{gb_features_2d.ndim}维,需要二维数组") - - gb_prob = self.gb_model.predict_proba(gb_features_2d)[0][1] - except Exception as e: - print(f"GB模型预测序列 {i+1} 时出错: {str(e)}") - # 出错时设置概率为0 - gb_prob = 0.0 - - # 如果GB概率低于阈值,添加低概率结果 - if gb_prob < gb_threshold: - results.append({ - 'GB_Probability': gb_prob, - 'CNN_Probability': 0.0, - 'Voting_Probability': 0.0, - '33bp': seq33, - '399bp': seq399 - }) - continue - - # CNN模型预测 - try: - # 首先检查CNN模型的类型 - 通过尝试识别模型类型 - is_sklearn_model = False - - # 检测模型类型的方法 - if hasattr(self.cnn_model, 'predict_proba'): - # 这可能是一个scikit-learn模型 - is_sklearn_model = True - - if is_sklearn_model: - # 如果是sklearn模型 (如HistGradientBoostingClassifier),使用与GB相同的特征提取 - # 为CNN模型使用相同的特征提取方法,但从399bp序列中提取 - cnn_features = self.feature_extractor.extract_features(seq399) - if isinstance(cnn_features, np.ndarray) and cnn_features.ndim > 1: - cnn_features = cnn_features.flatten() - # 转为二维数组 - cnn_features_2d = np.array([cnn_features]) - cnn_pred = self.cnn_model.predict_proba(cnn_features_2d) - cnn_prob = cnn_pred[0][1] - else: - # 假设是深度学习模型,需要三维输入 - cnn_input = self.cnn_processor.prepare_sequence(seq399) - # 尝试不同的预测方法 - try: - # 先尝试不带参数 - cnn_pred = self.cnn_model.predict(cnn_input) - except TypeError: - try: - # 再尝试带verbose参数 - cnn_pred = self.cnn_model.predict(cnn_input, verbose=0) - except Exception: - # 最后尝试将输入重塑为2D - reshaped_input = cnn_input.reshape(1, -1) - cnn_pred = self.cnn_model.predict(reshaped_input) - - # 处理预测结果 - if isinstance(cnn_pred, list): - cnn_pred = cnn_pred[0] - - # 提取概率值 - if hasattr(cnn_pred, 'shape') and len(cnn_pred.shape) > 1 and cnn_pred.shape[1] > 1: - cnn_prob = cnn_pred[0][1] - else: - cnn_prob = cnn_pred[0][0] if hasattr(cnn_pred[0], '__getitem__') else cnn_pred[0] - except Exception as e: - print(f"CNN模型预测序列 {i+1} 时出错: {str(e)}") - # 出错时设置概率为0 - cnn_prob = 0.0 - - # 使用4:6的加权平均替代投票模型 - try: - voting_prob = 0.4 * gb_prob + 0.6 * cnn_prob - except Exception as e: - print(f"计算加权平均时出错: {str(e)}") - # 出错时使用简单平均 - voting_prob = (gb_prob + cnn_prob) / 2 - - results.append({ - 'GB_Probability': gb_prob, - 'CNN_Probability': cnn_prob, - 'Voting_Probability': voting_prob, - '33bp': seq33, - '399bp': seq399 + # 使用统一的预测方法 + pred_result = self.predict_single_position(seq33, seq399, short_threshold, ensemble_weight) + pred_result.update({ + 'Short_Sequence': seq33, + 'Long_Sequence': seq399 }) + results.append(pred_result) + except Exception as e: print(f"处理第 {i+1} 个序列时出错: {str(e)}") + long_weight = 1.0 - ensemble_weight results.append({ - 'GB_Probability': 0.0, - 'CNN_Probability': 0.0, - 'Voting_Probability': 0.0, - '33bp': self._extract_center_sequence(seq399, target_length=self.gb_seq_length) if len(seq399) >= self.gb_seq_length else seq399, - '399bp': seq399 + 'Short_Probability': 0.0, + 'Long_Probability': 0.0, + 'Ensemble_Probability': 0.0, + 'Ensemble_Weights': f'Short:{ensemble_weight:.1f}, Long:{long_weight:.1f}', + 'Short_Sequence': self._extract_center_sequence(seq399, target_length=self.short_seq_length) if len(seq399) >= self.short_seq_length else seq399, + 'Long_Sequence': seq399 }) return pd.DataFrame(results) @@ -424,4 +428,60 @@ class PRFPredictor: end = len(sequence) start = end - target_length - return sequence[start:end] \ No newline at end of file + return sequence[start:end] + + # 兼容性方法(向后兼容,但标记为废弃) + def predict_full(self, sequence, window_size=3, short_threshold=0.1, short_weight=0.4, plot=False): + """ + ⚠️ 已废弃:请使用 predict_sequence() 方法 + + 向后兼容的方法,内部调用新的 predict_sequence() + """ + import warnings + warnings.warn("predict_full() 已废弃,请使用 predict_sequence() 方法", DeprecationWarning, stacklevel=2) + + # 调用新方法并添加兼容性字段 + results_df = self.predict_sequence(sequence, window_size, short_threshold, short_weight) + + # 添加兼容性字段 + if 'Ensemble_Probability' in results_df.columns: + results_df['Voting_Probability'] = results_df['Ensemble_Probability'] + results_df['Weighted_Probability'] = results_df['Ensemble_Probability'] + if 'Ensemble_Weights' in results_df.columns: + results_df['Weight_Info'] = results_df['Ensemble_Weights'] + if 'Short_Sequence' in results_df.columns: + results_df['33bp'] = results_df['Short_Sequence'] + if 'Long_Sequence' in results_df.columns: + results_df['399bp'] = results_df['Long_Sequence'] + + if plot: + # 如果需要绘图,调用绘图方法 + _, fig = self.plot_sequence_prediction(sequence, window_size, 0.65, 0.8, short_weight) + return results_df, fig + + return results_df + + def predict_region(self, seq, short_threshold=0.1, short_weight=0.4): + """ + ⚠️ 已废弃:请使用 predict_regions() 方法 + + 向后兼容的方法,内部调用新的 predict_regions() + """ + import warnings + warnings.warn("predict_region() 已废弃,请使用 predict_regions() 方法", DeprecationWarning, stacklevel=2) + + # 调用新方法并添加兼容性字段 + results_df = self.predict_regions(seq, short_threshold, short_weight) + + # 添加兼容性字段 + if 'Ensemble_Probability' in results_df.columns: + results_df['Voting_Probability'] = results_df['Ensemble_Probability'] + results_df['Weighted_Probability'] = results_df['Ensemble_Probability'] + if 'Ensemble_Weights' in results_df.columns: + results_df['Weight_Info'] = results_df['Ensemble_Weights'] + if 'Short_Sequence' in results_df.columns: + results_df['33bp'] = results_df['Short_Sequence'] + if 'Long_Sequence' in results_df.columns: + results_df['399bp'] = results_df['Long_Sequence'] + + return results_df \ No newline at end of file diff --git a/FScanpy/pretrained/BiLSTM-CNN_all.pkl b/FScanpy/pretrained/long.pkl similarity index 100% rename from FScanpy/pretrained/BiLSTM-CNN_all.pkl rename to FScanpy/pretrained/long.pkl diff --git a/FScanpy/pretrained/GradientBoosting_all.pkl b/FScanpy/pretrained/short.pkl similarity index 100% rename from FScanpy/pretrained/GradientBoosting_all.pkl rename to FScanpy/pretrained/short.pkl diff --git a/FScanpy_Demo.ipynb b/FScanpy_Demo.ipynb new file mode 100644 index 0000000..4937334 --- /dev/null +++ b/FScanpy_Demo.ipynb @@ -0,0 +1,702 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FScanpy \n", + "\n", + "这个 Notebook 展示了如何使用 FScanpy 的真实测试数据进行完整的 PRF 位点预测分析,包括:\n", + "\n", + "## 🎯 完整工作流程\n", + "1. **加载测试数据** - 使用内置的真实测试数据\n", + "2. **FScanR 分析** - 从 BLASTX 结果识别潜在 PRF 位点\n", + "3. **序列提取** - 提取 PRF 位点周围的序列\n", + "4. **FScanpy 预测** - 使用机器学习模型预测概率\n", + "5. **结果可视化** - 使用内置绘图函数生成预测结果图表\n", + "6. **序列级预测演示** - 完整序列的滑动窗口分析\n", + "\n", + "## 📊 数据说明\n", + "- **blastx_example.xlsx**: 真实BLASTX比对结果\n", + "- **mrna_example.fasta**: 真实mRNA序列数据\n", + "- **region_example.csv**: 单独对某个位点进行预测的样本" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📦 环境准备和数据加载" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ 环境准备完成!\n", + "📋 可用的测试数据:\n" + ] + }, + { + "data": { + "text/plain": [ + "['blastx_example.xlsx', 'mrna_example.fasta', 'region_example.csv']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 导入必要的库\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# 导入FScanpy相关模块\n", + "from FScanpy import PRFPredictor, predict_prf, plot_prf_prediction\n", + "from FScanpy.data import get_test_data_path, list_test_data\n", + "from FScanpy.utils import fscanr, extract_prf_regions\n", + "\n", + "print(\"✅ 环境准备完成!\")\n", + "print(\"📋 可用的测试数据:\")\n", + "list_test_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 加载和探索测试数据\n", + "\n", + "首先加载 FScanpy 提供的真实测试数据,了解数据结构。" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📁 数据文件路径:\n", + " BLASTX数据: /mnt/lmpbe/guest01/FScanpy-package-main/FScanpy/data/test_data/blastx_example.xlsx\n", + " mRNA序列: /mnt/lmpbe/guest01/FScanpy-package-main/FScanpy/data/test_data/mrna_example.fasta\n", + " 验证区域: /mnt/lmpbe/guest01/FScanpy-package-main/FScanpy/data/test_data/region_example.csv\n", + "\n", + "🧬 BLASTX数据概览:\n", + " 数据形状: (1000, 14)\n", + " 列名: ['DNA_seqid', 'Pep_seqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qframe', 'sframe']\n", + " 唯一序列数: 704\n", + "\n", + "📊 BLASTX数据示例:\n", + " DNA_seqid Pep_seqid pident length evalue qframe\n", + "0 MSTRG.9998.1 CAMPEP_0196994412 68.27 104 1.000000e-33 2\n", + "1 MSTRG.9996.1 CAMPEP_0197017426 49.16 297 3.000000e-79 2\n", + "2 MSTRG.9994.1 CAMPEP_0197009206 98.31 354 0.000000e+00 2\n", + "3 MSTRG.9993.1 CAMPEP_0168331218 51.67 60 2.000000e-37 2\n", + "4 MSTRG.9993.1 CAMPEP_0168331218 45.45 88 2.000000e-37 3\n" + ] + } + ], + "source": [ + "# 获取测试数据路径\n", + "blastx_file = get_test_data_path('blastx_example.xlsx')\n", + "mrna_file = get_test_data_path('mrna_example.fasta')\n", + "region_file = get_test_data_path('region_example.csv')\n", + "\n", + "print(f\"📁 数据文件路径:\")\n", + "print(f\" BLASTX数据: {blastx_file}\")\n", + "print(f\" mRNA序列: {mrna_file}\")\n", + "print(f\" 验证区域: {region_file}\")\n", + "\n", + "# 加载BLASTX数据\n", + "blastx_data = pd.read_excel(blastx_file)\n", + "print(f\"\\n🧬 BLASTX数据概览:\")\n", + "print(f\" 数据形状: {blastx_data.shape}\")\n", + "print(f\" 列名: {list(blastx_data.columns)}\")\n", + "print(f\" 唯一序列数: {blastx_data['DNA_seqid'].nunique()}\")\n", + "\n", + "# 显示前几行\n", + "print(\"\\n📊 BLASTX数据示例:\")\n", + "display_cols = ['DNA_seqid', 'Pep_seqid', 'pident', 'length', 'evalue', 'qframe']\n", + "print(blastx_data[display_cols].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🎯 验证区域数据概览:\n", + " 数据形状: (3, 8)\n", + " 列名: ['FS_period', '399bp', 'fs_position', 'DNA_seqid', 'label', 'source', 'FS_type', 'dataset']\n", + " 数据来源: {'EUPLOTES': 3}\n", + "\n", + "📋 验证区域数据示例:\n", + " fs_position DNA_seqid label source FS_type\n", + "0 16.0 MSTRG.18491.1 0 EUPLOTES negative\n", + "1 16.0 MSTRG.4662.1 0 EUPLOTES negative\n", + "2 16.0 MSTRG.14742.1 0 EUPLOTES negative\n", + "\n", + "📈 标签分布:\n", + "label\n", + "0 3\n", + "Name: count, dtype: int64\n", + "\n", + "🔬 FS类型分布:\n", + "FS_type\n", + "negative 3\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# 加载验证区域数据\n", + "region_data = pd.read_csv(region_file)\n", + "print(f\"🎯 验证区域数据概览:\")\n", + "print(f\" 数据形状: {region_data.shape}\")\n", + "print(f\" 列名: {list(region_data.columns)}\")\n", + "print(f\" 数据来源: {region_data['source'].value_counts().to_dict()}\")\n", + "\n", + "print(\"\\n📋 验证区域数据示例:\")\n", + "display_cols = ['fs_position', 'DNA_seqid', 'label', 'source', 'FS_type']\n", + "print(region_data[display_cols].head())\n", + "\n", + "# 统计分析\n", + "print(f\"\\n📈 标签分布:\")\n", + "print(region_data['label'].value_counts())\n", + "print(f\"\\n🔬 FS类型分布:\")\n", + "print(region_data['FS_type'].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. FScanR 分析 - 从 BLASTX 识别潜在 PRF 位点\n", + "\n", + "使用 FScanR 算法分析 BLASTX 结果,识别潜在的程序性核糖体移码位点。" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 运行FScanR分析...\n", + "参数设置: mismatch_cutoff=10, evalue_cutoff=1e-5, frameDist_cutoff=10\n", + "\n", + "✅ FScanR分析完成!\n", + "检测到的潜在PRF位点数量: 24\n", + "\n", + "📊 FScanR结果概览:\n", + " 列名: ['DNA_seqid', 'FS_start', 'FS_end', 'Pep_seqid', 'Pep_FS_start', 'Pep_FS_end', 'FS_type', 'Strand']\n", + " 涉及的序列数: 16\n", + " 链方向分布: {'+': 16, '-': 8}\n", + " FS类型分布: {1: 16, -1: 7, -2: 1}\n", + "\n", + "🎯 FScanR结果示例:\n", + " DNA_seqid FS_start FS_end Pep_seqid Pep_FS_start \\\n", + "0 MSTRG.9380.1 3797 3802 CAMPEP_0197017206 1137 \n", + "1 MSTRG.9431.1 4136 4192 CAMPEP_0197016790 657 \n", + "3 MSTRG.9432.1 848 904 CAMPEP_0197016790 753 \n", + "4 MSTRG.9582.1 302 304 CAMPEP_0197003180 214 \n", + "5 MSTRG.961.1 1536 1533 CAMPEP_0197017908 590 \n", + "\n", + " Pep_FS_end FS_type Strand \n", + "0 1138 1 + \n", + "1 675 1 + \n", + "3 2 1 - \n", + "4 214 1 + \n", + "5 19 -1 - \n" + ] + } + ], + "source": [ + "# 运行FScanR分析\n", + "print(\"🔍 运行FScanR分析...\")\n", + "print(\"参数设置: mismatch_cutoff=10, evalue_cutoff=1e-5, frameDist_cutoff=10\")\n", + "\n", + "fscanr_results = fscanr(\n", + " blastx_data,\n", + " mismatch_cutoff=10,\n", + " evalue_cutoff=1e-5,\n", + " frameDist_cutoff=100\n", + ")\n", + "\n", + "print(f\"\\n✅ FScanR分析完成!\")\n", + "print(f\"检测到的潜在PRF位点数量: {len(fscanr_results)}\")\n", + "\n", + "if len(fscanr_results) > 0:\n", + " print(f\"\\n📊 FScanR结果概览:\")\n", + " print(f\" 列名: {list(fscanr_results.columns)}\")\n", + " print(f\" 涉及的序列数: {fscanr_results['DNA_seqid'].nunique()}\")\n", + " print(f\" 链方向分布: {fscanr_results['Strand'].value_counts().to_dict()}\")\n", + " print(f\" FS类型分布: {fscanr_results['FS_type'].value_counts().to_dict()}\")\n", + " \n", + " print(\"\\n🎯 FScanR结果示例:\")\n", + " print(fscanr_results.head())\n", + "else:\n", + " print(\"⚠️ 未检测到PRF位点,可能需要调整参数\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 序列提取 - 获取 PRF 位点周围序列\n", + "\n", + "从 mRNA 序列中提取 FScanR 识别的 PRF 位点周围的序列片段。" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📝 从mRNA序列中提取PRF位点周围序列...\n", + "\n", + "✅ 序列提取完成!\n", + "成功提取的序列数量: 24\n", + "\n", + "📏 序列长度验证:\n", + " 399bp序列长度分布: {399: 24}\n", + " 平均长度: 399.0\n", + "\n", + "🧬 提取序列示例:\n", + "序列 1: MSTRG.9380.1\n", + " FS位置: 3797-3802\n", + " 链方向: +\n", + " FS类型: 1\n", + " 序列片段: AAGGAGTTTGAAGAAGAACAGGAAAAACAAGAGAAAGAGAGAAAGGAGAA...NNNNNNNNNNNNNNNNNNNN\n", + "\n", + "序列 2: MSTRG.9431.1\n", + " FS位置: 4136-4192\n", + " 链方向: +\n", + " FS类型: 1\n", + " 序列片段: CAAGTATCTGAGTGGGAGGGAGACACAGGTGTTGATCAAACCCCATTCCC...ATAATGACGGAGGCTTCAGA\n", + "\n", + "序列 3: MSTRG.9432.1\n", + " FS位置: 848-904\n", + " 链方向: -\n", + " FS类型: 1\n", + " 序列片段: AGAAAGGATGGTACTGAAAATCAACGAAGTACTTTCACATTTTAGAAAGA...GCTGAGAACGATATTGACAA\n", + "\n" + ] + } + ], + "source": [ + "# 提取PRF位点周围的序列\n", + "if len(fscanr_results) > 0:\n", + " print(\"📝 从mRNA序列中提取PRF位点周围序列...\")\n", + " \n", + " prf_sequences = extract_prf_regions(\n", + " mrna_file=mrna_file,\n", + " prf_data=fscanr_results\n", + " )\n", + " \n", + " print(f\"\\n✅ 序列提取完成!\")\n", + " print(f\"成功提取的序列数量: {len(prf_sequences)}\")\n", + " \n", + " if len(prf_sequences) > 0:\n", + " print(f\"\\n📏 序列长度验证:\")\n", + " seq_lengths = prf_sequences['399bp'].str.len()\n", + " print(f\" 399bp序列长度分布: {seq_lengths.value_counts().to_dict()}\")\n", + " print(f\" 平均长度: {seq_lengths.mean():.1f}\")\n", + " \n", + " print(\"\\n🧬 提取序列示例:\")\n", + " for i, row in prf_sequences.head(3).iterrows():\n", + " print(f\"序列 {i+1}: {row['DNA_seqid']}\")\n", + " print(f\" FS位置: {row['FS_start']}-{row['FS_end']}\")\n", + " print(f\" 链方向: {row['Strand']}\")\n", + " print(f\" FS类型: {row['FS_type']}\")\n", + " print(f\" 序列片段: {row['399bp'][:50]}...{row['399bp'][-20:]}\")\n", + " print()\n", + " else:\n", + " print(\"❌ 序列提取失败\")\n", + "else:\n", + " print(\"⚠️ 跳过序列提取 - 无FScanR结果\")\n", + " prf_sequences = pd.DataFrame()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. FScanpy 预测 - 机器学习模型分析\n", + "\n", + "使用 FScanpy 的机器学习模型对提取的序列进行 PRF 概率预测。" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🤖 FScanpy预测器初始化完成\n", + "\n", + "🎯 对 24 个FScanR识别的序列进行预测...\n", + "\n", + "📊 FScanR+FScanpy预测结果:\n", + " DNA_seqid FS_start FS_type Short_Probability Long_Probability \\\n", + "0 MSTRG.9380.1 3797 1 0.239192 0.087024 \n", + "1 MSTRG.9431.1 4136 1 0.326807 0.356356 \n", + "2 MSTRG.9432.1 848 1 0.310908 0.159746 \n", + "3 MSTRG.9582.1 302 1 0.272451 0.223354 \n", + "4 MSTRG.961.1 1536 -1 0.263269 0.046773 \n", + "\n", + " Ensemble_Probability \n", + "0 0.147891 \n", + "1 0.344536 \n", + "2 0.220211 \n", + "3 0.242993 \n", + "4 0.133372 \n" + ] + } + ], + "source": [ + "# 初始化预测器\n", + "predictor = PRFPredictor()\n", + "print(\"🤖 FScanpy预测器初始化完成\")\n", + "\n", + "# 对FScanR识别的序列进行预测\n", + "if len(prf_sequences) > 0:\n", + " print(f\"\\n🎯 对 {len(prf_sequences)} 个FScanR识别的序列进行预测...\")\n", + " \n", + " fscanr_predictions = predictor.predict_regions(\n", + " sequences=prf_sequences['399bp'],\n", + " ensemble_weight=0.4 # 平衡配置\n", + " )\n", + " \n", + " # 合并结果\n", + " fscanr_predictions = pd.concat([\n", + " prf_sequences.reset_index(drop=True),\n", + " fscanr_predictions.reset_index(drop=True)\n", + " ], axis=1)\n", + " \n", + " print(\"\\n📊 FScanR+FScanpy预测结果:\")\n", + " result_cols = ['DNA_seqid', 'FS_start', 'FS_type', 'Short_Probability', 'Long_Probability', 'Ensemble_Probability']\n", + " print(fscanr_predictions[result_cols].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🧪 对 3 个验证区域进行预测...\n", + "\n", + "📊 验证区域预测结果:\n", + " DNA_seqid label source Short_Probability Long_Probability \\\n", + "0 MSTRG.18491.1 0 EUPLOTES 0.368610 0.144442 \n", + "1 MSTRG.4662.1 0 EUPLOTES 0.229811 0.053352 \n", + "2 MSTRG.14742.1 0 EUPLOTES 0.454152 0.345118 \n", + "\n", + " Ensemble_Probability \n", + "0 0.234109 \n", + "1 0.123936 \n", + "2 0.388732 \n" + ] + } + ], + "source": [ + "# 对验证区域数据进行预测\n", + "print(f\"\\n🧪 对 {len(region_data)} 个验证区域进行预测...\")\n", + "\n", + "validation_predictions = predict_prf(\n", + " data=region_data.rename(columns={'399bp': 'Long_Sequence'}),\n", + " ensemble_weight=0.4\n", + ")\n", + "\n", + "print(\"\\n📊 验证区域预测结果:\")\n", + "result_cols = ['DNA_seqid', 'label', 'source', 'Short_Probability', 'Long_Probability', 'Ensemble_Probability']\n", + "print(validation_predictions[result_cols].head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 序列级预测和可视化\n", + "\n", + "选择一个具体的mRNA序列,使用内置的plot_prf_prediction函数进行完整的滑动窗口预测和可视化。" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🧬 选择演示序列: MSTRG.9127.1\n", + "序列长度: 256 bp\n", + "序列前100bp: TGGCCTTCTTACTTGGAAGTCCCCAAGGATCATCTTGGCCATCCTTGCTTTCTTCATGGCTAGATTCTACCTCCTCCCATAATTGTGTGAAACAAGTAAC...\n", + "\n", + "🎯 使用plot_prf_prediction进行序列预测和可视化...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mnt/lmpbe/guest01/FScanpy-package-main/FScanpy/predictor.py:335: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.\n", + " plt.tight_layout()\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 39044 (\\N{CJK UNIFIED IDEOGRAPH-9884}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 27979 (\\N{CJK UNIFIED IDEOGRAPH-6D4B}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 27010 (\\N{CJK UNIFIED IDEOGRAPH-6982}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 29575 (\\N{CJK UNIFIED IDEOGRAPH-7387}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 28909 (\\N{CJK UNIFIED IDEOGRAPH-70ED}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 22270 (\\N{CJK UNIFIED IDEOGRAPH-56FE}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 31227 (\\N{CJK UNIFIED IDEOGRAPH-79FB}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 30721 (\\N{CJK UNIFIED IDEOGRAPH-7801}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 20998 (\\N{CJK UNIFIED IDEOGRAPH-5206}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 24067 (\\N{CJK UNIFIED IDEOGRAPH-5E03}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 38598 (\\N{CJK UNIFIED IDEOGRAPH-96C6}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 25104 (\\N{CJK UNIFIED IDEOGRAPH-6210}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 26435 (\\N{CJK UNIFIED IDEOGRAPH-6743}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 37325 (\\N{CJK UNIFIED IDEOGRAPH-91CD}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 24207 (\\N{CJK UNIFIED IDEOGRAPH-5E8F}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 21015 (\\N{CJK UNIFIED IDEOGRAPH-5217}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 20301 (\\N{CJK UNIFIED IDEOGRAPH-4F4D}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 32622 (\\N{CJK UNIFIED IDEOGRAPH-7F6E}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 36807 (\\N{CJK UNIFIED IDEOGRAPH-8FC7}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 28388 (\\N{CJK UNIFIED IDEOGRAPH-6EE4}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 38408 (\\N{CJK UNIFIED IDEOGRAPH-9608}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 20540 (\\N{CJK UNIFIED IDEOGRAPH-503C}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 30340 (\\N{CJK UNIFIED IDEOGRAPH-7684}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 32467 (\\N{CJK UNIFIED IDEOGRAPH-7ED3}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 26524 (\\N{CJK UNIFIED IDEOGRAPH-679C}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 65288 (\\N{FULLWIDTH LEFT PARENTHESIS}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 26465 (\\N{CJK UNIFIED IDEOGRAPH-6761}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 24418 (\\N{CJK UNIFIED IDEOGRAPH-5F62}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n", + "/home/guest01/.conda/envs/tf200/lib/python3.9/site-packages/IPython/core/pylabtools.py:152: UserWarning: Glyph 65289 (\\N{FULLWIDTH RIGHT PARENTHESIS}) missing from font(s) Liberation Sans.\n", + " fig.canvas.print_figure(bytes_io, **kw)\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABRcAAALmCAYAAADYLKN3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB3P0lEQVR4nOzdd5hV1b0/4M8MvYmAHUUS7IgRG/ZCxBqCKd6AsSCxBcHY9UaNJvYoMYJ6FQtKNGo0Yosltti9RpMIErFiQ6IIDIq0YWZ+f/DjXMcBxA0yDL7v8+TJnHXWPuu798w65/hh7b3LampqagIAAAAA8BWV13cBAAAAAEDDJFwEAAAAAAoRLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAU0ri+CwAAGpa//e1vufnmm2u1XXXVVRk9enQuv/zyWu3nnXdeOnTosMDXOeigg/L8889n0KBBGTx48AL79OnTJ+PGjcv555+fH/7wh6X2v/zlL/nTn/6UN954IxUVFWndunU23XTTHHnkkdlmm22SJD179syECRMWuS/zx55fy+eVlZWlbdu22WabbXLMMcdk/fXXr7P9gw8+mD/96U8ZO3ZsPv3007Rq1Spdu3bNj3/84+y7776LHDtJ/vnPf+aSSy7JSy+9lKZNm2bnnXfOL3/5yzrHbMqUKTnllFPyxBNP5Oqrr87OO+9c6/nKysrccMMNGTVqVCZMmJBVV101u+++e44++ui0bt36S4/HD37wg1xwwQWLrPX666/PxRdfnF69euWSSy5ZZN/bb789Dz30UOlxhw4dct555yVJTj311IwaNapW/yZNmmTttddO7969c/jhh6dp06ZJkmHDhuWyyy6r8/otW7bMJptskp/97Gfp2bNnqX1h/ef78Y9/nKOPPjq//vWva7Uff/zx6dy5c4455pha7f379892222XE044IdOnTy+1f+9730vv3r1z3nnn5Z133im1b7fddunfv3+GDx+eF198sdS+wQYb5IQTTljocVlac+qLvurrzp49e4HHZsMNN1ys8QCAbybhIgDwlbz//vs544wzsvbaaydJLrzwwiTJ5MmTM2DAgPTo0SNJcuONN2bmzJmLfK2WLVtm1KhRGTRoUMrKymo9N27cuLz77rt1trniiity2WWXZdCgQTnjjDPSsmXLvPvuu7nqqqvys5/9LCNHjkz37t1z++23p6qqqrTd97///fTo0SOnnXZarfHn69q1a6666qrS46qqqrz11lu55JJLcsABB+Suu+7KWmutlSSpqanJqaeemvvvvz8DBgzI8ccfn5VXXjkfffRR7rnnnpxwwgn5+9//nrPOOmuh+z5u3LgcfPDB2XHHHXPLLbeksrIyp5xySo466qjceuutKS+fd4LJ888/nxNOOCFt2rRZ6GtdeOGFue2223LmmWdmq622yssvv5wzzjgjH330UYYMGZIkdY5HkkydOjU/+clPst122y30tSsqKnLqqadm7Nixadas2UL7fd6bb75Z61jO/xuZr3379rn77rtLjz/55JM8++yzufjii/Pmm2/md7/7Xa3+jz76aClwrKmpyX/+85/84Q9/yMCBA3PZZZdl9913X2j/z2vRokU++OCD7LnnnqWw+rHHHktFRUXmzp2bTTfdtBR0v/baa3n55ZeTJKuttlrpOH722We57rrrkiSNGjVa4H5OnTp1ge0LOy5Lc0593ld93RkzZizw2AAALIpwEQCoN1tvvXWeeOKJPPfcc3UCrlGjRmXrrbfO448/Xqv9xhtvzL777puBAweW2tZaa61sscUWOfDAA/Ovf/0r3bt3T/v27WttV15enubNm2fVVVddYC2NGzeu89waa6yRLl26ZOedd86f/vSnHHvssUmSP/7xj7nzzjszfPjw7LLLLqX+HTt2TPfu3bPOOuvk2muvzaGHHpp11113geNdd911admyZYYMGVIKOS+55JL06dMnTzzxRHbdddckycUXX5yDDjoom222WQ455JA6rzN9+vTccsstOeqoo0qhUKdOnfLqq6/mqquuyplnnpmVVlqpzvGYP94GG2yQ73//+wusMUnuvffezJgxI3feeWf233//hfb7KsrLy2sd61VXXTVdunTJlClTcvnll+fkk0/OGmusUXp+lVVWqRVsrrbaarnwwgvz8ssv57rrrqsTLn6xPwAAXx/XXAQA6k379u3TvXv33HHHHbXa586dm3vuuafWKa/zzZo1K3PmzKnT3rRp0/zpT3/KoYceulRrXH311dO+ffv85z//KbWNGDEiO++8c61g8fP69++fJ554YqHBYpKMHTs2m266aa3VkxtttFE6duyYp556qtR20UUX5YgjjqizsnO+Vq1a5YknnsiAAQNqta+22mqpqalZ6Eq30aNHZ9SoUTnttNMW+tpJsssuu2TEiBGLfSrukthoo42SJB988MGX9i0vL88GG2xQ6/fyTXXqqafm1FNPre8yAIBvKOEiAFCvvve97+Whhx6qdU27J598Mp988kn23HPPOv133nnnPPDAAzn++OPz97//fYFB49I0ZcqUTJ06tXRK9MSJE/Pee+8tNFhM5l2vcf5pzQvTuHHjNGrUqE57+/bta13Hb1EB5fyx2rdvXyukTOadGrzGGmtk9dVXX+B2Q4cOzc4775zNNttska+/zjrrLLDOr8Pbb7+dJFlzzTUXq/9bb71V+r0AAFA/hIsAQL3aZ599Mnfu3PzlL38ptY0aNSo77rhj2rVrV6f/2Wefnb333jv33XdfDjzwwGy99dbp379/rr/++qV+fbj3338/p5xySlq0aJEf//jHSZIPP/wwyeIHYAvzrW99K//+978zd+7cUtvs2bPz9ttv57PPPlui177xxhvz1FNP5cQTT1zg86+88kqefPLJHHHEEUs0ztJSWVmZp556Ktddd1322GOPLz2206ZNy+9+97u89tprOeigg5ZRlQAALIhrLgIA9apdu3bZaaedcscdd+QnP/lJKioq8uijj9a5Cch8bdq0ye9///t88MEHefzxx/P3v/89zz//fJ599tlcccUVueqqq9K9e/evXMeYMWNqbVdVVZXZs2dnq622yvXXX19aITd/RWJ1dXWt7UePHl3nmoi9e/fOb37zmwWOd+CBB+b+++/Peeedl+OOOy6VlZU599xzU15ensaNi39Fu/7663PBBRfkqKOOSu/evRfY54YbbkjXrl2z5ZZbFh5nSUyePLnWsZ49e3YaN26cPn36LPD03m233bbW4xkzZqRz58658MILF7i69Yv95zv11FML/W0sjz6/H/NX7z744IOlts+H9QAAXyfhIgBQ777//e/n2GOPzZtvvpnnnnsuTZo0WeD1Fj9vrbXWSr9+/dKvX79UV1fnr3/9a0477bScddZZueuuu75yDRtuuGEuvfTS0uNHHnkkF110UU488cR85zvfqTVukrz33nu1tt9oo41y5513lh6feOKJizxle6uttsqFF16Ys88+OzfffHOaN2+eQw45JNtuu+2XnlK9IDU1Nbnoooty3XXX5YQTTsjhhx++wH6VlZV55JFH0r9//688xtKy8sor59Zbby09nn8znQXd4TlJbrvttjRp0iTJvNPSf/azn+VHP/pR9ttvvy/t/3nt27fPxIkTl3wHlgOf/1u7+OKLk6TWStXVVlttWZcEAHxDCRcBgHrXs2fPtGnTJvfdd1+efvrp9OrVKy1atFhg308++SQrrbRSrbby8vLstdde+cc//pEbb7wxNTU1i7xJyYI0bdq01vUN+/fvn/vvvz+nn356Ro0aVQq+VllllWywwQb561//WusmKl/cvnnz5l865n777Zd99tknkydPTocOHdK0adPsvffe6dOnz1eqPZkXMI0cOTK//e1vF3n35+effz6ffPJJ6W7U9aFRo0Zfei3Jz1tnnXVKd39ed911c/DBB+eyyy7LHnvskc6dOy+y/xetKOHi549fq1at6rQBACwrrrkIANS7Zs2aZc8998x9992Xf/3rXws9nfevf/1rtt566zz77LMLfP7999/Pqquu+pWDxQUpLy/PWWedlfHjx+fKK6+s9dwRRxyRf/7zn/nzn/+8wG2nT59eujbjwrz22mv585//nKZNm2bNNddM06ZNM3bs2IwfPz69evX6SrWOGjUq1113XS6++OJFBotJ8txzz6VFixbZZJNNvtIYy5NBgwalXbt2OeOMM1JTU1Pf5QAAfKMJFwGA5UKfPn3y1ltvpUOHDtl+++0X2GfXXXdN9+7d84tf/CLXX399XnnllXzwwQf55z//mV/96ld55JFHMmjQoKVWU9euXdOvX78MHz48b7zxRqm9d+/eOeSQQ3L66afnN7/5Tf71r39l4sSJ+fe//53rr78+++67b6ZNm5Yf/vCHpW2GDBmSAw44oPR44sSJ+eUvf5lLL7007733Xp577rkcd9xx+a//+q906dIlybzrPk6aNCmTJk3KtGnTksxbuTm/LZl3/cELLrgg++yzT7bccsvSc/P/N2vWrFr79NZbb2XttddeaAC71157ZcSIEaXHFRUVpdeafx3Khb32stKqVav88pe/zPPPP5/bbrutXmoAAGAep0UDAF/ZaaedVjrt980338wpp5ySJPnd736XlVdeOUkyYcKEr3Tq7dZbb52OHTumZ8+eadSo0QL7NG3aNCNGjMgf/vCH3H333fmf//mfTJ8+PSuttFI222yzXHPNNdlpp52WaN++6Nhjj82DDz6Y0047LTfffHPpeoi//OUvs/POO+emm27KwIED88knn6R169b59re/nZ/+9Kfp169f2rRpU3qdSZMm5Z133ik93mWXXXLOOefk2muvzTXXXJMOHTrkRz/6UX7+85+X+kycODHf/e53a9VzwgknlH5+9dVX8/LLL6eioiL33ntv7r333jr1n3/++bVCzmnTpqV169YL3d/x48dn8uTJpceDBw/O888/X3r8n//8J4888sgCX/vzjjzyyNLPS/su3kmy5557Zuedd85FF12UXXfd9StdY/CGG24o3fxk8uTJOemkk5Ik9957b15++eUkyWeffVbat2effba0P1VVVaVrcL722mu19nN+DZMnT67V/vm/54UdlyWZUxdccMFC9/Wrvu7Cjg0AwMKU1TiXBAAAAAAowGnRAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAgAbq4Ycfzt57752ZM2cuVv877rgjO+yww9dcFcvK3//+9+yyyy6ZMmVKfZcCAHyDNa7vAgAAGoIHHnggDz74YOnxsccemw8//DA333xzqe3ggw9OmzZtcvnll5favve976V79+45++yzS23bb7999t9//xx33HGlto033jhHHHFEzj777FJYtPrqq+fUU09dYD2TJk3KaaedlquvvjotWrRIkjz++OO5+uqr89prr2XGjBlZc801s//+++fwww9PWVnZ0jkQX3D77benZ8+ead++/Zf2ra6uzqWXXpp77703n3zySTbbbLOcddZZWWeddRbYf9y4cbngggvy8ssvp2XLltlzzz1z0kknpWnTpl861rBhw/Lkk0/mT3/601fep6/bhAkT8utf/zovvfRSWrZsmX322ScnnHBCyssX/O/+N998c66//vp89NFH6dSpUwYPHpzdd989W2+9dXr16pUzzjij1t8cAMCyJFwEAFgM7777bi655JIkyf/+7/9m6tSpef/99/PLX/4yq666at5///289NJLWWWVVXLIIYdk8803T5IMHz48G264YXbffffsu+++pbbk/wLFz7etvvrqOeOMM2q1Lci1116bbt26ZbPNNkuS/Otf/8rgwYNz7rnnZvfdd0/Tpk3zz3/+M7/4xS9SU1OTI488cqkfk6qqqlxwwQXp3r172rdvn9GjR6dbt24LDTJvuumm3HPPPbn66quz+uqr55JLLsnRRx+du+66q842n332WQ477LD86Ec/yvDhw/Pee+/l8MMPT7t27TJw4MClvi9Ly+TJkzNjxoyFBqZJMnjw4HTt2jUPP/xwJk+enCOPPDKrrLJKDj300Dp9H3zwwQwZMiRXXXVVNttss9x555059thjc//992edddbJEUccke9+97sZO3Zsunbt+nXuGgDAAjktGgCggZk7d25uu+22/Nd//Vep7fnnn8/aa6+d3r17p0WLFmnUqFG22mqrDB06NFtvvXWt7R966KF897vfTbdu3XLyySensrIyybyVhZdffnl69eqVzTbbLD/4wQ/y7LPPlrbr2bNn/ud//iff/e53c+aZZ2abbbbJp59+mj59+uSyyy7LFVdckV69emX48OH5+OOP69R96623pn///unSpUtat26d4447Lm+++WZeeumlOn0nT56cnXbaKYMHD07Tpk3TpUuX7LnnnnnhhReWyjGcNm1aTj755Oy4447p3r17jjjiiLz//vtJkvfffz8bbrhhnn766ey3337ZfPPN07dv39LzSXLFFVdk6623znbbbZfrr78+hx56aIYNG5aKior07ds3P/vZz/LXv/41c+fOrTXumDFjMm7cuJx44olp06ZNOnfunP79++fWW29dYJ2zZs3K8ccfny233DJNmjTJ/vvvn1atWuVf//pXkmS11VbLbrvtlltuuWWpHBcAgK9KuAgA0MCMGTMmn332WbbZZptS27e+9a2MHz8+t912W+bMmVNq33LLLbPFFluUHn/22Wd58cUXc8899+TWW2/Nfffdl8ceeyzJvJWFt912Wy677LK88MIL6d27dwYOHJjJkyeXtv/LX/6S6667LmeddVbuuuuuJMldd92VQYMG5corr8ywYcPy/vvvZ999980xxxyTZ555JjU1NZk1a1beeOONbLLJJqXXat26ddZdd92MGTOmzj526tQp559/fho3/r8TbSZOnJjVV199KRzB5PTTT8+kSZNy991358knn0zz5s1z7LHH1uozcuTIXHXVVfnb3/6WGTNm5JprrkkyL5y98sor8z//8z955JFH8uabb2bs2LFJki5duuSxxx7Lj370o9x4443Zddddc8kll5SCybFjx6Zjx45p27ZtaZyuXbtm/PjxmT59ep06+/TpkwMOOKD0+JNPPslnn31W6zj06NEjzz333FI5LgAAX5VwEQCggXnjjTey+uqrZ+WVVy617b777hkwYEB+/etfp0ePHjn00EMzfPjwTJgwoda2s2fPzuDBg9OyZctssskm+fa3v53x48cnmXf9xAMOOCAbbrhhmjZtmgEDBqRFixb529/+Vtp+p512yrrrrrvQU5833njj/OY3v8mjjz6aHXbYIRdffHH69euXadOmpaamplaoliRt27bN1KlTv3SfH3nkkTz22GMZMGDAYh6lhauoqMhDDz2UY489Nu3bt0/r1q1zzDHHZMyYMXnvvfdK/fr161c6zjvuuGPefPPNJPOubbnjjjtmq622SsuWLXPyySdn1qxZpe2aNm2affbZJyNHjswf/vCHzJ49O/vvv39uvvnmVFRUZKWVVqpzDJJ86XGoqanJ6aefnu985zu1guX1118/7777bq0aAACWFeEiAEADM3Xq1DohXVlZWU466aQ8/fTTOffcc9O5c+fccsst2WOPPXLnnXeW+rVr1y6tWrUqPW7evHlppeP777+fLl261HrdTp061QooO3bsuFg1VldXZ86cOZkzZ06aNGlSaq+pqVns/Zzvr3/9a0488cT89re/zfrrr/+Vt/+iDz74IDU1NbX2tVOnTklSa1/XXnvt0s8tWrTI7Nmzk8y7mc7nj8P805sXpLKyMnPmzEl1dXUaNWqUpNgxqKyszIknnpg33ngjl156aa3n2rVrl+TLw0kAgK+DG7oAADRAC1s52LZt2+yzzz7ZZ599UlNTk1/96le58MILs99++y1yuyS1Tqde2FjzA7KFGT16dG655ZY88sgj2WmnnXLWWWdlq622yuzZs1NeXp6Kiopa/SsqKtKhQ4eFvt6tt96aiy++OMOGDcuOO+64yLEX18L2M6m9rws7VtXV1bVO105S607Pc+bMyf33359bbrkl//nPf/LjH/84d999d1ZfffX86U9/WuAxKCsrW+gdt2fNmpWBAwdm5syZuemmm0ph4hfrLBJaAgAsKSsXAQAamHbt2tUJqK655ppapy8n80KnHXfcMbNmzVqs4KlTp0556623So/nzp2bd955Z5F3Pv68ww8/PKeccko22GCDPPjgg7n44ouz1VZbJUmaNWuW9ddfv3RtwmTe9QPffffd0h2vv+iBBx7IJZdckpEjRy61YDFJaX8+v6/zf56/gnFROnTokA8++KD0ePr06aVTy994443suuuuuf/++3PkkUfmkUceydFHH126RuKmm26aiRMnZsqUKaXtx4wZk/XWW6/WitL5ampqctxxx6Vx48a5/vrr6wSLSUqvtbBwEgDg6yRcBABoYNZbb7189NFHmTZtWqltxowZOe200/L4449n1qxZqa6uzquvvprhw4enZ8+ei1yxOF+fPn3yxz/+MW+++WbmzJmTK6+8MlVVVenZs+cC+zdv3jxJ8vbbb2f69Ok5/vjjc//996d///61rgc5X79+/TJy5Mi8+eabmT59ei6++OJsvPHG6datW5JkyJAhueCCC5Ikn376ac4666xcdNFF2XjjjRc4/l577VXo7tEdOnTIjjvumEsvvTQVFRWZNm1afv/736dHjx5Zc801v3T7bbfdNk888URGjx6dWbNm5be//W3pWLRv3z533HFHrrzyyuy66661VjQmySabbJJu3bplyJAhmT59et58882MGDEi/fr1W+B+3XPPPaVToZs1a7bAel5//fV06tSpVAMAwLLktGgAgMVQWVmZ4447LkkyefLkHH/88UmSM888M82aNcvMmTPTu3fvJMmll15aCtfmB2N/+MMf8vDDDydJaRXbvffem1deeSVJStclfOaZZ0ptn376aY444og6tXTr1i0tW7bM888/n169eiVJBg8enLZt2+aSSy7Je++9lzlz5mSNNdbI3nvvnYEDBy7WPg4YMCBTp07N4Ycfnk8++SQbb7xxRo4cWecGJPOtssoq2XPPPfOLX/wiffv2zVNPPZV33323Tr+OHTvmoYceSt++fTNp0qQcdNBB+eyzz9KjR49cdtllpX6TJk0qXdfwkUceydSpUxdY+/y7S48fP36RNzEZPXp0Kbic74c//GF+/etf58ILL8yvf/3r7L333ikvL892222X888//8sPUpLvf//7efnll3PwwQenbdu2OeaYY/LKK6+krKws48ePz0EHHbTA7QYOHJhBgwZl6NChOeOMM7LDDjukdevW6du3b607Qo8fPz4zZsxIkvz5z3/OhAkTat3AJZkXBJ9zzjlJkueffz7bbrvtYtUOALC0ldW4OAsAQINzwQUX5K233srw4cPru5R6c+mll2a33XZb6GnVX6c5c+akadOmpce77bZbBg4cmP3333+Z1jFp0qT07Nkzt9xyS7p27bpMxwYASJwWDQDQIP3sZz/LSy+9VFrF903097//PRtttFG9jLv11ltn9OjRqaqqyh133JFJkyZlu+22W+a1XH311dl5550FiwBAvbFyEQCggXr44YczZMiQjBo1yvX2lrHrr78+I0eOzJQpU7LOOuvkF7/4RXbfffdlWsMLL7yQE044IaNGjXIzFwCg3ggXAQAAAIBCnBYNAAAAABSyQt4teu7cuZk2bVqaNWuW8nL5KQAAAAB8FdXV1Zk9e3batm2bxo0XHiGukOHitGnT8vbbb9d3GQAAAADQoHXu3DkdOnRY6PMrZLjYrFmzJPN2vunoR+s832iznZd1ScutuX8YUqet8UEnLN62N/6u7rYHHr/ENX0dqp6+u05box2+Xw+VwPKt6l+P1WlrtPluX++Yj91ed8zdfvy1jtnQVD10c63HjXr1W7ztHrixTlujvQ780u3m/s+v67Q1/vmZizXm3Jsvrbttv1986XZV/1zA53X3nos35h9/X3fMA4798jH/cn3dMfftv1hjFv1cqXrijrrb7fzDxRvzf++ru22Pfb50u7l3Dq/T1ni/IxZvzGfvrTvmdt/78u3+/Wzd7TZZvDspVz23gP3c9sv3s+h2SVL19wfrbrv1nl++3TP31N1u+96LNWZR9fE+3dB88e9vsf/2XlnA3+3Gy/4O4Ivri3+3i/M3myRVY5+u09ao6w5LpaaFjlnwvaS+fPG9erHfpx+7rU5bo932Xyo1LU+qXvt7nbZGG2xdD5XA129JvqNWvfxU3W033fHLtxv3fN3tNtpmscZcqlq0ycyZM/P222+XcraFWSHDxfmnQrdo0SLNqmbVeb5RC3dTnG/uJx/XaWu8mMdnSbZd1qrmTK/T5u8A6qqqh/fMqtmfLvMxG5qqWZ/Uery4x6dq5rQ6bYuz7dypH9ZpW+zPhk8nF9q2qmpmnbbF3c+in0dVMyoKj1n0c6VqdvHPo6rKGYW2nfvZ1Dpti/v7rJrzWaExq6pnF9ouSaoqC45ZcLt52xY7tkWPz5Koj/fphuaLf3+L/XdQPadO2/J8bL/4d7v4+1l8fhZVH3NlSXzxvXqxj+2sb8Z3mqqayjptK+J+QrJk31GLvt9W1Swnn0ctW5Z+/LJLDrogIQAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAACikcX0X8HWorq5OksycOTNVjZrXeb7RzFnLuqTl1tyVVqnT1ngxj8+SbLusVTVtXafN3wHUVR/vmVXN2izzMRuaquYr1Xq8uMenqkXbOm2Ls+3cdqvXaVvsz4Y2HQptW9WoRZ22xd3Pop9HVS1XLjxm0c+VqmbFP4+qmrQstO3cVu3qtC3u77OqaatCY1aVNyu0XZJUNSk4ZsHt5m1b7NgWPT5LwnfbL/fFv7/F/jsob1qnbXk+tl/8u138/Sw+P4uqj7myJL74Xr3Yx7b5N+M7TVVZkzptK+J+QrJk31GLvt9WlS0vn0dNMnPmzCT/l7MtTFlNTU3NsihpWZo8eXLefvvt+i4DAAAAABq0zp07p0OHuosI5lshw8W5c+dm2rRpadasWcrLnfkNAAAAAF9FdXV1Zs+enbZt26Zx44Wf/LxChosAAAAAwNfPsj4AAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhwkUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAwGI588wz87Of/SzV1dX1Xcpyb/To0enRo0def/31+i4FAOBrVVZTU1NT30UAAHzTDRs2LK+99lrKysqSJOuvv34GDx6cAw88MO3bt0+STJgwIZdeemkmTJiQoUOHpkOHDqXthw4dmrPOOitTpkxJksycOTN77713fvjDH2bvvffO+uuvnyR54403ct999+WOO+7Ifffdl5YtWyZJOnTokDPPPHOh9d1555256KKLcu+99+a1115bpuPPnDkzF154YZ544olUVFRkvfXWy6BBg7LzzjsvtN7p06fnt7/9bR588MHMnj07G220UX75y19ms802W+g2I0eOzJ/+9KdMmDAha665Zvbff/8ceuihC+3/eX//+99z8MEHZ+DAgRk8eHCS5Lrrrsstt9ySu+66Ky1atFis1wEAaGga13cBAADMc8kll6Rx43lfz4YNG5Yk2XLLLXPcccclSe64445S36OPPjrbb799rb5t27bNWWedlSR5//338/zzzydJ9thjj9JrzO+bJGeccUbWXXfdOu1fNGPGjFx88cU5/PDD065du2U+/m9+85uMHj06V199ddZee+2MGjUqAwcOzF133ZUuXboscJvjjz8+lZWVufPOO9OmTZtcffXVueSSS3LttdemvLzuyTt33nlnLrnkkgwbNizbbLNNRo8enSOPPDKtW7fO/vvvv9DakuSzzz7LqaeemlatWtVqP/DAA/OHP/whN9xwQ4466qhFvgYAQEPltGgAABbprrvuyqeffpqf/OQny3zsadOm5Z577snRRx+dLl26pFmzZunbt2/WW2+9/PGPf1zgNqNHj85TTz2Vc889N2uuuWZat26d4447LiNGjCgFi1dccUV69uxZ2mbkyJHp06dPdtxxxzRt2jRbbbVVfvSjH2XkyJFfWuP555+fjTfeOBtvvHGt9qZNm+anP/1pRowYkaqqqiU4CgAAyy/hIgAAi/TEE09k6623rpdTe8eOHZvKyspsvvnmtdo322yz/Otf/1rgNs8++2zWXHPN/O1vf8vuu++ebbbZJocffnjefvvtUp+BAwfm0UcfTZLMmTMn48aNW+AYr7/+ej777LOF1vf444/n4YcfLq3Y/KIdd9wxFRUVGTNmzJftKgBAgyRcBABgkcaNG5dNNtmkXsaefw3Htm3b1mpv165d6bkvmjhxYj7++OOMHTs2d9xxR+6+++7MnTs3RxxxRObMmVOnf0VFRaqqqhY4Rk1NzULHqaioyGmnnZZf/epXWWWVVRbYZ4MNNkijRo3yyiuvfOm+AgA0RMJFAAAWacqUKaVrLTYENTU1mTNnTk477bSstNJKWWONNfLLX/4y77zzTl544YWlNs6vf/3rbLHFFtlnn30W2qe8vDxt27ZdaEAJANDQuaELAADLrfl3pK6oqKh1w5SpU6cudLXgaqutliZNmpTuRJ0knTp1SpJ89NFHdfqvvPLKady4cSoqKmq1T506NWVlZbXuij3fX/7ylzz33HP5y1/+8pX3CQBgRWLlIgAAi9S+fftMnTq1XsbedNNN07Rp07z00ku12v/xj3+ke/fuC9xmww03zOzZs/Paa6+V2t55550kydprr12nf9OmTdO1a9c613B88cUXs9FGG9UKKee75ZZb8tlnn2XvvfdOjx490qNHj/zjH//INddckx/84AelftXV1Zk2bVqDWvkJAPBVCBcBAFikDTfcsN6uGdimTZv86Ec/ymWXXZa33norM2fOzLXXXpt33303P/3pT5MkH374Yfbaa6/SKc+77LJL1ltvvfzmN7/JRx99lClTpuSCCy7IxhtvnC222GKB4/Tv3z933XVXnn766cyZMydPP/10Ro0alUMPPbTU55BDDsmIESOSJJdeemn++te/5q677ir9b9NNN03fvn0zfPjw0javvfZaqqqq6txJGgBgReG0aACA5cRxxx2XsrKyJMn666+fZN7quWOOOSZJMmHChGyzzTZJkssvvzy33HJLre2nTZtW6jtz5szsvffeSZK//vWvGT9+fJLkjTfeyODBg5MkZ599dmlV3oJO/Z1vl112yYUXXphZs2alefPmy3z8X/7yl/ntb3+bAw88MJ9++mk23njjXHvttVl33XWTJJWVlRk/fnxmzJiRJGnSpEmuueaanHPOOdlrr71SU1OTnXbaKRdeeGHKy+f92/oVV1yR22+/vXTH6H322SeffPJJzjrrrEycODFrrrlmTjnllPTp06dUx3vvvZfJkycnmbea84uaNm2a1q1bZ9VVVy21PfXUU1l55ZWz2WabLXT/AAAasrKampqa+i4CAIDl14wZM9KrV68ceeSROfjgg+u7nAajsrIye+65Z/bff//8/Oc/r+9yAAC+Fk6LBgBgkVq2bJnjjz8+w4cPr3PTExbupptuSnl5eQ455JD6LgUA4Gtj5SIAAIvlV7/6VSZOnJjhw4eXTt9mwUaPHp3DDz88f/jDH7LBBhvUdzkAAF8b4SIAAAAAUEi9nxb95JNPZvvtt89xxx23yH7V1dW55JJL8t3vfjdbb711fvazn+W9995bRlUCAAAAAF9Ur+Hi1VdfnXPOOad0p79Fuemmm3LPPfdk+PDheeyxx9K5c+ccffTRsfASAAAAAOpH4/ocvFmzZrn99ttz7rnnZvbs2Yvse+utt6Z///7p0qVLkuS4445Ljx498tJLL2XzzTev1Xfu3LmZNm1amjVrlvLyel+cCQAAAAANSnV1dWbPnp22bdumceOFR4j1Gi4efPDBi9Vv1qxZeeONN7LJJpuU2lq3bp111103Y8aMqRMuTps2LW+//fZSrBQAAAAAvnk6d+6cDh06LPT5eg0XF9e0adNSU1OTtm3b1mpv27Ztpk6dWqd/s2bNkszb+ebNmydJampqMn369LRu3drdDaEBMoehYTOHoWEzh6HhM4+hYauPOTxr1qy8/fbbpZxtYRpEuDjf4l5fcf6p0C1atEjLli1L286dOzetWrXyRgoNkDkMDZs5DA2bOQwNn3kMDVt9zOH543zZJQcbxAUJV1555ZSXl6eioqJWe0VFxSKXZQIAAAAAX58GES42a9Ys66+/fsaOHVtq++STT/Luu+9ms802q8fKAAAAAOCba7kNFz/88MPstddeee+995Ik/fr1y8iRI/Pmm29m+vTpufjii7PxxhunW7du9VwpAAAAAHwz1es1F+cHg3Pnzk2SPPzww0mSMWPGpLKyMuPHj8+cOXOSJH379s2kSZNy0EEH5bPPPkuPHj1y2WWX1U/hAAAAAED9hotjxoxZ6HNrr712Xn311dLjsrKyHHPMMTnmmGOWRWkAAAAAwJdYbk+LBgAAAACWb8JFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUEjj+i6gIZkzZ04efvjhPPbYY5k0aVKqqqrquyQagKZNm2a99dbLXnvtle7du9d3OQAAAABLjXBxMc2ZMycnnHBCnnzyyay//vpZa621Ul5u4Sdfbs6cOXnkkUfypz/9Kaeeemp+8pOf1HdJAAAAAEuFcHEx3XPPPXnqqadyxhln5Dvf+U59l0MDU1NTk2uvvTa//e1v893vfjerrLJKfZcEAAAAsMQsvVtMjz76aDbddFPBIoWUlZXlJz/5SWpqavL444/XdzkAAAAAS4VwcTFNnDgxnTp1qu8yaMDatGmTDh065D//+U99lwIAAACwVAgXF1N1dbVrLLLEGjVq5EZAAAAAwArDNReXwJ///Oe89dZbSZK11147/fr1y69//eu0bNky1dXV6d27d6ZPn146DbampiYnn3xyrrvuukyePDlJ0rVr1+yzzz6LHKempiY333xznnzyyVRXV6eysjJdunTJUUcdlQ4dOuShhx7K/fffn9/97ndLvE8PPPBAevXqlUaNGi2y34wZM3LppZfmjTfeSHl5eXbbbbcccMABC+x733335d57701lZWVWWmmlDBw4MF26dFnk6y/NfVpS7777boYNG5apU6emadOmOeigg7LddtvV6VdZWZlrrrkm//rXvzJ37tx8+9vfzjHHHJM2bdrUQ9UAAAAAXz/h4hJo2rRpTjrppCTzbviSJFtssUV69+6dDz/8MO+8806mTZuWQYMGpUWLFqU+q666agYMGFBru0W599578/zzz+eiiy5K69atM3fu3Fx55ZUZMmRIzjvvvKW2P1VVVbnmmmvSs2fPTJs2LXfccUf23nvvdOzYsU7fG264IU2bNs3VV1+dWbNm5YQTTsi3vvWtOqHbyy+/nD/+8Y8ZOnRo2rdvn1GjRuWSSy7JZZddttTqXlL/+Mc/8vbbb6dXr14LDAIvuOCC9OnTJ3vuuWfefffdnHTSSdlwww3Tvn37Wv1uu+22TJgwIZdddlnKy8tz/vnn549//GOOPPLIZbUrAAAAAMuUcHE58cwzz+TJJ5/MKaecUue59957L506dUrr1q2TJI0bN85hhx2WysrKWv1GjhyZZ555JnPmzMmgQYOyxRZbpKamJrfeemsee+yxJMlqq62WgQMHZs0118xNN92Ujz76KBMnTsxGG22Uf//735k5c2YGDRqU448/PmussUbOPffctG/fPvvss0969OhRWtH4+OOP5+yzz055eXlatmyZnj175m9/+1udcLFdu3Y59dRTS0Hclltumeuvvz41NTUpKysrdKw+++yzXHXVVXnllVdSXl6erl275ogjjkjz5s1z6qmnZptttskLL7yQCRMmpGPHjjnjjDPSokWLvPTSS/n973+fxo0bZ5tttslbb72Vnj17Zuutt86YMWMyePDgbLbZZtlnn32y0UYbJUnefvvtTJo0Kb169UqSdOrUKRtuuGGeeeaZfO9736tV1+abb56dd945TZo0SZJ07949Tz/9dKF9BAAAAGgIXERwObH99tsvMFhMkm222SZ/+9vfMnTo0Dz//PP57LPP0rx581qr7N5+++107949V155Zb73ve/lxhtvTDIvtHzkkUdy0UUX5aqrrspGG22U3//+96Xt/vd//zcnnnhiBgwYUFqFedlll2WjjTbK9773vVxxxRXZf//987e//S2HHXZYbrnllnzyySf59NNPs8Yaa5ReZ6211sqECRPq1N6xY8dsuummpcfPPfdcNtxww8LBYjIvRJ01a1auvPLKXH755fnoo49y++23l55/+umnc8YZZ+Taa6/NlClT8vTTT6eqqipDhgzJIYcckquvvjqdOnXKK6+8kiRZeeWVS+2bb755rr766hxzzDF5+umnM2HChKy++uq1rre55pprLnBfN9lkk6y99tq19nXjjTcuvJ8AAAAAyzvhYgOw1VZb5bzzzsuMGTPy+9//Pn379s3pp59eut5jMm9FYrdu3ZIk3/72t/Pxxx8nmRce7rbbbllppZWSJPvuu29efvnlzJo1K0nSpUuXrLbaaosc/zvf+U6OP/749OzZM7fffntmz56dZN5p4fM1bdq09JoL87//+78ZNWpUBg4c+BWPQN3X6d27dxo1apTGjRtnzz33zIsvvlh6ftttt02LFi3SuHHjdOrUKZMmTcqECRNSUVGRnXbaKUnSq1evtGjRotbrNmnSJD179sx///d/Z5VVVslDDz2U2bNnl1YiztesWbMv3dfrr78+U6dOzY9//OMl2lcAAACA5ZnTohuITTfdtLQC8O23386tt96aM888M9dff32SpGXLlqW+5eXlpTsST506NRtssEHpufmrHadNm5YkpdBxYSZMmJC//OUveeqpp7L11lvnwgsvTPPmzZMks2fPTrNmzZIkc+bMKbUvyIMPPpibbropv/nNb9K5c+evsOd1VVRUpG3btqXHbdq0SUVFRelxq1atSj83atQo1dXVmT59elq1alU6rbu8vDwdOnSo9bqjR4/OvffemzfeeCO777579txzz4wbN64Ups43e/bsOsHkfFVVVbn88sszfvz4nHfeeQvtBwAAALAiEC42AC+++GI22GCDUjDYuXPnDBw4MH379q0Vqi1Iu3bt8umnn5Yez/955ZVXXuR2n332Wc4///xMmTIle++9d6688spaAWbbtm0zYcKEUjj5/vvvp1OnTgt8rYceeii33357Lrzwwqy55ppftrtfauWVV84nn3xSevzpp5+mXbt2i9ymZcuWmTVrVqqrq1NeXp6amppMmTIlSfL666/nkksuSbt27bLvvvvmlFNOKYWQ66yzTj788MNUVVWV2iZMmJAddthhgeMMHTo006ZNy/nnn7/IsBUAAABgReC06AbgrrvuyvDhwzNz5swkSXV1dR555JGsvfbadVbffVGPHj3y6KOPZvr06UmSu+++O1tssUVpxeHnzQ/P5vft27dvrrjiivTu3btWsJgkPXv2zKhRo1JdXZ1p06blwQcfTM+ePeu85sSJEzNixIicffbZSyVYnL9Pf/nLX1JdXZ3Kysrcf//92XbbbRe5TceOHdO8efM899xzSZJHHnmkdGpz8+bNc9ppp+Xcc8/N9ttvXzoOybwbuHTs2DH33ntvkmTcuHF5/fXXs/3229cZ47HHHss777yT0047TbAIAAAAfCNYubgEPvzww1x00UVJ/u9U3EcffbR0Ku0ee+yRZN5qtvLy8kyZMiW9e/fO66+/XtpurbXWSrLou0Wfcsopuf7663PMMcekrKwsc+fOzYYbbpizzjrrS2vcYYcdMmHChJx44ompqanJWmutlV/84hcL7Nu+fft069YtRx55ZI488shce+21dfo0b948I0aMyIEHHphhw4bliCOOSHl5efbZZ59stdVWSZJ77rknH374YQ477LDcf//9mT17ds4888xar/OrX/0qzZo1y2mnnZbLLrusznUNk2T8+PE58sgja7Wde+65OeSQQ3LllVfm5z//eZJ5d2Xeb7/9FnkcmjRpksGDB+eaa67JjTfemO233z6dO3dOWVlZ3nrrrVx55ZV1ttliiy1y0kkn5ZRTTsmwYcNy7733plmzZjn55JNLp2UPGTIkW2+9dXbeeefcfffd+eijjzJo0KDSa7Ru3TpDhgxZZG0AAAAADVVZTU1NTX0XsbTNmDEjr7zySjbeeOPSiruamppMmzYtbdu2LXSn4v322y/dunXLIYccsrTL/UY755xzctpppy3R3aO/ipqamtJYhx9+eA477LD06NFjmYydJIMGDco+++yTY445ZpmNuaJY0jkM1C9zGBo2cxgaPvMYGrb6mMMLytcWxGnR1Ju5c+dmp512WmaT4qSTTsr999+fZN7pzZMnT8566623TMYGAAAAWBE5LZp607hx4+yyyy7LbLwjjzwyw4YNy6hRo9KoUaMce+yxX3rNSgAAAAAWTrjIN8Z6662XSy+9tL7LAAAAAFhhCBeXwJ///Oe89dZbSZK11147/fr1y69//eu0bNky1dXV6d27d6ZPn57HH388ybzz408++eRcd911mTx5cpKka9eu2WeffRY5Tk1NTW6++eY8+eSTpTskd+nSJUcddVQ6dOiQhx56KPfff39+97vfLfE+PfDAA+nVq1etOybP9/TTT+fGG29MZWVl2rdvn8GDB2edddap06+qqirXXHNNnnnmmTRq1Cg77LBDBgwY8KWnPx966KE5+uijSzeGWZbefffdDBs2LFOnTk3Tpk1z0EEHZbvttlvkNsOHD8+zzz6bESNGLKMqAQAAAJYvwsUl0LRp05x00klJ5t0hOZl3h+HevXvnww8/zDvvvJNp06Zl0KBBadGiRanPqquumgEDBtTablHuvffePP/887nooovSunXrzJ07N1deeWWGDBmS8847b6ntz/xQsGfPnnXCxY8//jiXXnppLrrooqy77rp58MEH89vf/jbDhg2r8zp//vOf88477+Saa67J3Llzc9555+Xtt9/Ot771raVW69J2wQUXpE+fPtlzzz3z7rvv5qSTTsqGG26Y9u3bL7D/uHHj8vzzzy/jKgEAAACWL8LF5cQzzzyTJ598Mqecckqd595777106tQprVu3TjLvWoWHHXZYKisra/UbOXJknnnmmcyZMyeDBg3KFltskZqamtx666157LHHkiSrrbZaBg4cmDXXXDM33XRTPvroo0ycODEbbbRR/v3vf2fmzJkZNGhQTjjhhEyePLlU0zPPPJNNNtkk6667bpJk9913z/Dhw/Puu++mU6dOter461//msGDB6dJkyZp0qRJzj777CU+Pv/7v/+bP/zhD6msrEyzZs1y6KGHpnv37hk9enSuvPLK7Lbbbnnssccyffr0DBgwILvuumsqKytz6aWX5p///GdWW2217Lbbbhk1alRGjBiRjz/+OKeddlouu+yyTJgwIZMmTUqvXr2SJJ06dcqGG26YZ555Jt/73vfq1FJZWZlhw4bl0EMPzTXXXLPE+wYAAADQUAkXlxPbb799tt9++wU+t8022+Q3v/lNGjdunG233TZdu3ZNq1at0rx581Kft99+O4ceemgOPvjg3HHHHbnxxhuzxRZb5JlnnskjjzySIUOGZKWVVspNN92U3//+97nwwguTzAvthg4dmtVWWy0ffvhhBgwYkMsuuyxNmzYt1ZUkEyZMyJprrlkar1GjRll99dXz/vvv1woXZ86cmYkTJ+a9997Lddddl9mzZ2f33XfPj3/848LH5uOPP85vf/vbXHjhhVlvvfUyevTonHPOObnuuuuSJB988EFWWWWVXHHFFXnssccycuTI7LrrrvnrX/+at99+O9ddd13mzp2b//7v/y695iqrrJKrrrqqtG+rr756ysv/7+bpa665ZiZMmLDAem655ZZ0797dnaYBAACAb7zyL+9Cfdtqq61y3nnnZcaMGfn973+fvn375vTTTy9d7zGZtyKxW7duSZJvf/vb+fjjj5PMCw932223rLTSSkmSfffdNy+//HJmzZqVJOnSpUtWW221L61h9uzZadKkSa22Zs2alV5nvs8++yzJvNWWv/vd73LmmWfmzjvvzLPPPltw75N//vOfWX/99Uth3mabbZb27dvnlVdeSZKUl5dnt912SzJv3ydNmpQkGTNmTLbffvs0a9YsrVq1yu67775E+5Yk48ePz1NPPZUDDzyw8P4AAAAArCiEiw3EpptumlNPPTV//OMfM2zYsLRp0yZnnnlmqqqqkiQtW7Ys9S0vLy+1T506tRQsJkmbNm2SJNOmTUuSWs8tSvPmzTN79uxabbNnz06LFi1qtc0/dXuPPfZIo0aNsuaaa2bHHXfMP/7xj6+yu7VMnTo1bdu2rdW20korpaKiIklq1VBeXp7q6uokyfTp02vt36qrrrrA12/WrNli7VtVVVWGDh2an//857VWjQIAAAB8UwkXG4AXX3wxn376aelx586dM3DgwEyZMqUUsC1Mu3btam07/+eVV175K9Ww9tpr54MPPig9njt3bj788MM611ts3rx5Vl555cycObPUVl5evsC7Ty+uL+5DknzyySdp167dIrdr2bJlZsyYUXo8/w7dX7TOOuvkww8/LAWyybxTpb94J+z33nsvH3zwQS699NIceuihOemkk/Lxxx/n0EMPrVMfAAAAwDeBcLEBuOuuuzJ8+PBSYFddXZ1HHnkka6+9djp06LDIbXv06JFHH30006dPT5Lcfffd2WKLLdKsWbM6fecHgPP7ft7222+fV199Na+//nrpdTp16pSOHTvW6durV6+MGjUqVVVVmTZtWp555plstdVWX22nP6d79+557bXXSqeB//Of/8y0adOyySabLHK7jTfeOM8++2wqKyszY8aM0k1tvmj+ftx7771J5t0J+vXXX69zDczOnTvn1ltvzYgRIzJixIhcdNFFWWWVVTJixIjSilAAAACAbxI3dFkCH374YS666KIkSatWrZIkjz76aMaNG5fZs2dnjz32SJIMHTo05eXlmTJlSnr37p3XX3+9tN1aa62VZNF3iz7llFNy/fXX55hjjklZWVnmzp2bDTfcMGedddaX1rjDDjtkwoQJOfHEE1NTU5O11lorv/jFLxbYt3379unWrVuOPPLIHH/88ampqSnV1L59+5x44on5/e9/nzlz5mSVVVbJySefXNr2yCOPzK9//eusscYa6du3b4YNG5YBAwakWbNm2XfffUvh4vXXX5+2bdvmBz/4wQJrGDZsWK1Tjrfbbrv0798/J598ci655JLMmTMnLVu2zOmnn17rVPAF2WuvvTJ27NgcfvjhWWONNbLTTjvl7rvvTpJad4tu0qRJTjnllAwbNiz33ntvmjVrlpNPPrl0KvaQIUOy9dZbZ+edd/7S4w0AAADwTVJWU1NTU99FLG0zZszIK6+8ko033rgUQNXU1GTatGlp27ZtysrKvvJr7rfffunWrVsOOeSQpV3uN8qrr76al156Kf/1X/+1TMarqakp/b7/9re/5Y477sjQoUOXydgLMmjQoOyzzz455phj6q2GhmpJ5zBQv8xhaNjMYWj4zGNo2OpjDi8oX1sQp0WzTM2aNSs9e/ZcJmO9+OKLOeqoozJjxoxUVVXl8ccf/9JTqQEAAABYfE6LXkxlZWWluxBT3He+851lNtYWW2yRHj16ZNCgQSkvL8+3v/3tHHDAActs/AWprq5OeblMHwAAAFgxCBcX0yqrrJKJEyfWdxl8BWVlZRkwYEAGDBhQ36UkSWbOnJkpU6Z86U14AAAAABoKS6gW0y677JLRo0dn/Pjx9V0KDdRf/vKXVFVVZZdddqnvUgAAAACWCisXF1OfPn3ywAMP5LTTTsuWW26Zjh07plGjRvVdFg3AnDlz8u9//zuvvvpqBgwYULpDOAAAAEBDJ1xcTG3atMkVV1yR22+/PY899lieeOIJ12BksTRt2jTrrbdeDj300Oy55571XQ4AAADAUiNc/ApWWmml5eoafgAAAABQn1xzEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAACqnXcHHChAk54ogj0qNHj+y222656KKLUl1dXadfdXV1hg4dmp49e6Z79+7p3bt37rvvvnqoGAAAAACYr3F9Dj548OB07do1Dz/8cCZPnpwjjzwyq6yySg499NBa/W6++ebcdtttueGGG7LuuuvmiSeeyKBBg/Ltb387G220UT1VDwAAAADfbPW2cnHMmDEZN25cTjzxxLRp0yadO3dO//79c+utt9bpO3bs2Gy55Zb59re/nUaNGmW33XbLyiuvnFdffbUeKgcAAAAAknpcuTh27Nh07Ngxbdu2LbV17do148ePz/Tp09O6detS+6677pqzzjorr7zySrp06ZInn3wyM2fOzDbbbLPIMWpqalJTU1Pr5/mPgYbFHIaGzRyGhs0chobPPIaGrT7m8OKOVW/hYkVFRVZaaaVabfODxqlTp9YKF/fYY4+88sor2W+//ZIkLVq0yIUXXpg111xzkWNMnz49lZWVSeYdkBkzZiRJysrKltZuAMuIOQwNmzkMDZs5DA2feQwNW33M4dmzZy9Wv3q95uLiJqB33nln7rzzztx2223ZcMMN8+yzz+aEE07Immuumc0222yh27Vu3TotW7asNVbbtm29kUIDZA5Dw2YOQ8NmDkPDZx5Dw1Yfc3h+mPll6i1cbN++fSoqKmq1VVRUpKysLO3bt6/VfuONN+YnP/lJKUjcdddds+222+buu+9eZLhYVlZW64DPf+yNFBomcxgaNnMYGjZzGBo+8xgatmU9hxd3nHq7ocumm26aiRMnZsqUKaW2MWPGZL311kurVq1q9a2urk5VVVWttjlz5iyTOgEAAACABau3cHGTTTZJt27dMmTIkEyfPj1vvvlmRowYkX79+iVJ9tprr7zwwgtJkp49e+b222/PuHHjMnfu3Dz11FN59tln893vfre+ygcAAACAb7x6vebi0KFDc8YZZ2SHHXZI69at07dv3xxwwAFJkvHjx5fO7T7yyCMzd+7cHH300ZkyZUo6duyYc845J9ttt119lg8AAAAA32j1Gi6uscYaufrqqxf43Kuvvlr6uUmTJjn22GNz7LHHLqPKAAAAAIAvU2+nRQMAAAAADZtwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAopF7DxQkTJuSII45Ijx49sttuu+Wiiy5KdXX1Avu++eabOeigg/Kd73wnu+yyS66//vplWywAAAAAUEu9houDBw/O6quvnocffjgjRozIww8/nBtuuKFOv1mzZuWwww7LLrvskueeey7Dhg3L7bffnjfffLMeqgYAAAAAknoMF8eMGZNx48blxBNPTJs2bdK5c+f0798/t956a52+999/f1q3bp3DDjssLVq0yGabbZZ77703Xbp0qYfKAQAAAIAkaVxfA48dOzYdO3ZM27ZtS21du3bN+PHjM3369LRu3brU/uKLL2aDDTbIf//3f+ehhx7KKquskoEDB+b73//+IseoqalJTU1NrZ/nPwYaFnMYGjZzGBo2cxgaPvMYGrb6mMOLO1a9hYsVFRVZaaWVarXNDxqnTp1aK1z8z3/+kxdeeCFnn312fvWrX+WBBx7IKaeckvXWWy+bbLLJQseYPn16Kisrk8w7IDNmzEiSlJWVLe3dAb5m5jA0bOYwNGzmMDR85jE0bPUxh2fPnr1Y/eotXEwWPwGtqalJ165d07t37yTJD37wg9xyyy154IEHFhkutm7dOi1btqw1Vtu2bb2RQgNkDkPDZg5Dw2YOQ8NnHkPDVh9zeH6Y+WXqLVxs3759KioqarVVVFSkrKws7du3r9W+6qqr1unbsWPHTJo0aZFjlJWV1Trg8x97I4WGyRyGhs0chobNHIaGzzyGhm1Zz+HFHafebuiy6aabZuLEiZkyZUqpbcyYMVlvvfXSqlWrWn27dOmS1157rdZKxwkTJqRjx47LrF4AAAAAoLZ6Cxc32WSTdOvWLUOGDMn06dPz5ptvZsSIEenXr1+SZK+99soLL7yQJPn+97+fqVOn5sorr8ysWbNy7733ZuzYsV96QxcAAAAA4OtTb+FikgwdOjQfffRRdthhhxx88MHZb7/9csABByRJxo8fXzq3e/XVV89VV12VBx54IFtvvXWGDRuWyy+/PJ06darP8gEAAADgG61eb+iyxhpr5Oqrr17gc6+++mqtx9tss03uuuuuZVEWAAAAALAY6nXlIgAAAADQcAkXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAUIlwEAAAAAAoRLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAUIlwEAAAAAAoRLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAUIlwEAAAAAAoRLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAUIlwEAAAAAAppvLgd//znP2fKlCkLfb6mpiZTp07NKaecslQKAwAAAACWb4sdLj7zzDMZMmTIIvscf/zxS1wQAAAAANAwLNXTosvKypbmywEAAAAAy7HFXrlYU1OT++67b5F9JkyYsMQFAQAAAAANw2KHi6effnpmzZq1yD7f+c53lrggAAAAAKBhWOxw8aWXXsrrr7+esrKy1NTULLBPWVlZDj/88KVWHAAAAACw/FrscPHee+/90hu6nHDCCUtcEAAAAADQMCzVG7oAAAAAAN8cX+mGLqNHj17k81OnTl0qRQEAAAAAy7/FDhePPvroTJkyZZF9jjrqqCUuCAAAAABoGBY7XJw+fXomTZr0ddYCAAAAADQgix0ujhw5MieccMJC7xSdJOecc0722WefpVIYAAAAALB8+0rXXFxrrbUW2adFixZLXBAAAAAA0DAsdriYJJMnT17oczU1NZk9e/YSFwQAAAAANAyLHS7269cvd9xxxyL7bL755ktaDwAAAADQQCx2uHjLLbfkJz/5ySL7XHHFFTn88MOXuCgAAAAAYPn3la65uM022yyyzy233LLEBQEAAAAADUP50nyxsrKypflyAAAAAMBybLFXLu65554ZPnz4QgPEmpqadOnSZakVBgAAAAAs375SuAgArHju6N270HY/vOeepVwJAADQ0CzV06IBAAAAgG8O4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAACmlc3wWwbN3Ru3eh7X54zz1LuRIAAAAAGjrhIoulaCiZCCYBAAAAVlROiwYAAAAAChEuAgAAAACFCBcBAAAAgEJccxEAaDDcmAwAAJYvVi4CAAAAAIVYuQgAFPJVVxFWVlamSZMmSawkBACAFYWViwAAAABAIfUaLk6YMCFHHHFEevTokd122y0XXXRRqqurF7nNhx9+mO7du2fYsGHLqEoAAAAAYEHq9bTowYMHp2vXrnn44YczefLkHHnkkVlllVVy6KGHLnSbc845J40aNVqGVQIAAAAAC1JvKxfHjBmTcePG5cQTT0ybNm3SuXPn9O/fP7feeutCt3n88cfzxhtvZNddd112hQIAAAAAC1RvKxfHjh2bjh07pm3btqW2rl27Zvz48Zk+fXpat25dq/+sWbPym9/8Jueee27uvPPOxRqjpqYmNTU1tX6e/5hlxzFnaTCHoYGbP3drapKysmU+l713wJLxOQwNn3kMDVt9zOHFHavewsWKioqstNJKtdrmB41Tp06tEy5efvnl2XzzzbPtttsudrg4ffr0VFZWJpl3QGbMmJEkKSsrW8LqG675x2NZmjZt2jIfkxWPOQxfn2X12VBVVVX6uehnQ9FafRbBkvE5DA2feQwNW33M4dmzZy9Wv3q95uLiJqBvvPFGbrvtttxzzz1f6fVbt26dli1b1hqrbdu23+g30iZNmizzMT+/OhWKMofh67NMPhv+/xxu0rhxUlZW+LOhaK0+i2DJ+ByGhs88hoatPubw/DDzy9RbuNi+fftUVFTUaquoqEhZWVnat29faqupqclZZ52VwYMHZ9VVV/1KY5SVldU64PMfeyNdthxvlhZzGBqw+fP2////sp7H3jdgyfkchobPPIaGbVnP4cUdp97CxU033TQTJ07MlClTSmHimDFjst5666VVq1alfh988EH+/ve/5/XXX8/QoUOTzEtOy8vL8+ijj2bUqFH1Uj8AAAAAfNPVW7i4ySabpFu3bhkyZEj++7//Ox9++GFGjBiRAQMGJEn22muvnHPOOenevXsef/zxWtuef/75WWONNXLYYYfVR+kAAAAAQOr5motDhw7NGWeckR122CGtW7dO3759c8ABByRJxo8fnxkzZqRRo0ZZY401am3XokWLtG7d+iufJg0AAAAALD31Gi6uscYaufrqqxf43KuvvrrQ7S644IKvqyQAAAAAYDGV13cBAAAAAEDDJFwEAAAAAAoRLgIAAAAAhdTrNRcBAFh67ujdu/C2P7znnqVYCQAA3xRWLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIcBEAAAAAKES4CAAAAAAUIlwEAAAAAAoRLgIAAAAAhTSu7wIAAL5ud/TuXXjbH95zz1KsBAAAVixWLgIAAAAAhQgXAQAAAIBChIsAAAAAQCHCRQAAAACgEDd0AYAVwJLcsAQAAKAoKxcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIgbugDAcsSNWQAAgIbEykUAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKcbdoAAAA+Aru6N270HY/vOeepVwJQP2zchEAAAAAKES4CAAAAAAU4rRoAJaZoqcQJU4jAgAAWB5ZuQgAAAAAFGLlIgDA18BKXYDl25K8TwPwf6xcBAAAAAAKES4CAAAAAIUIFwEAAACAQoSLAAAAAEAhbugCALAILvgPAAALJ1xkhVT0PwTdnRMAAABg8TktGgAAAAAoxMpFAAD4EktyerwzIwCAFZlwEQAAgKXC5YkAvnmEiwAACAQAAChEuAgAQL2ojztxC0MBAJYu4SIAwHKmPkI3AAAowt2iAQAAAIBChIsAAAAAQCHCRQAAAACgEOEiAAAAAFCIG7oAAPCN4WY5AABLl3ARAABgObUkgfgP77lnKVYCAAvmtGgAAAAAoBDhIgAAAABQiHARAAAAAChEuAgAAAAAFCJcBAAAAAAKcbdoAGCZW5K7nwIAwNJW9PvpD++5ZylX0vAIFwG+oZYk3PEBCgAAQCJcBACA5ZJ/BAIAGgLXXAQAAAAAChEuAgAAAACFrNCnRT967LGZM3Fi6XFlZWWaNGmywL5OHQEAAACAr8bKRQAAAACgEOEiAAAAAFCIcBEAAAAAKGSFvuYiAAAAAHxd7ujdu/C2K8r9P6xcBAAAAAAKsXIRAABgBVR0Nc2KspIGgGVDuAgAQGFLcioQLCnhGQDUP6dFAwAAAACFWLkIAAB8o1jxCABLj5WLAAAAAEAhwkUAAAAAoBDhIgAAAABQiGsuAgAAAMAy9lWvAVxZWZkmTZosd9cAtnIRAAAAACjEykUAAACgDndWBxaHlYsAAAAAQCHCRQAAAACgEOEiAAAAAFBIvYaLEyZMyBFHHJEePXpkt912y0UXXZTq6uoF9r355puz5557pnv37unTp08efvjhZVwtAAAAAPB59RouDh48OKuvvnoefvjhjBgxIg8//HBuuOGGOv0efPDBDBkyJOedd16ef/75HHjggTn22GPz3nvv1UPVAAAAAEBSj+HimDFjMm7cuJx44olp06ZNOnfunP79++fWW2+t03fWrFk5/vjjs+WWW6ZJkybZf//906pVq/zrX/9a9oUDAAAAAEmSxvU18NixY9OxY8e0bdu21Na1a9eMHz8+06dPT+vWrUvtffr0qbXtJ598ks8++yyrr7764g9YU/N//19WtoCna77aDrDYGtKxbUi1ftPU1NSU/kf9q4/fg999A/cln8OwIlsR3jN9Ds/zTdn/FeFvdnlVn/u5LOfxN+X3CcvU575PL6s5trjj1Fu4WFFRkZVWWqlW2/ygcerUqbXCxc+rqanJ6aefnu985zvZZpttFjnG3LlzU1lZWXpcVVW10L7Tpk1b3NIbtM8fj2WlPo5t0f38pvwdNEQ1NTWZMWNGkqRMMLFULMn7QdG5Uh9jNjT18T69rCzqcxhWZCvCe+by+jm8rN8zG9L32iWxJPvZkL6HN7Rju6SKzOOG9PuEJdUQvofP/z69rObY7NmzF6tfvYWLyVf/14zKysqceuqpeeONNzJy5Mgv7d+4ceNUN2kyf7AkSZPGjRe4YuLzKyhXZE3mH49lqD6ObdH9/Kb8HTRE898v2rZtu1z9R01DtiTvB0XnSn2M2dDUx/v0MvEln8OwIlsR3jOX18/hZf2e2ZC+1y6JJdnPhvQ9vKEd2yVVZB43pN8nLKnl/nv4575PL6s5Nv8fJL5MvYWL7du3T0VFRa22ioqKlJWVpX379nX6z5o1KwMHDszMmTNz0003pV27dl9twPlvngt5E12eviStaBrSsW1ItX4TlZWVlf5H/aqP34HfewP3JZ/DsCJbUd4zfQ5/cz6LVpS/2eVRfe/nsprH9b2fsEL63PfpZTXHFneceruhy6abbpqJEydmypQppbYxY8ZkvfXWS6tWrWr1rampyXHHHZfGjRvn+uuv/+rBIgAAAACw1NVbuLjJJpukW7duGTJkSKZPn54333wzI0aMSL9+/ZIke+21V1544YUkyT333JM33ngjl156aZo1a1ZfJQMAAAAAn1Ov11wcOnRozjjjjOywww5p3bp1+vbtmwMOOCBJMn78+NK53X/+858zYcKEOjdw6dOnT84555xlXjcAAAAAUM/h4hprrJGrr756gc+9+uqrpZ9vuOGGZVUSAAAAALCY6u20aAAAAACgYRMuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgEKEiwAAAABAIcJFAAAAAKAQ4SIAAAAAUIhwEQAAAAAoRLgIAAAAABQiXAQAAAAAChEuAgAAAACFCBcBAAAAgELqNVycMGFCjjjiiPTo0SO77bZbLrroolRXVy+w78iRI7Pnnntmiy22SL9+/fLyyy8v42oBAAAAgM+r13Bx8ODBWX311fPwww9nxIgRefjhh3PDDTfU6ffoo49m2LBh+e1vf5tnnnkmu+22W4466qjMmDGjHqoGAAAAAJKkcX0NPGbMmIwbNy4jRoxImzZt0qZNm/Tv3z833HBDDj300Fp9b7311vzwhz/Md77znSTJYYcdlpEjR+axxx7LvvvuW+e1569+bLzKKv/XWFOT8qqqNG7UKCkrq7PNZ599thT3bvnVdM01l/mY9XFsi+7nN+XvoCGqqanJrFmz0rhx45QtYA7z1S3J+0HRuVIfYzY09fE+vUx8yecwrMhWhPfM5fVzeFm/Zzak77VLYkn2syF9D29ox3ZJFZnHDen3CUtquf8e/rnv08tqjs2aNStJFnqW8XxlNTU1NcuioC+65ZZbcu211+ahhx4qtY0ePTr7779/XnzxxbRu3brUvuOOO+akk05Knz59Sm2HH354unTpklNPPbXOa0+ePDlvv/3211o/AAAAAKzoOnfunA4dOiz0+XpbuVhRUZGVVlqpVlvbtm2TJFOnTq0VLlZUVJSe+3zfqVOnLvC127Ztm86dO6dZs2YpL3fPGgAAAAD4KqqrqzN79uw6mdwX1Vu4mMxblv119G3cuPEiE1UAAAAAYNE+v/hvYeptWV/79u1TUVFRq62ioiJlZWVp3759rfZ27dotsO8X+wEAAAAAy069hYubbrppJk6cmClTppTaxowZk/XWWy+tWrWq03fs2LGlx1VVVfn3v/9dusELAAAAALDs1Vu4uMkmm6Rbt24ZMmRIpk+fnjfffDMjRoxIv379kiR77bVXXnjhhSRJv379cuedd+Zf//pXZs6cmf/5n/9J06ZNs+uuu9ZX+QAAAADwjVev11wcOnRozjjjjOywww5p3bp1+vbtmwMOOCBJMn78+MyYMSNJsvPOO+f444/Psccem8mTJ6dbt24ZPnx4mjdvXp/lAwAAAMA3WlnNV7lTCgAAAADA/1evKxe/ThMmTMhzzz2XNm3aZI011shmm21W3yUBAF9QU1OTsrKy+i4DKMgchoanuro61dXVadz4/+IAcxkajuVxDq+QKxdfffXVHH744dlkk03y4Ycf5tNPP82PfvSj/PznP6/v0oACpkyZko8++iht2rRJu3bt0rJly/ouCSho/PjxeeCBB/Lpp59mq622yo477pimTZvWd1nAYjKHoWF74403MnLkyLz//vvZbrvtsuWWW2aLLbZIUv/hBPDlltc5vMKFi9OnT8+AAQOy55575mc/+1kmTpyY5557LmeccUb69u2b008/vb5LBL6CcePG5Re/+EXat2+f9957LzvuuGP69OmT7bbbrr5LA76i119/PYcccki233771NTU5MEHH8yPfvSj7LfffunevXt9lwd8CXMYGrY333wzBxxwQH7yk5+kefPmeeKJJ9KiRYv07NkzBx10UBIBIyzPluc5vMKdFl1eXp5mzZpl6623TpKsueaa+cEPfpBVVlklgwcPTtOmTXPyySfXc5XA4pg0aVIGDRqUn/70pzn00EPzwAMP5PHHH8+ZZ56Zk046Kb169arvEoHFNGvWrFx22WUZMGBADjvssCTJT37yk1x00UWpqKjI7Nmzs+2229ZzlcDCmMPQsNXU1OTuu+/O/vvvn+OPPz5Jsu++++bWW2/NqFGjMnv27Bx22GGCRVhOLe9zuLxeRv0aVVVV5d13382LL75YaqupqclOO+2Uiy++OH/84x8zatSoeqwQWFwff/xxOnXqlEMPPTRJstdee+Wwww7LrrvumrPPPjuPPvpoPVcILK7mzZunoqIi5eXzvnpUV1dnm222yRlnnJGPP/44t99+e9555516rhJYGHMYGraysrK88847+eCDD0pt6667bg455JDsvPPOefjhh3PPPffUY4XAoizvc3iFCxfbtGmTgQMH5pprrsnDDz+cZN4voaamJrvuumsOOeSQPPXUU5k9e3ZWsDPCYYUxYcKEjBs3LmVlZXnuuefyj3/8o/Rcly5dctBBB6VXr14ZPnx4Xn311XqsFFhcs2fPTqdOnfLee+9l5syZKS8vT01NTTbbbLOccMIJefHFF3P//fcnic9nWI7MmDEjc+fOTWVlZTp16pT333/fHIYGavfdd8+kSZPyz3/+s9S2+uqr58c//nE6d+6cJ598MlVVVfVYIfBFH3/8cf7zn/8kSXr16pWPP/54uZzDK1y4mMxbGrrHHnvksssuy9/+9rdSe+PGjbPuuuvmgw8+SKNGjSz5huXQK6+8kv333z8ffPBBNtpoo3zve9/LTTfdVGs1xDrrrJM+ffqkRYsWeeONN+qxWmBRpk2blgkTJuSTTz5Js2bN0rt379x2223585//nOT//vFviy22yAknnJAbbrghH330kc9nWE68/PLL+elPf5qPP/44TZo0yXe/+93cdtttueOOO5KYw7C8++ijj/Lss89m7NixmTJlSrbddttUVVXljjvuyFtvvVXqt/baa6dv376577778u9//7seKwY+79///nf69u2biRMnJkk22GCDVFVVZdSoUcvdHF4hw8XWrVvn5z//eTbddNNceOGF+ctf/lL6kvPJJ5+kdevWqaysrOcqgS8aN25cDj744PTv3z89e/ZMkuy5556ZNGlSbr311rz33nulvptttlk6dOhQWqEMLF/GjRuXAw88MCeccEL22muvXHLJJfnWt76V888/P+eee25uu+22Whec3mSTTbLeeuulefPm9Vw5kMybwwMGDEivXr2yxhprJEl23XXXnHjiieYwNADjxo3LAQcckMsvvzyDBw/OL37xi7z99ts566yz8vTTT+eGG27IuHHjSv0333zz9OjRI02aNKnHqoH5xo0bl0MOOSQ//vGPSzdNW3/99XPUUUctl3N4hbuhy3yrrbZajj/++Nxwww055ZRTcscdd6Rly5Z59tlnM3LkyLRo0aK+SwQ+Z/6b55FHHpnDDjsslZWVeeSRR9KiRYs0b94877zzTm644Yb069cvXbp0SZLSf+wAy5cPP/wwRx11VA488MDsv//+ueeee/LYY4/lhBNOyPHHH59zzz03v/zlL/Pxxx9n9913z/rrr58nn3wyFRUVqa6uru/y4Rtv/j/2HX744Tn88MNTU1OT999/P02bNs0hhxySZs2a5YwzzsikSZPSq1cvcxiWM1OmTMlxxx2XAw88MP37988LL7yQRx55JP3798/ll1+eyy+/PCeeeGI++eST7Lbbbvne976XG264Ia+99lratWtX3+XDN978/zY+/PDDc8QRR6S6ujrPP/98mjZtmu7du+f3v/99Tj311OVqDpfVfAMuijJ69Og8++yzadmyZXbYYYd8+9vfru+SgM+prKzMPvvskxYtWuTuu+9OVVVV+vTpk7KysjRu3Dj/+c9/UllZme233z7vvvtudtlll1RWVubmm2/OLbfckg033LC+dwH4nGeeeSYjR47MlVdeWWp77rnncvvtt+e9997Lueeem3feeScXX3xxkmTVVVfNq6++mmuvvTabbrppfZUNZF4osffee2f33XfPueeem8rKyhx00EGZO3du3n333Xz3u9/NgAED8sEHH+SCCy5IYg7D8ub999/PqaeemmHDhpWChmnTpuWPf/xjLr300lx33XVZe+21M3z48Dz66KNZZ511MmnSpAwdOtQchno2ffr07LHHHtl8881zxRVXZM6cOfmv//qv1NTUZMqUKWnatGl+85vfpFOnTrnqqqvyyCOPpFOnTvU+h78R4SKw/Bs9enQOOeSQ0n/AVFRU5LzzzsukSZMy7v+1d7cxVZd/HMc/4AkMRBBRSMFKlCMqpA+o8IERdsNKp1NIW5NiRW7NzKyoOa2FrnJaimg2oqWAiKLiRIXKu9hYscyKuea9eYC4EQSBxLg5/B8Yv794H5K/g75fjw7XdfjxPWxfdvic67p+hw9r3rx5Cg8PV2RkpHbt2iVfX1+98MILGjFihNmlA7hMYWGhZs+efUX4f/DgQaWlpUmSli1bprKyMlVUVKihoUHBwcEaPHiwWSUD+IfNZlNqaqqqqqoUHx+vzMxMtbW1ad68efrxxx9VWFiompoaJSUlqampSWVlZfQw4GDKy8v11FNPKTk5WREREcZ4U1OTUlNTtWnTJqWmpspqtaqyslKNjY3q16+fvL29zSsagCE7O1uJiYlasmSJDh06pOrqai1evFhHjx7V/v379cUXX+irr77SI4884jA9TLgIwGEUFxcrNjZW/v7+WrlyZadVxunp6crJyVFWVpaxopED4wHH0nH+WnV1tRYsWCCr1arY2Fj179/feM7evXu1fPlyvf/++woLCzOxWgDXcuLECW3evFkFBQUaMmSI1qxZY8z99NNP+uyzzxQTE6OpU6eaWCWAS9lsNuXn58vJyUkPPfSQfv31V+3du1cLFizotJKpvLxcS5culdVq1axZs0ysGMClbDab8vLy1KtXL4WGhqq+vl6zZ89WWFiYVq1aJU9PT0lSQ0ODli5dqqamJi1atMhhzjq+I2/oAqBnCg0NNVY6dYQRHZ9/+Pn5ycnJSa2trbrnnnsIFgEHcubMGVVUVBh96ePjo0cffVR79uxRfn6+6urqjOdGRkZq4MCB2rlzp0nVArhcRw93CAwM1LRp0zR+/Hh5eXmpubnZuBliWFiY+vfvr4KCArPKBXCZI0eOaMaMGSouLjYCxZKSEvn7++vLL7/UkSNHJEl2u1333XeffH199fPPP5tcNYAOl/bwnj179N5776mlpUWrVq2St7e3LBaL7Ha72tvb5eHhoUGDBqmystJhgkXpDr6hC4CeacSIEVqyZIksFouam5vl4uIi6eLZMW5ubhwUDziYyspKPf/88xo5cqTeffddBQQESJJeeuklnTlzRmlpabpw4YImTpwoX19fSVJAQADbJwEHca0eHjZsmOLi4uTm5iYXFxddutnJz89Pffr0MatkAJdobGzUhx9+qPj4eMXFxamkpESbN29WXV2dIiMjtWXLFi1fvlyvvfaaQkNDJUl9+vTRwIED1draKouFSAAw09V6OCsrS99//70++eQTjRs3Tm5ubmpra5Oz88X1gW1tbfL29u70/7LZ+EsCwOFYLBbV19crJSVFtbW1slgsysvL09dff80/M4CDqa+vl91u14ULF5ScnKzXX3/dCCfeeecdubq6Kj8/XwcPHlR4eLjq6+u1fft2bdq0yeTKAUjX7+GBAwdKkqqrq/XLL7/IxcVFp06d0tatW5WVlWVm2QD+0atXL7W3tyskJETSxQ/wHnzwQa1cuVILFy6Um5ubcnNzFR8fr6ioKNntdu3YsUMbNmwgWAQcwNV6OCgoSJ9++qkaGxvVp08f1dbWasuWLZIuhpHp6elav369wwSLEtuiATio3r17KyQkRDU1Nerbt68yMzM1atQos8sCcJlDhw4pNDRUkyZNUkVFhZKTk1VSUmLMz5kzR7NmzdLQoUO1Y8cOHT16VBkZGRo2bJiJVQPocL0e7litaLPZtGvXLn388cfat2+fMjIyFBQUZGbZAP7R1NSk+vr6TkeQhISEyGKx6O+//1ZERIQSEhI0f/58tbS0yMPDQxs3buSmiICDuFoPjx49utOuvaamJjU3NysvL09//vmn1q9f73A9zA1dAABAl5WWlurAgQOaMmWKsrOzlZubKz8/v06rnzqcP39eLi4urJQAHMjN9nBjY6Ox/YpdBIBjOXz4sDw8PIwjR06cOKGEhARlZGTo3nvvlSSdPn1a999/v5llAriGa/VwZmamXF1dJV38MDAoKEjOzs4O+V6alYsAAKDL/P39FRUVJUmKiYnR5MmTjdVPNptNkrRlyxadOXNGbm5uDvlmCLib3UwP5+Tk6Pz58/L29iZYBBzQiBEjNHjwYGO1cVlZmc6dO6devXpJktauXaunn35aVVVVZpYJ4Bqu1cMdN0tct26doqOjVV9f77DvpR2zKgAA0GP07t1bdrtdzs7OmjZtmux2u3Jzc7Vu3Tr17dtXa9as0fbt2zVgwACzSwVwFTfbwx1nMAJwTB1BhHSxr11cXJSRkaHPP/9c2dnZ9DDg4K7Vw6tXr1Z2drZ8fHxMrO762BYNAAC6RXt7u/GmaO/evVqwYIFaWlqUlpam4OBgk6sDcCP0MHBnOHHihJKSkhQcHKyUlBSlp6dr9OjRZpcF4Cb1xB5m5SIAAOgWTk5ORjhRXl6utrY2rV+/nhs/AD0EPQzcGTw9PfXtt99qz549ys7O1siRI80uCcC/0BN7mJWLAACgW1VUVCgiIkLZ2dkKCQkxuxwA/xI9DPR8OTk5Cg0NVWBgoNmlAOiCntbDhIsAAKDbNTY2cuMHoAejh4Ge7dJjDgD0PD2thwkXAQAAAAAAAHSJs9kFAAAAAAAAAOiZCBcBAAAAAAAAdAnhIgAAAAAAAIAuIVwEAAAAAAAA0CWEiwAAAAAAAAC6hHARAAAAAAAAQJcQLgIAAMBUH330kRISEiRJVqtVBQUF//oaZ8+e1fjx43XgwIHuLg8AAADXYTG7AAAAADie/Px8ffPNN8bXc+fOVWVlpTZs2GCMxcbGysPDQ6tXrzbGJk6cqLFjx2rRokXG2Lhx4xQTE3PVn1NQUKC8vDzt2rXrlur19vbWwoUL9fbbb2vnzp1yd3e/pesBAADg5hAuAgAA4Ao2m03Lly+XJBUVFam2tlalpaWaP3++BgwYoNLSUv3222/y8fHRiy++qDFjxkiSUlJSZLVa9cQTT+jZZ581xq5lxYoVmjlzpjw8PG655ieffFLJycnatGmT4uLibvl6AAAAuDG2RQMAAMAUxcXF+v333xUdHd1pvKSkRNOnT9eYMWMUHR2t48ePS7oYco4aNUr79u3ThAkTFBoaqtmzZ+uvv/4yvnf69OnKysq6ra8DAADgbka4CAAAAFP88MMPslqt8vb27jSemZmpxYsXq7CwUEOGDNEbb7xhzLW2tmrbtm3aunWrvvvuO508eVJJSUnG/MMPP6w//vhDFRUVt+11AAAA3M0IFwEAAGCKY8eOKSgo6IrxyZMna/jw4XJ3d9err76q48ePq6yszJh/+eWX5enpKV9fX82YMUP79+835gIDA+Xs7KyjR4/ejpcAAABw1yNcBAAAgCnq6urk6el5xXhgYKDxOCAgQJJUWVlpjA0dOtR4PGjQIFVVVRlfOzs7y9PTU2fPnv0vSgYAAMBlCBcBAABgGicnpyvGnJ3//xa1vb1dkuTq6mqMtbW1XfcaV7smAAAA/huEiwAAADCFl5eX6urqrhg/deqU8bikpESS5Ovra4zZbDbjcVlZWac5u92uc+fOqV+/fv9BxQAAALgc4SIAAABMMXz4cB07duyK8W3btun06dO6cOGCUlNTNXbsWPn4+Bjza9euVUNDgyoqKrRx40Y9/vjjxtzJkyfV1tYmq9V6W14DAADA3c5idgEAAABwPC0tLXrzzTclSTU1NZo3b54k6YMPPpCrq6uampo0adIkSVJSUpK8vLwkScHBwZKk9PR07d69W1LnVYeXCg8P14oVK1RbW9tppeHMmTP11ltv6fjx47JarVqyZEmn75swYYKmTJmiqqoqPfbYY5ozZ44xV1RUpAceeEB+fn7d8FsAAADAjTi1dxxkAwAAANxmU6dO1TPPPKNXXnnlhs8tKipSbGysiouLO53BeKkpU6Zo8uTJiouL6+5SAQAAcBVsiwYAAIBp5s6dq7S0NDU2Nt7ytXbv3q26ujo999xz3VAZAAAAbgbhIgAAAEwzfvx4RUVFKTEx8ZauU1tbq8TERC1btkzu7u7dVB0AAABuhG3RAAAAAAAAALqElYsAAAAAAAAAuoRwEQAAAAAAAECXEC4CAAAAAAAA6BLCRQAAAAAAAABdQrgIAAAAAAAAoEsIFwEAAAAAAAB0CeEiAAAAAAAAgC4hXAQAAAAAAADQJf8DGPrTj+/tEowAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📊 序列预测结果统计:\n", + " 预测位点总数: 85\n", + " 高概率位点 (>0.8): 0\n", + " 中概率位点 (0.4-0.8): 6\n", + " 最高预测概率: 0.475\n" + ] + } + ], + "source": [ + "# 选择一个序列进行演示\n", + "from Bio import SeqIO\n", + "\n", + "# 读取第一个mRNA序列作为演示\n", + "mrna_sequences = list(SeqIO.parse(mrna_file, \"fasta\"))\n", + "demo_seq = mrna_sequences[0] # 选择第一个序列\n", + "\n", + "print(f\"🧬 选择演示序列: {demo_seq.id}\")\n", + "print(f\"序列长度: {len(demo_seq.seq)} bp\")\n", + "print(f\"序列前100bp: {str(demo_seq.seq)[:100]}...\")\n", + "\n", + "# 使用内置的plot_prf_prediction函数进行预测和可视化\n", + "print(f\"\\n🎯 使用plot_prf_prediction进行序列预测和可视化...\")\n", + "\n", + "sequence_results, fig = plot_prf_prediction(\n", + " sequence=str(demo_seq.seq),\n", + " window_size=3,\n", + " short_threshold=0.2,\n", + " long_threshold=0.2,\n", + " ensemble_weight=0.6,\n", + " title=f\"序列 {demo_seq.id} 的PRF预测结果(条形图+热图)\",\n", + " figsize=(16, 8),\n", + " dpi=150\n", + ")\n", + "\n", + "plt.show()\n", + "\n", + "print(f\"\\n📊 序列预测结果统计:\")\n", + "print(f\" 预测位点总数: {len(sequence_results)}\")\n", + "print(f\" 高概率位点 (>0.8): {(sequence_results['Ensemble_Probability'] > 0.8).sum()}\")\n", + "print(f\" 中概率位点 (0.4-0.8): {((sequence_results['Ensemble_Probability'] >= 0.4) & (sequence_results['Ensemble_Probability'] <= 0.8)).sum()}\")\n", + "print(f\" 最高预测概率: {sequence_results['Ensemble_Probability'].max():.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🔝 Top 5 预测位点:\n", + " 1. 位置 96: \n", + " - Short概率: 0.288\n", + " - Long概率: 0.755\n", + " - 集成概率: 0.475\n", + " - 密码子: TAA\n", + " 2. 位置 12: \n", + " - Short概率: 0.606\n", + " - Long概率: 0.177\n", + " - 集成概率: 0.434\n", + " - 密码子: TTG\n", + " 3. 位置 15: \n", + " - Short概率: 0.493\n", + " - Long概率: 0.329\n", + " - 集成概率: 0.428\n", + " - 密码子: GAA\n", + " 4. 位置 18: \n", + " - Short概率: 0.369\n", + " - Long概率: 0.510\n", + " - 集成概率: 0.426\n", + " - 密码子: GTC\n", + " 5. 位置 105: \n", + " - Short概率: 0.248\n", + " - Long概率: 0.671\n", + " - 集成概率: 0.418\n", + " - 密码子: ACT\n", + "\n", + "📊 可视化分析完成!\n", + "图表包含热图和条形图,展示了整个序列的PRF预测概率分布。\n" + ] + } + ], + "source": [ + "# 打印Top预测位点的概率\n", + "if sequence_results['Ensemble_Probability'].max() > 0.3:\n", + " top_predictions = sequence_results.nlargest(5, 'Ensemble_Probability')\n", + " print(f\"\\n🔝 Top 5 预测位点:\")\n", + " for i, (_, row) in enumerate(top_predictions.iterrows(), 1):\n", + " print(f\" {i}. 位置 {row['Position']}: \")\n", + " print(f\" - Short概率: {row['Short_Probability']:.3f}\")\n", + " print(f\" - Long概率: {row['Long_Probability']:.3f}\")\n", + " print(f\" - 集成概率: {row['Ensemble_Probability']:.3f}\")\n", + " print(f\" - 密码子: {row['Codon']}\")\n", + "else:\n", + " print(\"\\n💡 该序列没有检测到高概率的PRF位点\")\n", + "\n", + "print(\"\\n📊 可视化分析完成!\")\n", + "print(\"图表包含热图和条形图,展示了整个序列的PRF预测概率分布。\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 分析总结\n", + "\n", + "### 🎯 主要发现\n", + "1. **数据质量**: 测试数据集包含真实的BLASTX比对结果和验证区域\n", + "2. **FScanR效果**: 从BLASTX结果中识别出潜在PRF位点\n", + "3. **模型性能**: Short和Long模型在不同场景下各有优势\n", + "4. **预测结果**: 集成模型提供了更稳定的预测性能\n", + "5. **可视化**: 内置绘图函数生成清晰的热图和条形图\n", + "\n", + "### 🔧 最佳实践\n", + "- **数据预处理**: 确保BLASTX结果格式正确\n", + "- **参数设置**: 使用默认的集成权重(0.4:0.6)获得平衡性能\n", + "- **结果解读**: 在使用FScanpy对整条序列进行预测时,不应该使用0.5作为阈值,而应该比较不同位置的概率高低\n", + "- **可视化**: 使用plot_prf_prediction函数生成标准化图表\n", + "\n", + "### 📚 使用建议\n", + "1. **阈值选择**: 根据应用场景调整概率阈值\n", + "2. **结果验证**: 结合生物学知识验证预测结果\n", + "3. **性能优化**: 对于大规模数据使用合理的滑动窗口大小\n", + "4. **可视化参数**: 调整figsize和dpi获得最佳显示效果" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/README.md b/README.md index 6472371..34ef7d2 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,149 @@ # FScanpy ## A Machine Learning-Based Framework for Programmed Ribosomal Frameshifting Prediction -FScanpy is a comprehensive Python package designed for the prediction of [Programmed Ribosomal Frameshifting (PRF)](https://en.wikipedia.org/wiki/Ribosomal_frameshift) sites in nucleotide sequences. By integrating advanced machine learning approaches (Gradient Boosting and BiLSTM-CNN) with the established [FScanR](https://github.com/seanchen607/FScanR.git) framework, FScanpy provides robust and accurate PRF site predictions. The package requires input sequences to be in the positive (5' to 3') orientation. +FScanpy is a comprehensive Python package designed for the prediction of [Programmed Ribosomal Frameshifting (PRF)](https://en.wikipedia.org/wiki/Ribosomal_frameshift) sites in nucleotide sequences. By integrating advanced machine learning approaches (HistGradientBoosting and BiLSTM-CNN) with the established [FScanR](https://github.com/seanchen607/FScanR.git) framework, FScanpy provides robust and accurate PRF site predictions. The package requires input sequences to be in the positive (5' to 3') orientation. ![FScanpy Architecture](/tutorial/image/structure.jpeg) For detailed documentation and usage examples, please refer to our [tutorial](tutorial/tutorial.md). +## 🚀 What's New in v0.3.0 + +### Model Naming Optimization +- **Short Model** (`short.pkl`): HistGradientBoosting model for rapid screening +- **Long Model** (`long.pkl`): BiLSTM-CNN model for detailed analysis +- **Unified Interface**: Consistent parameter naming and clearer output fields + +### Performance Improvements +- **Faster Prediction**: Optimized model type detection and reduced redundant operations +- **Better Error Handling**: More informative error messages and robust exception handling +- **Code Quality**: Reduced code duplication and improved maintainability + +### 🎨 New Visualization Features +- **Sequence Plotting**: Built-in function for visualizing PRF prediction results +- **Dual Threshold Filtering**: Separate filtering for Short and Long models +- **Interactive Graphics**: Heatmap and bar chart visualization +- **Export Options**: Support for PNG and PDF output formats + +### ⚖️ Ensemble Weighting System +- **Flexible Ensemble**: Control the contribution of Short and Long models +- **Weight Validation**: Automatic parameter validation and error handling +- **Clear Naming**: `ensemble_weight` parameter for intuitive usage +- **Visual Feedback**: Weight ratios displayed in plots and results + +### 🔧 API Improvements +- **Method Renaming**: More intuitive method names + - `predict_sequence()`: Replaces `predict_full()` for sequence prediction + - `predict_regions()`: Replaces `predict_region()` for batch prediction +- **Field Standardization**: Consistent output field naming + - `Ensemble_Probability`: Main prediction result (replaces `Voting_Probability`) + - `Short_Sequence` / `Long_Sequence`: Clear sequence field names +- **Backward Compatibility**: Deprecated methods still work with warnings + ## Core Features -- **Sequence Feature Extraction**: Support for extracting features from nucleic acid sequences, including base composition, k - mer features, and positional features. +- **Sequence Feature Extraction**: Support for extracting features from nucleic acid sequences, including base composition, k-mer features, and positional features. - **Frameshift Hotspot Region Prediction**: Predict potential PRF sites in nucleotide sequences using machine learning models. - **Feature Extraction**: Extract relevant features from sequences to assist in prediction. -- **Cross - Species Support**: Built - in databases for viruses, marine phages, Euplotes, etc., enabling PRF prediction across various species. +- **Cross-Species Support**: Built-in databases for viruses, marine phages, Euplotes, etc., enabling PRF prediction across various species. +- **Visualization Tools**: Built-in plotting functions for result visualization and analysis. +- **Ensemble Modeling**: Customizable ensemble weights for different prediction strategies. ## Main Advantages - **High Accuracy**: Integrates multiple machine learning models to provide accurate PRF site predictions. - **Efficiency**: Utilizes a sliding window approach and feature extraction techniques to rapidly scan sequences. - **Versatility**: Supports PRF prediction across various species and can be combined with the [FScanR](https://github.com/seanchen607/FScanR.git) framework for enhanced accuracy. -- **User - Friendly**: Comes with detailed documentation and usage examples, making it easy for researchers to use. +- **User-Friendly**: Comes with detailed documentation and usage examples, making it easy for researchers to use. - **Flexible**: Provides different resolutions to suit different using situations. +## Quick Start + +### Basic Prediction +```python +from FScanpy import predict_prf + +# Single sequence prediction with default ensemble weights (0.4:0.6) +sequence = "ATGCGTACGT..." +results = predict_prf(sequence=sequence) +print(results[['Position', 'Short_Probability', 'Long_Probability', 'Ensemble_Probability']].head()) +``` + +### Custom Ensemble Weighting +```python +# Adjust model weights for different prediction strategies +results_long_dominant = predict_prf(sequence=sequence, ensemble_weight=0.3) # 3:7 ratio (Long dominant) +results_equal_weight = predict_prf(sequence=sequence, ensemble_weight=0.5) # 5:5 ratio (Equal weight) +results_short_dominant = predict_prf(sequence=sequence, ensemble_weight=0.7) # 7:3 ratio (Short dominant) + +# Compare ensemble probabilities +print("Long dominant:", results_long_dominant['Ensemble_Probability'].mean()) +print("Equal weight:", results_equal_weight['Ensemble_Probability'].mean()) +print("Short dominant:", results_short_dominant['Ensemble_Probability'].mean()) +``` + +### Visualization with Custom Weights +```python +from FScanpy import plot_prf_prediction +import matplotlib.pyplot as plt + +# Generate prediction plot with custom ensemble weighting +sequence = "ATGCGTACGT..." +results, fig = plot_prf_prediction( + sequence=sequence, + short_threshold=0.65, # HistGB threshold + long_threshold=0.8, # BiLSTM-CNN threshold + ensemble_weight=0.3, # Custom weight: 30% Short, 70% Long + title="Long-Dominant Ensemble PRF Prediction (3:7)", + save_path="prediction_result.png" +) + +plt.show() +``` + +### Advanced Usage with New API +```python +from FScanpy import PRFPredictor +import matplotlib.pyplot as plt + +# Create predictor instance +predictor = PRFPredictor() + +# Use new sequence prediction method +results = predictor.predict_sequence( + sequence=sequence, + ensemble_weight=0.4 +) + +# Compare different ensemble configurations +weights = [0.2, 0.4, 0.6, 0.8] +weight_names = ["Long 80%", "Balanced", "Short 60%", "Short 80%"] + +fig, axes = plt.subplots(2, 2, figsize=(15, 10)) +axes = axes.flatten() + +for i, (weight, name) in enumerate(zip(weights, weight_names)): + results = predictor.predict_sequence(sequence=sequence, ensemble_weight=weight) + ax = axes[i] + ax.bar(results['Position'], results['Ensemble_Probability'], alpha=0.7) + ax.set_title(f'{name} (Weight: {weight:.1f}:{1-weight:.1f})') + ax.set_ylabel('Probability') + +plt.tight_layout() +plt.show() +``` + +### Batch Region Prediction +```python +# Predict multiple 399bp sequences +import pandas as pd + +data = pd.DataFrame({ + 'Long_Sequence': ['ATGCGT...' * 60, 'GCTATAG...' * 57] # 399bp sequences +}) + +results = predict_prf(data=data, ensemble_weight=0.4) +print(results[['Ensemble_Probability', 'Ensemble_Weights']].head()) +``` + ## Installation Requirements - Python ≥ 3.7 - Dependencies are automatically handled during installation @@ -36,6 +160,94 @@ cd FScanpy-package pip install -e . ``` +## 🔄 Migration from Previous Versions + +### API Changes Summary +```python +# OLD API (deprecated but still works) +results = predict_prf(sequence="ATGC...", short_weight=0.4) +results = predictor.predict_full(sequence, short_weight=0.4) +results = predictor.predict_region(sequences, short_weight=0.4) + +# NEW API (recommended) +results = predict_prf(sequence="ATGC...", ensemble_weight=0.4) +results = predictor.predict_sequence(sequence, ensemble_weight=0.4) +results = predictor.predict_regions(sequences, ensemble_weight=0.4) + +# Output field changes +# OLD: 'Voting_Probability', 'Weight_Info', '33bp', '399bp' +# NEW: 'Ensemble_Probability', 'Ensemble_Weights', 'Short_Sequence', 'Long_Sequence' + +# Visualization with ensemble weights +results, fig = plot_prf_prediction( + sequence="ATGC...", + short_threshold=0.65, + long_threshold=0.8, + ensemble_weight=0.3 # 30% Short, 70% Long +) +``` + +### Backward Compatibility +- All old methods still work but will show deprecation warnings +- Old field names are automatically added for compatibility +- Gradual migration is supported + +## Ensemble Weight Configuration Guide + +### Recommended Weights for Different Scenarios: + +| Scenario | ensemble_weight | Description | Use Case | +|----------|----------------|-------------|----------| +| **High Sensitivity** | 0.2-0.3 | Long model dominant | Detecting subtle PRF sites | +| **Balanced Detection** | 0.4-0.5 | Balanced ensemble (recommended) | General purpose prediction | +| **Fast Screening** | 0.6-0.7 | Short model dominant | Rapid initial screening | +| **Equal Contribution** | 0.5 | Equal weight to both models | Comparative analysis | + +### Weight Selection Guidelines: +- **Low ensemble_weight (0.2-0.3)**: + - Emphasizes Long model (BiLSTM-CNN) + - Better for detecting complex patterns + - Higher sensitivity, may have more false positives + +- **High ensemble_weight (0.6-0.8)**: + - Emphasizes Short model (HistGB) + - Faster computation + - Good for initial screening + - Higher specificity, may miss subtle sites + +- **Balanced (0.4-0.5)**: + - Recommended for most applications + - Good balance of sensitivity and specificity + - Suitable for comprehensive analysis + +## Output Field Reference + +### Main Prediction Fields +- **`Short_Probability`**: HistGradientBoosting model prediction (0-1) +- **`Long_Probability`**: BiLSTM-CNN model prediction (0-1) +- **`Ensemble_Probability`**: Final ensemble prediction (primary result) +- **`Ensemble_Weights`**: Weight configuration information + +### Sequence Fields +- **`Short_Sequence`**: 33bp sequence used by Short model +- **`Long_Sequence`**: 399bp sequence used by Long model +- **`Position`**: Position in the original sequence +- **`Codon`**: 3bp codon at the position + +### Metadata Fields +- **`Sequence_ID`**: Identifier for multi-sequence predictions +- Additional fields from input DataFrame (for region predictions) + +## Examples + +See `example_plot_prediction.py` for comprehensive examples of: +- Basic prediction plotting +- Custom threshold configuration +- Ensemble weight parameter usage and comparison +- New API method demonstrations +- Saving plots to files +- Advanced visualization options + ## Authors diff --git a/example_plot_prediction.py b/example_plot_prediction.py new file mode 100644 index 0000000..40f8e0c --- /dev/null +++ b/example_plot_prediction.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python3 +""" +FScanpy 序列预测绘图示例 + +展示如何使用新的 plot_prf_prediction 函数绘制序列的移码概率预测结果 +包含集成权重参数的使用示例 +""" + +import matplotlib.pyplot as plt +import os +from FScanpy import plot_prf_prediction, PRFPredictor + +def example_basic_plotting(): + """基础绘图示例""" + print("=" * 50) + print("基础绘图示例") + print("=" * 50) + + # 示例序列(可以替换为您的实际序列) + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + ) + + try: + # 使用默认参数绘图 (0.4:0.6 集成权重比例) + results, fig = plot_prf_prediction( + sequence=example_sequence, + title="示例序列的移码概率预测 (默认集成权重 4:6)" + ) + + print(f"预测完成!共处理 {len(results)} 个位置") + print(f"满足阈值条件的位点数: {len(results[results['Ensemble_Probability'] > 0])}") + print(f"使用集成权重比例: Short模型 0.4, Long模型 0.6") + + # 显示图片 + plt.show() + + return results, fig + + except Exception as e: + print(f"绘图过程中出错: {str(e)}") + return None, None + +def example_custom_ensemble_weights(): + """自定义集成权重示例""" + print("=" * 50) + print("自定义集成权重绘图示例") + print("=" * 50) + + # 示例序列 + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + ) + + # 测试不同的集成权重比例 + weight_configs = [ + (0.2, "Long模型主导 (2:8)"), + (0.5, "等权重组合 (5:5)"), + (0.7, "Short模型主导 (7:3)") + ] + + for ensemble_weight, description in weight_configs: + print(f"\n测试集成权重配置: {description}") + try: + results, fig = plot_prf_prediction( + sequence=example_sequence, + ensemble_weight=ensemble_weight, + title=f"移码概率预测 - {description}", + figsize=(14, 7) + ) + + print(f"预测完成!共处理 {len(results)} 个位置") + print(f"满足阈值条件的位点数: {len(results[results['Ensemble_Probability'] > 0])}") + + # 显示统计信息 + print("预测统计信息:") + print(f" Short模型平均概率: {results['Short_Probability'].mean():.3f}") + print(f" Long模型平均概率: {results['Long_Probability'].mean():.3f}") + print(f" 集成平均概率: {results['Ensemble_Probability'].mean():.3f}") + print(f" 集成权重比例: Short:{ensemble_weight:.1f}, Long:{1-ensemble_weight:.1f}") + + plt.show() + + except Exception as e: + print(f"集成权重 {ensemble_weight} 绘图时出错: {str(e)}") + +def example_ensemble_comparison(): + """集成权重对比示例""" + print("=" * 50) + print("集成权重对比绘图示例") + print("=" * 50) + + # 示例序列 + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + ) + + try: + # 创建预测器实例 + predictor = PRFPredictor() + + # 测试三种不同集成权重 + weights = [0.3, 0.4, 0.6] + weight_names = ["Long主导 (3:7)", "默认权重 (4:6)", "Short主导 (6:4)"] + + # 创建对比图 + fig, axes = plt.subplots(3, 1, figsize=(15, 12)) + fig.suptitle('不同集成权重配置的预测结果对比', fontsize=16) + + all_results = [] + + for i, (weight, name) in enumerate(zip(weights, weight_names)): + # 获取预测结果 + results = predictor.predict_sequence( + sequence=example_sequence, + ensemble_weight=weight + ) + all_results.append(results) + + # 绘制条形图 + ax = axes[i] + ax.bar(results['Position'], results['Ensemble_Probability'], + alpha=0.7, color=f'C{i}', width=2) + ax.set_title(f'{name} - 平均概率: {results["Ensemble_Probability"].mean():.3f}') + ax.set_ylabel('概率') + ax.grid(True, alpha=0.3) + ax.set_ylim(0, 1) + + if i == len(weights) - 1: + ax.set_xlabel('序列位置') + + plt.tight_layout() + plt.show() + + # 打印对比统计 + print("\n集成权重对比统计:") + for i, (weight, name, results) in enumerate(zip(weights, weight_names, all_results)): + print(f"{name}:") + print(f" 平均集成概率: {results['Ensemble_Probability'].mean():.3f}") + print(f" 最大集成概率: {results['Ensemble_Probability'].max():.3f}") + print(f" 非零预测数量: {(results['Ensemble_Probability'] > 0).sum()}") + + return all_results, fig + + except Exception as e: + print(f"集成权重对比时出错: {str(e)}") + return None, None + +def example_save_plot(): + """保存图片示例""" + print("=" * 50) + print("保存图片示例") + print("=" * 50) + + # 创建保存目录 + save_dir = "prediction_plots" + os.makedirs(save_dir, exist_ok=True) + + # 示例序列 + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + "CTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + ) + + try: + # 保存不同集成权重配置的图片 + weight_configs = [ + (0.3, "long_dominant"), + (0.5, "equal_weight"), + (0.7, "short_dominant") + ] + + for ensemble_weight, file_suffix in weight_configs: + save_path = os.path.join(save_dir, f"prediction_{file_suffix}.png") + results, fig = plot_prf_prediction( + sequence=example_sequence, + short_threshold=0.6, + long_threshold=0.75, + ensemble_weight=ensemble_weight, + title=f"移码概率预测 (集成权重 {ensemble_weight:.1f}:{1-ensemble_weight:.1f})", + save_path=save_path, + dpi=300 + ) + + print(f"图片已保存至: {save_path}") + + # 不显示图片,直接关闭 + plt.close(fig) + + print("所有集成权重配置的图片都已保存完成") + return True + + except Exception as e: + print(f"保存图片过程中出错: {str(e)}") + return False + +def example_direct_predictor_usage(): + """直接使用PRFPredictor类的示例""" + print("=" * 50) + print("直接使用PRFPredictor类绘图示例") + print("=" * 50) + + try: + # 直接创建预测器实例 + predictor = PRFPredictor() + + # 示例序列 + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + "AGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCTAGCTAGCTAGCTAG" + ) + + # 使用类方法绘图,展示自定义集成权重 + results, fig = predictor.plot_sequence_prediction( + sequence=example_sequence, + short_threshold=0.65, + long_threshold=0.8, + ensemble_weight=0.3, # 自定义集成权重 + title="使用PRFPredictor类的绘图结果 (集成权重 3:7)" + ) + + print(f"预测完成!共处理 {len(results)} 个位置") + print(f"使用集成权重比例: Short:{0.3:.1f}, Long:{0.7:.1f}") + + # 显示详细结果 + print("\n前10个预测结果:") + columns_to_show = ['Position', 'Short_Probability', 'Long_Probability', 'Ensemble_Probability'] + print(results[columns_to_show].head(10)) + + # 显示集成权重信息 + if 'Ensemble_Weights' in results.columns: + print(f"\n集成权重配置: {results['Ensemble_Weights'].iloc[0]}") + + plt.show() + + return results, fig + + except Exception as e: + print(f"使用PRFPredictor类时出错: {str(e)}") + return None, None + +def example_new_api_usage(): + """新API使用示例""" + print("=" * 50) + print("新API方法使用示例") + print("=" * 50) + + try: + # 直接创建预测器实例 + predictor = PRFPredictor() + + # 示例序列 + example_sequence = ( + "ATGCGTACGTTAGCGATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC" + "GATCGATCGTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCGATCGATCGTAGCT" + ) + + print("1. 使用新的 predict_sequence() 方法:") + results = predictor.predict_sequence( + sequence=example_sequence, + ensemble_weight=0.3 + ) + + print(f" 序列预测完成: {len(results)} 个位置") + print(f" 主要输出字段: {[col for col in results.columns if 'Probability' in col]}") + + print("\n2. 使用新的 predict_regions() 方法:") + # 模拟一些399bp区域序列 + region_sequences = [example_sequence + "A" * (399 - len(example_sequence))] + region_results = predictor.predict_regions( + sequences=region_sequences, + ensemble_weight=0.4 + ) + + print(f" 区域预测完成: {len(region_results)} 个序列") + print(f" 主要输出字段: {[col for col in region_results.columns if 'Probability' in col or 'Sequence' in col]}") + + # 显示统计 + print("\n3. 结果统计:") + print(f" 序列预测平均集成概率: {results['Ensemble_Probability'].mean():.3f}") + print(f" 区域预测平均集成概率: {region_results['Ensemble_Probability'].mean():.3f}") + + return results, region_results + + except Exception as e: + print(f"新API使用时出错: {str(e)}") + return None, None + +def main(): + """主函数""" + print("FScanpy 序列预测绘图功能演示") + print("=" * 60) + print("新功能:规范化的集成权重参数 (ensemble_weight)") + print("权重范围:0.0 到 1.0 (对应 Short模型的权重,Long模型权重 = 1 - ensemble_weight)") + print("新命名:Ensemble_Probability 替代 Voting_Probability") + print("=" * 60) + + examples = [ + ("1. 基础绘图示例", example_basic_plotting), + ("2. 自定义集成权重示例", example_custom_ensemble_weights), + ("3. 集成权重对比示例", example_ensemble_comparison), + ("4. 保存图片示例", example_save_plot), + ("5. 直接使用PRFPredictor类示例", example_direct_predictor_usage), + ("6. 新API方法使用示例", example_new_api_usage) + ] + + for name, func in examples: + print(f"\n{name}") + try: + result = func() + if result is not None and result != False: + print("✓ 示例执行成功") + else: + print("✗ 示例执行失败") + except Exception as e: + print(f"✗ 示例执行出错: {str(e)}") + + print("-" * 50) + + print("\n演示完成!") + print("\n📊 新功能总结:") + print("1. plot_prf_prediction(): 便捷的绘图函数") + print("2. PRFPredictor.plot_sequence_prediction(): 类方法绘图") + print("3. PRFPredictor.predict_sequence(): 序列滑动窗口预测(替代predict_full)") + print("4. PRFPredictor.predict_regions(): 区域批量预测(替代predict_region)") + print("5. 支持自定义阈值、标题、保存路径等参数") + print("6. 新增 ensemble_weight 参数,可调节两个模型的集成权重比例") + print("\n⚖️ 集成权重示例:") + print(" - ensemble_weight=0.2: Short模型20%, Long模型80% (Long主导)") + print(" - ensemble_weight=0.4: Short模型40%, Long模型60% (默认平衡)") + print(" - ensemble_weight=0.5: Short模型50%, Long模型50% (等权重)") + print(" - ensemble_weight=0.7: Short模型70%, Long模型30% (Short主导)") + print("\n📂 输出字段:") + print(" - Short_Probability: Short模型(HistGB)预测概率") + print(" - Long_Probability: Long模型(BiLSTM-CNN)预测概率") + print(" - Ensemble_Probability: 集成预测概率(主要结果)") + print(" - Ensemble_Weights: 权重配置信息") + print(" - Short_Sequence: 33bp序列") + print(" - Long_Sequence: 399bp序列") + print("7. 自动保存PNG和PDF两种格式") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tutorial/tutorial.md b/tutorial/tutorial.md index 4575607..c1dac55 100644 --- a/tutorial/tutorial.md +++ b/tutorial/tutorial.md @@ -7,7 +7,7 @@ FScanpy is a Python package designed to predict Programmed Ribosomal Frameshifti FScanpy is a Python package dedicated to predicting Programmed Ribosomal Frameshifting (PRF) sites in DNA sequences. It integrates machine learning models (Gradient Boosting and BiLSTM-CNN) along with the FScanR package to furnish precise PRF predictions. Users are capable of employing three types of data as input: the entire cDNA/mRNA sequence that requires prediction, the nucleotide sequence in the vicinity of the suspected frameshift site, and the peptide library blastx results of the species or related species. It anticipates the input sequence to be in the + strand and can be integrated with FScanR to augment the accuracy. ![Machine learning models](/image/ML.png) -For the prediction of the entire sequence, FScanpy adopts a sliding window approach to scan the entire sequence and predict the PRF sites. For regional prediction, it is based on the 33-bp and 399-bp sequences in the 0 reading frame around the suspected frameshift site. Initially, the Gradient Boosting model will predict the potential PRF sites within the scanning window. If the predicted probability exceeds the threshold, the BiLSTM-CNN model will predict the PRF sites in the 399bp sequence.Then,VotingClassifier will combine the two models to make the final prediction. +For the prediction of the entire sequence, FScanpy adopts a sliding window approach to scan the entire sequence and predict the PRF sites. For regional prediction, it is based on the 33-bp and 399-bp sequences in the 0 reading frame around the suspected frameshift site. Initially, the Short model (HistGradientBoosting) will predict the potential PRF sites within the scanning window. If the predicted probability exceeds the threshold, the Long model (BiLSTM-CNN) will predict the PRF sites in the 399bp sequence. Then, ensemble weighting combines the two models to make the final prediction. For PRF detection from BLASTX output, [FScanR](https://github.com/seanchen607/FScanR.git) identifies potential PRF sites from BLASTX alignment results, acquires the two hits of the same query sequence, and then utilizes frameDist_cutoff, mismatch_cutoff, and evalue_cutoff to filter the hits. Finally, FScanpy is utilized to predict the probability of PRF sites. @@ -17,8 +17,8 @@ For PRF detection from BLASTX output, [FScanR](https://github.com/seanchen607/FS ### Key features of FScanpy include: - Integration of two predictive models: - - [Gradient Boosting](https://tensorflow.google.cn/tutorials/estimator/boosted_trees?hl=en): Analyzes local sequence features centered around potential frameshift sites (10 codons). - - [BiLSTM-CNN](https://paperswithcode.com/method/cnn-bilstm): Analyzes broader sequence features (100 codons). + - Short Model (HistGradientBoosting): Analyzes local sequence features centered around potential frameshift sites (33bp). + - Long Model (BiLSTM-CNN): Analyzes broader sequence features (399bp). - Supports PRF prediction across various species. - Can be combined with [FScanR](https://github.com/seanchen607/FScanR.git) for enhanced accuracy. @@ -29,7 +29,7 @@ For PRF detection from BLASTX output, [FScanR](https://github.com/seanchen607/FS pip install FScanpy ``` -### 2. Clone from [GitHub](https://github.com/.../FScanpy.git) +### 2. Clone from GitHub ```bash git clone https://github.com/.../FScanpy.git cd your_project_directory @@ -39,7 +39,6 @@ pip install -e . ## Methods and Usage ### 1. Load model and test data -Test data can be found in `FScanpy/data/test_data`,you can use the `list_test_data()` method to list all the test data and the `get_test_data_path()` method to get the path of the test data: ```python from FScanpy import PRFPredictor from FScanpy.data import get_test_data_path, list_test_data @@ -51,56 +50,38 @@ region_example = get_test_data_path('region_example.xlsx') ``` ### 2. Predict PRF Sites in a Full Sequence -Use the `predict_full()` method to scan the entire sequence,you can use the `window_size` parameter to adjust the scanning window size(default is 3) and the `gb_threshold` parameter to adjust the Gradient Boosting model fitting threshold(default is 0.1) for faster or more accurate prediction: +Use the `predict_sequence()` method to scan the entire sequence: ```python - ''' - Args: - sequence: mRNA sequence - window_size: scanning window size (default is 3) - gb_threshold: Gradient Boosting model threshold (default is 0.1) - Returns: - results: DataFrame containing prediction probabilities - ''' -results = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT', - window_size=3, # Scanning window size - gb_threshold=0.1, # Gradient Boosting model threshold - plot=True) # Whether to plot the prediction results -fig.savefig('predict_full.png') +results = predictor.predict_sequence( + sequence='ATGCGTACGTATGCGTACGTATGCGTACGT', + window_size=3, # Scanning window size + short_threshold=0.1, # Short model threshold + ensemble_weight=0.4 # Ensemble weight (Short:Long = 0.4:0.6) +) + +# With visualization +results, fig = predictor.plot_sequence_prediction( + sequence='ATGCGTACGTATGCGTACGTATGCGTACGT', + ensemble_weight=0.4 +) ``` ### 3. Predict PRF in Specific Regions -Use the `predict_region()` method to predict PRF in known regions of interest: +Use the `predict_regions()` method to predict PRF in known regions of interest: ```python - ''' - Args: - seq: 399bp sequence - gb_threshold: GB model probability threshold (default is 0.1) - Returns: - DataFrame: 包含所有序列预测概率的DataFrame - ''' import pandas as pd region_example = pd.read_excel(get_test_data_path('region_example.xlsx')) -results = predictor.predict_region(seq=region_example['399bp']) +results = predictor.predict_regions( + sequences=region_example['399bp'], + ensemble_weight=0.4 +) ``` ### 4. Identify PRF Sites from BLASTX Output BLASTX Output should contain the following columns: `qseqid`, `sseqid`, `pident`, `length`, `mismatch`, `gapopen`, `qstart`, `qend`, `sstart`, `send`, `evalue`, `bitscore`, `qframe`, `sframe`. -FScanR result contains `DNA_seqid`, `FS_start`, `FS_end`, `FS_type`,`Pep_seqid`, `Pep_FS_start`, `Pep_FS_end`, `Strand` columns. Use the FScanR function to identify potential PRF sites from BLASTX alignment results: ```python - """ - identify PRF sites from BLASTX output - - Args: - blastx_output: BLASTX output DataFrame - mismatch_cutoff: mismatch threshold - evalue_cutoff: E-value threshold - frameDist_cutoff: frame distance threshold - - Returns: - pd.DataFrame: DataFrame containing PRF site information - """ from FScanpy.utils import fscanr blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx')) fscanr_result = fscanr(blastx_output, @@ -109,56 +90,43 @@ fscanr_result = fscanr(blastx_output, frameDist_cutoff=10) # Frame distance threshold ``` -### 5. Extract PRF Sites from BLASTX Output or your Sequence Data and evaluate it by FScanpy -Use the `extract_prf_regions()` method to extract PRF site sequences from mRNA sequences,it based on the `FS_start` column of the FScanR output contact with the `DNA_seqid` column of the input mRNA sequence file to extract the 33bp and 399bp sequences around the PRF sites in 0 reading frame: +### 5. Extract PRF Sites and Evaluate +Use the `extract_prf_regions()` method to extract PRF site sequences from mRNA sequences: ```python - """ - extract PRF site sequences from mRNA sequences - - Args: - mrna_file: mRNA sequence file path (FASTA format) - prf_data: FScanR output PRF site data or your suspected PRF site data which at least contains `DNA_seqid` `FS_start` `strand` columns - - Returns: - pd.DataFrame: DataFrame containing 33bp and 399bp sequences - """ from FScanpy.utils import extract_prf_regions -prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'), - prf_data=fscanr_result) -prf_results = predictor.predict_region (prf_regions['399bp']) +prf_regions = extract_prf_regions( + mrna_file=get_test_data_path('mrna_example.fasta'), + prf_data=fscanr_result +) +prf_results = predictor.predict_regions(prf_regions['399bp']) ``` -## Total Test +## Complete Workflow Example ```python -from FScanpy import PRFPredictor +from FScanpy import PRFPredictor, predict_prf, plot_prf_prediction from FScanpy.data import get_test_data_path, list_test_data -predictor = PRFPredictor() # load model -list_test_data() # list all the test data -blastx_file = get_test_data_path('blastx_example.xlsx') -mrna_file = get_test_data_path('mrna_example.fasta') -region_example = get_test_data_path('region_example.xlsx') - -results = predictor.predict_full(sequence='ATGCGTACGTATGCGTACGTATGCGTACGT', - window_size=3, # Scanning window size - gb_threshold=0.1, # Gradient Boosting model threshold - plot=True) - +from FScanpy.utils import fscanr, extract_prf_regions import pandas as pd -region_example = pd.read_excel(get_test_data_path('region_example.xlsx')) -results = predictor.predict_region(seq=region_example['399bp']) -from FScanpy.utils import fscanr +# Initialize predictor +predictor = PRFPredictor() + +# Method 1: Sequence prediction +sequence = 'ATGCGTACGTATGCGTACGTATGCGTACGT' +results = predict_prf(sequence=sequence, ensemble_weight=0.4) + +# Method 2: Region prediction +region_data = pd.read_excel(get_test_data_path('region_example.xlsx')) +results = predict_prf(data=region_data, ensemble_weight=0.4) + +# Method 3: BLASTX pipeline blastx_output = pd.read_excel(get_test_data_path('blastx_example.xlsx')) -fscanr_result = fscanr(blastx_output, - mismatch_cutoff=10, # Allowed mismatches - evalue_cutoff=1e-5, # E-value threshold - frameDist_cutoff=10) - -from FScanpy.utils import extract_prf_regions -prf_regions = extract_prf_regions(mrna_file=get_test_data_path('mrna_example.fasta'), - prf_data=fscanr_result) -prf_results = predictor.predict_region (prf_regions['399bp']) +fscanr_result = fscanr(blastx_output, mismatch_cutoff=10, evalue_cutoff=1e-5, frameDist_cutoff=10) +prf_regions = extract_prf_regions(get_test_data_path('mrna_example.fasta'), fscanr_result) +prf_results = predictor.predict_regions(prf_regions['399bp']) +# Visualization +results, fig = plot_prf_prediction(sequence, ensemble_weight=0.4, save_path='prediction.png') ``` ## Citation