From aa520b6238764a7eec0c4d7b9419b56fa63eff11 Mon Sep 17 00:00:00 2001
From: yyh
Date: Sun, 17 Aug 2025 15:30:14 +0800
Subject: [PATCH] Upload files to train_models

---
 train_models/__init__.cpython-310.pyc   | Bin 0 -> 150 bytes
 train_models/bilstm_cnn.cpython-310.pyc | Bin 0 -> 10774 bytes
 train_models/bilstm_cnn.py              | 507 ++++++++++++++++++++++++
 train_models/hist_gb.cpython-310.pyc    | Bin 0 -> 7598 bytes
 train_models/hist_gb.py                 | 344 ++++++++++++++++
 5 files changed, 851 insertions(+)
 create mode 100644 train_models/__init__.cpython-310.pyc
 create mode 100644 train_models/bilstm_cnn.cpython-310.pyc
 create mode 100644 train_models/bilstm_cnn.py
 create mode 100644 train_models/hist_gb.cpython-310.pyc
 create mode 100644 train_models/hist_gb.py

diff --git a/train_models/__init__.cpython-310.pyc b/train_models/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e58686f74f62f2d270ff40797f252522af207b0a
Binary files /dev/null and b/train_models/__init__.cpython-310.pyc differ
diff --git a/train_models/bilstm_cnn.cpython-310.pyc b/train_models/bilstm_cnn.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62edabb45fc90ed2affa6c80325f50f35b24d8ef
Binary files /dev/null and b/train_models/bilstm_cnn.cpython-310.pyc differ
diff --git a/train_models/bilstm_cnn.py b/train_models/bilstm_cnn.py
new file mode 100644
--- /dev/null
+++ b/train_models/bilstm_cnn.py
@@ -0,0 +1,507 @@
+"""BiLSTM-CNN sequence classifier trained with iterative self-training."""
+
+# The imports and the first half of MetricsCallback below are reconstructed
+# from how the surviving code uses them (the original hunk was corrupted);
+# the shared-helper module names are assumptions, not original code.
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from tensorflow.keras import layers, models
+from sklearn.metrics import (accuracy_score, f1_score, log_loss,
+                             precision_score, recall_score, roc_auc_score)
+
+from base_config import BaseConfig
+from data_utils import (evaluate_model_cnn, load_data, save_training_info,
+                        select_low_confidence_samples_cnn)
+
+class MetricsCallback(tf.keras.callbacks.Callback):
+    """Track per-epoch train/test metrics and remember the best weights."""
+
+    def __init__(self):
+        super().__init__()
+        metric_names = ['loss', 'auc', 'accuracy', 'recall', 'precision', 'f1']
+        self.training_metrics = {f'train_{m}': [] for m in metric_names}
+        self.training_metrics.update({f'test_{m}': [] for m in metric_names})
+        self.xu_metrics_history = {m: [] for m in metric_names}
+        self.atkins_metrics_history = {m: [] for m in metric_names}
+        self.best_test_loss = float('inf')
+        self.best_epoch = -1
+        self.best_model = None
+        self.best_predictions = None
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        try:
+            # Training metrics come straight from the Keras logs; metrics not
+            # compiled into the model default to 0.0.
+            train_metrics = {m: logs.get(m, 0.0)
+                             for m in ['loss', 'auc', 'accuracy', 'recall',
+                                       'precision', 'f1']}
+
+            # Score the held-out test set in fixed-size batches.
+            try:
+                n_test = len(self.model.X_test)
+                test_probs = np.zeros(n_test, dtype=np.float32)
+                batch_size = 1024
+                for start_idx in range(0, n_test, batch_size):
+                    end_idx = min(start_idx + batch_size, n_test)
+                    batch_probs = self.model.predict(
+                        self.model.X_test[start_idx:end_idx], verbose=0)
+                    if len(batch_probs.shape) > 1:
+                        batch_probs = batch_probs.flatten()
+                    test_probs[start_idx:end_idx] = batch_probs
+
+                test_preds = (test_probs > 0.5).astype(int)
+
+                test_metrics = {
+                    # Clip probabilities: log_loss is undefined at exactly 0 or 1.
+                    'loss': log_loss(self.model.y_test,
+                                     np.clip(test_probs, 1e-15, 1 - 1e-15)),
+                    # AUC requires both classes in y_test, not in the scores.
+                    'auc': (roc_auc_score(self.model.y_test, test_probs)
+                            if len(np.unique(self.model.y_test)) > 1 else 0.5),
+                    'accuracy': accuracy_score(self.model.y_test, test_preds),
+                    'recall': recall_score(self.model.y_test, test_preds, zero_division=0),
+                    'precision': precision_score(self.model.y_test, test_preds, zero_division=0),
+                    'f1': f1_score(self.model.y_test, test_preds, zero_division=0)
+                }
+
+            except Exception:
+                test_metrics = {
+                    'loss': float('inf'), 'auc': 0.0, 'accuracy': 0.0,
+                    'recall': 0.0, 'precision': 0.0, 'f1': 0.0
+                }
+
+            # Record metrics
+            for key in self.training_metrics:
+                if key.startswith('train_'):
+                    metric_name = key[6:]  # strip 'train_'
+                    self.training_metrics[key].append(train_metrics.get(metric_name, 0.0))
+                elif key.startswith('test_'):
+                    metric_name = key[5:]  # strip 'test_'
+                    self.training_metrics[key].append(test_metrics.get(metric_name, 0.0))
+
+            # Update best model based on test loss
+            if test_metrics['loss'] < self.best_test_loss:
+                self.best_test_loss = test_metrics['loss']
+                self.best_epoch = epoch
+                self.best_model = tf.keras.models.clone_model(self.model)
+                self.best_model.set_weights(self.model.get_weights())
+                self.best_predictions = test_probs.copy()
+
+            # Evaluate external validation sets if available
+            if hasattr(self.model, 'X_xu') and self.model.X_xu is not None:
+                xu_metrics = evaluate_model_cnn(self.model, self.model.X_xu, self.model.y_xu)
+                for key in self.xu_metrics_history:
+                    self.xu_metrics_history[key].append(xu_metrics.get(key, 0.0))
+
+            if hasattr(self.model, 'X_atkins') and self.model.X_atkins is not None:
+                atkins_metrics = evaluate_model_cnn(self.model, self.model.X_atkins, self.model.y_atkins)
+                for key in self.atkins_metrics_history:
+                    self.atkins_metrics_history[key].append(atkins_metrics.get(key, 0.0))
+
+        except Exception:
+            pass
+
+    def on_train_end(self, logs=None):
+        # Restore the best weights observed during this fit() call.
+        if self.best_model is not None:
+            self.model.set_weights(self.best_model.get_weights())
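+
+# MetricsCallback reads its evaluation data from attributes attached to the
+# model object itself (model.X_test / model.y_test, plus the optional
+# model.X_xu and model.X_atkins external sets); train_bilstm_cnn_model below
+# attaches them before calling fit().
+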
+class Config:
+    """Model configuration parameters"""
+    NEG_SAMPLES = 20000
+    CONFIDENCE_THRESHOLD = 0.5
+    EMBEDDING_DIM = 64
+    LSTM_UNITS = 64
+    CNN_FILTERS = 64
+    CNN_KERNEL_SIZES = [3, 5, 7]
+    DROPOUT_RATE = 0.5
+    LEARNING_RATE = 1e-4
+    BATCH_SIZE = 1024
+    EPOCHS = 5
+    INITIAL_EPOCHS = 5
+    SELF_TRAINING_EPOCHS = 1
+    MAX_ITERATIONS = 20
+    EARLY_STOPPING_PATIENCE = 5
+    SEQUENCE_LEN = 399
+
+def process_sequence(seq, max_length=399):
+    """Truncate a sequence to at most max_length bases."""
+    return seq[:max_length] if len(seq) > max_length else seq
+
+def encode_sequence(seq, max_length=399):
+    """Integer-encode a sequence (A/T/C/G -> 1..4, anything else -> 0),
+    zero-padded to max_length."""
+    mapping = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
+    encoded = [mapping.get(base, 0) for base in seq.upper()]
+    if len(encoded) < max_length:
+        encoded.extend([0] * (max_length - len(encoded)))
+    return encoded[:max_length]
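+
+# For example (illustrative):
+#   encode_sequence("ATCG", max_length=8) -> [1, 2, 3, 4, 0, 0, 0, 0]
+#   encode_sequence("ATNG", max_length=4) -> [1, 2, 0, 4]   # unknown bases -> 0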
+
+def trim_sequence(seq, target_length):
+    """Trim a sequence symmetrically from both ends down to target_length."""
+    if len(seq) <= target_length:
+        return seq
+
+    excess = len(seq) - target_length
+    left_trim = excess // 2
+    right_trim = excess - left_trim
+
+    return seq[left_trim:len(seq) - right_trim]
+
+def prepare_data(train_data, test_data=None, low_conf_data=None, max_length=399):
+    """Encode training, test and optional low-confidence frames into arrays."""
+    # Process training data
+    train_sequences = []
+    train_labels = []
+    sample_weights = []
+
+    for _, row in train_data.iterrows():
+        seq = process_sequence(row['full_seq'], max_length)
+        encoded_seq = encode_sequence(seq, max_length)
+        train_sequences.append(encoded_seq)
+        train_labels.append(row['label'])
+
+        # Default weight of 1.0 unless the row carries an explicit weight
+        weight = 1.0
+        if 'sample_weight' in row and pd.notna(row['sample_weight']):
+            weight = row['sample_weight']
+        sample_weights.append(weight)
+
+    X_train = np.array(train_sequences)
+    y_train = np.array(train_labels)
+    sample_weights = np.array(sample_weights)
+
+    # Process test data
+    X_test = y_test = None
+    if test_data is not None and not test_data.empty:
+        test_sequences = []
+        test_labels = []
+
+        for _, row in test_data.iterrows():
+            seq = process_sequence(row['full_seq'], max_length)
+            encoded_seq = encode_sequence(seq, max_length)
+            test_sequences.append(encoded_seq)
+            test_labels.append(row['label'])
+
+        X_test = np.array(test_sequences)
+        y_test = np.array(test_labels)
+
+    # Process low confidence data
+    X_low_conf = y_low_conf = None
+    if low_conf_data is not None and not low_conf_data.empty:
+        low_conf_sequences = []
+        low_conf_labels = []
+
+        for _, row in low_conf_data.iterrows():
+            seq = process_sequence(row['full_seq'], max_length)
+            encoded_seq = encode_sequence(seq, max_length)
+            low_conf_sequences.append(encoded_seq)
+            low_conf_labels.append(row['label'])
+
+        X_low_conf = np.array(low_conf_sequences)
+        y_low_conf = np.array(low_conf_labels)
+
+    return X_train, y_train, X_test, y_test, sample_weights, X_low_conf, y_low_conf
+
+def create_bilstm_cnn_model(input_shape):
+    """Create BiLSTM-CNN model"""
+    input_layer = layers.Input(shape=input_shape)
+
+    # Embedding layer (vocabulary: 0 = padding/unknown, 1-4 = A/T/C/G)
+    embedding = layers.Embedding(
+        input_dim=5,
+        output_dim=Config.EMBEDDING_DIM,
+        input_length=input_shape[0]
+    )(input_layer)
+
+    # BiLSTM layer
+    lstm_out = layers.Bidirectional(
+        layers.LSTM(Config.LSTM_UNITS, return_sequences=True, dropout=Config.DROPOUT_RATE)
+    )(embedding)
+
+    # Parallel CNN branches, one per kernel size
+    cnn_outputs = []
+    for kernel_size in Config.CNN_KERNEL_SIZES:
+        cnn = layers.Conv1D(
+            filters=Config.CNN_FILTERS,
+            kernel_size=kernel_size,
+            activation='relu',
+            padding='same'
+        )(lstm_out)
+        cnn = layers.GlobalMaxPooling1D()(cnn)
+        cnn_outputs.append(cnn)
+
+    # Concatenate CNN outputs
+    if len(cnn_outputs) > 1:
+        concat = layers.Concatenate()(cnn_outputs)
+    else:
+        concat = cnn_outputs[0]
+
+    # Dense layers
+    dense = layers.Dense(128, activation='relu')(concat)
+    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)
+    dense = layers.Dense(64, activation='relu')(dense)
+    dense = layers.Dropout(Config.DROPOUT_RATE)(dense)
+
+    # Output layer
+    output = layers.Dense(1, activation='sigmoid')(dense)
+
+    model = models.Model(inputs=input_layer, outputs=output)
+
+    # Compile model; explicit metric objects guarantee the log keys ('auc',
+    # 'recall') that MetricsCallback reads.
+    model.compile(
+        optimizer=tf.keras.optimizers.Adam(learning_rate=Config.LEARNING_RATE),
+        loss='binary_crossentropy',
+        metrics=['accuracy',
+                 tf.keras.metrics.AUC(name='auc'),
+                 tf.keras.metrics.Recall(name='recall')]
+    )
+
+    return model
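+
+# Shape walkthrough with the defaults above (batch dimension omitted):
+#   input (399,) -> Embedding -> (399, 64) -> BiLSTM -> (399, 128)
+#   -> 3 x [Conv1D(64, k) + GlobalMaxPooling1D] -> concat -> (192,)
+#   -> Dense(128) -> Dense(64) -> Dense(1, sigmoid)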
+
+def train_bilstm_cnn_model(X_train, y_train, X_test, y_test, sample_weights=None,
+                           X_xu=None, y_xu=None, X_atkins=None, y_atkins=None):
+    """Train BiLSTM-CNN model with iterative self-training."""
+
+    # Create model
+    input_shape = (X_train.shape[1],)
+    model = create_bilstm_cnn_model(input_shape)
+
+    # Store validation data in model for callback access
+    model.X_test = X_test
+    model.y_test = y_test
+    model.X_xu = X_xu
+    model.y_xu = y_xu
+    model.X_atkins = X_atkins
+    model.y_atkins = y_atkins
+
+    # Callbacks for the initial fit
+    metrics_callback = MetricsCallback()
+
+    early_stopping = tf.keras.callbacks.EarlyStopping(
+        monitor='val_loss',
+        patience=Config.EARLY_STOPPING_PATIENCE,
+        restore_best_weights=True,
+        verbose=0
+    )
+
+    # Hold out a random validation split from the training data
+    val_split = 0.2
+    n_val = int(len(X_train) * val_split)
+    indices = np.random.permutation(len(X_train))
+    train_indices = indices[n_val:]
+    val_indices = indices[:n_val]
+
+    X_train_split = X_train[train_indices]
+    y_train_split = y_train[train_indices]
+    X_val_split = X_train[val_indices]
+    y_val_split = y_train[val_indices]
+
+    if sample_weights is not None:
+        sample_weights_split = sample_weights[train_indices]
+    else:
+        sample_weights_split = None
+
+    # Initial training
+    model.fit(
+        X_train_split, y_train_split,
+        validation_data=(X_val_split, y_val_split),
+        epochs=Config.INITIAL_EPOCHS,
+        batch_size=Config.BATCH_SIZE,
+        sample_weight=sample_weights_split,
+        callbacks=[metrics_callback, early_stopping],
+        verbose=0
+    )
+
+    # Store initial training info
+    initial_info = {
+        'best_test_loss': metrics_callback.best_test_loss,
+        'best_epoch': metrics_callback.best_epoch,
+        'training_metrics': metrics_callback.training_metrics.copy()
+    }
+
+    # Self-training iterations
+    current_X_train = X_train.copy()
+    current_y_train = y_train.copy()
+    current_weights = sample_weights.copy() if sample_weights is not None else None
+
+    iteration_metrics = {
+        'iteration': [0],
+        'train_loss': [metrics_callback.training_metrics['train_loss'][-1]],
+        'test_loss': [metrics_callback.training_metrics['test_loss'][-1]],
+        'samples_added': [0],
+        'total_samples': [len(current_X_train)]
+    }
+
+    if X_xu is not None:
+        xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
+        iteration_metrics['xu_loss'] = [xu_metrics['loss']]
+
+    if X_atkins is not None:
+        atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
+        iteration_metrics['atkins_loss'] = [atkins_metrics['loss']]
+
+    best_model = tf.keras.models.clone_model(model)
+    best_model.set_weights(model.get_weights())
+    best_loss = metrics_callback.best_test_loss
+    best_iteration = 0
+
+    # Load low confidence data for self-training
+    _, _, low_conf_data, _, _ = load_data()
+
+    if low_conf_data is not None and not low_conf_data.empty:
+        X_unlabeled, _, _, _, _, _, _ = prepare_data(
+            low_conf_data, pd.DataFrame(), max_length=Config.SEQUENCE_LEN
+        )
+
+        for iteration in range(1, Config.MAX_ITERATIONS + 1):
+            # Select low confidence samples
+            selected_samples = select_low_confidence_samples_cnn(
+                model, X_unlabeled, low_conf_data
+            )
+
+            if selected_samples.empty:
+                break
+
+            # Prepare selected samples
+            X_selected, y_selected, _, _, weights_selected, _, _ = prepare_data(
+                selected_samples, pd.DataFrame(), max_length=Config.SEQUENCE_LEN
+            )
+
+            if len(X_selected) == 0:
+                break
+
+            # Add to training set
+            current_X_train = np.vstack([current_X_train, X_selected])
+            current_y_train = np.hstack([current_y_train, y_selected])
+
+            if current_weights is not None:
+                current_weights = np.hstack([current_weights, weights_selected])
+
+            # Retrain model on the enlarged training set
+            metrics_callback = MetricsCallback()
+
+            # Split updated training data
+            n_val = int(len(current_X_train) * val_split)
+            indices = np.random.permutation(len(current_X_train))
+            train_indices = indices[n_val:]
+            val_indices = indices[:n_val]
+
+            X_train_split = current_X_train[train_indices]
+            y_train_split = current_y_train[train_indices]
+            X_val_split = current_X_train[val_indices]
+            y_val_split = current_y_train[val_indices]
+
+            if current_weights is not None:
+                sample_weights_split = current_weights[train_indices]
+            else:
+                sample_weights_split = None
+
+            model.fit(
+                X_train_split, y_train_split,
+                validation_data=(X_val_split, y_val_split),
+                epochs=Config.SELF_TRAINING_EPOCHS,
+                batch_size=Config.BATCH_SIZE,
+                sample_weight=sample_weights_split,
+                callbacks=[metrics_callback, early_stopping],
+                verbose=0
+            )
+
+            # Record iteration metrics
+            iteration_metrics['iteration'].append(iteration)
+            iteration_metrics['train_loss'].append(metrics_callback.training_metrics['train_loss'][-1])
+            iteration_metrics['test_loss'].append(metrics_callback.training_metrics['test_loss'][-1])
+            iteration_metrics['samples_added'].append(len(X_selected))
+            iteration_metrics['total_samples'].append(len(current_X_train))
+
+            if X_xu is not None:
+                xu_metrics = evaluate_model_cnn(model, X_xu, y_xu)
+                iteration_metrics['xu_loss'].append(xu_metrics['loss'])
+
+            if X_atkins is not None:
+                atkins_metrics = evaluate_model_cnn(model, X_atkins, y_atkins)
+                iteration_metrics['atkins_loss'].append(atkins_metrics['loss'])
+
+            # Update best model
+            current_loss = metrics_callback.training_metrics['test_loss'][-1]
+            if current_loss < best_loss:
+                best_model = tf.keras.models.clone_model(model)
+                best_model.set_weights(model.get_weights())
+                best_loss = current_loss
+                best_iteration = iteration
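+
+    # The loop above stops as soon as the selector returns no new samples;
+    # otherwise it runs the full Config.MAX_ITERATIONS rounds.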
+
+    # Final evaluation
+    final_metrics = evaluate_model_cnn(best_model, X_test, y_test)
+
+    training_info = {
+        'initial_info': initial_info,
+        'iteration_metrics': iteration_metrics,
+        'best_iteration': best_iteration,
+        'final_metrics': final_metrics
+    }
+
+    return best_model, model, training_info
+
+def main():
+    """Main training function"""
+    # Load data
+    train_data, test_data, low_conf_data, xu_data, atkins_data = load_data()
+
+    # Prepare data
+    X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
+        train_data, test_data, max_length=Config.SEQUENCE_LEN
+    )
+
+    # Prepare external validation data
+    X_xu = y_xu = X_atkins = y_atkins = None
+
+    if xu_data is not None and not xu_data.empty:
+        X_xu, y_xu, _, _, _, _, _ = prepare_data(
+            xu_data, pd.DataFrame(), max_length=Config.SEQUENCE_LEN
+        )
+
+    if atkins_data is not None and not atkins_data.empty:
+        X_atkins, y_atkins, _, _, _, _, _ = prepare_data(
+            atkins_data, pd.DataFrame(), max_length=Config.SEQUENCE_LEN
+        )
+
+    # Train model
+    best_model, final_model, training_info = train_bilstm_cnn_model(
+        X_train, y_train, X_test, y_test, sample_weights,
+        X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
+    )
+
+    # Save results
+    save_training_info(best_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "best")
+    save_training_info(final_model, training_info, BaseConfig.BILSTM_MODEL_DIR, "final", is_final_model=True)
+
+    return best_model, final_model, training_info
+
+if __name__ == "__main__":
+    BaseConfig.create_directories()
+    main()
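+
+# Intended entry point (run from the repository root):
+#   python train_models/bilstm_cnn.py
+# Training writes the "best" and "final" checkpoints to
+# BaseConfig.BILSTM_MODEL_DIR via save_training_info().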
ztxZfYe&|~&XHcT!7#V#p);3wpSM^OJF0DcfZ1F9YvZs2v>g7-M>Z(^4hA6G@)JRyu z6Mb1ft9^7JEhPrtbE%n_Nom!(%jB5^tOhAQDbSOc$F;R%!W!4s-bnOB7xn>7l<)8Z z+NQmwrPid+ne^F-mDtN1xL+w~G^c3%0jTMqrpYyYYaY;gHBhafeNR+-RW_OnZ?`v-@4YD%T-hKzwf5QjrS$tGl zq^IR%d>~v~2EOpHs*Lxe%ozCb?aWy1hdo)wbk_!p&wFPtT+OU@zccLiFsI!gtY_tg z`|V&r`m*J?m0qyoMSgeC38Ku|-fdQn$6It~p4yk&kipt8lsKLOGt|>^rlXNr_lIil zS;bd-${Q@$mLYirtINtAT!9*V4bA8q0G#h-^s{-L+st9AaQ+GZpE+xeRaM_CtFk&X zpP=_i!>;o>vp?Zq$anF1n|3{ZW>A?(%s-$JYpX0RC$O7C7F809-m;Efo|a=BZv&-- zVxm|mHcA=ANy>shv{NT>Rx61Ho`VX0s533DCJsr?hMragORG6nQid7F>j`f8Pt^K)SU~;7j6t56V7p%QPUs6Sv)}gf#TKUKG@EzgSkwdCR*Oge$k>Y<6mvvX7D1 z(ldj2&E@wY2|3raVBm66ZYAeQHQC+u%;#BcIqdlzuN%NTx1-iBr9$N)T)8|uLZ`6>e$nu{O!iiJD=r-5Cm8P!sNAa{#|7TaP;Ca+`gAx-`cz+(!QToe8l z+;jOdb-w}thrR?Qwi0BfKNti(k(Ej3#r=LK%B+EeFWQET2KQr(waS?T+w66M-U`g8 zJwE=dMAs8#EX)k5Wi_>tPSB4-f&P+eW_7F+`qiLD`aStR7KJcJ$2?q!4zzA9|Fk%37dOxjqR1_Gg5?+* zH0-*LBqO2fPah$)TnMXFtnM>lkE#GwIML)8*ldo=4m#Sz6yt|xm?-L){tnVZ+umeb zkc$nT>d;O`p`A)GM;~dYBk)T}pB#W6YsTvy6_kd)F{-7wrLQ(qTufZnJ7=_kA6sv7&TN)(Gy~R@#+5$1f0WC$OE5OZZIn+`| z=m|x&Z^UeqqhBX3m#^kEc=rapef_v5N@~`LVpd7b`UYm9R?I3LCoN$^g*y~)B#8Ty z%0%MIR1+rR46Fk5yaJP@FnPd)8bv7hs!DU$oVc`!m>W3*UDY5{DKZJ@E1_1xK5Hm- zXe}Eu|2GL+wIaY`$7lb~9sgvWBuOompkGd=^EHfUr$o$@rmgU=oCX z4~@~=u@V zH^c6rFO~9Zz80a;q5SY9*WAkH#`iVY9?`Js>P;?z#@$3*TDbLe76H^G1uaGFm{-TY<4tb-9v=;Jv(T zw1X1nJfBhwuak17Nm=ERkX=gZ!gyG|rKh`qbvAaAU22qX80l`*XUPr;WM!9mk!7{l ze_GJWv#e?t0-AMuKP`K-_Yx(i#@UJH@~~GVg_M57Uo-Me+*iwx^LWYA1nBbQSpt+0 zl)nwo;z)Q_rr7^o8X;XH?-L*ykPiqDHpS6-c7iyhHRsMue1<$^nNqW)Qsp}YeuKca z!rc~yQ#A0L^|_AH+A7bJGrE$YbC~Yd^USV?zpRFxIC@$Hj4qDl4PtiMG6z#PbPjf( z`=U{*gW>C}i87}Wn{|l5bekKL7Og(1o3q=Z@Cne+|1E^q^Kh&$qC>t4fO?%?T)h@s zN8~IGk!FBJ8g}uw1lb|i93biE=cY=zgKi_bqf&%NgI*MLk*-jAG#qBpT_m5RPWVXY z%Mt;T0C`MqeHe$G=;-*;^UPW`@67*aQW`10S9PW5>071hI#eTPEWc*DNcjl?^2glS zHUd%nnI@2cCjUMRrjJ= 2: + feature_vector.extend(mfe_values[:2]) + else: + feature_vector.extend([0.0, 0.0]) + else: + feature_vector.extend([0.0, 0.0]) + + return np.array(feature_vector) + +def prepare_data(train_data, test_data, seq_length=33): + """Prepare training and test data including MFE features""" + + # Process training data + X_train = [] + y_train = [] + sample_weights = [] + + for _, row in train_data.iterrows(): + sequence = row['full_seq'] + label = row['label'] + + # Get MFE values + mfe_values = {} + if 'mfe_40bp' in row: + mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0 + if 'mfe_120bp' in row: + mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0 + + # Convert to features + features = sequence_to_features(sequence, seq_length, mfe_values) + X_train.append(features) + y_train.append(label) + + # Sample weight + weight = 1.0 + if 'sample_weight' in row and pd.notna(row['sample_weight']): + weight = row['sample_weight'] + sample_weights.append(weight) + + X_train = np.array(X_train) + y_train = np.array(y_train) + sample_weights = np.array(sample_weights) + + # Process test data + X_test = [] + y_test = [] + + if test_data is not None and not test_data.empty: + for _, row in test_data.iterrows(): + sequence = row['full_seq'] + label = row['label'] + + # Get MFE values + mfe_values = {} + if 'mfe_40bp' in row: + mfe_values['mfe_40bp'] = row['mfe_40bp'] if pd.notna(row['mfe_40bp']) else 0.0 + if 'mfe_120bp' in row: + mfe_values['mfe_120bp'] = row['mfe_120bp'] if pd.notna(row['mfe_120bp']) else 0.0 + + # Convert to features + features = sequence_to_features(sequence, seq_length, mfe_values) + X_test.append(features) + y_test.append(label) + + X_test = 
+
+def analyze_feature_importance(model, X_test, y_test, test_data):
+    """Analyze feature importance (simplified version)"""
+    try:
+        # Get feature names
+        feature_names = get_feature_names(GBConfig.SEQUENCE_LENGTH)
+
+        # Built-in (impurity-based) feature importance
+        if hasattr(model, 'feature_importances_'):
+            importance_scores = model.feature_importances_
+
+            # Create importance DataFrame, most important features first
+            importance_df = pd.DataFrame({
+                'feature': feature_names,
+                'importance': importance_scores
+            }).sort_values('importance', ascending=False)
+
+            # Save results
+            importance_path = os.path.join(BaseConfig.GB_DIR, 'feature_importance.csv')
+            importance_df.to_csv(importance_path, index=False)
+
+            return {'built_in_importance': importance_df}
+
+        return None
+
+    except Exception:
+        return None
+
+def main():
+    """Main training function"""
+    try:
+        # Set sequence length
+        sequence_length = GBConfig.SEQUENCE_LENGTH
+
+        # Load data
+        train_data, test_data, _, xu_data, atkins_data = load_data()
+
+        # Prepare data
+        X_train, y_train, X_test, y_test, sample_weights, _, _ = prepare_data(
+            train_data, test_data, seq_length=sequence_length
+        )
+
+        # Prepare external validation data
+        X_xu = y_xu = X_atkins = y_atkins = None
+        if xu_data is not None and not xu_data.empty:
+            try:
+                empty_test = pd.DataFrame(columns=xu_data.columns)
+                X_xu, y_xu, _, _, _, _, _ = prepare_data(xu_data, empty_test, seq_length=sequence_length)
+            except Exception:
+                X_xu = y_xu = None
+
+        if atkins_data is not None and not atkins_data.empty:
+            try:
+                empty_test = pd.DataFrame(columns=atkins_data.columns)
+                X_atkins, y_atkins, _, _, _, _, _ = prepare_data(atkins_data, empty_test, seq_length=sequence_length)
+            except Exception:
+                X_atkins = y_atkins = None
+
+        # Train model
+        model, _, training_info = train_hist_model(
+            X_train, y_train, X_test, y_test, sample_weights,
+            X_xu=X_xu, y_xu=y_xu, X_atkins=X_atkins, y_atkins=y_atkins
+        )
+
+        # Feature importance analysis
+        analyze_feature_importance(model, X_test, y_test, test_data)
+
+        return model, training_info['final_metrics']['test']
+
+    except Exception:
+        return None, None
+
+if __name__ == "__main__":
+    BaseConfig.create_directories()
+    main()
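+
+# Intended entry point (run from the repository root):
+#   python train_models/hist_gb.py
+# When the fitted model exposes feature_importances_, the ranking is written to
+# BaseConfig.GB_DIR/feature_importance.csv.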