Explorar el Código

awg commit algorithm components

anweiguo hace 4 meses
padre
commit
aa40f10fa8

+ 17 - 1
common/processing_data_common.py

@@ -13,4 +13,20 @@ def generate_unique_colors(num_colors):
     while len(generated_colors) < num_colors:
         color = f"rgb({random.randint(0, 255)}, {random.randint(0, 255)}, {random.randint(0, 255)})"
         generated_colors.add(color)
-    return list(generated_colors)
+    return list(generated_colors)
+
def missing_features(df, features, col_time, threshold=0.2):
    """Drop every day whose average feature missing rate reaches ``threshold``.

    For each day, the fraction of null values is computed per feature and
    then averaged across ``features``. Rows belonging to days whose average
    is ``>= threshold`` are removed.

    Args:
        df: Input DataFrame. It is NOT modified; no helper column is added.
        features: List of feature column names to inspect.
        col_time: Name of the timestamp column. Assumes string values whose
            first 10 characters are the date (e.g. ``'YYYY-MM-DD ...'``)
            -- TODO confirm against the callers' data.
        threshold: Average missing rate (0..1) at or above which a day is
            dropped. Defaults to 0.2.

    Returns:
        A new DataFrame with the rows of the high-missing days removed and
        the same columns as ``df``.
    """
    if not features:
        # Nothing to measure, so no day can be flagged.
        return df.copy()

    # Keep the day key in a standalone Series instead of writing df['day']:
    # the caller's frame stays untouched and an existing 'day' column is
    # neither overwritten nor dropped.
    day = df[col_time].str[:10]

    # Per-day missing fraction of each feature, averaged over the features
    # only. Grouping by an external key keeps the (never-null) day key out of
    # the average, so the rate is not diluted.
    missing_rates = df[features].isnull().groupby(day).mean().mean(axis=1)

    # Days whose average feature missing rate is at or above the threshold.
    days_with_high_missing = missing_rates[missing_rates >= threshold].index

    # Report the actual threshold in use (the message used to hard-code 50%).
    print(f"特征缺失率超过{threshold:.0%}的日期:", days_with_high_missing)
    print()
    print("**********删除前维度", df.shape)
    cleaned = df[~day.isin(days_with_high_missing)].copy()
    print("**********删除后维度", cleaned.shape)
    return cleaned

+ 4 - 8
models_processing/model_train/model_training_lightgbm.py

@@ -7,17 +7,20 @@ import time
 import traceback
 import logging
 from common.database_dml import get_data_from_mongo,insert_pickle_model_into_mongo
+from common.processing_data_common import missing_features,str_to_list
 # Flask application object for the LightGBM model-training service.
 app = Flask('model_training_lightgbm——service')
 
 
 def build_model(df,args):
     np.random.seed(42)
     #lightgbm预测下
-    numerical_features,categorical_features,label,model_name,num_boost_round,model_params = str_to_list(args['numerical_features']),str_to_list(args['categorical_features']),args['label'],args['model_name'],int(args['num_boost_round']),eval(args['model_params'])
+    numerical_features,categorical_features,label,model_name,num_boost_round,model_params,col_time = str_to_list(args['numerical_features']),str_to_list(args['categorical_features']),args['label'],args['model_name'],int(args['num_boost_round']),eval(args['model_params']),args['col_time']
     features = numerical_features+categorical_features
     print("features:************",features)
     if 'is_limit' in df.columns:
         df = df[df['is_limit']==False]
+    # 清洗特征平均缺失率大于20%的天
+    df = missing_features(df, features, col_time)
     # 拆分数据为训练集和测试集
     X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42)
     # 创建LightGBM数据集
@@ -51,13 +54,6 @@ def build_model(df,args):
     return gbm
 
 
def str_to_list(arg):
    """Split a comma-separated string into a list of names.

    Whitespace around each item is stripped and empty items (for example
    from a trailing comma) are dropped, so the result can be used directly
    as a list of column names.

    Args:
        arg: Comma-separated string such as ``'a,b,c'``; may be empty or None.

    Returns:
        List of non-empty, stripped items; ``[]`` for an empty or None input.
    """
    if not arg:
        return []
    stripped = (item.strip() for item in arg.split(','))
    return [item for item in stripped if item]
-
-
 @app.route('/model_training_lightgbm', methods=['POST'])
 def model_training_lightgbm():
     # 获取程序开始时间  

+ 3 - 14
models_processing/model_train/model_training_lstm.py

@@ -1,6 +1,4 @@
-import pandas as pd
 import numpy as np
-from pymongo import MongoClient
 from sklearn.model_selection import train_test_split
 from flask import Flask,request
 import time
@@ -12,23 +10,12 @@ import joblib
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import LSTM, Dense
 from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
-# import matplotlib.pyplot as plt
 import tensorflow as tf
 from common.database_dml import get_data_from_mongo,insert_h5_model_into_mongo
+from common.processing_data_common import missing_features,str_to_list
 
 # Flask application object for this training service.
 # NOTE(review): the app name says "lightgbm" although this file is
 # model_training_lstm.py -- presumably a copy-paste leftover; confirm nothing
 # depends on the name before renaming it.
 app = Flask('model_training_lightgbm——service')
 
-# def draw_loss(history):
-#     #绘制训练集和验证集损失
-#     plt.figure(figsize=(20, 8))
-#     plt.plot(history.history['loss'], label='Training Loss')
-#     plt.plot(history.history['val_loss'], label='Validation Loss')
-#     plt.title('Loss Curve')
-#     plt.xlabel('Epochs')
-#     plt.ylabel('Loss')
-#     plt.legend()
-#     plt.show()
-
 def rmse(y_true, y_pred):
     """Return the root-mean-square error between ``y_true`` and ``y_pred``.

     Computed with TensorFlow ops: the square root of the mean of the
     squared element-wise differences.
     """
     residual = y_true - y_pred
     mean_squared_error = tf.reduce_mean(tf.square(residual))
     return tf.math.sqrt(mean_squared_error)
 
@@ -51,6 +38,8 @@ def build_model(data, args):
     col_time, time_steps,features,target = args['col_time'], int(args['time_steps']), str_to_list(args['features']),args['target']
     if 'is_limit' in data.columns:
         data = data[data['is_limit']==False]
+    # 清洗特征平均缺失率大于20%的天
+    df = missing_features(data, features, col_time)
     train_data = data.fillna(method='ffill').fillna(method='bfill').sort_values(by=col_time)
     # X_train, X_test, y_train, y_test = process_data(df_clean, params)
     # 创建特征和目标的标准化器