2 týždňov pred · 21f39260c9
--- a/data_processing/data_operation/weight.py
+++ b/data_processing/data_operation/weight.py
@@ -0,0 +1,44 @@
 
															+import numpy as np
														
 
															+
														
 
															+
														
 
															+def balance_weights(y: np.ndarray, bins: int = 10, normalize: bool = True, **kwargs) -> np.ndarray:
														
 
															+    """
														
 
															+    平衡权重，分布数量越少权重越大
														
 
															+    """
														
 
															+    counts, bin_edges = np.histogram(y, bins=bins)
														
 
															+
														
 
															+    # digitize 不使用 right=True，这样最小值也能落在 bin 0 开始
														
 
															+    bin_indices = np.digitize(y, bin_edges[1:-1], right=False)
														
 
															+
														
 
															+    # bin_counts 用 0 到 bins-1 的索引
														
 
															+    bin_counts = {i: count for i, count in enumerate(counts)}
														
 
															+
														
 
															+    # 对于每个样本分配权重（加个兜底：出现异常时给个较大默认值）
														
 
															+    weights = np.array([1.0 / bin_counts.get(b, 1e-6) for b in bin_indices])
														
 
															+
														
 
															+    if normalize:
														
 
															+        weights /= np.mean(weights)
														
 
															+
														
 
															+    return weights
														
 
															+
														
 
															+def south_weight(target: np.ndarray, cap, **kwargs) -> np.ndarray:
														
 
															+    """
														
 
															+    应付南方点网的奇怪考核
														
 
															+    为了不把曲线压太低，这里有所收敛(添加开方处理，不让权重分布过于离散)
														
 
															+    """
														
 
															+    weight = 1 / np.sqrt(np.where(target < 0.2 * cap, 0.2 * cap, target))
														
 
															+    return weight
														
 
															+
														
 
															+def standard_weight(target: np.array, **kwargs) -> np.ndarray:
														
 
															+    """
														
 
															+    标准化权重
														
 
															+    """
														
 
															+    weight = np.sqrt(np.abs(target - np.mean(target))) / np.std(target)
														
 
															+    return weight
														
 
															+
														
 
															+# ------------------------------权重函数注册------------------------------------------------
														
 
															+WEIGHT_REGISTER = {
														
 
															+    "balance": balance_weights,
														
 
															+    "south": south_weight,
														
 
															+    "std": standard_weight
														
 
															+}
														
--- a/models_processing/model_train/model_training_ml.py
+++ b/models_processing/model_train/model_training_ml.py
@@ -11,21 +11,11 @@ from common.processing_data_common import missing_features, str_to_list
 
															 from sklearn.pipeline import Pipeline
														
 
															 from sklearn.svm import SVR
														
 
															 from sklearn.preprocessing import MinMaxScaler
														
 
															+from data_processing.data_operation.weight import WEIGHT_REGISTER
														
 
															 app = Flask('model_training_ml——service')
														
 
															-"""
														
 
															-基于model_training_lightgbm.py
														
 
															-机器学习通用训练方法，特点
														
 
															-1. 保存模型同时，保存模型特征
														
 
															-2. 支持模型训练样本权重(需要在预处理部分生成权重特征)
														
 
															-
														
 
															-参数格式如下
														
 
															-
														
 
															-"""
														
 
															-
														
 
															-
														
 
															-def train_lgb(data_split, categorical_features, model_params, num_boost_round, sample_weight=None):
														
 
															+def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
														
 
															     X_train, X_test, y_train, y_test = data_split
														
 
															     # 创建LightGBM数据集
														
 
															     lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, weight=sample_weight)
														
@@ -37,7 +27,6 @@ def train_lgb(data_split, categorical_features, model_params, num_boost_round, s
 
															         'boosting_type': 'gbdt',
														
 
															         'verbose': 1
														
 
															     }
														
 
															-    print(type(model_params))
														
 
															     params.update(model_params)
														
 
															     # 训练模型
														
 
															     print('Starting training...')
														
@@ -65,14 +54,10 @@ def train_svr(data_split, model_params, sample_weight=None):
 
															 def build_model(df, args):
														
 
															     np.random.seed(42)
														
 
															-    # lightgbm预测下
														
 
															-    numerical_features, categorical_features, label, model_name, num_boost_round, model_params, col_time = str_to_list(
														
 
															-        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], int(
														
 
															-        args['num_boost_round']), eval(args['model_params']), args['col_time']
														
 
															-    # 样本权重
														
 
															-    sample_weight = None
														
 
															-    if 'sample_weight' in args.keys():
														
 
															-        sample_weight = args['sample_weight']
														
 
															+    # 参数
														
 
															+    numerical_features, categorical_features, label, model_name, model_params, col_time = str_to_list(
														
 
															+        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], eval(
														
 
															+        args['model_params']), args['col_time']
														
 
															     features = numerical_features + categorical_features
														
 
															     print("features:************", features)
														
@@ -84,16 +69,26 @@ def build_model(df, args):
 
															     # 拆分数据为训练集和测试集
														
 
															     X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
														
 
															                                                         shuffle=False)
														
 
															-
														
 
															     model_type = args['model_type']
														
 
															+    sample_weight = None
														
 
															+    # 样本权重
														
 
															+    if 'sample_weight' in args.keys():
														
 
															+        if args['sample_weight'] in WEIGHT_REGISTER.keys():
														
 
															+            sample_weight = WEIGHT_REGISTER[args['sample_weight']](df[label].values.reshape(-1), **args)
														
 
															+        elif args['sample_weight'] in df.columns.tolist():
														
 
															+            sample_weight = df[args['sample_weight']].values.reshape(-1)
														
 
															+        else:
														
 
															+            sample_weight = None
														
 
															+            print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
														
 
															     # 区分常规机器学习模型和lgb，这里只实例化svr，后续可扩展
														
 
															     if model_type == "lightgbm":
														
 
															+        num_boost_round = int(args['num_boost_round'])
														
 
															         model, y_pred = train_lgb([X_train, X_test, y_train, y_test], categorical_features, model_params,
														
 
															                                   num_boost_round, sample_weight=sample_weight)
														
 
															-    elif model_type == "SVR":
														
 
															+    elif model_type == "svr":
														
 
															         model, y_pred = train_svr([X_train, X_test, y_train, y_test], model_params, sample_weight=sample_weight)
														
 
															     else:
														
 
															-        raise ValueError(f"Invalid model_type, must be one of [lightgbm, SVR]")
														
 
															+        raise ValueError(f"Invalid model_type, must be one of [lightgbm, svr]")
														
 
															     # 评估
														
 
															     mse = mean_squared_error(y_test, y_pred)