hzh 2 týždňov pred
rodič
commit
21f39260c9

+ 44 - 0
data_processing/data_operation/weight.py

@@ -0,0 +1,44 @@
+import numpy as np
+
+
def balance_weights(y: np.ndarray, bins: int = 10, normalize: bool = True, **kwargs) -> np.ndarray:
    """
    Compute inverse-frequency sample weights.

    The values of ``y`` are histogrammed; each sample receives the weight
    ``1 / (number of samples in its bin)``, so values from sparsely
    populated regions weigh more than values from crowded ones.

    Args:
        y: Target values. Any array-like is accepted; the returned array
            has the same shape.
        bins: Bin specification forwarded to ``np.histogram``.
        normalize: If True, divide the weights by their mean so that the
            mean weight is 1.
        **kwargs: Ignored by this function.

    Returns:
        Float array with one weight per element of ``y``.
    """
    y = np.asarray(y)
    counts, bin_edges = np.histogram(y, bins=bins)

    # Only the interior edges are handed to digitize (with right=False), so
    # the indices run from 0 to len(counts) - 1: the minimum lands in bin 0
    # and the maximum in the last bin, matching np.histogram's closed last bin.
    bin_indices = np.digitize(y, bin_edges[1:-1], right=False)

    # Vectorized lookup of each sample's bin population.
    weights = 1.0 / counts[bin_indices]

    # Skip normalization for empty input: the mean of nothing is undefined.
    if normalize and weights.size:
        weights /= np.mean(weights)

    return weights
+
def south_weight(target: np.ndarray, cap, **kwargs) -> np.ndarray:
    """
    Sample weights for the "southern grid" assessment rule.

    Each weight is ``1 / sqrt(max(target, 0.2 * cap))``: targets below 20 %
    of ``cap`` are floored at that level, and the square root tempers the
    weighting so the weights do not spread out too widely (the intent noted
    by the original author is to avoid pushing the fitted curve too low).

    Args:
        target: Target values; any array-like of numbers.
        cap: Positive number (or numeric string, e.g. taken straight from
            request parameters) that sets the floor ``0.2 * cap``.
        **kwargs: Ignored by this function.

    Returns:
        Float array of weights with the shape of ``target``.

    Raises:
        ValueError: If ``cap`` is not strictly positive (the floor would be
            zero or negative and the weights infinite or NaN).
    """
    # Coerce once so that a string such as "100" works like the number 100.
    cap = np.asarray(cap, dtype=float)
    if not np.all(cap > 0):
        raise ValueError(f"cap must be a positive number, got {cap!r}")

    floor = 0.2 * cap
    floored_target = np.maximum(np.asarray(target, dtype=float), floor)
    return 1.0 / np.sqrt(floored_target)
+
def standard_weight(target: np.ndarray, **kwargs) -> np.ndarray:
    """
    Deviation-based sample weights.

    Each weight is ``sqrt(|target - mean(target)|) / std(target)``, so
    samples far from the mean weigh more and samples equal to the mean get
    weight 0.

    Args:
        target: Target values; any array-like of numbers.
        **kwargs: Ignored by this function.

    Returns:
        Float array of weights with the shape of ``target``. When the target
        has no spread (empty or constant input) the formula would be 0/0, so
        uniform weights of 1 are returned instead.
    """
    target = np.asarray(target, dtype=float)

    # Avoid NaN weights (0 / 0) when every value is identical or there is no data.
    spread = np.std(target) if target.size else 0.0
    if spread == 0:
        return np.ones_like(target)

    return np.sqrt(np.abs(target - np.mean(target))) / spread
+
# ------------------------------ weight function registry ------------------------------
# Maps a weight-scheme name to the function that computes the sample weights.
WEIGHT_REGISTER = dict(
    balance=balance_weights,
    south=south_weight,
    std=standard_weight,
)

+ 19 - 24
models_processing/model_train/model_training_ml.py

@@ -11,21 +11,11 @@ from common.processing_data_common import missing_features, str_to_list
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVR
 from sklearn.preprocessing import MinMaxScaler
+from data_processing.data_operation.weight import WEIGHT_REGISTER
 
 app = Flask('model_training_ml——service')
 
-"""
-基于model_training_lightgbm.py
-机器学习通用训练方法,特点
-1. 保存模型同时,保存模型特征
-2. 支持模型训练样本权重(需要在预处理部分生成权重特征)
-
-参数格式如下
-
-"""
-
-
-def train_lgb(data_split, categorical_features, model_params, num_boost_round, sample_weight=None):
+def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
     X_train, X_test, y_train, y_test = data_split
     # 创建LightGBM数据集
     lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, weight=sample_weight)
@@ -37,7 +27,6 @@ def train_lgb(data_split, categorical_features, model_params, num_boost_round, s
         'boosting_type': 'gbdt',
         'verbose': 1
     }
-    print(type(model_params))
     params.update(model_params)
     # 训练模型
     print('Starting training...')
@@ -65,14 +54,10 @@ def train_svr(data_split, model_params, sample_weight=None):
 
 def build_model(df, args):
     np.random.seed(42)
-    # lightgbm预测下
-    numerical_features, categorical_features, label, model_name, num_boost_round, model_params, col_time = str_to_list(
-        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], int(
-        args['num_boost_round']), eval(args['model_params']), args['col_time']
-    # 样本权重
-    sample_weight = None
-    if 'sample_weight' in args.keys():
-        sample_weight = args['sample_weight']
+    # 参数
+    numerical_features, categorical_features, label, model_name, model_params, col_time = str_to_list(
+        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], eval(
+        args['model_params']), args['col_time']
 
     features = numerical_features + categorical_features
     print("features:************", features)
@@ -84,16 +69,26 @@ def build_model(df, args):
     # 拆分数据为训练集和测试集
     X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
                                                         shuffle=False)
-
     model_type = args['model_type']
+    sample_weight = None
+    # 样本权重
+    if 'sample_weight' in args.keys():
+        if args['sample_weight'] in WEIGHT_REGISTER.keys():
+            sample_weight = WEIGHT_REGISTER[args['sample_weight']](df[label].values.reshape(-1), **args)
+        elif args['sample_weight'] in df.columns.tolist():
+            sample_weight = df[args['sample_weight']].values.reshape(-1)
+        else:
+            sample_weight = None
+            print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
     # 区分常规机器学习模型和lgb,这里只实例化svr,后续可扩展
     if model_type == "lightgbm":
+        num_boost_round = int(args['num_boost_round'])
         model, y_pred = train_lgb([X_train, X_test, y_train, y_test], categorical_features, model_params,
                                   num_boost_round, sample_weight=sample_weight)
-    elif model_type == "SVR":
+    elif model_type == "svr":
         model, y_pred = train_svr([X_train, X_test, y_train, y_test], model_params, sample_weight=sample_weight)
     else:
-        raise ValueError(f"Invalid model_type, must be one of [lightgbm, SVR]")
+        raise ValueError(f"Invalid model_type, must be one of [lightgbm, svr]")
 
     # 评估
     mse = mean_squared_error(y_test, y_pred)