2 týždňov pred · 21f39260c9
--- a/data_processing/data_operation/weight.py
+++ b/data_processing/data_operation/weight.py
@@ -0,0 +1,44 @@
 
				+import numpy as np
			
 
				+
			
 
				+
			
 
				+def balance_weights(y: np.ndarray, bins: int = 10, normalize: bool = True, **kwargs) -> np.ndarray:
			
 
				+    """
			
 
				+    平衡权重，分布数量越少权重越大
			
 
				+    """
			
 
				+    counts, bin_edges = np.histogram(y, bins=bins)
			
 
				+
			
 
				+    # digitize 不使用 right=True，这样最小值也能落在 bin 0 开始
			
 
				+    bin_indices = np.digitize(y, bin_edges[1:-1], right=False)
			
 
				+
			
 
				+    # bin_counts 用 0 到 bins-1 的索引
			
 
				+    bin_counts = {i: count for i, count in enumerate(counts)}
			
 
				+
			
 
				+    # 对于每个样本分配权重（加个兜底：出现异常时给个较大默认值）
			
 
				+    weights = np.array([1.0 / bin_counts.get(b, 1e-6) for b in bin_indices])
			
 
				+
			
 
				+    if normalize:
			
 
				+        weights /= np.mean(weights)
			
 
				+
			
 
				+    return weights
			
 
				+
			
 
				+def south_weight(target: np.ndarray, cap, **kwargs) -> np.ndarray:
			
 
				+    """
			
 
				+    应付南方点网的奇怪考核
			
 
				+    为了不把曲线压太低，这里有所收敛(添加开方处理，不让权重分布过于离散)
			
 
				+    """
			
 
				+    weight = 1 / np.sqrt(np.where(target < 0.2 * cap, 0.2 * cap, target))
			
 
				+    return weight
			
 
				+
			
 
				+def standard_weight(target: np.array, **kwargs) -> np.ndarray:
			
 
				+    """
			
 
				+    标准化权重
			
 
				+    """
			
 
				+    weight = np.sqrt(np.abs(target - np.mean(target))) / np.std(target)
			
 
				+    return weight
			
 
				+
			
 
				+# ------------------------------权重函数注册------------------------------------------------
			
 
				+WEIGHT_REGISTER = {
			
 
				+    "balance": balance_weights,
			
 
				+    "south": south_weight,
			
 
				+    "std": standard_weight
			
 
				+}
			
--- a/models_processing/model_train/model_training_ml.py
+++ b/models_processing/model_train/model_training_ml.py
@@ -11,21 +11,11 @@ from common.processing_data_common import missing_features, str_to_list
 
				 from sklearn.pipeline import Pipeline
			
 
				 from sklearn.svm import SVR
			
 
				 from sklearn.preprocessing import MinMaxScaler
			
 
				+from data_processing.data_operation.weight import WEIGHT_REGISTER
			
 
				 
			
 
				 app = Flask('model_training_ml——service')
			
 
				 
			
 
				-"""
			
 
				-基于model_training_lightgbm.py
			
 
				-机器学习通用训练方法，特点
			
 
				-1. 保存模型同时，保存模型特征
			
 
				-2. 支持模型训练样本权重(需要在预处理部分生成权重特征)
			
 
				-
			
 
				-参数格式如下
			
 
				-
			
 
				-"""
			
 
				-
			
 
				-
			
 
				-def train_lgb(data_split, categorical_features, model_params, num_boost_round, sample_weight=None):
			
 
				+def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
			
 
				     X_train, X_test, y_train, y_test = data_split
			
 
				     # 创建LightGBM数据集
			
 
				     lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, weight=sample_weight)
			
@@ -37,7 +27,6 @@ def train_lgb(data_split, categorical_features, model_params, num_boost_round, s
 
				         'boosting_type': 'gbdt',
			
 
				         'verbose': 1
			
 
				     }
			
 
				-    print(type(model_params))
			
 
				     params.update(model_params)
			
 
				     # 训练模型
			
 
				     print('Starting training...')
			
@@ -65,14 +54,10 @@ def train_svr(data_split, model_params, sample_weight=None):
 
				 
			
 
				 def build_model(df, args):
			
 
				     np.random.seed(42)
			
 
				-    # lightgbm预测下
			
 
				-    numerical_features, categorical_features, label, model_name, num_boost_round, model_params, col_time = str_to_list(
			
 
				-        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], int(
			
 
				-        args['num_boost_round']), eval(args['model_params']), args['col_time']
			
 
				-    # 样本权重
			
 
				-    sample_weight = None
			
 
				-    if 'sample_weight' in args.keys():
			
 
				-        sample_weight = args['sample_weight']
			
 
				+    # 参数
			
 
				+    numerical_features, categorical_features, label, model_name, model_params, col_time = str_to_list(
			
 
				+        args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], eval(
			
 
				+        args['model_params']), args['col_time']
			
 
				 
			
 
				     features = numerical_features + categorical_features
			
 
				     print("features:************", features)
			
@@ -84,16 +69,26 @@ def build_model(df, args):
 
				     # 拆分数据为训练集和测试集
			
 
				     X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
			
 
				                                                         shuffle=False)
			
 
				-
			
 
				     model_type = args['model_type']
			
 
				+    sample_weight = None
			
 
				+    # 样本权重
			
 
				+    if 'sample_weight' in args.keys():
			
 
				+        if args['sample_weight'] in WEIGHT_REGISTER.keys():
			
 
				+            sample_weight = WEIGHT_REGISTER[args['sample_weight']](df[label].values.reshape(-1), **args)
			
 
				+        elif args['sample_weight'] in df.columns.tolist():
			
 
				+            sample_weight = df[args['sample_weight']].values.reshape(-1)
			
 
				+        else:
			
 
				+            sample_weight = None
			
 
				+            print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
			
 
				     # 区分常规机器学习模型和lgb，这里只实例化svr，后续可扩展
			
 
				     if model_type == "lightgbm":
			
 
				+        num_boost_round = int(args['num_boost_round'])
			
 
				         model, y_pred = train_lgb([X_train, X_test, y_train, y_test], categorical_features, model_params,
			
 
				                                   num_boost_round, sample_weight=sample_weight)
			
 
				-    elif model_type == "SVR":
			
 
				+    elif model_type == "svr":
			
 
				         model, y_pred = train_svr([X_train, X_test, y_train, y_test], model_params, sample_weight=sample_weight)
			
 
				     else:
			
 
				-        raise ValueError(f"Invalid model_type, must be one of [lightgbm, SVR]")
			
 
				+        raise ValueError(f"Invalid model_type, must be one of [lightgbm, svr]")
			
 
				 
			
 
				     # 评估
			
 
				     mse = mean_squared_error(y_test, y_pred)