|
@@ -11,21 +11,11 @@ from common.processing_data_common import missing_features, str_to_list
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.svm import SVR
|
|
from sklearn.svm import SVR
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
+from data_processing.data_operation.weight import WEIGHT_REGISTER
|
|
|
|
|
|
app = Flask('model_training_ml——service')
|
|
app = Flask('model_training_ml——service')
|
|
|
|
|
|
-"""
|
|
|
|
-基于model_training_lightgbm.py
|
|
|
|
-机器学习通用训练方法,特点
|
|
|
|
-1. 保存模型同时,保存模型特征
|
|
|
|
-2. 支持模型训练样本权重(需要在预处理部分生成权重特征)
|
|
|
|
-
|
|
|
|
-参数格式如下
|
|
|
|
-
|
|
|
|
-"""
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def train_lgb(data_split, categorical_features, model_params, num_boost_round, sample_weight=None):
|
|
|
|
|
|
+def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
|
|
X_train, X_test, y_train, y_test = data_split
|
|
X_train, X_test, y_train, y_test = data_split
|
|
# 创建LightGBM数据集
|
|
# 创建LightGBM数据集
|
|
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, weight=sample_weight)
|
|
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features, weight=sample_weight)
|
|
@@ -37,7 +27,6 @@ def train_lgb(data_split, categorical_features, model_params, num_boost_round, s
|
|
'boosting_type': 'gbdt',
|
|
'boosting_type': 'gbdt',
|
|
'verbose': 1
|
|
'verbose': 1
|
|
}
|
|
}
|
|
- print(type(model_params))
|
|
|
|
params.update(model_params)
|
|
params.update(model_params)
|
|
# 训练模型
|
|
# 训练模型
|
|
print('Starting training...')
|
|
print('Starting training...')
|
|
@@ -65,14 +54,10 @@ def train_svr(data_split, model_params, sample_weight=None):
|
|
|
|
|
|
def build_model(df, args):
|
|
def build_model(df, args):
|
|
np.random.seed(42)
|
|
np.random.seed(42)
|
|
- # lightgbm预测下
|
|
|
|
- numerical_features, categorical_features, label, model_name, num_boost_round, model_params, col_time = str_to_list(
|
|
|
|
- args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], int(
|
|
|
|
- args['num_boost_round']), eval(args['model_params']), args['col_time']
|
|
|
|
- # 样本权重
|
|
|
|
- sample_weight = None
|
|
|
|
- if 'sample_weight' in args.keys():
|
|
|
|
- sample_weight = args['sample_weight']
|
|
|
|
|
|
+ # 参数
|
|
|
|
+ numerical_features, categorical_features, label, model_name, model_params, col_time = str_to_list(
|
|
|
|
+ args['numerical_features']), str_to_list(args['categorical_features']), args['label'], args['model_name'], eval(
|
|
|
|
+ args['model_params']), args['col_time']
|
|
|
|
|
|
features = numerical_features + categorical_features
|
|
features = numerical_features + categorical_features
|
|
print("features:************", features)
|
|
print("features:************", features)
|
|
@@ -84,16 +69,26 @@ def build_model(df, args):
|
|
# 拆分数据为训练集和测试集
|
|
# 拆分数据为训练集和测试集
|
|
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
|
|
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
|
|
shuffle=False)
|
|
shuffle=False)
|
|
-
|
|
|
|
model_type = args['model_type']
|
|
model_type = args['model_type']
|
|
|
|
+ sample_weight = None
|
|
|
|
+ # 样本权重
|
|
|
|
+ if 'sample_weight' in args.keys():
|
|
|
|
+ if args['sample_weight'] in WEIGHT_REGISTER.keys():
|
|
|
|
+ sample_weight = WEIGHT_REGISTER[args['sample_weight']](df[label].values.reshape(-1), **args)
|
|
|
|
+ elif args['sample_weight'] in df.columns.tolist():
|
|
|
|
+ sample_weight = df[args['sample_weight']].values.reshape(-1)
|
|
|
|
+ else:
|
|
|
|
+ sample_weight = None
|
|
|
|
+ print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
|
|
# 区分常规机器学习模型和lgb,这里只实例化svr,后续可扩展
|
|
# 区分常规机器学习模型和lgb,这里只实例化svr,后续可扩展
|
|
if model_type == "lightgbm":
|
|
if model_type == "lightgbm":
|
|
|
|
+ num_boost_round = int(args['num_boost_round'])
|
|
model, y_pred = train_lgb([X_train, X_test, y_train, y_test], categorical_features, model_params,
|
|
model, y_pred = train_lgb([X_train, X_test, y_train, y_test], categorical_features, model_params,
|
|
num_boost_round, sample_weight=sample_weight)
|
|
num_boost_round, sample_weight=sample_weight)
|
|
- elif model_type == "SVR":
|
|
|
|
|
|
+ elif model_type == "svr":
|
|
model, y_pred = train_svr([X_train, X_test, y_train, y_test], model_params, sample_weight=sample_weight)
|
|
model, y_pred = train_svr([X_train, X_test, y_train, y_test], model_params, sample_weight=sample_weight)
|
|
else:
|
|
else:
|
|
- raise ValueError(f"Invalid model_type, must be one of [lightgbm, SVR]")
|
|
|
|
|
|
+ raise ValueError(f"Invalid model_type, must be one of [lightgbm, svr]")
|
|
|
|
|
|
# 评估
|
|
# 评估
|
|
mse = mean_squared_error(y_test, y_pred)
|
|
mse = mean_squared_error(y_test, y_pred)
|