|
@@ -15,6 +15,18 @@ from data_processing.data_operation.weight import WEIGHT_REGISTER
|
|
|
|
|
|
app = Flask('model_training_ml——service')
|
|
app = Flask('model_training_ml——service')
|
|
|
|
|
|
|
|
def get_sample_weight(df, label, args):
    """Resolve the per-sample weights requested through ``args``.

    ``args['sample_weight']`` is interpreted in this order:

    1. a key of ``WEIGHT_REGISTER`` -- the registered callable is invoked
       with the flattened label values of ``df`` plus ``**args`` and its
       result is returned as-is;
    2. a column name of ``df`` -- that column's values are returned
       flattened to one dimension;
    3. anything else -- a notice is printed and ``None`` is returned.

    Args:
        df: DataFrame holding the label column (and optionally a weight
            column). It is only accessed when ``args`` has a
            ``'sample_weight'`` entry.
        label: Name of the label column in ``df``.
        args: Mapping of request parameters; ``'sample_weight'`` is optional.

    Returns:
        The resolved weights, or ``None`` when no weighting was requested or
        the requested one could not be resolved.
    """
    # No weighting requested. Returning here keeps the result well defined;
    # falling through to a bare ``return sample_weight`` would raise
    # UnboundLocalError because the name is only bound in the branches below.
    if 'sample_weight' not in args:
        return None

    weight_spec = args['sample_weight']

    # Predefined weighting schemes take precedence over DataFrame columns.
    if weight_spec in WEIGHT_REGISTER:
        # The whole args mapping is forwarded so the scheme can read its own
        # parameters from it (presumably scheme-specific; verify per scheme).
        return WEIGHT_REGISTER[weight_spec](df[label].values.reshape(-1), **args)

    # Otherwise treat the value as the name of a column carrying the weights.
    if weight_spec in df.columns.tolist():
        return df[weight_spec].values.reshape(-1)

    print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
    return None
|
|
|
|
+
|
|
def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
|
|
def train_lgb(data_split, categorical_features, model_params, num_boost_round=100, sample_weight=None):
|
|
X_train, X_test, y_train, y_test = data_split
|
|
X_train, X_test, y_train, y_test = data_split
|
|
# 创建LightGBM数据集
|
|
# 创建LightGBM数据集
|
|
@@ -67,19 +79,15 @@ def build_model(df, args):
|
|
df = missing_features(df, features, col_time)
|
|
df = missing_features(df, features, col_time)
|
|
df = df[~np.isnan(df[label])]
|
|
df = df[~np.isnan(df[label])]
|
|
# 拆分数据为训练集和测试集
|
|
# 拆分数据为训练集和测试集
|
|
- X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], test_size=0.2, random_state=42,
|
|
|
|
|
|
+ df_train, df_test = train_test_split(df, test_size=0.2, random_state=42,
|
|
shuffle=False)
|
|
shuffle=False)
|
|
|
|
+ X_train, y_train = df_train[features].values, df_train[label].values
|
|
|
|
+ X_test, y_test = df_test[features].values, df_test[label].values
|
|
|
|
+
|
|
|
|
+ # 获取样本权重
|
|
|
|
+ sample_weight = get_sample_weight(df_train, label=label, args=args)
|
|
|
|
+
|
|
model_type = args['model_type']
|
|
model_type = args['model_type']
|
|
- sample_weight = None
|
|
|
|
- # 样本权重
|
|
|
|
- if 'sample_weight' in args.keys():
|
|
|
|
- if args['sample_weight'] in WEIGHT_REGISTER.keys():
|
|
|
|
- sample_weight = WEIGHT_REGISTER[args['sample_weight']](df[label].values.reshape(-1), **args)
|
|
|
|
- elif args['sample_weight'] in df.columns.tolist():
|
|
|
|
- sample_weight = df[args['sample_weight']].values.reshape(-1)
|
|
|
|
- else:
|
|
|
|
- sample_weight = None
|
|
|
|
- print('sample_weight is neither in the predefined weights nor a column of the DataFrame, not applicable')
|
|
|
|
# 区分常规机器学习模型和lgb,这里只实例化svr,后续可扩展
|
|
# 区分常规机器学习模型和lgb,这里只实例化svr,后续可扩展
|
|
if model_type == "lightgbm":
|
|
if model_type == "lightgbm":
|
|
num_boost_round = int(args['num_boost_round'])
|
|
num_boost_round = int(args['num_boost_round'])
|
|
@@ -114,6 +122,7 @@ def model_training_ml():
|
|
insert_pickle_model_into_mongo(model, args, features=features)
|
|
insert_pickle_model_into_mongo(model, args, features=features)
|
|
success = 1
|
|
success = 1
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
|
+ print(e)
|
|
my_exception = traceback.format_exc()
|
|
my_exception = traceback.format_exc()
|
|
my_exception.replace("\n", "\t")
|
|
my_exception.replace("\n", "\t")
|
|
result['msg'] = my_exception
|
|
result['msg'] = my_exception
|