|
@@ -18,17 +18,17 @@ class DataHandler(object):
|
|
|
self.logger = logger
|
|
|
self.opt = argparse.Namespace(**args)
|
|
|
|
|
|
- def get_train_data(self, dfs, col_time, target, time_series=1):
|
|
|
+ def get_train_data(self, dfs, col_time, target, time_series=1, lstm_type=1):
|
|
|
train_x, valid_x, train_y, valid_y = [], [], [], []
|
|
|
for i, df in enumerate(dfs, start=1):
|
|
|
if len(df) < self.opt.Model["time_step"]:
|
|
|
self.logger.info("特征处理-训练数据-不满足time_step")
|
|
|
- if time_series == 2:
|
|
|
- datax, datay = self.get_timestep_features_lstm2(df, col_time, target, is_train=True)
|
|
|
- elif time_series == 3:
|
|
|
- datax, datay = self.get_timestep_features_bilstm(df, col_time, target, is_train=True)
|
|
|
+ if lstm_type == 2:
|
|
|
+ datax, datay = self.get_timestep_features_lstm2(df, col_time, target, is_train=True, time_series=time_series)
|
|
|
+ elif lstm_type == 3:
|
|
|
+ datax, datay = self.get_timestep_features_bilstm(df, col_time, target, is_train=True, time_series=time_series)
|
|
|
else:
|
|
|
- datax, datay = self.get_timestep_features(df, col_time, target, is_train=True)
|
|
|
+ datax, datay = self.get_timestep_features(df, col_time, target, is_train=True, time_series=time_series)
|
|
|
if len(datax) < 10:
|
|
|
self.logger.info("特征处理-训练数据-无法进行最小分割")
|
|
|
continue
|
|
@@ -46,17 +46,23 @@ class DataHandler(object):
|
|
|
|
|
|
return train_x, valid_x, train_y, valid_y
|
|
|
|
|
|
- def get_predict_data(self, dfs, time_series=1):
|
|
|
+ def get_predict_data(self, dfs, time_series=1, lstm_type=1):
|
|
|
test_x = []
|
|
|
for i, df in enumerate(dfs, start=1):
|
|
|
- if len(df) < self.opt.Model["time_step"]:
|
|
|
+ if len(df) < self.opt.Model["time_step"]*time_series:
|
|
|
self.logger.info("特征处理-预测数据-不满足time_step")
|
|
|
continue
|
|
|
- datax = self.get_predict_features(df, time_series)
|
|
|
+ if lstm_type == 2:
|
|
|
+ datax = self.get_predict_features2(df, time_series)
|
|
|
+ elif lstm_type == 3:
|
|
|
+ datax = self.get_predict_features3(df, time_series)
|
|
|
+ else:
|
|
|
+ datax = self.get_predict_features(df, time_series)
|
|
|
test_x.append(datax)
|
|
|
test_x = np.concatenate(test_x, axis=0)
|
|
|
return test_x
|
|
|
|
|
|
+
|
|
|
def get_predict_features(self, norm_data, time_series=1):
|
|
|
"""
|
|
|
均分数据,获取预测数据集
|
|
@@ -70,59 +76,97 @@ class DataHandler(object):
|
|
|
features_x = np.array([feature_data.loc[i*time_step:i*time_step + time_step_loc, self.opt.features].reset_index(drop=True) for i in range(iters)])
|
|
|
if end > 0:
|
|
|
df = feature_data.tail(end)
|
|
|
- df_repeated = pd.concat([df] + [pd.DataFrame([df.iloc[0]]* (time_step-end))]).reset_index(drop=True)
|
|
|
+ df_repeated = pd.concat([df] + [pd.DataFrame([df.iloc[-1]]* (time_step-end))]).reset_index(drop=True)
|
|
|
features_x = np.concatenate((features_x, np.expand_dims(df_repeated, 0)), axis=0)
|
|
|
return features_x
|
|
|
|
|
|
- def get_timestep_features(self, norm_data, col_time, target, is_train):
|
|
|
+ def get_predict_features2(self, norm_data, time_series=2):
|
|
|
+ """
|
|
|
+ 时序2:以时间步长切分数据,得到切分原子段,根据所需的时序原子段得到预测数据集
|
|
|
+ """
|
|
|
+ time_step = self.opt.Model["time_step"]
|
|
|
+ feature_data = norm_data.reset_index(drop=True)
|
|
|
+ time_step_loc = time_step*time_series - 1
|
|
|
+ iters = int(len(feature_data)) // time_step
|
|
|
+ iters = iters - (time_series - 1)
|
|
|
+ end = int(len(feature_data)) % time_step
|
|
|
+ features_x = np.array([feature_data.loc[i*time_step:i*time_step + time_step_loc, self.opt.features].reset_index(drop=True) for i in range(0, iters)])
|
|
|
+ if end > 0:
|
|
|
+ df = norm_data.tail(end)
|
|
|
+ df_repeated = pd.concat([norm_data.iloc[:-end].tail((time_series-1)*time_step)] + [df] + [df.tail(1)] * (time_step - end)).reset_index(drop=True)
|
|
|
+ features_x = np.concatenate((features_x, np.expand_dims(df_repeated, 0)), axis=0)
|
|
|
+ return features_x
|
|
|
+
|
|
|
+ def get_predict_features3(self, norm_data, time_series=3):
|
|
|
+ """
|
|
|
+ 时序3:以时间步长切分数据,得到切分原子段,根据所需的时序原子段得到预测数据集
|
|
|
+ """
|
|
|
+ time_step = self.opt.Model["time_step"]
|
|
|
+ feature_data = norm_data.reset_index(drop=True)
|
|
|
+ time_step_loc = time_step*time_series - 1
|
|
|
+ features_x = np.array([x for x in [feature_data.loc[i:i + time_step_loc, self.opt.features].reset_index(drop=True) for i in range(0, len(norm_data), (time_series-2)*time_step)] if not len(x) < time_step*time_series])
|
|
|
+ end = norm_data.tail(len(feature_data) - (features_x.shape[0] * time_step) - time_step).reset_index(drop=True)
|
|
|
+ num = len(end) // ((time_series - 2) * time_step)
|
|
|
+ bu = len(end) % ((time_series - 2) * time_step)
|
|
|
+ front = norm_data.tail(time_step)
|
|
|
+ back = norm_data.tail(time_step)
|
|
|
+ df_repeated = [pd.concat([front]+[end.iloc[i*time_step:i*time_step + time_step].reset_index(drop=True)]+[back]) for i in range(0, num)]
|
|
|
+ if bu > 0:
|
|
|
+ last = pd.concat([front] + [end.tail(bu)] + [end.tail(1)] * (time_step - bu) + [back])
|
|
|
+ df_repeated = np.array(df_repeated + [last])
|
|
|
+ else:
|
|
|
+ df_repeated = np.array(df_repeated)
|
|
|
+ features_x = np.concatenate((features_x, df_repeated), axis=0)
|
|
|
+ return features_x
|
|
|
+
|
|
|
+ def get_timestep_features(self, norm_data, col_time, target, is_train, time_series=1):
|
|
|
"""
|
|
|
步长分割数据,获取时序训练集
|
|
|
"""
|
|
|
time_step = self.opt.Model["time_step"]
|
|
|
feature_data = norm_data.reset_index(drop=True)
|
|
|
- time_step_loc = time_step - 1
|
|
|
+ time_step_loc = time_step*time_series - 1
|
|
|
train_num = int(len(feature_data))
|
|
|
label_features = [col_time, target] if is_train is True else [col_time, target]
|
|
|
nwp_cs = self.opt.features
|
|
|
- nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
- labels = [feature_data.loc[i:i + time_step_loc, label_features].reset_index(drop=True) for i in range(train_num - time_step + 1)]
|
|
|
+ nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
+ labels = [feature_data.loc[i:i + time_step_loc, label_features].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)]
|
|
|
features_x, features_y = [], []
|
|
|
for i, row in enumerate(zip(nwp, labels)):
|
|
|
features_x.append(row[0])
|
|
|
features_y.append(row[1])
|
|
|
return features_x, features_y
|
|
|
|
|
|
- def get_timestep_features_lstm2(self, norm_data, col_time, target, is_train):
|
|
|
+ def get_timestep_features_lstm2(self, norm_data, col_time, target, is_train, time_series=2):
|
|
|
"""
|
|
|
- 步长分割数据,获取时序训练集
|
|
|
+ 步长分割数据,获取最后一个时间步长作为训练集
|
|
|
"""
|
|
|
time_step = self.opt.Model["time_step"]
|
|
|
feature_data = norm_data.reset_index(drop=True)
|
|
|
- time_step_loc = time_step*2 - 1
|
|
|
+ time_step_loc = time_step*time_series - 1
|
|
|
train_num = int(len(feature_data))
|
|
|
label_features = [col_time, target] if is_train is True else [col_time, target]
|
|
|
nwp_cs = self.opt.features
|
|
|
- nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step*2 + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
- labels = [feature_data.loc[i+time_step:i+time_step_loc, label_features].reset_index(drop=True) for i in range(train_num - time_step*2 + 1)]
|
|
|
+ nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
+ labels = [feature_data.loc[i+time_step_loc-time_step+1: i+time_step_loc, label_features].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)]
|
|
|
features_x, features_y = [], []
|
|
|
for i, row in enumerate(zip(nwp, labels)):
|
|
|
features_x.append(row[0])
|
|
|
features_y.append(row[1])
|
|
|
return features_x, features_y
|
|
|
|
|
|
- def get_timestep_features_bilstm(self, norm_data, col_time, target, is_train):
|
|
|
+ def get_timestep_features_bilstm(self, norm_data, col_time, target, is_train, time_series=3):
|
|
|
"""
|
|
|
- 步长分割数据,获取时序训练集
|
|
|
+ 步长分割数据,获取中间的时间步长作为训练集
|
|
|
"""
|
|
|
time_step = self.opt.Model["time_step"]
|
|
|
feature_data = norm_data.reset_index(drop=True)
|
|
|
- time_step_loc = time_step*3 - 1
|
|
|
- time_step_m = time_step*2 - 1
|
|
|
+ time_step_loc = time_step*time_series - 1
|
|
|
train_num = int(len(feature_data))
|
|
|
label_features = [col_time, target] if is_train is True else [col_time, target]
|
|
|
nwp_cs = self.opt.features
|
|
|
- nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step*3 + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
- labels = [feature_data.loc[i+time_step:i+time_step_m, label_features].reset_index(drop=True) for i in range(train_num - time_step*3 + 1)]
|
|
|
+ nwp = [feature_data.loc[i:i + time_step_loc, nwp_cs].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)] # 数据库字段 'C_T': 'C_WS170'
|
|
|
+ labels = [feature_data.loc[i+time_step: i+time_step_loc-time_step, label_features].reset_index(drop=True) for i in range(train_num - time_step*time_series + 1)]
|
|
|
features_x, features_y = [], []
|
|
|
for i, row in enumerate(zip(nwp, labels)):
|
|
|
features_x.append(row[0])
|
|
@@ -205,7 +249,7 @@ class DataHandler(object):
|
|
|
vy.append(data[1])
|
|
|
return tx, vx, ty, vy
|
|
|
|
|
|
- def train_data_handler(self, data, bp_data=False, time_series=1):
|
|
|
+ def train_data_handler(self, data, bp_data=False, time_series=1, lstm_type=1):
|
|
|
"""
|
|
|
训练数据预处理:
|
|
|
清洗+补值+归一化
|
|
@@ -257,10 +301,10 @@ class DataHandler(object):
|
|
|
train_x, valid_x, train_y, valid_y = self.train_valid_split(train_data[self.opt.features].values, train_data[target].values, valid_rate=self.opt.Model["valid_data_rate"], shuffle=self.opt.Model['shuffle_train_data'])
|
|
|
train_x, valid_x, train_y, valid_y = np.array(train_x), np.array(valid_x), np.array(train_y), np.array(valid_y)
|
|
|
else:
|
|
|
- train_x, valid_x, train_y, valid_y = self.get_train_data(train_datas, col_time, target, time_series)
|
|
|
+ train_x, valid_x, train_y, valid_y = self.get_train_data(train_datas, col_time, target, time_series, lstm_type)
|
|
|
return train_x, train_y, valid_x, valid_y, scaled_train_bytes, scaled_target_bytes, scaled_cap
|
|
|
|
|
|
- def pre_data_handler(self, data, feature_scaler, bp_data=False, time_series=1):
|
|
|
+ def pre_data_handler(self, data, feature_scaler, bp_data=False, time_series=1, lstm_type=1):
|
|
|
"""
|
|
|
预测数据简单处理
|
|
|
Args:
|
|
@@ -286,5 +330,5 @@ class DataHandler(object):
|
|
|
if bp_data:
|
|
|
pre_x = np.array(pre_data)
|
|
|
else:
|
|
|
- pre_x = self.get_predict_data([pre_data], time_series)
|
|
|
+ pre_x = self.get_predict_data([pre_data], time_series, lstm_type)
|
|
|
return pre_x, data
|