|
@@ -0,0 +1,139 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# time: 2023/4/12 17:42
|
|
|
+# file: data_features.py
|
|
|
+# author: David
|
|
|
+# company: shenyang JY
|
|
|
+import pandas as pd
|
|
|
+from sklearn.model_selection import train_test_split
|
|
|
+import numpy as np
|
|
|
+from data_utils import *
|
|
|
+
|
|
|
+
|
|
|
class data_features(object):
    """Build windowed, normalised samples for a two-input ("cnn" + "lstm") model.

    Each sample is a pair ``[cnn_window, lstm_window]`` cut from a DataFrame by
    :meth:`get_data_features`.  Normalisation uses the per-column statistics
    supplied at construction time.
    """

    def __init__(self, opt, mean, std):
        """Store the configuration and the normalisation statistics.

        Args:
            opt: Configuration object.  This class reads ``opt.Model``,
                ``opt.predict_points``, ``opt.label_columns``,
                ``opt.valid_data_rate`` and ``opt.shuffle_train_data``, and
                writes several derived attributes back onto it.
            mean: Mapping indexable by column name that yields the column mean.
            std: Mapping indexable by column name that yields the column
                standard deviation.
        """
        self.opt = opt
        self.time_step = self.opt.Model["time_step"]
        self.mean = mean
        self.std = std
        # Column names of the most recently built windows; filled in by
        # get_data_features() and consumed by norm_features().
        self.columns = list()
        self.columns_lstm = list()
        self.columns_cnn = list()

    def _register_feature_columns(self, dfs):
        """Publish the feature-column layout of ``dfs[0]`` on ``self.opt``.

        'C_TIME' is prepended to the column list.  The label indices are looked
        up by name because the label columns are not necessarily the leading
        feature columns.

        Raises:
            IndexError: If ``dfs`` is empty.
            ValueError: If a label column is missing from the feature columns.
        """
        self.opt.feature_columns = dfs[0].columns.tolist()
        self.opt.feature_columns.insert(0, 'C_TIME')
        feature_columns = self.opt.feature_columns
        self.opt.label_in_feature_index = [feature_columns.index(name)
                                           for name in self.opt.label_columns]
        self.opt.input_size = len(feature_columns)

    def _build_samples(self, df):
        """Turn one DataFrame into array samples, a label array and raw label windows.

        Returns:
            Tuple ``(samples, labels, label_windows)``: ``samples`` is a list of
            ``[cnn_array, lstm_array]`` pairs, ``labels`` stacks the
            'C_REAL_VALUE' values of every label window with one extra trailing
            axis, and ``label_windows`` is the untouched list of label frames.
        """
        feature_windows, label_windows = self.get_data_features(df)
        samples = [[np.array(cnn), np.array(lstm)] for cnn, lstm in feature_windows]
        labels = np.array([window['C_REAL_VALUE'].values for window in label_windows])
        labels = np.expand_dims(labels, axis=-1)  # append a trailing axis
        return samples, labels, label_windows

    @staticmethod
    def _split_branches(samples):
        """Regroup ``[cnn, lstm]`` sample pairs into ``[all_cnn, all_lstm]`` arrays."""
        return [np.array([sample[0] for sample in samples]),
                np.array([sample[1] for sample in samples])]

    def _column_stats(self, columns):
        """Return ``(mean, std)`` arrays ordered like ``columns``."""
        mean = np.array([self.mean[col] for col in columns])
        std = np.array([self.std[col] for col in columns])
        return mean, std

    def get_train_data(self, dfs):
        """Build normalised training and validation sets from several DataFrames.

        Every DataFrame is windowed and split on its own with
        ``train_test_split``; the per-frame parts are then merged.

        Args:
            dfs: Non-empty sequence of DataFrames sharing the same columns.

        Returns:
            Tuple ``(train_x, valid_x, train_y, valid_y)``.  ``train_x`` and
            ``valid_x`` are two-element lists ``[cnn_array, lstm_array]``;
            ``train_y`` and ``valid_y`` are normalised label arrays.

        Raises:
            IndexError: If ``dfs`` is empty.
        """
        self._register_feature_columns(dfs)

        train_x, valid_x, train_y, valid_y = [], [], [], []
        for df in dfs:
            samples, labels, _ = self._build_samples(df)
            # Split this frame's samples into training and validation parts.
            tx, vx, ty, vy = train_test_split(samples, labels,
                                              test_size=self.opt.valid_data_rate,
                                              random_state=self.opt.Model["random_seed"],
                                              shuffle=self.opt.shuffle_train_data)
            train_x.extend(tx)
            valid_x.extend(vx)
            train_y.append(ty)
            valid_y.append(vy)

        train_y = np.concatenate(train_y, axis=0)
        valid_y = np.concatenate(valid_y, axis=0)

        train_x = self.norm_features(train_x)
        valid_x = self.norm_features(valid_x)
        train_y = self.norm_label(train_y)
        valid_y = self.norm_label(valid_y)

        return (self._split_branches(train_x), self._split_branches(valid_x),
                train_y, valid_y)

    def get_test_data(self, dfs):
        """Build the normalised test set from several DataFrames.

        Args:
            dfs: Non-empty sequence of DataFrames sharing the same columns.

        Returns:
            Tuple ``(test_x, test_y, data_y)``.  ``test_x`` is
            ``[cnn_array, lstm_array]``, ``test_y`` is the normalised label
            array, and ``data_y`` holds, per input frame, the list of
            un-normalised label windows ('C_TIME' and 'C_REAL_VALUE').

        Raises:
            IndexError: If ``dfs`` is empty.
        """
        self._register_feature_columns(dfs)

        test_x, test_y, data_y = [], [], []
        for df in dfs:
            samples, labels, label_windows = self._build_samples(df)
            test_x.extend(samples)
            test_y.append(labels)
            data_y.append(label_windows)

        test_y = np.concatenate(test_y, axis=0)

        test_x = self.norm_features(test_x)
        test_y = self.norm_label(test_y)

        return self._split_branches(test_x), test_y, data_y

    def get_data_features(self, df):
        """Cut ``df`` into sliding windows of ``time_step`` rows.

        Feature rows are all rows except the last ``opt.predict_points``; label
        rows are all rows except the first ``opt.predict_points``.  A window
        starting at position ``i`` therefore pairs features with the label rows
        ``predict_points`` steps later.

        Args:
            df: Source DataFrame.  After ``reset_index()`` it must expose the
                columns 'C_TIME', 'C_REAL_VALUE', 'C_WD_INST50' and 'C_T'.

        Returns:
            Tuple ``(features_x, features_y)``.  ``features_x`` is a list of
            ``[row0, rowx]`` DataFrame pairs and ``features_y`` is a list of
            label windows with the columns 'C_TIME' and 'C_REAL_VALUE'.
        """
        norm_data = df.reset_index()
        predict_points = self.opt.predict_points
        # A plain ``[:-0]`` slice would be empty, so a zero horizon keeps every row.
        feature_data = norm_data[:-predict_points] if predict_points else norm_data
        label_data = norm_data[predict_points:].reset_index(drop=True)

        time_step = self.opt.Model["time_step"]
        last_offset = time_step - 1  # .loc slices include both end labels
        window_starts = range(len(feature_data) - time_step)

        features_x = []
        for start in window_starts:
            stop = start + last_offset
            # row0: time + rp + environment columns.  Copy so the in-place edits
            # below never touch (or warn about) the parent frame.
            row0 = feature_data.loc[start:stop, 'C_TIME':'C_WD_INST50'].copy()
            # row1: nwp columns taken from the label rows.
            row1 = label_data.loc[start:stop, 'C_T':].reset_index(drop=True)

            row0.set_index('C_TIME', inplace=True, drop=False)
            # datetime_to_timestr is not defined in this file; it is expected
            # to come from the ``data_utils`` star import.
            row0["C_TIME"] = row0["C_TIME"].apply(datetime_to_timestr)

            time_rp = row0.loc[:, ['C_TIME', 'C_REAL_VALUE']].reset_index(drop=True)
            rowx = pd.concat([time_rp, row1], axis=1)  # rowx: time + rp + nwp
            features_x.append([row0, rowx])

        if features_x:
            # Remember the column layout so norm_features() can pick statistics.
            last_row0, last_rowx = features_x[-1]
            self.columns = last_row0.columns.tolist()
            self.columns_cnn = last_row0.columns.tolist()
            self.columns_lstm = last_rowx.columns.tolist()

        features_y = [label_data.loc[start:start + last_offset, ['C_TIME', 'C_REAL_VALUE']]
                      for start in window_starts]

        return features_x, features_y

    def norm_features(self, data: list):
        """Standardise every ``[cnn, lstm]`` sample in ``data`` in place.

        The first element of each sample is scaled with the statistics of
        ``self.columns_cnn`` and the second with those of
        ``self.columns_lstm``.  Also records both column counts on ``self.opt``
        as ``input_size_cnn`` and ``input_size_lstm``.

        Args:
            data: Mutable sequence of two-element lists of arrays.

        Returns:
            The same ``data`` object, with its samples replaced by the
            normalised arrays.
        """
        if len(data) > 0:
            # The statistics do not depend on the sample: build them once.
            cnn_mean, cnn_std = self._column_stats(self.columns_cnn)
            lstm_mean, lstm_std = self._column_stats(self.columns_lstm)
            for sample in data:
                sample[0] = (sample[0] - cnn_mean) / cnn_std
                sample[1] = (sample[1] - lstm_mean) / lstm_std
        self.opt.input_size_lstm = len(self.columns_lstm)
        self.opt.input_size_cnn = len(self.columns_cnn)
        return data

    def norm_label(self, label_data: np.ndarray):
        """Standardise labels with the 'C_REAL_VALUE' mean and standard deviation."""
        return (label_data - self.mean['C_REAL_VALUE']) / self.std['C_REAL_VALUE']
|