|
@@ -0,0 +1,142 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# time: 2023/3/20 9:23
|
|
|
+# file: run_case_history.py
|
|
|
+# author: David
|
|
|
+# company: shenyang JY
|
|
|
import os
import time

import numpy as np
import pandas as pd

# NOTE(review): train_test_split (called in Data.get_train_and_valid_data) is
# not imported anywhere in this file. Its call signature matches
# sklearn.model_selection.train_test_split; add that import once the
# dependency is confirmed.
|
|
|
class Data:
    """Load the feature CSV, standardise it and build train/validation samples.

    On construction the CSV at ``config.train_data_path`` is read, rows whose
    values sum to zero are dropped, and every column is standardised with its
    own mean and standard deviation.

    Attributes:
        config: Settings object. The attributes read by this class are
            ``train_data_path``, ``feature_columns``, ``debug_mode``,
            ``debug_num``, ``train_data_rate``, ``label_in_feature_index``,
            ``do_continue_train``, ``time_step``, ``start_predict_point``,
            ``valid_data_rate``, ``random_seed`` and ``shuffle_train_data``.
        data: Filtered raw values as a 2-D array (rows x feature columns).
        data_column_name: Column names matching the columns of ``data``.
        data_num: Number of rows in ``data``.
        train_num: Number of leading rows used for training.
        mean: Per-column mean of ``data``.
        std: Per-column standard deviation of ``data`` (left unmodified, so a
            constant column still reports 0 here).
        norm_data: Standardised data; a constant column is mapped to 0.
        start_num_in_test: Initialised to 0; per the original author's note,
            the first rows of the test set are dropped when they do not fill
            a whole ``time_step``.
    """

    def __init__(self, config):
        self.config = config
        self.data, self.data_column_name = self.read_data()

        self.data_num = self.data.shape[0]
        self.train_num = int(self.data_num * self.config.train_data_rate)

        # Per-column mean and standard deviation of the data.
        self.mean = np.mean(self.data, axis=0)
        self.std = np.std(self.data, axis=0)
        # Standardise to remove units. A constant column has std == 0, which
        # would turn the whole column into NaN/inf; divide by 1 there so it
        # becomes 0 instead. ``self.std`` itself is not altered.
        safe_std = np.where(self.std == 0, 1.0, self.std)
        self.norm_data = (self.data - self.mean) / safe_std

        # The first rows of the test set are dropped because they do not fill
        # a whole time_step.
        self.start_num_in_test = 0

    def read_data(self):
        """Read the configured feature columns from the training CSV.

        In debug mode only the first ``config.debug_num`` rows are read.

        Returns:
            A ``(values, column_names)`` tuple: the filtered data as a NumPy
            array and the list of its column names.
        """
        read_kwargs = {"usecols": self.config.feature_columns}
        if self.config.debug_mode:
            read_kwargs["nrows"] = self.config.debug_num
        init_data = pd.read_csv(self.config.train_data_path, **read_kwargs)
        init_data = self.filter_data(init_data)
        return init_data.values, init_data.columns.tolist()

    def filter_data(self, init_data):
        """Return ``init_data`` without the rows whose values sum to zero."""
        # Vectorised row sum; same result as applying np.sum row by row.
        return init_data[init_data.sum(axis=1) != 0]

    def get_train_and_valid_data(self):
        """Build the training samples and split off a validation set.

        Returns:
            ``(train_x, valid_x, train_y, valid_y)`` as returned by
            ``train_test_split``, using ``config.valid_data_rate``,
            ``config.random_seed`` and ``config.shuffle_train_data``.
        """
        cfg = self.config
        feature_data = self.norm_data[:self.train_num]
        # Label columns over the same training rows; the shift into the
        # future is applied by the window indices below.
        label_data = self.norm_data[:self.train_num, cfg.label_in_feature_index]

        if not cfg.do_continue_train:
            train_x, train_y = self._sliding_windows(feature_data, label_data)
        else:
            train_x, train_y = self._contiguous_windows(feature_data, label_data)

        train_x, train_y = np.array(train_x), np.array(train_y)

        # Split into training and validation sets (shuffled when configured).
        train_x, valid_x, train_y, valid_y = train_test_split(
            train_x,
            train_y,
            test_size=cfg.valid_data_rate,
            random_state=cfg.random_seed,
            shuffle=cfg.shuffle_train_data,
        )
        return train_x, valid_x, train_y, valid_y

    def _sliding_windows(self, feature_data, label_data):
        """Build samples whose start advances one row at a time.

        Each sample pairs column 0 over ``[i, i + p)`` with column 1 over
        ``[i + p, i + 2p)``, where ``p`` is ``config.start_predict_point``;
        the label is the label columns over ``[i + p, i + 2p)``. According to
        the original author's comment, column 0 is the actual power and
        column 1 the forecast power.
        """
        points = self.config.start_predict_point
        # The sample count has always been derived from time_step; keep that,
        # but never let a window run past the training rows when
        # start_predict_point is larger than time_step (that used to yield
        # truncated, ragged samples).
        span = max(self.config.time_step, points)

        train_x, train_y = [], []
        for i in range(self.train_num - 2 * span):
            past = feature_data[i:i + points, 0]
            future = feature_data[i + points:i + 2 * points, 1]
            train_x.append(np.column_stack((past, future)))
            train_y.append(label_data[i + points:i + 2 * points])
        return train_x, train_y

    def _contiguous_windows(self, feature_data, label_data):
        """Build back-to-back samples of ``time_step`` rows each.

        Samples are laid out as rows 1-20, 21-40, ... to the end of the data,
        then 2-21, 22-41, ... and so on, so that the final state of one sample
        can seed the initial state of the next. The data must therefore not be
        shuffled. Per the original author's note this mode is only usable
        with the PyTorch RNN models of the project.
        """
        step = self.config.time_step
        bounds = [
            (start + k * step, start + (k + 1) * step)
            for start in range(step)
            for k in range((self.train_num - start) // step)
        ]
        train_x = [feature_data[lo:hi] for lo, hi in bounds]
        train_y = [label_data[lo:hi] for lo, hi in bounds]
        return train_x, train_y
|
|
|
+
|
|
|
+
|
|
|
class Config:
    """Static settings for data loading, the network, training and output paths.

    All values are class attributes evaluated once, when the class is created.
    Creating the class has side effects: the model and figure directories are
    created, and, when training with file logging or visualisation enabled,
    a timestamped log directory is created as well.
    """

    # ---- Data parameters ----
    # Columns of the raw CSV used as features, counted from 0. An explicit
    # list such as [2, 4, 6, 8] works too.
    feature_columns = list(range(1, 3))
    # Columns of the raw CSV to predict, counted from 0.
    label_columns = [1]
    # Position of every label column inside feature_columns (the features do
    # not necessarily start at column 0). A list comprehension body cannot see
    # class-level names, so the bound ``index`` method is mapped instead.
    label_in_feature_index = list(map(feature_columns.index, label_columns))

    predict_day = 1  # how many days ahead to predict
    predict_points = 16

    # ---- Network parameters ----
    input_size = len(feature_columns)
    output_size = len(label_columns)

    hidden_size = 128  # LSTM hidden size, which is also its output size
    lstm_layers = 2  # number of stacked LSTM layers
    dropout_rate = 0.2  # dropout probability
    # Number of past points used for a prediction, also the LSTM time-step
    # count. The training data must contain more rows than this.
    time_step = 16
    start_predict_point = 16

    # ---- Training parameters ----
    do_train = True
    do_predict = True
    add_train = False  # load existing model weights and train incrementally
    shuffle_train_data = False  # shuffle the training data
    use_cuda = False  # train on the GPU

    # Share of all data used for training; the test share is the remainder.
    train_data_rate = 0.95
    # Share of the training data held out for validation (used during
    # training for model and hyper-parameter selection).
    valid_data_rate = 0.15

    batch_size = 64
    learning_rate = 0.001
    epoch = 20  # passes over the training set, early stopping aside
    patience = 5  # epochs without validation improvement before stopping
    random_seed = 42  # random seed, for reproducibility

    # Feed the final state of one sample in as the initial state of the next.
    # RNN-type models only; per the original note only PyTorch is supported
    # and results were poor, possibly because it forces batch_size = 1.
    do_continue_train = False
    continue_flag = ""
    if do_continue_train:
        shuffle_train_data = False
        batch_size = 1
        continue_flag = "continue_"

    # ---- Run mode ----
    debug_mode = False  # debug mode: just get the code running, quickly
    debug_num = 500  # number of rows read in debug mode

    # ---- Framework parameters ----
    # Deep-learning framework; each one saves models with its own suffix.
    # The value comes from the module-level ``frame`` when one is defined.
    # No such definition is visible in this file, so fall back to "pytorch"
    # instead of failing with NameError when the class is created.
    # NOTE(review): confirm "pytorch" is the intended default.
    used_frame = globals().get("frame", "pytorch")
    model_postfix = {"pytorch": ".pth", "keras": ".h5", "tensorflow": ".ckpt"}
    model_name = "model_" + continue_flag + used_frame + model_postfix[used_frame]

    # ---- Path parameters ----
    train_data_path = "./data/J00285.csv"
    model_save_path = "./checkpoint/" + used_frame + "/"
    figure_save_path = "./figure/"
    log_save_path = "./log/"
    do_log_print_to_screen = True
    do_log_save_to_file = True  # record the config and training run in a log
    do_figure_save = False
    # Visualise the training loss (visdom for PyTorch, tensorboardX for
    # TensorFlow; not available for Keras).
    do_train_visualized = False

    # exist_ok avoids the check-then-create race and keeps class creation
    # from failing when a directory is already there (e.g. two runs started
    # within the same second share one timestamped log directory).
    os.makedirs(model_save_path, exist_ok=True)
    os.makedirs(figure_save_path, exist_ok=True)
    if do_train and (do_log_save_to_file or do_train_visualized):
        cur_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
        log_save_path = log_save_path + cur_time + '_' + used_frame + "/"
        os.makedirs(log_save_path, exist_ok=True)
|
|
|
+
|