liudawei пре 2 година
комит
97d2fbf4fc

+ 10 - 0
ipfcst-forecast-LSTM-v1.0/.gitignore

@@ -0,0 +1,10 @@
+*/__pycache__
+/__pycache__
+/.idea
+/checkpoint
+*.log
+*.swp
+/log
+/data
+/figure
+

+ 18 - 0
ipfcst-forecast-LSTM-v1.0/Readme.md

@@ -0,0 +1,18 @@
+## 超短期功率预测系统训练端
+
+这个项目将LSTM长短期时序模型用于超短期电力功率预测任务,实现特性如下: 
+
+- 程序简洁、模块化
+- 支持可扩展的Keras框架(LSTM,可修改网络层)
+- 参数、模型和框架支持高度可定制和修改
+- 支持增量训练(在预训练模型上进行微调)
+- 支持同时预测多个指标(目前预测实际功率)
+- 支持预测任意时间节点数(目前设置16个点)
+- 支持训练可视化和记录日志
+
+
+
| 训练case | 输入特征               |
|--------|---------------------|
| 1      | 实际功率 + 气象         |
| 2      | 短期预测 + 实际功率 + 气象 |

+ 94 - 0
ipfcst-forecast-LSTM-v1.0/config.py

@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# time: 2023/3/17 14:46
+# file: config.py
+# author: David
+# company: shenyang JY
+
+import yaml
+import argparse
+
+
class myargparse(argparse.ArgumentParser):
    """Argument parser whose defaults can be overridden by a YAML file.

    Resolution order (lowest to highest precedence): argparse defaults
    declared here -> values loaded from the ``-c/--config_yaml`` file ->
    explicit command-line arguments.
    """

    def __init__(self, discription, add_help):
        # NOTE(review): "discription" is a misspelled keyword, but callers
        # (run_case1.py) pass it by name — keep the spelling for compatibility.
        super(myargparse, self).__init__(description=discription, add_help=add_help)
        # default_config_parser = parser = argparse.ArgumentParser(
        #     description='Training Config', add_help=False)
        self.add_argument(
            '-c',
            '--config_yaml',
            default=
            'config.yml',
            type=str,
            metavar='FILE',
            help='YAML config file specifying default arguments')

        # Columns 1..15 of the merged table are used as features.
        feature_columns = list(range(1, 16))
        # feature_columns = list(range(1, 28))
        label_columns = [2]

        # Map label column ids to their positions inside feature_columns,
        # because the feature columns do not necessarily start at 0.
        label_in_feature_index = (lambda x, y: [x.index(i) for i in y])(feature_columns, label_columns)

        # Options below are console-only; they do not appear in the YAML file.
        self.add_argument('--feature_columns', type=list, default=feature_columns, help='要作为特征的列')

        self.add_argument('--label_columns', type=list, default=label_columns, help='要预测的列')

        self.add_argument('--label_in_feature_index', type=list, default=label_in_feature_index, help='标签在特征列的索引')

        self.add_argument('--input_size', type=int, default=len(feature_columns), help='输入维度')

        self.add_argument('--output_size', type=int, default=len(label_columns), help='输出维度')

        self.add_argument("--train_data_path", type=str, default=None,help='数据集地址')  # train_data_path also exists in the YAML file

        # model_name and model_save_path are derived in _init_dir from YAML values.

        self.add_argument('--model_name', type=str, default=None, help='模型名称')

        self.add_argument('--model_save_path', type=str, default=None, help='模型保存地址')


    def _init_dir(self, opt):
        """Derive model_name / model_save_path and create output directories.

        Side effects: creates checkpoint, figure and (optionally) timestamped
        log directories on disk.
        """
        import os, time
        # Fill in the derived option values here.
        opt.model_name = "model_" + opt.continue_flag + opt.used_frame + opt.model_postfix[opt.used_frame]
        opt.model_save_path = './checkpoint/' + opt.model_name + "/"
        if not os.path.exists(opt.model_save_path):
            os.makedirs(opt.model_save_path)    # makedirs creates directories recursively
        if not os.path.exists(opt.figure_save_path):
            # NOTE(review): plain mkdir — fails if the parent directory is missing.
            os.mkdir(opt.figure_save_path)
        if opt.do_train and (opt.do_log_save_to_file or opt.do_train_visualized):
            cur_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
            log_save_path = opt.log_save_path + cur_time + '_' + opt.used_frame + "/"
            os.makedirs(log_save_path)


    def _parse_args_and_yaml(self):
        """Parse argv with YAML-backed defaults.

        Returns (opt, opt_text): the option namespace and its YAML dump
        (the dump can be saved alongside training outputs for reproducibility).
        """
        # First pass: only read -c to locate the YAML file.
        given_configs, remaining = self.parse_known_args()
        if given_configs.config_yaml:
            with open(given_configs.config_yaml, 'r', encoding='utf-8') as f:
                cfg = yaml.safe_load(f)
                # YAML values override the argparse defaults declared above.
                self.set_defaults(**cfg)

        # Second pass: parse the remaining argv; explicit command-line flags
        # still win over the YAML-provided defaults.
        opt = self.parse_args(remaining)
        self._init_dir(opt)
        # Cache the args as a text string to save them in the output dir later.
        opt_text = yaml.safe_dump(opt.__dict__, default_flow_style=False)
        return opt, opt_text


    def parse_args_and_yaml(self):
        """Public entry point: return only the parsed option namespace."""
        return self._parse_args_and_yaml()[0]
+
+
if __name__ == "__main__":
    # Manual test hook; the parser is normally driven from run_case1.py.
    # opt = _parse_args_and_yaml()
    pass
+
+
+
+

+ 77 - 0
ipfcst-forecast-LSTM-v1.0/config.yml

@@ -0,0 +1,77 @@
# Training-side configuration for the ultra-short-term power forecast system.
# Values here override the argparse defaults in config.py (see myargparse).

# Network / training hyper-parameters consumed as opt.Model[...]
Model:
  batch_size: 64
  dropout_rate: 0.2
  epoch: 20
  hidden_size: 128
  learning_rate: 0.001
  lstm_layers: 2
  patience: 5          # early-stopping patience (epochs without val_loss improvement)
  random_seed: 42
  time_step: 16        # LSTM window length in points
add_train: false       # true = fine-tune from previously saved weights
continue_flag: ''
# Excel file names appended to excel_data_path (envir is "环境数据.xls", escaped)
data_format:
  dq: dq.xls
  envir: "\u73AF\u5883\u6570\u636E.xls"
  nwp: nwp.xls
  rp: rp.xls
debug_model: false
debug_num: 500
do_continue_train: false
do_figure_save: false
do_log_print_to_screen: true
do_log_save_to_file: true
do_predict: true
do_train: true
do_train_visualized: True
excel_data_path: ./data/J00307/
figure_save_path: ./figure/
is_continuous_predict: False
log_save_path: ./log/
# Per-feature normalization means, written back automatically by DataSet.set_yml
mean:
  C_AIRT: 10.305992230762874
  C_CELLT: 10.664897925448384
  C_DIFFUSER: 143.2639061079428
  C_DIFFUSERDA: 6.571077155136789
  C_DIRECTR: 68.21328208942887
  C_DIRECTRDA: 3.163283039920654
  C_FORECAST: 3.1419734966774113
  C_GLOBALR: 173.2587817174973
  C_GLOBALRDA: 7.756491280271097
  C_HOURDA: 1.998222150590958
  C_P: 947.7830440532276
  C_RH: 55.59672286965865
  C_VALUE: 3.404744648318043
  C_WD: 212.88300686007108
  C_WS: 1.802446483180428
# Saved-model file extension per framework
model_postfix:
  keras: .h5
  pytorch: .pth
  tensorflow: .ckpt
predict_points: 16     # number of future points predicted per sample
shuffle_train_data: false
# Per-feature normalization standard deviations (paired with `mean` above)
std:
  C_AIRT: 12.127220611319888
  C_CELLT: 12.654848145970181
  C_DIFFUSER: 230.93680419867772
  C_DIFFUSERDA: 6.4933162833681415
  C_DIRECTR: 166.61348332191056
  C_DIRECTRDA: 4.991297839913351
  C_FORECAST: 4.447082956749344
  C_GLOBALR: 258.87947949591955
  C_GLOBALRDA: 7.9174382136573955
  C_HOURDA: 2.9110230573747247
  C_P: 25.75152505719027
  C_RH: 22.445059526990818
  C_VALUE: 5.013868885103326
  C_WD: 112.90029001408325
  C_WS: 1.6575249140627502
train_data_path: ./data/
train_data_rate: 0.9   # fraction of rows used for training (rest is the test set)
use_cuda: false
used_frame: keras      # one of the keys in model_postfix
valid_data_rate: 0.15  # fraction of the training split held out for validation

is_photovoltaic: True
cap: 20                # plant capacity, used to normalize RMSE into an accuracy score
envir_columns: 16

+ 182 - 0
ipfcst-forecast-LSTM-v1.0/dataset.py

@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# time: 2023/3/17 10:10
+# file: main.py
+# author: David
+# company: shenyang JY
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import yaml
+
+
class DataSet(object):
    """Loads the dq/rp/environment Excel tables, joins them on time,
    normalizes the features and produces windowed train/valid/test samples.

    Column layout of the merged table (after the join in tables_integra):
    column 0 is C_TIME, column 1 is the short-term forecast (dq) and the
    remaining columns are real power and environment features — TODO confirm
    against the Excel files, which are not visible here.
    """

    def __init__(self, opt):
        self.std = None
        self.mean = None
        self.opt = opt
        self.time_step = self.opt.Model["time_step"]
        excel_data_path = opt.excel_data_path
        data_format = opt.data_format
        dq_path = excel_data_path + data_format["dq"]
        rp_path = excel_data_path + data_format["rp"]
        envir_path = excel_data_path + data_format["envir"]
        nwp_path = excel_data_path + data_format["nwp"]

        # Column indices to read from each Excel sheet.
        dq_columns = [1, 2]
        rp_columns = [0, 2]
        envir_columns = [0, *[x for x in range(3, 16)]]
        nwp_columns = [x for x in range(1, 27)]

        dq = self.read_data(dq_path, dq_columns)
        rp = self.read_data(rp_path, rp_columns)
        # nwp = self.read_data(nwp_path, nwp_columns)
        # rp_average(rp)    # compute average power
        envir = self.read_data(envir_path, envir_columns)

        # tables_integra also computes self.mean / self.std as a side effect,
        # which __init__ relies on just below.
        self.tables, self.tables_column_name = self.tables_integra(dq, rp, envir)
        # Photovoltaic-specific filtering is currently disabled.
        if opt.is_photovoltaic:
            # self.tables = self.filter_data()
            pass
        self.data_num = self.tables.shape[0]
        self.train_num = int(self.data_num * opt.train_data_rate)

        # All computations below operate on the raw ndarray (no DataFrame).
        # Column 0 (C_TIME) is excluded from normalization.
        self.norm_data = (self.tables[:, 1:] - self.mean) / self.std
        # self.norm_data.insert(0, 'C_TIME', self.tables['C_TIME'])
        # self.set_yml({'mean': self.mean.to_dict(), 'std': self.std.to_dict()})
        self.start_num_in_test = 0

    def set_yml(self, yml_dict):
        """Merge yml_dict into the YAML config file and rewrite it in place."""
        with open(self.opt.config_yaml, 'r', encoding='utf-8') as f:
            cfg = yaml.safe_load(f)
        for k, v in yml_dict.items():
            cfg[k] = v
        with open(self.opt.config_yaml, 'w') as f:
            yaml.safe_dump(cfg, f, default_flow_style=False)

    def read_data(self, path, cols):
        """Read the given columns of an Excel file into a DataFrame."""
        init_data = pd.read_excel(path, usecols=cols)
        return init_data

    def filter_data(self):
        """Drop rows whose real power (column 2) is 0, i.e. no generation."""
        check_table = self.tables[:, 2]
        preserve_index = list(np.nonzero(check_table)[0])
        indexs = list(range(len(self.tables)))
        del_index = list(set(indexs) - set(preserve_index))
        self.tables = np.delete(self.tables, del_index, axis=0)
        return self.tables

    def norm(self, tables):
        """Compute per-column mean/std (excluding the time column) and
        persist them to config.yml on the first run.

        :param tables: merged DataFrame with C_TIME in column 0
        :return: None (sets self.mean / self.std as ndarrays)
        """
        mean = np.mean(tables.iloc[:, 1:], axis=0)  # per-feature mean
        std = np.std(tables.iloc[:, 1:], axis=0)  # per-feature standard deviation
        # Only write the stats back to the YAML file if they are not there yet.
        if hasattr(self.opt, 'mean') is False or hasattr(self.opt, 'std') is False:
            self.set_yml({'mean': mean.to_dict(), 'std': std.to_dict()})
        self.mean, self.std = mean.values, std.values

    def tables_integra(self, dq, rp, envir):
        """Join the three tables on C_TIME (inner merge).

        :param dq: short-term forecast power
        :param rp: real power
        :param envir: environment data
        :return: (merged ndarray, column names excluding the leading time column)
        """
        union_tables = pd.merge(dq, rp, on='C_TIME')
        union_tables = union_tables.merge(envir, on='C_TIME')
        self.norm(union_tables)
        return union_tables.values, union_tables.columns.tolist()[1:]

    

    def get_train_and_valid_data(self, case):
        """Build windowed training samples and split off a validation set.

        :param case: 1 = real power + weather features only;
                     2 = short-term forecast + real power + weather.
        :return: train_x, valid_x, train_y, valid_y (ndarrays)
        """
        feature_data = self.norm_data[:self.train_num]
        # label_data = self.norm_data[: self.train_num,
        #                             self.opt.label_in_feature_index]    # use data shifted a few days ahead as labels
        # Labels are the target columns shifted forward by predict_points.
        label_data = self.norm_data[self.opt.predict_points: self.opt.predict_points + self.train_num, self.opt.label_in_feature_index]
        time_step = self.opt.Model["time_step"]
        train_x, train_y = [], []
        if not self.opt.do_continue_train:
            # Non-continuous mode: every time_step rows form one sample, and
            # consecutive samples are offset by one row (rows 1-20, 2-21, ...).
            if case == 1: # real power + weather
                train_x = [feature_data[i:i + time_step] for i in range(self.train_num - time_step)]
                train_y = [label_data[i:i + time_step] for i in range(self.train_num - time_step)]
            elif case == 2: # short-term forecast + real power + weather
                # Features from window i, plus the dq column from the NEXT window,
                # concatenated column-wise into one sample.
                train_rp = [feature_data[i:i + time_step, 1:]for i in range(self.train_num - time_step*2)]
                train_qd = [feature_data[i + time_step: i + 2*time_step, 0][:, np.newaxis] for i in range(self.train_num - time_step*2)]
                train_x = [list(np.append(t[0], t[1], axis=1)) for t in zip(train_rp, train_qd)]
                train_y = [label_data[i:i + time_step] for i in range(self.train_num - time_step*2)]
        else:
            # Continuous-training mode is not implemented yet.
            pass
        train_x, train_y = np.array(train_x), np.array(train_y)

        # Split into train/validation (optionally shuffled).
        train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=self.opt.valid_data_rate,
                                                              random_state=self.opt.Model["random_seed"],
                                                              shuffle=self.opt.shuffle_train_data)
        return train_x, valid_x, train_y, valid_y

    def get_test_data(self, return_label_data=False):
        """Build test samples from the rows after the training split.

        :param return_label_data: also return labels and the dq reference
            series (real deployments have no labels).
        :return: test_x, or (test_x, test_y, dq_y) ndarrays
        """
        feature_data = self.norm_data[self.train_num:]

        # Guard against time_step being larger than the test set; the assert
        # below makes an undersized test set fail fast.
        sample_interval = min(feature_data.shape[0], self.time_step*2)
        assert sample_interval == self.time_step*2


        test_x, test_y, dq_y = [], [], []
        if self.opt.is_continuous_predict:
            # Sliding windows offset by one row.
            test_num = len(feature_data)
            test_x = [feature_data[
                       i : i + self.time_step]
                      for i in range(test_num - sample_interval)]
            test_y = [feature_data[
                       i + self.time_step: i + sample_interval, self.opt.label_in_feature_index]
                      for i in range(test_num - sample_interval)]
        else:
            # Non-overlapping windows offset by time_step rows
            # (rows 1-20, 21-40, ... until the end of the data).
            # TODO(review, original note by 刘大为): the test set should be rebuilt here.
            # Leading rows that do not fill a whole sample_interval are dropped.
            self.start_num_in_test = feature_data.shape[0] % sample_interval

            time_step_size = feature_data.shape[0] // sample_interval
            test_x = [feature_data[
                      self.start_num_in_test + i * sample_interval: self.start_num_in_test + i * sample_interval + self.time_step]
                      for i in range(time_step_size)]
            test_y = [feature_data[
                      self.start_num_in_test + i * sample_interval + self.time_step: self.start_num_in_test + (
                                  i + 1) * sample_interval, self.opt.label_in_feature_index]
                      for i in range(time_step_size)]
            # dq reference (column 0) aligned with the label windows.
            dq_y = [feature_data[
                      self.start_num_in_test + i * sample_interval + self.time_step: self.start_num_in_test + (
                                  i + 1) * sample_interval, 0][:, np.newaxis]
                      for i in range(time_step_size)]
            # test_x = [list(np.append(t[0], t[1], axis=1)) for t in zip(test_x, dq_y)]

        print("test_x的长度为:", len(test_x))
        pass
        # Re-pack test_x into timestamp-aligned steps (disabled):
        # for i, x in enumerate(test_x):
        #     p1 = x[0:16, 0]
        #     p2 = x[16:32, 1]
        #     p = [list(t) for t in zip(p1, p2)]
        #     test_x[i] = np.array(p)
        if return_label_data:
            return np.array(test_x), np.array(test_y), np.array(dq_y)
        return np.array(test_x)
+
+
if __name__ == "__main__":
    # DataSet requires a parsed option namespace; the old `DataSet()` call
    # raised TypeError (missing required argument `opt`). Build the options
    # with the project's YAML-aware parser before constructing the dataset.
    from config import myargparse
    opt = myargparse(discription="dataset smoke test", add_help=False).parse_args_and_yaml()
    ds = DataSet(opt=opt)
    # dq = ds.read_data(dq_path, dq_columns)[0]
    # rp = ds.read_data(rp_path, rp_columns)[0]
    # # rp_average(rp)    # compute average power
    # envir = ds.read_data(envir_path, envir_columns)[0]
    # tables = ds.tables_integra(dq, rp, envir)
    # ds.tables_norm_result(tables)

+ 73 - 0
ipfcst-forecast-LSTM-v1.0/figure.py

@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# time: 2023/3/20 15:19
+# file: figure.py
+# author: David
+# company: shenyang JY
+import sys
+import numpy as np
+import matplotlib.pyplot as plt
+
+
class Figure(object):
    """De-normalizes predictions, logs an accuracy score and plots the curves.

    opt    : parsed option namespace (output_size, label_in_feature_index,
             label_columns, cap, is_continuous_predict, predict_points,
             do_figure_save, figure_save_path, continue_flag, used_frame).
    logger : logging.Logger used for metric and prediction output.
    ds     : DataSet instance supplying mean/std and column names.
    """

    def __init__(self, opt, logger, ds):
        self.opt = opt
        self.ds = ds
        self.logger = logger

    def draw(self, label_data, dq_data, predict_norm_data):
        """Plot predicted vs. actual power and log the capacity-normalized score.

        All three inputs are normalized arrays; they are restored to physical
        units here using the dataset's saved mean/std.
        """
        dq_data = dq_data.reshape((-1, self.opt.output_size))
        label_data = label_data.reshape((-1, self.opt.output_size))
        # De-normalize labels and predictions with the stored statistics.
        label_data = label_data * self.ds.std[self.opt.label_in_feature_index] + \
                       self.ds.mean[self.opt.label_in_feature_index]
        predict_data = predict_norm_data * self.ds.std[self.opt.label_in_feature_index] + \
                       self.ds.mean[self.opt.label_in_feature_index]
        # Column 0 of the feature table holds the dq (short-term forecast) series.
        dq_data = dq_data * self.ds.std[0] + self.ds.mean[0]
        assert label_data.shape[0] == predict_data.shape[0], "The element number in origin and predicted data is different"

        label_name = [self.ds.tables_column_name[i] for i in self.opt.label_in_feature_index]
        label_column_num = len(self.opt.label_columns)

        # Accuracy score: 1 - RMSE / plant capacity.
        loss = np.sum((label_data - predict_data) ** 2)/len(label_data)  # mse
        loss_sqrt = np.sqrt(loss)   # rmse
        loss_norm = 1 - loss_sqrt / self.opt.cap
        self.logger.info("The mean squared error of power {} is ".format(label_name) + str(loss_norm))

        if self.opt.is_continuous_predict:
            label_X = range(int((self.ds.data_num - self.ds.train_num - 32)))
        else:
            # Non-overlapping windows use half of the remaining rows as labels.
            label_X = range(int((self.ds.data_num - self.ds.train_num - self.ds.start_num_in_test)/2))
        print("label_x = ", label_X)
        predict_X = [x for x in label_X]

        if not sys.platform.startswith('linux'):    # headless Linux cannot display figures; drop this guard on desktop Linux
            for i in range(label_column_num):
                plt.figure(i+1)                     # one figure per predicted column
                plt.plot(label_X, label_data[:, i], label='label', color='b')
                plt.plot(predict_X, predict_data[:, i], label='predict', color='g')
                plt.plot(predict_X, dq_data[:, i], label='dq', color='y')
                self.logger.info("The predicted power {} for the last {} point(s) is: ".format(label_name[i], self.opt.predict_points) +
                      str(np.squeeze(predict_data[-self.opt.predict_points:, i])))

                if self.opt.do_figure_save:
                    # BUG FIX: the original used bare `opt.used_frame`, which is
                    # undefined inside this method and raised NameError.
                    plt.savefig(self.opt.figure_save_path+"{}predict_{}_with_{}.png".format(self.opt.continue_flag, label_name[i], self.opt.used_frame))

            plt.show()

+ 43 - 0
ipfcst-forecast-LSTM-v1.0/logger.py

@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# time: 2023/3/20 15:19
+# file: logger.py
+# author: David
+# company: shenyang JY
+
+import logging, sys
+from logging.handlers import RotatingFileHandler
+
+
def load_logger(config):
    """Configure and return the root logger.

    Adds a stdout handler when config.do_log_print_to_screen is set and a
    rotating file handler (plus a dump of the config attributes) when
    config.do_log_save_to_file is set.
    """
    root = logging.getLogger()
    root.setLevel(level=logging.DEBUG)

    # Console output
    if config.do_log_print_to_screen:
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(level=logging.INFO)
        console.setFormatter(logging.Formatter(datefmt='%Y/%m/%d %H:%M:%S',
                                               fmt='[ %(asctime)s ] %(message)s'))
        root.addHandler(console)

    # Rotating log file
    if config.do_log_save_to_file:
        rotating = RotatingFileHandler(config.log_save_path + "out.log", maxBytes=1024000, backupCount=5)
        rotating.setLevel(level=logging.INFO)
        rotating.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        root.addHandler(rotating)

        # Record the (public) config attributes in the log file as well.
        settings = {name: getattr(config, name) for name in dir(config) if not name.startswith("_")}
        entries = str(settings)[1:-1].split(", '")
        root.info("\nConfig:\n" + "\n'".join(entries))

    return root

+ 0 - 0
ipfcst-forecast-LSTM-v1.0/model/__init__.py


+ 48 - 0
ipfcst-forecast-LSTM-v1.0/model/model_keras.py

@@ -0,0 +1,48 @@
+# -*- coding: UTF-8 -*-
+from keras.layers import Input, Dense, LSTM
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint, EarlyStopping
+
+
def get_keras_model(opt):
    """Assemble and compile the stacked-LSTM regressor described by *opt*.

    Input shape is (time_step, input_size); every LSTM layer returns the full
    sequence, and a Dense head maps each step to output_size values.
    """
    net_in = Input(shape=(opt.Model['time_step'], opt.input_size))
    hidden = net_in
    for _ in range(opt.Model['lstm_layers']):
        hidden = LSTM(units=opt.Model['hidden_size'], dropout=opt.Model['dropout_rate'], return_sequences=True)(hidden)
    net_out = Dense(opt.output_size)(hidden)
    net = Model(net_in, net_out)
    net.compile(loss='mse', optimizer='adam')     # metrics=["mae"]
    return net
+
+
def gpu_train_init():
    """Register a TF1 session that shares the GPU politely.

    Caps this process at 70% of GPU memory and allocates it on demand
    instead of grabbing everything at startup. TF1-only API.
    """
    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session
    conf = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
    conf.gpu_options.allow_growth = True                       # grow allocations lazily
    conf.gpu_options.per_process_gpu_memory_fraction = 0.7     # hard cap at 70% of VRAM
    set_session(tf.Session(config=conf))
+
+
def train(opt, train_and_valid_data):
    """Fit the keras model on [train_X, train_Y, valid_X, valid_Y].

    Saves the best-val_loss weights via ModelCheckpoint and stops early after
    opt.Model['patience'] epochs without improvement. When opt.add_train is
    set, training starts from the previously saved weights (fine-tuning).
    """
    if opt.use_cuda:
        gpu_train_init()
    x_train, y_train, x_valid, y_valid = train_and_valid_data
    net = get_keras_model(opt)
    net.summary()
    if opt.add_train:
        # Incremental training: resume from the saved checkpoint.
        net.load_weights(opt.model_save_path + opt.model_name)

    callbacks = [
        ModelCheckpoint(filepath=opt.model_save_path + opt.model_name, monitor='val_loss',
                        save_best_only=True, mode='auto'),
        EarlyStopping(monitor='val_loss', patience=opt.Model['patience'], mode='auto'),
    ]
    net.fit(x_train, y_train, batch_size=opt.Model['batch_size'], epochs=opt.Model['epoch'], verbose=2,
            validation_data=(x_valid, y_valid), callbacks=callbacks)
+
+
def predict(config, test_X):
    """Load the saved weights and return predictions shaped (n, output_size).

    Output is still in normalized units; the caller de-normalizes it.
    """
    net = get_keras_model(config)
    net.load_weights(config.model_save_path + config.model_name)
    raw = net.predict(test_X, batch_size=1)
    return raw.reshape((-1, config.output_size))

+ 8 - 0
ipfcst-forecast-LSTM-v1.0/requirements.txt

@@ -0,0 +1,8 @@
scikit-learn
+pandas
+argparse
+keras
+tensorflow==1.15
+matplotlib>=3.0.2
+numpy>=1.14.6
+scipy>=1.1.0

+ 56 - 0
ipfcst-forecast-LSTM-v1.0/run_case1.py

@@ -0,0 +1,56 @@
+# -*- coding: UTF-8 -*-
+
+import numpy as np
+import os
+import sys
+import time
+from figure import Figure
+from dataset import DataSet
+from logger import load_logger
+from config import myargparse
+
# Deep-learning framework selector; only the keras backend is wired up here.
frame = "keras"

if frame == "keras":
    from model.model_keras import train, predict
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'  # silence TensorFlow C++ info/warning spam
else:
    raise Exception("Wrong frame seletion")
+
+
def main():
    """Parse configuration, build the dataset, then train and/or predict.

    Any exception is logged (with traceback) rather than propagated, so a
    failed run still leaves a usable log file.
    """
    parser = myargparse(discription="training config", add_help=False)
    opt = parser.parse_args_and_yaml()
    logger = load_logger(opt)
    try:
        np.random.seed(opt.Model["random_seed"])
        # Load and window the data here.
        ds = DataSet(opt=opt)
        if opt.do_train:
            train_X, valid_X, train_Y, valid_Y = ds.get_train_and_valid_data(case=2)
            train(opt, [train_X, train_Y, valid_X, valid_Y])
        if opt.do_predict:
            test_X, test_Y, dq_Y = ds.get_test_data(return_label_data=True)
            normalized_pred = predict(opt, test_X)  # predictions are still normalized here
            Figure(opt, logger, ds).draw(test_Y, dq_Y, normalized_pred)
    except Exception:
        logger.error("Run Error", exc_info=True)
+
+
if __name__ == "__main__":
    # The old block imported argparse and carried a commented-out example of
    # overriding Config attributes from the command line; neither was used.
    # Command-line overrides are handled inside myargparse (see config.py).
    main()
+

+ 142 - 0
ipfcst-forecast-LSTM-v1.0/run_case_history.py

@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# time: 2023/3/20 9:23
+# file: run_case_history.py
+# author: David
+# company: shenyang JY
+
class Data:
    """Historical CSV-based data pipeline kept for reference.

    NOTE(review): this module has no visible imports — np, pd and
    train_test_split are undefined here, so this file is not runnable as-is;
    it documents the pre-Excel workflow.
    """

    def __init__(self, config):
        self.config = config
        self.data, self.data_column_name = self.read_data()

        self.data_num = self.data.shape[0]
        self.train_num = int(self.data_num * self.config.train_data_rate)

        self.mean = np.mean(self.data, axis=0)              # per-column mean and spread
        self.std = np.std(self.data, axis=0)
        self.norm_data = (self.data - self.mean)/self.std   # normalize / remove units

        self.start_num_in_test = 0      # leading test rows dropped because they don't fill a time_step

    def read_data(self):
        """Read the training CSV (optionally truncated in debug mode)."""
        if self.config.debug_mode:
            init_data = pd.read_csv(self.config.train_data_path, nrows=self.config.debug_num,
                                    usecols=self.config.feature_columns)
        else:
            init_data = pd.read_csv(self.config.train_data_path, usecols=self.config.feature_columns)
        init_data = self.filter_data(init_data)
        return init_data.values, init_data.columns.tolist()     # .columns.tolist() yields the column names

    def filter_data(self, init_data):
        # Drop rows whose feature sum is 0 (no generation recorded).
        return init_data[init_data.apply(np.sum, axis=1)!=0]

    def get_train_and_valid_data(self):
        """Window the normalized data into (x, y) samples and split off validation."""
        feature_data = self.norm_data[:self.train_num]
        label_data = self.norm_data[: self.train_num,
                                    self.config.label_in_feature_index]    # data shifted forward serves as the label

        if not self.config.do_continue_train:
            # Non-continuous mode: each time_step rows form one sample, samples
            # offset by one row (rows 1-20, 2-21, ...). Here each sample pairs
            # real power from window i with forecast power from window i+1.
            train_x, train_y = [], []
            for i in range(self.train_num-self.config.time_step*2):
                p1 = feature_data[:, 0][i:i+self.config.start_predict_point]
                p2 = feature_data[:, 1][i+self.config.start_predict_point:i+self.config.start_predict_point*2]
                p = [list(t) for t in zip(p1, p2)]  # (real power, forecast power) pairs as features
                l = label_data[i+self.config.start_predict_point:i+self.config.start_predict_point*2]
                train_x.append(p)
                train_y.append(l)
            # train_x = [feature_data[i:i+self.config.time_step] for i in range(self.train_num-self.config.time_step)]
            # train_y = [label_data[i+self.config.start_predict_point:i+self.config.time_step] for i in range(self.train_num-self.config.time_step)]
            # the last 16 points are used as the prediction target
        else:
            # Continuous mode: samples offset by time_step rows
            # (rows 1-20, 21-40, ... to the end, then 2-21, 22-41, ...), so the
            # previous sample's final_state can seed the next sample's init_state;
            # must not shuffle. Only usable with the pytorch RNN models.
            train_x = [feature_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                       for start_index in range(self.config.time_step)
                       for i in range((self.train_num - start_index) // self.config.time_step)]
            train_y = [label_data[start_index + i*self.config.time_step : start_index + (i+1)*self.config.time_step]
                       for start_index in range(self.config.time_step)
                       for i in range((self.train_num - start_index) // self.config.time_step)]

        train_x, train_y = np.array(train_x), np.array(train_y)

        # Train/validation split (optionally shuffled).
        train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=self.config.valid_data_rate,
                                                              random_state=self.config.random_seed,
                                                              shuffle=self.config.shuffle_train_data)
        return train_x, valid_x, train_y, valid_y
+
+
class Config:
    """Historical class-attribute configuration kept for reference.

    NOTE(review): the class body references `frame`, `os` and `time`, none of
    which are defined/imported in this file, and it creates directories as a
    side effect at import time — this snippet is not runnable as-is. The live
    configuration now lives in config.py/config.yml.
    """

    # Data parameters
    # feature_columns = list(range(2, 9))     # feature columns, 0-based in the raw data; a list like [2,4,6,8] also works
    feature_columns = list(range(1, 3))
    # label_columns = [4, 5]                  # columns to predict, 0-based; e.g. predict columns 4 and 5 together
    label_columns = [1]
    # label_in_feature_index = [feature_columns.index(i) for i in label_columns]  # this form doesn't work
    # Map label columns to their positions inside feature_columns (features may not start at 0).
    label_in_feature_index = (lambda x,y: [x.index(i) for i in y])(feature_columns, label_columns)

    predict_day = 1             # how many days ahead to predict
    predict_points = 16
    # Network parameters
    input_size = len(feature_columns)
    output_size = len(label_columns)

    hidden_size = 128           # LSTM hidden size (also its output size)
    lstm_layers = 2             # number of stacked LSTM layers
    dropout_rate = 0.2          # dropout probability
    time_step = 16             # window length / LSTM time steps; ensure the training set is larger than this
    start_predict_point = 16

    # Training parameters
    do_train = True
    do_predict = True
    add_train = False           # load existing weights and fine-tune (incremental training)
    shuffle_train_data = False   # whether to shuffle the training data
    use_cuda = False            # whether to train on GPU

    train_data_rate = 0.95      # fraction of data used for training; the rest (1 - rate) is the test set
    valid_data_rate = 0.15      # fraction of training data held out for model/parameter selection

    batch_size = 64
    learning_rate = 0.001
    epoch = 20                  # full passes over the training set (ignoring early stopping)
    patience = 5                # stop after this many epochs without validation improvement
    random_seed = 42            # seed for reproducibility

    do_continue_train = False    # carry final_state into the next sample's init_state; RNN models, pytorch only
    continue_flag = ""           # results were poor in practice, likely because batch_size must be 1
    if do_continue_train:
        shuffle_train_data = False
        batch_size = 1
        continue_flag = "continue_"

    # Run mode
    debug_mode = False  # debug mode trades correctness checks for speed, just to exercise the code path
    debug_num = 500  # only use this many rows when debugging

    # Framework parameters
    used_frame = frame  # chosen DL framework; the saved-model extension differs per framework
    model_postfix = {"pytorch": ".pth", "keras": ".h5", "tensorflow": ".ckpt"}
    model_name = "model_" + continue_flag + used_frame + model_postfix[used_frame]

    # Path parameters
    train_data_path = "./data/J00285.csv"
    model_save_path = "./checkpoint/" + used_frame + "/"
    figure_save_path = "./figure/"
    log_save_path = "./log/"
    do_log_print_to_screen = True
    do_log_save_to_file = True                  # record config and training progress to the log
    do_figure_save = False
    do_train_visualized = False          # loss visualization: visdom for pytorch, tensorboardX for tf; keras has none
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)    # makedirs creates directories recursively
    if not os.path.exists(figure_save_path):
        os.mkdir(figure_save_path)
    if do_train and (do_log_save_to_file or do_train_visualized):
        cur_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
        log_save_path = log_save_path + cur_time + '_' + used_frame + "/"
        os.makedirs(log_save_path)