dataset.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/3/17 10:10
# file: dataset.py
# author: David
# company: shenyang JY
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import yaml
class DataSet(object):
    def __init__(self, opt):
        self.std = None
        self.mean = None
        self.opt = opt
        self.time_step = self.opt.Model["time_step"]
        excel_data_path = opt.excel_data_path
        data_format = opt.data_format
        dq_path = excel_data_path + data_format["dq"]
        rp_path = excel_data_path + data_format["rp"]
        envir_path = excel_data_path + data_format["envir"]
        nwp_path = excel_data_path + data_format["nwp"]
        dq_columns = [1, 2]
        rp_columns = [0, 2]
        envir_columns = [0, *range(3, 16)]
        nwp_columns = list(range(1, 27))
        dq = self.read_data(dq_path, dq_columns)
        rp = self.read_data(rp_path, rp_columns)
        # nwp = self.read_data(nwp_path, nwp_columns)
        # rp_average(rp)  # compute the average power
        envir = self.read_data(envir_path, envir_columns)
        self.tables, self.tables_column_name = self.tables_integra(dq, rp, envir)
        # Photovoltaic plants: zero-power rows could be filtered out here
        if opt.is_photovoltaic:
            # self.tables = self.filter_data()
            pass
        self.data_num = self.tables.shape[0]
        self.train_num = int(self.data_num * opt.train_data_rate)
        # Everything below operates on the ndarray values
        self.norm_data = (self.tables[:, 1:] - self.mean) / self.std  # normalize (remove units)
        # self.norm_data.insert(0, 'C_TIME', self.tables['C_TIME'])
        # self.set_yml({'mean': self.mean.to_dict(), 'std': self.std.to_dict()})
        self.start_num_in_test = 0
    def set_yml(self, yml_dict):
        with open(self.opt.config_yaml, 'r', encoding='utf-8') as f:
            cfg = yaml.safe_load(f)
        for k, v in yml_dict.items():
            cfg[k] = v
        with open(self.opt.config_yaml, 'w', encoding='utf-8') as f:
            yaml.safe_dump(cfg, f, default_flow_style=False)
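    # Illustrative only: after set_yml({'mean': {...}, 'std': {...}}) the YAML file
    # gains two top-level mappings keyed by column name, e.g.
    #   mean:
    #     C_REAL_VALUE: 12.3
    #   std:
    #     C_REAL_VALUE: 4.5
    # (the column name and values here are hypothetical, shown only to document the format)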
    def read_data(self, path, cols):
        init_data = pd.read_excel(path, usecols=cols)
        return init_data
    def filter_data(self):
        check_table = self.tables[:, 2]  # actual power must not be 0; a 0 means no generation
        preserve_index = list(np.nonzero(check_table)[0])
        indexs = list(range(len(self.tables)))
        del_index = list(set(indexs) - set(preserve_index))
        self.tables = np.delete(self.tables, del_index, axis=0)
        return self.tables
    def norm(self, tables):
        """
        Normalization: compute the column mean/std and persist them to config.yml.
        :param tables: joined DataFrame (first column is C_TIME)
        :return:
        """
        mean = np.mean(tables.iloc[:, 1:], axis=0)  # column means
        std = np.std(tables.iloc[:, 1:], axis=0)    # column standard deviations
        if not hasattr(self.opt, 'mean') or not hasattr(self.opt, 'std'):
            self.set_yml({'mean': mean.to_dict(), 'std': std.to_dict()})
        self.mean, self.std = mean.values, std.values
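    # Note (derived from the code above): predictions made in the normalized space
    # can be mapped back to physical units with
    #     original = normalized * self.std[idx] + self.mean[idx]
    # where idx selects the label columns; that inverse step is assumed to live in
    # downstream code and is shown here only for reference.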
    def tables_integra(self, dq, rp, envir):
        """
        Join the source tables.
        :param dq: short-term forecast power
        :param rp: actual power
        :param envir: environmental measurements
        :return: joined table (ndarray), column names (excluding the leading time column)
        """
        # 1. Join dq, rp and envir on the timestamp
        union_tables = pd.merge(dq, rp, on='C_TIME')
        union_tables = union_tables.merge(envir, on='C_TIME')
        self.norm(union_tables)
        return union_tables.values, union_tables.columns.tolist()[1:]
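    # Resulting layout (as used elsewhere in this class): column 0 is C_TIME,
    # column 1 the short-term forecast (dq), column 2 the actual power (rp),
    # and the remaining columns the environment features; self.norm_data drops
    # the C_TIME column, so its column 0 is the forecast.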
    def get_train_and_valid_data(self, case):
        feature_data = self.norm_data[:self.train_num]
        # label_data = self.norm_data[:self.train_num, self.opt.label_in_feature_index]
        # Use the rows shifted forward by predict_points as labels
        label_data = self.norm_data[self.opt.predict_points: self.opt.predict_points + self.train_num,
                                    self.opt.label_in_feature_index]
        time_step = self.opt.Model["time_step"]
        train_x, train_y = [], []
        if not self.opt.do_continue_train:
            # In non-continuous training mode every time_step rows form one sample,
            # and consecutive samples are offset by one row, e.g. rows 1-20, 2-21, ...
            if case == 1:  # actual power + weather
                train_x = [feature_data[i:i + time_step] for i in range(self.train_num - time_step)]
                train_y = [label_data[i:i + time_step] for i in range(self.train_num - time_step)]
            elif case == 2:  # short-term forecast + actual power + weather
                train_rp = [feature_data[i:i + time_step, 1:] for i in range(self.train_num - time_step * 2)]
                train_qd = [feature_data[i + time_step: i + 2 * time_step, 0][:, np.newaxis]
                            for i in range(self.train_num - time_step * 2)]
                train_x = [list(np.append(t[0], t[1], axis=1)) for t in zip(train_rp, train_qd)]
                train_y = [label_data[i:i + time_step] for i in range(self.train_num - time_step * 2)]
        else:
            # Continuous training mode is not implemented yet
            pass
        train_x, train_y = np.array(train_x), np.array(train_y)
        # Split into training and validation sets (optionally shuffled)
        train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y,
                                                              test_size=self.opt.valid_data_rate,
                                                              random_state=self.opt.Model["random_seed"],
                                                              shuffle=self.opt.shuffle_train_data)
        return train_x, valid_x, train_y, valid_y
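    # Shape sketch for case 1 (illustrative, e.g. time_step = 16):
    #   train_x: (n_samples, 16, n_features)  -> rows i .. i+15 of feature_data
    #   train_y: (n_samples, 16, n_labels)    -> the same rows of label_data,
    #            i.e. feature rows shifted forward by predict_points
    # For case 2 the last feature column of each window is the dq forecast taken
    # from the *next* time_step rows, appended via np.append above.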
    def get_test_data(self, return_label_data=False):
        feature_data = self.norm_data[self.train_num:]
        sample_interval = min(feature_data.shape[0], self.time_step * 2)  # guard against time_step exceeding the test set size
        assert sample_interval == self.time_step * 2
        test_x, test_y, dq_y = [], [], []
        if self.opt.is_continuous_predict:
            test_num = len(feature_data)
            test_x = [feature_data[i: i + self.time_step]
                      for i in range(test_num - sample_interval)]
            test_y = [feature_data[i + self.time_step: i + sample_interval, self.opt.label_in_feature_index]
                      for i in range(test_num - sample_interval)]
        else:
            # In the test data every time_step rows form one sample and consecutive
            # samples are offset by time_step rows, e.g. rows 1-20, 21-40, ... to the end.
            # TODO: the test set construction here needs to be reworked (Liu Dawei)
            self.start_num_in_test = feature_data.shape[0] % sample_interval  # leading rows that do not fill a whole sample_interval
            time_step_size = feature_data.shape[0] // sample_interval
            test_x = [feature_data[self.start_num_in_test + i * sample_interval:
                                   self.start_num_in_test + i * sample_interval + self.time_step]
                      for i in range(time_step_size)]
            test_y = [feature_data[self.start_num_in_test + i * sample_interval + self.time_step:
                                   self.start_num_in_test + (i + 1) * sample_interval,
                                   self.opt.label_in_feature_index]
                      for i in range(time_step_size)]
            dq_y = [feature_data[self.start_num_in_test + i * sample_interval + self.time_step:
                                 self.start_num_in_test + (i + 1) * sample_interval, 0][:, np.newaxis]
                    for i in range(time_step_size)]
            # test_x = [list(np.append(t[0], t[1], axis=1)) for t in zip(test_x, dq_y)]
            print("length of test_x:", len(test_x))
        # Re-slice test_x into time_step-sized windows if needed
        # for i, x in enumerate(test_x):
        #     p1 = x[0:16, 0]
        #     p2 = x[16:32, 1]
        #     p = [list(t) for t in zip(p1, p2)]
        #     test_x[i] = np.array(p)
        if return_label_data:  # in real deployment the test set has no label data
            return np.array(test_x), np.array(test_y), np.array(dq_y)
        return np.array(test_x)
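    # Note on the non-continuous branch above (derived from the code): the first
    # start_num_in_test rows of the test partition are dropped so the remainder
    # divides evenly into blocks of 2 * time_step; within each block the first
    # time_step rows become the model input and the last time_step rows provide
    # the labels (test_y) and the dq reference (dq_y).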
if __name__ == "__main__":
    # DataSet needs a parsed config object (opt); build it before running this block
    ds = DataSet()
    # dq = ds.read_data(dq_path, dq_columns)[0]
    # rp = ds.read_data(rp_path, rp_columns)[0]
    # # rp_average(rp)  # compute the average power
    # envir = ds.read_data(envir_path, envir_columns)[0]
    # tables = ds.tables_integra(dq, rp, envir)
    # ds.tables_norm_result(tables)
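# Usage sketch (illustrative only): the real `opt` object is produced elsewhere in
# the project; the attribute names below mirror the ones accessed in this file,
# but the construction itself and all values are hypothetical.
#
#   from types import SimpleNamespace
#   opt = SimpleNamespace(
#       excel_data_path='./data/',
#       data_format={'dq': 'dq.xlsx', 'rp': 'rp.xlsx', 'envir': 'envir.xlsx', 'nwp': 'nwp.xlsx'},
#       Model={'time_step': 16, 'random_seed': 42},
#       config_yaml='./config.yml',
#       is_photovoltaic=True,
#       train_data_rate=0.8,
#       valid_data_rate=0.2,
#       predict_points=16,
#       label_in_feature_index=[1],
#       do_continue_train=False,
#       shuffle_train_data=True,
#       is_continuous_predict=False,
#   )
#   ds = DataSet(opt)
#   train_x, valid_x, train_y, valid_y = ds.get_train_and_valid_data(case=1)
#   test_x, test_y, dq_y = ds.get_test_data(return_label_data=True)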