data_features.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # time: 2023/4/12 17:42
  4. # file: data_features.py
  5. # author: David
  6. # company: shenyang JY
  7. import pandas as pd
  8. from sklearn.model_selection import train_test_split
  9. import numpy as np
  10. from data_utils import *
  class data_features(object):
      """Turn raw DataFrames into sliding-window arrays and normalise them.

      The instance keeps the per-column ``mean`` / ``std`` lookups passed to the
      constructor and remembers, in ``self.columns``, the column order of the
      most recently built feature windows so that ``norm_features`` can line the
      statistics up with the last axis of the feature array.
      """

      def __init__(self, opt, mean, std):
          """Store the options object and the normalisation statistics.

          Args:
              opt: options object; this class reads ``opt.Model["time_step"]``,
                  ``opt.Model["random_seed"]``, ``opt.predict_points``,
                  ``opt.label_columns``, ``opt.valid_data_rate`` and
                  ``opt.shuffle_train_data``, and writes ``opt.feature_columns``,
                  ``opt.label_in_feature_index`` and ``opt.input_size``.
              mean: mapping indexable by column name (per-column mean).
              std: mapping indexable by column name (per-column std).
          """
          self.opt = opt
          self.time_step = self.opt.Model["time_step"]
          self.mean = mean
          self.std = std
          self.columns = list()

      def _register_feature_columns(self, dfs):
          """Record feature-column metadata on ``self.opt`` from the first frame."""
          columns = dfs[0].columns.tolist()
          # 'C_TIME' is prepended because get_data_features() turns the index into
          # a leading column via reset_index().
          # assumes the DataFrame index is named 'C_TIME' -- TODO confirm
          columns.insert(0, 'C_TIME')
          self.opt.feature_columns = columns
          # Label columns are located by name because the features do not
          # necessarily start at position 0.
          self.opt.label_in_feature_index = [columns.index(name) for name in self.opt.label_columns]
          self.opt.input_size = len(columns)

      def _window_arrays(self, df):
          """Return ``(x, y, label_windows)`` for one DataFrame.

          ``x`` is the array built from the feature windows, ``y`` holds the
          'C_REAL_VALUE' values of every label window with one trailing axis
          added, and ``label_windows`` is the raw list of label DataFrames.
          """
          datax, datay = self.get_data_features(df)
          x = np.array(datax)
          y = np.array([window['C_REAL_VALUE'].values for window in datay])
          y = np.expand_dims(y, axis=-1)  # add a trailing axis
          return x, y, datay

      @staticmethod
      def _safe_std(std):
          """Return *std* with exact zeros replaced by 1.0.

          A constant column has zero standard deviation; dividing by it would
          fill the output with NaN/inf, so such columns are only mean-centred.
          """
          return np.where(np.asarray(std) == 0, 1.0, std)

      def get_train_data(self, dfs):
          """Build normalised training and validation arrays.

          Each DataFrame is windowed and split separately with
          ``train_test_split``; the pieces are then concatenated along axis 0
          and normalised.

          Args:
              dfs: sequence of DataFrames; the first one defines the feature
                  columns recorded on ``self.opt``.

          Returns:
              ``(train_x, valid_x, train_y, valid_y)``.
          """
          self._register_feature_columns(dfs)
          train_x, valid_x, train_y, valid_y = [], [], [], []
          for df in dfs:
              x, y, _ = self._window_arrays(df)
              # split this frame's windows into training and validation parts
              tx, vx, ty, vy = train_test_split(x, y, test_size=self.opt.valid_data_rate,
                                                random_state=self.opt.Model["random_seed"],
                                                shuffle=self.opt.shuffle_train_data)
              train_x.append(tx)
              valid_x.append(vx)
              train_y.append(ty)
              valid_y.append(vy)

          train_x = self.norm_features(np.concatenate(train_x, axis=0))
          valid_x = self.norm_features(np.concatenate(valid_x, axis=0))
          train_y = self.norm_label(np.concatenate(train_y, axis=0))
          valid_y = self.norm_label(np.concatenate(valid_y, axis=0))
          return train_x, valid_x, train_y, valid_y

      def get_test_data(self, dfs):
          """Build normalised test arrays plus the raw label windows.

          Args:
              dfs: sequence of DataFrames; the first one defines the feature
                  columns recorded on ``self.opt``.

          Returns:
              ``(test_x, test_y, data_y)`` where ``data_y`` has one entry per
              input DataFrame: the list of label windows
              (columns 'C_TIME' and 'C_REAL_VALUE') produced for it.
          """
          self._register_feature_columns(dfs)
          test_x, test_y, data_y = [], [], []
          for df in dfs:
              x, y, label_windows = self._window_arrays(df)
              test_x.append(x)
              test_y.append(y)
              data_y.append(label_windows)

          test_x = self.norm_features(np.concatenate(test_x, axis=0))
          test_y = self.norm_label(np.concatenate(test_y, axis=0))
          return test_x, test_y, data_y

      def get_data_features(self, df):
          """Cut one DataFrame into sliding feature and label windows.

          Feature rows come from columns 'C_TIME'..'C_WD_INST120' of the frame
          with its last ``predict_points`` rows removed, joined column-wise with
          columns 'C_T' onwards of the frame shifted forward by
          ``predict_points`` rows. Each window spans ``time_step`` consecutive
          rows, is indexed by the original 'C_TIME' values, and has its
          'C_TIME' column passed through ``datetime_to_timestr``.

          Side effect: ``self.columns`` is set to the feature-window column
          order whenever at least one window is produced.

          Args:
              df: source DataFrame.
                  # assumes its index is named 'C_TIME' -- TODO confirm

          Returns:
              ``(features_x, features_y)``: two equally long lists of
              DataFrames; ``features_y`` windows hold 'C_TIME' and
              'C_REAL_VALUE' of the shifted frame.
          """
          frame = df.reset_index()
          horizon = self.opt.predict_points
          # frame[:-0] is an empty slice, so a zero horizon must keep every row.
          feature_data = frame if horizon == 0 else frame[:-horizon]
          label_data = frame[horizon:].reset_index(drop=True)

          time_step = self.opt.Model["time_step"]
          train_num = len(feature_data)
          # NOTE(review): this excludes the final complete window (a "+ 1" would
          # include it); kept as-is so the sample count does not change.
          n_windows = train_num - time_step
          if n_windows <= 0:
              return [], []

          # Join, index and convert the time column once for the whole frame
          # instead of repeating the same work for every window.
          combined = pd.concat([feature_data.loc[:, 'C_TIME':'C_WD_INST120'],
                                label_data.loc[:, 'C_T':]], axis=1)
          combined.set_index('C_TIME', inplace=True, drop=False)
          combined["C_TIME"] = combined["C_TIME"].apply(datetime_to_timestr)
          self.columns = combined.columns.tolist()

          # .copy() keeps every window independent of the shared frame.
          features_x = [combined.iloc[start:start + time_step].copy() for start in range(n_windows)]
          # .loc slicing is label-based and inclusive, hence the "- 1".
          features_y = [label_data.loc[start:start + time_step - 1, ['C_TIME', 'C_REAL_VALUE']]
                        for start in range(n_windows)]
          return features_x, features_y

      def norm_features(self, data: np.ndarray):
          """Standardise *data* with the statistics of ``self.columns``.

          The mean/std vectors are built in ``self.columns`` order and broadcast
          against *data*; columns whose std is exactly zero are only centred.
          """
          mean = np.array([self.mean[col] for col in self.columns])
          std = np.array([self.std[col] for col in self.columns])
          return (data - mean) / self._safe_std(std)

      def norm_label(self, label_data: np.ndarray):
          """Standardise *label_data* with the 'C_REAL_VALUE' mean and std.

          A std of exactly zero is treated as 1.0 so the result stays finite.
          """
          return (label_data - self.mean['C_REAL_VALUE']) / self._safe_std(self.std['C_REAL_VALUE'])