#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/4/12 17:42
# file: data_features.py
# author: David
# company: shenyang JY
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from data_utils import *
  11. class data_features(object):
  12. def __init__(self, opt, mean, std):
  13. self.opt = opt
  14. self.time_step = self.opt.Model["time_step"]
  15. self.mean = mean
  16. self.std = std
  17. self.columns_lstm = list()
  18. self.labels = list()
  19. def get_train_data(self, dfs):
  20. train_x, valid_x, train_y, valid_y = [], [], [], []
  21. for i, df in enumerate(dfs):
  22. if len(df) <= self.opt.Model["time_step"]:
  23. continue
  24. trainx_, trainy = self.get_data_features(df)
  25. trainx_ = [np.array(x) for x in trainx_]
  26. trainy_ = [y.iloc[:, 1:].values for y in trainy]
  27. tx, vx, ty, vy = train_test_split(trainx_, trainy_, test_size=self.opt.valid_data_rate,
  28. random_state=self.opt.Model["random_seed"],
  29. shuffle=self.opt.shuffle_train_data) # 划分训练和验证集
  30. # 分裂 tx 和 vx
  31. train_x.extend(tx)
  32. valid_x.extend(vx)
  33. train_y.extend(ty)
  34. valid_y.extend(vy)
  35. # train_y = np.concatenate(train_y, axis=0)
  36. # valid_y = np.concatenate(valid_y, axis=0)
  37. train_x = self.norm_features(train_x)
  38. valid_x = self.norm_features(valid_x)
  39. train_y = self.norm_label(train_y)
  40. valid_y = self.norm_label(valid_y)
  41. print("训练的数据集有{}个点".format(len(train_x)))
  42. return np.array(train_x), np.array(valid_x), np.array(train_y), np.array(valid_y)
  43. def get_test_data(self, dfs):
  44. test_x, test_y, data_y = [], [], []
  45. for df in dfs:
  46. if len(df) <= self.opt.Model["time_step"]:
  47. continue
  48. testx_, testy = self.get_data_features(df)
  49. testx_ = [np.array(x) for x in testx_]
  50. testy_ = [y.iloc[:, 1:].values for y in testy]
  51. test_x.extend(testx_)
  52. test_y.extend(testy_)
  53. data_y.extend(testy)
  54. test_y = np.concatenate(test_y, axis=0)
  55. test_x = self.norm_features(test_x)
  56. test_y = self.norm_label(test_y)
  57. print("测试的数据集有{}个点".format(len(test_x)))
  58. return np.array(test_x), test_y, data_y
  59. def get_data_features(self, feature_data): # 这段代码基于pandas方法的优化
  60. time_step = self.opt.Model["time_step"]
  61. time_step_loc = time_step - 1
  62. train_num = int(len(feature_data))
  63. features_x = [feature_data.loc[i:i + time_step_loc, 'C_T':'C_WS170'] for i in range(train_num - time_step)]
  64. features_y = [feature_data.loc[i:i + time_step_loc, ['C_TIME', 'C_ACTIVE_POWER1', 'C_ACTIVE_POWER2', 'SUM', 'C_REAL_VALUE']] for i in range(train_num - time_step)]
  65. self.columns_lstm = features_x[0].columns.tolist()
  66. self.labels = features_y[0].columns.tolist()
  67. self.labels.remove('C_TIME')
  68. self.opt.input_size_lstm = len(self.columns_lstm)
  69. # self.opt.input_size_lstm = len(self.columns_lstm)
  70. return features_x, features_y
  71. def norm_features(self, data: np.ndarray):
  72. for i, d in enumerate(data):
  73. mean = np.array([self.mean[col] for col in self.columns_lstm])
  74. std = np.array([self.std[col] for col in self.columns_lstm])
  75. d = (d - mean) / std # 归一化
  76. data[i] = d
  77. return data
  78. def norm_label(self, label_data: np.ndarray):
  79. for i, d in enumerate(label_data):
  80. mean = np.array([self.mean[col] for col in self.labels])
  81. std = np.array([self.std[col] for col in self.labels])
  82. d = (d - mean) / std # 归一化
  83. label_data[i] = d
  84. return label_data