data_features.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/4/12 17:42
# file: data_features.py
# author: David
# company: shenyang JY
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


class data_features(object):
    def __init__(self, opt):
        self.opt = opt
        self.time_step = self.opt.Model["time_step"]
        self.columns = list()

    def get_train_data(self, dfs):
        train_x, valid_x, train_y, valid_y = [], [], [], []
        for i, df in enumerate(dfs, start=1):
            datax, datay = self.get_data_features(df, is_train=True)
            # split each DataFrame's windows into training and validation sets
            tx, vx, ty, vy = train_test_split(
                datax, datay,
                test_size=self.opt.valid_data_rate,
                random_state=self.opt.Model["random_seed"],
                shuffle=self.opt.shuffle_train_data,
            )
            train_x.extend(tx)
            valid_x.extend(vx)
            train_y.extend(ty)
            valid_y.extend(vy)

        # drop the C_TIME column (position 0) from each label window and stack into arrays
        train_y = np.array([y.iloc[:, 1:].values for y in train_y])
        valid_y = np.array([y.iloc[:, 1:].values for y in valid_y])
        train_x = np.array([x[0].values for x in train_x])
        valid_x = np.array([x[0].values for x in valid_x])
        return train_x, valid_x, train_y, valid_y

    def get_test_data(self, dfs):
        test_x, test_y, data_y = [], [], []
        for i, df in enumerate(dfs, start=1):
            datax, datay = self.get_data_features(df, is_train=False)
            test_x.extend(datax)
            test_y.extend(datay)
            data_y.extend(datay)  # keep the raw label DataFrames (with C_TIME) alongside the arrays

        test_x = np.array([x[0].values for x in test_x])
        test_y = np.array([y.iloc[:, 1:].values for y in test_y])
        return test_x, test_y, data_y

    def get_data_features(self, norm_data, is_train):
        """Slide a window of length time_step over the normalized data and
        return NWP feature windows and label windows (pandas-based optimization)."""
        time_step = self.opt.Model["time_step"]
        feature_data = norm_data.reset_index(drop=True)
        time_step_loc = time_step - 1
        train_num = len(feature_data)
        # the original conditional had identical branches for train and test,
        # so is_train does not change the label columns here
        label_features = ['C_TIME', 'col1_power', 'col2_power', 'sum_power', 'C_VALUE']
        # NWP feature windows; database fields range from 'C_T' to 'C_WS170'
        nwp = [feature_data.loc[i:i + time_step_loc, 'C_RADIATION':'C_TPR'].reset_index(drop=True)
               for i in range(train_num - time_step)]
        labels = [feature_data.loc[i:i + time_step_loc, label_features].reset_index(drop=True)
                  for i in range(train_num - time_step)]
        features_x, features_y = [], []
        print("before matching environment data: {} windows".format(len(nwp)), end=" -> ")
        for nwp_window, label_window in zip(nwp, labels):
            time_end = label_window['C_TIME'][0]
            time_start = time_end - pd.DateOffset(1)  # used by the commented-out environment matching below
            # row1 = envir[(envir.C_TIME < time_end) & (envir.C_TIME > time_start)][-16:]
            # if len(row1) < 16:
            #     print("fewer than 16 environment data points:", len(row1))
            #     continue
            # row1 = row1.reset_index(drop=True).drop(['C_TIME'], axis=1)
            # features_x.append([row1.iloc[:, :-4], row1.iloc[:, -4:]])
            features_x.append([nwp_window])
            features_y.append(label_window)
        print("after matching environment data: {} windows".format(len(features_x)))
        return features_x, features_y
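

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). It only assumes the
# attributes the class actually reads: opt.Model ("time_step", "random_seed"),
# opt.valid_data_rate and opt.shuffle_train_data. The DataFrame below is a
# hypothetical placeholder; real data would come from the project's database.
if __name__ == "__main__":
    from types import SimpleNamespace

    opt = SimpleNamespace(
        Model={"time_step": 16, "random_seed": 42},
        valid_data_rate=0.2,        # 20% of the windows go to validation
        shuffle_train_data=True,
    )

    # synthetic normalized frame with the column names the class expects
    n = 200
    df = pd.DataFrame({
        "C_TIME": pd.date_range("2023-04-01", periods=n, freq="15min"),
        "C_RADIATION": np.random.rand(n),
        "C_TPR": np.random.rand(n),
        "col1_power": np.random.rand(n),
        "col2_power": np.random.rand(n),
        "sum_power": np.random.rand(n),
        "C_VALUE": np.random.rand(n),
    })

    features = data_features(opt)
    train_x, valid_x, train_y, valid_y = features.get_train_data([df])
    print(train_x.shape, train_y.shape)  # e.g. (147, 16, 2) (147, 16, 4)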