#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/4/12 17:42
# file: data_features.py
# author: David
# company: shenyang JY
"""Sliding-window feature and label construction for model training/testing."""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


class data_features:
    """Cut DataFrames into fixed-length sliding windows of inputs and labels.

    The window length is ``opt.Model["time_step"]``. Input windows take the
    columns from ``'C_RADIATION'`` through ``'C_TPR'``; label windows take
    ``'C_TIME'``, ``'col1_power'``, ``'col2_power'``, ``'sum_power'`` and
    ``'C_VALUE'``.
    """

    def __init__(self, opt):
        """Keep a reference to the configuration object.

        Args:
            opt: Configuration object. This class reads
                ``opt.Model["time_step"]``, ``opt.Model["random_seed"]``,
                ``opt.valid_data_rate`` and ``opt.shuffle_train_data``.
        """
        self.opt = opt
        self.time_step = self.opt.Model["time_step"]
        self.columns = []

    @staticmethod
    def _stack_inputs(samples):
        """Stack ``[window]`` samples into a single NumPy array."""
        return np.array([sample[0].values for sample in samples])

    @staticmethod
    def _stack_labels(windows):
        """Stack label windows into one array, dropping their first column.

        The first label column is ``'C_TIME'``, so the result holds only the
        remaining label columns.
        """
        return np.array([window.iloc[:, 1:].values for window in windows])

    def get_train_data(self, dfs):
        """Build training and validation arrays from several frames.

        Each frame is windowed and split on its own, so no window spans two
        frames; the per-frame splits are then pooled.

        Args:
            dfs: Iterable of DataFrames accepted by ``get_data_features``.

        Returns:
            Tuple ``(train_x, valid_x, train_y, valid_y)`` of NumPy arrays.
            The ``x`` arrays stack the input windows; the ``y`` arrays stack
            the label windows without their ``'C_TIME'`` column.
        """
        train_x, valid_x, train_y, valid_y = [], [], [], []
        for df in dfs:
            datax, datay = self.get_data_features(df, is_train=True)
            # Split this frame's windows into training and validation sets.
            tx, vx, ty, vy = train_test_split(
                datax,
                datay,
                test_size=self.opt.valid_data_rate,
                random_state=self.opt.Model["random_seed"],
                shuffle=self.opt.shuffle_train_data,
            )
            train_x.extend(tx)
            valid_x.extend(vx)
            train_y.extend(ty)
            valid_y.extend(vy)

        return (
            self._stack_inputs(train_x),
            self._stack_inputs(valid_x),
            self._stack_labels(train_y),
            self._stack_labels(valid_y),
        )

    def get_test_data(self, dfs):
        """Build test arrays from several frames.

        Args:
            dfs: Iterable of DataFrames accepted by ``get_data_features``.

        Returns:
            Tuple ``(test_x, test_y, data_y)``. ``test_x`` stacks the input
            windows and ``test_y`` stacks the label windows without their
            ``'C_TIME'`` column, both as NumPy arrays. ``data_y`` is the list
            of label-window DataFrames, ``'C_TIME'`` included.
        """
        test_x, data_y = [], []
        for df in dfs:
            datax, datay = self.get_data_features(df, is_train=False)
            test_x.extend(datax)
            data_y.extend(datay)

        return self._stack_inputs(test_x), self._stack_labels(data_y), data_y

    def get_data_features(self, norm_data: pd.DataFrame, is_train: bool):
        """Cut one frame into sliding windows of inputs and labels.

        Original author's note: this code is an optimization based on pandas
        methods.

        Args:
            norm_data: Frame holding the columns ``'C_RADIATION'`` through
                ``'C_TPR'`` plus the label columns listed below.
            is_train: Kept for interface compatibility. The same label columns
                are used for training and testing, so the value has no effect.

        Returns:
            Tuple ``(features_x, features_y)``. ``features_x`` is a list of
            one-element lists ``[input_window]``; ``features_y`` is a list of
            label windows. Every window is a DataFrame of ``time_step`` rows
            with its index reset to start at 0.
        """
        time_step = self.opt.Model["time_step"]
        feature_data = norm_data.reset_index(drop=True)
        # ``.loc`` slices include their end label, hence the ``- 1``.
        last_offset = time_step - 1
        # NOTE(review): the range stops at ``len - time_step``, so the window
        # ending on the final row is never produced. Left unchanged because
        # callers may rely on the current sample count -- confirm the intent.
        window_starts = range(len(feature_data) - time_step)
        label_columns = ['C_TIME', 'col1_power', 'col2_power', 'sum_power', 'C_VALUE']

        # Original note -- database fields: 'C_T': 'C_WS170'
        # (presumably an alternative input column range; verify before use).
        features_x = [
            [
                feature_data.loc[start:start + last_offset, 'C_RADIATION':'C_TPR']
                .reset_index(drop=True)
            ]
            for start in window_starts
        ]
        features_y = [
            feature_data.loc[start:start + last_offset, label_columns]
            .reset_index(drop=True)
            for start in window_starts
        ]

        # An earlier version matched each window against environment ("envir")
        # records from the day before the window's first 'C_TIME' and skipped
        # windows with fewer than 16 such records. That step is disabled, so
        # every window is kept and both printed counts are equal.
        print("匹配环境前,{}组".format(len(features_x)), end=" -> ")
        print("匹配环境后,{}组".format(len(features_x)))
        return features_x, features_y