# -*- coding: utf-8 -*- # J00226超短期功率预测,数据清洗:删除和填充 # %% 调库,定义函数 import pandas as pd import numpy as np import matplotlib.pyplot as plt import lightgbm as lgb from os.path import isfile from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error from pickle import dump, load from datetime import datetime from time import mktime, strptime, strftime import time root_path = "./" # 计算相邻两条数据时间差不是15分钟的数量 def missing(df): # dt = pd.Timedelta(hours=1) dt = pd.Timedelta(minutes=15) cnt = 0 if type(df["C_TIME"][0]) == str: df['C_TIME'] = pd.to_datetime(df['C_TIME']) for i in range(1, len(df)): if df["C_TIME"][i] - df["C_TIME"][i-1] != dt: print(df.iloc[i - 1, 0], end=" ~ ") print(df.iloc[i, 0]) # print(df["C_TIME"][i] - df["C_TIME"][i-1]) cnt += 1 print("数据总数:", len(df), ",缺失段数:", cnt) def show_curve(series): plt.figure(dpi=100, figsize=(18, 6)) # plt.plot(df["预测功率"], color="red", label="预测功率") # plt.plot(df["实际功率"], color="blue", label="实际功率") # plt.plot(df["短期预测功率"], color="green", label="短期预测功率") plt.plot(series) # plt.legend(loc="upper right") plt.show() def data_process(): df_nwp = pd.read_excel(root_path + "nwp.xls", usecols="B:AA") df_dq = pd.read_excel(root_path + "dq.xls", usecols="B,C") df_rp = pd.read_excel(root_path + "rp.xls", usecols="A,C") # missing(df_nwp) # 34848 1 # missing(df_dq) # 34368 6 # missing(df_rp) # 33602 13 # plt.figure(dpi=100, figsize=(18, 6)) # plt.plot(df_dq["C_FORECAST"][0:2000], color="red", label="forecast") # plt.plot(df_rp["C_VALUE"][0:2000], color="blue", label="value") # plt.legend(loc="upper right") # plt.show() # show_curve(df_dq["C_FORECAST"][0:2000]) # show_curve(df_rp["C_VALUE"][0:2000]) # 使用concat合并三个表,并将索引reset df_nwp_dq_rp = pd.concat([df_nwp.set_index("C_TIME"), df_dq.set_index("C_TIME"), df_rp.set_index("C_TIME")], axis=1, join="inner").reset_index() # missing(df_env_nwp_rp) # 24557 4341 missing(df_nwp_dq_rp) # 32738 20 df_nwp_dq_rp.to_csv(root_path + "nwp_dq_rp.csv", index=False) # 数据填充和删除 def data_fill(df): # 将缺省数据较多的那一天数据直接删除 # del_day = "2021-12-15" idx1 = df[df.C_TIME == "2021-12-15 14:45:00"].index.tolist()[0] idx2 = df[df.C_TIME == "2021-12-16 00:00:00"].index.tolist()[0] df.drop(list(range(idx1, idx2)), inplace=True) idx1 = df[df.C_TIME == "2021-10-14 00:00:00"].index.tolist()[0] idx2 = df[df.C_TIME == "2021-10-16 00:00:00"].index.tolist()[0] df.drop(list(range(idx1, idx2)), inplace=True) idx1 = df[df.C_TIME == "2021-11-13 00:00:00"].index.tolist()[0] idx2 = df[df.C_TIME == "2021-11-14 00:00:00"].index.tolist()[0] df.drop(list(range(idx1, idx2)), inplace=True) # df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False) df = pd.read_csv(root_path + "nwp_dq_rp_1.csv") missing(df) # 数据填充 cols = df.columns if type(df["C_TIME"][0]) == str: df['C_TIME'] = pd.to_datetime(df['C_TIME']) dt15 = pd.Timedelta(minutes=15) dt3 = pd.Timedelta(hours=3) for i in range(1, len(df)): tdt = df["C_TIME"][i] - df["C_TIME"][i-1] if tdt > dt15 and tdt <= dt3: num = int(tdt / dt15) last_time = df["C_TIME"][i-1] mp = {col: df[col][i] - df[col][i-1] for col in cols} for j in range(1, num): df_temp = {cols[k]: (df[cols[k]][i-1] + mp[cols[k]] / num * j) for k in range(1, len(cols))} df_temp["C_TIME"] = last_time + dt15 last_time = df_temp["C_TIME"] df_temp = pd.DataFrame(df_temp, index=[len(df) + 1]) df = df.append(df_temp, ignore_index=True) df.sort_values(by="C_TIME", inplace=True, ascending=True) df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False) # %% if __name__ == "__main__": df_nwp_dq_rp = pd.read_csv( root_path + "nwp_dq_rp.csv", usecols=list(range(0, 27))) df = df_nwp_dq_rp # data_fill(df_nwp_dq_rp)