123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- # -*- coding: utf-8 -*-
- # J00226超短期功率预测,数据清洗:删除和填充
- # %% 调库,定义函数
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
- import lightgbm as lgb
- from os.path import isfile
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import mean_squared_error
- from pickle import dump, load
- from datetime import datetime
- from time import mktime, strptime, strftime
- import time
- root_path = "./"
- # 计算相邻两条数据时间差不是15分钟的数量
- def missing(df):
- # dt = pd.Timedelta(hours=1)
- dt = pd.Timedelta(minutes=15)
- cnt = 0
- if type(df["C_TIME"][0]) == str:
- df['C_TIME'] = pd.to_datetime(df['C_TIME'])
- for i in range(1, len(df)):
- if df["C_TIME"][i] - df["C_TIME"][i-1] != dt:
- print(df.iloc[i - 1, 0], end=" ~ ")
- print(df.iloc[i, 0])
- # print(df["C_TIME"][i] - df["C_TIME"][i-1])
- cnt += 1
- print("数据总数:", len(df), ",缺失段数:", cnt)
- def show_curve(series):
- plt.figure(dpi=100, figsize=(18, 6))
- # plt.plot(df["预测功率"], color="red", label="预测功率")
- # plt.plot(df["实际功率"], color="blue", label="实际功率")
- # plt.plot(df["短期预测功率"], color="green", label="短期预测功率")
- plt.plot(series)
- # plt.legend(loc="upper right")
- plt.show()
- def data_process():
- df_nwp = pd.read_excel(root_path + "nwp.xls", usecols="B:AA")
- df_dq = pd.read_excel(root_path + "dq.xls", usecols="B,C")
- df_rp = pd.read_excel(root_path + "rp.xls", usecols="A,C")
- # missing(df_nwp) # 34848 1
- # missing(df_dq) # 34368 6
- # missing(df_rp) # 33602 13
- # plt.figure(dpi=100, figsize=(18, 6))
- # plt.plot(df_dq["C_FORECAST"][0:2000], color="red", label="forecast")
- # plt.plot(df_rp["C_VALUE"][0:2000], color="blue", label="value")
- # plt.legend(loc="upper right")
- # plt.show()
- # show_curve(df_dq["C_FORECAST"][0:2000])
- # show_curve(df_rp["C_VALUE"][0:2000])
- # 使用concat合并三个表,并将索引reset
- df_nwp_dq_rp = pd.concat([df_nwp.set_index("C_TIME"), df_dq.set_index("C_TIME"),
- df_rp.set_index("C_TIME")], axis=1, join="inner").reset_index()
- # missing(df_env_nwp_rp) # 24557 4341
- missing(df_nwp_dq_rp) # 32738 20
- df_nwp_dq_rp.to_csv(root_path + "nwp_dq_rp.csv", index=False)
- # 数据填充和删除
- def data_fill(df):
- # 将缺省数据较多的那一天数据直接删除
- # del_day = "2021-12-15"
- idx1 = df[df.C_TIME == "2021-12-15 14:45:00"].index.tolist()[0]
- idx2 = df[df.C_TIME == "2021-12-16 00:00:00"].index.tolist()[0]
- df.drop(list(range(idx1, idx2)), inplace=True)
- idx1 = df[df.C_TIME == "2021-10-14 00:00:00"].index.tolist()[0]
- idx2 = df[df.C_TIME == "2021-10-16 00:00:00"].index.tolist()[0]
- df.drop(list(range(idx1, idx2)), inplace=True)
- idx1 = df[df.C_TIME == "2021-11-13 00:00:00"].index.tolist()[0]
- idx2 = df[df.C_TIME == "2021-11-14 00:00:00"].index.tolist()[0]
- df.drop(list(range(idx1, idx2)), inplace=True)
- #
- df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)
- df = pd.read_csv(root_path + "nwp_dq_rp_1.csv")
- missing(df)
- # 数据填充
- cols = df.columns
- if type(df["C_TIME"][0]) == str:
- df['C_TIME'] = pd.to_datetime(df['C_TIME'])
- dt15 = pd.Timedelta(minutes=15)
- dt3 = pd.Timedelta(hours=3)
- for i in range(1, len(df)):
- tdt = df["C_TIME"][i] - df["C_TIME"][i-1]
- if tdt > dt15 and tdt <= dt3:
- num = int(tdt / dt15)
- last_time = df["C_TIME"][i-1]
- mp = {col: df[col][i] - df[col][i-1] for col in cols}
- for j in range(1, num):
- df_temp = {cols[k]: (df[cols[k]][i-1] + mp[cols[k]] / num * j)
- for k in range(1, len(cols))}
- df_temp["C_TIME"] = last_time + dt15
- last_time = df_temp["C_TIME"]
- df_temp = pd.DataFrame(df_temp, index=[len(df) + 1])
- df = df.append(df_temp, ignore_index=True)
- df.sort_values(by="C_TIME", inplace=True, ascending=True)
- df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)
- # %%
- if __name__ == "__main__":
- df_nwp_dq_rp = pd.read_csv(
- root_path + "nwp_dq_rp.csv", usecols=list(range(0, 27)))
- df = df_nwp_dq_rp
- # data_fill(df_nwp_dq_rp)
|