data_process1.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. # -*- coding: utf-8 -*-
  2. # J00226超短期功率预测,数据清洗:删除和填充
  3. # %% 调库,定义函数
  4. import pandas as pd
  5. import numpy as np
  6. import matplotlib.pyplot as plt
  7. import lightgbm as lgb
  8. from os.path import isfile
  9. from sklearn.model_selection import train_test_split
  10. from sklearn.metrics import mean_squared_error
  11. from pickle import dump, load
  12. from datetime import datetime
  13. from time import mktime, strptime, strftime
  14. import time
  15. root_path = "./"
  16. # 计算相邻两条数据时间差不是15分钟的数量
  17. def missing(df):
  18. # dt = pd.Timedelta(hours=1)
  19. dt = pd.Timedelta(minutes=15)
  20. cnt = 0
  21. if type(df["C_TIME"][0]) == str:
  22. df['C_TIME'] = pd.to_datetime(df['C_TIME'])
  23. for i in range(1, len(df)):
  24. if df["C_TIME"][i] - df["C_TIME"][i-1] != dt:
  25. print(df.iloc[i - 1, 0], end=" ~ ")
  26. print(df.iloc[i, 0])
  27. # print(df["C_TIME"][i] - df["C_TIME"][i-1])
  28. cnt += 1
  29. print("数据总数:", len(df), ",缺失段数:", cnt)
  30. def show_curve(series):
  31. plt.figure(dpi=100, figsize=(18, 6))
  32. # plt.plot(df["预测功率"], color="red", label="预测功率")
  33. # plt.plot(df["实际功率"], color="blue", label="实际功率")
  34. # plt.plot(df["短期预测功率"], color="green", label="短期预测功率")
  35. plt.plot(series)
  36. # plt.legend(loc="upper right")
  37. plt.show()
  38. def data_process():
  39. df_nwp = pd.read_excel(root_path + "nwp.xls", usecols="B:AA")
  40. df_dq = pd.read_excel(root_path + "dq.xls", usecols="B,C")
  41. df_rp = pd.read_excel(root_path + "rp.xls", usecols="A,C")
  42. # missing(df_nwp) # 34848 1
  43. # missing(df_dq) # 34368 6
  44. # missing(df_rp) # 33602 13
  45. # plt.figure(dpi=100, figsize=(18, 6))
  46. # plt.plot(df_dq["C_FORECAST"][0:2000], color="red", label="forecast")
  47. # plt.plot(df_rp["C_VALUE"][0:2000], color="blue", label="value")
  48. # plt.legend(loc="upper right")
  49. # plt.show()
  50. # show_curve(df_dq["C_FORECAST"][0:2000])
  51. # show_curve(df_rp["C_VALUE"][0:2000])
  52. # 使用concat合并三个表,并将索引reset
  53. df_nwp_dq_rp = pd.concat([df_nwp.set_index("C_TIME"), df_dq.set_index("C_TIME"),
  54. df_rp.set_index("C_TIME")], axis=1, join="inner").reset_index()
  55. # missing(df_env_nwp_rp) # 24557 4341
  56. missing(df_nwp_dq_rp) # 32738 20
  57. df_nwp_dq_rp.to_csv(root_path + "nwp_dq_rp.csv", index=False)
  58. # 数据填充和删除
  59. def data_fill(df):
  60. # 将缺省数据较多的那一天数据直接删除
  61. # del_day = "2021-12-15"
  62. idx1 = df[df.C_TIME == "2021-12-15 14:45:00"].index.tolist()[0]
  63. idx2 = df[df.C_TIME == "2021-12-16 00:00:00"].index.tolist()[0]
  64. df.drop(list(range(idx1, idx2)), inplace=True)
  65. idx1 = df[df.C_TIME == "2021-10-14 00:00:00"].index.tolist()[0]
  66. idx2 = df[df.C_TIME == "2021-10-16 00:00:00"].index.tolist()[0]
  67. df.drop(list(range(idx1, idx2)), inplace=True)
  68. idx1 = df[df.C_TIME == "2021-11-13 00:00:00"].index.tolist()[0]
  69. idx2 = df[df.C_TIME == "2021-11-14 00:00:00"].index.tolist()[0]
  70. df.drop(list(range(idx1, idx2)), inplace=True)
  71. #
  72. df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)
  73. df = pd.read_csv(root_path + "nwp_dq_rp_1.csv")
  74. missing(df)
  75. # 数据填充
  76. cols = df.columns
  77. if type(df["C_TIME"][0]) == str:
  78. df['C_TIME'] = pd.to_datetime(df['C_TIME'])
  79. dt15 = pd.Timedelta(minutes=15)
  80. dt3 = pd.Timedelta(hours=3)
  81. for i in range(1, len(df)):
  82. tdt = df["C_TIME"][i] - df["C_TIME"][i-1]
  83. if tdt > dt15 and tdt <= dt3:
  84. num = int(tdt / dt15)
  85. last_time = df["C_TIME"][i-1]
  86. mp = {col: df[col][i] - df[col][i-1] for col in cols}
  87. for j in range(1, num):
  88. df_temp = {cols[k]: (df[cols[k]][i-1] + mp[cols[k]] / num * j)
  89. for k in range(1, len(cols))}
  90. df_temp["C_TIME"] = last_time + dt15
  91. last_time = df_temp["C_TIME"]
  92. df_temp = pd.DataFrame(df_temp, index=[len(df) + 1])
  93. df = df.append(df_temp, ignore_index=True)
  94. df.sort_values(by="C_TIME", inplace=True, ascending=True)
  95. df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)
  96. # %%
  97. if __name__ == "__main__":
  98. df_nwp_dq_rp = pd.read_csv(
  99. root_path + "nwp_dq_rp.csv", usecols=list(range(0, 27)))
  100. df = df_nwp_dq_rp
  101. # data_fill(df_nwp_dq_rp)