liudawei
/
dalian-university


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
							# -*- coding: utf-8 -*-

# J00226超短期功率预测，数据清洗：删除和填充

# %% 调库，定义函数
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from os.path import isfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pickle import dump, load
from datetime import datetime
from time import mktime, strptime, strftime
import time


root_path = "./"


# 计算相邻两条数据时间差不是15分钟的数量
def missing(df):
    # dt = pd.Timedelta(hours=1)
    dt = pd.Timedelta(minutes=15)
    cnt = 0

    if type(df["C_TIME"][0]) == str:
        df['C_TIME'] = pd.to_datetime(df['C_TIME'])

    for i in range(1, len(df)):
        if df["C_TIME"][i] - df["C_TIME"][i-1] != dt:
            print(df.iloc[i - 1, 0], end=" ~ ")
            print(df.iloc[i, 0])
            # print(df["C_TIME"][i] - df["C_TIME"][i-1])

            cnt += 1

    print("数据总数：", len(df), "，缺失段数：", cnt)


def show_curve(series):
    plt.figure(dpi=100, figsize=(18, 6))
    # plt.plot(df["预测功率"], color="red", label="预测功率")
    # plt.plot(df["实际功率"], color="blue", label="实际功率")
    # plt.plot(df["短期预测功率"], color="green", label="短期预测功率")
    plt.plot(series)

    # plt.legend(loc="upper right")

    plt.show()


def data_process():
    df_nwp = pd.read_excel(root_path + "nwp.xls", usecols="B:AA")
    df_dq = pd.read_excel(root_path + "dq.xls", usecols="B,C")
    df_rp = pd.read_excel(root_path + "rp.xls", usecols="A,C")

    # missing(df_nwp)   # 34848 1
    # missing(df_dq)    # 34368 6
    # missing(df_rp)    # 33602 13

    # plt.figure(dpi=100, figsize=(18, 6))
    # plt.plot(df_dq["C_FORECAST"][0:2000], color="red", label="forecast")
    # plt.plot(df_rp["C_VALUE"][0:2000], color="blue", label="value")

    # plt.legend(loc="upper right")
    # plt.show()

    # show_curve(df_dq["C_FORECAST"][0:2000])
    # show_curve(df_rp["C_VALUE"][0:2000])

    # 使用concat合并三个表，并将索引reset
    df_nwp_dq_rp = pd.concat([df_nwp.set_index("C_TIME"), df_dq.set_index("C_TIME"),
                              df_rp.set_index("C_TIME")], axis=1, join="inner").reset_index()

    # missing(df_env_nwp_rp)  # 24557 4341
    missing(df_nwp_dq_rp)   # 32738 20

    df_nwp_dq_rp.to_csv(root_path + "nwp_dq_rp.csv", index=False)


# 数据填充和删除
def data_fill(df):
    # 将缺省数据较多的那一天数据直接删除
    # del_day = "2021-12-15"
    idx1 = df[df.C_TIME == "2021-12-15 14:45:00"].index.tolist()[0]
    idx2 = df[df.C_TIME == "2021-12-16 00:00:00"].index.tolist()[0]
    df.drop(list(range(idx1, idx2)), inplace=True)

    idx1 = df[df.C_TIME == "2021-10-14 00:00:00"].index.tolist()[0]
    idx2 = df[df.C_TIME == "2021-10-16 00:00:00"].index.tolist()[0]
    df.drop(list(range(idx1, idx2)), inplace=True)

    idx1 = df[df.C_TIME == "2021-11-13 00:00:00"].index.tolist()[0]
    idx2 = df[df.C_TIME == "2021-11-14 00:00:00"].index.tolist()[0]
    df.drop(list(range(idx1, idx2)), inplace=True)

    #
    df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)
    df = pd.read_csv(root_path + "nwp_dq_rp_1.csv")

    missing(df)

    # 数据填充
    cols = df.columns

    if type(df["C_TIME"][0]) == str:
        df['C_TIME'] = pd.to_datetime(df['C_TIME'])

    dt15 = pd.Timedelta(minutes=15)
    dt3 = pd.Timedelta(hours=3)

    for i in range(1, len(df)):
        tdt = df["C_TIME"][i] - df["C_TIME"][i-1]

        if tdt > dt15 and tdt <= dt3:
            num = int(tdt / dt15)
            last_time = df["C_TIME"][i-1]
            mp = {col: df[col][i] - df[col][i-1] for col in cols}

            for j in range(1, num):
                df_temp = {cols[k]: (df[cols[k]][i-1] + mp[cols[k]] / num * j)
                           for k in range(1, len(cols))}
                df_temp["C_TIME"] = last_time + dt15
                last_time = df_temp["C_TIME"]

                df_temp = pd.DataFrame(df_temp, index=[len(df) + 1])
                df = df.append(df_temp, ignore_index=True)

    df.sort_values(by="C_TIME", inplace=True, ascending=True)
    df.to_csv(root_path + "nwp_dq_rp_1.csv", index=False)


# %%
if __name__ == "__main__":
    df_nwp_dq_rp = pd.read_csv(
        root_path + "nwp_dq_rp.csv", usecols=list(range(0, 27)))
    df = df_nwp_dq_rp
    # data_fill(df_nwp_dq_rp)