# splitdata.py

import pandas as pd
import utils.savedata
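
# NOTE: utils.savedata is an external helper module that is not included in this file.
# Judging from how it is called below, it is assumed to expose readData(path) and
# saveData(path, df) functions that wrap pandas CSV I/O relative to some data root.
# The snippet below is only a hypothetical sketch of that assumed interface
# (DATA_ROOT and the exact behaviour are assumptions, not the real module):
#
#     import os
#     import pandas as pd
#
#     DATA_ROOT = "./data"  # hypothetical base directory for all dataset files
#
#     def readData(relative_path):
#         # Load a CSV file under DATA_ROOT into a DataFrame
#         return pd.read_csv(os.path.join(DATA_ROOT, relative_path.lstrip("/")))
#
#     def saveData(relative_path, df):
#         # Write a DataFrame as CSV under DATA_ROOT, creating parent directories first
#         full_path = os.path.join(DATA_ROOT, relative_path.lstrip("/"))
#         os.makedirs(os.path.dirname(full_path), exist_ok=True)
#         df.to_csv(full_path, index=False)
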
def a():
    # # Read the large file
    # large_file = "power_15min.csv"
    # large_df = utils.savedata.readData(large_file)
    #
    # # Read each small file and store it in a dict
    # small_files = []
    # for i in range(6):
    #     small_files.append("/Dataset_training/NWP/NWP_{}.csv".format(i))
    # small_dfs = {}
    #
    # for file in small_files:
    #     small_dfs[file] = utils.savedata.readData(file)
    #
    # # Split the large file according to the time range of each small file
    # i = 0
    # for file, df in small_dfs.items():
    #     min_time = df["C_TIME"].min()
    #     max_time = df["C_TIME"].max()
    #     splitted_df = large_df[(large_df["C_TIME"] >= min_time) & (large_df["C_TIME"] <= max_time)]
    #     utils.savedata.saveData("/Dataset_training/power/power_{}.csv".format(i), splitted_df)
    #     i = i + 1

    filenames = ["Dataset_training/NWP/NWP_0.csv", "Dataset_training/power/power_0.csv"]
    dataframes = []
    for name in filenames:
        dataframes.append(utils.savedata.readData(name))

    # Make sure the time column is datetime before comparing timestamps
    for df in dataframes:
        df['C_TIME'] = pd.to_datetime(df['C_TIME'])

    # Find the latest start time and the earliest end time across the files
    max_start_time = max(df['C_TIME'].min() for df in dataframes)
    min_end_time = min(df['C_TIME'].max() for df in dataframes)
    print(max_start_time)
    print(min_end_time)

    # Trim each DataFrame to the common time range [max_start_time, min_end_time]
    # and save the filtered data back to the original file
    for i, df in enumerate(dataframes):
        df_filtered = df[(df['C_TIME'] >= max_start_time) & (df['C_TIME'] <= min_end_time)]
        utils.savedata.saveData(filenames[i], df_filtered)
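
# For illustration only: the common-window trimming above behaves like the following
# hypothetical example (made-up timestamps, not real project data).
#
#     import pandas as pd
#     nwp = pd.DataFrame({"C_TIME": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])})
#     pwr = pd.DataFrame({"C_TIME": pd.to_datetime(["2021-01-02", "2021-01-03", "2021-01-04"])})
#     start = max(df["C_TIME"].min() for df in (nwp, pwr))   # 2021-01-02
#     end = min(df["C_TIME"].max() for df in (nwp, pwr))     # 2021-01-03
#     # Both frames trimmed to [start, end] now cover exactly the same period:
#     nwp[(nwp["C_TIME"] >= start) & (nwp["C_TIME"] <= end)]
#     pwr[(pwr["C_TIME"] >= start) & (pwr["C_TIME"] <= end)]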

def split_test():
    # Read the NWP_0.csv and power_0.csv files
    nwp_df = utils.savedata.readData("Dataset_training/NWP/NWP_0.csv")
    power_df = utils.savedata.readData("Dataset_training/power/power_0.csv")

    # Collect all NWP and power files to measure the total number of rows
    small_files = []
    for i in range(6):
        small_files.append("/Dataset_training/NWP/NWP_{}.csv".format(i))
    for i in range(6):
        small_files.append("/Dataset_training/power/power_{}.csv".format(i))
    dataframes = []
    for name in small_files:
        dataframes.append(utils.savedata.readData(name))

    # Use 10% of the total row count as the size of the test set
    total_rows = sum(len(df) for df in dataframes)
    test_size = int(total_rows * 0.1)

    # Take the last test_size rows as the test set; the remaining rows form the training set
    nwp_test = nwp_df.iloc[-test_size:]
    power_test = power_df.iloc[-test_size:]
    nwp_train = nwp_df[~nwp_df["C_TIME"].isin(nwp_test["C_TIME"])]
    power_train = power_df[~power_df["C_TIME"].isin(power_test["C_TIME"])]

    utils.savedata.saveData("/Dataset_test/NWP/NWP_test.csv", nwp_test)
    utils.savedata.saveData("/Dataset_test/power/power_test.csv", power_test)
    utils.savedata.saveData("/Dataset_training/NWP/NWP_5.csv", nwp_train)
    utils.savedata.saveData("/Dataset_training/power/power_5.csv", power_train)

if __name__ == '__main__':
    a()
    # split_test()