# dataSplit.py
  1. import glob
  2. import os
  3. from Arg import Arg
  4. import pandas as pd
# Module-level instance of the project-local Arg class (imported from the Arg module above).
# NOTE(review): `arg` is not referenced by any function visible in this file — confirm
# whether it is still needed or whether constructing Arg has required side effects.
arg = Arg()
  6. def data_split_by_month(getpath,savepath,name):
  7. # 读取你的大数据集
  8. df = pd.read_csv(getpath)
  9. # 15分钟一个点,看看数据集是否存在超过半个月的数据集,如果不存在就不导出
  10. if len(df) < 15*24*60/15:
  11. print("当前数据集个数不足半个月,不予导出!!")
  12. return
  13. # 确保你的日期列是 datetime 类型
  14. df['C_TIME'] = pd.to_datetime(df['C_TIME'])
  15. # 设置日期列为索引
  16. df = df.set_index('C_TIME')
  17. # 按照年份和月份进行分组
  18. grouped = df.groupby([df.index.year, df.index.month])
  19. # 遍历分组后的数据,将每个月的数据保存为一个新的 csv 文件
  20. for (year, month), group in grouped:
  21. group.to_csv(savepath + f'{name}_{year}_{month}.csv')
  22. return grouped
  23. def split_for_month():
  24. path = "../data/Dataset_training/NWP"
  25. csv_files = glob.glob(os.path.join(path, '*.csv'))
  26. data_len = len(csv_files)
  27. for i in range(data_len-1):
  28. getpath = f'../data/Dataset_training/NWP/NWP_{i}.csv'
  29. savepath = '../data/training/NWP/'
  30. if not os.path.exists(savepath): # 如果路径不存在
  31. os.makedirs(savepath)
  32. data_split_by_month(getpath, savepath,"NWP")
  33. getpath = f'../data/Dataset_training/power/power_{i}.csv'
  34. savepath = '../data/training/power/'
  35. if not os.path.exists(savepath): # 如果路径不存在
  36. os.makedirs(savepath)
  37. data_split_by_month(getpath, savepath,"power")
  38. if __name__ == '__main__':
  39. split_for_month()