1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- import glob
- import os
- from Arg import Arg
- import pandas as pd
# Module-level Arg instance, created at import time.
# NOTE(review): none of the functions visible in this file read `arg`;
# confirm whether it is still needed here.
arg = Arg()
def data_split_by_month(getpath, savepath, name):
    """Split a time-stamped CSV into one CSV file per calendar month.

    The file at ``getpath`` is loaded, its ``C_TIME`` column is parsed as
    datetimes and used as the index, and the rows are grouped by
    (year, month). Each group is written to
    ``savepath + f"{name}_{year}_{month}.csv"``.

    Args:
        getpath: Path of the CSV file to read. It must contain a ``C_TIME``
            column that ``pandas.to_datetime`` can parse.
        savepath: String prepended verbatim to every output file name. Pass
            a directory path ending in a path separator to write into that
            directory. The directory part of this value is created when it
            does not exist yet.
        name: Prefix used in the output file names.

    Returns:
        The pandas group-by object keyed by (year, month), or ``None`` when
        the input has fewer rows than half a month of data and nothing is
        exported.
    """
    # The data is expected to hold one sample every 15 minutes, so half a
    # month (15 days) corresponds to 15 * 24 * 60 / 15 = 1440 rows.
    min_rows = 15 * 24 * 60 // 15

    df = pd.read_csv(getpath)

    # Inputs covering less than half a month are not exported at all.
    if len(df) < min_rows:
        print("当前数据集个数不足半个月,不予导出!!")
        return None

    # Make sure the timestamp column is a real datetime and use it as index
    # so the year/month of each row can be read from the index.
    df['C_TIME'] = pd.to_datetime(df['C_TIME'])
    df = df.set_index('C_TIME')
    grouped = df.groupby([df.index.year, df.index.month])

    # to_csv does not create missing directories; create the directory part
    # of the output prefix up front so the function also works when the
    # caller has not prepared it.
    out_dir = os.path.dirname(savepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # One output file per (year, month) group.
    for (year, month), group in grouped:
        group.to_csv(savepath + f'{name}_{year}_{month}.csv')
    return grouped
def split_for_month():
    """Split the numbered NWP and power training CSVs into monthly files.

    The number of ``*.csv`` files found in ``../data/Dataset_training/NWP``
    determines how many indices are processed. For each index ``i`` the
    files ``NWP/NWP_{i}.csv`` and ``power/power_{i}.csv`` under
    ``../data/Dataset_training`` are passed to ``data_split_by_month``,
    with output going to ``../data/training/NWP/`` and
    ``../data/training/power/`` respectively. All paths are relative to the
    current working directory.
    """
    nwp_dir = "../data/Dataset_training/NWP"
    file_count = len(glob.glob(os.path.join(nwp_dir, '*.csv')))

    # NOTE(review): the loop stops at file_count - 2, so the highest index
    # is never processed if files are numbered 0..file_count-1. Kept as in
    # the original; confirm whether skipping the last file is intended.
    # NOTE(review): the output prefix does not include `i`, so every index
    # writes to the same `{kind}_{year}_{month}.csv` names in the same
    # directory; confirm that later indices overwriting earlier ones for
    # the same month is intended.
    for i in range(file_count - 1):
        # The NWP file and the matching power file are handled identically.
        for kind in ("NWP", "power"):
            getpath = f'../data/Dataset_training/{kind}/{kind}_{i}.csv'
            savepath = f'../data/training/{kind}/'
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists test.
            os.makedirs(savepath, exist_ok=True)
            data_split_by_month(getpath, savepath, kind)
# Script entry point: run the monthly split only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    split_for_month()
|