# inputData.py — export and preprocess wind-farm data (NWP, turbines, towers, power) from MySQL.
import pymysql
import pandas as pd
import numpy as np
import pytz
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels in plots

import utils.savedata
from utils import Arg

arg = Arg.Arg()
  11. def clear_data():
  12. """
  13. 删除所有csv
  14. :return:
  15. """
  16. # 设置文件夹路径
  17. import glob
  18. import os
  19. folder_path = arg.dataloc
  20. # 使用 glob 获取所有的 .csv 文件路径
  21. csv_files = glob.glob(os.path.join(folder_path, '**/*.csv'), recursive=True)
  22. # 遍历所有 .csv 文件并删除
  23. for file_path in csv_files:
  24. os.remove(file_path)
  25. print("清除所有scv文件")
  26. def create_database(database):
  27. """
  28. 创建数据库连接
  29. :param database: 数据库地址
  30. :return:
  31. """
  32. engine = create_engine(database)
  33. return engine
  34. def exec_sql(sql,engine):
  35. """
  36. 从数据库获取数据
  37. :param sql: sql语句
  38. :param engine: 数据库对象
  39. :return:
  40. """
  41. df = pd.read_sql_query(sql, engine)
  42. return df
  43. def get_process_NWP(database):
  44. """
  45. 从数据库中获取NWP数据,并进行简单处理
  46. :param database:
  47. :return:
  48. """
  49. # NPW数据
  50. engine = create_database(database)
  51. sql_NWP = "select C_PRE_TIME,C_T,C_RH,C_PRESSURE,C_WD10,C_WD30,C_WD50,C_WD70,C_WD80,C_WD90,C_WD100,C_WD170,C_WS10,C_WS30,C_WS50,C_WS70,C_WS80,C_WS90,C_WS100,C_WS170 from t_nwp"
  52. NWP = exec_sql(sql_NWP, engine)
  53. #删除后三位
  54. NWP['C_PRE_TIME'] = NWP['C_PRE_TIME'].astype(str)
  55. NWP['C_PRE_TIME'] = NWP['C_PRE_TIME'].str[:-3]
  56. # 将 'timestamp' 列转换为日期时间格式
  57. NWP['C_PRE_TIME'] = NWP['C_PRE_TIME'].astype(float)
  58. NWP['C_PRE_TIME'] = pd.to_datetime(NWP['C_PRE_TIME'], unit='s')
  59. # 将日期时间转换为本地时区
  60. NWP['C_PRE_TIME'] = NWP['C_PRE_TIME'].dt.tz_localize(pytz.utc).dt.tz_convert('Asia/Shanghai')
  61. # 格式化日期时间为年月日时分秒
  62. NWP['C_PRE_TIME'] = NWP['C_PRE_TIME'].dt.strftime('%Y-%m-%d %H:%M:%S')
  63. NWP = NWP.rename(columns={'C_PRE_TIME': 'C_TIME'})
  64. utils.savedata.saveData("NWP.csv",NWP)
  65. return NWP
  66. def get_process_turbine(database):
  67. """
  68. 从数据库中获取风头数据,并进行简单处理
  69. :param database:
  70. :return:
  71. """
  72. # 获取NWP数据
  73. NWP = utils.savedata.readData("NWP.csv")
  74. NWP_date = NWP.iloc[:,0]
  75. print(NWP_date)
  76. # 机头数据
  77. engine = create_database(database)
  78. for i in arg.turbineloc:
  79. print("导出风机{}的数据".format(i))
  80. sql_turbine = "select C_TIME,C_WS,C_WD,C_ACTIVE_POWER from t_wind_turbine_status_data WHERE C_EQUIPMENT_NO=" + str(i) #+ " and C_WS>0 and C_ACTIVE_POWER>0"
  81. turbine = exec_sql(sql_turbine, engine)
  82. #直接导出所有数据
  83. utils.savedata.saveData("turbine-all/turbine-{}.csv".format(i), turbine)
  84. #每15分钟导出一个数据
  85. filtered_df = turbine[turbine['C_TIME'].isin(NWP_date)]
  86. utils.savedata.saveData("turbine-15/turbine-{}.csv".format(i), filtered_df)
  87. def get_process_tower(database):
  88. """
  89. 获取测风塔数据
  90. :param database:
  91. :return:
  92. """
  93. engine = create_database(database)
  94. print("现有测风塔:{}".format(arg.towerloc))
  95. for i in arg.towerloc:
  96. print("测风塔{}导出数据".format(i))
  97. # 删除没用的列
  98. drop_colmns = ["C_DATA1","C_DATA2","C_DATA3","C_DATA4","C_DATA5","C_DATA6","C_DATA7","C_DATA8","C_DATA9","C_DATA10","C_IS_GENERATED","C_ABNORMAL_CODE"]
  99. get_colmns = []
  100. # 查询表的所有列名
  101. result_set = exec_sql("SHOW COLUMNS FROM t_wind_tower_status_data", engine)
  102. for name in result_set.iloc[:,0]:
  103. if name not in drop_colmns:
  104. get_colmns.append(name)
  105. all_columns_str = ", ".join([f'{col}' for col in get_colmns])
  106. tower_sql = "select " + all_columns_str + " from t_wind_tower_status_data where C_EQUIPMENT_NO="+str(i)
  107. tower = exec_sql(tower_sql, engine)
  108. utils.savedata.saveData("tower/tower-{}.csv".format(i), tower)
  109. def get_process_power(database):
  110. """
  111. 获取整体功率数据
  112. :param database:
  113. :return:
  114. """
  115. engine = create_database(database)
  116. sql_power = "select C_TIME,C_REAL_VALUE from t_power_station_status_data"
  117. power = exec_sql(sql_power, engine)
  118. utils.savedata.saveData("power.csv", power)
  119. def get_turbine_info(database):
  120. """
  121. 获取风机信息
  122. :param database:
  123. :return:
  124. """
  125. engine = create_engine(database)
  126. sql_turbine = "select C_ID, C_LATITUDE as '纬度', C_LONGITUDE as '经度', C_HUB_HEIGHT as '轮毂高度' from t_wind_turbine_info"
  127. turbine_info = exec_sql(sql_turbine, engine)
  128. utils.savedata.saveData("风机信息.csv", turbine_info)
  129. def indep_process():
  130. """
  131. 进一步数据处理:时间统一处理等
  132. :return:
  133. """
  134. # 测风塔数据处理
  135. for i in arg.towerloc:
  136. tower = utils.savedata.readData("/tower/tower-{}.csv".format(i))
  137. # 判断每一列是否全是 -99
  138. all_minus_99 = (tower == -99).all()
  139. # 获取全是 -99 的列的列名
  140. cols_to_drop = all_minus_99[all_minus_99 == True].index.tolist()
  141. # 使用 drop() 方法删除列
  142. tower = tower.drop(cols_to_drop, axis=1)
  143. # MBD: 将一部分是-99的列删除,把-99替换为nan
  144. tower_nan = tower.replace(-99, np.nan, inplace=False)
  145. # nan 超过80% 删除
  146. tower = tower.dropna(axis=1, thresh=len(tower_nan) * 0.8)
  147. utils.savedata.saveData("/tower/tower-{}-process.csv".format(i), tower)
  148. # 测风塔时间统一
  149. tower1 = utils.savedata.readData("/tower/tower-{}-process.csv".format(1))
  150. # tower2 = utils.savedata.readData("/tower/tower-{}-process.csv".format(2))
  151. # tower1 = tower1[tower1['C_TIME'].isin(tower2['C_TIME'])]
  152. # tower2 = tower2[tower2['C_TIME'].isin(tower1['C_TIME'])]
  153. utils.savedata.saveData("/tower/tower-{}-process.csv".format(1), tower1)
  154. # utils.savedata.saveData("/tower/tower-{}-process.csv".format(2), tower2)
  155. # 所有表时间统一
  156. filenames = ["/NWP.csv","/power.csv", '/tower/tower-1-process.csv']
  157. dataframes = []
  158. for i in arg.turbineloc:
  159. filenames.append("/turbine-15/turbine-{}.csv".format(i))
  160. for name in filenames:
  161. dataframes.append(utils.savedata.readData(name))
  162. # 查找最大起始时间和最小结束时间
  163. max_start_time = max(df['C_TIME'].min() for df in dataframes)
  164. min_end_time = min(df['C_TIME'].max() for df in dataframes)
  165. print(max_start_time)
  166. print(min_end_time)
  167. # 重新调整每个 DataFrame 的时间范围,只保留在 [max_start_time, min_end_time] 区间内的数据
  168. for i, df in enumerate(dataframes):
  169. df['C_TIME'] = pd.to_datetime(df['C_TIME']) # 确保时间列是 datetime 类型
  170. df_filtered = df[(df['C_TIME'] >= max_start_time) & (df['C_TIME'] <= min_end_time)]
  171. # 将结果保存到新文件,文件名为原文件名加上 "_filtered" 后缀
  172. utils.savedata.saveData(filenames[i],df_filtered)
  173. def NWP_indep_process():
  174. """
  175. 将NWP数据按照缺失值数量划分为N个不同数据集
  176. :return:
  177. """
  178. # NWP数据进一步处理
  179. NWP = utils.savedata.readData("NWP.csv")
  180. df = pd.to_datetime(NWP['C_TIME'])
  181. time_diff = df.diff()
  182. time_diff_threshold = pd.Timedelta(minutes=15)
  183. missing_values = df[time_diff > time_diff_threshold]
  184. print("NWP数据缺失的数量为:{}".format(len(missing_values)))
  185. print(missing_values)
  186. # 文件保存
  187. utils.savedata.saveVar("NWP_miss.pickle", missing_values)
  188. split_indices = []
  189. for i in range(len(missing_values)):
  190. if i == 0:
  191. split_indices.append((0, missing_values.index[i]))
  192. else:
  193. split_indices.append((missing_values.index[i - 1], missing_values.index[i]))
  194. split_indices.append((missing_values.index[-1], len(df))) # MBD: 分割少了一个点
  195. split_datasets = [NWP.iloc[start:end,:] for start, end in split_indices]
  196. for i, split_df in enumerate(split_datasets):
  197. utils.savedata.saveData("Dataset_training/NWP/NWP_{}.csv".format(i),split_df)
  198. return split_datasets
# def power_indep_process():
#     NWP = utils.savedata.readData("power.csv")
  201. def Data_split():
  202. """
  203. 这个函数没用上,可以不看
  204. :return:
  205. """
  206. NWP = utils.savedata.readData("power_15min.csv")
  207. df = pd.to_datetime(NWP['C_TIME'])
  208. time_diff = df.diff()
  209. time_diff_threshold = pd.Timedelta(minutes=15)
  210. missing_values = df[time_diff > time_diff_threshold]
  211. print("NWP数据缺失的数量为:{}".format(len(missing_values)))
  212. print(missing_values)
  213. NWP_miss = utils.savedata.readVar("NWP_miss.pickle")
  214. for t in missing_values.index:
  215. a = t-1
  216. b = t
  217. time1 = NWP['C_TIME'][a]
  218. time2 = NWP['C_TIME'][b]
  219. df = pd.to_datetime([time1, time2])
  220. # 计算时间差
  221. time_diff = (df[1] - df[0]) / pd.Timedelta(minutes=15)
  222. print(time_diff)
  223. time1 = "2022-10-27 14:00:00"
  224. time2 = "2023-04-16 12:00:00"
  225. df = pd.to_datetime([time1, time2])
  226. # 计算时间差
  227. time_diff = (df[1] - df[0]) / pd.Timedelta(minutes=15)
  228. print(time_diff)
  229. def time_all_in():
  230. """
  231. 这个函数暂时没用上,这个函数的目的是给机头数据进行填充,找到时间缺失的位置,填充为-99
  232. :return:
  233. """
  234. filenames = []
  235. dataframes = []
  236. for i in arg.turbineloc:
  237. filenames.append("/turbine-15/turbine-{}.csv".format(i))
  238. for name in filenames:
  239. dataframes.append(utils.savedata.readData(name))
  240. for df in dataframes:
  241. df['C_TIME'] = pd.to_datetime(df['C_TIME'])
  242. # 创建一个完整的时间序列索引,包括所有可能的时间点
  243. start_time = df['C_TIME'].min()
  244. end_time = df['C_TIME'].max()
  245. full_time_range = pd.date_range(start_time, end_time, freq='15min')
  246. # 使用完整的时间序列索引创建一个空的 DataFrame
  247. full_df = pd.DataFrame(index=full_time_range)
  248. full_df.index.name = 'C_TIME'
  249. # 将原始数据与空的 DataFrame 合并
  250. merged_df = full_df.merge(df, how='left', left_on='time', right_on='time')
  251. # 使用 -99 填充缺失值,除了时间列
  252. merged_df.fillna(-99, inplace=True)
  253. merged_df.reset_index(inplace=True)
  254. def data_process(database):
  255. """
  256. 数据导出+初步处理的总操控代码
  257. :param database:
  258. :return:
  259. """
  260. clear_data()
  261. get_process_NWP(database)
  262. get_process_turbine(database)
  263. get_turbine_info(database)
  264. get_process_tower(database)
  265. get_process_power(database)
  266. indep_process()
  267. NWP_indep_process()
  268. # Data_split()
  269. if __name__ == '__main__':
  270. import os
  271. import glob
  272. # 设置文件夹路径
  273. folder_path = '../data'
  274. # 使用 glob 获取所有的 .csv 文件
  275. csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
  276. # 遍历所有 .csv 文件并删除
  277. for file_path in csv_files:
  278. os.remove(file_path)
  279. # database = "mysql+pymysql://root:!QAZ2root@192.168.1.205:3306/ipfcst-sishui-a"
  280. # engine = create_database(database)
  281. #
  282. # # NPW数据
  283. # sql_NWP = "select C_SC_DATE,C_SC_TIME,C_T,C_RH,C_PRESSURE,C_WD10,C_WD30,C_WD50,C_WD70,C_WD80,C_WD90,C_WD100,C_WD170,C_WS10,C_WS30,C_WS50,C_WS70,C_WS80,C_WS90,C_WS100,C_WS170 from t_nwp"
  284. # NWP = exec_sql(sql_NWP,engine)
  285. #
  286. # # 分风机功率
  287. # sql_wind = "select C_WS,C_ACTIVE_POWER from t_wind_turbine_status_data_15 WHERE C_EQUIPMENT_NO=2 and C_WS>0 and C_ACTIVE_POWER>0 "
  288. # df_wind = exec_sql(sql_wind,engine)
  289. # print(df_wind)
  290. # 总功率数据读取
  291. # sql_power = "select * from t_power_station_status_data"
  292. # df_power = exec_sql(sql_power, engine)
  293. filenames = []
  294. dataframes = []
  295. for i in arg.turbineloc:
  296. filenames.append("../data/turbine-15/turbine-{}.csv".format(i))
  297. for name in filenames:
  298. dataframes.append(pd.read_csv(name).iloc[:7000,:])
  299. # for df in enumerate(dataframes):
  300. # df =
  301. mean_of_first_columns = pd.concat([df['C_WS'] for df in dataframes], axis=1).mean(axis=1)
  302. mean_of_second_columns = (pd.concat([df['C_ACTIVE_POWER'] for df in dataframes], axis=1).sum(axis=1)/1000).astype(int)
  303. print(len(mean_of_first_columns))
  304. plt.scatter(mean_of_first_columns, mean_of_second_columns)
  305. plt.show()