data_analysis.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author: Lijiaxing
@File: data_analysis.py
@Time: 2023/4/24 15:16
"""
import os.path

import numpy as np
import pandas as pd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
# from mpl_toolkits.basemap import Basemap
from scipy.signal import savgol_filter
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_samples, silhouette_score

colors = list(mcolors.XKCD_COLORS.keys())


def paint_others(y):
    """Plot a generic data series."""
    plt.plot([j for j in range(len(y))], y)
    # Add title and axis labels
    plt.xlabel('x')
    plt.ylabel('y')
    # Show the figure
    plt.show()


def compute_cos_similarity(a, b):
    """
    Compute the cosine similarity of two vectors.
    :param a: vector a
    :param b: vector b
    :return: cosine similarity value
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def compute_pearsonr(a):
    """
    Compute the Pearson correlation coefficients of the data and return the similarity matrix.
    :param a: n*m matrix, where n is the number of samples and m is the sample dimension
    :return: n*n similarity matrix
    """
    return np.corrcoef(a)


def compute_distance(a, b):
    """
    Compute the Euclidean distance between two vectors.
    :param a: vector a
    :param b: vector b
    :return: Euclidean distance between a and b
    """
    return np.linalg.norm(a - b)
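
# A minimal, illustrative sketch of the helpers above on toy vectors (the numbers are made up,
# not wind-farm data):
#
# >>> a = np.array([1.0, 2.0, 3.0])
# >>> b = np.array([2.0, 4.0, 6.0])
# >>> round(compute_cos_similarity(a, b), 4)
# 1.0
# >>> round(compute_distance(a, b), 4)
# 3.7417
# >>> compute_pearsonr(np.vstack([a, b])).shape
# (2, 2)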


def hierarchical_clustering(data, threshold, similarity_func):
    """
    Hierarchical clustering using the linkage and fcluster functions from scipy.cluster.hierarchy.
    :param data: 2-D data, an n*m matrix with n samples of dimension m
    :param threshold: distance threshold for cutting the dendrogram built from the similarity
        matrix; samples closer than the threshold are merged into the same cluster. It can be
        tuned as needed and may be greater than 1.
    :param similarity_func: similarity function used to compare samples; it can be replaced, but
        if a distance function is used instead, the code below must be adapted accordingly.
    :return: clustering result, an n*1 array where each value is the cluster id of that sample
    """
    # Compute the similarity matrix of the data
    similarity_matrix = similarity_func(data)
    # Convert similarity to a distance matrix
    distance_matrix = 1 - similarity_matrix
    # Perform hierarchical clustering (note: linkage treats a 2-D input as raw observation
    # vectors, not as a condensed distance matrix)
    Z = linkage(distance_matrix, method='ward')
    # Cut the dendrogram at the distance threshold to obtain cluster labels
    clusters = fcluster(Z, t=threshold, criterion='distance')
    print(clusters)
    # Plot the dendrogram of the hierarchical clustering
    # fig = plt.figure(figsize=(5, 3))
    # dn = dendrogram(Z)
    # plt.show()
    # clusters[42] = 1
    silhouette = silhouette_samples(np.abs(distance_matrix), clusters, metric='euclidean')
    silhouette1 = silhouette_score(np.abs(distance_matrix), clusters, metric='euclidean')
    print(f"Mean silhouette score: {silhouette1}, per-sample silhouette scores: {silhouette}")
    return clusters
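
# A hedged usage sketch of hierarchical_clustering on synthetic data (the shape and threshold
# below are illustrative assumptions, not values used by this pipeline). Note that the
# silhouette computation inside the function requires at least two resulting clusters.
#
# >>> rng = np.random.default_rng(0)
# >>> toy_series = rng.normal(size=(10, 96))      # 10 series, 96 samples each
# >>> labels = hierarchical_clustering(toy_series, threshold=1.4, similarity_func=compute_pearsonr)
# >>> labels.shape
# (10,)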


class DataAnalysis:
    """
    Data analysis class.
    """

    def __init__(self, data_length, data_start, data_end):
        """
        Initialization.
        :param data_length: length of the analyzed data segment
        :param data_start: start index of the analyzed data segment
        :param data_end: end index of the analyzed data segment
        """
        # Raw turbine power data and its Fourier-filtered version
        self.ori_turbine_power = None
        self.ori_turbine_fft = None
        # Raw turbine power data segment
        self.ori_turbine_pic = None
        # Clustering result
        self.cluster = None
        # Smoothed turbine power differences
        self.smooth_turbine_diff = None
        # Sign of the turbine power differences
        self.diff_change = None
        # Turbine power differences
        self.turbine_diff = None
        # All turbine data
        self.turbine = None
        # Turbine id order
        self.turbine_id = [x for x in range(76, 151, 1)]
        # b1b4 = [142, 143, 144, 145]
        # self.turbine_id = [id for id in self.turbine_id if id not in b1b4]
        # 15-minute turbine power data
        self.power_15min = None
        # Turbine latitude/longitude information
        self.info = None
        # Length of the data segment used
        self.data_length = data_length
        # Start index of the data segment used
        self.data_start = data_start
        # End index of the data segment used
        self.data_end = data_end
        # Load the data
        self.turbine_path = '../../cluster/260/turbine-{}.csv'
        self.load_data(normalize=False)
        # Compute turbine power differences
        self.compute_turbine_diff()

    def load_data(self, normalize=False):
        """
        Load the per-turbine CSV data.
        :return:
        """
        self.turbine = {}
        for i in self.turbine_id:
            self.turbine[i] = pd.read_csv(self.turbine_path.format(i)).reset_index(drop=True)
        if normalize is True:
            self.normalize()

    def normalize(self):
        """Z-score normalize every column except C_TIME, using statistics over all turbines."""
        turbines = [self.turbine[i].values[:, 1:].astype(np.float32) for i in self.turbine_id]
        turbines = np.vstack(turbines)
        mean, std = np.mean(turbines, axis=0), np.std(turbines, axis=0)
        for i in self.turbine_id:
            c_time = self.turbine[i]['C_TIME']
            self.turbine[i] = (self.turbine[i].iloc[:, 1:] - mean) / std
            self.turbine[i].insert(loc=0, column='C_TIME', value=c_time)
        return self.turbine

    def compute_turbine_diff(self):
        """
        Compute turbine power differences.
        :return:
        """
        turbine_diff = []
        ori_turbine_pic = []
        ori_turbine_power = []
        for turbine_i in self.turbine_id:
            ori = np.array(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end + 1])
            diff_array = np.diff(ori)
            smoothness_value = np.std(diff_array)
            print("Smoothness of turbine-{}: {}".format(turbine_i, round(smoothness_value, 2)))
            turbine_diff.append(diff_array)
            ori_turbine_pic.append(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end])
            ori_turbine_power.append(self.turbine[turbine_i]['C_ACTIVE_POWER'].values[self.data_start:self.data_end])
        self.ori_turbine_power = ori_turbine_power
        self.ori_turbine_pic = ori_turbine_pic
        self.turbine_diff = turbine_diff
        diff_change = []
        for diff_i in turbine_diff:
            single_diff_change = []
            for diff_i_i in diff_i:
                if diff_i_i > 0:
                    single_diff_change.append(1)
                elif diff_i_i < 0:
                    single_diff_change.append(-1)
                else:
                    single_diff_change.append(0)
            diff_change.append(single_diff_change)
        self.diff_change = diff_change
        self.ori_turbine_fft = [self.turbine_fft(i + 1) for i in range(len(self.ori_turbine_pic))]
        # Smooth the differences
        self.turbine_smooth(window_size=21)
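
    # A minimal sketch of the "smoothness" metric used above (standard deviation of the first
    # differences); the series below is made up for illustration:
    #
    # >>> series = np.array([1.0, 1.1, 1.3, 1.2, 1.5])
    # >>> float(round(np.std(np.diff(series)), 2))
    # 0.15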

    def paint_map(self):
        """
        Plot turbine latitude/longitude on a map (requires the Basemap import that is
        commented out at the top of this file).
        :return:
        """
        lats = self.info['纬度'].values
        lons = self.info['经度'].values
        map = Basemap()
        # Draw coastlines and country borders
        map.drawcoastlines()
        map.drawcountries()
        # Draw meridians and parallels
        map.drawmeridians(range(0, 360, 30))
        map.drawparallels(range(-90, 90, 30))
        # Plot the points
        x, y = map(lons, lats)
        map.plot(x, y, 'bo', markersize=10)
        # Show the figure
        plt.show()

    def paint_power15min(self):
        """
        Plot the 15-minute power curve.
        :return:
        """
        plt.plot(self.power_15min['C_REAL_VALUE'])
        # Set the title and axis labels
        plt.title('Data Time Change Curve')
        plt.xlabel('Date')
        plt.ylabel('Value')
        # Show the figure
        plt.show()

    def paint_lats_lons(self):
        """
        Scatter plot of turbine latitude/longitude.
        :return:
        """
        x = self.info['纬度'].values
        y = self.info['经度'].values
        # Draw the scatter plot
        fig, ax = plt.subplots()
        plt.scatter(x, y)
        for i, txt in enumerate(self.info['id'].values):
            ax.annotate(txt, (x[i], y[i]))
        # Set the axis labels
        plt.xlabel('lats')
        plt.ylabel('lons')
        # Show the figure
        plt.show()

    def similarity_score(self, turbine_diff, threshold=0.5):
        """
        Compute pairwise cosine similarity scores and return, for each series, the indices of
        the series whose similarity exceeds the threshold.
        :param turbine_diff: matrix to compare, shape n*m with n series of dimension m
        :param threshold: similarity threshold
        :return: dict mapping each series index to the indices of its similar series
        """
        n = len(turbine_diff)
        similarity = {i: [] for i in range(n)}
        similarity_index = {i: [] for i in range(n)}
        for turbine_i in range(n):
            for turbine_j in range(n):
                cos_similarity = compute_cos_similarity(turbine_diff[turbine_i], turbine_diff[turbine_j])
                similarity[turbine_i].append(cos_similarity)
                if cos_similarity > threshold:
                    similarity_index[turbine_i].append(turbine_j)
        return similarity_index
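
    # A hedged sketch of similarity_score on an existing DataAnalysis instance `da` (the toy
    # diff matrix and threshold are illustrative assumptions):
    #
    # >>> toy_diff = np.random.normal(size=(5, 200))
    # >>> similar = da.similarity_score(toy_diff, threshold=0.5)
    # >>> sorted(similar.keys())
    # [0, 1, 2, 3, 4]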

    def mapping_turbines(self):
        """Save the cluster-id -> turbine-id mapping to a pickle file next to the turbine data."""
        import pickle
        turbine_clus = {}
        for a, b in zip(self.turbine_id, self.cluster):
            print("Turbine id: {}, cluster: {}".format(a, b))
            turbine_clus.setdefault(b, []).append(a)
        path = os.path.join(os.path.dirname(self.turbine_path), 'turbine_cls.pickle')
        with open(path, 'wb') as file:
            pickle.dump(turbine_clus, file)

    def paint_turbine(self, paint_default=True):
        """
        Plot the turbine geographic distribution / per-cluster average curves.
        :param paint_default: default True, plot a line chart for each cluster after clustering
        :return: None
        """
        # y = self.info['纬度'].values
        # x = self.info['经度'].values
        #
        # fig, ax = plt.subplots(figsize=(15, 15))
        #
        # plt.scatter(x, y, c=self.cluster)
        # for i, txt in enumerate(self.info['C_ID'].values):
        #     ax.annotate(txt, (x[i], y[i]))
        # Set the title and axis labels
        # plt.xlabel('lons')
        # plt.ylabel('lats')
        # plt.legend()
        #
        # # Show the figure
        # plt.savefig('analysis_img/turbine_cluster.png')
        # plt.show()
        plt.figure(figsize=(60, 40))
        cmap = plt.get_cmap('viridis')
        linestyle = ['solid', 'dashed', 'dotted', 'dashdot']
        for i in range(max(self.cluster)):
            cluster, cluster_fft, cluster_power = [], [], []
            for j, item in enumerate(self.cluster):
                if item == i + 1:
                    cluster.append(self.ori_turbine_pic[j])
                    cluster_fft.append(self.ori_turbine_fft[j])
                    cluster_power.append(self.ori_turbine_power[j])
            cluster_power = np.average(cluster_power, axis=0)
            cluster_fft = np.average(cluster_fft, axis=0)
            cluster = np.average(cluster, axis=0)
            diff_array = np.diff(cluster)
            smoothness_value = np.std(diff_array)
            print("Smoothness of cluster-{}: {}".format(i + 1, smoothness_value))
            color = cmap(i * 200)
            plt.figure(1)
            # plt.subplot(max(self.cluster), 1, 1)
            # print("----", cluster, linestyle[i])
            # plt.plot([j for j in range(len(cluster))], cluster, color=color, label='cluster' + str(i))
            # plt.subplot(max(self.cluster), 1, 2)
            # plt.plot([j for j in range(len(cluster_fft))], cluster_fft, color=color, label='cluster' + str(i))
            # ws_power_dict = {}
            # for c, p in zip(cluster, cluster_power):
            #     ws_power_dict.setdefault(round(c, 2), []).append(round(p, 2))
            #
            # for key, value in ws_power_dict.items():
            #     ws_power_dict[key] = round(np.average(value), 2)
            # print(ws_power_dict)
            # plt.scatter(cluster, cluster_power, color=color, label='cluster' + str(i),
            #             linestyle=linestyle[i], s=1, alpha=0.2)
        # Add the legend
        # plt.legend()
        # # Show the figure
        # plt.savefig('./clusters.png')
        # plt.show()
        # if paint_default:
        #     for i in range(max(self.cluster)):
        #         self.paint_turbine_k(i + 1)  # plot the curve of every turbine in each cluster

    def turbine_smooth(self, window_size=50):
        """
        Smooth the power differences with a Savitzky-Golay filter.

        Parameters:
            window_size -- smoothing window length, integer

        Result:
            self.smooth_turbine_diff -- smoothed data, list of numpy arrays
        """
        # weights = np.repeat(1.0, window_size) / window_size
        smooth_data = []
        for turbine_diff_i in self.turbine_diff:
            smooth_y = savgol_filter(turbine_diff_i, window_length=window_size, polyorder=3)
            smooth_data.append(smooth_y)
            # smooth_data.append(np.convolve(turbine_diff_i, weights, 'valid'))
        self.smooth_turbine_diff = smooth_data
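
    # A hedged sketch of the Savitzky-Golay smoothing used above (synthetic signal; the real
    # call in compute_turbine_diff uses window_size=21, polyorder=3):
    #
    # >>> noisy = np.sin(np.linspace(0, 2 * np.pi, 100)) + np.random.normal(0, 0.1, 100)
    # >>> smooth = savgol_filter(noisy, window_length=21, polyorder=3)
    # >>> smooth.shape
    # (100,)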

    def paint_turbine_k(self, k):
        """
        Plot the data of every turbine in cluster k.
        :param k: cluster id
        :return:
        """
        pic_label = []
        y = []
        plt.figure(figsize=(20, 10))
        cmap = plt.get_cmap('viridis')
        for i, item in enumerate(self.cluster):
            if item == k:
                pic_label.append('turbine-' + str(self.turbine_id[i]))
                y.append(self.ori_turbine_fft[i])
        for i in range(len(y)):
            color = cmap(i / 10)
            plt.plot([j for j in range(len(y[i]))], y[i], color=color, label=pic_label[i])
        # Add labels and title
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title('Cluster {}'.format(k))
        # Add the legend
        plt.legend()
        # Show the figure
        plt.savefig('analysis_img/cluster/cluster_{}.png'.format(k))
        plt.show()

    def turbine_fft(self, k):
        """
        Apply a Fourier-transform filter to the k-th raw turbine series (and optionally plot the
        signal before and after filtering).
        :param k: turbine index in loading order, starting from 1
        :return: the Fourier-filtered signal
        """
        y = self.ori_turbine_pic
        t = np.linspace(0, 1, self.data_length)
        signal = y[k - 1]
        # Fourier transform
        freq = np.fft.fftfreq(len(signal), t[1] - t[0])
        spectrum = np.fft.fft(signal)
        spectrum_abs = np.abs(spectrum)
        # Keep only the spectral components above the 98th-percentile magnitude
        threshold = np.percentile(spectrum_abs, 98)
        indices = spectrum_abs > threshold
        spectrum_clean = indices * spectrum
        # Inverse Fourier transform
        signal_clean = np.fft.ifft(spectrum_clean)
        # plt.figure(figsize=(20, 10))
        #
        # # Plot the time-domain signal
        # plt.subplot(4, 1, 1)
        # plt.plot(t, signal)
        # plt.title(self.turbine_id[k - 1])
        #
        # # Plot the frequency-domain signal
        # plt.subplot(4, 1, 2)
        # plt.plot(freq, np.abs(spectrum))
        #
        # # Plot the filtered frequency-domain signal
        # plt.subplot(4, 1, 3)
        # plt.plot(freq, np.abs(spectrum_clean))
        #
        # # Plot the filtered time-domain signal
        # plt.subplot(4, 1, 4)
        # plt.plot(t, signal_clean)
        #
        # plt.savefig('analysis_img/fft/{}_turbine_fft.png'.format(self.turbine_id[k - 1]))
        # plt.show()
        return signal_clean
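
    # A hedged, self-contained sketch of the FFT thresholding idea in turbine_fft: keep only
    # spectral components above the 98th-percentile magnitude, then invert. The signal below
    # is synthetic, not turbine data:
    #
    # >>> t = np.linspace(0, 1, 512)
    # >>> sig = np.sin(2 * np.pi * 5 * t) + 0.3 * np.random.normal(size=512)
    # >>> spec = np.fft.fft(sig)
    # >>> keep = np.abs(spec) > np.percentile(np.abs(spec), 98)
    # >>> cleaned = np.fft.ifft(spec * keep).real
    # >>> cleaned.shape
    # (512,)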

    def paint_double(self, i, j):
        """
        Plot the transformed data of two turbines for comparison.
        :param i: turbine index in loading order, starting from 1
        :param j: turbine index in loading order, starting from 1
        :return:
        """
        y = self.ori_turbine_fft
        data_i = y[i - 1]
        data_j = y[j - 1]
        x = [index for index in range(len(data_i))]
        plt.figure(figsize=(20, 10))
        plt.plot(x, data_i, label='turbine {}'.format(self.turbine_id[i - 1]), linestyle='solid')
        plt.plot(x, data_j, label='turbine {}'.format(self.turbine_id[j - 1]), linestyle='dashed')
        plt.title('{} and {}'.format(i, j))
        plt.legend()
        plt.savefig('analysis_img/{}_{}_turbine.png'.format(self.turbine_id[i - 1], self.turbine_id[j - 1]))
        plt.show()

    def process_ori_data(self):
        """
        Process the raw data: clustering and plotting.
        :return:
        """
        self.turbine_clusters(self.ori_turbine_fft)
        self.paint_turbine()

    def turbine_clusters(self, data=None):
        """
        Cluster the turbine data; the result is stored in self.cluster.
        :param data: defaults to None, in which case the power differences are clustered;
            other data can be passed in instead and will be reflected in the plots.
            Format: 2-D data n*m, with n samples of dimension m.
        :return: None
        """
        if data is None:
            cluster = hierarchical_clustering(self.turbine_diff, threshold=1.4,
                                              similarity_func=compute_pearsonr)  # hierarchical clustering
        else:
            cluster = hierarchical_clustering(data, threshold=0.6,
                                              similarity_func=compute_pearsonr)
        self.cluster = cluster
        # Save the cluster variable here
        # from cluster_analysis import cluster_power_list_file, cluster_power_list_folder
        #
        # output_path = '../data-process/data/cluster_power/'
        # cluster_power_list_file(self.cluster, self.turbine_id,
        #                         input_path='../data-process/data/output_filtered_csv_files/', output_path=output_path)
        # cluster_power_list_folder(self.cluster, self.turbine_id, input_path='../data-process/data/continuous_data/',
        #                           output_path=output_path)


if __name__ == '__main__':
    # import pickle
    # with open('./turbine_dict.pickle', 'rb') as f:
    #     turbine_dict = pickle.load(f)
    # number, dt = 0, []
    # for key, turbine in turbine_dict.items():
    #     if number == 0:
    #         dt = turbine['时间']
    #         number += 1
    #     else:
    #         dt = pd.to_datetime(list(set(dt) & set(turbine['时间'])))
    # for key, turbine in turbine_dict.items():
    #     turbine_dict[key] = turbine[turbine['时间'].isin(dt)]
    # with open('./turbine_dict_common.pickle', 'wb') as f:
    #     pickle.dump(turbine_dict, f)
    data_analysis = DataAnalysis(data_length=8549,
                                 data_start=0,
                                 data_end=8540)
    data_analysis.process_ori_data()
    data_analysis.mapping_turbines()
    # data_analysis.paint_double(1, 56)