|
@@ -0,0 +1,425 @@
|
|
|
+# !usr/bin/env python
|
|
|
+# -*- coding:utf-8 _*-
|
|
|
+"""
|
|
|
+@Author:Lijiaxing
|
|
|
+
|
|
|
+@File:data_analysis.py
|
|
|
+@Time:2023/4/24 15:16
|
|
|
+
|
|
|
+"""
|
|
|
+import pandas as pd
|
|
|
+#from mpl_toolkits.basemap import Basemap
|
|
|
+from scipy.signal import savgol_filter
|
|
|
+import numpy as np
|
|
|
+import matplotlib.pyplot as plt
|
|
|
+from scipy.cluster.hierarchy import linkage, fcluster
|
|
|
+
|
|
|
+
|
|
|
+def paint_others(y):
|
|
|
+ """ 绘制其他数据 """
|
|
|
+ plt.plot([j for j in range(y)], y)
|
|
|
+ # 添加标题和标签
|
|
|
+ plt.xlabel('x')
|
|
|
+ plt.ylabel('y')
|
|
|
+
|
|
|
+ # 显示图形
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+
|
|
|
+def compute_cos_similarity(a, b):
|
|
|
+ """
|
|
|
+ 计算两个向量的余弦相似度
|
|
|
+ :param a: 向量a
|
|
|
+ :param b: 向量b
|
|
|
+ :return: 余弦相似度值
|
|
|
+ """
|
|
|
+ return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
|
|
|
+
|
|
|
+
|
|
|
+def compute_pearsonr(a):
|
|
|
+ """
|
|
|
+ 计算数据皮尔逊相关系数并返回相似度矩阵
|
|
|
+ :param a: 数据格式为n*m的矩阵,n为数据个数,m为数据维度
|
|
|
+ :return: 返回相似度矩阵,数据格式为n*n的矩阵
|
|
|
+ """
|
|
|
+ return np.corrcoef(a)
|
|
|
+
|
|
|
+
|
|
|
+def compute_distance(a, b):
|
|
|
+ """
|
|
|
+ 计算两个向量的欧式距离
|
|
|
+ :param a:
|
|
|
+ :param b:
|
|
|
+ :return: 返回两个向量的欧式距离
|
|
|
+ """
|
|
|
+ return np.linalg.norm(a - b)
|
|
|
+
|
|
|
+
|
|
|
+def hierarchical_clustering(data, threshold, similarity_func):
|
|
|
+ """
|
|
|
+ 层次聚类,使用工具包scipy.cluster.hierarchy中的linkage和fcluster函数进行层次聚类
|
|
|
+ :param data: 二维数据,格式为n*m的矩阵,n为数据个数,m为数据维度
|
|
|
+ :param threshold: 阈值,当两个数据的距离小于阈值时,将两个数据归为一类,阈值为根据相似度矩阵层次聚类后的类别距离阈值,可根据需求进行调整,可大于1
|
|
|
+ :param similarity_func: 相似度计算函数,用于计算两个数据的相似度,可以进行替换,若替换为计算距离的函数需对内部进行修改
|
|
|
+ :return: 返回聚类结果,格式为n*1的矩阵,n为数据个数,每个数据的值为该数据所属的类别
|
|
|
+ """
|
|
|
+ # 计算数据的相似度矩阵
|
|
|
+ similarity_matrix = similarity_func(data)
|
|
|
+
|
|
|
+ # 计算数据的距离矩阵
|
|
|
+ distance_matrix = 1 - similarity_matrix
|
|
|
+
|
|
|
+ # 进行层次聚类返回聚类结果
|
|
|
+ Z = linkage(distance_matrix, method='ward')
|
|
|
+ # 根据相似度阈值获取聚类结果
|
|
|
+ clusters = fcluster(Z, t=threshold, criterion='distance')
|
|
|
+ return clusters
|
|
|
+
|
|
|
+
|
|
|
+class DataAnalysis:
|
|
|
+ """
|
|
|
+ 数据分析类
|
|
|
+ """
|
|
|
+
|
|
|
+ def __init__(self, data_length, data_start, data_end):
|
|
|
+ """
|
|
|
+ 初始化
|
|
|
+ :param data_length: 分析数据段长度
|
|
|
+ :param data_start: 分析数据段开始位置
|
|
|
+ :param data_end: 分析数据段结束位置
|
|
|
+ """
|
|
|
+ # 原始风机功率数据傅里叶变换滤波后的数据
|
|
|
+ self.ori_turbine_fft = None
|
|
|
+ # 原始风机功率数据片段
|
|
|
+ self.ori_turbine_pic = None
|
|
|
+ # 聚类结果
|
|
|
+ self.cluster = None
|
|
|
+ # 风机功率差分平滑后的结果
|
|
|
+ self.smooth_turbine_diff = None
|
|
|
+ # 风机功率差分变化情况
|
|
|
+ self.diff_change = None
|
|
|
+ # 风机功率差分
|
|
|
+ self.turbine_diff = None
|
|
|
+ # 全部风机数据
|
|
|
+ self.turbine = None
|
|
|
+ # 风机的标号顺序
|
|
|
+ self.turbine_id = list(range(102, 162))
|
|
|
+ self.turbine_id.remove(144)
|
|
|
+ # 风机功率数据15分钟级别
|
|
|
+ self.power_15min = None
|
|
|
+ # 风机经纬度信息
|
|
|
+ self.info = None
|
|
|
+ # 使用数据长度
|
|
|
+ self.data_length = data_length
|
|
|
+ # 使用数据开始位置
|
|
|
+ self.data_start = data_start
|
|
|
+ # 使用数据结束位置
|
|
|
+ self.data_end = data_end
|
|
|
+ # 导入数据
|
|
|
+ self.load_data()
|
|
|
+ # 计算风机功率差分
|
|
|
+ self.compute_turbine_diff()
|
|
|
+
|
|
|
+ def load_data(self):
|
|
|
+ """
|
|
|
+ 加载数据
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ self.info = pd.read_csv('../data/风机信息.csv', encoding='utf-8')
|
|
|
+ # power_15min = pd.read_csv('../data/power_15min.csv')
|
|
|
+ # for i in range(len(power_15min)):
|
|
|
+ # if power_15min.loc[i, 'C_REAL_VALUE'] == -9999:
|
|
|
+ # # 方便在曲线中看出缺失数据位置
|
|
|
+ # power_15min.loc[i, 'C_REAL_VALUE'] = -34.56789
|
|
|
+ # self.power_15min = power_15min
|
|
|
+ turbine_path = '../data/output_filtered_csv_files/turbine-{}.csv'
|
|
|
+ self.turbine = {}
|
|
|
+ for i in self.turbine_id:
|
|
|
+ self.turbine[i] = pd.read_csv(turbine_path.format(i))[21:]
|
|
|
+
|
|
|
+ def compute_turbine_diff(self):
|
|
|
+ """
|
|
|
+ 计算风机功率差分
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ turbine_diff = []
|
|
|
+ ori_turbine_pic = []
|
|
|
+ for turbine_i in self.turbine_id:
|
|
|
+ diff_array = np.diff(
|
|
|
+ np.array(self.turbine[turbine_i]['C_ACTIVE_POWER'].values[self.data_start:self.data_end+1]))
|
|
|
+ turbine_diff.append(diff_array)
|
|
|
+ ori_turbine_pic.append(self.turbine[turbine_i]['C_ACTIVE_POWER'].values[self.data_start:self.data_end])
|
|
|
+ self.ori_turbine_pic = ori_turbine_pic
|
|
|
+ self.turbine_diff = turbine_diff
|
|
|
+
|
|
|
+ diff_change = []
|
|
|
+ for diff_i in turbine_diff:
|
|
|
+ single_diff_change = []
|
|
|
+ for diff_i_i in diff_i:
|
|
|
+ if diff_i_i > 0:
|
|
|
+ single_diff_change.append(1)
|
|
|
+ elif diff_i_i < 0:
|
|
|
+ single_diff_change.append(-1)
|
|
|
+ else:
|
|
|
+ single_diff_change.append(0)
|
|
|
+ diff_change.append(single_diff_change)
|
|
|
+ self.diff_change = diff_change
|
|
|
+ self.ori_turbine_fft = [self.turbine_fft(i + 1) for i in range(len(self.ori_turbine_pic))]
|
|
|
+
|
|
|
+ # 平滑
|
|
|
+ self.turbine_smooth(window_size=21)
|
|
|
+
|
|
|
+ def paint_map(self):
|
|
|
+ """
|
|
|
+ 绘制经纬度地图
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ lats = self.info['纬度'].values
|
|
|
+ lons = self.info['经度'].values
|
|
|
+ map = Basemap()
|
|
|
+
|
|
|
+ # 绘制海岸线和国家边界
|
|
|
+ map.drawcoastlines()
|
|
|
+ map.drawcountries()
|
|
|
+
|
|
|
+ # 绘制经纬度坐标
|
|
|
+ map.drawmeridians(range(0, 360, 30))
|
|
|
+ map.drawparallels(range(-90, 90, 30))
|
|
|
+
|
|
|
+ # 绘制点
|
|
|
+
|
|
|
+ x, y = map(lons, lats)
|
|
|
+ map.plot(x, y, 'bo', markersize=10)
|
|
|
+
|
|
|
+ # 显示图表
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+ def paint_power15min(self):
|
|
|
+ """
|
|
|
+ 绘制15分钟功率曲线
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+
|
|
|
+ plt.plot(self.power_15min['C_REAL_VALUE'])
|
|
|
+
|
|
|
+ # 设置图表标题和轴标签
|
|
|
+ plt.title('Data Time Change Curve')
|
|
|
+ plt.xlabel('Date')
|
|
|
+ plt.ylabel('Value')
|
|
|
+
|
|
|
+ # 显示图表
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+ def paint_lats_lons(self):
|
|
|
+ """
|
|
|
+ 绘制经纬度图
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ x = self.info['纬度'].values
|
|
|
+ y = self.info['经度'].values
|
|
|
+
|
|
|
+ # 绘制散点图
|
|
|
+ fig, ax = plt.subplots()
|
|
|
+ plt.scatter(x, y)
|
|
|
+
|
|
|
+ for i, txt in enumerate(self.info['id'].values):
|
|
|
+ ax.annotate(txt, (x[i], y[i]))
|
|
|
+
|
|
|
+ # 设置图表标题和轴标签
|
|
|
+ plt.xlabel('lats')
|
|
|
+ plt.ylabel('lons')
|
|
|
+
|
|
|
+ # 显示图表
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+ def similarity_score(self, turbine_diff, threshold=0.5):
|
|
|
+ """
|
|
|
+ 使用余弦相似度计算相似度分数并返回相似度大于阈值的index矩阵
|
|
|
+ :param turbine_diff: 需要计算相似的矩阵,数据格式n*m,n为数据条数,m为数据维数
|
|
|
+ :param threshold: 相似度阈值
|
|
|
+ :return: 返回相似计算后的矩阵
|
|
|
+ """
|
|
|
+ similarity = {i: [] for i in range(49)}
|
|
|
+ similarity_index = {i: [] for i in range(49)}
|
|
|
+ for turbine_i in range(49):
|
|
|
+ for turbine_j in range(49):
|
|
|
+ cos_similarity = compute_cos_similarity(turbine_diff[turbine_i], turbine_diff[turbine_j])
|
|
|
+ similarity[turbine_i].append(cos_similarity)
|
|
|
+ if cos_similarity > threshold:
|
|
|
+ similarity_index[turbine_i].append(turbine_j)
|
|
|
+ return similarity_index
|
|
|
+
|
|
|
+ def paint_turbine(self, paint_default=True):
|
|
|
+ """
|
|
|
+ 绘制风机地理位置图
|
|
|
+ :param paint_default:默认True,绘制聚类后每个类别的数据折线图
|
|
|
+ :return: None
|
|
|
+ """
|
|
|
+
|
|
|
+ # y = self.info['纬度'].values
|
|
|
+ # x = self.info['经度'].values
|
|
|
+ #
|
|
|
+ # fig, ax = plt.subplots(figsize=(15, 15))
|
|
|
+ #
|
|
|
+ # plt.scatter(x, y, c=self.cluster)
|
|
|
+ # for i, txt in enumerate(self.info['C_ID'].values):
|
|
|
+ # ax.annotate(txt, (x[i], y[i]))
|
|
|
+
|
|
|
+ # 设置图表标题和轴标签
|
|
|
+ # plt.xlabel('lons')
|
|
|
+ # plt.ylabel('lats')
|
|
|
+ # plt.legend()
|
|
|
+ #
|
|
|
+ # # 显示图表
|
|
|
+ # plt.savefig('analysis_img/turbine_cluster.png')
|
|
|
+ # plt.show()
|
|
|
+ if paint_default:
|
|
|
+ for i in range(max(self.cluster)):
|
|
|
+ self.paint_turbine_k(i + 1)
|
|
|
+
|
|
|
+ def turbine_smooth(self, window_size=50):
|
|
|
+ """
|
|
|
+ 使用滑动平均平滑数据。
|
|
|
+
|
|
|
+ 参数:
|
|
|
+ data -- 需要平滑的数据,numpy数组类型
|
|
|
+ window_size -- 滑动窗口大小,整数类型
|
|
|
+
|
|
|
+ 返回值:
|
|
|
+ smooth_data -- 平滑后的数据,numpy数组类型
|
|
|
+ """
|
|
|
+
|
|
|
+ # weights = np.repeat(1.0, window_size) / window_size
|
|
|
+ smooth_data = []
|
|
|
+ for turbine_diff_i in self.turbine_diff:
|
|
|
+ smooth_y = savgol_filter(turbine_diff_i, window_length=window_size, polyorder=3)
|
|
|
+ smooth_data.append(smooth_y)
|
|
|
+ # smooth_data.append(np.convolve(turbine_diff_i, weights, 'valid'))
|
|
|
+ self.smooth_turbine_diff = smooth_data
|
|
|
+
|
|
|
+ def paint_turbine_k(self, k):
|
|
|
+ """
|
|
|
+ 绘制第k聚类的风机数据折线图
|
|
|
+ :param k:
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ pic_label = []
|
|
|
+ y = []
|
|
|
+ plt.figure(figsize=(20, 10))
|
|
|
+ cmap = plt.get_cmap('viridis')
|
|
|
+ for i, item in enumerate(self.cluster):
|
|
|
+ if item == k:
|
|
|
+ pic_label.append('turbine-'+str(self.turbine_id[i]))
|
|
|
+ y.append(self.ori_turbine_fft[i])
|
|
|
+ for i in range(len(y)):
|
|
|
+ color = cmap(i / 10)
|
|
|
+ plt.plot([j for j in range(len(y[i]))], y[i], color=color, label=pic_label[i])
|
|
|
+ # 添加标签和标题
|
|
|
+ plt.xlabel('x')
|
|
|
+ plt.ylabel('y')
|
|
|
+ plt.title('Cluster {}'.format(k))
|
|
|
+
|
|
|
+ # 添加图例
|
|
|
+ plt.legend()
|
|
|
+
|
|
|
+ # 显示图形
|
|
|
+ plt.savefig('analysis_img/cluster/cluster_{}.png'.format(k))
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+ def turbine_fft(self, k):
|
|
|
+ """
|
|
|
+ 对第k台原始风机数据进行傅里叶变换,并绘制变换前后曲线
|
|
|
+ :param k: 数据读入时的风机顺序index,从1开始
|
|
|
+ :return: 傅里叶变换清洗后的数据,数据格式
|
|
|
+ """
|
|
|
+ y = self.ori_turbine_pic
|
|
|
+ t = np.linspace(0, 1, self.data_length)
|
|
|
+ signal = y[k - 1]
|
|
|
+
|
|
|
+ # 进行傅里叶变换
|
|
|
+ freq = np.fft.fftfreq(len(signal), t[1] - t[0])
|
|
|
+ spectrum = np.fft.fft(signal)
|
|
|
+ spectrum_abs = np.abs(spectrum)
|
|
|
+ threshold = np.percentile(spectrum_abs, 98)
|
|
|
+ indices = spectrum_abs > threshold
|
|
|
+ spectrum_clean = indices * spectrum
|
|
|
+
|
|
|
+ # 进行傅里叶逆变换
|
|
|
+ signal_clean = np.fft.ifft(spectrum_clean)
|
|
|
+ # plt.figure(figsize=(20, 10))
|
|
|
+ #
|
|
|
+ # # 绘制时域信号
|
|
|
+ # plt.subplot(4, 1, 1)
|
|
|
+ # plt.plot(t, signal)
|
|
|
+ # plt.title(k)
|
|
|
+ #
|
|
|
+ # # 绘制频域信号
|
|
|
+ # plt.subplot(4, 1, 2)
|
|
|
+ # plt.plot(freq, np.abs(spectrum))
|
|
|
+ #
|
|
|
+ # # 绘制过滤后的频域信号
|
|
|
+ # plt.subplot(4, 1, 3)
|
|
|
+ # plt.plot(freq, np.abs(spectrum_clean))
|
|
|
+ #
|
|
|
+ # # 绘制过滤后的时域信号
|
|
|
+ # plt.subplot(4, 1, 4)
|
|
|
+ # plt.plot(t, signal_clean)
|
|
|
+ #
|
|
|
+ # plt.savefig('analysis_img/fft/{}_turbine_fft.png'.format(k))
|
|
|
+ # plt.show()
|
|
|
+ return signal_clean
|
|
|
+
|
|
|
+ def paint_double(self, i, j):
|
|
|
+ """
|
|
|
+ 绘制两台风机的数据变换对比
|
|
|
+ :param i: 风机数据读入时数据编号,从1开始
|
|
|
+ :param j: 风机数据读入时数据编号,从1开始
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ y = self.ori_turbine_fft
|
|
|
+ x = [index for index in range(self.data_length)]
|
|
|
+ data_i = y[i - 1]
|
|
|
+ data_j = y[j - 1]
|
|
|
+
|
|
|
+ plt.figure(figsize=(20, 10))
|
|
|
+ plt.plot(x, data_i, label='turbine {}'.format(self.turbine_id[i-1]), linestyle='solid')
|
|
|
+ plt.plot(x, data_j, label='turbine {}'.format(self.turbine_id[j-1]), linestyle='dashed')
|
|
|
+
|
|
|
+ plt.title('{} and {}'.format(i, j))
|
|
|
+ plt.legend()
|
|
|
+ plt.savefig('analysis_img/{}_{}_turbine.png'.format(self.turbine_id[i-1], self.turbine_id[j-1]))
|
|
|
+ plt.show()
|
|
|
+
|
|
|
+ def process_ori_data(self):
|
|
|
+ """
|
|
|
+ 对原始数据进行处理,聚类和绘图
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ self.turbine_clusters(self.ori_turbine_fft)
|
|
|
+ self.paint_turbine()
|
|
|
+
|
|
|
+ def turbine_clusters(self, data=None):
|
|
|
+ """
|
|
|
+ 风机数据聚类,聚类信息保存在self.cluster中
|
|
|
+ :param data: 默认为空,也可以使用其他数据聚类,并体现在绘图中,
|
|
|
+ 数据格式:二维数据n*m,n为数据条数,m为每条数据维数
|
|
|
+ :return: None
|
|
|
+ """
|
|
|
+ if data is None:
|
|
|
+ cluster = hierarchical_clustering(self.turbine_diff, threshold=1.4,
|
|
|
+ similarity_func=compute_pearsonr) # 层次聚类
|
|
|
+ else:
|
|
|
+ cluster = hierarchical_clustering(data, threshold=1,
|
|
|
+ similarity_func=compute_pearsonr)
|
|
|
+ self.cluster = cluster
|
|
|
+ from cluster_power import cluster_power
|
|
|
+ out_put = '../data/cluester_power/'
|
|
|
+ cluster_power(self.cluster, out_put)
|
|
|
+
|
|
|
+
|
|
|
+data_analysis = DataAnalysis(data_length=9771,
|
|
|
+ data_start=0,
|
|
|
+ data_end=9771)
|
|
|
+
|
|
|
+data_analysis.process_ori_data()
|
|
|
+data_analysis.paint_double(20, 21)
|