瀏覽代碼

轮廓系数-平滑度-聚类添加字段和对比图2

liudawei 1 年之前
父節點
當前提交
5620336f2b
共有 2 個文件被更改,包括 75 次插入39 次删除
  1. 22 24
      cluster_analysis.py
  2. 53 15
      data_analysis.py

+ 22 - 24
cluster_analysis.py

@@ -29,25 +29,15 @@ def read_cfs(cfs, input_path, output_path, is_folder=False):
                 raise ValueError
             dfj['C_ACTIVE_POWER'] += df['C_ACTIVE_POWER']
             dfj['C_WS'] += df['C_WS']
-        dfj['C_WS'] /= len(dfj)
+        dfj['C_WS'] /= len(dfs_j)
         dfj.rename(columns=({'C_ACTIVE_POWER':'C_ACTIVE_POWER'+str(j), 'C_WS': 'C_WS'+str(j)}), inplace=True)
         if is_folder:
-            dfj.to_csv(os.path.join(output_path, 'cluster_' + str(j) + '.csv'), index=False)
+            dfj[20:].to_csv(os.path.join(output_path, 'cluster_' + str(j) + '.csv'), index=False)
         else:
             dfj[20:].to_csv(os.path.join(output_path, 'cluster_' + str(j) + '.csv'), index=False)
         dfs[j] = dfj
     return dfs
 
-def normalize(input_path, turbine):
-    turbines = [pd.read_csv(f) for f in os.listdir(os.path.join(input_path))]
-    turbines = [turbine.values[:, 1:].astype(np.float32) for turbine in turbines]
-    turbines = np.vstack(turbines)
-    mean, std = np.mean(turbines, axis=0), np.std(turbines, axis=0)
-
-    c_time = turbine['C_TIME']
-    turbine = (turbine.iloc[:, 1:] - mean) / std
-    turbine.insert(loc=0, column='C_TIME', value=c_time)
-    return turbine
 
 def get_cfs(cluster, turbine_id):
     cfs = {}
@@ -59,6 +49,15 @@ def get_cfs(cluster, turbine_id):
     return cfs
 
 
+def cluster_data_indep(dfs_cluster, root_path):
+    df_power = pd.read_csv(root_path + "power.csv")
+    df_nwp = pd.read_csv(root_path + "NWP.csv",
+                         usecols=["C_TIME", "C_WS100", "C_WS170"])
+    df_all = pd.concat([df_power.set_index("C_TIME"), df_nwp.set_index("C_TIME"),
+                        dfs_cluster], axis=1, join="inner")
+    return df_all
+
+
 def cluster_power_list_file(cluster, turbine_id, input_path, output_path):
     """
     从turbine-*.csv的文件列表中进行聚类功率相加
@@ -73,6 +72,8 @@ def cluster_power_list_file(cluster, turbine_id, input_path, output_path):
     cfs = get_cfs(cluster, turbine_id)
     dfs = read_cfs(cfs, input_path, output_path)
     dfs_cluster = pd.concat([df.set_index("C_TIME") for df in dfs.values()], join='inner', axis=1)
+    dfs_cluster['SUM'] = dfs_cluster.filter(like='C_ACTIVE_POWER').sum(axis=1)
+    dfs_cluster = cluster_data_indep(dfs_cluster, '../data-process/data/')
     dfs_cluster.reset_index().to_csv(os.path.join(output_path, 'cluster_data.csv'), index=False)
 
 
@@ -96,17 +97,14 @@ def cluster_power_list_folder(cluster, turbine_id, input_path, output_path):
         dfs_cluster.reset_index().to_csv(os.path.join(output, 'cluster_data.csv'), index=False)
 
 
+if __name__ == '__main__':
+    turbine_id = list(range(102, 162))
+    cluster = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
+    cluster[42] = 1
+    output_path = '../data-process/data/cluster_power/'
 
-def paint_cluster_power(cluster_path, dfs):
-    dfs_cluster = pd.merge(dfs.values, axis=1, join='inner', on='C_TIME')
-    dfs = [pd.read_csv(os.path.join(cluster_path, file_path)).rename(columns={'C_ACTIVE_POWER':'power_'+file_path.split('/')[-1][-5], 'C_WS': 'C_WS_'+file_path.split('/')[-1][-5]}) for file_path in os.listdir(cluster_path)]
-    df_cluster = pd.DataFrame({df.columns[-1]: df.iloc[:, -1] for df in dfs})
-    df_cluster.insert(loc=0, column='C_TIME', value=dfs[0]['C_TIME'])
-    df_cluster.insert(loc=len(dfs)+1, column='SUM', value=df_cluster.iloc[:, 1:].sum(axis=1))
-    df_cluster.to_csv(os.path.join(cluster_path, 'cluster_data.csv'), index=False)
-
-
-
-
-
+    cluster_power_list_file(cluster, turbine_id,
+                            input_path='../data-process/data/output_filtered_csv_files/', output_path=output_path)
+    cluster_power_list_folder(cluster, turbine_id, input_path='../data-process/data/continuous_data/',
+                              output_path=output_path)
 

+ 53 - 15
data_analysis.py

@@ -14,8 +14,8 @@ import pandas as pd
 from scipy.signal import savgol_filter
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.cluster.hierarchy import linkage, fcluster
-
+from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
+from sklearn.metrics import silhouette_samples, silhouette_score
 
 def paint_others(y):
     """ 绘制其他数据 """
@@ -75,6 +75,14 @@ def hierarchical_clustering(data, threshold, similarity_func):
     Z = linkage(distance_matrix, method='ward')
     # 根据相似度阈值获取聚类结果
     clusters = fcluster(Z, t=threshold, criterion='distance')
+    # 画出层次聚类树形结构
+    fig = plt.figure(figsize=(5, 3))
+    dn = dendrogram(Z)
+    plt.show()
+    # clusters[42] = 1
+    silhouette = silhouette_samples(np.abs(distance_matrix), clusters, metric='euclidean')
+    silhouette1 = silhouette_score(np.abs(distance_matrix), clusters, metric='euclidean')
+    print(f"平均轮廓系数为:{silhouette1}, 单个样本的轮廓系数:{silhouette}")
     return clusters
 
 
@@ -106,8 +114,8 @@ class DataAnalysis:
         self.turbine = None
         # 风机的标号顺序
         self.turbine_id = list(range(102, 162))
-        b1b4 = [142, 143, 144, 145]
-        self.turbine_id = [id for id in self.turbine_id if id not in b1b4]
+        # b1b4 = [142, 143, 144, 145]
+        # self.turbine_id = [id for id in self.turbine_id if id not in b1b4]
         # 风机功率数据15分钟级别
         self.power_15min = None
         # 风机经纬度信息
@@ -119,7 +127,7 @@ class DataAnalysis:
         # 使用数据结束位置
         self.data_end = data_end
         # 导入数据
-        self.load_data()
+        self.load_data(normalize=True)
         # 计算风机功率差分
         self.compute_turbine_diff()
 
@@ -139,7 +147,7 @@ class DataAnalysis:
         self.turbine, turbines = {}, []
         for i in self.turbine_id:
             self.turbine[i] = pd.read_csv(turbine_path.format(i))[20:].reset_index(drop=True)
-        if normalize:
+        if normalize is True:
             self.normalize()
 
     def normalize(self):
@@ -160,10 +168,12 @@ class DataAnalysis:
         turbine_diff = []
         ori_turbine_pic = []
         for turbine_i in self.turbine_id:
-            diff_array = np.diff(
-                np.array(self.turbine[turbine_i]['C_ACTIVE_POWER'].values[self.data_start:self.data_end + 1]))
+            ori = np.array(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end + 1])
+            diff_array = np.diff(ori)
+            smoothness_value = np.std(diff_array)
+            print("turbine-{}的平滑度是:{}".format(turbine_i, round(smoothness_value, 2)))
             turbine_diff.append(diff_array)
-            ori_turbine_pic.append(self.turbine[turbine_i]['C_ACTIVE_POWER'].values[self.data_start:self.data_end])
+            ori_turbine_pic.append(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end])
         self.ori_turbine_pic = ori_turbine_pic
         self.turbine_diff = turbine_diff
 
@@ -288,9 +298,37 @@ class DataAnalysis:
         # # 显示图表
         # plt.savefig('analysis_img/turbine_cluster.png')
         # plt.show()
+
+        plt.figure(figsize=(20, 10))
+        cmap = plt.get_cmap('viridis')
+        linestyle= ['solid', 'dashed']
+        for i in range(max(self.cluster)):
+            cluster, cluster_fft = [], []
+            for j, item in enumerate(self.cluster):
+                if item == i + 1:
+                    cluster.append(self.ori_turbine_pic[j])
+                    cluster_fft.append(self.ori_turbine_fft[j])
+            cluster_fft = np.average(cluster_fft, axis=0)
+            cluster = np.average(cluster, axis=0)
+            diff_array = np.diff(cluster)
+            smoothness_value = np.std(diff_array)
+            print("聚类-{}的平滑度是:{}".format(i+1, smoothness_value))
+            color = cmap(i*200)
+            plt.subplot(2, 1, 1)
+            plt.plot([j for j in range(len(cluster))], cluster, color=color, label='cluster'+str(i), linestyle=linestyle[i])
+            plt.subplot(2, 1, 2)
+            plt.plot([j for j in range(len(cluster_fft))], cluster_fft, color=color, label='cluster'+str(i), linestyle=linestyle[i])
+
+        # 添加图例
+        plt.legend()
+        # 显示图形
+        plt.savefig('analysis_img/cluster/clusters.png')
+        plt.show()
         if paint_default:
             for i in range(max(self.cluster)):
-                self.paint_turbine_k(i + 1)
+                self.paint_turbine_k(i + 1)  # 画出聚类中每个风机的曲线
+
+
 
     def turbine_smooth(self, window_size=50):
         """
@@ -365,7 +403,7 @@ class DataAnalysis:
         # # 绘制时域信号
         # plt.subplot(4, 1, 1)
         # plt.plot(t, signal)
-        # plt.title(k)
+        # plt.title(self.turbine_id[k-1])
         #
         # # 绘制频域信号
         # plt.subplot(4, 1, 2)
@@ -379,7 +417,7 @@ class DataAnalysis:
         # plt.subplot(4, 1, 4)
         # plt.plot(t, signal_clean)
         #
-        # plt.savefig('analysis_img/fft/{}_turbine_fft.png'.format(k))
+        # plt.savefig('analysis_img/fft/{}_turbine_fft.png'.format(self.turbine_id[k-1]))
         # plt.show()
         return signal_clean
 
@@ -423,11 +461,11 @@ class DataAnalysis:
             cluster = hierarchical_clustering(self.turbine_diff, threshold=1.4,
                                               similarity_func=compute_pearsonr)  # 层次聚类
         else:
-            cluster = hierarchical_clustering(data, threshold=1.2,
+            cluster = hierarchical_clustering(data, threshold=0.8,
                                               similarity_func=compute_pearsonr)
         self.cluster = cluster
         # 在这里保存cluster变量
-        from cluster_power import cluster_power_list_file, cluster_power_list_folder
+        from cluster_analysis import cluster_power_list_file, cluster_power_list_folder
 
         output_path = '../data-process/data/cluster_power/'
 
@@ -442,4 +480,4 @@ data_analysis = DataAnalysis(data_length=9773,
                              data_end=9773)
 
 data_analysis.process_ori_data()
-data_analysis.paint_double(20, 21)
+data_analysis.paint_double(1, 56)