liudawei 1 jaar geleden
bovenliggende
commit
9664b90fa3
62 gewijzigde bestanden met toevoegingen van 0 en 724 verwijderingen
  1. 0 6
      README.md
  2. 0 110
      cluster_analysis.py
  3. 0 483
      data_analysis.py
  4. 0 78
      data_clean.py
  5. 0 21
      聚类结果说明/README.md
  6. 0 11
      聚类结果说明/cluster/README.md
  7. BIN
      聚类结果说明/cluster/cluster_1.png
  8. BIN
      聚类结果说明/cluster/cluster_2.png
  9. BIN
      聚类结果说明/cluster/cluster_3.png
  10. BIN
      聚类结果说明/cluster/cluster_4.png
  11. BIN
      聚类结果说明/fft/10_turbine_fft.png
  12. BIN
      聚类结果说明/fft/11_turbine_fft.png
  13. BIN
      聚类结果说明/fft/12_turbine_fft.png
  14. BIN
      聚类结果说明/fft/13_turbine_fft.png
  15. BIN
      聚类结果说明/fft/14_turbine_fft.png
  16. BIN
      聚类结果说明/fft/15_turbine_fft.png
  17. BIN
      聚类结果说明/fft/16_turbine_fft.png
  18. BIN
      聚类结果说明/fft/17_turbine_fft.png
  19. BIN
      聚类结果说明/fft/18_turbine_fft.png
  20. BIN
      聚类结果说明/fft/19_turbine_fft.png
  21. BIN
      聚类结果说明/fft/1_turbine_fft.png
  22. BIN
      聚类结果说明/fft/20_turbine_fft.png
  23. BIN
      聚类结果说明/fft/21_turbine_fft.png
  24. BIN
      聚类结果说明/fft/22_turbine_fft.png
  25. BIN
      聚类结果说明/fft/23_turbine_fft.png
  26. BIN
      聚类结果说明/fft/24_turbine_fft.png
  27. BIN
      聚类结果说明/fft/25_turbine_fft.png
  28. BIN
      聚类结果说明/fft/26_turbine_fft.png
  29. BIN
      聚类结果说明/fft/27_turbine_fft.png
  30. BIN
      聚类结果说明/fft/28_turbine_fft.png
  31. BIN
      聚类结果说明/fft/29_turbine_fft.png
  32. BIN
      聚类结果说明/fft/2_turbine_fft.png
  33. BIN
      聚类结果说明/fft/30_turbine_fft.png
  34. BIN
      聚类结果说明/fft/31_turbine_fft.png
  35. BIN
      聚类结果说明/fft/32_turbine_fft.png
  36. BIN
      聚类结果说明/fft/33_turbine_fft.png
  37. BIN
      聚类结果说明/fft/34_turbine_fft.png
  38. BIN
      聚类结果说明/fft/35_turbine_fft.png
  39. BIN
      聚类结果说明/fft/36_turbine_fft.png
  40. BIN
      聚类结果说明/fft/37_turbine_fft.png
  41. BIN
      聚类结果说明/fft/38_turbine_fft.png
  42. BIN
      聚类结果说明/fft/39_turbine_fft.png
  43. BIN
      聚类结果说明/fft/3_turbine_fft.png
  44. BIN
      聚类结果说明/fft/40_turbine_fft.png
  45. BIN
      聚类结果说明/fft/41_turbine_fft.png
  46. BIN
      聚类结果说明/fft/42_turbine_fft.png
  47. BIN
      聚类结果说明/fft/43_turbine_fft.png
  48. BIN
      聚类结果说明/fft/44_turbine_fft.png
  49. BIN
      聚类结果说明/fft/45_turbine_fft.png
  50. BIN
      聚类结果说明/fft/46_turbine_fft.png
  51. BIN
      聚类结果说明/fft/47_turbine_fft.png
  52. BIN
      聚类结果说明/fft/48_turbine_fft.png
  53. BIN
      聚类结果说明/fft/49_turbine_fft.png
  54. BIN
      聚类结果说明/fft/4_turbine_fft.png
  55. BIN
      聚类结果说明/fft/5_turbine_fft.png
  56. BIN
      聚类结果说明/fft/6_turbine_fft.png
  57. BIN
      聚类结果说明/fft/7_turbine_fft.png
  58. BIN
      聚类结果说明/fft/8_turbine_fft.png
  59. BIN
      聚类结果说明/fft/9_turbine_fft.png
  60. 0 15
      聚类结果说明/fft/README.md
  61. BIN
      聚类结果说明/turbine_cluster.png
  62. BIN
      聚类结果说明/风机标签与风机名称对应表.xlsx

+ 0 - 6
README.md

@@ -1,6 +0,0 @@
-## 功率预测系统
-
-该模块用于风机聚类,找到风机之间规律。
-
-
-

+ 0 - 110
cluster_analysis.py

@@ -1,110 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# time: 2023/5/11 14:43
-# file: cluster_analysis.py
-# author: David
-# company: shenyang JY
-
-import os
-import re
-import numpy as np
-import pandas as pd
-
-
def read_cfs(cfs, input_path, output_path, is_folder=False):
    """Aggregate per-turbine CSV files into one frame per cluster.

    For every cluster the ``C_ACTIVE_POWER`` columns of its turbines are
    summed and the ``C_WS`` columns are averaged, row by row.  Each cluster
    frame is written to ``<output_path>/cluster_<j>.csv`` without its first
    20 rows, while the complete frame is kept in the returned mapping.

    :param cfs: mapping ``{cluster label: [turbine id, ...]}``
    :param input_path: directory holding the turbine CSV files
    :param output_path: directory receiving the cluster CSV files
        (created when missing)
    :param is_folder: when True, ``input_path`` is expected to be named
        ``Continuous_Turbine_Data_<x>_...`` and the files are named
        ``turbine-<id>_<x>.csv``; otherwise they are named ``turbine-<id>.csv``
    :return: mapping ``{cluster label: DataFrame}`` with the columns
        ``C_TIME``, ``C_WS<j>`` and ``C_ACTIVE_POWER<j>``
    :raises ValueError: if the turbines of one cluster do not share the same
        ``C_TIME`` column, or if the segment number cannot be read from the
        directory name
    """
    os.makedirs(output_path, exist_ok=True)

    suffix = ''
    if is_folder:
        # normpath drops a trailing separator, so "a/b/" still yields "b".
        dirname = os.path.basename(os.path.normpath(input_path))
        found = re.findall('(?<=Continuous_Turbine_Data_).*?(?=_)', dirname)
        if not found:
            raise ValueError(
                "cannot read the segment number from directory name {!r}".format(dirname))
        suffix = "_{}".format(int(found[0]))

    dfs = {}
    for j, ids in cfs.items():
        frames = [pd.read_csv(os.path.join(input_path, f"turbine-{tid}{suffix}.csv"))
                  for tid in ids]
        time_series = frames[0]['C_TIME']
        # Work on an explicit copy so the accumulation never touches frames[0].
        dfj = frames[0].loc[:, ['C_TIME', 'C_WS', 'C_ACTIVE_POWER']].copy()
        for df in frames[1:]:
            if not df['C_TIME'].equals(time_series):
                print("风机之间的日期不一致!")
                raise ValueError("风机之间的日期不一致!")
            dfj['C_ACTIVE_POWER'] += df['C_ACTIVE_POWER']
            dfj['C_WS'] += df['C_WS']
        # Power is summed over the cluster, wind speed is averaged.
        dfj['C_WS'] /= len(frames)
        dfj.rename(columns={'C_ACTIVE_POWER': 'C_ACTIVE_POWER' + str(j),
                            'C_WS': 'C_WS' + str(j)}, inplace=True)
        # The first 20 rows are left out of the file but kept in the result.
        dfj[20:].to_csv(os.path.join(output_path, 'cluster_' + str(j) + '.csv'), index=False)
        dfs[j] = dfj
    return dfs
-
-
def get_cfs(cluster, turbine_id):
    """Group turbine ids by cluster label.

    :param cluster: sequence or array of integer cluster labels, one per
        entry of ``turbine_id``
    :param turbine_id: turbine ids, in the same order as ``cluster``
    :return: mapping ``{label: [turbine id, ...]}`` for every label from 1 to
        the largest label present; a label without members maps to ``[]``.
        An empty ``cluster`` yields ``{}``.
    """
    # A plain list would make ``cluster == j`` a single bool, so convert first.
    labels = np.asarray(cluster)
    cfs = {}
    if labels.size == 0:
        return cfs
    for j in range(1, int(labels.max()) + 1):
        members = np.where(labels == j)[0]  # positions whose label is j
        cfs[j] = [turbine_id[k] for k in members]
    for key, value in cfs.items():
        print("第{}组:{}".format(key, value))
    return cfs
-
-
-def cluster_data_indep(dfs_cluster, root_path):
-    df_power = pd.read_csv(root_path + "power.csv")
-    df_nwp = pd.read_csv(root_path + "NWP.csv",
-                         usecols=["C_TIME", "C_WS100", "C_WS170"])
-    df_all = pd.concat([df_power.set_index("C_TIME"), df_nwp.set_index("C_TIME"),
-                        dfs_cluster], axis=1, join="inner")
-    return df_all
-
-
def cluster_power_list_file(cluster, turbine_id, input_path, output_path,
                            root_path='../data-process/data/'):
    """Sum the power of each cluster from a flat list of ``turbine-*.csv`` files.

    Writes one ``cluster_<j>.csv`` per cluster plus ``cluster_data.csv``,
    which holds all clusters, their total (``SUM``) and the station power /
    NWP columns added by ``cluster_data_indep``.

    :param cluster: cluster label of every turbine
    :param turbine_id: turbine ids, in the same order as ``cluster``
    :param input_path: directory containing the ``turbine-<id>.csv`` files
    :param output_path: directory receiving the generated CSV files
    :param root_path: directory containing ``power.csv`` and ``NWP.csv``;
        defaults to the location that used to be hard-coded here
    :return: None
    """
    os.makedirs(output_path, exist_ok=True)

    cfs = get_cfs(cluster, turbine_id)
    dfs = read_cfs(cfs, input_path, output_path)
    dfs_cluster = pd.concat([df.set_index("C_TIME") for df in dfs.values()], join='inner', axis=1)
    # Total power over every cluster column.
    dfs_cluster['SUM'] = dfs_cluster.filter(like='C_ACTIVE_POWER').sum(axis=1)
    dfs_cluster = cluster_data_indep(dfs_cluster, root_path)
    dfs_cluster.reset_index().to_csv(os.path.join(output_path, 'cluster_data.csv'), index=False)
-
-
def cluster_power_list_folder(cluster, turbine_id, input_path, output_path):
    """Sum the power of each cluster for every sub-directory of ``input_path``.

    Each sub-directory is processed by ``read_cfs`` in folder mode and its
    results are written below ``<output_path>/<sub-directory name>/``
    (``cluster_<j>.csv`` files plus a combined ``cluster_data.csv``).

    :param cluster: cluster label of every turbine
    :param turbine_id: turbine ids, in the same order as ``cluster``
    :param input_path: directory whose sub-directories hold the turbine files
    :param output_path: root directory receiving the generated CSV files
    :return: None
    """
    os.makedirs(output_path, exist_ok=True)
    cfs = get_cfs(cluster, turbine_id)
    # Sorted for a reproducible processing order.
    for dirname in sorted(os.listdir(input_path)):
        con = os.path.join(input_path, dirname)
        if not os.path.isdir(con):
            # Stray files next to the data folders are not turbine segments.
            continue
        output = os.path.join(output_path, dirname)
        dfs = read_cfs(cfs, con, output, True)
        dfs_cluster = pd.concat([df.set_index("C_TIME") for df in dfs.values()], join='inner', axis=1)
        dfs_cluster.reset_index().to_csv(os.path.join(output, 'cluster_data.csv'), index=False)
-
-
if __name__ == '__main__':
    # Turbine ids 102..161; the labels below are expected to line up with
    # them position by position.
    turbine_id = list(range(102, 162))
    # Hand-written cluster labels (1 or 2), one per turbine.
    cluster = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    # Manual override: force the turbine at position 42 into cluster 1.
    cluster[42] = 1
    output_path = '../data-process/data/cluster_power/'

    # Aggregate the flat file list first, then every continuous-data folder.
    cluster_power_list_file(cluster, turbine_id,
                            input_path='../data-process/data/output_filtered_csv_files/', output_path=output_path)
    cluster_power_list_folder(cluster, turbine_id, input_path='../data-process/data/continuous_data/',
                              output_path=output_path)
-

+ 0 - 483
data_analysis.py

@@ -1,483 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-@Author:Lijiaxing
- 
-@File:data_analysis.py
-@Time:2023/4/24 15:16
-
-"""
-import os.path
-
-import pandas as pd
-# from mpl_toolkits.basemap import Basemap
-from scipy.signal import savgol_filter
-import numpy as np
-import matplotlib.pyplot as plt
-from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
-from sklearn.metrics import silhouette_samples, silhouette_score
-
def paint_others(y):
    """Plot an arbitrary sequence of values against its index.

    :param y: sequence of values to draw
    :return: None
    """
    # The x axis is the position of each value; ``range(y)`` on a sequence
    # would raise TypeError, so the length is used.
    plt.plot(range(len(y)), y)
    # Axis labels
    plt.xlabel('x')
    plt.ylabel('y')

    # Show the figure
    plt.show()
-
-
def compute_cos_similarity(a, b):
    """
    Compute the cosine similarity of two vectors.

    :param a: vector a
    :param b: vector b
    :return: cosine similarity; 0.0 when either vector has zero norm, for
        which the similarity is undefined and the plain formula would
        produce nan
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
-
-
def compute_pearsonr(a):
    """
    Build the Pearson correlation matrix of a set of samples.

    :param a: n*m matrix, one sample per row (n samples of dimension m)
    :return: n*n matrix of pairwise correlation coefficients
    """
    correlation = np.corrcoef(a)
    return correlation
-
-
def compute_distance(a, b):
    """
    Compute the Euclidean distance between two vectors.

    :param a: first vector (array or plain sequence)
    :param b: second vector (array or plain sequence)
    :return: Euclidean distance between the two vectors
    """
    # Plain lists do not support ``a - b``; convert before subtracting.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))
-
-
def hierarchical_clustering(data, threshold, similarity_func):
    """
    Hierarchical clustering built on scipy's ``linkage`` and ``fcluster``.

    The dendrogram is displayed and, when the result allows it, the
    silhouette coefficients are printed.

    :param data: n*m matrix, one sample per row
    :param threshold: distance at which the linkage tree is cut
        (``fcluster`` with ``criterion='distance'``); may be larger than 1
    :param similarity_func: callable returning the n*n similarity matrix of
        ``data``; ``1 - similarity`` is used as the distance matrix, so a
        function returning distances needs this body to be adapted
    :return: array with the cluster label of every sample
    """
    # Similarity matrix of the samples
    similarity_matrix = similarity_func(data)

    # Turn similarities into distances
    distance_matrix = 1 - similarity_matrix

    # NOTE(review): linkage() receives the square matrix, so scipy treats
    # every row as an observation vector rather than as precomputed
    # distances. Kept as-is because the thresholds used by the callers were
    # chosen against this behaviour.
    Z = linkage(distance_matrix, method='ward')
    # Cut the tree at the requested distance
    clusters = fcluster(Z, t=threshold, criterion='distance')
    # Show the dendrogram
    plt.figure(figsize=(5, 3))
    dendrogram(Z)
    plt.show()

    # The silhouette is only defined for 2 .. n_samples - 1 clusters;
    # calling it outside that range raises ValueError.
    n_clusters = len(set(clusters))
    if 2 <= n_clusters <= len(clusters) - 1:
        features = np.abs(distance_matrix)
        silhouette = silhouette_samples(features, clusters, metric='euclidean')
        silhouette1 = silhouette_score(features, clusters, metric='euclidean')
        print(f"平均轮廓系数为:{silhouette1}, 单个样本的轮廓系数:{silhouette}")
    else:
        print(f"聚类数量为{n_clusters},无法计算轮廓系数")
    return clusters
-
-
-class DataAnalysis:
-    """
-    数据分析类
-    """
-
-    def __init__(self, data_length, data_start, data_end):
-        """
-        初始化
-        :param data_length: 分析数据段长度
-        :param data_start: 分析数据段开始位置
-        :param data_end: 分析数据段结束位置
-        """
-        # 原始风机功率数据傅里叶变换滤波后的数据
-        self.ori_turbine_fft = None
-        # 原始风机功率数据片段
-        self.ori_turbine_pic = None
-        # 聚类结果
-        self.cluster = None
-        # 风机功率差分平滑后的结果
-        self.smooth_turbine_diff = None
-        # 风机功率差分变化情况
-        self.diff_change = None
-        # 风机功率差分
-        self.turbine_diff = None
-        # 全部风机数据
-        self.turbine = None
-        # 风机的标号顺序
-        self.turbine_id = list(range(102, 162))
-        # b1b4 = [142, 143, 144, 145]
-        # self.turbine_id = [id for id in self.turbine_id if id not in b1b4]
-        # 风机功率数据15分钟级别
-        self.power_15min = None
-        # 风机经纬度信息
-        self.info = None
-        # 使用数据长度
-        self.data_length = data_length
-        # 使用数据开始位置
-        self.data_start = data_start
-        # 使用数据结束位置
-        self.data_end = data_end
-        # 导入数据
-        self.load_data(normalize=True)
-        # 计算风机功率差分
-        self.compute_turbine_diff()
-
-    def load_data(self, normalize=False):
-        """
-        加载数据
-        :return:
-        """
-        self.info = pd.read_csv('../data-process/data/风机信息.csv', encoding='utf-8')
-        # power_15min = pd.read_csv('../data/power_15min.csv')
-        # for i in range(len(power_15min)):
-        #     if power_15min.loc[i, 'C_REAL_VALUE'] == -9999:
-        #         # 方便在曲线中看出缺失数据位置
-        #         power_15min.loc[i, 'C_REAL_VALUE'] = -34.56789
-        # self.power_15min = power_15min
-        turbine_path = '../data-process/data/output_filtered_csv_files/turbine-{}.csv'
-        self.turbine, turbines = {}, []
-        for i in self.turbine_id:
-            self.turbine[i] = pd.read_csv(turbine_path.format(i))[20:].reset_index(drop=True)
-        if normalize is True:
-            self.normalize()
-
-    def normalize(self):
-        turbines = [self.turbine[i].values[:, 1:].astype(np.float32) for i in self.turbine_id]
-        turbines = np.vstack(turbines)
-        mean, std = np.mean(turbines, axis=0), np.std(turbines, axis=0)
-        for i in self.turbine_id:
-            c_time = self.turbine[i]['C_TIME']
-            self.turbine[i] = (self.turbine[i].iloc[:, 1:] - mean) / std
-            self.turbine[i].insert(loc=0, column='C_TIME', value=c_time)
-        return self.turbine
-
-    def compute_turbine_diff(self):
-        """
-        计算风机功率差分
-        :return:
-        """
-        turbine_diff = []
-        ori_turbine_pic = []
-        for turbine_i in self.turbine_id:
-            ori = np.array(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end + 1])
-            diff_array = np.diff(ori)
-            smoothness_value = np.std(diff_array)
-            print("turbine-{}的平滑度是:{}".format(turbine_i, round(smoothness_value, 2)))
-            turbine_diff.append(diff_array)
-            ori_turbine_pic.append(self.turbine[turbine_i]['C_WS'].values[self.data_start:self.data_end])
-        self.ori_turbine_pic = ori_turbine_pic
-        self.turbine_diff = turbine_diff
-
-        diff_change = []
-        for diff_i in turbine_diff:
-            single_diff_change = []
-            for diff_i_i in diff_i:
-                if diff_i_i > 0:
-                    single_diff_change.append(1)
-                elif diff_i_i < 0:
-                    single_diff_change.append(-1)
-                else:
-                    single_diff_change.append(0)
-            diff_change.append(single_diff_change)
-        self.diff_change = diff_change
-        self.ori_turbine_fft = [self.turbine_fft(i + 1) for i in range(len(self.ori_turbine_pic))]
-
-        # 平滑
-        self.turbine_smooth(window_size=21)
-
    def paint_map(self):
        """
        Draw the turbine coordinates on a Basemap world map.

        NOTE(review): ``Basemap`` is not imported in this file (the
        ``mpl_toolkits.basemap`` import at the top is commented out), so
        calling this method raises ``NameError`` until that import is
        restored.

        :return: None
        """
        lats = self.info['纬度'].values
        lons = self.info['经度'].values
        map = Basemap()

        # Draw coastlines and country borders
        map.drawcoastlines()
        map.drawcountries()

        # Draw meridians and parallels every 30 degrees
        map.drawmeridians(range(0, 360, 30))
        map.drawparallels(range(-90, 90, 30))

        # Convert the coordinates through the map object and plot the points

        x, y = map(lons, lats)
        map.plot(x, y, 'bo', markersize=10)

        # Show the figure
        plt.show()
-
-    def paint_power15min(self):
-        """
-        绘制15分钟功率曲线
-        :return:
-        """
-
-        plt.plot(self.power_15min['C_REAL_VALUE'])
-
-        # 设置图表标题和轴标签
-        plt.title('Data Time Change Curve')
-        plt.xlabel('Date')
-        plt.ylabel('Value')
-
-        # 显示图表
-        plt.show()
-
-    def paint_lats_lons(self):
-        """
-        绘制经纬度图
-        :return:
-        """
-        x = self.info['纬度'].values
-        y = self.info['经度'].values
-
-        # 绘制散点图
-        fig, ax = plt.subplots()
-        plt.scatter(x, y)
-
-        for i, txt in enumerate(self.info['id'].values):
-            ax.annotate(txt, (x[i], y[i]))
-
-        # 设置图表标题和轴标签
-        plt.xlabel('lats')
-        plt.ylabel('lons')
-
-        # 显示图表
-        plt.show()
-
-    def similarity_score(self, turbine_diff, threshold=0.5):
-        """
-        使用余弦相似度计算相似度分数并返回相似度大于阈值的index矩阵
-        :param turbine_diff: 需要计算相似的矩阵,数据格式n*m,n为数据条数,m为数据维数
-        :param threshold: 相似度阈值
-        :return: 返回相似计算后的矩阵
-        """
-        similarity = {i: [] for i in range(49)}
-        similarity_index = {i: [] for i in range(49)}
-        for turbine_i in range(49):
-            for turbine_j in range(49):
-                cos_similarity = compute_cos_similarity(turbine_diff[turbine_i], turbine_diff[turbine_j])
-                similarity[turbine_i].append(cos_similarity)
-                if cos_similarity > threshold:
-                    similarity_index[turbine_i].append(turbine_j)
-        return similarity_index
-
-    def paint_turbine(self, paint_default=True):
-        """
-        绘制风机地理位置图
-        :param paint_default:默认True,绘制聚类后每个类别的数据折线图
-        :return: None
-        """
-
-        # y = self.info['纬度'].values
-        # x = self.info['经度'].values
-        #
-        # fig, ax = plt.subplots(figsize=(15, 15))
-        #
-        # plt.scatter(x, y, c=self.cluster)
-        # for i, txt in enumerate(self.info['C_ID'].values):
-        #     ax.annotate(txt, (x[i], y[i]))
-
-        # 设置图表标题和轴标签
-        # plt.xlabel('lons')
-        # plt.ylabel('lats')
-        # plt.legend()
-        #
-        # # 显示图表
-        # plt.savefig('analysis_img/turbine_cluster.png')
-        # plt.show()
-
-        plt.figure(figsize=(20, 10))
-        cmap = plt.get_cmap('viridis')
-        linestyle= ['solid', 'dashed']
-        for i in range(max(self.cluster)):
-            cluster, cluster_fft = [], []
-            for j, item in enumerate(self.cluster):
-                if item == i + 1:
-                    cluster.append(self.ori_turbine_pic[j])
-                    cluster_fft.append(self.ori_turbine_fft[j])
-            cluster_fft = np.average(cluster_fft, axis=0)
-            cluster = np.average(cluster, axis=0)
-            diff_array = np.diff(cluster)
-            smoothness_value = np.std(diff_array)
-            print("聚类-{}的平滑度是:{}".format(i+1, smoothness_value))
-            color = cmap(i*200)
-            plt.subplot(2, 1, 1)
-            plt.plot([j for j in range(len(cluster))], cluster, color=color, label='cluster'+str(i), linestyle=linestyle[i])
-            plt.subplot(2, 1, 2)
-            plt.plot([j for j in range(len(cluster_fft))], cluster_fft, color=color, label='cluster'+str(i), linestyle=linestyle[i])
-
-        # 添加图例
-        plt.legend()
-        # 显示图形
-        plt.savefig('analysis_img/cluster/clusters.png')
-        plt.show()
-        if paint_default:
-            for i in range(max(self.cluster)):
-                self.paint_turbine_k(i + 1)  # 画出聚类中每个风机的曲线
-
-
-
-    def turbine_smooth(self, window_size=50):
-        """
-        使用滑动平均平滑数据。
-
-        参数:
-        data -- 需要平滑的数据,numpy数组类型
-        window_size -- 滑动窗口大小,整数类型
-
-        返回值:
-        smooth_data -- 平滑后的数据,numpy数组类型
-        """
-
-        # weights = np.repeat(1.0, window_size) / window_size
-        smooth_data = []
-        for turbine_diff_i in self.turbine_diff:
-            smooth_y = savgol_filter(turbine_diff_i, window_length=window_size, polyorder=3)
-            smooth_data.append(smooth_y)
-        #     smooth_data.append(np.convolve(turbine_diff_i, weights, 'valid'))
-        self.smooth_turbine_diff = smooth_data
-
-    def paint_turbine_k(self, k):
-        """
-        绘制第k聚类的风机数据折线图
-        :param k:
-        :return:
-        """
-        pic_label = []
-        y = []
-        plt.figure(figsize=(20, 10))
-        cmap = plt.get_cmap('viridis')
-        for i, item in enumerate(self.cluster):
-            if item == k:
-                pic_label.append('turbine-' + str(self.turbine_id[i]))
-                y.append(self.ori_turbine_fft[i])
-        for i in range(len(y)):
-            color = cmap(i / 10)
-            plt.plot([j for j in range(len(y[i]))], y[i], color=color, label=pic_label[i])
-        # 添加标签和标题
-        plt.xlabel('x')
-        plt.ylabel('y')
-        plt.title('Cluster {}'.format(k))
-
-        # 添加图例
-        plt.legend()
-        # 显示图形
-        plt.savefig('analysis_img/cluster/cluster_{}.png'.format(k))
-        plt.show()
-
-    def turbine_fft(self, k):
-        """
-        对第k台原始风机数据进行傅里叶变换,并绘制变换前后曲线
-        :param k: 数据读入时的风机顺序index,从1开始
-        :return: 傅里叶变换清洗后的数据,数据格式
-        """
-        y = self.ori_turbine_pic
-        t = np.linspace(0, 1, self.data_length)
-        signal = y[k - 1]
-
-        # 进行傅里叶变换
-        freq = np.fft.fftfreq(len(signal), t[1] - t[0])
-        spectrum = np.fft.fft(signal)
-        spectrum_abs = np.abs(spectrum)
-        threshold = np.percentile(spectrum_abs, 98)
-        indices = spectrum_abs > threshold
-        spectrum_clean = indices * spectrum
-
-        # 进行傅里叶逆变换
-        signal_clean = np.fft.ifft(spectrum_clean)
-        # plt.figure(figsize=(20, 10))
-        #
-        # # 绘制时域信号
-        # plt.subplot(4, 1, 1)
-        # plt.plot(t, signal)
-        # plt.title(self.turbine_id[k-1])
-        #
-        # # 绘制频域信号
-        # plt.subplot(4, 1, 2)
-        # plt.plot(freq, np.abs(spectrum))
-        #
-        # # 绘制过滤后的频域信号
-        # plt.subplot(4, 1, 3)
-        # plt.plot(freq, np.abs(spectrum_clean))
-        #
-        # # 绘制过滤后的时域信号
-        # plt.subplot(4, 1, 4)
-        # plt.plot(t, signal_clean)
-        #
-        # plt.savefig('analysis_img/fft/{}_turbine_fft.png'.format(self.turbine_id[k-1]))
-        # plt.show()
-        return signal_clean
-
-    def paint_double(self, i, j):
-        """
-        绘制两台风机的数据变换对比
-        :param i: 风机数据读入时数据编号,从1开始
-        :param j: 风机数据读入时数据编号,从1开始
-        :return:
-        """
-        y = self.ori_turbine_fft
-        x = [index for index in range(self.data_length)]
-        data_i = y[i - 1]
-        data_j = y[j - 1]
-
-        plt.figure(figsize=(20, 10))
-        plt.plot(x, data_i, label='turbine {}'.format(self.turbine_id[i - 1]), linestyle='solid')
-        plt.plot(x, data_j, label='turbine {}'.format(self.turbine_id[j - 1]), linestyle='dashed')
-
-        plt.title('{} and {}'.format(i, j))
-        plt.legend()
-        plt.savefig('analysis_img/{}_{}_turbine.png'.format(self.turbine_id[i - 1], self.turbine_id[j - 1]))
-        plt.show()
-
    def process_ori_data(self):
        """
        Cluster the turbines on their FFT-filtered series, then plot the result.

        :return: None
        """
        # Clustering also writes the per-cluster CSV files (see turbine_clusters).
        self.turbine_clusters(self.ori_turbine_fft)
        self.paint_turbine()
-
-    def turbine_clusters(self, data=None):
-        """
-        风机数据聚类,聚类信息保存在self.cluster中
-        :param data: 默认为空,也可以使用其他数据聚类,并体现在绘图中,
-        数据格式:二维数据n*m,n为数据条数,m为每条数据维数
-        :return: None
-        """
-        if data is None:
-            cluster = hierarchical_clustering(self.turbine_diff, threshold=1.4,
-                                              similarity_func=compute_pearsonr)  # 层次聚类
-        else:
-            cluster = hierarchical_clustering(data, threshold=0.8,
-                                              similarity_func=compute_pearsonr)
-        self.cluster = cluster
-        # 在这里保存cluster变量
-        from cluster_analysis import cluster_power_list_file, cluster_power_list_folder
-
-        output_path = '../data-process/data/cluster_power/'
-
-        cluster_power_list_file(self.cluster, self.turbine_id,
-                                input_path='../data-process/data/output_filtered_csv_files/', output_path=output_path)
-        cluster_power_list_folder(self.cluster, self.turbine_id, input_path='../data-process/data/continuous_data/',
-                                  output_path=output_path)
-
-
# NOTE: these statements sit at module level without a ``__main__`` guard,
# so the whole analysis runs as soon as the module is imported.
data_analysis = DataAnalysis(data_length=9773,
                             data_start=0,
                             data_end=9773)

# Cluster on the FFT-filtered series and plot the clusters.
data_analysis.process_ori_data()
# Compare the 1st and the 56th loaded turbine.
data_analysis.paint_double(1, 56)

+ 0 - 78
data_clean.py

@@ -1,78 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-@Author:Lijiaxing
- 
-@File:data_clean.py
-@Time:2023/4/26 18:06
-
-"""
-import os.path
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-
-
def paint_data(clean_data, clean_index=1):
    """Plot a cleaned series against its index and save it as ``data_<clean_index>.png``.

    :param clean_data: sequence of values to draw
    :param clean_index: number used in the title and in the file name
    :return: None
    """
    plt.figure(figsize=(20, 10))
    plt.title('clean_{}'.format(clean_index))

    # Draw the curve against the sample position
    positions = list(range(len(clean_data)))
    plt.plot(positions, clean_data, color='red', label='clean_data')

    plt.savefig('data_{}.png'.format(clean_index))
    plt.show()
-
-
class clean_file:
    """
    Replace sentinel values in a CSV column by interpolated values.
    """

    def __init__(self, output_path='./'):
        """
        :param output_path: directory (no file name) under which the
            ``clean_data`` output folder is created
        """
        self.data = []
        output_path = os.path.join(output_path, 'clean_data')
        os.makedirs(output_path, exist_ok=True)
        self.output_path = output_path

    def clean_data(self, file_path, clean_name, clean_value, multi_value=False, clean_index=1, paint=True):
        """
        Clean one column of a CSV file.

        Every occurrence of the sentinel value(s) in ``clean_name`` is
        replaced by NaN and then filled by interpolation. The cleaned table
        is written to ``<output_path>/clean_<clean_index>.csv``.

        :param file_path: CSV file to clean
        :param clean_name: name of the column to clean
        :param clean_value: sentinel value, or list of values when
            ``multi_value`` is True
        :param multi_value: True when ``clean_value`` is a list
        :param clean_index: number used in the output file name
        :param paint: when True, the cleaned column is plotted with
            ``paint_data``
        :return: None
        """
        data = pd.read_csv(file_path)
        sentinels = list(clean_value) if multi_value else [clean_value]
        column = data[clean_name]
        # Series.mask assigns NaN through a new Series. The former chained
        # ``data[col][mask] = nan`` does not modify ``data`` when pandas
        # Copy-on-Write is active.
        data[clean_name] = column.mask(column.isin(sentinels)).interpolate()
        data.to_csv(os.path.join(self.output_path, 'clean_{}.csv'.format(clean_index)), index=False)
        if paint:
            paint_data(data[clean_name].values, clean_index)
-
-
# Usage example. NOTE: it sits at module level without a ``__main__`` guard,
# so it runs whenever the module is imported.
cleaner = clean_file(output_path='Dataset_training/power/')
# Clean power_0.csv .. power_5.csv, treating -9999.0 and -99 as missing values.
for i in range(6):
    cleaner.clean_data(file_path='Dataset_training/power/power_{}.csv'.format(i), clean_name='C_REAL_VALUE',
                       clean_value=[-9999.0, -99], multi_value=True,
                       clean_index=i)

+ 0 - 21
聚类结果说明/README.md

@@ -1,21 +0,0 @@
-# 聚类结果
-
----
-
-## 目录结构
-
-|------ cluster		                           			            # 聚类结果目录
-|------------- cluster_1.png	       					        # 聚类数据趋势图片,index为类别标签
-|------------- cluster_2.png
-|------------- cluster_3.png
-|------------- cluster_4.png
-|------ fft                                           			        # 傅里叶变换滤波结果目录
-|------------- index_turbine_fft.png 				    # 傅里叶变换滤波前后数据对比
-|------------- index_turbine_fft.png
-|------------- index_turbine_fft.png
-|------------- ......
-|------ turbine_cluster.png               			        # 聚类体现在经纬度位置图中的表现
-|------ 风机标签与风机名称对应表.xlsx               # 使用的index与风机名称对应表
-
-
-

+ 0 - 11
聚类结果说明/cluster/README.md

@@ -1,11 +0,0 @@
-# 聚类文件说明
-
----
-
-![样例图片,对应类目1](./cluster_1.png)
-
-## Title:1对应类目编号
-
-## 图例:该类目下对应的风机编号,从1开始,1-49
-
-## 纵坐标:对应功率值

BIN
聚类结果说明/cluster/cluster_1.png


BIN
聚类结果说明/cluster/cluster_2.png


BIN
聚类结果说明/cluster/cluster_3.png


BIN
聚类结果说明/cluster/cluster_4.png


BIN
聚类结果说明/fft/10_turbine_fft.png


BIN
聚类结果说明/fft/11_turbine_fft.png


BIN
聚类结果说明/fft/12_turbine_fft.png


BIN
聚类结果说明/fft/13_turbine_fft.png


BIN
聚类结果说明/fft/14_turbine_fft.png


BIN
聚类结果说明/fft/15_turbine_fft.png


BIN
聚类结果说明/fft/16_turbine_fft.png


BIN
聚类结果说明/fft/17_turbine_fft.png


BIN
聚类结果说明/fft/18_turbine_fft.png


BIN
聚类结果说明/fft/19_turbine_fft.png


BIN
聚类结果说明/fft/1_turbine_fft.png


BIN
聚类结果说明/fft/20_turbine_fft.png


BIN
聚类结果说明/fft/21_turbine_fft.png


BIN
聚类结果说明/fft/22_turbine_fft.png


BIN
聚类结果说明/fft/23_turbine_fft.png


BIN
聚类结果说明/fft/24_turbine_fft.png


BIN
聚类结果说明/fft/25_turbine_fft.png


BIN
聚类结果说明/fft/26_turbine_fft.png


BIN
聚类结果说明/fft/27_turbine_fft.png


BIN
聚类结果说明/fft/28_turbine_fft.png


BIN
聚类结果说明/fft/29_turbine_fft.png


BIN
聚类结果说明/fft/2_turbine_fft.png


BIN
聚类结果说明/fft/30_turbine_fft.png


BIN
聚类结果说明/fft/31_turbine_fft.png


BIN
聚类结果说明/fft/32_turbine_fft.png


BIN
聚类结果说明/fft/33_turbine_fft.png


BIN
聚类结果说明/fft/34_turbine_fft.png


BIN
聚类结果说明/fft/35_turbine_fft.png


BIN
聚类结果说明/fft/36_turbine_fft.png


BIN
聚类结果说明/fft/37_turbine_fft.png


BIN
聚类结果说明/fft/38_turbine_fft.png


BIN
聚类结果说明/fft/39_turbine_fft.png


BIN
聚类结果说明/fft/3_turbine_fft.png


BIN
聚类结果说明/fft/40_turbine_fft.png


BIN
聚类结果说明/fft/41_turbine_fft.png


BIN
聚类结果说明/fft/42_turbine_fft.png


BIN
聚类结果说明/fft/43_turbine_fft.png


BIN
聚类结果说明/fft/44_turbine_fft.png


BIN
聚类结果说明/fft/45_turbine_fft.png


BIN
聚类结果说明/fft/46_turbine_fft.png


BIN
聚类结果说明/fft/47_turbine_fft.png


BIN
聚类结果说明/fft/48_turbine_fft.png


BIN
聚类结果说明/fft/49_turbine_fft.png


BIN
聚类结果说明/fft/4_turbine_fft.png


BIN
聚类结果说明/fft/5_turbine_fft.png


BIN
聚类结果说明/fft/6_turbine_fft.png


BIN
聚类结果说明/fft/7_turbine_fft.png


BIN
聚类结果说明/fft/8_turbine_fft.png


BIN
聚类结果说明/fft/9_turbine_fft.png


+ 0 - 15
聚类结果说明/fft/README.md

@@ -1,15 +0,0 @@
-# FFT 图片说明
-
----
-
-![样例图片,对应风机1](1_turbine_fft.png)
-
-## Title:1对应风机编号
-
-## 图1:原始数据曲线
-
-## 图2:使用傅里叶变换后将原始数据曲线从时域转换为频域
-
-## 图3:频域曲线过滤噪声后结果
-
-## 图4:将滤噪后的曲线转换回时域的结果

BIN
聚类结果说明/turbine_cluster.png


BIN
聚类结果说明/风机标签与风机名称对应表.xlsx