# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import random
import operator
import math
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# # Draw grid lines beneath the plotted curves
# plt.rcParams['axes.axisbelow'] = False
plt.style.use('fivethirtyeight')  # 'ggplot'

from PlotFunctions import plot_random_init_iris_sepal, plot_random_init_iris_petal, \
    plot_cluster_iris_sepal, plot_cluster_iris_petal
from sklearn.datasets import load_iris


def load_iris_data():
    data = load_iris()
    # Feature columns of the iris dataset
    features = data['data']
    # Labels of the iris dataset
    target = data['target']
    # Add a trailing dimension so the labels can be concatenated to the features
    target = target[:, np.newaxis]
    target_names = data['target_names']
    target_dicts = dict(zip(np.unique(target), target_names))
    # Copy the feature-name list to avoid mutating the original in place
    feature_names = data['feature_names'].copy()  # or deepcopy(data['feature_names'])
    feature_names.append('label')
    df_full = pd.DataFrame(data=np.concatenate([features, target], axis=1),
                           columns=feature_names)
    # Save the assembled dataset
    df_full.to_csv(os.path.join(os.getcwd(), 'iris_data.csv'), index=False)
    columns = list(df_full.columns)
    features = columns[:len(columns) - 1]
    class_labels = list(df_full[columns[-1]])
    df = df_full[features]
    return df_full, df, class_labels, target_dicts


def load_env_data():
    path = '../xiangzhou/features.csv'
    env = pd.read_csv(path)
    return env


# Initialise the membership matrix U
def init_fuzzy_matrix(n_sample, c):
    """
    Randomly initialise the membership matrix. For each sample the c membership
    degrees sum to 1; the largest one is then snapped to 1 and the others to 0,
    i.e. a hard (one-hot) initial assignment.
    ----
    param n_sample: number of samples
    param c: number of clusters
    """
    # Membership matrix over all samples, shape = [n_sample, c]
    fuzzy_matrix = []
    for i in range(n_sample):
        # Generate c random numbers; random.random() draws a float in [0, 1).
        random_list = [random.random() for _ in range(c)]
        sum_of_random = sum(random_list)
        # Normalised random list = the fuzzy memberships of a single sample
        norm_random_list = [x / sum_of_random for x in random_list]
        # Index of the largest membership
        one_of_random_index = norm_random_list.index(max(norm_random_list))
        for j in range(len(norm_random_list)):
            if j == one_of_random_index:
                norm_random_list[j] = 1
            else:
                norm_random_list[j] = 0
        fuzzy_matrix.append(norm_random_list)
    return fuzzy_matrix


# Compute the FCM cluster centers
def cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m):
    """
    param df: feature part of the dataset, without the label column
    param fuzzy_matrix: membership matrix
    param n_sample: number of samples
    param c: number of clusters
    param m: fuzziness (weighting) exponent
    """
    # The * operator unpacks the rows, so zip(*fuzzy_matrix) iterates over the
    # columns of fuzzy_matrix; list(zip(*fuzzy_matrix)) holds one tuple per column.
    fuzzy_mat_ravel = list(zip(*fuzzy_matrix))
    cluster_centers = []
    # One center per cluster
    for j in range(c):
        # Memberships of all samples for cluster j (one column of the matrix)
        fuzzy_one_dim_list = list(fuzzy_mat_ravel[j])
        # Raise the memberships to the power m
        m_fuzzy_one_dim_list = [p ** m for p in fuzzy_one_dim_list]
        # Sum of the powered memberships: the denominator of the center formula
        denominator = sum(m_fuzzy_one_dim_list)
        numerator_list = []
        # Accumulate the numerator over all samples
        for i in range(n_sample):
            # Take one sample
            sample = list(df.iloc[i])
            # Numerator term: sample multiplied by its powered membership
            mul_sample_fuzzy = [m_fuzzy_one_dim_list[i] * val for val in sample]
            numerator_list.append(mul_sample_fuzzy)
        # Sum the numerator feature-wise and divide by the denominator
        numerator = map(sum, list(zip(*numerator_list)))
        cluster_center = [val / denominator for val in numerator]
        cluster_centers.append(cluster_center)
    return cluster_centers
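

# The loop in cal_cluster_centers implements the standard FCM center update
#     v_j = sum_i(u_ij^m * x_i) / sum_i(u_ij^m).
# Below is a minimal vectorised sketch of the same update, for illustration only:
# it is not called by the pipeline and assumes df holds numeric columns only.
def cal_cluster_centers_vectorized(df, fuzzy_matrix, m):
    X = df.values.astype(float)                # [n_sample, n_features]
    U = np.asarray(fuzzy_matrix, dtype=float)  # [n_sample, c]
    u_m = U ** m
    # Numerator: [c, n_features]; denominator: [c, 1]; broadcasting divides row-wise.
    return (u_m.T @ X) / u_m.sum(axis=0)[:, np.newaxis]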


# Update the membership matrix (cf. formula (8)):
#     u_ij = 1 / sum_k (d_ij / d_ik)^(2/(m-1))
def update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers):
    # Exponent of the denominator term
    order = float(2 / (m - 1))
    # Iterate over the samples
    for i in range(n_sample):
        # A single sample
        sample = list(df.iloc[i])
        # Distances from the sample to every cluster center
        distances = [np.linalg.norm(np.array(list(map(operator.sub, sample, cluster_centers[j]))))
                     for j in range(c)]
        for j in range(c):
            # Denominator of the update formula
            denominator = sum([math.pow(float(distances[j] / distances[val]), order)
                               for val in range(c)])
            fuzzy_matrix[i][j] = float(1 / denominator)
    return fuzzy_matrix  # , distances


# Derive hard cluster labels from the membership matrix
def get_clusters(fuzzy_matrix, n_sample, iteration, max_iter):
    # The cluster with the largest membership becomes the final label
    cluster_labels, delete_labels = [], []
    for i in range(n_sample):
        max_val, idx = max((val, idx) for (idx, val) in enumerate(fuzzy_matrix[i]))
        cluster_labels.append(idx)
        # In the last iteration, flag samples whose best membership is too weak
        if iteration == max_iter - 1:
            print("max_val = ", max_val)
            if max_val < 0.15:
                delete_labels.append(i)
    return cluster_labels, delete_labels


# Fuzzy c-means clustering
def fuzzy_c_means(df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random'):
    """
    param init_method: how the cluster centers are initialised
        - random: derive the first centers from the randomly initialised memberships
        - multi_normal: sample the centers from a multivariate Gaussian
    """
    # Number of features per sample
    n_features = df.shape[-1]
    # Initialise the membership matrix
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    # Iteration counter
    current_iter = 0
    # Cluster centers
    init_cluster_centers = []
    cluster_centers = []
    # Cluster labels of every sample, stored for every iteration
    max_iter_cluster_labels = []
    # Choose the initialisation method
    if init_method == 'multi_normal':
        # Mean vector
        mean = [0] * n_features
        # Diagonal covariance matrix of the multivariate Gaussian
        cov = np.identity(n_features)
        for i in range(c):
            init_cluster_centers.append(list(np.random.multivariate_normal(mean, cov)))
    # else:
    #     init_cluster_centers = [[0.1] * n_features] * c
    print(init_cluster_centers)
    while current_iter < max_iter:
        if current_iter == 0 and init_method == 'multi_normal':
            cluster_centers = init_cluster_centers
        else:
            cluster_centers = cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m)
        fuzzy_matrix = update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers)
        cluster_labels, delete_labels = get_clusters(fuzzy_matrix, n_sample,
                                                     iteration=current_iter, max_iter=max_iter)
        max_iter_cluster_labels.append(cluster_labels)
        current_iter += 1
        print('-' * 32)
        print("Fuzzy Matrix U:\n")
        print(np.array(fuzzy_matrix))
    return cluster_centers, cluster_labels, max_iter_cluster_labels, delete_labels
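

# fuzzy_c_means above always runs for max_iter iterations. A common alternative is to
# stop once the FCM objective J_m = sum_i sum_j u_ij^m * ||x_i - v_j||^2 stops
# decreasing. A minimal sketch of that objective (illustrative only, not wired into
# the loop; assumes df holds numeric columns only):
def fcm_objective(df, fuzzy_matrix, cluster_centers, m):
    X = df.values.astype(float)                   # [n_sample, n_features]
    U = np.asarray(fuzzy_matrix, dtype=float)     # [n_sample, c]
    V = np.asarray(cluster_centers, dtype=float)  # [c, n_features]
    # Squared Euclidean distance between every sample and every center: [n_sample, c]
    d2 = ((X[:, np.newaxis, :] - V[np.newaxis, :, :]) ** 2).sum(axis=2)
    return float(((U ** m) * d2).sum())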


# Split the NWP data into per-cluster training/test sets based on the FCM labels
def process_nwp(labels, delete_labels):
    env = load_env_data()
    nwps = pd.read_csv('../xiangzhou/NWP.csv')
    nwp_1, nwp_2, nwp_3, nwp_4 = [], [], [], []
    nwps['C_TIME'] = pd.to_datetime(nwps['C_TIME'])
    for index, nwp in nwps.iterrows():
        # Match the NWP timestamp (rounded to the hour) against the environment data
        time = nwp['C_TIME'].strftime('%Y-%m-%d %H:00:00')
        if len(env[env['C_TIME'].values == time].index) == 0:
            print("NWP timestamp not found in the environment data:", nwp['C_TIME'])
            continue
        row = env[env['C_TIME'].values == time].index[0]
        cls = labels[row]
        # Skip samples that were flagged as weakly assigned
        if row in delete_labels:
            continue
        if cls == 0:
            nwp_1.append(nwp)
        elif cls == 1:
            nwp_2.append(nwp)
        elif cls == 2:
            nwp_3.append(nwp)
        elif cls == 3:
            nwp_4.append(nwp)
    nwp_1 = pd.concat(nwp_1, axis=1).T.reset_index(drop=True)
    nwp_2 = pd.concat(nwp_2, axis=1).T.reset_index(drop=True)
    nwp_3 = pd.concat(nwp_3, axis=1).T.reset_index(drop=True)
    nwp_4 = pd.concat(nwp_4, axis=1).T.reset_index(drop=True)
    nwp1_train, nwp1_test = train_test_split(nwp_1, test_size=0.1, random_state=7, shuffle=False)
    nwp1_test['label'] = 1
    nwp2_train, nwp2_test = train_test_split(nwp_2, test_size=0.1, random_state=7, shuffle=False)
    nwp2_test['label'] = 2
    nwp3_train, nwp3_test = train_test_split(nwp_3, test_size=0.1, random_state=7, shuffle=False)
    nwp3_test['label'] = 3
    nwp4_train, nwp4_test = train_test_split(nwp_4, test_size=0.1, random_state=7, shuffle=False)
    nwp4_test['label'] = 4
    data_test = pd.concat([nwp1_test, nwp2_test, nwp3_test, nwp4_test])
    data_test.to_csv('../xiangzhou/Dataset_training/nwp_test.csv', index=False)
    nwp1_train.to_csv('../xiangzhou/Dataset_training/nwp_1.csv', index=False)
    nwp2_train.to_csv('../xiangzhou/Dataset_training/nwp_2.csv', index=False)
    nwp3_train.to_csv('../xiangzhou/Dataset_training/nwp_3.csv', index=False)
    nwp4_train.to_csv('../xiangzhou/Dataset_training/nwp_4.csv', index=False)
    data_train = pd.concat([nwp1_train, nwp2_train, nwp3_train, nwp4_train])
    data_train.to_csv('../xiangzhou/Dataset_training/nwp_train.csv', index=False)


if __name__ == '__main__':
    # df_full, df, class_labels, target_dicts = load_iris_data()
    df = load_env_data().iloc[:, 1:]
    # Number of clusters; process_nwp expects 4 (the iris dataset would use 3)
    c = 4
    # Maximum number of iterations, to avoid looping forever
    max_iter = 20
    # Number of samples
    n_sample = len(df)
    # Fuzziness exponent m; values in [1.5, 2.5] are often recommended in the literature
    m = 1.7
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    centers, labels, labels_history, delete_labels = fuzzy_c_means(
        df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='multi_normal')  # multi_normal, random
    process_nwp(labels, delete_labels)
    from visual import cluster_scatter
    cluster_scatter(x=df.values, y=labels)
    # plot_random_init_iris_sepal(df)
    # plot_random_init_iris_petal(df)
    # plot_cluster_iris_sepal(df, labels, centers)
    # plot_cluster_iris_petal(df, labels, centers)
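

# A minimal usage sketch on the bundled iris loader, for illustration only: the function
# is never called automatically, m = 2.0 is just one choice inside the commonly
# recommended [1.5, 2.5] range, and the plot helpers come from PlotFunctions above.
def demo_iris():
    df_full, df, class_labels, target_dicts = load_iris_data()
    n_sample, c, m, max_iter = len(df), 3, 2.0, 20
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    centers, labels, labels_history, delete_labels = fuzzy_c_means(
        df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random')
    plot_cluster_iris_sepal(df, labels, centers)
    plot_cluster_iris_petal(df, labels, centers)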