# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import random
import operator
import math
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# # Draw the grid lines beneath the curves
# plt.rcParams['axes.axisbelow'] = False
plt.style.use('fivethirtyeight')  # 'ggplot'
from PlotFunctions import plot_random_init_iris_sepal, plot_random_init_iris_petal, plot_cluster_iris_sepal, plot_cluster_iris_petal
from sklearn.datasets import load_iris

def load_iris_data():
    data = load_iris()
    # Feature columns of the iris dataset
    features = data['data']
    # Labels of the iris dataset
    target = data['target']
    # Add an axis so the labels can be concatenated with the features
    target = target[:, np.newaxis]

    target_names = data['target_names']
    target_dicts = dict(zip(np.unique(target), target_names))

    # Shallow copy to avoid modifying the original list in place
    feature_names = data['feature_names'].copy()  # deepcopy(data['feature_names'])
    feature_names.append('label')

    df_full = pd.DataFrame(data=np.concatenate([features, target], axis=1),
                           columns=feature_names)
    # Save the dataset
    df_full.to_csv(os.path.join(os.getcwd(), 'iris_data.csv'), index=False)

    columns = list(df_full.columns)
    features = columns[:len(columns) - 1]
    class_labels = list(df_full[columns[-1]])
    df = df_full[features]

    return df_full, df, class_labels, target_dicts

def load_env_data():
    path = '../xiangzhou/features.csv'
    env = pd.read_csv(path)
    return env

# Initialize the membership matrix U
def init_fuzzy_matrix(n_sample, c):
    """
    Randomly initialize the membership matrix. Note that for each sample,
    the c membership degrees must sum to 1.
    ----
    param n_sample: number of samples
    param c: number of clusters
    """
    # Membership matrix over all samples, shape = [n_sample, c]
    fuzzy_matrix = []

    for i in range(n_sample):
        # Generate a list of c random numbers; random.random() draws a real number in [0, 1).
        random_list = [random.random() for _ in range(c)]
        sum_of_random = sum(random_list)
        # Normalize so that the memberships of a single sample sum to 1
        norm_random_list = [x / sum_of_random for x in random_list]
        # Index of the largest value in the normalized list
        one_of_random_index = norm_random_list.index(max(norm_random_list))

        # Harden the row to a one-hot assignment: 1 for the largest
        # membership, 0 for the rest (the row still sums to 1)
        for j in range(0, len(norm_random_list)):
            if j == one_of_random_index:
                norm_random_list[j] = 1
            else:
                norm_random_list[j] = 0

        fuzzy_matrix.append(norm_random_list)

    return fuzzy_matrix

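# A quick sanity check of the initializer (an illustrative sketch;
# _demo_init_fuzzy_matrix is a hypothetical helper added here for
# illustration and is not called by the pipeline): every row produced
# above is one-hot, so each row sums to 1 and its argmax is the
# sample's initial cluster assignment.
def _demo_init_fuzzy_matrix():
    u = init_fuzzy_matrix(n_sample=5, c=3)
    for row in u:
        assert sum(row) == 1 and max(row) == 1
    print(np.array(u))
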
# Compute the FCM cluster centers
def cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m):
    """
    param df: feature columns of the dataset, excluding the label column
    param fuzzy_matrix: membership matrix
    param n_sample: number of samples
    param c: number of clusters
    param m: fuzzifier (weighting exponent)
    """
    # The * character is the unpacking operator.
    # zip(*fuzzy_matrix) transposes fuzzy_matrix:
    # list(zip(*fuzzy_matrix)) yields one tuple per column of the matrix.
    fuzzy_mat_ravel = list(zip(*fuzzy_matrix))

    cluster_centers = []

    # One iteration per cluster
    for j in range(c):
        # Memberships of all samples for cluster j (one column of the membership matrix)
        fuzzy_one_dim_list = list(fuzzy_mat_ravel[j])
        # Raise each membership to the power m
        m_fuzzy_one_dim_list = [p ** m for p in fuzzy_one_dim_list]
        # Sum of the powered memberships: the denominator of the center formula
        denominator = sum(m_fuzzy_one_dim_list)

        numerator_list = []

        # Iterate over all samples to build the numerator
        for i in range(n_sample):
            # Take one sample
            sample = list(df.iloc[i])
            # Numerator term: the sample scaled by its powered membership
            mul_sample_fuzzy = [m_fuzzy_one_dim_list[i] * val for val in sample]
            numerator_list.append(mul_sample_fuzzy)
        # Sum the terms feature-wise to get the numerator
        numerator = map(sum, list(zip(*numerator_list)))
        cluster_center = [val / denominator for val in numerator]
        cluster_centers.append(cluster_center)

    return cluster_centers

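# A vectorized NumPy equivalent of cal_cluster_centers (an illustrative
# sketch added for comparison; cal_cluster_centers_np is a hypothetical
# helper, not part of the original pipeline). It implements the same FCM
# center formula v_j = sum_i(u_ij^m * x_i) / sum_i(u_ij^m), assuming X is
# an array of shape [n_sample, n_features] and U of shape [n_sample, c].
def cal_cluster_centers_np(X, U, m):
    Um = np.power(U, m)                       # powered memberships, [n_sample, c]
    centers = Um.T @ X                        # numerators, [c, n_features]
    centers /= Um.sum(axis=0)[:, np.newaxis]  # divide each row by its denominator
    return centers
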
# Update the membership matrix, following Eq. (8)
def update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers):
    # Exponent of the denominator term
    order = float(2 / (m - 1))
    # Iterate over the samples
    for i in range(n_sample):
        # A single sample
        sample = list(df.iloc[i])
        # Distances used in the update formula: sample minus each cluster center
        distances = [np.linalg.norm(np.array(list(map(operator.sub, sample, cluster_centers[j]))))
                     for j in range(c)]
        for j in range(c):
            # Denominator of the update formula
            denominator = sum([math.pow(float(distances[j] / distances[val]), order) for val in range(c)])
            fuzzy_matrix[i][j] = float(1 / denominator)

    return fuzzy_matrix  # , distances

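# A vectorized NumPy equivalent of update_fuzzy_matrix (an illustrative
# sketch; update_fuzzy_matrix_np is a hypothetical helper, not part of the
# original pipeline). It implements u_ij = 1 / sum_k (d_ij / d_ik)^(2/(m-1)),
# where d_ij is the Euclidean distance from sample i to center j, assuming
# X is [n_sample, n_features] and centers is [c, n_features].
def update_fuzzy_matrix_np(X, centers, m):
    # Pairwise distances d[i, j] = ||x_i - v_j||, shape [n_sample, c]
    d = np.linalg.norm(X[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
    # Guard against division by zero when a sample coincides with a center
    d = np.fmax(d, np.finfo(float).eps)
    # ratio[i, j, k] = (d_ij / d_ik)^(2/(m-1)); summing over k gives the denominator
    ratio = (d[:, :, np.newaxis] / d[:, np.newaxis, :]) ** (2.0 / (m - 1))
    return 1.0 / ratio.sum(axis=2)
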
# Derive hard cluster labels from the membership matrix
def get_clusters(fuzzy_matrix, n_sample, iter, max_iter):
    # The dimension with the largest membership becomes the final cluster label
    cluster_labels, delete_labels = [], []
    for i in range(n_sample):
        max_val, idx = max((val, idx) for (idx, val) in enumerate(fuzzy_matrix[i]))
        cluster_labels.append(idx)
        # On the last iteration, flag low-confidence samples for removal
        if iter == max_iter - 1:
            print("max_val = ", max_val)
            if max_val < 0.15:
                delete_labels.append(i)
    return cluster_labels, delete_labels

# Fuzzy c-means clustering algorithm
def fuzzy_c_means(df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random'):
    """
    param init_method: how to initialize the cluster centers
        - random: derive the first centers from the randomly initialized membership matrix
        - multi_normal: sample the centers from a multivariate Gaussian distribution
    """
    # Number of features per sample
    n_features = df.shape[-1]
    # (Re)initialize the membership matrix; the passed-in matrix is discarded
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    # Iteration counter
    current_iter = 0
    # Initial cluster centers
    init_cluster_centers = []
    cluster_centers = []
    # Per-iteration cluster labels; every iteration saves each sample's cluster
    max_iter_cluster_labels = []
    # Choose the initialization method
    if init_method == 'multi_normal':
        # Mean vector
        mean = [0] * n_features
        # Covariance of the multivariate Gaussian: an identity (diagonal) matrix
        cov = np.identity(n_features)
        for i in range(0, c):
            init_cluster_centers.append(list(np.random.multivariate_normal(mean, cov)))
    # else:
    #     init_cluster_centers = [[0.1] * n_features] * c

    print(init_cluster_centers)

    while current_iter < max_iter:
        if current_iter == 0 and init_method == 'multi_normal':
            cluster_centers = init_cluster_centers
        else:
            cluster_centers = cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m)
        fuzzy_matrix = update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers)
        cluster_labels, delete_labels = get_clusters(fuzzy_matrix, n_sample, iter=current_iter, max_iter=max_iter)
        max_iter_cluster_labels.append(cluster_labels)

        current_iter += 1

    print('-' * 32)
    print("Fuzzy Matrix U:\n")
    print(np.array(fuzzy_matrix))

    return cluster_centers, cluster_labels, max_iter_cluster_labels, delete_labels

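# Minimal usage sketch of fuzzy_c_means on synthetic data (illustrative;
# _demo_fuzzy_c_means is a hypothetical helper added for illustration,
# while the real entry point below clusters the environment features).
def _demo_fuzzy_c_means():
    rng = np.random.RandomState(0)
    # Two well-separated 2-D blobs of 50 points each
    blobs = np.vstack([rng.normal(loc, 0.3, size=(50, 2)) for loc in (0.0, 3.0)])
    demo_df = pd.DataFrame(blobs, columns=['x1', 'x2'])
    u = init_fuzzy_matrix(len(demo_df), c=2)
    centers, labels, labels_history, delete_labels = fuzzy_c_means(
        demo_df, u, len(demo_df), c=2, m=2.0, max_iter=10, init_method='random')
    print(centers)
    print(labels[:10])
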
def process_nwp(labels, delete_labels):
    env = load_env_data()
    nwps = pd.read_csv('../xiangzhou/NWP.csv')
    nwp_1, nwp_2, nwp_3, nwp_4 = [], [], [], []
    nwps['C_TIME'] = pd.to_datetime(nwps['C_TIME'])
    for index, nwp in nwps.iterrows():
        # Align NWP rows with the environment data on the hour
        time = nwp['C_TIME'].strftime('%Y-%m-%d %H:00:00')
        if len(env[env['C_TIME'].values == time].index) == 0:
            print("NWP timestamp not found in the environment data:", nwp['C_TIME'])
            continue
        row = env[env['C_TIME'].values == time].index[0]
        # Skip samples flagged as low-confidence by the clustering
        if row in delete_labels:
            continue
        cls = labels[row]
        if cls == 0:
            nwp_1.append(nwp)
        elif cls == 1:
            nwp_2.append(nwp)
        elif cls == 2:
            nwp_3.append(nwp)
        elif cls == 3:
            nwp_4.append(nwp)
    nwp_1 = pd.concat(nwp_1, axis=1).T.reset_index(drop=True)
    nwp_2 = pd.concat(nwp_2, axis=1).T.reset_index(drop=True)
    nwp_3 = pd.concat(nwp_3, axis=1).T.reset_index(drop=True)
    nwp_4 = pd.concat(nwp_4, axis=1).T.reset_index(drop=True)
    # Hold out 10% of each cluster as a test set (unshuffled, so the split is chronological)
    nwp1_train, nwp1_test = train_test_split(nwp_1, test_size=0.1,
                                             random_state=7,
                                             shuffle=False)
    nwp1_test['label'] = 1
    nwp2_train, nwp2_test = train_test_split(nwp_2, test_size=0.1,
                                             random_state=7,
                                             shuffle=False)
    nwp2_test['label'] = 2
    nwp3_train, nwp3_test = train_test_split(nwp_3, test_size=0.1,
                                             random_state=7,
                                             shuffle=False)
    nwp3_test['label'] = 3
    nwp4_train, nwp4_test = train_test_split(nwp_4, test_size=0.1,
                                             random_state=7,
                                             shuffle=False)
    nwp4_test['label'] = 4
    data_test = pd.concat([nwp1_test, nwp2_test, nwp3_test, nwp4_test])
    data_test.to_csv('../xiangzhou/Dataset_training/nwp_test.csv', index=False)
    nwp1_train.to_csv('../xiangzhou/Dataset_training/nwp_1.csv', index=False)
    nwp2_train.to_csv('../xiangzhou/Dataset_training/nwp_2.csv', index=False)
    nwp3_train.to_csv('../xiangzhou/Dataset_training/nwp_3.csv', index=False)
    nwp4_train.to_csv('../xiangzhou/Dataset_training/nwp_4.csv', index=False)
    data_train = pd.concat([nwp1_train, nwp2_train, nwp3_train, nwp4_train])
    data_train.to_csv('../xiangzhou/Dataset_training/nwp_train.csv', index=False)

if __name__ == '__main__':
    # df_full, df, class_labels, target_dicts = load_iris_data()
    df = load_env_data().iloc[:, 1:]
    # Number of clusters (the iris dataset has 3 classes; 4 are used here for the environment data)
    c = 4
    # Maximum number of iterations, to avoid an infinite loop
    max_iter = 20
    # Number of samples
    n_sample = len(df)
    # Fuzzifier m; some papers suggest values in the range [1.5, 2.5]
    m = 1.7

    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    centers, labels, labels_history, delete_labels = fuzzy_c_means(df,
                                                                   fuzzy_matrix,
                                                                   n_sample,
                                                                   c,
                                                                   m,
                                                                   max_iter,
                                                                   init_method='multi_normal')  # multi_normal, random
    process_nwp(labels, delete_labels)
    from visual import cluster_scatter
    cluster_scatter(x=df.values, y=labels)
    # plot_random_init_iris_sepal(df)
    # plot_random_init_iris_petal(df)
    # plot_cluster_iris_sepal(df, labels, centers)
    # plot_cluster_iris_petal(df, labels, centers)