fuzzy_c_means.py
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
import random
import operator
import math
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# # Draw grid lines beneath the curves
# plt.rcParams['axes.axisbelow'] = False
plt.style.use('fivethirtyeight')  # 'ggplot'
from PlotFunctions import plot_random_init_iris_sepal, plot_random_init_iris_petal, \
    plot_cluster_iris_sepal, plot_cluster_iris_petal
from sklearn.datasets import load_iris
def load_iris_data():
    data = load_iris()
    # Feature columns of the iris dataset
    features = data['data']
    # Labels of the iris dataset
    target = data['target']
    # Add an axis so the labels can be concatenated with the features
    target = target[:, np.newaxis]
    target_names = data['target_names']
    target_dicts = dict(zip(np.unique(target), target_names))
    # Shallow copy, to avoid modifying the original list in place
    feature_names = data['feature_names'].copy()  # deepcopy(data['feature_names'])
    feature_names.append('label')
    df_full = pd.DataFrame(data=np.concatenate([features, target], axis=1),
                           columns=feature_names)
    # Save the dataset
    df_full.to_csv(os.path.join(os.getcwd(), 'iris_data.csv'), index=None)
    columns = list(df_full.columns)
    features = columns[:len(columns) - 1]
    class_labels = list(df_full[columns[-1]])
    df = df_full[features]
    return df_full, df, class_labels, target_dicts
def load_env_data():
    path = '../xiangzhou/features.csv'
    env = pd.read_csv(path)
    return env
# Initialize the membership (fuzzy partition) matrix U
def init_fuzzy_matrix(n_sample, c):
    """
    Randomly initialize the membership matrix. Note that for each sample,
    the c membership values must sum to 1.
    ----
    param n_sample: number of samples
    param c: number of clusters
    """
    # Membership matrix for all samples in the dataset, shape = [n_sample, c]
    fuzzy_matrix = []
    for i in range(n_sample):
        # Generate a list of c random numbers; random.random() draws a real number in [0, 1).
        random_list = [random.random() for _ in range(c)]
        sum_of_random = sum(random_list)
        # Normalized random list: the membership degrees of a single sample
        norm_random_list = [x / sum_of_random for x in random_list]
        # Index of the largest value in the normalized list
        one_of_random_index = norm_random_list.index(max(norm_random_list))
        # Snap to a one-hot assignment: the largest membership becomes 1, the rest 0
        for j in range(0, len(norm_random_list)):
            if j == one_of_random_index:
                norm_random_list[j] = 1
            else:
                norm_random_list[j] = 0
        fuzzy_matrix.append(norm_random_list)
    return fuzzy_matrix
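# A hedged alternative: np.random.dirichlet draws rows that already sum to 1,
# giving a genuinely soft initialization. This helper is an illustration only
# and is not called below (the function above snaps each row to one-hot).
def init_fuzzy_matrix_dirichlet(n_sample, c):
    # Each row is a sample from a flat Dirichlet, so its c entries sum to 1
    return np.random.dirichlet(np.ones(c), size=n_sample).tolist()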
# Compute the FCM cluster centers
def cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m):
    """
    param df: feature set of the dataset, without the label column
    param fuzzy_matrix: membership matrix
    param n_sample: number of samples
    param c: number of clusters
    param m: fuzziness (weighting) exponent
    """
    # The * character is the unpacking operator.
    # zip(*fuzzy_matrix) iterates over the columns of fuzzy_matrix,
    # so list(zip(*fuzzy_matrix)) contains one tuple per column.
    fuzzy_mat_ravel = list(zip(*fuzzy_matrix))
    cluster_centers = []
    # Loop over the clusters
    for j in range(c):
        # Membership values of all samples for cluster j (one column of the matrix)
        fuzzy_one_dim_list = list(fuzzy_mat_ravel[j])
        # Raise the memberships to the power m
        m_fuzzy_one_dim_list = [p ** m for p in fuzzy_one_dim_list]
        # Sum of the weighted memberships: the denominator of the center formula
        denominator = sum(m_fuzzy_one_dim_list)
        numerator_list = []
        # Loop over all samples to build the numerator
        for i in range(n_sample):
            # One sample
            sample = list(df.iloc[i])
            # Numerator term: the sample multiplied by its membership raised to the power m
            mul_sample_fuzzy = [m_fuzzy_one_dim_list[i] * val for val in sample]
            numerator_list.append(mul_sample_fuzzy)
        # Sum over samples to obtain the numerator
        numerator = map(sum, list(zip(*numerator_list)))
        cluster_center = [val / denominator for val in numerator]
        cluster_centers.append(cluster_center)
    return cluster_centers
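# For reference, the loop above implements the standard FCM center update
#     v_j = sum_i(u_ij^m * x_i) / sum_i(u_ij^m).
# A minimal vectorized sketch of the same computation (illustration only,
# not called by the rest of this script):
def cal_cluster_centers_vectorized(df, fuzzy_matrix, m):
    U_m = np.asarray(fuzzy_matrix) ** m   # u_ij^m, shape [n_sample, c]
    X = df.to_numpy()                     # shape [n_sample, n_features]
    # Weighted sums of the samples per cluster, divided by the total weights
    return (U_m.T @ X) / U_m.sum(axis=0)[:, np.newaxis]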
# Update the membership matrix, following Eq. (8)
def update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers):
    # Exponent in the denominator of the update formula
    order = float(2 / (m - 1))
    # Loop over the samples
    for i in range(n_sample):
        # One sample
        sample = list(df.iloc[i])
        # Distances from the sample to every cluster center
        distances = [np.linalg.norm(np.array(list(map(operator.sub, sample, cluster_centers[j]))))
                     for j in range(c)]
        for j in range(c):
            # Denominator of the update formula
            denominator = sum([math.pow(float(distances[j] / distances[val]), order)
                               for val in range(c)])
            fuzzy_matrix[i][j] = float(1 / denominator)
    return fuzzy_matrix  # , distances
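# The rule above is the standard membership update
#     u_ij = 1 / sum_k (d_ij / d_ik)^(2 / (m - 1)),
# where d_ij is the distance from sample i to center j. It divides by zero
# whenever a sample coincides with a center; a common guard (an assumption,
# not part of the original code) is to clip the distances away from zero:
def safe_distances(distances, eps=1e-10):
    # Replace near-zero distances with a small epsilon before taking ratios
    return [max(d, eps) for d in distances]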
# Derive the hard cluster labels from the membership matrix
def get_clusters(fuzzy_matrix, n_sample, cur_iter, max_iter):
    # The dimension with the largest membership is the final cluster assignment
    cluster_labels, delete_labels = [], []
    for i in range(n_sample):
        max_val, idx = max((val, idx) for (idx, val) in enumerate(fuzzy_matrix[i]))
        cluster_labels.append(idx)
        # On the last iteration, flag samples whose largest membership is
        # still low (< 0.15) so they can be discarded as ambiguous
        if cur_iter == max_iter - 1:
            print("max_val = ", max_val)
            if max_val < 0.15:
                delete_labels.append(i)
    return cluster_labels, delete_labels
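# Equivalently, the hard labels are a row-wise argmax over the membership
# matrix; a one-line NumPy sketch (illustration only):
def get_hard_labels(fuzzy_matrix):
    return np.argmax(np.asarray(fuzzy_matrix), axis=1).tolist()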
# Fuzzy c-means clustering algorithm
def fuzzy_c_means(df, fuzzy_matrix, n_sample, c, m, max_iter, init_method='random'):
    """
    param init_method: initialization method for the cluster centers
        - random: centers are derived from the randomly initialized membership matrix
        - multi_normal: centers are sampled from a multivariate Gaussian
    """
    # Number of features per sample
    n_features = df.shape[-1]
    # Initialize the membership matrix (note: this overwrites the matrix passed in)
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    # Initialize the iteration counter
    current_iter = 0
    # Initialize the cluster centers
    init_cluster_centers = []
    cluster_centers = []
    # Per-sample cluster labels; the labels of every iteration are kept
    max_iter_cluster_labels = []
    # Choose the initialization method
    if init_method == 'multi_normal':
        # Mean vector
        mean = [0] * n_features
        # Covariance of the multivariate Gaussian: the identity matrix
        cov = np.identity(n_features)
        for i in range(0, c):
            init_cluster_centers.append(list(np.random.multivariate_normal(mean, cov)))
    # else:
    #     init_cluster_centers = [[0.1] * n_features] * c
    print(init_cluster_centers)
    while current_iter < max_iter:
        if current_iter == 0 and init_method == 'multi_normal':
            cluster_centers = init_cluster_centers
        else:
            cluster_centers = cal_cluster_centers(df, fuzzy_matrix, n_sample, c, m)
        fuzzy_matrix = update_fuzzy_matrix(df, fuzzy_matrix, n_sample, c, m, cluster_centers)
        cluster_labels, delete_labels = get_clusters(fuzzy_matrix, n_sample,
                                                     cur_iter=current_iter, max_iter=max_iter)
        max_iter_cluster_labels.append(cluster_labels)
        current_iter += 1
        print('-' * 32)
        print("Fuzzy Matrix U:\n")
        print(np.array(fuzzy_matrix))
    return cluster_centers, cluster_labels, max_iter_cluster_labels, delete_labels
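# The loop above always runs the full max_iter iterations. A common early-stop
# criterion (an assumption here, not part of the original code) compares
# successive membership matrices and stops once they barely change:
def has_converged(u_old, u_new, tol=1e-5):
    # Largest absolute change in any membership value
    return np.max(np.abs(np.asarray(u_new) - np.asarray(u_old))) < tol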
# Split the NWP data into per-cluster training and test sets
def process_nwp(labels, delete_labels):
    env = load_env_data()
    nwps = pd.read_csv('../xiangzhou/NWP.csv')
    nwp_1, nwp_2, nwp_3, nwp_4 = [], [], [], []
    nwps['C_TIME'] = pd.to_datetime(nwps['C_TIME'])
    for index, nwp in nwps.iterrows():
        time = nwp['C_TIME'].strftime('%Y-%m-%d %H:00:00')
        if len(env[env['C_TIME'].values == time].index) == 0:
            print("NWP timestamp not found in the environment data:", nwp['C_TIME'])
            continue
        row = env[env['C_TIME'].values == time].index[0]
        cls = labels[row]
        # Skip samples flagged as ambiguous by get_clusters
        if row in delete_labels:
            continue
        if cls == 0:
            nwp_1.append(nwp)
        elif cls == 1:
            nwp_2.append(nwp)
        elif cls == 2:
            nwp_3.append(nwp)
        elif cls == 3:
            nwp_4.append(nwp)
    nwp_1 = pd.concat(nwp_1, axis=1).T.reset_index(drop=True)
    nwp_2 = pd.concat(nwp_2, axis=1).T.reset_index(drop=True)
    nwp_3 = pd.concat(nwp_3, axis=1).T.reset_index(drop=True)
    nwp_4 = pd.concat(nwp_4, axis=1).T.reset_index(drop=True)
    nwp1_train, nwp1_test = train_test_split(nwp_1, test_size=0.1,
                                             random_state=7, shuffle=False)
    nwp1_test['label'] = 1
    nwp2_train, nwp2_test = train_test_split(nwp_2, test_size=0.1,
                                             random_state=7, shuffle=False)
    nwp2_test['label'] = 2
    nwp3_train, nwp3_test = train_test_split(nwp_3, test_size=0.1,
                                             random_state=7, shuffle=False)
    nwp3_test['label'] = 3
    nwp4_train, nwp4_test = train_test_split(nwp_4, test_size=0.1,
                                             random_state=7, shuffle=False)
    nwp4_test['label'] = 4
    data_test = pd.concat([nwp1_test, nwp2_test, nwp3_test, nwp4_test])
    data_test.to_csv('../xiangzhou/Dataset_training/nwp_test.csv', index=False)
    nwp1_train.to_csv('../xiangzhou/Dataset_training/nwp_1.csv', index=False)
    nwp2_train.to_csv('../xiangzhou/Dataset_training/nwp_2.csv', index=False)
    nwp3_train.to_csv('../xiangzhou/Dataset_training/nwp_3.csv', index=False)
    nwp4_train.to_csv('../xiangzhou/Dataset_training/nwp_4.csv', index=False)
    data_train = pd.concat([nwp1_train, nwp2_train, nwp3_train, nwp4_train])
    data_train.to_csv('../xiangzhou/Dataset_training/nwp_train.csv', index=False)
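# The four near-identical branches above could be collapsed by grouping rows
# in a dict keyed by cluster id; a minimal sketch of that refactor
# (illustration only, the explicit branches are kept as-is above):
def group_by_cluster(cls_row_pairs):
    groups = {}
    for cls, row in cls_row_pairs:
        groups.setdefault(cls, []).append(row)
    return groups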
if __name__ == '__main__':
    # df_full, df, class_labels, target_dicts = load_iris_data()
    df = load_env_data().iloc[:, 1:]
    # Number of clusters (the iris dataset has 3 classes; 4 is used for the env data)
    c = 4
    # Maximum number of iterations, to prevent an infinite loop
    max_iter = 20
    # Number of samples
    n_sample = len(df)
    # Fuzziness exponent m; some papers suggest values in [1.5, 2.5] work well
    m = 1.7
    fuzzy_matrix = init_fuzzy_matrix(n_sample, c)
    centers, labels, max_iter_labels, delete_labels = fuzzy_c_means(df,
                                                                    fuzzy_matrix,
                                                                    n_sample,
                                                                    c,
                                                                    m,
                                                                    max_iter,
                                                                    init_method='multi_normal')  # multi_normal, random
    process_nwp(labels, delete_labels)
    from visual import cluster_scatter
    cluster_scatter(x=df.values, y=labels)
    # plot_random_init_iris_sepal(df)
    # plot_random_init_iris_petal(df)
    # plot_cluster_iris_sepal(df, labels, centers)
    # plot_cluster_iris_petal(df, labels, centers)