data_clean.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Author:Lijiaxing
@File:data_clean.py
@Time:2023/4/26 18:06
"""
  8. import os.path
  9. import numpy as np
  10. import pandas as pd
  11. import matplotlib.pyplot as plt
  12. def paint_data(clean_data, clean_index=1):
  13. x = [index for index in range(len(clean_data))]
  14. plt.figure(figsize=(20, 10))
  15. plt.title('clean_{}'.format(clean_index))
  16. # 绘制曲线
  17. plt.plot(x, clean_data, color='red', label='clean_data')
  18. plt.savefig('data_{}.png'.format(clean_index))
  19. plt.show()
  20. class clean_file:
  21. """
  22. 清洗数据
  23. """
  24. def __init__(self, output_path='./'):
  25. """
  26. :param output_path: 清洗后的数据存放路径 ,只传入路径,不包括文件名
  27. """
  28. self.data = []
  29. output_path = os.path.join(output_path, 'clean_data')
  30. if not os.path.exists(output_path):
  31. os.makedirs(output_path)
  32. self.output_path = output_path
  33. def clean_data(self, file_path, clean_name, clean_value, multi_value=False, clean_index=1, paint=True):
  34. """
  35. 数据清洗
  36. 将-9999或-99数据进行插值处理,并绘制处理后的数据图像保存至output_path路径下
  37. :param paint: 是否绘制图像
  38. :param multi_value: 若为True,则clean_value为list
  39. :param clean_value: 清洗数据中异常值
  40. :param clean_name: 清洗数据中异常值列名
  41. :param file_path: 需要清洗的数据,csv格式
  42. :param clean_index: 清洗数据输出文件名格式为 clean_${clean_index}.csv
  43. :return: None
  44. """
  45. data = pd.read_csv(file_path)
  46. if paint:
  47. paint_old_data = [item for item in data[clean_name].values]
  48. old_data = data[clean_name].values
  49. if multi_value:
  50. for clean_value_i in clean_value:
  51. data[clean_name][old_data == clean_value_i] = np.nan
  52. else:
  53. data[clean_name][old_data == clean_value] = np.nan
  54. data[clean_name] = data[clean_name].interpolate()
  55. data.to_csv(os.path.join(self.output_path, 'clean_{}.csv'.format(clean_index)), index=False)
  56. already_clean = data[clean_name].values
  57. if paint:
  58. paint_data(already_clean, clean_index)
  59. # 使用示例
  60. cleaner = clean_file(output_path='Dataset_training/power/')
  61. for i in range(6):
  62. cleaner.clean_data(file_path='Dataset_training/power/power_{}.csv'.format(i), clean_name='C_REAL_VALUE',
  63. clean_value=[-9999.0, -99], multi_value=True,
  64. clean_index=i)