123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- @Author:Lijiaxing
-
- @File:data_clean.py
- @Time:2023/4/26 18:06
- """
- import os.path
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
def paint_data(clean_data, clean_index=1, output_dir=None):
    """Plot a cleaned series as a red line, save it as a PNG, then show it.

    :param clean_data: sequence of values to plot; the x axis is simply
        the positions 0 .. len(clean_data) - 1
    :param clean_index: number used in the figure title
        (``clean_{clean_index}``) and the file name (``data_{clean_index}.png``)
    :param output_dir: optional directory in which to save the PNG; when
        None (the default) the file name is used as-is, i.e. relative to
        the current working directory
    :return: None
    """
    positions = list(range(len(clean_data)))
    figure = plt.figure(figsize=(20, 10))
    plt.title('clean_{}'.format(clean_index))
    # Draw the curve
    plt.plot(positions, clean_data, color='red', label='clean_data')

    image_path = 'data_{}.png'.format(clean_index)
    if output_dir is not None:
        image_path = os.path.join(output_dir, image_path)
    plt.savefig(image_path)
    plt.show()
    # Release the figure explicitly: on non-interactive backends show()
    # returns immediately and the figure would otherwise stay registered,
    # accumulating memory when this function is called in a loop.
    plt.close(figure)
class clean_file:
    """Clean a CSV column by replacing sentinel values with interpolated ones.

    Cleaned files are written into a ``clean_data`` sub-directory of the
    ``output_path`` given at construction time.
    """

    def __init__(self, output_path='./'):
        """Prepare the output directory.

        :param output_path: directory under which the ``clean_data`` folder
            is created; pass a directory only, not a file name
        """
        # Not read by any method visible in this class; kept because it is
        # part of the instance's public state.
        self.data = []
        output_path = os.path.join(output_path, 'clean_data')
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(output_path, exist_ok=True)
        self.output_path = output_path

    def clean_data(self, file_path, clean_name, clean_value, multi_value=False, clean_index=1, paint=True):
        """Replace invalid readings in one column by interpolation and save the result.

        Every occurrence of ``clean_value`` (for example -9999 or -99) in the
        ``clean_name`` column is turned into NaN and then filled with
        ``Series.interpolate()`` using its default settings. The whole table
        is written to ``clean_{clean_index}.csv`` inside ``self.output_path``.

        :param file_path: CSV file to clean
        :param clean_name: name of the column that contains invalid values
        :param clean_value: the invalid value, or a list of invalid values
            when ``multi_value`` is True
        :param multi_value: if True, ``clean_value`` is treated as a list
        :param clean_index: number used in the output file name
        :param paint: if True, plot the cleaned column with ``paint_data``;
            note that ``paint_data`` as called here saves its image relative
            to the current working directory, not under ``self.output_path``
        :return: None
        """
        data = pd.read_csv(file_path)
        invalid_values = list(clean_value) if multi_value else [clean_value]

        column = data[clean_name]
        # Build the cleaned column and assign it back in one step. The former
        # chained form ``data[col][mask] = nan`` triggers SettingWithCopyWarning
        # and silently leaves the frame unchanged under pandas Copy-on-Write.
        data[clean_name] = column.mask(column.isin(invalid_values)).interpolate()

        data.to_csv(os.path.join(self.output_path, 'clean_{}.csv'.format(clean_index)), index=False)

        if paint:
            paint_data(data[clean_name].values, clean_index)
# Usage example: clean power_0.csv .. power_5.csv, treating -9999.0 and -99
# in the C_REAL_VALUE column as invalid readings.
cleaner = clean_file(output_path='Dataset_training/power/')
for i in range(6):
    cleaner.clean_data(
        file_path='Dataset_training/power/power_{}.csv'.format(i),
        clean_name='C_REAL_VALUE',
        clean_value=[-9999.0, -99],
        multi_value=True,
        clean_index=i,
    )
|