#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Clean sentinel values out of a CSV column by interpolating over them.

@Author:Lijiaxing
@File:data_clean.py
@Time:2023/4/26 18:06
"""
import os.path

import numpy as np
import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; cleaning itself only needs pandas
    plt = None


def paint_data(clean_data, clean_index=1, output_dir=None):
    """Plot a cleaned series as a red curve, save it as a PNG and display it.

    :param clean_data: sequence of values, plotted against their positions
    :param clean_index: number used in the title ``clean_<index>`` and in
        the image file name ``data_<index>.png``
    :param output_dir: directory the PNG is written to; ``None`` (default)
        writes it relative to the current working directory
    :return: None
    :raises ImportError: if matplotlib is not installed
    """
    if plt is None:
        raise ImportError('matplotlib is required to plot the cleaned data')

    figure = plt.figure(figsize=(20, 10))
    plt.title('clean_{}'.format(clean_index))
    # Draw the curve.
    plt.plot(range(len(clean_data)), clean_data, color='red', label='clean_data')

    image_path = 'data_{}.png'.format(clean_index)
    if output_dir is not None:
        image_path = os.path.join(output_dir, image_path)
    plt.savefig(image_path)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)


class clean_file:
    """Replace sentinel values in a CSV column with interpolated values."""

    def __init__(self, output_path='./'):
        """Create the ``clean_data`` output directory under ``output_path``.

        :param output_path: directory (not a file name) under which the
            ``clean_data`` sub-directory holding the cleaned files is created
        """
        self.data = []  # not used by this class; kept for existing callers
        output_path = os.path.join(output_path, 'clean_data')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        self.output_path = output_path

    def clean_data(self, file_path, clean_name, clean_value, multi_value=False, clean_index=1, paint=True):
        """Interpolate over sentinel values in one column and save the result.

        Every occurrence of the sentinel value(s) (e.g. -9999 or -99) in
        column ``clean_name`` is replaced by NaN and then filled with
        ``Series.interpolate()`` using its default settings. The whole table
        is written to ``clean_<clean_index>.csv`` in ``self.output_path``;
        when ``paint`` is true the cleaned column is also plotted and the
        image is saved in the same directory.

        :param file_path: CSV file to clean
        :param clean_name: name of the column to clean
        :param clean_value: sentinel value, or an iterable of sentinel values
            when ``multi_value`` is True
        :param multi_value: if True, ``clean_value`` is an iterable of values
        :param clean_index: number used in the output file names
        :param paint: whether to plot the cleaned column
        :return: None
        """
        data = pd.read_csv(file_path)

        sentinels = list(clean_value) if multi_value else [clean_value]
        column = data[clean_name]
        # Build a new Series instead of the chained assignment
        # data[col][mask] = nan, which is a no-op under pandas Copy-on-Write.
        data[clean_name] = column.mask(column.isin(sentinels), np.nan).interpolate()

        data.to_csv(os.path.join(self.output_path, 'clean_{}.csv'.format(clean_index)), index=False)

        if paint:
            paint_data(data[clean_name].values, clean_index, self.output_path)


def main():
    """Usage example: clean power_0.csv .. power_5.csv of the training set."""
    cleaner = clean_file(output_path='Dataset_training/power/')
    for i in range(6):
        cleaner.clean_data(file_path='Dataset_training/power/power_{}.csv'.format(i),
                           clean_name='C_REAL_VALUE',
                           clean_value=[-9999.0, -99],
                           multi_value=True,
                           clean_index=i)


if __name__ == '__main__':
    main()