1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # time: 2023/10/11 11:00
- # file: data_cleaning.py
- # author: David
- # company: shenyang JY
- import numpy as np
def cleaning(df, name, cols=None, dup=True):
    """
    Run the cleaning pipeline on a copy of ``df`` and return the result.

    Steps, in order: column cleaning, optional de-duplication, then either
    key-field row filtering (when ``cols`` is given) or back-filling of NaN.

    :param df: input DataFrame; it is copied first and the copy is processed
    :param name: label used only in the progress message
    :param cols: key columns handed to ``key_field_row_cleaning``; when
        ``None`` the data goes through ``interpolation`` instead
    :param dup: when truthy, ``rm_duplicated`` is applied
    :return: the cleaned DataFrame
    """
    print("开始清洗:{}……".format(name))
    cleaned = data_column_cleaning(df.copy())
    if dup:
        cleaned = rm_duplicated(cleaned)
    # Row filtering and NaN filling are mutually exclusive branches.
    if cols is None:
        return interpolation(cleaned)
    return key_field_row_cleaning(cleaned, cols)
def data_column_cleaning(data, clean_value=(-9999.0, -99, -99.0)):
    """
    Column-level cleaning.

    Replaces every sentinel in ``clean_value`` with NaN, drops columns whose
    non-NaN count is below 80% of the row count, then drops columns in which
    every row equals the first row. The names of removed columns are printed.

    :param data: DataFrame to clean
    :param clean_value: iterable of sentinel values to treat as missing
    :return: DataFrame containing only the surviving columns
    """
    cols_pre = data.columns.to_list()
    for val in clean_value:
        data = data.replace(val, np.nan)
    # With zero rows there is nothing to evaluate, and ``data.iloc[0]`` below
    # would raise IndexError, so return early.
    if len(data) == 0:
        return data
    # Keep a column only when at least 80% of its values are non-NaN.
    data = data.dropna(axis=1, thresh=len(data) * 0.8)
    # Drop columns whose values are all identical to the first row.
    data = data.loc[:, (data != data.iloc[0]).any()]
    cols_late = data.columns.tolist()
    if len(cols_pre) > len(cols_late):
        print("清洗的列有:{}".format(set(cols_pre) - set(cols_late)))
    return data
def interpolation(data):
    """
    Fill remaining NaN values by backward fill.

    Despite the function name, this is a plain ``bfill``: each NaN takes the
    next valid value below it in the same column. NaN values with no valid
    value after them are left as NaN.

    :param data: DataFrame to fill
    :return: a new, back-filled DataFrame
    """
    return data.bfill()
def key_field_row_cleaning(data, cols):
    """
    Row-level cleaning on key fields.

    For each column in ``cols``, removes rows whose value is negative and
    whose string form contains ``99``, then removes rows where that column
    is null. The number of removed rows is printed when it is non-zero.

    :param data: DataFrame to filter
    :param cols: list of key column names to check
    :return: DataFrame with the offending rows removed
    """
    n_before = len(data)
    for col in cols:
        values = data.loc[:, col]
        # Sentinel rows: negative numbers whose text contains "99".
        is_sentinel = (values < 0) & values.astype(str).str.contains('99')
        data = data[~is_sentinel]
        data = data[data.loc[:, col].notnull()]
    n_removed = n_before - len(data)
    if n_removed > 0:
        print("清洗的行数有:", n_removed)
    return data
def rm_duplicated(data):
    """
    De-duplicate rows by timestamp.

    Groups the rows by the ``C_TIME`` column and replaces each group with the
    mean of its columns, then turns ``C_TIME`` back into a regular column.
    The number of rows merged away is printed when it is non-zero.

    NOTE(review): the handling of non-numeric columns by ``mean()`` depends on
    the installed pandas version - confirm against the deployed environment.

    :param data: DataFrame that contains a ``C_TIME`` column
    :return: a new DataFrame with one row per distinct ``C_TIME`` value
    """
    n_before = len(data)
    deduped = data.groupby(by='C_TIME').mean().reset_index()
    n_removed = n_before - len(deduped)
    if n_removed > 0:
        print("时间去重的行数有:", n_removed)
    return deduped
|