#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/10/11 11:00
# file: data_cleaning.py
# author: David
# company: shenyang JY
import numpy as np

np.random.seed(42)


def cleaning(df, name, logger, cols=None, dup=True):
    """
    Full cleaning pipeline: column cleaning, optional time-based
    de-duplication, and row cleaning on the key fields in ``cols``.
    :param df: input DataFrame
    :param name: dataset name used in log messages
    :param logger: logger instance
    :param cols: key columns for row cleaning (skipped when None)
    :param dup: whether to de-duplicate by C_TIME
    :return: cleaned DataFrame
    """
    logger.info("Start cleaning: {}...".format(name))
    data = df.copy()
    data = data_column_cleaning(data, logger)
    if dup:
        data = rm_duplicated(data, logger)
    if cols is not None:
        data = key_field_row_cleaning(data, cols, logger)
    return data


def data_column_cleaning(data, logger, clean_value=(-99.0, -99)):
    """
    Column cleaning: replace sentinel values with NaN, then drop columns
    that are mostly NaN or hold a single constant value.
    :param data: input DataFrame
    :param logger: logger instance
    :param clean_value: sentinel values to treat as missing
    :return: DataFrame restricted to the surviving columns
    """
    data1 = data.copy()
    cols_pre = data.columns.to_list()
    for val in clean_value:
        data1 = data1.replace(val, np.nan)
    # Drop columns with fewer than 80% non-NaN values
    data1 = data1.dropna(axis=1, thresh=int(len(data) * 0.8))
    # Drop columns whose values are all identical
    data1 = data1.loc[:, (data1 != data1.iloc[0]).any()]
    data = data[data1.columns.tolist()]
    cols_late = data.columns.tolist()
    if len(cols_pre) > len(cols_late):
        logger.info("Column cleaning: dropped columns: {}".format(set(cols_pre) - set(cols_late)))
    return data


def key_field_row_cleaning(data, cols, logger):
    """
    Row cleaning on key fields: drop rows whose key columns contain the
    -99 sentinel or are null.
    :param data: input DataFrame
    :param cols: list of key column names
    :param logger: logger instance
    :return: DataFrame with the offending rows removed
    """
    rows_pre = len(data)
    nan_cols = []
    for col in cols:
        begin = len(data)
        if col in data.columns.tolist():
            # data = data[~((data.loc[:, col] < 0) & (data.loc[:, col].astype(str).str.contains('99')))]
            data = data[~(data[col] == -99)]
            data = data[~data[col].isnull()]
        end = len(data)
        if begin - end > 0:
            nan_cols.append(col)
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        logger.info("Row cleaning: removed {} rows; offending columns: {}".format(rows_pre - rows_late, ', '.join(nan_cols)))
    return data


def rm_duplicated(data, logger):
    """
    De-duplicate rows by timestamp (C_TIME).
    :param data: input DataFrame
    :param logger: logger instance
    :return: DataFrame without duplicate timestamps
    """
    rows_pre = len(data)
    data = data.drop_duplicates(subset='C_TIME')
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        logger.info("Time de-duplication: removed {} rows".format(rows_pre - rows_late))
    return data
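

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module; added for
# illustration). It builds a toy DataFrame with the C_TIME column expected by
# rm_duplicated, a hypothetical "power" column carrying a -99 sentinel, and a
# hypothetical constant "flag" column, then runs the full cleaning pipeline
# with a standard-library logger. Column names other than C_TIME are
# assumptions, not taken from the source.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import logging
    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    demo_logger = logging.getLogger("data_cleaning_demo")

    demo_df = pd.DataFrame({
        "C_TIME": ["2023-10-11 00:00", "2023-10-11 00:15", "2023-10-11 00:15",
                   "2023-10-11 00:30", "2023-10-11 00:45"],
        "power": [10.5, -99, 11.0, 12.0, 13.5],  # one row holds the -99 sentinel
        "flag": [1, 1, 1, 1, 1],                 # constant column, dropped by column cleaning
    })

    # Expected behaviour: "flag" is dropped, the duplicated 00:15 timestamp is
    # removed, and the row with power == -99 is filtered out, leaving 3 rows.
    cleaned = cleaning(demo_df, "demo station", demo_logger, cols=["power"])
    print(cleaned)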