data_cleaning.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # time: 2023/10/11 11:00
  4. # file: data_cleaning.py
  5. # author: David
  6. # company: shenyang JY
  7. import numpy as np
  8. def cleaning(df, name, cols=None, dup=True):
  9. print("开始清洗:{}……".format(name))
  10. data = df.copy()
  11. data = data_column_cleaning(data)
  12. if dup:
  13. data = rm_duplicated(data)
  14. if cols is not None:
  15. data = key_field_row_cleaning(data, cols)
  16. else:
  17. data = interpolation(data)
  18. return data
  19. def data_column_cleaning(data, clean_value=[-9999.0, -99, -99.0]):
  20. """
  21. 列的清洗
  22. :param data:
  23. :param clean_value:
  24. :return:
  25. """
  26. cols_pre = data.columns.to_list()
  27. for val in clean_value:
  28. data = data.replace(val, np.nan)
  29. # nan 列超过80% 删除
  30. data = data.dropna(axis=1, thresh=len(data) * 0.8)
  31. # 删除取值全部相同的列
  32. data = data.loc[:, (data != data.iloc[0]).any()]
  33. cols_late = data.columns.tolist()
  34. if len(cols_pre) > len(cols_late):
  35. print("清洗的列有:{}".format(set(cols_pre) - set(cols_late)))
  36. return data
  37. def interpolation(data):
  38. # 剩下的nan进行线性插值
  39. data = data.bfill()
  40. return data
  41. def key_field_row_cleaning(data, cols):
  42. """
  43. 行的重要字段清洗: 过滤含有- 99的数字,过滤空值
  44. :param data:
  45. :param cols: 指定的字段列表
  46. :return:
  47. """
  48. rows_pre = len(data)
  49. for col in cols:
  50. data = data[~((data.loc[:, col] < 0) & (data.loc[:, col].astype(str).str.contains('99')))]
  51. data = data[~data.loc[:, col].isnull()]
  52. rows_late = len(data)
  53. if rows_pre - rows_late > 0:
  54. print("清洗的行数有:", rows_pre-rows_late)
  55. return data
  56. def rm_duplicated(data):
  57. """
  58. 按照时间去重
  59. :param data:
  60. :return:
  61. """
  62. # 按照时间去重
  63. rows_pre = len(data)
  64. data = data.groupby(by='C_TIME').mean()
  65. data.reset_index(inplace=True)
  66. rows_late = len(data)
  67. if rows_pre - rows_late > 0:
  68. print("时间去重的行数有:", rows_pre - rows_late)
  69. return data