
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/10/11 11:00
# file: data_cleaning.py
# author: David
# company: shenyang JY
import numpy as np

np.random.seed(42)


def cleaning(df, name, cols=None, dup=True):
    print("Start cleaning: {}...".format(name))
    data = df.copy()
    data = data_column_cleaning(data)
    if dup:
        data = rm_duplicated(data)
    if cols is not None:
        data = key_field_row_cleaning(data, cols)
    return data


def data_column_cleaning(data, clean_value=(-99.0, -99)):
    """
    Column-level cleaning.
    :param data: input DataFrame
    :param clean_value: sentinel values to treat as missing
    :return: DataFrame with bad columns dropped
    """
    data1 = data.copy()
    cols_pre = data.columns.to_list()
    for val in clean_value:
        data1 = data1.replace(val, np.nan)
    # Drop columns with fewer than 80% non-NaN values
    data1 = data1.dropna(axis=1, thresh=len(data) * 0.8)
    # Drop columns whose values are all identical
    data1 = data1.loc[:, (data1 != data1.iloc[0]).any()]
    data = data[data1.columns.tolist()]
    cols_late = data.columns.tolist()
    if len(cols_pre) > len(cols_late):
        print("Columns removed: {}".format(set(cols_pre) - set(cols_late)))
    return data


def interpolation(data):
    # Despite the name, remaining NaN values are back-filled rather than linearly interpolated
    data = data.bfill()
    return data


def key_field_row_cleaning(data, cols):
    """
    Row-level cleaning on key fields: drop rows containing -99 sentinels and rows with null values.
    :param data: input DataFrame
    :param cols: list of key column names
    :return: filtered DataFrame
    """
    rows_pre = len(data)
    for col in cols:
        if col in data.columns.tolist():
            # Drop rows where the value is negative and its string form contains '99' (i.e. -99 sentinels)
            data = data[~((data.loc[:, col] < 0) & (data.loc[:, col].astype(str).str.contains('99')))]
            data = data[~data.loc[:, col].isnull()]
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        print("Rows removed:", rows_pre - rows_late)
    return data


def rm_duplicated(data):
    """
    Deduplicate rows by timestamp.
    :param data: input DataFrame
    :return: deduplicated DataFrame
    """
    # Keep the first row for each C_TIME value
    rows_pre = len(data)
    data = data.drop_duplicates(subset='C_TIME')
    # data = data.groupby(by='C_TIME').mean()
    # data.reset_index(inplace=True)
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        print("Rows removed by time deduplication:", rows_pre - rows_late)
    return data


if __name__ == '__main__':
    import pandas as pd

    power = pd.read_csv('./data/power.csv')
    x = data_column_cleaning(power)
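    # A minimal sketch of the full pipeline (an assumption, not part of the original script):
    # it presumes power.csv carries a C_TIME column, since rm_duplicated() dedupes on it,
    # and any additional key columns would depend on the real schema.
    # cleaned = cleaning(power, 'power', cols=['C_TIME'])
    # cleaned = interpolation(cleaned)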