#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/10/11 11:00
# file: data_cleaning.py
# author: David
# company: shenyang JY
import numpy as np
import pandas as pd
from app.common.config import logger, params

np.random.seed(42)

def clean_power(power, env, plant_id):
    """Merge environmental and power data on the shared time column, then
    strip limited-power (curtailed) records, picking the wind or solar
    cleaner by whether the env data carries 'HubSpeed' or 'Irradiance'."""
    env_power = pd.merge(env, power, on=params['col_time'])
    if 'HubSpeed' in env.columns.tolist():
        from app.common.limited_power_wind import LimitPower
        lp = LimitPower(logger, params, env_power)
        power = lp.clean_limited_power(plant_id, True)
    elif 'Irradiance' in env.columns.tolist():
        from app.common.limited_power_solar import LimitPower
        lp = LimitPower(logger, params, env_power)
        power = lp.clean_limited_power(plant_id, True)
    return power
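
# Expected call shape (a sketch; the frame layouts are assumptions, the real
# ones come from the app.common.config params):
#   cleaned = clean_power(power_df, env_df, plant_id)
# where env_df carries 'HubSpeed' (wind) or 'Irradiance' (solar) plus the
# shared time column named by params['col_time'].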

def cleaning(df, name, logger, cols=None, dup=True, col_time='Datetime'):
    """Generic cleaning entry point: column cleaning, optional time-based
    deduplication, then key-field row cleaning on the given columns."""
    logger.info("Start cleaning: {} ...".format(name))
    data = df.copy()
    data = data_column_cleaning(data, logger)
    if dup:
        data = rm_duplicated(data, logger, col_time)
    if cols is not None:
        data = key_field_row_cleaning(data, cols, logger)
    return data

def data_column_cleaning(data, logger, clean_value=('null',)):
    """
    Column cleaning: treat the markers in clean_value as NaN, then drop
    NaN-heavy columns and columns whose values are all identical.
    :param data: input DataFrame
    :param logger: logger instance
    :param clean_value: iterable of placeholder markers to treat as NaN
    :return: DataFrame restricted to the surviving columns
    """
    data1 = data.copy()
    cols_pre = data.columns.to_list()
    for val in clean_value:
        data1 = data1.replace(val, np.nan)
    # Drop columns with fewer than 80% non-NaN values
    data1 = data1.dropna(axis=1, thresh=int(len(data) * 0.8))
    # Drop columns whose values are all identical
    data1 = data1.loc[:, (data1 != data1.iloc[0]).any()]
    data = data[data1.columns.tolist()]
    cols_late = data.columns.tolist()
    if len(cols_pre) > len(cols_late):
        logger.info("Column cleaning: dropped columns: {}".format(set(cols_pre) - set(cols_late)))
    return data

def key_field_row_cleaning(data, cols, logger):
    """
    Row cleaning on key fields: drop rows holding the -99 sentinel or a
    missing value in any of the given columns.
    :param data: input DataFrame
    :param cols: list of key columns to check
    :param logger: logger instance
    :return: cleaned DataFrame
    """
    rows_pre = len(data)
    nan_cols = []
    for col in cols:
        begin = len(data)
        if col in data.columns.tolist():
            # data = data[~((data.loc[:, col] < 0) & (data.loc[:, col].astype(str).str.contains('99')))]
            data = data[~(data[col] == -99)]
            data = data[~data.loc[:, col].isnull()]
        end = len(data)
        if begin - end > 0:
            nan_cols.append(col)
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        logger.info("Row cleaning: removed {} rows; offending columns: {}".format(rows_pre - rows_late, ', '.join(nan_cols)))
    return data

def rm_duplicated(data, logger, col_time='Datetime'):
    """
    Deduplicate rows by timestamp.
    :param data: input DataFrame
    :param logger: logger instance
    :param col_time: name of the time column
    :return: deduplicated DataFrame
    """
    rows_pre = len(data)
    data = data.drop_duplicates(subset=col_time)
    rows_late = len(data)
    if rows_pre - rows_late > 0:
        logger.info("Timestamp deduplication removed {} rows".format(rows_pre - rows_late))
    return data
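
# ---------------------------------------------------------------------------
# Minimal smoke test (a sketch, not part of the original module): feeds a tiny
# synthetic frame with an all-'null' column, a constant column, a -99
# sentinel, a missing value and a duplicated timestamp through cleaning().
# Column names and values are illustrative; running this still requires the
# app.common.config import at the top of the file to resolve.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    demo_logger = logging.getLogger('data_cleaning_demo')

    demo = pd.DataFrame({
        'Datetime': ['2023-10-11 00:00', '2023-10-11 00:15',
                     '2023-10-11 00:15', '2023-10-11 00:30'],
        'Power': [1.2, -99, 3.4, np.nan],           # sentinel + missing value
        'Status': ['null', 'null', 'null', 'null'],  # all-'null', NaN-heavy
        'Flag': [1, 1, 1, 1],                        # constant column
    })
    # Drops 'Status' and 'Flag', removes the duplicated 00:15 row, then the
    # -99 and NaN rows of 'Power'; one row should survive.
    cleaned = cleaning(demo, 'demo', demo_logger, cols=['Power'],
                       dup=True, col_time='Datetime')
    print(cleaned)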