data_filling.py 4.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # time: 2024/5/6 13:52
# file: data_filling.py
  5. # author: David
  6. # company: shenyang JY
  7. import os
  8. import numpy as np
  9. import pandas as pd
  10. from cache.data_cleaning import rm_duplicated
# Seed NumPy's global random generator at import time so that any later
# np.random draws in this process are reproducible.
np.random.seed(42)
  12. class DataProcess(object):
  13. def __init__(self, log, args):
  14. self.logger = log
  15. self.args = args
  16. self.opt = self.args.parse_args_and_yaml()
  17. # 主要是联立后的补值操作
  18. def get_train_data(self, unite, envir):
  19. # unite = pd.merge(unite, envir, on='C_TIME')
  20. unite['C_TIME'] = pd.to_datetime(unite['C_TIME'])
  21. unite['time_diff'] = unite['C_TIME'].diff()
  22. dt_short = pd.Timedelta(minutes=15)
  23. dt_long = pd.Timedelta(minutes=15 * self.opt.Model['how_long_fill'])
  24. data_train = self.missing_time_splite(unite, dt_short, dt_long)
  25. miss_points = unite[(unite['time_diff'] > dt_short) & (unite['time_diff'] < dt_long)]
  26. miss_number = miss_points['time_diff'].dt.total_seconds().sum(axis=0)/(15*60) - len(miss_points)
  27. self.logger.info("再次测算,需要插值的总点数为:{}".format(miss_number))
  28. if miss_number > 0 and self.opt.Model["train_data_fill"]:
  29. data_train = self.data_fill(data_train)
  30. return data_train, envir
  31. def get_predict_data(self, nwp, dq):
  32. if self.opt.Model["predict_data_fill"] and len(dq) > len(nwp):
  33. self.logger.info("接口nwp和dq合并清洗后,需要插值的总点数为:{}".format(len(dq)-len(nwp)))
  34. nwp.set_index('C_TIME', inplace=True)
  35. dq.set_index('C_TIME', inplace=True)
  36. nwp = nwp.resample('15T').interpolate(method='linear') # nwp先进行线性填充
  37. nwp = nwp.reindex(dq.index, method='bfill') # 再对超过采样边缘无法填充的点进行二次填充
  38. nwp = nwp.reindex(dq.index, method='ffill')
  39. nwp.reset_index(drop=False, inplace=True)
  40. dq.reset_index(drop=False, inplace=True)
  41. return nwp
  42. def get_test_data(self, unite, envir):
  43. # 第二步:计算间隔
  44. unite['C_TIME'] = pd.to_datetime(unite['C_TIME'])
  45. unite['time_diff'] = unite['C_TIME'].diff()
  46. dt_short = pd.Timedelta(minutes=15)
  47. dt_long = pd.Timedelta(minutes=15 * self.opt.Model['how_long_fill'])
  48. data_test = self.missing_time_splite(unite, dt_short, dt_long)
  49. miss_points = unite[(unite['time_diff'] > dt_short) & (unite['time_diff'] < dt_long)]
  50. miss_number = miss_points['time_diff'].dt.total_seconds().sum(axis=0) / (15 * 60) - len(miss_points)
  51. self.logger.info("再次测算,需要插值的总点数为:{}".format(miss_number))
  52. if self.opt.Model["test_data_fill"] and miss_number > 0:
  53. data_test = self.data_fill(data_test, test=True)
  54. return data_test, envir
  55. def missing_time_splite(self, df, dt_short, dt_long):
  56. n_long, n_short, n_points = 0, 0, 0
  57. start_index = 0
  58. dfs = []
  59. for i in range(1, len(df)):
  60. if df['time_diff'][i] >= dt_long:
  61. df_long = df.iloc[start_index:i, :-1]
  62. dfs.append(df_long)
  63. start_index = i
  64. n_long += 1
  65. if df['time_diff'][i] > dt_short:
  66. self.logger.info(f"{df['C_TIME'][i-1]} ~ {df['C_TIME'][i]}")
  67. points = df['time_diff'].dt.total_seconds()[i]/(60*15)-1
  68. self.logger.info("缺失点数:{}".format(points))
  69. if df['time_diff'][i] < dt_long:
  70. n_short += 1
  71. n_points += points
  72. self.logger.info("需要补值的点数:{}".format(points))
  73. dfs.append(df.iloc[start_index:, :-1])
  74. self.logger.info(f"数据总数:{len(df)}, 时序缺失的间隔:{n_short}, 其中,较长的时间间隔:{n_long}")
  75. self.logger.info("需要补值的总点数:{}".format(n_points))
  76. return dfs
  77. def data_fill(self, dfs, test=False):
  78. dfs_fill, inserts = [], 0
  79. for i, df in enumerate(dfs):
  80. df = rm_duplicated(df, self.logger)
  81. df1 = df.set_index('C_TIME', inplace=False)
  82. dff = df1.resample('15T').interpolate(method='linear') # 采用线性补值,其他补值方法需要进一步对比
  83. dff.reset_index(inplace=True)
  84. points = len(dff) - len(df1)
  85. dfs_fill.append(dff)
  86. self.logger.info("{} ~ {} 有 {} 个点, 填补 {} 个点.".format(dff.iloc[0, 0], dff.iloc[-1, 0], len(dff), points))
  87. inserts += points
  88. name = "预测数据" if test is True else "训练集"
  89. self.logger.info("{}分成了{}段,实际一共补值{}点".format(name, len(dfs_fill), inserts))
  90. return dfs_fill