processing_data_common.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import random
  2. from datetime import date, timedelta
  3. import pandas as pd
  4. def str_to_list(arg):
  5. if arg == '':
  6. return []
  7. else:
  8. return arg.split(',')
  9. # 随机生成唯一颜色
  10. def generate_unique_colors(num_colors):
  11. generated_colors = set()
  12. while len(generated_colors) < num_colors:
  13. color = f"rgb({random.randint(0, 255)}, {random.randint(0, 255)}, {random.randint(0, 255)})"
  14. generated_colors.add(color)
  15. return list(generated_colors)
  16. def missing_features(df, features, col_time, threshold=0.2):
  17. df['day'] = df[col_time].str[:10]
  18. # 按日期分组,计算缺失率
  19. missing_rates = df[['day']+features].groupby('day').apply(
  20. lambda group: (group.isnull().sum() / group.shape[0]).mean()
  21. )
  22. # 筛选特征平均缺失率大于 20% 的日期
  23. days_with_high_missing = missing_rates[missing_rates >= threshold].index
  24. # 打印结果
  25. print("特征缺失率超过50%的日期:",days_with_high_missing)
  26. print()
  27. print("**********删除前维度", df.shape)
  28. df = df[~df['day'].isin(days_with_high_missing)]
  29. print("**********删除后维度", df.shape)
  30. return df.drop('day',axis=1)
  31. def check_nwp_data(nwp_df,features):
  32. tomorrow = (date.today() + timedelta(days=1)).strftime('%Y-%m-%d')
  33. if ~all(item in nwp_df.columns for item in features):
  34. diff = set(features)-set(nwp_df.columns)
  35. message = f"NWP特征列缺失!features:{diff}"
  36. #判断日前短期NWP是否缺数据
  37. elif len(nwp_df[nwp_df['date_time'].contains(tomorrow)])<96:
  38. message = "日前数据记录缺失,不足96条!"
  39. else:
  40. message=''
  41. return message
  42. def get_xxl_dq(farm_id, dt):
  43. path_dir = f'data_processing/cache/data/xxl/{farm_id}/meteoforce_{farm_id}_{dt}06_power.csv'
  44. df = pd.read_csv(path_dir,index_col=0)
  45. return df[['farm_id', 'date_time', 'power_forecast']]