# processing_data_common.py

import random
from datetime import date, timedelta
  3. def str_to_list(arg):
  4. if arg == '':
  5. return []
  6. else:
  7. return arg.split(',')
  8. # 随机生成唯一颜色
  9. def generate_unique_colors(num_colors):
  10. generated_colors = set()
  11. while len(generated_colors) < num_colors:
  12. color = f"rgb({random.randint(0, 255)}, {random.randint(0, 255)}, {random.randint(0, 255)})"
  13. generated_colors.add(color)
  14. return list(generated_colors)
  15. def missing_features(df, features, col_time, threshold=0.2):
  16. df['day'] = df[col_time].str[:10]
  17. # 按日期分组,计算缺失率
  18. missing_rates = df[['day']+features].groupby('day').apply(
  19. lambda group: (group.isnull().sum() / group.shape[0]).mean()
  20. )
  21. # 筛选特征平均缺失率大于 20% 的日期
  22. days_with_high_missing = missing_rates[missing_rates >= threshold].index
  23. # 打印结果
  24. print("特征缺失率超过50%的日期:",days_with_high_missing)
  25. print()
  26. print("**********删除前维度", df.shape)
  27. df = df[~df['day'].isin(days_with_high_missing)]
  28. print("**********删除后维度", df.shape)
  29. return df.drop('day',axis=1)
  30. def check_nwp_data(nwp_df,features):
  31. tomorrow = (date.today() + timedelta(days=1)).strftime('%Y-%m-%d')
  32. if ~all(item in nwp_df.columns for item in features):
  33. diff = set(features)-set(nwp_df.columns)
  34. message = f"NWP特征列缺失!features:{diff}"
  35. #判断日前短期NWP是否缺数据
  36. elif len(nwp_df[nwp_df['date_time'].contains(tomorrow)])<96:
  37. message = "日前数据记录缺失,不足96条!"
  38. else:
  39. message=''
  40. return message