processing_data_common.py 1.1 KB

1234567891011121314151617181920212223242526272829303132
  1. import random
  2. def str_to_list(arg):
  3. if arg == '':
  4. return []
  5. else:
  6. return arg.split(',')
  7. # 随机生成唯一颜色
  8. def generate_unique_colors(num_colors):
  9. generated_colors = set()
  10. while len(generated_colors) < num_colors:
  11. color = f"rgb({random.randint(0, 255)}, {random.randint(0, 255)}, {random.randint(0, 255)})"
  12. generated_colors.add(color)
  13. return list(generated_colors)
  14. def missing_features(df, features, col_time, threshold=0.2):
  15. df['day'] = df[col_time].str[:10]
  16. # 按日期分组,计算缺失率
  17. missing_rates = df[['day']+features].groupby('day').apply(
  18. lambda group: (group.isnull().sum() / group.shape[0]).mean()
  19. )
  20. # 筛选特征平均缺失率大于 20% 的日期
  21. days_with_high_missing = missing_rates[missing_rates >= threshold].index
  22. # 打印结果
  23. print("特征缺失率超过50%的日期:",days_with_high_missing)
  24. print()
  25. print("**********删除前维度", df.shape)
  26. df = df[~df['day'].isin(days_with_high_missing)]
  27. print("**********删除后维度", df.shape)
  28. return df.drop('day',axis=1)