# missingRate.py (1.7 KB)
import numpy as np
import pandas as pd
  3. def calculate_missing_rate(file_path):
  4. # 读取CSV文件
  5. df = pd.read_csv(file_path)
  6. # 确保C_TIME列的格式是字符串并转换为datetime对象
  7. df['C_TIME'] = pd.to_datetime(df['C_TIME'], format='%Y-%m-%d %H:%M:%S')
  8. # 提取年月信息作为分组依据
  9. df['YearMonth'] = df['C_TIME'].dt.to_period('M')
  10. # 按月份分组处理数据
  11. grouped = df.groupby('YearMonth')
  12. # 结果字典保存每月缺失率和缺失的时间点
  13. missing_rates = {}
  14. for period, group in grouped:
  15. # 获取该月的最小和最大时间
  16. start_time = group['C_TIME'].min()
  17. end_time = group['C_TIME'].max()
  18. # 生成该月所有的15分钟时间点
  19. full_time_range = pd.date_range(start=start_time, end=end_time, freq='15T')
  20. # 找到该月的时间点缺失部分
  21. missing_times = full_time_range.difference(group['C_TIME'])
  22. # 计算缺失率:缺失时间点的数量 / 全部应该有的时间点数量
  23. missing_rate = len(missing_times) / len(full_time_range)
  24. # 保存结果
  25. missing_rates[period] = {
  26. 'missing_rate': missing_rate,
  27. 'missing_times': missing_times
  28. }
  29. # 输出结果
  30. for period, data in missing_rates.items():
  31. print(f"Month: {period}, Missing Rate: {data['missing_rate']:.2%}")
  32. if len(data['missing_times']) > 0:
  33. print(f" Missing Time Points:")
  34. for missing_time in data['missing_times']:
  35. print(f" {missing_time}")
  36. else:
  37. print(" No missing time points")
  38. # 调用函数,假设文件路径是'tower-1-process.csv'
  39. file_path = 'weather-1-process.csv'
  40. calculate_missing_rate(file_path)