Count number of hours per day above thresholds/targets

Note

In this script, I import the weather, precipitation and pollution data.

I compute tmin, tmax, tmean (if there is less than 4/24 missing per AGEB day). I compute mean of pollution, and also count the number of hours above WHO’s thresholds. I add precipitation as well to create a unified dataset.

Then, i compute some quintiles, and look at the share of missing values for each variable.

I save weather and pollution data (day by AGEB level) in weather_cdmx.rds.

1 Import the data

# takes long to read bc they're compressed, but theyre only read once

weather <- 
  read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_redmet_20_ageb.rds"
                ))

pollution <- 
    read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_rama_20_ageb.rds"
                ))
  
prec <- 
  read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_conagua_ageb.rds"))

2 Temperature: averages and count number of hours

We want to take care of the missing values…

weather %>% 
  count(nr_hrs_NA_tmp) %>% 
  mutate(share_pc = 100*n/sum(n)) %>% 
  kable()
nr_hrs_NA_tmp n share_pc
0 7688716 99.8982661
1 4972 0.0646004
2 40 0.0005197
3 8 0.0001039
4 26 0.0003378
5 4 0.0000520
6 1715 0.0222827
7 745 0.0096797
9 3 0.0000390
11 5 0.0000650
12 1 0.0000130
14 2 0.0000260
15 10 0.0001299
16 1 0.0000130
24 298 0.0038719

Now, we count and take avg… IF there are less than 4 NA out of the 24 hourly measurements.

weather <- weather %>% 
  mutate(
    tmean = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ mean(.x, na.rm = TRUE))),
    tmax  = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ max(.x, na.rm = TRUE))),
    tmin  = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ min(.x, na.rm = TRUE))),
    
    temp = (tmin + tmax)/2,
    
    # i keep the list of tmp_hourly, in case i want to count later...

    # tmp_h_above_30C = if_else(nr_hrs_NA_tmp >= 4,
    #                           NA_real_,
    #                           map_int(tmp_hourly, ~ sum(.x > 30, na.rm = TRUE))),
    # tmp_h_above_32C = if_else(nr_hrs_NA_tmp >= 4,
    #                           NA_real_,
    #                           map_int(tmp_hourly, ~ sum(.x > 32, na.rm = TRUE)))
    
  ) 

3 Pollution: averages and count number of hours

We want to take care of the missing values… In very few cases, we have only a few hours per day missing. This should not impact the averages, min max by a lot.

pollution %>% 
  count(nr_hrs_NA_pm25) %>% 
  mutate(share_pc = 100*n/sum(n)) %>% 
  kable()
nr_hrs_NA_pm25 n share_pc
0 8561362 99.7378918
1 8356 0.0973455
2 4903 0.0571188
3 1323 0.0154126
4 995 0.0115915
5 573 0.0066753
6 2664 0.0310350
7 211 0.0024581
8 81 0.0009436
9 156 0.0018174
10 103 0.0011999
11 19 0.0002213
12 43 0.0005009
13 320 0.0037279
14 8 0.0000932
15 64 0.0007456
16 156 0.0018174
17 17 0.0001980
18 4 0.0000466
19 45 0.0005242
20 1 0.0000116
21 13 0.0001514
22 29 0.0003378
23 45 0.0005242
24 2370 0.0276100
# pollution %>% 
#   count(nr_hrs_NA_pm10)

Now, we count and take avg…

pollution <- pollution %>%
  mutate(
      pm25_mean = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ mean(.x, na.rm = TRUE))),
      pm25_max  = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ max(.x, na.rm = TRUE))),
      pm25_min  = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ min(.x, na.rm = TRUE))),
      
      pm10_mean = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ mean(.x, na.rm = TRUE))),
      pm10_max  = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ max(.x, na.rm = TRUE))),
      pm10_min  = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ min(.x, na.rm = TRUE))),
      
      pm25_h_above_aqg = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 25, na.rm = TRUE))),
      pm25_h_above_it3 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 37.5, na.rm = TRUE))),
      pm25_h_above_it2 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 50, na.rm = TRUE))),
      pm25_h_above_it1 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 75, na.rm = TRUE))),
      
      # pm10_h_above_aqg = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 50, na.rm = TRUE))),
      # pm10_h_above_it3 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 75, na.rm = TRUE))),
      # pm10_h_above_it2 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 100, na.rm = TRUE))),
      # pm10_h_above_it1 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 150, na.rm = TRUE)))
    ) %>% 
  #i discard the hourly list to save on memory
  select(-pm25_hourly, -pm10_hourly)

Since pollution is only a control, I focus on PM25 and PM10: No need for co, no, no2, etc

pollution <- pollution %>% 
  select(ageb, fecha, contains("pm25"), contains("pm10")) 

4 Merge the data

data <- weather %>% 
  full_join(pollution, by = c("ageb", "fecha")) %>%
  full_join(prec, by = c("ageb", "fecha")) %>% 
  rename(date = fecha)

rm(prec, pollution, weather)

n_ageb <- data %>% distinct(ageb) %>% nrow()
n_dates <- data %>% distinct(date) %>% nrow()

n_ageb*n_dates
[1] 9016579
data %>% nrow()
[1] 9016579

5 Quintiles for prec, humidity, windspeed

data <- data %>% 
  mutate(prec_quintile = ntile(prec, 5),
         rh_quintile = ntile(rh, 5),
         wsp_quintile = ntile(wsp, 5))

6 Save the data

data %>% 
  mutate(year = lubridate::year(date),
         month = lubridate::month(date, label = T)) %>%
  # thats when the crime data starts
  filter(year >= 2016) %>%
  write_rds(here("..",
                 "output",
                 "temporary_data",
                 "weather",
                 "weather_cdmx.rds"))