Count number of hours per day above thresholds/targets

Note

In this script, I import the weather, precipitation and pollution data.

I compute tmin, tmax, tmean (if there is less than 4/24 missing per AGEB day). I compute mean of pollution, and also count the number of hours above WHO’s thresholds. I add precipitation as well to create a unified dataset.

Then, i compute some quintiles, and look at the share of missing values for each variable.

I save weather and pollution data (day by AGEB level) in weather_cdmx.rds.

1 Import the data

# takes long to read bc they're compressed, but theyre only read once

weather <- 
  read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_redmet_20_ageb.rds"
                ))

pollution <- 
    read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_rama_20_ageb.rds"
                ))
  
prec <- 
  read_rds(here("..", "output",
                "temporary_data",
                "weather",
                "processed_conagua_ageb.rds"))

2 Temperature: averages and count number of hours

We want to take care of the missing values…

weather %>% 
  count(nr_hrs_NA_tmp) %>% 
  mutate(share_pc = 100*n/sum(n)) %>% 
  kable()

nr_hrs_NA_tmp	n	share_pc
0	7688716	99.8982661
1	4972	0.0646004
2	40	0.0005197
3	8	0.0001039
4	26	0.0003378
5	4	0.0000520
6	1715	0.0222827
7	745	0.0096797
9	3	0.0000390
11	5	0.0000650
12	1	0.0000130
14	2	0.0000260
15	10	0.0001299
16	1	0.0000130
24	298	0.0038719

Now, we count and take avg… IF there are less than 4 NA out of the 24 hourly measurements.

weather <- weather %>% 
  mutate(
    tmean = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ mean(.x, na.rm = TRUE))),
    tmax  = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ max(.x, na.rm = TRUE))),
    tmin  = if_else(nr_hrs_NA_tmp >= 4,
                    NA_real_,
                    map_dbl(tmp_hourly, ~ min(.x, na.rm = TRUE))),
    
    temp = (tmin + tmax)/2,
    
    # i keep the list of tmp_hourly, in case i want to count later...

    # tmp_h_above_30C = if_else(nr_hrs_NA_tmp >= 4,
    #                           NA_real_,
    #                           map_int(tmp_hourly, ~ sum(.x > 30, na.rm = TRUE))),
    # tmp_h_above_32C = if_else(nr_hrs_NA_tmp >= 4,
    #                           NA_real_,
    #                           map_int(tmp_hourly, ~ sum(.x > 32, na.rm = TRUE)))
    
  )

3 Pollution: averages and count number of hours

We want to take care of the missing values… In very few cases, we have only a few hours per day missing. This should not impact the averages, min max by a lot.

pollution %>% 
  count(nr_hrs_NA_pm25) %>% 
  mutate(share_pc = 100*n/sum(n)) %>% 
  kable()

nr_hrs_NA_pm25	n	share_pc
0	8561362	99.7378918
1	8356	0.0973455
2	4903	0.0571188
3	1323	0.0154126
4	995	0.0115915
5	573	0.0066753
6	2664	0.0310350
7	211	0.0024581
8	81	0.0009436
9	156	0.0018174
10	103	0.0011999
11	19	0.0002213
12	43	0.0005009
13	320	0.0037279
14	8	0.0000932
15	64	0.0007456
16	156	0.0018174
17	17	0.0001980
18	4	0.0000466
19	45	0.0005242
20	1	0.0000116
21	13	0.0001514
22	29	0.0003378
23	45	0.0005242
24	2370	0.0276100

# pollution %>% 
#   count(nr_hrs_NA_pm10)

Now, we count and take avg…

pollution <- pollution %>%
  mutate(
      pm25_mean = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ mean(.x, na.rm = TRUE))),
      pm25_max  = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ max(.x, na.rm = TRUE))),
      pm25_min  = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_dbl(pm25_hourly, ~ min(.x, na.rm = TRUE))),
      
      pm10_mean = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ mean(.x, na.rm = TRUE))),
      pm10_max  = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ max(.x, na.rm = TRUE))),
      pm10_min  = if_else(nr_hrs_NA_pm10 >= 4,
                          NA_real_,
                          map_dbl(pm10_hourly, ~ min(.x, na.rm = TRUE))),
      
      pm25_h_above_aqg = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 25, na.rm = TRUE))),
      pm25_h_above_it3 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 37.5, na.rm = TRUE))),
      pm25_h_above_it2 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 50, na.rm = TRUE))),
      pm25_h_above_it1 = if_else(nr_hrs_NA_pm25 >= 4,
                          NA_real_,
                          map_int(pm25_hourly, ~ sum(.x > 75, na.rm = TRUE))),
      
      # pm10_h_above_aqg = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 50, na.rm = TRUE))),
      # pm10_h_above_it3 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 75, na.rm = TRUE))),
      # pm10_h_above_it2 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 100, na.rm = TRUE))),
      # pm10_h_above_it1 = if_else(nr_hrs_NA_pm10 >= 4,
      #                     NA_real_,
      #                     map_int(pm10_hourly, ~ sum(.x > 150, na.rm = TRUE)))
    ) %>% 
  #i discard the hourly list to save on memory
  select(-pm25_hourly, -pm10_hourly)

Since pollution is only a control, I focus on PM25 and PM10: No need for co, no, no2, etc

pollution <- pollution %>% 
  select(ageb, fecha, contains("pm25"), contains("pm10"))

4 Merge the data

data <- weather %>% 
  full_join(pollution, by = c("ageb", "fecha")) %>%
  full_join(prec, by = c("ageb", "fecha")) %>% 
  rename(date = fecha)

rm(prec, pollution, weather)

n_ageb <- data %>% distinct(ageb) %>% nrow()
n_dates <- data %>% distinct(date) %>% nrow()

n_ageb*n_dates

[1] 9016579

data %>% nrow()

[1] 9016579

5 Quintiles for prec, humidity, windspeed

data <- data %>% 
  mutate(prec_quintile = ntile(prec, 5),
         rh_quintile = ntile(rh, 5),
         wsp_quintile = ntile(wsp, 5))

6 Save the data

data %>% 
  mutate(year = lubridate::year(date),
         month = lubridate::month(date, label = T)) %>%
  # thats when the crime data starts
  filter(year >= 2016) %>%
  write_rds(here("..",
                 "output",
                 "temporary_data",
                 "weather",
                 "weather_cdmx.rds"))