# takes long to read bc they're compressed, but theyre only read once
<-
weather read_rds(here("..", "output",
"temporary_data",
"weather",
"processed_redmet_20_ageb.rds"
))
<-
pollution read_rds(here("..", "output",
"temporary_data",
"weather",
"processed_rama_20_ageb.rds"
))
<-
prec read_rds(here("..", "output",
"temporary_data",
"weather",
"processed_conagua_ageb.rds"))
Count number of hours per day above thresholds/targets
In this script, I import the weather, precipitation and pollution data.
I compute tmin, tmax, tmean (if there is less than 4/24 missing per AGEB day). I compute mean of pollution, and also count the number of hours above WHO’s thresholds. I add precipitation as well to create a unified dataset.
Then, i compute some quintiles, and look at the share of missing values for each variable.
I save weather and pollution data (day by AGEB level) in weather_cdmx.rds
.
1 Import the data
2 Temperature: averages and count number of hours
We want to take care of the missing values…
%>%
weather count(nr_hrs_NA_tmp) %>%
mutate(share_pc = 100*n/sum(n)) %>%
kable()
nr_hrs_NA_tmp | n | share_pc |
---|---|---|
0 | 7688716 | 99.8982661 |
1 | 4972 | 0.0646004 |
2 | 40 | 0.0005197 |
3 | 8 | 0.0001039 |
4 | 26 | 0.0003378 |
5 | 4 | 0.0000520 |
6 | 1715 | 0.0222827 |
7 | 745 | 0.0096797 |
9 | 3 | 0.0000390 |
11 | 5 | 0.0000650 |
12 | 1 | 0.0000130 |
14 | 2 | 0.0000260 |
15 | 10 | 0.0001299 |
16 | 1 | 0.0000130 |
24 | 298 | 0.0038719 |
Now, we count and take avg… IF there are less than 4 NA out of the 24 hourly measurements.
<- weather %>%
weather mutate(
tmean = if_else(nr_hrs_NA_tmp >= 4,
NA_real_,
map_dbl(tmp_hourly, ~ mean(.x, na.rm = TRUE))),
tmax = if_else(nr_hrs_NA_tmp >= 4,
NA_real_,
map_dbl(tmp_hourly, ~ max(.x, na.rm = TRUE))),
tmin = if_else(nr_hrs_NA_tmp >= 4,
NA_real_,
map_dbl(tmp_hourly, ~ min(.x, na.rm = TRUE))),
temp = (tmin + tmax)/2,
# i keep the list of tmp_hourly, in case i want to count later...
# tmp_h_above_30C = if_else(nr_hrs_NA_tmp >= 4,
# NA_real_,
# map_int(tmp_hourly, ~ sum(.x > 30, na.rm = TRUE))),
# tmp_h_above_32C = if_else(nr_hrs_NA_tmp >= 4,
# NA_real_,
# map_int(tmp_hourly, ~ sum(.x > 32, na.rm = TRUE)))
)
3 Pollution: averages and count number of hours
We want to take care of the missing values… In very few cases, we have only a few hours per day missing. This should not impact the averages, min max by a lot.
%>%
pollution count(nr_hrs_NA_pm25) %>%
mutate(share_pc = 100*n/sum(n)) %>%
kable()
nr_hrs_NA_pm25 | n | share_pc |
---|---|---|
0 | 8561362 | 99.7378918 |
1 | 8356 | 0.0973455 |
2 | 4903 | 0.0571188 |
3 | 1323 | 0.0154126 |
4 | 995 | 0.0115915 |
5 | 573 | 0.0066753 |
6 | 2664 | 0.0310350 |
7 | 211 | 0.0024581 |
8 | 81 | 0.0009436 |
9 | 156 | 0.0018174 |
10 | 103 | 0.0011999 |
11 | 19 | 0.0002213 |
12 | 43 | 0.0005009 |
13 | 320 | 0.0037279 |
14 | 8 | 0.0000932 |
15 | 64 | 0.0007456 |
16 | 156 | 0.0018174 |
17 | 17 | 0.0001980 |
18 | 4 | 0.0000466 |
19 | 45 | 0.0005242 |
20 | 1 | 0.0000116 |
21 | 13 | 0.0001514 |
22 | 29 | 0.0003378 |
23 | 45 | 0.0005242 |
24 | 2370 | 0.0276100 |
# pollution %>%
# count(nr_hrs_NA_pm10)
Now, we count and take avg…
<- pollution %>%
pollution mutate(
pm25_mean = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_dbl(pm25_hourly, ~ mean(.x, na.rm = TRUE))),
pm25_max = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_dbl(pm25_hourly, ~ max(.x, na.rm = TRUE))),
pm25_min = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_dbl(pm25_hourly, ~ min(.x, na.rm = TRUE))),
pm10_mean = if_else(nr_hrs_NA_pm10 >= 4,
NA_real_,
map_dbl(pm10_hourly, ~ mean(.x, na.rm = TRUE))),
pm10_max = if_else(nr_hrs_NA_pm10 >= 4,
NA_real_,
map_dbl(pm10_hourly, ~ max(.x, na.rm = TRUE))),
pm10_min = if_else(nr_hrs_NA_pm10 >= 4,
NA_real_,
map_dbl(pm10_hourly, ~ min(.x, na.rm = TRUE))),
pm25_h_above_aqg = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_int(pm25_hourly, ~ sum(.x > 25, na.rm = TRUE))),
pm25_h_above_it3 = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_int(pm25_hourly, ~ sum(.x > 37.5, na.rm = TRUE))),
pm25_h_above_it2 = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_int(pm25_hourly, ~ sum(.x > 50, na.rm = TRUE))),
pm25_h_above_it1 = if_else(nr_hrs_NA_pm25 >= 4,
NA_real_,
map_int(pm25_hourly, ~ sum(.x > 75, na.rm = TRUE))),
# pm10_h_above_aqg = if_else(nr_hrs_NA_pm10 >= 4,
# NA_real_,
# map_int(pm10_hourly, ~ sum(.x > 50, na.rm = TRUE))),
# pm10_h_above_it3 = if_else(nr_hrs_NA_pm10 >= 4,
# NA_real_,
# map_int(pm10_hourly, ~ sum(.x > 75, na.rm = TRUE))),
# pm10_h_above_it2 = if_else(nr_hrs_NA_pm10 >= 4,
# NA_real_,
# map_int(pm10_hourly, ~ sum(.x > 100, na.rm = TRUE))),
# pm10_h_above_it1 = if_else(nr_hrs_NA_pm10 >= 4,
# NA_real_,
# map_int(pm10_hourly, ~ sum(.x > 150, na.rm = TRUE)))
%>%
) #i discard the hourly list to save on memory
select(-pm25_hourly, -pm10_hourly)
Since pollution is only a control, I focus on PM25 and PM10: No need for co, no, no2, etc
<- pollution %>%
pollution select(ageb, fecha, contains("pm25"), contains("pm10"))
4 Merge the data
<- weather %>%
data full_join(pollution, by = c("ageb", "fecha")) %>%
full_join(prec, by = c("ageb", "fecha")) %>%
rename(date = fecha)
rm(prec, pollution, weather)
<- data %>% distinct(ageb) %>% nrow()
n_ageb <- data %>% distinct(date) %>% nrow()
n_dates
*n_dates n_ageb
[1] 9016579
%>% nrow() data
[1] 9016579
5 Quintiles for prec, humidity, windspeed
<- data %>%
data mutate(prec_quintile = ntile(prec, 5),
rh_quintile = ntile(rh, 5),
wsp_quintile = ntile(wsp, 5))
6 Save the data
%>%
data mutate(year = lubridate::year(date),
month = lubridate::month(date, label = T)) %>%
# thats when the crime data starts
filter(year >= 2016) %>%
write_rds(here("..",
"output",
"temporary_data",
"weather",
"weather_cdmx.rds"))