Count, bin, and merge data sources

Note

In this script, I import the reports, calls, weather, and pollution data.

I count the reports/calls per day, per AGEB/neighborhood. I also split e.g. per type of crime, per time of day etc.

I then create bins with the weather data.

I merge all the data sources into two final datasets that I use for regressions:

  • data_reports.rds which is at the AGEB per day level
  • data_calls.rds which is at the neighborhood per day level

1 Import the pre-processed data

# Load the spatial units and the pre-processed event / weather data.
# All paths are resolved relative to the project root via here().

# AGEB polygons (census spatial units), used for the reports panel
poligonos <-
  read_sf(here("..", "output", "ageb_poligonos.geojson"),
          quiet = TRUE) 

# Neighborhood (colonia) polygons, used for the calls panel
colonias <-
  read_sf(here("..", "output", "colonias.gpkg"),
          quiet = TRUE)

# Locatel calls: derive calendar date and hour-of-day from the intake
# columns; .keep = "unused" drops the raw fecha_alta / hora_alta
calls <-
  read_rds(here("..", "output", "temporary_data", "dv", "locatel.rds")) %>% 
  mutate(date = as_date(fecha_alta),
         hour = as.numeric(str_sub(hora_alta, 1, 2)),
         .keep = "unused")

# Crime reports: geometry is dropped because the counts below only
# need the ageb identifier, not the point locations
reports <- 
  read_rds(here("..", "output", "temporary_data", "dv", "PGJ_carpetas.rds")) %>% 
  st_drop_geometry() %>% 
  mutate(hour = lubridate::hour(dttm))

# Weather data; date arrives as text (ymd-parseable), parse it once here
weather <-
  read_rds(here("..", "output", "temporary_data", "weather", "weather_cdmx.rds")) %>% 
  mutate(date = ymd(date)) 

2 Time dimension

Create a complete time dataset

I start by creating a dataset for the time dimension, at the day level.

# Create a day-level dataset (one row per date) that I populate along the way.
# The spatial units (AGEBs / colonias) are only attached later through joins,
# so this table is day-level, not ageb x day.
# expand_grid() with a single column is just a tibble, so build it directly.
time <- tibble(
  date = seq(ymd("2016-01-01"), ymd("2024-12-31"), by = "days")
)

I then add variables for the fixed effects, e.g. day of week, month, weekend dummy…

# Create some useful time variables for the fixed effects:
# calendar parts, day-of-week, a weekend dummy, and day-of-year labels.
# TRUE/FALSE spelled out (T/F are reassignable and unsafe).
time <- time %>% 
  mutate(year = year(date),
         month = lubridate::month(date, label = TRUE),
         day = lubridate::day(date),
         day_of_week = lubridate::wday(date, label = TRUE),
         day_of_year = paste(lubridate::mday(date),
                             lubridate::month(date, label = TRUE), sep = "-"),
         weekend_dummy = ifelse(day_of_week %in% c("Sat", "Sun"), 
                                "Weekend",
                                "Weekdays"),
         calendar_day = paste(month, day)) 

Pandemic timeline

I set the pandemic period and create a variable. Here is a timeline of data coverage and covid phases.

# My own indicators of covid phases, based on the COVID appendix.
# case_when() stops at the first TRUE, so the cutoffs can be read
# chronologically: before 2020-02-15, before 2021-11-01, and after.
time <- time %>%
  mutate(covid_phase = case_when(
    date < ymd("2020-02-15") ~ "Pre-pandemic",
    date < ymd("2021-11-01") ~ "Pandemic",
    .default = "Post-pandemic"
  ))

# Summarise the phase boundaries. The inclusive day count needs + 1:
# end_date - start_date alone drops one endpoint (e.g. the pre-pandemic
# span 2016-01-01..2020-02-14 contains 1506 days, not 1505).
time %>%
  select(date, covid_phase) %>% 
  group_by(covid_phase) %>% 
  summarise(start_date = min(date), end_date = max(date)) %>% 
  mutate(n_days = as.integer(end_date - start_date) + 1) %>% 
  arrange(start_date) %>% 
  gt()
covid_phase start_date end_date n_days
Pre-pandemic 2016-01-01 2020-02-14 1505
Pandemic 2020-02-15 2021-10-31 624
Post-pandemic 2021-11-01 2024-12-31 1156

Data coverage timeline

# Timeline plot: one rectangle per covid phase plus one per data source,
# stacked vertically, with the label centered inside each rectangle.
time %>%
  select(date, covid_phase) %>% 
  group_by(covid_phase) %>% 
  summarise(start = min(date), end = max(date)) %>%
  rename(phase = covid_phase) %>% 
  mutate(ymin = 0, ymax = 100, ymed = 50)  %>% 
  add_row(phase = "Call data" ,
          start = as_date(min(calls$date)),
          end = as_date(max(calls$date)), 
          ymin = 120, ymax = 220,  ymed = 170) %>% 
  add_row(phase = "Crime data" ,
          start = as_date(min(reports$date)),
          end = as_date(max(reports$date)), 
          ymin = 240, ymax = 340,  ymed = 290) %>% 
  add_row(phase = "Weather data" ,
          start = as_date(min(weather$date)),
          end = as_date(max(weather$date)), 
          ymin = 360, ymax = 460,  ymed = 410) %>% 
  # horizontal midpoint of each bar, for label placement
  mutate(midpoint = start + (end - start) / 2) %>% 
ggplot(aes(xmin = start, xmax = end, ymin = ymin, ymax = ymax, fill = phase)) +
  geom_rect(colour = "white") + 
  scale_fill_brewer(palette = "Set3") +
  # size is a fixed aesthetic, not a data mapping: putting size = .3 inside
  # aes() maps a constant to a size scale (tiny, scale-driven text); set it
  # outside aes() instead
  geom_text(aes(x = midpoint, y = ymed, label = phase), size = 3) +
  xlab(NULL) +  ylab(NULL) +
  scale_x_date(breaks = "years", date_labels = "%d %b %y ") +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  theme(legend.position = "none") +
  guides(x = guide_axis(angle = 45))

Holidays and school holidays

Import the holiday days and create a dummy in the dataset. I do not have school holidays (yet) because it’s a bit cumbersome.

# Import all yearly public-holiday CSVs and stack them into one tibble.
# The pattern is anchored so only files ending in .csv are picked up
# (a bare "csv" would also match e.g. "notes_csv_old.txt").
public_holidays <- 
  list.files(here("..", "raw_data", "8_holidays"),
             pattern = "\\.csv$", full.names = TRUE) %>% 
  map_dfr(read_csv) 

public_holidays %>% filter(year(Date) == 2016) %>% 
  kable()
Date LocalName Name CountryCode Fixed Global LaunchYear Type Counties
2016-01-01 Año Nuevo New Year’s Day MX FALSE TRUE NA Public NA
2016-02-01 Día de la Constitución Constitution Day MX FALSE TRUE NA Public NA
2016-03-21 Natalicio de Benito Juárez Benito Juárez’s birthday MX FALSE TRUE NA Public NA
2016-05-02 Día del Trabajo Labor Day MX FALSE TRUE NA Public NA
2016-09-16 Día de la Independencia Independence Day MX FALSE TRUE NA Public NA
2016-11-21 Día de la Revolución Revolution Day MX FALSE TRUE NA Public NA
2016-12-25 Navidad Christmas Day MX TRUE TRUE NA Public NA
# Flag the dates that appear in the public-holiday table
time <- time %>% 
  mutate(holiday = date %in% pull(public_holidays, Date))

time %>% count(holiday) %>% kable()
holiday n
FALSE 3223
TRUE 65
rm(public_holidays)

3 Count DV events

I want to count the number of domestic violence reports per day and AGEB. For the call data, I count by day and neighborhood.

There are thus two distinct datasets.

Reports

Now, count the reports per day per AGEB. We basically go from incident level to day per AGEB level.

Count all crimes

I first count all crimes together per day per AGEB

# Total reports (all crime types together) per day x AGEB;
# count() is group_by + summarise(n()) + ungroup in one call
count_reports <- reports %>%
  count(date, ageb, name = "reports_all")

Count by type of crime

Now, I want to count crimes by category. Of course, DV is the main one, but for comparison, I also count some other crime types.

fct_count(reports$delito_lumped, sort = T, prop = T) %>% head(20) %>% kable()
f n p
Theft 769554 0.4096264
Other 471889 0.2511821
Domestic violence 237691 0.1265207
Fraud 134536 0.0716122
Threats 126262 0.0672081
Drugs related 38173 0.0203191
Sexual crimes 37768 0.0201036
Trust abuse 32118 0.0170961
Homicide 13889 0.0073930
Rapes 11327 0.0060293
Suicides 4614 0.0024560
Feminicide 852 0.0004535
# Collapse the detailed crime labels into a tracked set of categories;
# everything not listed falls into "other".
# Refer to the column directly inside mutate() (not reports$delito_lumped):
# the $-form bypasses data masking and silently breaks under grouping or
# if the pipeline gains a filter upstream.
reports <- reports %>% 
  mutate(tracked = fct_collapse(delito_lumped,
                                theft = "Theft",
                                dv = "Domestic violence",
                                fraud = "Fraud",
                                threats = "Threats",
                                drugs = "Drugs related",
                                sexual = "Sexual crimes",
                                trust = "Trust abuse",
                                homicide = "Homicide",
                                rapes = "Rapes",
                                suicide = "Suicides",
                                feminicide = "Feminicide",
                                other_level = "other"))

# Daily counts per tracked crime category, spread into one column per
# category, then merged onto the running count_reports table
count_reports <- reports %>%
  count(date, ageb, tracked, name = "reports") %>%
  pivot_wider(names_from = tracked,
              values_from = reports,
              names_prefix = "reports_",
              names_expand = TRUE,
              values_fill = 0) %>%
  full_join(count_reports, by = c("date", "ageb"))

Count at day and night

Count DV during the day and night. I exclude hour 12, because it is the fallback when hour is not recorded.

# Classify each report as daytime (7-19h) or nighttime (20-6h).
# Hour 12 matches neither condition and stays NA on purpose: it is the
# fallback value when the hour was not recorded.
reports <- reports %>%
  mutate(daytime = case_when(
    hour >= 7 & hour <= 19 & hour != 12 ~ "day",
    hour <= 6 | hour >= 20 ~ "night"
  ))

# Hourly distribution of DV reports, colored by the day/night split
reports %>% 
  filter(delito_lumped == "Domestic violence") %>% 
  ggplot(aes(x = hour, fill = daytime)) +
  geom_bar() +
  scale_x_continuous(breaks = 0:24)

# Count DV reports per AGEB x day, separately for day and night.
# Hour 12 is dropped here (fallback value for unrecorded hours).
count_reports <- reports %>% 
  filter(delito_lumped == "Domestic violence",
         hour != 12) %>% 
  count(ageb, date, daytime, name = "reports_daytime") %>% 
  pivot_wider(names_from = daytime,
              values_from = reports_daytime,
              names_prefix = "reports_dv_",
              names_expand = TRUE,
              values_fill = 0) %>% 
  full_join(count_reports, by = c("date", "ageb"))

Count by hour of the day

Count the DV crimes per bins of two hours. I also exclude hour 12 (so only hour 13 is in that bin). Even if the count is lower than it should be, it doesn't matter, because we're comparing this bin with the same bin on another day, and NOT this bin with other bins.

#create hour_bins
# Two-hour, left-closed bins with boundaries on even hours:
# [0,2), [2,4), ..., [22,24]. Hour 12 is filtered out downstream,
# so the [12,14) bin effectively contains only hour 13.
reports <- reports %>%
  mutate(hour_bins = cut_width(hour, width = 2, center = 1, closed = "left"))

# Count DV reports per day x AGEB x two-hour bin, excluding the
# mismeasured hour 12, then spread the bins into one column each.
count_reports <- reports %>%
  filter(delito_lumped == "Domestic violence") %>% 
  filter(hour != 12) %>% 
  group_by(date, ageb, hour_bins) %>%
  summarise(hourly_crimes = n(), .groups = "drop") %>%
  pivot_wider(names_from = hour_bins,
              values_from = hourly_crimes,
              names_prefix = "reports_dv_h",
              names_expand = TRUE,
              values_fill = 0) %>% 
  # explicit join keys, consistent with the other count joins above;
  # an implicit full_join() silently matches on all shared columns
  full_join(count_reports, by = c("date", "ageb"))

Count by delay in reporting

I want to count the number of crimes by category, i.e. those that were reported quickly, those that were reported with a one week delay, etc…

I have two variables for delay, that I computed earlier: - delay_full_days for the whole dataset - delay_hours only from the second half of 2022 onwards (because the data wasn't registered before)

First, categorize the delays in different categories for delays full days:

reports %>% count(delay_full_days) %>% 
  head(10) %>% kable()
delay_full_days n
0 684353
1 351600
2 123151
3 79539
4 54800
5 40158
6 32365
7 27114
8 20913
9 17551
# Categorize reporting delays measured in full days.
# Sorting by delay first makes as_factor() pick up the levels in
# increasing-delay order. NA delays stay NA.
reports <- reports %>%
  arrange(delay_full_days) %>% 
  mutate(delay_category_full_days = case_when(
    delay_full_days == 0 ~ "[0]",
    delay_full_days == 1 ~ "[1]",
    delay_full_days %in% 2:7 ~ "[2-7]",
    # 7 is already captured by the branch above, so this bin was always
    # effectively 8:14; spell it out to remove the overlap (same result).
    # The "[7-14]" label is kept because downstream column names depend
    # on it (reports_dv_delay_days_[7-14]).
    delay_full_days %in% 8:14 ~ "[7-14]",
    delay_full_days > 14 ~ "(14-)"),
    delay_category_full_days = as_factor(delay_category_full_days)) 

reports %>% count(delay_category_full_days) %>% 
  filter(!is.na(delay_category_full_days)) %>% 
  mutate(freq = scales::percent(n/sum(n), accuracy = .01)) %>% 
  kable()
delay_category_full_days n freq
[0] 684353 36.43%
[1] 351600 18.72%
[2-7] 357127 19.01%
[7-14] 103352 5.50%
(14-) 382241 20.35%

Second, categorize the delays in different categories, for delays in hours:

We can do the same for a subsample of the data, at a finer hourly measure. - Very short delays: 0-6 hours. - Short delays: 6-24 hours. - Medium delays: 24-48 hours. - Long delays: 48-168 hours. - Very long delays: more than 168 hours (one week).

# Categorize reporting delays measured in hours (only populated from
# mid-2022 onwards). case_when() stops at the first TRUE, so each branch
# only needs its upper bound; the final explicit "> 168" keeps NA delays
# as NA instead of dumping them in the last bin.
reports <- reports %>%
  arrange(delay_hours) %>% 
  mutate(delay_category_hours = case_when(
    delay_hours <= 6 ~ "[0-6]",
    delay_hours <= 24 ~ "(6-24]",
    delay_hours <= 48 ~ "(24-48]",
    delay_hours <= 168 ~ "(48-168]",
    delay_hours > 168 ~ "(168-)"),
    delay_category_hours = as_factor(delay_category_hours)) 

reports %>% count(delay_category_hours) %>% 
  filter(!is.na(delay_category_hours)) %>% 
  mutate(freq = scales::percent(n/sum(n), accuracy = .01)) %>% 
  kable()
delay_category_hours n freq
[0-6] 119946 24.50%
(6-24] 83306 17.01%
(24-48] 55874 11.41%
(48-168] 84086 17.17%
(168-) 146423 29.90%

Now we can count the crimes per category of delay

# Count DV reports per AGEB x day x delay category (full days),
# spread into one column per category
count_reports <- reports %>% 
  filter(delito_lumped == "Domestic violence") %>% 
  count(ageb, date, delay_category_full_days, name = "nr_reports_cat") %>% 
  pivot_wider(names_from = delay_category_full_days,
              values_from = nr_reports_cat,
              names_prefix = "reports_dv_delay_days_",
              names_expand = TRUE,
              values_fill = 0) %>% 
  full_join(count_reports, by = c("date", "ageb"))

# Same count with the finer hourly delay categories (only available
# from mid-2022 onwards, hence the NA filter)
count_reports <- reports %>% 
  filter(delito_lumped == "Domestic violence",
         !is.na(delay_category_hours)) %>%
  count(ageb, date, delay_category_hours, name = "nr_reports_cat") %>% 
  pivot_wider(names_from = delay_category_hours,
              values_from = nr_reports_cat,
              names_prefix = "reports_dv_delay_hours_",
              names_expand = TRUE,
              values_fill = 0) %>% 
  full_join(count_reports, by = c("date", "ageb"))

Clean the counts and join

Now, crucially, we need to assign 0 for days x AGEB that got count 0 in any of those counts.

First, we need to make the dataset complete i.e. each combination of ageb x day

# Make the panel complete: every day x AGEB combination gets a row,
# even when no report was filed (those rows come out with NA counts)
count_reports <- count_reports %>% 
  complete(date = seq.Date(from = min(date), to = max(date), by = "1 day"),
           ageb = unique(ageb)) 

Then, we need to replace all the NAs that result (i) from the full_joins (ii) and from the complete with a count of 0.

# Every NA left behind by the full_joins and by complete() is a
# genuine zero count
count_reports <- count_reports %>%
  mutate(across(contains("reports"), \(x) replace_na(x, 0))) %>% 
  arrange(ageb, date)

Reorder the columns for better readability.

# Headline counts first; delay and hourly breakdowns pushed to the end
count_reports <- count_reports %>%
  relocate(date, ageb, reports_dv, reports_dv_day, reports_dv_night,
           reports_all) %>%
  relocate(contains("reports_dv_delay"), contains("reports_dv_h"),
           .after = last_col())

names(count_reports)
 [1] "date"                            "ageb"                           
 [3] "reports_dv"                      "reports_dv_day"                 
 [5] "reports_dv_night"                "reports_all"                    
 [7] "reports_drugs"                   "reports_feminicide"             
 [9] "reports_fraud"                   "reports_homicide"               
[11] "reports_rapes"                   "reports_sexual"                 
[13] "reports_suicide"                 "reports_theft"                  
[15] "reports_threats"                 "reports_trust"                  
[17] "reports_other"                   "reports_dv_delay_hours_[0-6]"   
[19] "reports_dv_delay_hours_(6-24]"   "reports_dv_delay_hours_(24-48]" 
[21] "reports_dv_delay_hours_(48-168]" "reports_dv_delay_hours_(168-)"  
[23] "reports_dv_delay_days_[0]"       "reports_dv_delay_days_[1]"      
[25] "reports_dv_delay_days_[2-7]"     "reports_dv_delay_days_[7-14]"   
[27] "reports_dv_delay_days_(14-)"     "reports_dv_h[0,2)"              
[29] "reports_dv_h[2,4)"               "reports_dv_h[4,6)"              
[31] "reports_dv_h[6,8)"               "reports_dv_h[8,10)"             
[33] "reports_dv_h[10,12)"             "reports_dv_h[12,14)"            
[35] "reports_dv_h[14,16)"             "reports_dv_h[16,18)"            
[37] "reports_dv_h[18,20)"             "reports_dv_h[20,22)"            
[39] "reports_dv_h[22,24]"            

Join time and count of reports

We add the time dimensions to the count of reports.

# Attach the time dimension (calendar variables, covid phase, holidays)
count_reports <- count_reports %>% 
  left_join(time, by = "date") %>% 
  relocate(contains("reports_"), .after = last_col())

# the incident-level data is no longer needed
rm(reports)

Calls

Now, we do the same type of counting, but for calls. Here, we count at day x neighborhood level.

# Combined location: the incident neighborhood when known, otherwise
# fall back on the caller's neighborhood. coalesce() is the idiomatic
# (and attribute-safe) form of ifelse(!is.na(x), x, y).
calls <- calls %>% 
  mutate(col_id = coalesce(col_id_hechos, col_id_usuaria)) %>%
  relocate(col_id, .after = "col_id_usuaria")

# remove the calls that cannot be located at all
calls <- calls %>% 
  filter(!is.na(col_id))

Count all calls

Count calls per day and per neighborhood. I still have two locations: col_id_hechos and col_id_usuaria. I count for each, and for a combined version of each.

# Total calls per day x neighborhood. .groups = "drop" ensures the
# result is an ungrouped tibble (the original left it grouped by date,
# which leaks grouping into every later operation on count_calls).
count_calls <- calls %>%
  group_by(date, col_id) %>%
  summarise(calls = n(), .groups = "drop")

# using the incident location, when available
count_calls <- calls %>%
  summarise(calls_hechos = n(), .by  = c("col_id_hechos", "date")) %>% 
  rename(col_id = col_id_hechos) %>% 
  # explicit keys: an implicit join silently matches on all shared columns
  right_join(count_calls, by = c("col_id", "date"))

# using the caller's location, much more frequent but not precise
count_calls <- calls %>%
  summarise(calls_usuaria = n(), .by  = c("col_id_usuaria", "date")) %>% 
  rename(col_id = col_id_usuaria) %>% 
  right_join(count_calls, by = c("col_id", "date"))

Count at day and night

Count calls during the day and during the night. Day is 7am to 7pm included. Here we keep 12:00, because it is not mismeasured since it is recorded automatically.

#create daytime vs nighttime
# Day is 7:00-19:00 inclusive. Hour 12 is kept here (unlike for reports)
# because call times are logged automatically, not entered by hand.
# Note: %in% never returns NA, so calls with a missing hour land in "night".
calls <- calls %>%
  mutate(daytime = if_else(hour %in% 7:19, "day", "night"))

# Hourly distribution of calls, colored by the day/night split
calls %>% 
  ggplot(aes(x = hour, fill = daytime)) +
  geom_bar() +
  scale_x_continuous(breaks = 0:24)

# Count the calls per day x neighborhood, split by day vs night
count_calls <- calls %>% 
  group_by(col_id, date, daytime) %>% 
  summarise(calls_daytime = n(), .groups = "drop") %>% 
  pivot_wider(names_from = daytime,
              values_from = calls_daytime,
              names_prefix = "calls_",
              names_expand = TRUE,
              values_fill = 0) %>% 
  # explicit join keys, consistent with the report-side joins
  right_join(count_calls, by = c("col_id", "date"))

Count by type of DV calls

I also want to count the calls for each type, when the type is known:

calls %>% 
  count(TYPE) %>% 
  mutate(freq = scales::percent(n/sum(n))) %>%
  kable()
TYPE n freq
intimate partner violence 13376 24.21%
unknown 13555 24.53%
violence against adult 18879 34.17%
violence against children 5871 10.63%
violence committed by family members 3570 6.46%
# Count calls per day x neighborhood x type of DV, one column per type.
# (Column names inherit the spaces in TYPE, e.g.
# "calls_intimate partner violence" -- downstream code relies on them.)
count_calls <- calls %>%
  group_by(date, col_id, TYPE) %>% 
  summarise(reports = n(), .groups = "drop") %>% 
  pivot_wider(names_from = TYPE,
              values_from = reports,
              names_prefix = "calls_",
              names_expand = TRUE,
              values_fill = 0) %>% 
  # explicit join keys, consistent with the other joins
  full_join(count_calls, by = c("date", "col_id"))

Clean the counts and join

Now, crucially, we need to assign 0 for days x AGEB that got count 0 in any of those counts.

First, we need to make the dataset complete i.e. each combination of ageb x day

# Make the panel complete: every day x neighborhood combination gets a row
count_calls <- count_calls %>% 
  complete(date = seq.Date(from = min(date), to = max(date), by = "1 day"),
           col_id = unique(col_id)) 

Then, we need to replace all the NAs that result (i) from the full_joins (ii) and from the complete with a count of 0.

# NAs coming from the joins and from complete() are genuine zero counts
count_calls <- count_calls %>%
  mutate(across(contains("calls"), \(x) replace_na(x, 0))) %>% 
  arrange(col_id, date)

Reorder the columns for better readability.

# Headline counts first, type-specific counts keep their place after
count_calls <- count_calls %>%
  relocate(date, col_id, calls, calls_hechos, calls_usuaria,
           calls_day, calls_night)

names(count_calls)
 [1] "date"                                      
 [2] "col_id"                                    
 [3] "calls"                                     
 [4] "calls_hechos"                              
 [5] "calls_usuaria"                             
 [6] "calls_day"                                 
 [7] "calls_night"                               
 [8] "calls_intimate partner violence"           
 [9] "calls_unknown"                             
[10] "calls_violence against adult"              
[11] "calls_violence against children"           
[12] "calls_violence committed by family members"

Join time and count of reports

We add the time dimensions to the count of reports.

# Attach the time dimension (calendar variables, covid phase, holidays)
count_calls <- count_calls %>% 
  left_join(time, by = "date") %>% 
  relocate(contains("calls_"), .after = last_col())

# the call-level data is no longer needed
rm(calls)

4 Create bins and assign weather and pollution

We want to add the weather variables. First, I compute some bins at the day level.

Create the bins for temperature

source(here("fun_dynamic_bins.R"))

I create bins for temperature. I define a function that dynamically creates bins as a function of how many observations fall in each bin. It lumps together the extreme bins that have less than a certain proportion of observations. The rule is that each extreme bin must contain at least 1% of observations for the 2-degree-wide bins, and 0.5% for the 1-degree-wide bins.

# 2-degree-wide bins: extreme bins must hold at least 1% of observations.
# across() applies the same binning to all four temperature measures and
# names the results bins_<measure>_two, matching the original columns.
weather <- weather %>% 
  mutate(across(c(temp, tmean, tmax, tmin),
                ~ create_dynamic_bins(.x, 2, threshold_percentage = 0.01),
                .names = "bins_{.col}_two"))

# 1-degree-wide bins: extreme bins must hold at least 0.5% of observations
weather <- weather %>% 
  mutate(across(c(temp, tmean, tmax, tmin),
                ~ create_dynamic_bins(.x, 1, threshold_percentage = 0.005),
                .names = "bins_{.col}_one"))
weather %>%
  group_by(year) %>% 
  summarise(total = n(),
            missing_tmean = sum(is.na(tmean))) %>% 
  mutate(share = scales::percent(missing_tmean/total)) %>% 
  kable()
year total missing_tmean share
2016 889746 1 0.00011%
2017 887315 30 0.00338%
2018 887315 3 0.00034%
2019 887315 27 0.00304%
2020 889746 129 0.01450%
2021 887315 108 0.01217%
2022 887315 35 0.00394%
2023 887315 0 0.00000%
2024 889746 299059 33.61173%
2025 136136 136136 100.00000%
weather %>% 
  count(bins_tmean_one, bins_tmean_two) %>% 
  drop_na() %>% 
  mutate(prop_in_pc = scales::percent(n / sum(n))) %>% 
  kable()
bins_tmean_one bins_tmean_two n prop_in_pc
<11 <12 68138 0.886%
[11,12) <12 83377 1.084%
[12,13) [12,14) 178154 2.316%
[13,14) [12,14) 322391 4.190%
[14,15) [14,16) 544405 7.076%
[15,16) [14,16) 879386 11.430%
[16,17) [16,18) 1144944 14.882%
[17,18) [16,18) 1329687 17.283%
[18,19) [18,20) 1190306 15.471%
[19,20) [18,20) 848012 11.022%
[20,21) [20,22) 524100 6.812%
[21,22) [20,22) 326054 4.238%
[22,23) >=22 151587 1.970%
>=23 >=22 103195 1.341%

Join weather to the count of reports

Straightforward, since both are day per ageb level.

Weather is missing for some days x ageb, but very few in general.

# Natural join: matches on every column the two tables share (date, ageb,
# and any calendar columns present in both, e.g. year).
# NOTE(review): consider spelling out `by` explicitly -- if either table's
# schema drifts, the implicit key set changes silently.
data_reports <- count_reports %>% 
  left_join(weather)

data_reports %>%
  group_by(year) %>% 
  summarise(total = n(),
            missing_tmean = sum(is.na(tmean))) %>% 
  mutate(share = scales::percent(missing_tmean/total)) %>% 
  kable()
year total missing_tmean share
2016 889014 1 0.00011%
2017 886585 30 0.00338%
2018 886585 3 0.00034%
2019 886585 27 0.00305%
2020 889014 129 0.01451%
2021 886585 108 0.01218%
2022 886585 35 0.00395%
2023 886585 0 0.00000%
2024 592676 2475 0.41760%

Join weather to the count of calls

Less straightforward, since weather is measured at AGEB level. I could compute weather all over again at the neighborhood level, but since the scale is so tiny, I will just assign the weather of the AGEB to the neighborhood.

Match AGEBs and colonias

First, establish a correspondence between AGEBs and neighborhoods.

poligonos %>% distinct(ageb) %>% nrow()
[1] 2431
colonias %>% distinct(col_id) %>% nrow()
[1] 1948
# Compute centroids
# NOTE(review): st_centroid() runs on the stored CRS; for lon/lat geometries
# sf warns that results are approximate -- presumably fine at city scale,
# but confirm the layers' CRS.
colonias_centroids <- colonias %>% select(col_id) %>% st_centroid() 
agebs_centroids <- poligonos %>% select(ageb) %>% st_centroid()

# Map each neighborhood to its single nearest AGEB, centroid to centroid,
# then drop the geometry to keep a plain col_id -> ageb lookup table
colonias_nearest_ageb <- 
  st_join(colonias_centroids, agebs_centroids, join = st_nearest_feature) %>%
  select(col_id, ageb) %>% 
  st_drop_geometry()

We can plot the connections, just for fun; they look exactly as expected.

# Build one LINESTRING per colonia-AGEB pair, connecting the two centroids.
# After the two joins there are two geometry columns: `geom` (from the
# colonia centroids) and `geometry` (from the AGEB centroids).
connections <- colonias_nearest_ageb %>%
  left_join(colonias_centroids, by = "col_id") %>%
  left_join(agebs_centroids, by = "ageb") %>% 
#   Convert each pair of points into a LINESTRING
# rowwise() so st_linestring() sees one pair of points at a time
  rowwise() %>%
  mutate(geometry = st_sfc(st_linestring(rbind(st_coordinates(geom), 
                                               st_coordinates(geometry))),
                           crs = 4326)) %>%
  ungroup() %>%
  select(col_id, ageb, geometry) %>%
# NOTE(review): crs = 4326 is hard-coded -- confirm it matches the layers' CRS
  st_as_sf()

# Plot the colonia centroids and their links to the nearest AGEB centroid
ggplot() +
  geom_sf(data = colonias_centroids, size = .2, colour = "red", alpha = .5) +
  #geom_sf(data = agebs_centroids, size = .2, color = "red", alpha = .5) +
  geom_sf(data = connections, colour = "blue", alpha = 1) +
  theme_minimal() +
  labs(title = "Connections between colonias and AGEBs centroids") +
  annotation_scale(location = "br", style = "ticks")

Actually join the data

We assign to each neighborhood the identifier of the closest AGEB.

# Assign each neighborhood the identifier of its closest AGEB.
# col_id is the only shared column, but make the key explicit anyway.
count_calls <- count_calls %>% 
  left_join(colonias_nearest_ageb, by = "col_id")

And based on this AGEB, we can merge the weather data.

# Natural join on all shared columns (the assigned ageb, date, and any
# calendar columns present in both tables).
# NOTE(review): consider an explicit `by` to guard against schema drift.
data_calls <- count_calls %>% 
  left_join(weather)

# remove the ageb identifier, it was just for the join
# do not get confused
data_calls <- data_calls %>% select(-ageb)

There are only very few missing temp obs.

data_calls %>%
  group_by(year) %>% 
  summarise(total = n(),
            missing_tmean = sum(is.na(tmean))) %>% 
  mutate(share = scales::percent(missing_tmean/total)) %>% 
  kable()
year total missing_tmean share
2016 84180 0 0.00000%
2017 503700 24 0.00476%
2018 503700 2 0.00040%
2019 503700 27 0.00536%
2020 505080 93 0.01841%
2021 503700 105 0.02085%
2022 335340 6 0.00179%

Save the final datasets

We save one dataset for reports and another for calls since they’re on different levels.

Reports is AGEB per day, calls is neighborhood per day.

# Final tidy-up of the reports dataset before saving:
# year as factor (for fixed effects), key variables up front,
# bins and counts at the end.
data_reports <- data_reports %>% 
  mutate(year = as_factor(year)) %>%
  relocate(starts_with("bins"),
           starts_with("reports"),
           .after = last_col()) %>% 
  relocate(ageb, date, 
           reports_dv,
           tmean, bins_tmean_one, prec, tmax, tmin, pm25_mean) %>% 
  # remove temp = tmin+tmax/2, we now use tmean, the 24 hourly avg
  # NOTE(review): contains("temp") also drops bins_temp_one/bins_temp_two
  # (but not tmp_hourly) -- confirm that is intended
  select(-contains("temp"))


# write the main regression dataset (AGEB x day level)
data_reports %>% 
  select( - tmp_hourly) %>% 
  write_rds(here("..", "output",
                 "data_reports.rds"))

# save the hourly list-column separately, it's just very big
data_reports %>% 
  select(date, ageb, tmp_hourly) %>% 
  write_rds(here("..", "output", "temporary_data", 
                 "tmp_hourly_ageb.rds"))
# Same tidy-up for the calls dataset (neighborhood x day level)
data_calls <- data_calls %>% 
  mutate(year = as_factor(year)) %>%
  relocate(starts_with("bins"),
           starts_with("calls"),
           .after = last_col()) %>% 
  relocate(col_id, date, 
           calls,
           tmean, bins_tmean_one, prec, tmax, tmin, pm25_mean) %>% 
  # remove temp = tmin+tmax/2, we now use tmean, the 24 hourly avg
  # NOTE(review): contains("temp") also drops bins_temp_one/bins_temp_two
  # (but not tmp_hourly) -- confirm that is intended
  select(-contains("temp"))
  
# write the main regression dataset for calls
data_calls %>% 
  select( - tmp_hourly) %>%
  write_rds(here("..", "output",
                 "data_calls.rds"))

# save the hourly list-column separately, it's just very big
data_calls %>% 
  select(date, col_id, tmp_hourly) %>% 
  write_rds(here("..", "output", "temporary_data",
                 "tmp_hourly_col_id.rds"))