library(tidyverse)
library(here) # path control
library(lubridate) # handle dates
library(fixest) # estimations with fixed effects
library(sf)
library(viridis)
# Prepare the data ---------
## Load crime level data ------
<-
carpetas readRDS(here("output",
"temporary_data",
"dv",
"PGJ_carpetas.rds")) %>%
::st_drop_geometry() %>%
sfselect(-c_id, #crime id, we keep col_id
-alcaldia_catalogo, -latitud, -longitud) %>%
rename(dttm_hecho = dttm, fecha_hecho = date)
%>% count(delito_lumped)
carpetas
<- carpetas %>%
carpetas filter(delito_lumped == "Domestic violence") %>%
# recompute the deciles for DV crimes onlu
mutate(decile_delay_full_days = ntile(delay_full_days, 10))
## Join weather on crime day and reporting day ------
<-
weather readRDS(here("output", "temporary_data", "weather", "weather_cdmx.rds")) %>%
mutate(date = ymd(date)) %>%
select(col_id, date, tmean, prec_quintile, rh_quintile, wsp_quintile)
# join the weather variables for day of crime
<- left_join(carpetas, weather, by = c("fecha_hecho" = "date", "col_id"))
data rm(carpetas)
# join the weather variables for day of report
<- weather %>%
weather #append all variables names with _report_day
rename_with(~str_c(., "_report_day")) %>%
rename(fecha_inicio = date_report_day,
col_id = col_id_report_day)
<- left_join(data, weather, by = c("fecha_inicio", "col_id")) %>%
data relocate(col_id, fecha_hecho, fecha_inicio, tmean, tmean_report_day)
# this worked, but wince we don't have most recent weather data, some NAs
%>% arrange(desc(delay_full_days)) %>% filter(!is.na(tmean_report_day))
data
rm(weather)
## Filter pre pandemix (keep year 2016-2019 only) --------
#filter to pre pandemix, like the rest
<- data %>%
data filter(fecha_hecho <= ymd("2020-02-14"))
# i also filter out the whole of 2020, because we look into the future
<- data %>%
data filter(year(fecha_hecho) != 2020)
#Create a few variables for fixed effects
<- data %>%
data mutate(year = lubridate::year(fecha_hecho),
month = lubridate::month(fecha_hecho, label = T),
day_of_week = lubridate::wday(fecha_hecho, label = TRUE),
day_of_year = lubridate::yday(fecha_hecho))
## Hours of crime -------
# Extract hour hecho
<- data %>%
data mutate(hour_hecho = as_factor(hour(dttm_hecho)))
# Hour cannot really be trusted... because of the 12:00 peak, but wihtout it
# it seems OK.
%>% count(hour_hecho) %>%
data ggplot(aes(hour_hecho, n)) +
geom_col()
# Long tails for the delays --------
#show distribution of delays
%>%
data mutate(delay_full_days = if_else(delay_full_days > 56, 56, delay_full_days)) %>%
count(delay_full_days) %>%
mutate(share = n/sum(n))
# show min max and mean of delay full day for each decile
%>%
data group_by(decile_delay_full_days) %>%
summarise(min = min(delay_full_days),
max = max(delay_full_days),
mean = mean(delay_full_days),
n = n())
%>%
data mutate(delay_full_days = if_else(delay_full_days > 31, 31, delay_full_days)) %>%
ggplot(aes(delay_full_days)) +
geom_histogram(aes(y = after_stat(count / sum(count)),
group = as_factor(decile_delay_full_days),
fill = as_factor(decile_delay_full_days)),
binwidth = 1) +
# shares instrad of count
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(breaks = seq(0,56,7), minor_breaks = NULL) +
# chnage to viridis colors
labs(#title = "Distribution of reporting delays (winsorized)",
x = "Reporting delay in days",
y = "Share") +
scale_fill_manual(name = "Decile",
values = c("1" = viridis(9)[1],
"2" = viridis(9)[2],
"3" = viridis(9)[3],
"4" = viridis(9)[4],
"5" = viridis(9)[5],
"6" = viridis(9)[6],
"7" = viridis(9)[7],
"8" = viridis(9)[8],
"9" = viridis(9)[9],
"10" = "gray50")) +
theme_minimal()
ggsave(here("output", "figures", "reporting_delays_distrib.png"),
width = 6, height = 4, dpi = 300)
# Take the log of delay+1 to dampen outliers
%>% count(delay_hours)
data <- data %>%
data mutate(log_delay_full_days = log(delay_full_days + 1),
log_delay_hours = log(delay_hours))
# Model delay in days as function of temp -------
<- data %>%
data mutate(
delay_zero = delay_full_days == 0, # Indicator for zero delay
delay_zero_or_one = delay_full_days %in% 0:1, # Indicator for zero delay
delay_positive = ifelse(delay_full_days > 0, delay_full_days, NA) # Positive delays
)
%>% count(delay_full_days, delay_zero, delay_zero_or_one, delay_positive)
data
# Model the Probability of Immediate Reporting (Zero Delay)
<-
logit_model %>%
data filter(decile_delay_full_days != 10) %>%
feglm(delay_zero ~ tmean |
+ rh_quintile + wsp_quintile +
prec_quintile + year^month + day_of_week + day_of_year,
col_id family = binomial(link = "logit"),
cluster = ~ col_id)
<-
logit_model_zero_one %>%
data filter(decile_delay_full_days != 10) %>%
feglm(delay_zero_or_one ~ tmean |
+ rh_quintile + wsp_quintile +
prec_quintile + year^month + day_of_week + day_of_year,
col_id family = binomial(link = "logit"),
cluster = ~ col_id)
etable(logit_model, logit_model_zero_one)
# Model the Positive Delays
<-
nb_model %>%
data filter(decile_delay_full_days != 10) %>%
filter(delay_full_days > 0) %>%
fenegbin(
~ tmean |
delay_positive + rh_quintile + wsp_quintile +
prec_quintile + year^month + day_of_week + day_of_year,
col_id cluster = ~ col_id
)
<-
nb_model_with_0 %>%
data filter(decile_delay_full_days != 10) %>%
#filter(delay_full_days > 0) %>%
fenegbin(
~ tmean |
delay_full_days + rh_quintile + wsp_quintile +
prec_quintile + year^month + day_of_week + day_of_year,
col_id cluster = ~ col_id
)
etable(nb_model, nb_model_with_0)
<-
nb_model_both %>%
data filter(decile_delay_full_days != 10) %>%
filter(delay_full_days > 0) %>%
fenegbin(
~ tmean + tmean_report_day |
delay_positive + rh_quintile + wsp_quintile +
prec_quintile + year^month + day_of_week + day_of_year,
col_id cluster = ~ col_id
)
etable(logit_model, nb_model, nb_model_both,
fitstat = ~ n,
digits.stats = "s3", digits = "r4",
#extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
file = here("output", "tables", "reg_reporting_delays.tex"), replace = T,
view = T,
fixef_sizes = T,
dict = c(col_id = "Neighborhood",
tmean = "Temperature on incident day",
tmean_report_day = "Temperature on reporting day"),
fixef.group=list("Prec, hum, wsp quintiles"="quintile",
"Date FEs"="month|day"),
style.tex = style.tex("aer",
model.format = "(i)",
tpt = TRUE,
notes.tpt.intro = "\\footnotesize",
fixef.suffix = " FEs"),
adjustbox = TRUE, float = T,
title = "Reporting delays",
label = "reg_reporting_delays",
notes = ""
)
# Use Reporting Time Categories as a Test for Reporting Bias ----
# Disaggregate DV Reports by categories of delay
<-
final_data read_rds(here("output",
"final_data_for_analysis.rds")) %>%
filter(covid_phase == "Pre-pandemic")
%>%
final_data select(contains("reports_dv_delay")) %>%
names()
#weird names fuck up the fixest estimations
<- final_data %>%
final_data rename_with(~ str_replace_all(., "[\\[\\]()]", "")) %>% # Remove brackets and parentheses
rename_with(~ str_replace_all(., "-", "_")) # Replace dashes with underscores
<-
reg_delays_days fepois(c(reports_dv_delay_days_0,
reports_dv_delay_days_1,
reports_dv_delay_days_2_7,
reports_dv_delay_days_7_14,~ tmean |
reports_dv_delay_days_14_) + year^month + day_of_week + day_of_year +
col_id + rh_quintile + wsp_quintile,
prec_quintile cluster = ~ col_id,
data = final_data)
etable(reg_delays_days, view = T)
etable(reg_delays_days,
fitstat = ~ n + my,
digits.stats = "s3", digits = "r4",
#extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
file = here("output", "tables", "reg_reporting_delays_categories.tex"), replace = T,
view = T,
fixef_sizes = T,
dict = c(col_id = "Neighborhood",
tmean = "Temperature on incident day",
tmean_report_day = "Temperature on reporting day",
reports_dv_delay_days_0 = "Same day",
reports_dv_delay_days_1 = "Next day",
reports_dv_delay_days_2_7 = "2-7 days",
reports_dv_delay_days_7_14 = "7-14 days",
reports_dv_delay_days_14_ = "14+ days"),
fixef.group=list("Prec, hum, wsp quintiles"="quintile",
"Date FEs"="month|day"),
style.tex = style.tex("aer",
model.format = "(i)",
tpt = TRUE,
notes.tpt.intro = "\\footnotesize",
fixef.suffix = " FEs"),
adjustbox = TRUE, float = T,
title = "Reporting delays: back to daily neighborhood counts",
label = "reg_reporting_delays_categories",
notes = "Here I disaggregate DV Reports by categories of delay. Higher temperatures increase DV reporting primarily in the immediate and short-delay categories"
)
Reporting bias
Note
I run all the analyses on the reporting bias.