Reporting bias

Note
I run all the analyses on the reporting bias.
library(tidyverse)
library(here) # path control
library(lubridate) # handle dates

library(fixest) # estimations with fixed effects
library(sf)
library(viridis)


# Prepare the data ---------

## Load crime level data ------
carpetas <-  
  readRDS(here("output",
               "temporary_data",
               "dv",
               "PGJ_carpetas.rds")) %>% 
  sf::st_drop_geometry() %>% 
  select(-c_id, #crime id, we keep col_id
         -alcaldia_catalogo, -latitud, -longitud) %>% 
  rename(dttm_hecho = dttm, fecha_hecho = date)

carpetas %>% count(delito_lumped)

carpetas <- carpetas %>% 
  filter(delito_lumped == "Domestic violence") %>% 
  # recompute the deciles for DV crimes onlu
  mutate(decile_delay_full_days = ntile(delay_full_days, 10))

## Join weather on crime day and reporting day ------

weather <-
  readRDS(here("output", "temporary_data", "weather", "weather_cdmx.rds")) %>% 
  mutate(date = ymd(date)) %>% 
  select(col_id, date, tmean, prec_quintile, rh_quintile, wsp_quintile) 

# join the weather variables for day of crime
data <- left_join(carpetas, weather, by = c("fecha_hecho" = "date", "col_id"))
rm(carpetas)

# join the weather variables for day of report
weather <- weather %>% 
  #append all variables names with _report_day
  rename_with(~str_c(., "_report_day")) %>% 
  rename(fecha_inicio = date_report_day,
         col_id = col_id_report_day)

data <- left_join(data, weather, by = c("fecha_inicio", "col_id")) %>% 
  relocate(col_id, fecha_hecho, fecha_inicio, tmean, tmean_report_day)

# this worked, but wince we don't have most recent weather data, some NAs
data %>% arrange(desc(delay_full_days)) %>% filter(!is.na(tmean_report_day))

rm(weather)

## Filter pre pandemix (keep year 2016-2019 only) --------

#filter to pre pandemix, like the rest
data <- data %>%
  filter(fecha_hecho <= ymd("2020-02-14"))
# i also filter out the whole of 2020, because we look into the future
data <- data %>%
  filter(year(fecha_hecho) != 2020)


#Create a few variables for fixed effects
data <- data %>%
  mutate(year = lubridate::year(fecha_hecho),
         month = lubridate::month(fecha_hecho, label = T),
         day_of_week = lubridate::wday(fecha_hecho, label = TRUE),
         day_of_year = lubridate::yday(fecha_hecho))


## Hours of crime -------

# Extract hour hecho
data <- data %>%
  mutate(hour_hecho = as_factor(hour(dttm_hecho))) 

# Hour cannot really be trusted... because of the 12:00 peak, but wihtout it 
# it seems OK.
data %>% count(hour_hecho) %>% 
  ggplot(aes(hour_hecho, n)) +
  geom_col() 

# Long tails for the delays --------

#show distribution of delays
data %>% 
  mutate(delay_full_days = if_else(delay_full_days > 56, 56, delay_full_days)) %>%
  count(delay_full_days) %>% 
  mutate(share = n/sum(n))

# show min max and mean of delay full day for each decile
data %>% 
  group_by(decile_delay_full_days) %>% 
  summarise(min = min(delay_full_days),
            max = max(delay_full_days),
            mean = mean(delay_full_days),
            n = n())

data %>% 
  mutate(delay_full_days = if_else(delay_full_days > 31, 31, delay_full_days)) %>%
  ggplot(aes(delay_full_days)) +
  geom_histogram(aes(y = after_stat(count / sum(count)),
                     group = as_factor(decile_delay_full_days),
                     fill = as_factor(decile_delay_full_days)),
                 binwidth = 1) +
# shares instrad of count
scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(0,56,7), minor_breaks = NULL) +
  # chnage to viridis colors
  labs(#title = "Distribution of reporting delays (winsorized)",
    x = "Reporting delay in days",
    y = "Share") +
  scale_fill_manual(name = "Decile",
                    values = c("1" = viridis(9)[1],
                               "2" = viridis(9)[2],
                               "3" = viridis(9)[3],
                               "4" = viridis(9)[4],
                               "5" = viridis(9)[5], 
                               "6" = viridis(9)[6],
                               "7" = viridis(9)[7],
                               "8" = viridis(9)[8],
                               "9" = viridis(9)[9],
                               "10" = "gray50")) +
  theme_minimal()

ggsave(here("output", "figures", "reporting_delays_distrib.png"),
       width = 6, height = 4, dpi = 300)

# Take the log of delay+1 to dampen outliers
data %>% count(delay_hours)
data <- data %>% 
  mutate(log_delay_full_days = log(delay_full_days + 1),
         log_delay_hours = log(delay_hours))

# Model delay in days as function of temp -------

data <- data %>%
  mutate(
    delay_zero = delay_full_days == 0,  # Indicator for zero delay
    delay_zero_or_one = delay_full_days %in% 0:1,  # Indicator for zero delay
    delay_positive = ifelse(delay_full_days > 0, delay_full_days, NA)  # Positive delays
  )

data %>% count(delay_full_days, delay_zero, delay_zero_or_one, delay_positive)


# Model the Probability of Immediate Reporting (Zero Delay)
logit_model <-
  data %>% 
  filter(decile_delay_full_days != 10) %>% 
  feglm(delay_zero ~ tmean |
          prec_quintile + rh_quintile + wsp_quintile +
          col_id + year^month + day_of_week + day_of_year,
        family = binomial(link = "logit"),
        cluster = ~ col_id)

logit_model_zero_one <-
  data %>% 
  filter(decile_delay_full_days != 10) %>% 
  feglm(delay_zero_or_one ~ tmean |
          prec_quintile + rh_quintile + wsp_quintile +
          col_id + year^month + day_of_week + day_of_year,
        family = binomial(link = "logit"),
        cluster = ~ col_id)
etable(logit_model, logit_model_zero_one)

# Model the Positive Delays
nb_model <- 
  data %>% 
  filter(decile_delay_full_days != 10) %>% 
  filter(delay_full_days > 0) %>% 
  fenegbin(
  delay_positive ~ tmean |
    prec_quintile + rh_quintile + wsp_quintile +
    col_id + year^month + day_of_week + day_of_year,
  cluster = ~ col_id
)

nb_model_with_0 <- 
  data %>% 
  filter(decile_delay_full_days != 10) %>% 
  #filter(delay_full_days > 0) %>% 
  fenegbin(
    delay_full_days ~ tmean |
      prec_quintile + rh_quintile + wsp_quintile +
      col_id + year^month + day_of_week + day_of_year,
    cluster = ~ col_id
  )

etable(nb_model, nb_model_with_0)

nb_model_both <- 
  data %>% 
  filter(decile_delay_full_days != 10) %>% 
  filter(delay_full_days > 0) %>% 
  fenegbin(
  delay_positive ~ tmean + tmean_report_day |
    prec_quintile + rh_quintile + wsp_quintile +
    col_id + year^month + day_of_week + day_of_year,
  cluster = ~ col_id
)

etable(logit_model, nb_model, nb_model_both,
       fitstat = ~ n,
       digits.stats = "s3", digits = "r4",
       #extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
       file = here("output", "tables", "reg_reporting_delays.tex"), replace = T,
       view = T,
       fixef_sizes = T,
       dict = c(col_id = "Neighborhood",
                tmean = "Temperature on incident day",
                tmean_report_day = "Temperature on reporting day"),
       fixef.group=list("Prec, hum, wsp quintiles"="quintile",
                        "Date FEs"="month|day"),
       style.tex = style.tex("aer",
                             model.format = "(i)",
                             tpt = TRUE,
                             notes.tpt.intro = "\\footnotesize",
                             fixef.suffix = " FEs"),
       adjustbox = TRUE, float = T,
       title = "Reporting delays",
       label = "reg_reporting_delays",
       notes = ""
)



# Use Reporting Time Categories as a Test for Reporting Bias ----

# Disaggregate DV Reports by categories of delay

final_data <-
  read_rds(here("output",
                "final_data_for_analysis.rds")) %>%
  filter(covid_phase == "Pre-pandemic")

final_data %>% 
  select(contains("reports_dv_delay")) %>% 
  names()

#weird names fuck up the fixest estimations
final_data <- final_data %>% 
  rename_with(~ str_replace_all(., "[\\[\\]()]", "")) %>%  # Remove brackets and parentheses
  rename_with(~ str_replace_all(., "-", "_"))              # Replace dashes with underscores


reg_delays_days <- 
  fepois(c(reports_dv_delay_days_0,
           reports_dv_delay_days_1,
           reports_dv_delay_days_2_7,
           reports_dv_delay_days_7_14,
           reports_dv_delay_days_14_) ~ tmean |
           col_id + year^month + day_of_week + day_of_year +
           prec_quintile + rh_quintile + wsp_quintile,
         cluster = ~ col_id,
         data = final_data)

etable(reg_delays_days, view = T)

etable(reg_delays_days,
       fitstat = ~ n + my,
       digits.stats = "s3", digits = "r4",
       #extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
       file = here("output", "tables", "reg_reporting_delays_categories.tex"), replace = T,
       view = T,
       fixef_sizes = T,
       dict = c(col_id = "Neighborhood",
                tmean = "Temperature on incident day",
                tmean_report_day = "Temperature on reporting day",
                reports_dv_delay_days_0 = "Same day",
                reports_dv_delay_days_1 = "Next day",
                reports_dv_delay_days_2_7 = "2-7 days",
                reports_dv_delay_days_7_14 = "7-14 days",
                reports_dv_delay_days_14_ = "14+ days"),
       fixef.group=list("Prec, hum, wsp quintiles"="quintile",
                        "Date FEs"="month|day"),
       style.tex = style.tex("aer",
                             model.format = "(i)",
                             tpt = TRUE,
                             notes.tpt.intro = "\\footnotesize",
                             fixef.suffix = " FEs"),
       adjustbox = TRUE, float = T,
       title = "Reporting delays: back to daily neighborhood counts",
       label = "reg_reporting_delays_categories",
       notes = "Here I disaggregate DV Reports by categories of delay. Higher temperatures increase DV reporting primarily in the immediate and short-delay categories"
)