library(tidyverse)
library(here) # path control
library(lubridate) # handle dates
library(fixest) # estimations with fixed effects
library(modelsummary) # for the modelplot figure
data <- readRDS(here("output", "final_data_for_analysis.rds")) %>%
  filter(covid_phase == "Pre-pandemic")
source(here("codes", "my_functions.R"))
#source(here("codes", "fixest_options.R"))
# unnest the hourly temperatures once, for exploration and the missing-value checks below
hourly_data <- data %>%
  select(tmp_hourly, col_id, date) %>%
  unnest(cols = c(tmp_hourly)) %>%
  group_by(col_id, date) %>%
  mutate(hour = row_number()) %>%
  ungroup()
hourly_data %>%
  ggplot(aes(x = round(tmp_hourly))) +
  geom_bar(fill = "blue", alpha = 0.5) +
  #geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(-20, 40, 2),
                     minor_breaks = seq(-20, 40, 1)) +
  theme_light() #+ facet_wrap(~hour)
# NUMBER OF HOURS IN TEMPERATURE BINS -----------------------------------------
## Remove the observations with missing values among the 24 hourly readings ----
# there are missing values on 61 days
hourly_data %>% filter(is.na(tmp_hourly)) %>%
  count(date)
# there are missing values in 14 different neighborhoods
hourly_data %>% filter(is.na(tmp_hourly)) %>%
  count(col_id)
# find all the missing observations
missing_obs <- hourly_data %>%
  filter(is.na(tmp_hourly)) %>%
  count(date, col_id)
# remove these 854 observations from the data, so that a reference category can be
# chosen without estimating a coefficient for the 0.01% of NA observations
data <- data %>%
  filter(!(date %in% missing_obs$date & col_id %in% missing_obs$col_id))
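# Sanity check (a sketch): after the filter, no (date, col_id) pair from missing_obs
# should remain. Note the filter above drops the full 61-day x 14-neighborhood cross
# product; an exact pairwise removal would be
# data %>% anti_join(missing_obs, by = c("date", "col_id")).
data %>% semi_join(missing_obs, by = c("date", "col_id")) %>% nrow()  # expect 0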
## Count the hours in bins ----
data <- data %>%
  mutate(hours_below_0 = map_int(tmp_hourly, ~ sum(.x < 0)),
hours_below_2 = map_int(tmp_hourly, ~ sum(.x < 2)),
hours_below_4 = map_int(tmp_hourly, ~ sum(.x < 4)),
hours_below_6 = map_int(tmp_hourly, ~ sum(.x < 6)),
hours_in_0_2 = map_int(tmp_hourly, ~ sum(.x >= 0 & .x < 2)),
hours_in_2_4 = map_int(tmp_hourly, ~ sum(.x >= 2 & .x < 4)),
hours_in_4_6 = map_int(tmp_hourly, ~ sum(.x >= 4 & .x < 6)),
hours_in_6_8 = map_int(tmp_hourly, ~ sum(.x >= 6 & .x < 8)),
hours_in_8_10 = map_int(tmp_hourly, ~ sum(.x >= 8 & .x < 10)),
hours_in_10_12 = map_int(tmp_hourly, ~ sum(.x >= 10 & .x < 12)),
hours_in_12_14 = map_int(tmp_hourly, ~ sum(.x >= 12 & .x < 14)),
hours_in_14_16 = map_int(tmp_hourly, ~ sum(.x >= 14 & .x < 16)),
hours_in_16_18 = map_int(tmp_hourly, ~ sum(.x >= 16 & .x < 18)),
hours_in_18_20 = map_int(tmp_hourly, ~ sum(.x >= 18 & .x < 20)),
hours_in_20_22 = map_int(tmp_hourly, ~ sum(.x >= 20 & .x < 22)),
hours_in_22_24 = map_int(tmp_hourly, ~ sum(.x >= 22 & .x < 24)),
hours_in_24_26 = map_int(tmp_hourly, ~ sum(.x >= 24 & .x < 26)),
hours_in_26_28 = map_int(tmp_hourly, ~ sum(.x >= 26 & .x < 28)),
hours_in_28_30 = map_int(tmp_hourly, ~ sum(.x >= 28 & .x < 30)),
hours_in_30_32 = map_int(tmp_hourly, ~ sum(.x >= 30 & .x < 32)),
hours_above_26 = map_int(tmp_hourly, ~ sum(.x >= 26)),
hours_above_28 = map_int(tmp_hourly, ~ sum(.x >= 28)),
hours_above_30 = map_int(tmp_hourly, ~ sum(.x >= 30)),
hours_above_32 = map_int(tmp_hourly, ~ sum(.x >= 32)),
hours_missing = map_int(tmp_hourly, ~ sum(is.na(.x)))
)
# is it complete? YES
data %>%
  summarise(across(starts_with("hours_"), ~ sum(.x, na.rm = TRUE))) %>%
  select(hours_below_6, hours_in_6_8, hours_in_8_10,
         hours_in_10_12, hours_in_12_14, hours_in_14_16,
         hours_in_16_18, hours_in_18_20, hours_in_20_22,
         hours_in_22_24, hours_in_24_26, hours_in_26_28,
         hours_above_28, hours_missing) %>%
  pivot_longer(cols = everything(), names_to = "estimate_names", values_to = "count") %>%
  mutate(all_hours = sum(count),
         # obs should equal nrow(data) if the bins cover every hour
         obs = all_hours / 24,
         nrow_data = nrow(data)) %>%
  print(n = 30)
## Regs ----
# all the bins cannot enter together: they sum to 24 hours per day, so including
# all of them would be collinear; the 16-18 bin is left out as the reference category
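# Quick check (a sketch) that the exclusive bins partition the day, which is what
# creates the collinearity: per observation they should sum to 24
# (hours_missing is 0 after dropping the incomplete days above).
data %>%
  mutate(total_hours = hours_below_6 + hours_in_6_8 + hours_in_8_10 + hours_in_10_12 +
           hours_in_12_14 + hours_in_14_16 + hours_in_16_18 + hours_in_18_20 +
           hours_in_20_22 + hours_in_22_24 + hours_in_24_26 + hours_in_26_28 +
           hours_above_28 + hours_missing) %>%
  count(total_hours)  # expect a single row with total_hours == 24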
reg_nr_hours_bins <- data %>%
  fepois(reports_dv ~
           hours_below_6 +
           hours_in_6_8 +
           hours_in_8_10 +
           hours_in_10_12 +
           hours_in_12_14 +
           hours_in_14_16 +
           #hours_in_16_18 +
           hours_in_18_20 +
           hours_in_20_22 +
           hours_in_22_24 +
           hours_in_24_26 +
           hours_in_26_28 +
           hours_above_28 + rh_quintile + wsp_quintile +
           prec_quintile |
           col_id + year^month + day_of_week + day_of_year,
         cluster = ~ col_id)
## summary stats ----
# compute the support of each bin: the average share of the day spent in it
sum_stats_hours <- data %>%
  summarise(across(starts_with("hours_"), ~ 100 * mean(.x, na.rm = TRUE) / 24)) %>%
  select(hours_below_6,
         hours_in_6_8, hours_in_8_10, hours_in_10_12,
         hours_in_12_14, hours_in_14_16, hours_in_16_18,
         hours_in_18_20, hours_in_20_22, hours_in_22_24,
         hours_in_24_26, hours_in_26_28,
         hours_above_28) %>%
  pivot_longer(cols = everything(), names_to = "estimate_names", values_to = "share_in_pc") %>%
  mutate(total = sum(share_in_pc)) %>%
  print(n = 20)
# add the estimates from the regression
sum_stats_hours <- sum_stats_hours %>%
  left_join(coefplot(reg_nr_hours_bins, only.params = T)$prms, by = "estimate_names") %>%
  mutate(ref = is.na(estimate)) %>%
  # the reference bin has no coefficient: replace its NAs with 0
  mutate(estimate = ifelse(is.na(estimate), 0, estimate),
         ci_low = ifelse(is.na(ci_low), 0, ci_low),
         ci_high = ifelse(is.na(ci_high), 0, ci_high)) %>%
  mutate(estimate_names = as_factor(estimate_names))
# back of the envelope:
# if the whole day warms by two degrees, how many more reports do we get?
sum_stats_hours %>%
  select(estimate_names, estimate, share_in_pc) %>%
  mutate(
    # build "the average day", with a rounded number of hours per bin
    share_in_hours = share_in_pc/100*24,
    # round (the -0.05 is a correction so the rounded hours sum to 24)
    share_in_hours = round(share_in_hours - 0.05),
    # check that it sums to 24
    #check = sum(share_in_hours),
    # shift the distribution up by one bin, i.e. warm the day by two degrees
    new_share_in_hours = lag(share_in_hours),
    diff_share_in_hours = new_share_in_hours - share_in_hours,
    # compute the implied effect
    effect = estimate * diff_share_in_hours,
    avg_effect = sum(effect, na.rm = TRUE)
  )
# there are usually 4 hours in the reference bin 16-18;
# if on the average day two of those hours move up to 18-20,
# I get 0.00639*100 = 0.6% more reports
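# Interpretation sketch: fepois coefficients are semi-elasticities, so the implied
# proportional change is exp(avg_effect) - 1, roughly avg_effect itself for small values;
# with the 0.00639 quoted above:
exp(0.00639) - 1  # ~0.0064, i.e. about 0.6% more reports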
# rename the estimates with readable bin labels
sum_stats_hours <- sum_stats_hours %>%
  mutate(estimate_labels = fct_recode(estimate_names,
"<6" = "hours_below_6",
"[6-8)" = "hours_in_6_8",
"[8-10)" = "hours_in_8_10",
"[10-12)" = "hours_in_10_12",
"[12-14)" = "hours_in_12_14",
"[14-16)" = "hours_in_14_16",
"[16-18)" = "hours_in_16_18",
"[18-20)" = "hours_in_18_20",
"[20-22)" = "hours_in_20_22",
"[22-24)" = "hours_in_22_24",
"[24-26)" = "hours_in_24_26",
"[26-28)" = "hours_in_26_28",
">=28" = "hours_above_28"))
## Plot the results ----
sum_stats_hours %>%
  ggplot(aes(x = estimate_names, y = estimate)) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = ci_low, ymax = ci_high), width = 0.2) +
geom_hline(yintercept = 0, alpha = .5) +
theme_light() +
scale_x_discrete(guide = guide_axis(angle = 30)) +
scale_y_continuous(breaks = seq(-0.1, 0.1, 0.01)) +
labs(x = "Number of hours in temperature bin",
y="Estimate and 95% CI")
ggsave(path = here("output", "figures"),
filename = "reg_bins_nr_of_hours.png",
width = 6, height = 4)
## Plot the support of the bins ----
sum_stats_hours %>%
  ggplot(aes(x = estimate_names, y = share_in_pc, fill = ref)) +
geom_col() +
scale_fill_manual(values = c("TRUE" = "gray", "FALSE" = "gray30")) +
theme_light() +
scale_x_discrete(guide = guide_axis(angle = 30)) +
scale_y_continuous(labels = scales::percent_format(scale = 1),
breaks = seq(0, 100, 2)) +
theme(legend.position = "none") +
labs(x="Hourly temperature bin",
y="Share of hours per day",
caption = "Lighter bar is the reference bin in the estimations")
ggsave(path = here("output", "figures"),
filename = "reg_bins_nr_of_hours_distrib.png",
width = 6, height = 4)
## Table the output ---------
etable(reg_nr_hours_bins)
setFixest_dict(
c(reports_dv = "Reports",
hours_be = "Daily Mean Temp. Bin")
)
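# Note (an assumption about intent): setFixest_dict() matches variable names exactly,
# so the "hours_be" prefix above will likely not relabel hours_below_6 etc.;
# a sketch with one entry per bin would be:
# setFixest_dict(c(hours_below_6 = "Hours below 6", hours_in_6_8 = "Hours in [6,8)"))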
etable(reg_nr_hours_bins,
drop.section = "fixef",
fitstat = ~ n,
digits.stats = "s3", digits = "r4",
#extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
file = here("output", "tables", "reg_nr_hours_bins.tex"), replace = T,
view = T,
style.tex = style.tex("aer",
model.format = "(i)",
tpt = TRUE,
notes.tpt.intro = "\\footnotesize",
fixef.suffix = " FEs"),
float = T,
title = "The Effect of Temperature on Domestic Violence Incidence",
label = "reg_nr_hours_bins",
notes = "these are my table notes"
)
# NON EXCLUSIVE BINS: NR OF HOURS ABOVE A CERTAIN THRESHOLD -------------------
# this makes less sense because the bins are redundant (not mutually exclusive);
# i keep it because i tried it, but it is probably pointless
# (a quick nestedness check follows the mutate below)
## Count the hours ----
# e.g. does it matter if it is 30 degrees during the day and 25 during the night?
data <- data %>%
  mutate(hours_above_8 = map_int(tmp_hourly, ~ sum(.x > 8)),
hours_above_10 = map_int(tmp_hourly, ~ sum(.x > 10)),
hours_above_12 = map_int(tmp_hourly, ~ sum(.x > 12)),
hours_above_14 = map_int(tmp_hourly, ~ sum(.x > 14)),
hours_above_16 = map_int(tmp_hourly, ~ sum(.x > 16)),
hours_above_18 = map_int(tmp_hourly, ~ sum(.x > 18)),
hours_above_20 = map_int(tmp_hourly, ~ sum(.x > 20)),
hours_above_22 = map_int(tmp_hourly, ~ sum(.x > 22)),
hours_above_24 = map_int(tmp_hourly, ~ sum(.x > 24)),
hours_above_25 = map_int(tmp_hourly, ~ sum(.x > 25)),
hours_above_26 = map_int(tmp_hourly, ~ sum(.x > 26)),
hours_above_28 = map_int(tmp_hourly, ~ sum(.x > 28)),
hours_above_30 = map_int(tmp_hourly, ~ sum(.x > 30)),
hours_above_32 = map_int(tmp_hourly, ~ sum(.x > 32))
#34 would be only zeroes
)
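# Quick check (a sketch) of the redundancy flagged above: the thresholds are nested
# by construction, so adjacent counts are mechanically correlated.
data %>%
  summarise(all_nested = all(hours_above_8 >= hours_above_10, na.rm = TRUE),
            cor_8_10 = cor(hours_above_8, hours_above_10, use = "complete.obs"))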
data %>%
  summarise(across(starts_with("hours_above"), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(cols = everything(), names_to = "threshold", values_to = "mean")
data %>%
  group_by(month) %>%
  summarise(across(starts_with("hours_above"), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(cols = starts_with("hours_above"), names_to = "threshold", values_to = "mean") %>%
  pivot_wider(names_from = month, values_from = mean)
## Run the regs ----
reg_nr_hours <- data %>%
  fepois(reports_dv ~ sw(hours_above_8,
                         hours_above_10,
                         hours_above_12,
                         hours_above_14,
                         hours_above_16,
                         hours_above_18,
                         hours_above_20,
                         hours_above_22,
                         hours_above_24,
                         #hours_above_25,
                         hours_above_26,
                         hours_above_28,
                         hours_above_30,
                         hours_above_32) + rh_quintile + wsp_quintile +
           prec_quintile |
           col_id + year^month + day_of_week + day_of_year,
         cluster = ~ col_id)
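# sw() estimates one model per hours_above_* variable, so reg_nr_hours is a
# fixest_multi object; a quick look across all of them (a sketch):
etable(reg_nr_hours, keep = "hours_above", fitstat = ~ n)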
## Plot and table ----
# hard-coded from the fepois estimation above, since re-running it takes ages;
# each row comes from a different regression (one per sw() variable)
# (stored under a separate name so the fixest object above is not overwritten)
reg_nr_hours_coefs <- tribble(
  ~estimate, ~ci_low, ~ci_high, ~x, ~id, ~estimate_names, ~estimate_names_raw, ~y,
0.014056025, 0.008161320, 0.01995073, 0.650000, 1, "hours_above_8", "hours_above_8", 0.014056025,
0.010706560, 0.006519568, 0.01489355, 1.708333, 2, "hours_above_10", "hours_above_10", 0.010706560,
0.008560532, 0.005175507, 0.01194556, 2.766667, 3, "hours_above_12", "hours_above_12", 0.008560532,
0.008424704, 0.005746303, 0.01110310, 3.825000, 4, "hours_above_14", "hours_above_14", 0.008424704,
0.008092284, 0.005424632, 0.01075994, 4.883333, 5, "hours_above_16", "hours_above_16", 0.008092284,
0.011583179, 0.008149088, 0.01501727, 5.941667, 6, "hours_above_18", "hours_above_18", 0.011583179,
0.012108312, 0.008426261, 0.01579036, 7.000000, 7, "hours_above_20", "hours_above_20", 0.012108312,
0.010810006, 0.006834899, 0.01478511, 8.058333, 8, "hours_above_22", "hours_above_22", 0.010810006,
0.010138456, 0.005112505, 0.01516441, 9.116667, 9, "hours_above_24", "hours_above_24", 0.010138456,
0.004533660, -0.002805860, 0.01187318, 10.175000, 10, "hours_above_26", "hours_above_26", 0.004533660,
0.003003480, -0.009620764, 0.01562772, 11.233333, 11, "hours_above_28", "hours_above_28", 0.003003480,
-0.008474810, -0.059064996, 0.04211538, 12.291667, 12, "hours_above_30", "hours_above_30", -0.008474810,
-5.819714160, -5.954458705, -5.68496962, 13.350000, 13, "hours_above_32", "hours_above_32", -5.819714160
)
#coefplot(reg_nr_hours, only.params = T)$prms %>%
reg_nr_hours_coefs %>%
  filter(abs(ci_low) < 0.05) %>%
mutate(estimate_names = as_factor(estimate_names)) %>%
ggplot(aes(x = estimate_names, y = estimate)) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = ci_low, ymax = ci_high), width = 0.2) +
geom_hline(yintercept = 0, alpha = .5) +
theme_light() +
scale_x_discrete(guide = guide_axis(angle = 30)) +
labs(x=NULL,y="Estimate and 95% CI")
etable(reg_nr_hours,
       fitstat = ~ n + my,
       #extralines = ~ tmean_eff_size_1 + tmean_eff_size_2, # these macros are not registered in this script
       drop.section = "fixef")
# NUMBER OF HOURS: NOTES ------------------------------------------------------
# I run the analyses that use the number of hours above a certain threshold as the
# main coefficient of interest. This loosely follows the approach of Hoffmann and Rud
# (ECMA), adapted to temperatures. It makes less sense for temperature, though,
# so maybe I should just let it go.