Number of hours

Note

I run the analyses in which I use the number of hours above a certain threshold as the main coeff of interest.

This follows a bit the approahc of Hoffmann Rud ECMA but adapted to temperatures.

It makes less sense for temp though, so maybe I should just let it goooo

library(tidyverse)
library(here) # path control
library(lubridate) # handle dates
library(fixest) # estimations with fixed effects
library(modelsummary) # for the modelplot figure

data <-
  readRDS(here("output", "final_data_for_analysis.rds")) %>% 
  filter(covid_phase == "Pre-pandemic")

source(here("codes", "my_functions.R"))

#source(here("codes", "fixest_options.R"))

data %>% select(tmp_hourly, col_id, date) %>%
  unnest(cols = c(tmp_hourly)) %>% 
  group_by(col_id, date) %>%
  mutate(hour = row_number()) %>% 
  ungroup() %>%
  ggplot(aes(x = round(tmp_hourly))) +
  geom_bar(fill = "blue", alpha = 0.5) +
  #geom_density(fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(-20, 40, 2),
                     minor_breaks = seq(-20,40,1)) +
  theme_light() #+ facet_wrap(~hour)


# NUMBER OF HOURS IN TEMPERATURE BINS -----------------------------------------

## Remove the observations with missing values in the 24 hours----
# there are missing on 61 days
hourly_data %>% filter(is.na(tmp_hourly)) %>% 
  count(date)
# there are missing for 14 different neighborhoods
hourly_data %>% filter(is.na(tmp_hourly)) %>% 
  count(col_id)

# i find all the missing observations
missing_obs <- hourly_data %>% 
  filter(is.na(tmp_hourly)) %>% 
  count(date, col_id)

# remove these 854 observations from the data, to be able to choose
# a ref category without having to estimate a coefficient for 0.01 pc of na obsevrations
data <- data %>% 
  filter(!(date %in% missing_obs$date & col_id %in% missing_obs$col_id))

## Count the hours in bins ----

data <- data %>%
  mutate(hours_below_0 = map_int(tmp_hourly, ~ sum(.x < 0)),
         hours_below_2 = map_int(tmp_hourly, ~ sum(.x < 2)),
         hours_below_4 = map_int(tmp_hourly, ~ sum(.x < 4)),
         hours_below_6 = map_int(tmp_hourly, ~ sum(.x < 6)),
         
         hours_in_0_2 = map_int(tmp_hourly, ~ sum(.x >= 0 & .x < 2)),
         hours_in_2_4 = map_int(tmp_hourly, ~ sum(.x >= 2 & .x < 4)),
         hours_in_4_6 = map_int(tmp_hourly, ~ sum(.x >= 4 & .x < 6)),
         hours_in_6_8 = map_int(tmp_hourly, ~ sum(.x >= 6 & .x < 8)),
         hours_in_8_10 = map_int(tmp_hourly, ~ sum(.x >= 8 & .x < 10)),
         hours_in_10_12 = map_int(tmp_hourly, ~ sum(.x >= 10 & .x < 12)),
         hours_in_12_14 = map_int(tmp_hourly, ~ sum(.x >= 12 & .x < 14)),
         hours_in_14_16 = map_int(tmp_hourly, ~ sum(.x >= 14 & .x < 16)),
         hours_in_16_18 = map_int(tmp_hourly, ~ sum(.x >= 16 & .x < 18)),
         hours_in_18_20 = map_int(tmp_hourly, ~ sum(.x >= 18 & .x < 20)),
         hours_in_20_22 = map_int(tmp_hourly, ~ sum(.x >= 20 & .x < 22)),
         hours_in_22_24 = map_int(tmp_hourly, ~ sum(.x >= 22 & .x < 24)),
         hours_in_24_26 = map_int(tmp_hourly, ~ sum(.x >= 24 & .x < 26)),
         hours_in_26_28 = map_int(tmp_hourly, ~ sum(.x >= 26 & .x < 28)),
         hours_in_28_30 = map_int(tmp_hourly, ~ sum(.x >= 28 & .x < 30)),
         hours_in_30_32 = map_int(tmp_hourly, ~ sum(.x >= 30 & .x < 32)),
         
         hours_above_26 = map_int(tmp_hourly, ~ sum(.x >= 26)),
         hours_above_28 = map_int(tmp_hourly, ~ sum(.x >= 28)),
         hours_above_30 = map_int(tmp_hourly, ~ sum(.x >= 30)),
         hours_above_32 = map_int(tmp_hourly, ~ sum(.x >= 32)),
         
         hours_missing = map_int(tmp_hourly, ~ sum(is.na(.x)))
  )

# is it complete??? YES
data %>% 
  summarise(across(starts_with("hours_"), ~ sum(.x, na.rm = T))) %>% 
  select(hours_below_6, hours_in_6_8, hours_in_8_10, 
         hours_in_10_12, hours_in_12_14, hours_in_14_16,
         hours_in_16_18, hours_in_18_20, hours_in_20_22,
         hours_in_22_24, hours_in_24_26, hours_in_26_28,
         hours_above_28, hours_missing) %>%
  pivot_longer(cols = everything(), names_to = "estimate_names", values_to = "count") %>% 
  print(n=30) %>% 
  mutate(all_hours = sum(count),
         obs = all_hours/24,
         (nrow(data)))

## Regs ----

# i ofc cannot put all the bins together, collinear
# i choose one bin for reference
reg_nr_hours_bins <- data %>% 
  fepois(reports_dv ~
          hours_below_6 + 
          hours_in_6_8 +
          hours_in_8_10 +
          hours_in_10_12 + 
          hours_in_12_14 +
          hours_in_14_16 +
          #hours_in_16_18 +
          hours_in_18_20 +
          hours_in_20_22 +
          hours_in_22_24 +
          hours_in_24_26 +
          hours_in_26_28 +
          hours_above_28 |
          prec_quintile + rh_quintile + wsp_quintile +
          col_id + year^month + day_of_week + day_of_year,
        cluster = ~ col_id)

## summary stats ----

# compute support
sum_stats_hours <- data %>% 
  summarise(across(starts_with("hours_"), ~ 100*mean(.x, na.rm =T)/24)) %>% 
  select(hours_below_6,
         hours_in_6_8, hours_in_8_10, hours_in_10_12,
         hours_in_12_14, hours_in_14_16, hours_in_16_18,
         hours_in_18_20, hours_in_20_22, hours_in_22_24,
         hours_in_24_26, hours_in_26_28,
         hours_above_28) %>% 
  pivot_longer(cols = everything(), names_to = "estimate_names", values_to = "share_in_pc") %>% 
  mutate(total = sum(share_in_pc)) %>% 
  print(n=20)

# add estimates
sum_stats_hours <- sum_stats_hours %>% 
  left_join(coefplot(reg_nr_hours_bins, only.params = T)$prms, by = "estimate_names") %>% 
  mutate(ref = is.na(estimate)) %>% 
  #replace all NA with 0
  mutate(estimate = ifelse(is.na(estimate), 0, estimate),
         ci_low = ifelse(is.na(ci_low), 0, ci_low),
         ci_high = ifelse(is.na(ci_high), 0, ci_high)) %>%
  mutate(estimate_names = as_factor(estimate_names))

# back of the enveloppe
# if i increase the whole day by two degrees, how many more reports do i get?
sum_stats_hours %>% 
  select(estimate_names, estimate, share_in_pc) %>% 
  mutate(
    #i try to find "the average day, with rounded numbers of hours
    share_in_hours = share_in_pc/100*24,
    # round (and 0.05 is correction to have 24 hours)
    share_in_hours = round(share_in_hours-0.05),
    #check that sums to 24
    #check = sum(share_in_hours)
    #create new distribution
    new_share_in_hours = lag(share_in_hours),
    diff_share_in_hours = new_share_in_hours - share_in_hours,
    #compute the effect
    effect = estimate*diff_share_in_hours,
    avg_effect = sum(effect, na.rm =T)
    )

# there are usually 4 hours in the ref bin 16-18
# if on the average day i move two hours to 18-20
# i get 0.00639*100 = 0.6% more reports


#rename estimate names
sum_stats_hours <- sum_stats_hours %>% 
  mutate(estimate_labels = fct_recode(estimate_names, 
                                     "<6" = "hours_below_6",
                                     "[6-8)" = "hours_in_6_8",
                                     "[8-10)" = "hours_in_8_10",
                                     "[10-12)" = "hours_in_10_12",
                                     "[12-14)" = "hours_in_12_14",
                                     "[14-16)" = "hours_in_14_16",
                                     "[16-18)" = "hours_in_16_18",
                                     "[18-20)" = "hours_in_18_20",
                                     "[20-22)" = "hours_in_20_22",
                                     "[22-24)" = "hours_in_22_24",
                                     "[24-26)" = "hours_in_24_26",
                                     "[26-28)" = "hours_in_26_28",
                                     ">=28" = "hours_above_28"))

## Plot the results ----

sum_stats_hours %>% 
  ggplot(aes(x = estimate_names, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci_low, ymax = ci_high), width = 0.2) +
  geom_hline(yintercept = 0, alpha = .5) +
  theme_light() +
  scale_x_discrete(guide = guide_axis(angle = 30)) +
  scale_y_continuous(breaks = seq(-0.1, 0.1, 0.01)) +
  labs(x = "Number of hours in temperature bin",
       y="Estimate and 95% CI")

ggsave(path = here("output", "figures"),
       filename = "reg_bins_nr_of_hours.png",
       width = 6, height = 4)

## Plot the support of the bins ----
sum_stats_hours %>% 
  ggplot(aes(x = estimate_names, y = share_in_pc, fill = ref)) +
  geom_col() +
  scale_fill_manual(values = c("TRUE" = "gray", "FALSE" = "gray30")) +
  theme_light() +
  scale_x_discrete(guide = guide_axis(angle = 30)) +
  scale_y_continuous(labels = scales::percent_format(scale = 1),
                     breaks = seq(0, 100, 2)) +
  theme(legend.position = "none") +
  labs(x="Hourly temperature bin",
       y="Share of hours per day",
       caption = "Lighter bar is the reference bin in the estimations")

ggsave(path = here("output", "figures"),
       filename = "reg_bins_nr_of_hours_distrib.png",
       width = 6, height = 4)

## Table the output ---------

etable(reg_nr_hours_bins)

setFixest_dict(
  c(reports_dv = "Reports",
    hours_be = "Daily Mean Temp. Bin")
)

etable(reg_nr_hours_bins,
       drop.section = "fixef",
       fitstat = ~ n,
       digits.stats = "s3", digits = "r4",
       #extralines = ~ tmean_eff_size_1 + tmean_eff_size_2,
       file = here("output", "tables", "reg_nr_hours_bins.tex"), replace = T,
       view = T,
       style.tex = style.tex("aer",
                             model.format = "(i)",
                             tpt = TRUE,
                             notes.tpt.intro = "\\footnotesize",
                             fixef.suffix = " FEs"),
      float = T,
       title = "The Effect of Temperature on Domestic Violence Incidence",
       label = "reg_nr_hours_bins",
       notes = "these are my table notes"
)




# NON EXCLUSIVE BINS: NR OF HOURS ABOVE A CERTAIN THRESHOLD -------------------

# this makes less sense because bins are redudant since not exclusive
# i keep it becayse i tried but probably pointless

## Count the hours ----
#like 30 degrees during the days and 25 during the night?

data <- data %>%
  mutate(hours_above_8 = map_int(tmp_hourly, ~ sum(.x > 8)),
         hours_above_10 = map_int(tmp_hourly, ~ sum(.x > 10)),
         hours_above_12 = map_int(tmp_hourly, ~ sum(.x > 12)),
         hours_above_14 = map_int(tmp_hourly, ~ sum(.x > 14)),
         hours_above_16 = map_int(tmp_hourly, ~ sum(.x > 16)),
         hours_above_18 = map_int(tmp_hourly, ~ sum(.x > 18)),
         hours_above_20 = map_int(tmp_hourly, ~ sum(.x > 20)),
         hours_above_22 = map_int(tmp_hourly, ~ sum(.x > 22)),
         hours_above_24 = map_int(tmp_hourly, ~ sum(.x > 24)),
         hours_above_25 = map_int(tmp_hourly, ~ sum(.x > 24)),
         hours_above_26 = map_int(tmp_hourly, ~ sum(.x > 26)),
         hours_above_28 = map_int(tmp_hourly, ~ sum(.x > 28)),
         hours_above_30 = map_int(tmp_hourly, ~ sum(.x > 30)),
         hours_above_32 = map_int(tmp_hourly, ~ sum(.x > 32))
         #34 would be only zeroes
  )

data %>% 
  summarise(across(starts_with("hours_above"), ~ mean(.x, na.rm =T))) %>% 
  pivot_longer(cols = everything(), names_to = "threshold", values_to = "mean") 

data %>% 
  group_by(month) %>% 
  summarise(across(starts_with("hours_above"), ~ mean(.x, na.rm =T))) %>% 
  pivot_longer(cols = starts_with("hours_above"), names_to = "threshold", values_to = "mean") %>% 
  pivot_wider(names_from = month, values_from = mean)

## Run the regs ----

reg_nr_hours <- data %>% 
  fepois(reports_dv ~ sw(hours_above_8,
                         hours_above_10,
                         hours_above_12,
                         hours_above_14,
                         hours_above_16,
                         hours_above_18,
                         hours_above_20,
                         hours_above_22,
                         hours_above_24,
                         #hours_above_25,
                         hours_above_26,
                         hours_above_28,
                         hours_above_30,
                         hours_above_32) |
           prec_quintile+ rh_quintile + wsp_quintile +
           col_id + year^month + day_of_week + day_of_year,
         cluster = ~ col_id)



## Plot and table ----
#from the fepois estimation, since it takes ages...
# all from different regressions
reg_nr_hours <- tribble(
  ~estimate,       ~ci_low,      ~ci_high,     ~x,        ~id,  ~estimate_names, ~estimate_names_raw, ~y,
  0.014056025,  0.008161320,  0.01995073,  0.650000,   1,  "hours_above_8",  "hours_above_8",  0.014056025,
  0.010706560,  0.006519568,  0.01489355,  1.708333,   2,  "hours_above_10", "hours_above_10", 0.010706560,
  0.008560532,  0.005175507,  0.01194556,  2.766667,   3,  "hours_above_12", "hours_above_12", 0.008560532,
  0.008424704,  0.005746303,  0.01110310,  3.825000,   4,  "hours_above_14", "hours_above_14", 0.008424704,
  0.008092284,  0.005424632,  0.01075994,  4.883333,   5,  "hours_above_16", "hours_above_16", 0.008092284,
  0.011583179,  0.008149088,  0.01501727,  5.941667,   6,  "hours_above_18", "hours_above_18", 0.011583179,
  0.012108312,  0.008426261,  0.01579036,  7.000000,   7,  "hours_above_20", "hours_above_20", 0.012108312,
  0.010810006,  0.006834899,  0.01478511,  8.058333,   8,  "hours_above_22", "hours_above_22", 0.010810006,
  0.010138456,  0.005112505,  0.01516441,  9.116667,   9,  "hours_above_24", "hours_above_24", 0.010138456,
  0.004533660, -0.002805860,  0.01187318, 10.175000,  10,  "hours_above_26", "hours_above_26", 0.004533660,
  0.003003480, -0.009620764,  0.01562772, 11.233333,  11,  "hours_above_28", "hours_above_28", 0.003003480,
  -0.008474810, -0.059064996,  0.04211538, 12.291667,  12,  "hours_above_30", "hours_above_30", -0.008474810,
  -5.819714160, -5.954458705, -5.68496962, 13.350000,  13,  "hours_above_32", "hours_above_32", -5.819714160
)

#coefplot(reg_nr_hours, only.params = T)$prms %>% 
  reg_nr_hours %>% 
  filter(abs(ci_low) < 0.05) %>% 
  mutate(estimate_names = as_factor(estimate_names)) %>% 
  ggplot(aes(x = estimate_names, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci_low, ymax = ci_high), width = 0.2) +
  geom_hline(yintercept = 0, alpha = .5) +
  theme_light() +
  scale_x_discrete(guide = guide_axis(angle = 30)) +
  labs(x=NULL,y="Estimate and 95% CI")

etable(reg_nr_hours,
       fitstat = ~ n+my,
       extralines = ~ tmean_eff_size_1 + tmean_eff_size_2, drop.section = "fixef")