In this script, I process the CONAGUA precipitation data, which is measured at the daily level for the entire country. I download the relevant station-level data, process it with inverse distance weighting (IDW) using a 20 km cutoff, and save it at the AGEB-by-day level in processed_conagua_ageb.rds.
(NB: for the most recent data (from April 2023 onwards), I use a 50 km cutoff to avoid NAs, since the station coverage is sparser there.)
The final output is a file at the AGEB-by-day level with daily precipitation.
Download the raw data
This website links all the precipitation data from CONAGUA: https://sih.conagua.gob.mx/climas.html. I crawl the page and download each file. The download takes a long time because there are almost 8,000 files, one per station across the whole country.
# Extract all the links to CSV files
csv_links <- read_html("https://sih.conagua.gob.mx/climas.html") %>% # Read the webpage
  html_nodes("a") %>%        # Select all links
  html_attr("href") %>%      # Extract the href attribute
  str_subset("\\.csv$") %>%  # Keep only the links ending with .csv
  as_tibble() %>%
  rename(relative_url = value)
Error in open.connection(x, "rb"): cannot open the connection
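The download step further below uses url and filename columns that are not created in the chunk above. A minimal sketch of how they could be derived, assuming the CSV links on the page are relative to the site root (the base URL used here is an assumption):

# sketch: build absolute URLs and filenames from the relative links
# (the base URL is an assumption; the original chunk is not shown in this excerpt)
csv_links <- csv_links %>%
  mutate(url = paste0("https://sih.conagua.gob.mx/", relative_url),
         filename = basename(relative_url))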
The catalog of stations is on the same page. I download it separately.
# download the catalogo
# no need to download it again and again
# download.file(catalogo_link$url,
#               here("..", "raw_data", "3_conagua",
#                    "csv_downloaded_by_script_CDMX",
#                    basename(catalogo_link$relative_url)))

# import it
catalogo <- read_csv(here("..", "raw_data", "3_conagua",
                          "csv_downloaded_by_script_CDMX",
                          "0_Catalogo_de_estaciones_climatologicas.csv"),
                     locale = locale(encoding = "ISO-8859-1"))

# Clean the column names manually
colnames(catalogo) <- colnames(catalogo) %>%
  tolower() %>%                          # Convert to lowercase
  stri_trans_general("Latin-ASCII") %>%  # Remove accents and special characters
  gsub("[^a-z0-9_]", "_", .) %>%         # Replace non-alphanumeric characters with "_"
  gsub("_+", "_", .) %>%                 # Replace multiple underscores with a single one
  gsub("_$", "", .)                      # Remove trailing underscores

rm(catalogo_link)
Select only the stations within 50 km
The CONAGUA data contains daily measurements for thousands of weather stations across the whole country. I first want to identify the stations relevant for my analysis, i.e. those surrounding CDMX.
I compute the distance of every station from the geographic center of CDMX and keep only those located less than 50 km away, which is more than enough. Note that this is not the cutoff for the IDW; it is just a first filter.
After filtering to the relevant stations, only about 300 remain. Since this is historical data, the vast majority of stations will go unused.
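The distance computation and 50 km filter themselves are not shown in this excerpt. A minimal sketch, assuming approximate coordinates for the geographic center of CDMX and the same haversine distance used later in the script (the coordinates are an assumption):

# sketch: keep only stations within 50 km of the CDMX center
# (the center coordinates are approximate assumptions; the original chunk is not shown)
cdmx_center <- c(lon = -99.13, lat = 19.43)

catalogo <- catalogo %>%
  mutate(dist_to_center = geosphere::distHaversine(cbind(longitud, latitud),
                                                   cdmx_center) / 1000) %>%
  filter(dist_to_center <= 50)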
And download the data, one file at a time:
csv_links <- csv_links %>%
  # remove .csv from filename to get the ID of the station
  mutate(clave = str_remove(filename, ".csv"))

# keep the links only to the relevant stations
csv_links <- csv_links %>%
  filter(clave %in% catalogo$clave)

# to keep results reproducible, do not add new files
# to download fresh files, remove the existing ones first
# Check if file already exists, if yes, no need to download it again.
# Be polite; ask only once!
csv_links %>%
  filter(!filename %in% list.files(here("..", "raw_data", "3_conagua",
                                        "csv_downloaded_by_script_CDMX"))) %>%
  # Download each file
  pull(url) %>%
  walk(~ download.file(.x,
                       here("..", "raw_data", "3_conagua",
                            "csv_downloaded_by_script_CDMX",
                            basename(.x))),
       .progress = TRUE)

rm(csv_links)
Import the precipitation data
I can now import the data only for those stations.
# list all the files
prec_files <- list.files(here("..", "raw_data", "3_conagua", "csv_downloaded_by_script_CDMX"),
                         full.names = TRUE) %>%
  as_tibble() %>%
  rename(full_path = value) %>%
  mutate(filename = basename(full_path),
         clave = str_remove(filename, ".csv"))

prec_data <- map_dfr(
  .x = prec_files %>% pull(full_path),
  .f = ~ fread(.x,
               skip = 6,
               # only read date and prec, the rest is incomplete
               # select = (1:2),
               na.strings = c("-", "")) %>%
    mutate(clave = str_remove(basename(.x), ".csv"))
)

# keep only prec and non-NA values
prec_data <- prec_data %>%
  select(clave,
         fecha = Fecha,
         prec = `Precipitación(mm)`) %>%
  filter(!is.na(prec))

rm(prec_files)
We want to keep only the stations that record consistently.
station_activity <- prec_data %>%
  # 9 years, that is 3285 days in total
  filter(year(fecha) %in% 2016:2024) %>%
  # count nr of non-NA days per station
  group_by(clave) %>%
  summarise(nr_days = n(),
            share_of_total = 100 * nr_days / 3285)

# we keep stations that are active at least 75pc of the time
station_activity <- station_activity %>%
  filter(share_of_total >= 75)

catalogo <- catalogo %>%
  filter(clave %in% station_activity$clave)

prec_data <- prec_data %>%
  filter(clave %in% station_activity$clave)

prec_data %>%
  group_by(fecha) %>%
  ggplot(aes(x = fecha, y = clave, group = clave)) +
  geom_line() +
  geom_point(shape = 15) +
  scale_y_discrete(guide = guide_axis(n.dodge = 2)) +
  theme(axis.text.y = element_text(size = 8))
There are many stations recording throughout the whole period within 50 km of the center. For the IDW, we will only use those within 20 km of a CDMX AGEB.
I now have to go from station-level data to the AGEB level.
Compute distances
distances <- crossing(
  (ageb %>%
     st_drop_geometry() %>%
     select(ageb,
            ageb_lon = lon,
            ageb_lat = lat)),
  (catalogo %>%
     st_drop_geometry() %>%
     select(clave,
            sta_lon = longitud,
            sta_lat = latitud))
)

# compute the distance in meters, using haversine method, and convert to km
distances <- distances %>%
  mutate(distance = geosphere::distHaversine(cbind(ageb_lon, ageb_lat),
                                             cbind(sta_lon, sta_lat)),
         distance = distance / 1000)

distances %>%
  saveRDS(here("..", "output", "temporary_data", "weather",
               "distances_precipitation.rds"))
Matching procedure
I have about 3,300 days. I have to process them day by day to avoid exhausting memory.
# get all the dates
fechas <- prec_data %>%
  pull(fecha) %>%
  unique()
For each day, I assign to each AGEB the measurements of all stations within 20 km. I then compute an inverse-distance-weighted average per AGEB and day.
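Concretely, for AGEB a on day t, using the stations i within the cutoff at distances d_i, the function below computes

$$\mathrm{prec}_{a,t} = \frac{\sum_i \mathrm{prec}_{i,t}/d_i}{\sum_i 1/d_i}$$

so closer stations get proportionally more weight.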
# this function computes daily precipitation averages at the AGEB level
# from daily data at the station level
# It also takes care of missing values
process_data_for_one_day <- function(fecha, max_dist) {

  prec_data_one_day <- prec_data %>%
    filter(fecha == !!fecha)

  one_day <-
    # take each AGEB and station combination within max_dist km of each other
    distances %>%
    filter(distance <= max_dist) %>%
    select(ageb, clave, distance) %>%
    # add date to each observation
    mutate(fecha = prec_data_one_day %>% pull(fecha) %>% unique(),
           .before = 1) %>%
    # Add the actual data (day x station) to each AGEB
    left_join(prec_data_one_day,
              by = c("clave", "fecha")) %>%
    filter(!is.na(prec)) %>%
    # Now, I want to move to the AGEB by date level
    # I take inverse distance weighted averages, so I compute a weight based on distance
    mutate(weight = 1 / distance) %>%
    group_by(ageb, fecha) %>%
    # compute idw averages
    summarise(prec = stats::weighted.mean(prec, weight, na.rm = TRUE),
              .groups = "drop")

  return(one_day)
}

# just as an example for one day.
# I have one observation per AGEB.
process_data_for_one_day(fechas[2000], max_dist = 20)
# I want to run a lower cutoff for the IDW for most of the data,
# but a higher cutoff for the most recent days, until
# the data gets updated (to avoid NAs) (see station activity above)
# We don't want to drop observations just because rain is missing,
# so I split the date series in two and run the function twice
fechas_recent <- fechas[fechas >= as_date("2023-04-01")]
fechas_old <- as_date(setdiff(fechas, fechas_recent))

# set parallel computing
parallel::detectCores()
[1] 10
plan(multisession, workers = 6)

# set memory limit
mem.maxVSize(vsize = 40000)
[1] 40000
# run for each day
tic()
list_of_days_old <-
  # use map instead of future_map to run single core with purrr
  future_map(fechas_old,
             ~ process_data_for_one_day(.x, max_dist = 20),
             .progress = TRUE)
toc()
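The corresponding run for the recent dates, which produces the list_of_days_recent used below, is not shown in this excerpt. A minimal sketch, assuming the same call with the 50 km cutoff mentioned above:

# run for the recent days with the wider 50 km cutoff
# (sketch; assumes the same setup as the run above, since that chunk is not shown)
tic()
list_of_days_recent <-
  future_map(fechas_recent,
             ~ process_data_for_one_day(.x, max_dist = 50),
             .progress = TRUE)
toc()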
prec_daily <- rbind(
  list_rbind(list_of_days_old),
  list_rbind(list_of_days_recent)
)

# fill with NAs for absent combinations
prec_daily <- prec_daily %>%
  complete(fecha = seq(min(fecha), max(fecha), by = "1 day"),
           ageb)
Look at missing values
The data is sparser only for the recent years (2023 and 2024); there are no missing values for the earlier years.
With the 20 km cutoff, 54 days in 2023 would have some AGEBs missing precipitation; with the 50 km cutoff, only a few days in 2024 do. That is why we use the wider cutoff for the recent data.
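The missings object plotted below is not constructed in this excerpt. A minimal sketch, assuming it is the daily share of missing precipitation values in prec_daily (the variable name share_missing matches the plot below; the rest is an assumption):

# sketch: share of AGEBs with missing precipitation per day
# (assumes the prec_daily panel built above; not the original chunk)
missings <- prec_daily %>%
  group_by(fecha) %>%
  summarise(share_missing = mean(is.na(prec)),
            .groups = "drop")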
missings %>%
  ggplot(aes(x = fecha, y = share_missing, color = as_factor(year(fecha)))) +
  geom_line() +
  labs(title = "Missing values in the precipitation data",
       x = "Date",
       y = "Share of missing values")
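The save to processed_conagua_ageb.rds described at the top is not part of this excerpt. A minimal sketch, assuming the same output folder used for the distances file above (the exact path is an assumption):

# sketch: save the AGEB-by-day precipitation panel
# (output folder mirrors the distances file above and is an assumption)
prec_daily %>%
  saveRDS(here("..", "output", "temporary_data", "weather",
               "processed_conagua_ageb.rds"))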