Precipitation data from CONAGUA

In this script, I process the CONAGUA precipitation data, measured at the daily level for the entire country. I download the relevant station-level data, interpolate it with inverse distance weighting (IDW) using a 20 km cutoff, and save the result at the AGEB-by-day level in processed_conagua_ageb.rds.

(NB: for the most recent data, from the second half of 2024, I use a 50 km cutoff to avoid NAs, since the data is more sparse.)

The final output is a file at the AGEB-by-day level with daily precipitation.

1 Download the raw data

This page lists all the precipitation data from CONAGUA: https://sih.conagua.gob.mx/climas.html. I crawl it and download each file. The download takes a long time because there are almost 8,000 files, one per station over the whole country.

# Extract all the links to CSV files
csv_links <-
  read_html("https://sih.conagua.gob.mx/climas.html") %>% # Read the webpage
  html_nodes("a") %>%       # Select all links
  html_attr("href") %>%     # Extract the href attribute
  str_subset("\\.csv$") %>% # Keep only the links ending with .csv
  as_tibble() %>%
  rename(relative_url = value)

csv_links <- csv_links %>%
  mutate(base_url = "https://sih.conagua.gob.mx/",
         url      = paste0(base_url, relative_url),
         filename = basename(relative_url))
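For reference, the chunks in this script rely on packages loaded in a setup chunk that is not shown here. The list below is only a sketch of the assumed imports, based on the functions used throughout; the original setup may differ.

# assumed setup (sketch): packages used by the chunks below
library(tidyverse)  # dplyr, tidyr, purrr, stringr, readr, ggplot2, forcats
library(rvest)      # read_html(), html_nodes(), html_attr()
library(stringi)    # stri_trans_general()
library(lubridate)  # ymd(), year(), as_date()
library(here)       # project-relative paths
library(sf)         # st_read(), st_as_sf(), st_distance(), ...
library(data.table) # fread()
library(future)     # plan()
library(furrr)      # future_map()
library(tictoc)     # tic(), toc()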
Import catalog of stations
The catalog of stations is on the same page. I download it separately.
catalogo_link <- csv_links %>%
  filter(str_detect(filename, "Catalogo"))

csv_links <- csv_links %>%
  filter(!str_detect(filename, "Catalogo"))
# download the catalogo
# no need to download it again and again
# download.file(catalogo_link$url,
# here("..", "raw_data", "3_conagua",
# "csv_downloaded_by_script_CDMX",
# basename(catalogo_link$relative_url)))
# import it
catalogo <-
  read_csv(here("..", "raw_data", "3_conagua",
                "csv_downloaded_by_script_CDMX",
                basename(catalogo_link$relative_url)),
           locale = locale(encoding = "ISO-8859-1"))
# Clean the column names manually
colnames(catalogo) <- colnames(catalogo) %>%
tolower() %>% # Convert to lowercase
stri_trans_general("Latin-ASCII") %>% # Remove accents and special characters
gsub("[^a-z0-9_]", "_", .) %>% # Replace non-alphanumeric characters with "_"
gsub("_+", "_", .) %>% # Replace multiple underscores with a single one
gsub("_$", "", .) # Remove trailing underscores
rm(catalogo_link)
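As an aside, not part of the original pipeline: if the janitor package is available, essentially the same cleaning can be done in one call.

# alternative sketch: clean_names() lowercases, strips accents, and snake_cases
catalogo <- janitor::clean_names(catalogo)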
Select only the stations within 50 km

The CONAGUA data contains daily measurements for thousands of weather stations across the whole country. I first want to identify the stations relevant for my analysis, i.e. those surrounding CDMX.
I first select the surrounding states.
catalogo <- catalogo %>%
  filter(estado %in% c("Ciudad de México", "México", "Hidalgo",
                       "Morelos", "Puebla", "Tlaxcala"))
We can be stricter, and only download the ones within 50km of the center of CDMX. Remove two stations with unknown location and convert to sf:
catalogo <- catalogo %>%
  filter(!round(latitud) == 0 & !round(longitud) == 0)

catalogo <-
  st_as_sf(catalogo, coords = c("longitud", "latitud"), crs = 4326,
           remove = FALSE)
# Import AGEBs
ageb <-
  st_read(here("..",
               "output",
               "ageb_poligonos.geojson")) %>%
  st_transform(4326) %>%
  st_make_valid()
Reading layer `ageb_poligonos' from data source
`/Users/martin/Documents/EUI/RESEARCH/violence and climate/heat and DV mexico/data and regressions/output/ageb_poligonos.geojson'
using driver `GeoJSON'
Simple feature collection with 2431 features and 1 field
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: -99.34306 ymin: 19.12409 xmax: -98.94664 ymax: 19.59276
Geodetic CRS: WGS 84
ageb <- ageb %>%
  st_centroid() %>%
  mutate(lon = st_coordinates(.)[, 1],
         lat = st_coordinates(.)[, 2])
I compute the distance of each station from the geographic center of CDMX and keep only those located less than 50 km away, which is more than enough. Note that this is not the cutoff for the IDW, just a first filter.
catalogo <- catalogo %>%
  mutate(dist_center = as.numeric(
    st_distance(catalogo, st_centroid(st_union(ageb)))),
    .before = 1) %>%
  # 50 km cutoff
  filter(dist_center < 50000)
2 Download all the station files
Filter only the relevant stations; now there are only about 300. It is historical data, so the vast majority of stations will be unused.
csv_links <- csv_links %>%
  # remove .csv from filename to get the ID of the station
  mutate(clave = str_remove(filename, "\\.csv$"))

# keep the links only to the relevant stations
csv_links <- csv_links %>%
  filter(clave %in% catalogo$clave)
And download the data, one by one:
# to keep results reproducible, do not add new files
# to download fresh files, remove the existing ones first
# Check if file already exists, if yes, no need to download it again.
# Be polite; ask only once!
csv_links %>%
  filter(!filename %in%
           list.files(here("..", "raw_data", "3_conagua", "csv_downloaded_by_script_CDMX"))) %>%
  # Download each file
  pull(url) %>%
  walk(~ download.file(.x,
                       here("..", "raw_data", "3_conagua",
                            "csv_downloaded_by_script_CDMX",
                            basename(.x))),
       .progress = TRUE)
rm(csv_links)
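If the server ever throttles or drops connections, a more defensive variant of the loop above (run before the rm(csv_links)) could pause between files and skip failures. This is only a sketch; safe_download is a helper name introduced here, not part of the original script.

# sketch: throttled, failure-tolerant version of the download loop above
safe_download <- purrr::possibly(download.file, otherwise = NULL)

csv_links %>%
  pull(url) %>%
  walk(~ {
    safe_download(.x,
                  here("..", "raw_data", "3_conagua",
                       "csv_downloaded_by_script_CDMX",
                       basename(.x)))
    Sys.sleep(1) # pause one second between requests
  })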
3 Import the precipitation data
I can now import the data only for those stations.
# list all the files
prec_files <-
  list.files(here("..", "raw_data", "3_conagua", "csv_downloaded_by_script_CDMX"),
             full.names = TRUE) %>%
  as_tibble() %>%
  rename(full_path = value) %>%
  mutate(filename = basename(full_path),
         clave = str_remove(filename, "\\.csv$"))
prec_data <-
  map_dfr(.x = prec_files %>% pull(full_path),
          .f = ~ fread(.x,
                       skip = 6,
                       # only read date and prec, the rest is incomplete
                       # select = (1:2),
                       na.strings = c("-", "")) %>%
            mutate(clave = str_remove(basename(.x), "\\.csv$"))
  )

# keep only prec and non-NA values
prec_data <- prec_data %>%
  select(clave,
         fecha = Fecha,
         prec = `Precipitación(mm)`) %>%
  filter(!is.na(prec))

rm(prec_files)
Filter based on dates:
prec_data <- prec_data %>%
  mutate(fecha = ymd(fecha), .keep = "unused") %>%
  filter(year(fecha) >= 2016)
Check station activity
Most stations are active throughout the entire period, though fewer are active in the most recent years.
# there are 210 stations!
prec_data %>% distinct(clave) %>% nrow()

[1] 210
prec_data %>%
  group_by(clave) %>%
  # mutate(number_of_measurements = ) %>%
  filter(n() > 50) %>%
  group_by(fecha) %>%
  ggplot(aes(x = fecha, y = clave, group = clave)) +
  geom_line() +
  geom_point(shape = 15) +
  scale_y_discrete(guide = guide_axis(n.dodge = 2)) +
  theme(axis.text.y = element_text(size = 8))
ggsave(here("..", "output",
"figures",
"stations_activity_precipitation_all.png"),
width = 10,
height = 20,
dpi = 300)
We want to keep only the stations that record consistently.
# 9 years, that is 3285 days in total
station_activity <- prec_data %>%
  filter(year(fecha) %in% 2016:2024) %>%
  # count nr of non-NA days per station
  group_by(clave) %>%
  summarise(nr_days = n(),
            share_of_total = 100 * nr_days / 3285)

# we keep stations that are active at least 75% of the time
station_activity <- station_activity %>% filter(share_of_total >= 75)
catalogo <- catalogo %>% filter(clave %in% station_activity$clave)
prec_data <- prec_data %>% filter(clave %in% station_activity$clave)
prec_data %>%
  group_by(fecha) %>%
  ggplot(aes(x = fecha, y = clave, group = clave)) +
  geom_line() +
  geom_point(shape = 15) +
  scale_y_discrete(guide = guide_axis(n.dodge = 2)) +
  theme(axis.text.y = element_text(size = 8))
ggsave(here("..", "output",
"figures",
"stations_activity_precipitation.png"),
width = 8,
height = 16,
dpi = 300)
There are many stations recording throughout the whole period within 50 km of the center. We will only use those within 20 km of a CDMX AGEB for the IDW.
library(tmap)
tm_shape(catalogo) +
tm_dots(col = "red", scale = 3) +
tm_shape(ageb) +
tm_dots(alpha = .3) +
tm_layout(title = 'Many weather stations record precipitation') +
tm_scale_bar()
4 Matching procedure
I now have to go from station-level data to the AGEB level.
Compute distances
distances <-
  crossing(
    (ageb %>% st_drop_geometry() %>%
       select(ageb,
              ageb_lon = lon,
              ageb_lat = lat)),
    (catalogo %>% st_drop_geometry() %>%
       select(clave,
              sta_lon = longitud,
              sta_lat = latitud))
  )
# compute the distance in meters, using the haversine method, and convert to km
distances <- distances %>%
  mutate(distance = geosphere::distHaversine(cbind(ageb_lon, ageb_lat),
                                             cbind(sta_lon, sta_lat)),
         distance = distance / 1000)
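As a quick sanity check, not part of the original pipeline, one pair can be compared against sf's geodesic distance, which should be very close to the haversine value; a sketch:

# sketch: compare the first AGEB-station pair with sf's geodesic distance (in km)
p_ageb <- st_sfc(st_point(c(distances$ageb_lon[1], distances$ageb_lat[1])), crs = 4326)
p_sta  <- st_sfc(st_point(c(distances$sta_lon[1], distances$sta_lat[1])), crs = 4326)
as.numeric(st_distance(p_ageb, p_sta)) / 1000 # should be close to distances$distance[1]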
distances %>%
  saveRDS(here("..", "output",
               "temporary_data",
               "weather",
               "distances_precipitation.rds"))
Matching procedure
I have about 3,300 days. I process them day by day, so as not to exhaust memory.
# get all the dates
fechas <- prec_data %>%
  pull(fecha) %>%
  unique()
To each AGEB and day, I assign the measurements of all stations within 20 km. I then compute an inverse-distance-weighted average for that day.
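In formula terms, the weighted mean computed inside the function below is (with $a$ indexing AGEBs, $d$ days, and $S_a$ the set of stations within the cutoff distance of AGEB $a$):

$$
\text{prec}_{a,d} = \frac{\sum_{i \in S_a} w_{a,i}\,\text{prec}_{i,d}}{\sum_{i \in S_a} w_{a,i}},
\qquad w_{a,i} = \frac{1}{\text{dist}(a,i)}.
$$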
# this function computes daily averages at the AGEB level
# from daily data at the station level
# It also takes care of missing values
process_data_for_one_day <- function(fecha, max_dist) {

  prec_data_one_day <- prec_data %>%
    filter(fecha == !!fecha)

  one_day <-
    # take each AGEB and station combination that are within max_dist km of each other
    distances %>%
    filter(distance <= max_dist) %>%
    select(ageb, clave, distance) %>%
    # add date to each observation
    mutate(fecha = prec_data_one_day %>% pull(fecha) %>% unique(), .before = 1) %>%
    # Add the actual data (day x station) to each AGEB
    left_join(prec_data_one_day,
              by = c("clave", "fecha")) %>%
    filter(!is.na(prec)) %>%
    # Now, I want to move to the AGEB by date level
    # I take inverse distance weighted averages, so I compute a weight based on distance
    mutate(weight = 1 / distance) %>%
    group_by(ageb, fecha) %>%
    # compute IDW averages
    summarise(prec = stats::weighted.mean(prec, weight, na.rm = TRUE),
              .groups = "drop")

  return(one_day)
}
# just as an example for one day.
# I have one observation per AGEB.
process_data_for_one_day(fechas[2000], max_dist = 20)
# A tibble: 2,431 × 3
ageb fecha prec
<chr> <date> <dbl>
1 0900200010010 2021-06-23 14.3
2 0900200010025 2021-06-23 12.4
3 090020001003A 2021-06-23 16.2
4 0900200010044 2021-06-23 13.0
5 0900200010097 2021-06-23 14.1
6 090020001010A 2021-06-23 12.4
7 0900200010114 2021-06-23 9.58
8 0900200010129 2021-06-23 8.86
9 0900200010133 2021-06-23 8.57
10 0900200010148 2021-06-23 11.3
# ℹ 2,421 more rows
Run the function for each day
# I want to use a lower cutoff for the IDW for most of the data,
# but a higher cutoff for the most recent days, until
# the data gets updated (to avoid NAs) (see station activity above)
# We don't want to drop observations just because rain is missing,
# so I split the date series in two and run the function twice
fechas_recent <- fechas[fechas >= as_date("2023-04-01")]
fechas_old    <- as_date(setdiff(fechas, fechas_recent))
# set parallel computing
parallel::detectCores()
[1] 10
plan(multisession, workers = 6)
# set memory limit
mem.maxVSize(vsize = 40000)
[1] 40000
# run for each day
tic()
list_of_days_old <-
  # use map instead of future_map to run single core with purrr
  future_map(fechas_old, ~ process_data_for_one_day(.x, max_dist = 20),
             .progress = TRUE)
toc()
27.057 sec elapsed
tic()
list_of_days_recent <-
  future_map(fechas_recent, ~ process_data_for_one_day(.x, max_dist = 50),
             .progress = TRUE)
toc()
8.208 sec elapsed
# to go back to single core
plan(sequential)
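For reference, the single-core purrr alternative mentioned in the comment above would look like this (a sketch; same result, just without parallelism):

# sketch: single-core equivalent of the first parallel call above
list_of_days_old <- map(fechas_old,
                        ~ process_data_for_one_day(.x, max_dist = 20),
                        .progress = TRUE)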
Bind the dataframes and complete:
prec_daily <-
  rbind(
    list_rbind(list_of_days_old),
    list_rbind(list_of_days_recent)
  )

# fill with NAs for absent combinations
prec_daily <- prec_daily %>%
  complete(fecha = seq(min(fecha), max(fecha), by = "1 day"),
           ageb)
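A quick sanity check, not in the original pipeline: after complete(), the panel should be balanced, with one row per AGEB per day.

# sketch: verify the completed panel is balanced
stopifnot(nrow(prec_daily) ==
            n_distinct(prec_daily$ageb) * n_distinct(prec_daily$fecha))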
5 Look at missing values
The data is sparser only for the recent years (2023, 2024); there are no missing values for the earlier years.

With a 20 km cutoff, we would have 54 days in 2023 with some AGEBs missing precipitation; with the 50 km cutoff, only a few days in 2024 have any missing values. That is why we use the larger cutoff for the recent period.
missings <-
  prec_daily %>%
  group_by(fecha) %>%
  summarise(nr_missing = sum(is.na(prec)),
            share_missing = mean(is.na(prec)))
missings %>%
  count(year(fecha), missing_days = share_missing != 0)
# A tibble: 11 × 3
`year(fecha)` missing_days n
<int> <lgl> <int>
1 2016 FALSE 366
2 2017 FALSE 365
3 2018 FALSE 365
4 2019 FALSE 365
5 2020 FALSE 366
6 2021 FALSE 365
7 2022 FALSE 365
8 2023 FALSE 365
9 2024 FALSE 363
10 2024 TRUE 3
11 2025 FALSE 56
missings %>%
  ggplot(aes(x = fecha, y = share_missing, color = as_factor(year(fecha)))) +
  geom_line() +
  labs(title = "Missing values in the precipitation data",
       x = "Date",
       y = "Share of missing values")
6 Save the data
prec_daily %>%
  write_rds(here("..", "output",
                 "temporary_data",
                 "weather",
                 "processed_conagua_ageb.rds"),
            compress = "gz")
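Downstream scripts can then load the panel with read_rds(); a sketch:

# sketch: load the processed AGEB-by-day precipitation panel
prec_daily <- read_rds(here("..", "output",
                            "temporary_data",
                            "weather",
                            "processed_conagua_ageb.rds"))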