Site Key Cleaning

Author

JD Reigrut, Juan Carlos Villaseñor-Derbez

Published

December 9, 2025

Step 1) Clean each group of sites

Baja West cleaning

# Load the BAJA_WEST sheet of the raw atlas workbook and standardise its keys.
Baja_west <- read_excel(path = here("data", "raw", "atlas_locations_raw.xlsx"),
                        sheet = "BAJA_WEST") %>%
  mutate(
    # Rewrite Clave as text with exactly four decimals. sprintf() formats each
    # value on its own; format() pads every element to a common width, which
    # can leave leading spaces that make the later join on Clave miss rows.
    Clave = sprintf("%.4f", round(as.numeric(stri_trim_both(Clave)), 4))
  ) %>%
  # Drop the Num column from the sheet
  select(-Num)

Baja East cleaning

# Load the BAJA_EAST sheet of the raw atlas workbook and standardise its keys.
Baja_east <- read_excel(path = here("data", "raw", "atlas_locations_raw.xlsx"),
                        sheet = "BAJA_EAST") %>%
  mutate(
    # Rewrite Clave as text with exactly four decimals. sprintf() formats each
    # value on its own; format() pads every element to a common width, which
    # can leave leading spaces that make the later join on Clave miss rows.
    Clave = sprintf("%.4f", round(as.numeric(stri_trim_both(Clave)), 4))
  ) %>%
  # Drop the Num and Mapa columns from the sheet
  select(-Num, -Mapa)

Baja Sur West cleaning

# Load the BAJASUR_WEST sheet of the raw atlas workbook and standardise its
# keys.
Bajasur_west <- read_excel(path = here("data", "raw", "atlas_locations_raw.xlsx"),
                           sheet = "BAJASUR_WEST") %>%
  mutate(
    # Rewrite Clave as text with exactly four decimals. sprintf() formats each
    # value on its own; format() pads every element to a common width, which
    # can leave leading spaces that make the later join on Clave miss rows.
    Clave = sprintf("%.4f", round(as.numeric(stri_trim_both(Clave)), 4))
  ) %>%
  # Drop the Num and Mapa columns from the sheet
  select(-Num, -Mapa)

Baja Sur East cleaning

# Load the BAJASUR_EAST sheet of the raw atlas workbook and standardise its
# keys.
Bajasur_east <- read_excel(path = here("data", "raw", "atlas_locations_raw.xlsx"),
                           sheet = "BAJASUR_EAST") %>%
  mutate(
    # Rewrite Clave as text with exactly four decimals. sprintf() formats each
    # value on its own; format() pads every element to a common width, which
    # can leave leading spaces that make the later join on Clave miss rows.
    Clave = sprintf("%.4f", round(as.numeric(stri_trim_both(Clave)), 4))
  ) %>%
  # Drop the Num and Mapa columns from the sheet
  select(-Num, -Mapa)

Sonora cleaning

# Load the SONORA sheet of the raw atlas workbook. Its headers are upper-case,
# so they are renamed to the column names used by the other regional tables
# before the keys are standardised.
Sonora <- read_excel(path = here("data", "raw", "atlas_locations_raw.xlsx"),
                     sheet = "SONORA") %>%
  # Drop the NUM. and MAPA columns from the sheet
  select(-NUM., -MAPA) %>%
  rename(Localidad = LOCALIDAD,
         Clave = CLAVE,
         Captura = CAPTURA,
         Desembarque = DESEMBAR.) %>%
  mutate(
    # Rewrite Clave as text with exactly four decimals. sprintf() formats each
    # value on its own; format() pads every element to a common width, which
    # can leave leading spaces that make the later join on Clave miss rows.
    Clave = sprintf("%.4f", round(as.numeric(stri_trim_both(Clave)), 4))
  )

Step 2) Combining each group into one comprehensive data.frame

# Stack the five regional tables, in this order, into one table of site keys.
sites_key <- do.call(
  rbind,
  list(Baja_east, Baja_west, Bajasur_east, Bajasur_west, Sonora)
)

Step 3) Add geospatial reference to the cleaned sites

# Read the landing-sites geopackage and store Clave as whitespace-trimmed text
# so it can be matched against the character keys of the site list.
gpkg <- st_read(dsn = here("data", "processed", "landingsites.gpkg")) %>%
  mutate(Clave = trimws(as.character(Clave)))
Reading layer `landingsites' from data source 
  `/Users/jcvd/GitHub/mex_fishing_locations/data/processed/landingsites.gpkg' 
  using driver `GPKG'
Simple feature collection with 1714 features and 2 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -118.403 ymin: 22.87 xmax: -109.042 ymax: 32.6508
Geodetic CRS:  WGS 84
# Attach the geopackage rows to every site by Clave. Both tables carry a
# Localidad column, so the join suffixes them (.x = site list, .y = geopackage)
# and they are renamed to say where each one came from.
sites_key_geo <- sites_key %>%
  left_join(y = gpkg, by = "Clave") %>%
  rename(Localidad_key = Localidad.x,
         Localidad_geo = Localidad.y) %>%
  # Convert the joined table into an sf object
  st_as_sf(crs = "EPSG:4326")

Step 4) Minor cleanup after join

A few problematic sites exist where a single key is mapped to different locations

Check which keys in the site list match multiple rows in the geopackage

# Number of geopackage rows recorded for each key
key_count <- count(gpkg, Clave, name = "count")

# Attach those counts to the site list and keep only the sites whose key
# appears more than once in the geopackage
site_check <- sites_key %>%
  left_join(key_count, by = "Clave") %>%
  filter(count > 1)

Check which keys in the geopackage match multiple rows in the site list

# Number of site-list rows recorded for each key
site_count <- count(sites_key, Clave, name = "count")

# Attach the geopackage rows to those counts and keep only the keys that
# appear more than once in the site list
key_check <- site_count %>%
  left_join(gpkg, by = "Clave") %>%
  filter(count > 1)

Fix duplicates

Using the locality name, manually select one location for each problem key

# For each problem key, keep only the row whose geopackage locality name is
# the chosen one; rows for every other key pass through untouched.
sites_key_geo <- sites_key_geo %>%
  filter(
    # MUELLE (1.0081)
    Clave != "1.0081" | Localidad_geo == "MUELLE",
    # BOCANA EL ROSARIO (1.0236)
    Clave != "1.0236" | Localidad_geo == "ROSARIO BOCANA EL",
    # PUERTO CORTEZ (2.0306)
    Clave != "2.0306" | Localidad_geo == "CORTEZ PUERTO",
    # BOCA DE PIEDRA (5.0321)
    Clave != "5.0321" | Localidad_geo == "PIEDRA BOCA DE"
  ) %>%
  # Discard rows whose geometry is empty
  filter(!st_is_empty(.))

Step 4b) Visualize data before exporting

# Render the cleaned sites with mapview for a visual check
mapview::mapview(sites_key_geo)

Step 5) Saving final clean geopackage

# Destination of the cleaned, georeferenced site keys
clean_path <- here("data", "processed", "sites_key.gpkg")

# Write the sf object to that geopackage
write_sf(obj = sites_key_geo, dsn = clean_path)