In this notebook we are going to assign country codes to Darwin Core records using a shapefile from Marine Regions.
Load packages:
library(dplyr)
library(sf)
library(ggplot2)
library(tibble)
Let’s read the shapefile. The shapefile is not included in the code repository but can be downloaded from Marine Regions (see “Marine and land zones: the union of world country boundaries and EEZ’s”).
shapes <- st_read("EEZ_land_union_v3_202003/EEZ_Land_v3_202030.shp", quiet = TRUE)
head(shapes)
## Simple feature collection with 6 features and 30 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 8.326077 ymin: -23.18541 xmax: 51.72543 ymax: 70.09226
## CRS: 4326
## UNION MRGID_EEZ
## 1 Estonia 5675
## 2 Mayotte 48944
## 3 Overlapping claim Qatar / Saudi Arabia / United Arab Emirates 50170
## 4 Cameroon 8475
## 5 Finland 5676
## 6 Bassas da India 8340
## TERRITORY1 MRGID_TER1 ISO_TER1 UN_TER1 SOVEREIGN1 MRGID_SOV1 ISO_SOV1
## 1 Estonia 2110 EST 233 Estonia 2110 EST
## 2 Mayotte 8606 MYT 175 France 17 FRA
## 3 Qatar 8468 QAT 634 Qatar 8468 QAT
## 4 Cameroon 2170 CMR 120 Cameroon 2170 CMR
## 5 Finland 2106 FIN 246 Finland 2106 FIN
## 6 Bassas da India 31136 ATF 260 France 17 FRA
## UN_SOV1 TERRITORY2 MRGID_TER2 ISO_TER2 UN_TER2 SOVEREIGN2 MRGID_SOV2
## 1 233 <NA> 0 <NA> NA <NA> 0
## 2 250 Mayotte 8606 MYT 175 Comores 2163
## 3 634 Saudi Arabia 2215 SAU 682 Saudi Arabia 2215
## 4 120 <NA> 0 <NA> NA <NA> 0
## 5 246 <NA> 0 <NA> NA <NA> 0
## 6 250 <NA> 0 <NA> NA <NA> 0
## ISO_SOV2 UN_SOV2 TERRITORY3 MRGID_TER3 ISO_TER3 UN_TER3
## 1 <NA> NA <NA> 0 <NA> NA
## 2 COM 174 <NA> 0 <NA> 0
## 3 SAU 682 United Arab Emirates 2206 ARE 784
## 4 <NA> NA <NA> 0 <NA> NA
## 5 <NA> NA <NA> 0 <NA> NA
## 6 <NA> NA <NA> 0 <NA> NA
## SOVEREIGN3 MRGID_SOV3 ISO_SOV3 UN_SOV3 POL_TYPE
## 1 <NA> 0 <NA> NA Union EEZ and country
## 2 <NA> 0 <NA> 0 Overlapping claim
## 3 United Arab Emirates 2206 ARE 784 Overlapping claim
## 4 <NA> 0 <NA> NA Union EEZ and country
## 5 <NA> 0 <NA> NA Union EEZ and country
## 6 <NA> 0 <NA> NA Union EEZ and country
## Y_1 x_1 AREA_KM2 geometry
## 1 58.71530 24.40898 81842 MULTIPOLYGON (((26.61348 57...
## 2 -13.21377 45.30528 67285 MULTIPOLYGON (((46.68598 -1...
## 3 24.69243 51.59986 126 MULTIPOLYGON (((51.68368 24...
## 4 5.62533 12.63665 480130 MULTIPOLYGON (((16.05693 1....
## 5 63.97904 25.44114 420076 MULTIPOLYGON (((19.11472 60...
## 6 -20.86104 39.42539 120802 MULTIPOLYGON (((41.174 -19....
Now let’s generate some random coordinates and convert them to an sf object:
n_points <- 30000
points <- data.frame(decimalLongitude = runif(n_points, min = -180, max = 180), decimalLatitude = runif(n_points, min = -90, max = 90)) %>%
st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), crs = 4326)
head(points)
## Simple feature collection with 6 features and 0 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -160.4588 ymin: -82.4573 xmax: 150.4549 ymax: 85.31104
## CRS: EPSG:4326
## geometry
## 1 POINT (-160.4588 -69.45966)
## 2 POINT (-67.67719 -22.7881)
## 3 POINT (150.4549 -11.56853)
## 4 POINT (-160.2717 -82.4573)
## 5 POINT (-120.9354 -63.18504)
## 6 POINT (-15.25975 85.31104)
Now we can join the points and shapes layers using st_join(). I’m also adding a column with the original row numbers here, this is necessary because the resulting dataframe joined may be larger than the original points dataframe due to overlapping polygons.
joined <- points %>%
rownames_to_column(var = "rowname") %>%
st_join(shapes, join = st_within)
head(joined)
## Simple feature collection with 6 features and 31 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: -160.4588 ymin: -82.4573 xmax: 150.4549 ymax: 85.31104
## CRS: EPSG:4326
## rowname UNION MRGID_EEZ TERRITORY1 MRGID_TER1 ISO_TER1
## 1 1 <NA> NA <NA> NA <NA>
## 2 2 Bolivia NA Bolivia 2137 BOL
## 3 3 Papua New Guinea 8324 Papua New Guinea 2237 PNG
## 4 4 Antarctica 8489 Antarctica 1926 ATA
## 5 5 <NA> NA <NA> NA <NA>
## 6 6 Greenland 8438 Greenland 2260 GRL
## UN_TER1 SOVEREIGN1 MRGID_SOV1 ISO_SOV1 UN_SOV1 TERRITORY2 MRGID_TER2
## 1 NA <NA> NA <NA> NA <NA> NA
## 2 68 Bolivia 2137 BOL 68 <NA> NA
## 3 598 Papua New Guinea 2237 PNG 598 <NA> 0
## 4 10 Antarctica 1926 ATA 10 <NA> 0
## 5 NA <NA> NA <NA> NA <NA> NA
## 6 304 Denmark 2157 DNK 208 <NA> 0
## ISO_TER2 UN_TER2 SOVEREIGN2 MRGID_SOV2 ISO_SOV2 UN_SOV2 TERRITORY3 MRGID_TER3
## 1 <NA> NA <NA> NA <NA> NA <NA> NA
## 2 <NA> NA <NA> NA <NA> NA <NA> NA
## 3 <NA> NA <NA> 0 <NA> NA <NA> 0
## 4 <NA> NA <NA> 0 <NA> NA <NA> 0
## 5 <NA> NA <NA> NA <NA> NA <NA> NA
## 6 <NA> NA <NA> 0 <NA> NA <NA> 0
## ISO_TER3 UN_TER3 SOVEREIGN3 MRGID_SOV3 ISO_SOV3 UN_SOV3 POL_TYPE
## 1 <NA> NA <NA> NA <NA> NA <NA>
## 2 <NA> NA <NA> NA <NA> NA Landlocked country
## 3 <NA> NA <NA> 0 <NA> NA Union EEZ and country
## 4 <NA> NA <NA> 0 <NA> NA Union EEZ and country
## 5 <NA> NA <NA> NA <NA> NA <NA>
## 6 <NA> NA <NA> 0 <NA> NA Union EEZ and country
## Y_1 x_1 AREA_KM2 geometry
## 1 NA NA NA POINT (-160.4588 -69.45966)
## 2 -16.71004 -64.66432 1082641 POINT (-67.67719 -22.7881)
## 3 -5.62490 149.83749 2861676 POINT (150.4549 -11.56853)
## 4 -77.78128 11.74745 22143430 POINT (-160.2717 -82.4573)
## 5 NA NA NA POINT (-120.9354 -63.18504)
## 6 75.13542 -36.40807 4437543 POINT (-15.25975 85.31104)
Now we can merge the duplicated lines and concatenate country codes (column ISO_TER1) where necessary.
result <- joined %>%
group_by(rowname) %>%
summarize(country_code = paste0(na.omit(unique(ISO_TER1)), collapse = ";")) %>%
mutate(country_code = na_if(country_code, "")) %>%
arrange(as.numeric(rowname)) %>%
st_set_geometry(NULL)
head(result)
## # A tibble: 6 x 2
## rowname country_code
## <chr> <chr>
## 1 1 <NA>
## 2 2 BOL
## 3 3 PNG
## 4 4 ATA
## 5 5 <NA>
## 6 6 GRL
table(result$country_code)
##
## ABW AFG AGO AIA ALB ARE ARG ARG;URY ARM ASM
## 1 26 59 6 5 5 187 1 3 17
## ATA ATF ATG AUS AUT AZE BEL BEN BES BFA
## 3899 91 5 590 9 11 1 7 4 8
## BGD BGR BHS BIH BLR BLZ BMU BOL BRA BRB
## 14 9 23 3 8 2 18 36 448 7
## BTN BWA CAF CAN CCK CHE CHL CHN CIV CMR
## 2 20 23 1425 17 2 195 527 21 15
## COD COG COK COL COM CPV CRI CUB CUW CXR
## 100 15 94 71 8 42 26 19 3 8
## CYM CYP CZE DEU DNK DOM DZA ECU EGY ERI
## 2 3 4 30 7 14 108 11 44 5
## ESH ESP EST ETH FIN FJI FLK FRA FRO FSM
## 26 42 5 42 33 37 31 48 17 117
## GAB GBR GEO GHA GIN GLP GMB GNB GNB;SEN GNQ
## 16 62 7 19 12 4 1 6 2 14
## GRC GRL GTM GUF GUM GUY HMD HND HRV HTI
## 37 650 11 12 8 18 21 7 6 6
## HUN IDN IND IRL IRN IRQ ISL ITA JAM JOR
## 2 291 184 25 59 19 80 44 12 2
## JPN KAZ KEN KGZ KHM KIR KOR KWT LAO LBR
## 197 162 25 14 14 115 29 1 13 18
## LBY LKA LSO LTU LVA MAR MDA MDG MDV MEX
## 82 18 1 6 3 35 1 77 40 217
## MHL MLI MLT MMR MNG MNP MOZ MRT MTQ MUS
## 76 51 1 45 94 33 67 60 2 47
## MWI MYS MYT NAM NCL NER NFK NGA NIC NIU
## 1 36 3 62 61 51 21 45 10 19
## NLD NOR NPL NRU NZL OMN PAK PAN PCN PER
## 3 125 6 9 208 38 49 13 30 89
## PHL PLW PNG POL PRI PRK PRT PRY PYF REU
## 85 29 123 16 9 9 19 24 199 10
## ROU RUS RWA SAU SDN SEN SGS SHN SJM SJM;ISL
## 17 2255 1 83 73 13 76 79 181 3
## SLB SLE SLV SOM SPM SRB SSD STP SUR SVK
## 70 5 5 54 1 7 26 7 6 4
## SVN SWE SYC SYR TCA TCD TGO THA TJK TKL
## 2 48 41 7 3 43 2 27 8 10
## TKM TLS TON TTO TUN TUR TUV TWN TZA UGA
## 27 5 31 2 9 51 32 13 49 6
## UKR UMI URY USA UZB VCT VEN VGB VNM VUT
## 42 81 12 500 29 1 53 1 34 22
## WLF WSM YEM ZAF ZMB ZWE
## 7 6 44 99 42 15
Finally I’m adding the country codes to the original table:
points$country_code <- result$country_code
points %>%
filter(!is.na(country_code)) %>%
ggplot() +
geom_sf(aes(color = country_code), size = 0.3) +
theme(legend.position = "none")