In this notebook we are going to assign country codes to Darwin Core records using a shapefile from Marine Regions.

Load packages:

library(dplyr)
library(sf)
library(ggplot2)
library(tibble)

Let’s read the shapefile. The shapefile is not included in the code repository but can be downloaded from Marine Regions (see “Marine and land zones: the union of world country boundaries and EEZ’s”).

shapes <- st_read("EEZ_land_union_v3_202003/EEZ_Land_v3_202030.shp", quiet = TRUE)
head(shapes)
## Simple feature collection with 6 features and 30 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 8.326077 ymin: -23.18541 xmax: 51.72543 ymax: 70.09226
## CRS:            4326
##                                                           UNION MRGID_EEZ
## 1                                                       Estonia      5675
## 2                                                       Mayotte     48944
## 3 Overlapping claim Qatar / Saudi Arabia / United Arab Emirates     50170
## 4                                                      Cameroon      8475
## 5                                                       Finland      5676
## 6                                               Bassas da India      8340
##        TERRITORY1 MRGID_TER1 ISO_TER1 UN_TER1 SOVEREIGN1 MRGID_SOV1 ISO_SOV1
## 1         Estonia       2110      EST     233    Estonia       2110      EST
## 2         Mayotte       8606      MYT     175     France         17      FRA
## 3           Qatar       8468      QAT     634      Qatar       8468      QAT
## 4        Cameroon       2170      CMR     120   Cameroon       2170      CMR
## 5         Finland       2106      FIN     246    Finland       2106      FIN
## 6 Bassas da India      31136      ATF     260     France         17      FRA
##   UN_SOV1   TERRITORY2 MRGID_TER2 ISO_TER2 UN_TER2   SOVEREIGN2 MRGID_SOV2
## 1     233         <NA>          0     <NA>      NA         <NA>          0
## 2     250      Mayotte       8606      MYT     175      Comores       2163
## 3     634 Saudi Arabia       2215      SAU     682 Saudi Arabia       2215
## 4     120         <NA>          0     <NA>      NA         <NA>          0
## 5     246         <NA>          0     <NA>      NA         <NA>          0
## 6     250         <NA>          0     <NA>      NA         <NA>          0
##   ISO_SOV2 UN_SOV2           TERRITORY3 MRGID_TER3 ISO_TER3 UN_TER3
## 1     <NA>      NA                 <NA>          0     <NA>      NA
## 2      COM     174                 <NA>          0     <NA>       0
## 3      SAU     682 United Arab Emirates       2206      ARE     784
## 4     <NA>      NA                 <NA>          0     <NA>      NA
## 5     <NA>      NA                 <NA>          0     <NA>      NA
## 6     <NA>      NA                 <NA>          0     <NA>      NA
##             SOVEREIGN3 MRGID_SOV3 ISO_SOV3 UN_SOV3              POL_TYPE
## 1                 <NA>          0     <NA>      NA Union EEZ and country
## 2                 <NA>          0     <NA>       0     Overlapping claim
## 3 United Arab Emirates       2206      ARE     784     Overlapping claim
## 4                 <NA>          0     <NA>      NA Union EEZ and country
## 5                 <NA>          0     <NA>      NA Union EEZ and country
## 6                 <NA>          0     <NA>      NA Union EEZ and country
##         Y_1      x_1 AREA_KM2                       geometry
## 1  58.71530 24.40898    81842 MULTIPOLYGON (((26.61348 57...
## 2 -13.21377 45.30528    67285 MULTIPOLYGON (((46.68598 -1...
## 3  24.69243 51.59986      126 MULTIPOLYGON (((51.68368 24...
## 4   5.62533 12.63665   480130 MULTIPOLYGON (((16.05693 1....
## 5  63.97904 25.44114   420076 MULTIPOLYGON (((19.11472 60...
## 6 -20.86104 39.42539   120802 MULTIPOLYGON (((41.174 -19....

Now let’s generate some random coordinates and convert them to an sf object:

n_points <- 30000
points <- data.frame(decimalLongitude = runif(n_points, min = -180, max = 180), decimalLatitude = runif(n_points, min = -90, max = 90)) %>%
  st_as_sf(coords = c("decimalLongitude", "decimalLatitude"), crs = 4326)
head(points)
## Simple feature collection with 6 features and 0 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -160.4588 ymin: -82.4573 xmax: 150.4549 ymax: 85.31104
## CRS:            EPSG:4326
##                      geometry
## 1 POINT (-160.4588 -69.45966)
## 2  POINT (-67.67719 -22.7881)
## 3  POINT (150.4549 -11.56853)
## 4  POINT (-160.2717 -82.4573)
## 5 POINT (-120.9354 -63.18504)
## 6  POINT (-15.25975 85.31104)

Now we can join the points and shapes layers using st_join(). I’m also adding a column with the original row numbers here, this is necessary because the resulting dataframe joined may be larger than the original points dataframe due to overlapping polygons.

joined <- points %>%
  rownames_to_column(var = "rowname") %>%
  st_join(shapes, join = st_within)
head(joined)
## Simple feature collection with 6 features and 31 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -160.4588 ymin: -82.4573 xmax: 150.4549 ymax: 85.31104
## CRS:            EPSG:4326
##   rowname            UNION MRGID_EEZ       TERRITORY1 MRGID_TER1 ISO_TER1
## 1       1             <NA>        NA             <NA>         NA     <NA>
## 2       2          Bolivia        NA          Bolivia       2137      BOL
## 3       3 Papua New Guinea      8324 Papua New Guinea       2237      PNG
## 4       4       Antarctica      8489       Antarctica       1926      ATA
## 5       5             <NA>        NA             <NA>         NA     <NA>
## 6       6        Greenland      8438        Greenland       2260      GRL
##   UN_TER1       SOVEREIGN1 MRGID_SOV1 ISO_SOV1 UN_SOV1 TERRITORY2 MRGID_TER2
## 1      NA             <NA>         NA     <NA>      NA       <NA>         NA
## 2      68          Bolivia       2137      BOL      68       <NA>         NA
## 3     598 Papua New Guinea       2237      PNG     598       <NA>          0
## 4      10       Antarctica       1926      ATA      10       <NA>          0
## 5      NA             <NA>         NA     <NA>      NA       <NA>         NA
## 6     304          Denmark       2157      DNK     208       <NA>          0
##   ISO_TER2 UN_TER2 SOVEREIGN2 MRGID_SOV2 ISO_SOV2 UN_SOV2 TERRITORY3 MRGID_TER3
## 1     <NA>      NA       <NA>         NA     <NA>      NA       <NA>         NA
## 2     <NA>      NA       <NA>         NA     <NA>      NA       <NA>         NA
## 3     <NA>      NA       <NA>          0     <NA>      NA       <NA>          0
## 4     <NA>      NA       <NA>          0     <NA>      NA       <NA>          0
## 5     <NA>      NA       <NA>         NA     <NA>      NA       <NA>         NA
## 6     <NA>      NA       <NA>          0     <NA>      NA       <NA>          0
##   ISO_TER3 UN_TER3 SOVEREIGN3 MRGID_SOV3 ISO_SOV3 UN_SOV3              POL_TYPE
## 1     <NA>      NA       <NA>         NA     <NA>      NA                  <NA>
## 2     <NA>      NA       <NA>         NA     <NA>      NA    Landlocked country
## 3     <NA>      NA       <NA>          0     <NA>      NA Union EEZ and country
## 4     <NA>      NA       <NA>          0     <NA>      NA Union EEZ and country
## 5     <NA>      NA       <NA>         NA     <NA>      NA                  <NA>
## 6     <NA>      NA       <NA>          0     <NA>      NA Union EEZ and country
##         Y_1       x_1 AREA_KM2                    geometry
## 1        NA        NA       NA POINT (-160.4588 -69.45966)
## 2 -16.71004 -64.66432  1082641  POINT (-67.67719 -22.7881)
## 3  -5.62490 149.83749  2861676  POINT (150.4549 -11.56853)
## 4 -77.78128  11.74745 22143430  POINT (-160.2717 -82.4573)
## 5        NA        NA       NA POINT (-120.9354 -63.18504)
## 6  75.13542 -36.40807  4437543  POINT (-15.25975 85.31104)

Now we can merge the duplicated lines and concatenate country codes (column ISO_TER1) where necessary.

result <- joined %>%
  group_by(rowname) %>%
  summarize(country_code = paste0(na.omit(unique(ISO_TER1)), collapse = ";")) %>%
  mutate(country_code = na_if(country_code, "")) %>%
  arrange(as.numeric(rowname)) %>%
  st_set_geometry(NULL)
head(result)
## # A tibble: 6 x 2
##   rowname country_code
##   <chr>   <chr>       
## 1 1       <NA>        
## 2 2       BOL         
## 3 3       PNG         
## 4 4       ATA         
## 5 5       <NA>        
## 6 6       GRL
table(result$country_code)
## 
##     ABW     AFG     AGO     AIA     ALB     ARE     ARG ARG;URY     ARM     ASM 
##       1      26      59       6       5       5     187       1       3      17 
##     ATA     ATF     ATG     AUS     AUT     AZE     BEL     BEN     BES     BFA 
##    3899      91       5     590       9      11       1       7       4       8 
##     BGD     BGR     BHS     BIH     BLR     BLZ     BMU     BOL     BRA     BRB 
##      14       9      23       3       8       2      18      36     448       7 
##     BTN     BWA     CAF     CAN     CCK     CHE     CHL     CHN     CIV     CMR 
##       2      20      23    1425      17       2     195     527      21      15 
##     COD     COG     COK     COL     COM     CPV     CRI     CUB     CUW     CXR 
##     100      15      94      71       8      42      26      19       3       8 
##     CYM     CYP     CZE     DEU     DNK     DOM     DZA     ECU     EGY     ERI 
##       2       3       4      30       7      14     108      11      44       5 
##     ESH     ESP     EST     ETH     FIN     FJI     FLK     FRA     FRO     FSM 
##      26      42       5      42      33      37      31      48      17     117 
##     GAB     GBR     GEO     GHA     GIN     GLP     GMB     GNB GNB;SEN     GNQ 
##      16      62       7      19      12       4       1       6       2      14 
##     GRC     GRL     GTM     GUF     GUM     GUY     HMD     HND     HRV     HTI 
##      37     650      11      12       8      18      21       7       6       6 
##     HUN     IDN     IND     IRL     IRN     IRQ     ISL     ITA     JAM     JOR 
##       2     291     184      25      59      19      80      44      12       2 
##     JPN     KAZ     KEN     KGZ     KHM     KIR     KOR     KWT     LAO     LBR 
##     197     162      25      14      14     115      29       1      13      18 
##     LBY     LKA     LSO     LTU     LVA     MAR     MDA     MDG     MDV     MEX 
##      82      18       1       6       3      35       1      77      40     217 
##     MHL     MLI     MLT     MMR     MNG     MNP     MOZ     MRT     MTQ     MUS 
##      76      51       1      45      94      33      67      60       2      47 
##     MWI     MYS     MYT     NAM     NCL     NER     NFK     NGA     NIC     NIU 
##       1      36       3      62      61      51      21      45      10      19 
##     NLD     NOR     NPL     NRU     NZL     OMN     PAK     PAN     PCN     PER 
##       3     125       6       9     208      38      49      13      30      89 
##     PHL     PLW     PNG     POL     PRI     PRK     PRT     PRY     PYF     REU 
##      85      29     123      16       9       9      19      24     199      10 
##     ROU     RUS     RWA     SAU     SDN     SEN     SGS     SHN     SJM SJM;ISL 
##      17    2255       1      83      73      13      76      79     181       3 
##     SLB     SLE     SLV     SOM     SPM     SRB     SSD     STP     SUR     SVK 
##      70       5       5      54       1       7      26       7       6       4 
##     SVN     SWE     SYC     SYR     TCA     TCD     TGO     THA     TJK     TKL 
##       2      48      41       7       3      43       2      27       8      10 
##     TKM     TLS     TON     TTO     TUN     TUR     TUV     TWN     TZA     UGA 
##      27       5      31       2       9      51      32      13      49       6 
##     UKR     UMI     URY     USA     UZB     VCT     VEN     VGB     VNM     VUT 
##      42      81      12     500      29       1      53       1      34      22 
##     WLF     WSM     YEM     ZAF     ZMB     ZWE 
##       7       6      44      99      42      15

Finally I’m adding the country codes to the original table:

points$country_code <- result$country_code

points %>%
  filter(!is.na(country_code)) %>%
  ggplot() +
    geom_sf(aes(color = country_code), size = 0.3) +
    theme(legend.position = "none")