Bundling Compain 2021 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Compain, N. (2021). Biodiversity and community assemblage of shallow habitats of the National Park of Banc d’Arguin (Mauritania): influence of habitat, season and site. MSc thesis. University of Algarve.

Setup

Load the necessary libraries, suppressing start-up messages for the packages that print them.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file
suppressMessages(library(tidyr))

Input Parameters and Paths

# Relative path from this notebook to the project root; later chunks prefix
# every script and dataset path with it.
path_to_project_root <- "../../.."
# Directory name of the site under "datasets"
site_dir_name <- "banc_darguin_national_park"
# Directory name of this dataset under the site directory
dataset_dir_name <- "Compain_2021"
# File name of the source PDF inside the dataset's "raw" directory
original_pdf <- "Thesis_Nicolas_COMPAIN_a68831.pdf"
# Short identifier for this dataset (presumably used when naming the archive
# further down the notebook; verify against later chunks)
short_name <- "banc_darguin-compain-2021"

Parsing PDF table to CSV

The data for this reference is formatted as an image-based table inside a PDF, spread across multiple pages. First, we use pdf_to_table to OCR the selected table areas and parse each page's table out to its own CSV file.

# Name of the conda environment the Python script is run in
condaenv <- "mwhs-data-mobilization"

# Path to the Python script
script <- file.path(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py")

# Input PDF file path
input_pdf <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf)

# Output directory for OCR/table files
output_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")

# Define page numbers and table areas (see documentation).
# One entry per page: "-a <area coordinates> -p <page number>".
page_args <- c(
  "-a 64.713,15.258,520.338,726.579 -p 24",
  "-a 65.766,12.101,526.651,841.274 -p 25",
  "-a 52.086,16.31,509.815,580.316 -p 26"
)

# Define run parameters (see documentation)
run_parameters <- "-s"

# Combine page arguments and build the command line. Every path is shell-quoted
# so that spaces or shell metacharacters in a directory or file name cannot
# split or alter the command. The page arguments and run parameters are left
# unquoted on purpose: they must reach the script as separate flags.
page_args_combined <- paste(page_args, collapse = " ")
command <- paste(
  "conda run -n", shQuote(condaenv),
  "python", shQuote(script),
  "-i", shQuote(input_pdf),
  run_parameters, page_args_combined,
  "-o", shQuote(output_dir)
)

# Run the script and capture its output. With intern = TRUE a non-zero exit
# code is only reported through the "status" attribute (plus a warning), so
# check it explicitly and stop rather than let later chunks read stale or
# missing CSV files.
parse_log <- system(command, intern = TRUE)
exit_status <- attr(parse_log, "status")
if (!is.null(exit_status) && exit_status != 0) {
  stop("pdf_to_table.py exited with status ", exit_status, call. = FALSE)
}

# Print the script's run summary, as the bare system() call did
parse_log
##  [1] ""                                                                                                                             
##  [2] "Script Execution Summary"                                                                                                     
##  [3] "Date and Time: 2023-10-03 18:35:03"                                                                                           
##  [4] "------------------------------"                                                                                               
##  [5] ""                                                                                                                             
##  [6] "PDF input: ../../../datasets/banc_darguin_national_park/Compain_2021/raw/Thesis_Nicolas_COMPAIN_a68831.pdf"                   
##  [7] "Perform Table Parsing: TRUE"                                                                                                  
##  [8] "Selected Areas:"                                                                                                              
##  [9] "  Area 1: [64.713, 15.258, 520.338, 726.579]"                                                                                 
## [10] "  Area 2: [65.766, 12.101, 526.651, 841.274]"                                                                                 
## [11] "  Area 3: [52.086, 16.31, 509.815, 580.316]"                                                                                  
## [12] "Pages: 24, 25, 26"                                                                                                            
## [13] "Concatenate: False"                                                                                                           
## [14] "Concatenate across headers: False"                                                                                            
## [15] "Stream Extraction: True"                                                                                                      
## [16] "Lattice Extraction: False"                                                                                                    
## [17] ""                                                                                                                             
## [18] "Parsing Tables"                                                                                                               
## [19] "------------------------------"                                                                                               
## [20] ""                                                                                                                             
## [21] ""                                                                                                                             
## [22] "Saving to CSV"                                                                                                                
## [23] "CSV file(s):"                                                                                                                 
## [24] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_1.csv"      
## [25] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_2.csv"      
## [26] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_3.csv"      
## [27] "------------------------------"                                                                                               
## [28] ""                                                                                                                             
## [29] ""                                                                                                                             
## [30] "Run Details: ../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_parameters.txt"
## [31] "Finished"                                                                                                                     
## [32] ""

Read source data

Now we’ll read in the CSV tables produced by the previous step.

# File names of the three parsed tables written by the PDF parsing step
processed_csv1 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_1.csv"
processed_csv2 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_2.csv"
processed_csv3 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_3.csv"

# Read one parsed table from this dataset's "processed" directory
read_processed_csv <- function(csv_name) {
  read.csv(
    file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", csv_name)
  )
}

input_data1 <- read_processed_csv(processed_csv1)
input_data2 <- read_processed_csv(processed_csv2)
input_data3 <- read_processed_csv(processed_csv3)

Preprocessing

Here we tidy the data up.

Tidy Data

# Table 1: drop empty rows/columns, normalise the header names, then keep the
# name column plus six value columns, renaming them in the same step.
input_data1 <- input_data1 %>%
  remove_empty(c("rows", "cols")) %>%
  clean_names() %>%
  select(
    sciname = unnamed_0,
    w1 = n, w2 = n_1, w3 = n_2, w5 = n_3, w7 = n_4, w8 = n_tl
  ) %>%
  mutate(ID = row_number())

# Table 2: same treatment, columns s1-s6
input_data2 <- input_data2 %>%
  remove_empty(c("rows", "cols")) %>%
  clean_names() %>%
  select(
    sciname = unnamed_0,
    s1 = n, s2 = n_1, s3 = n_tl, s4 = n_tl_1, s5 = n_2, s6 = n_tl_2
  ) %>%
  mutate(ID = row_number())

# Table 3: same treatment, columns s7-s9
input_data3 <- input_data3 %>%
  remove_empty(c("rows", "cols")) %>%
  clean_names() %>%
  select(sciname = unnamed_0, s7 = n, s8 = n_tl, s9 = n_1) %>%
  mutate(ID = row_number())

# Join the three tables side by side on the row index.
# NOTE(review): this assumes all three tables list the same taxa in the same
# row order; merge() is an inner join, so unmatched row indices are dropped.
input_data <- Reduce(
  function(left, right) merge(left, right, by = "ID"),
  list(input_data1, input_data2, input_data3)
)

# Keep only the first table's name column (suffixed ".x" by merge) and drop
# the join key and the duplicate name columns from tables 2 and 3.
surplus_columns <- c("ID", "sciname.y", "sciname")
input_data <- input_data[, !(names(input_data) %in% surplus_columns), drop = FALSE]

# Strip a leading run of digits followed by one whitespace character
input_data$sciname.x <- gsub("^\\d+\\s", "", input_data$sciname.x)

cleaned_data <- input_data

# Preview the first rows as a formatted table
knitr::kable(head(cleaned_data))
sciname.x w1 w2 w3 w5 w7 w8 s1 s2 s3 s4 s5 s6 s7 s8 s9
Atherina boyeri NA NA NA NA NA NA 95 700 5.1 39 4.5 335 NA NA
Blennidae NA NA NA 1 NA NA 25 NA 12 5 NA NA
Callinectes marginatus NA NA NA NA NA NA NA NA NA NA
Citharichthys stampfilii NA NA NA NA NA 13 NA NA NA NA
Coptodon guineensis NA 1 1 NA NA NA 5 1 26.8 NA NA 8
Cynoglossus senegalensis NA NA NA NA NA NA NA NA 19 7.3 NA NA

Get WoRMS IDs

Auto matching

First we will try to do this automatically, by cleaning the species names with gnparser and then using the taxize library to query the WoRMS database.

# Run gnparser over the first column of the cleaned table (the scientific
# names) so that authorship is separated from the canonical name
parsed_names <- rgnparser::gn_parse(cleaned_data[[1]])

# Look up the WoRMS ID for one parsed name.
#
# `element` is a single entry of the parsed-names list; its full canonical
# name (`element$canonical$full`) is used as the query. The lookup is tried
# twice with fuzzy matching: first restricted to accepted names, then without
# that restriction. Returns the `get_wormsid()` result of the first lookup
# that is not flagged "not found", or a plain NA when both are.
get_worms_id_from_element <- function(element) {
  is_not_found <- function(lookup) attr(lookup, "match") == "not found"

  # Pass 1: accepted names only
  accepted_lookup <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (!is_not_found(accepted_lookup)) {
    return(accepted_lookup)
  }

  # Pass 2: any name, including unaccepted ones
  any_lookup <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
  if (is_not_found(any_lookup)) {
    return(NA)
  }
  any_lookup
}

# Look up a WoRMS ID for every parsed name; names that gnparser could not
# parse get NA without querying WoRMS
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (parsed_name$parsed) get_worms_id_from_element(parsed_name) else NA
})
## 
##          id                                target
## 1    126224                           Hippocampus
## 5   1525460                           Hippocampus
## 6    275182               Hippocampus abdominalis
## 10   275183                    Hippocampus alatus
## 11   275184                 Hippocampus algiricus
## 12   275185                  Hippocampus angustus
## 18   275186                  Hippocampus barbouri
## 19   275187                Hippocampus bargibanti
## 23   212238              Hippocampus borboniensis
## 25   275189                 Hippocampus breviceps
## 30   212233            Hippocampus camelopardalis
## 31   212234                  Hippocampus capensis
## 32   886550                  Hippocampus casscsio
## 34   275190                  Hippocampus colemani
## 35   275191                     Hippocampus comes
## 36   275192                 Hippocampus coronatus
## 37   388711               Hippocampus curvicuspis
## 40   398432                  Hippocampus debelius
## 41   275193                    Hippocampus denise
## 44   159445                   Hippocampus erectus
## 49   275194                   Hippocampus fisheri
## 50   212230                    Hippocampus fuscus
## 51   212230                    Hippocampus fuscus
## 55   275195                Hippocampus grandiceps
## 56   154776                Hippocampus guttulatus
## 57   248042 Hippocampus guttulatus microstephanus
## 59  1376208                     Hippocampus haema
## 60   275196                  Hippocampus hendriki
## 64   127380               Hippocampus hippocampus
## 67   212239                   Hippocampus histrix
## 71   275197                    Hippocampus ingens
## 72  1288522                  Hippocampus japapigu
## 74   275198                  Hippocampus jayakari
## 75   275199                   Hippocampus jugumus
## 78   212236                  Hippocampus kelloggi
## 80   212237                      Hippocampus kuda
## 84   275200            Hippocampus lichtensteinii
## 90   275201                  Hippocampus minotaur
## 91   275202                  Hippocampus mohnikei
## 96   275203           Hippocampus montebelloensis
## 97   275204               Hippocampus multispinus
## 98  1437127                      Hippocampus nalu
## 106  712534                 Hippocampus paradoxus
## 107  275205               Hippocampus patagonicus
## 109  306811                Hippocampus planifrons
## 112  398433                   Hippocampus pontohi
## 116  388712                  Hippocampus pusillus
## 117  275207            Hippocampus queenslandicus
## 121  159446                     Hippocampus reidi
## 124  398434                  Hippocampus satomiae
## 125  275208              Hippocampus semispinosus
## 127  275209                  Hippocampus sindonis
## 128  275210             Hippocampus spinosissimus
## 131  275211              Hippocampus subelongatus
## 132  306822                 Hippocampus suezensis
## 137  212232              Hippocampus trimaculatus
## 140  474956                      Hippocampus tyro
## 143  398436                Hippocampus waleananus
## 144  212235                    Hippocampus whitei
## 145  275212                     Hippocampus zebra
## 146  275213                  Hippocampus zosterae
##                                                               authority
## 1                                                      Rafinesque, 1810
## 5                                                      Rafinesque, 1810
## 6                                                          Lesson, 1827
## 10                                                         Kuiter, 2001
## 11                                                           Kaup, 1856
## 12                                                        Günther, 1870
## 18                                            Jordan & Richardson, 1908
## 19                                                        Whitley, 1970
## 23                                                        Duméril, 1870
## 25                                                         Peters, 1869
## 30                                                       Bianconi, 1854
## 31                                                      Boulenger, 1900
## 32                                         Zhang, Qin, Wang & Lin, 2016
## 34                                                         Kuiter, 2003
## 35                                                         Cantor, 1849
## 36                                            Temminck & Schlegel, 1850
## 37                                                         Fricke, 2004
## 40                                                 Gomon & Kuiter, 2009
## 41                                               Lourie & Randall, 2003
## 44                                                          Perry, 1810
## 49                                              Jordan & Evermann, 1903
## 50                                                        Rüppell, 1838
## 51                                                        Rüppell, 1838
## 55                                                         Kuiter, 2001
## 56                                                         Cuvier, 1829
## 57                                                    Slastenenko, 1937
## 59                                          Han, Kim, Kai & Senou, 2017
## 60                                                         Kuiter, 2001
## 64                                                     (Linnaeus, 1758)
## 67                                                           Kaup, 1856
## 71                                                         Girard, 1858
## 72                     Short, Smith, Motomura, Harasti & Hamilton, 2018
## 74                                                      Boulenger, 1900
## 75                                                         Kuiter, 2001
## 78                                                Jordan & Snyder, 1901
## 80                                                        Bleeker, 1852
## 84                                                           Kaup, 1856
## 90                                                          Gomon, 1997
## 91                                                        Bleeker, 1853
## 96                                                         Kuiter, 2001
## 97                                                         Kuiter, 2001
## 98  Short, Claassens, Smith, De Brauwer, Hamilton, Stat & Harasti, 2020
## 106                                                Foster & Gomon, 2010
## 107                                         Piacentino & Luzzatto, 2004
## 109                                                        Peters, 1877
## 112                                               Lourie & Kuiter, 2008
## 116                                                        Fricke, 2004
## 117                                                         Horne, 2001
## 121                                                      Ginsburg, 1933
## 124                                               Lourie & Kuiter, 2008
## 125                                                        Kuiter, 2001
## 127                                               Jordan & Snyder, 1901
## 128                                                         Weber, 1913
## 131                                                     Castelnau, 1873
## 132                                                       Duncker, 1940
## 137                                                         Leach, 1814
## 140                                              Randall & Lourie, 2009
## 143                                                Gomon & Kuiter, 2009
## 144                                                       Bleeker, 1855
## 145                                                       Whitley, 1964
## 146                                              Jordan & Gilbert, 1882
##       status
## 1   accepted
## 5   accepted
## 6   accepted
## 10  accepted
## 11  accepted
## 12  accepted
## 18  accepted
## 19  accepted
## 23  accepted
## 25  accepted
## 30  accepted
## 31  accepted
## 32  accepted
## 34  accepted
## 35  accepted
## 36  accepted
## 37  accepted
## 40  accepted
## 41  accepted
## 44  accepted
## 49  accepted
## 50  accepted
## 51  accepted
## 55  accepted
## 56  accepted
## 57  accepted
## 59  accepted
## 60  accepted
## 64  accepted
## 67  accepted
## 71  accepted
## 72  accepted
## 74  accepted
## 75  accepted
## 78  accepted
## 80  accepted
## 84  accepted
## 90  accepted
## 91  accepted
## 96  accepted
## 97  accepted
## 98  accepted
## 106 accepted
## 107 accepted
## 109 accepted
## 112 accepted
## 116 accepted
## 117 accepted
## 121 accepted
## 124 accepted
## 125 accepted
## 127 accepted
## 128 accepted
## 131 accepted
## 132 accepted
## 137 accepted
## 140 accepted
## 143 accepted
## 144 accepted
## 145 accepted
## 146 accepted
## 
## More than one WORMS ID found for taxon 'Hippocampus'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##          id                                 target
## 1    126224                            Hippocampus
## 5   1525460                            Hippocampus
## 6    275182                Hippocampus abdominalis
## 7    306774                    Hippocampus agnesae
## 8    306775                      Hippocampus aimei
## 9    716772                      Hippocampus aimei
## 10   275183                     Hippocampus alatus
## 11   275184                  Hippocampus algiricus
## 12   275185                   Hippocampus angustus
## 13   306776                 Hippocampus antiquorum
## 14   306777                   Hippocampus antiquus
## 15   400954                      Hippocampus arnei
## 16   716773                      Hippocampus arnei
## 17   306778                  Hippocampus aterrimus
## 18   275186                   Hippocampus barbouri
## 19   275187                 Hippocampus bargibanti
## 20   713214                   Hippocampus bicuspis
## 21   275188                Hippocampus biocellatus
## 22   306779                   Hippocampus bleekeri
## 23   212238               Hippocampus borboniensis
## 24   306780             Hippocampus brachyrhynchus
## 25   275189                  Hippocampus breviceps
## 26   306781               Hippocampus brevirostris
## 27   306782                   Hippocampus brunneus
## 28   400945            Hippocampus cameleopardalis
## 29   400946            Hippocampus cameleopardalus
## 30   212233             Hippocampus camelopardalis
## 31   212234                   Hippocampus capensis
## 32   886550                   Hippocampus casscsio
## 33   306783                  Hippocampus chinensis
## 34   275190                   Hippocampus colemani
## 35   275191                      Hippocampus comes
## 36   275192                  Hippocampus coronatus
## 37   388711                Hippocampus curvicuspis
## 38   306784                      Hippocampus dahli
## 39   306785                     Hippocampus deanei
## 40   398432                   Hippocampus debelius
## 41   275193                     Hippocampus denise
## 42   306786               Hippocampus ecuadorensis
## 43   306787                  Hippocampus elongatus
## 44   159445                    Hippocampus erectus
## 45   306788                  Hippocampus erinaceus
## 46   154815                  Hippocampus europaeus
## 47   306789               Hippocampus fascicularis
## 48   713215               Hippocampus filamentosus
## 49   275194                    Hippocampus fisheri
## 50   212230                     Hippocampus fuscus
## 51   212230                     Hippocampus fuscus
## 52   306790              Hippocampus graciliformis
## 53   306791                   Hippocampus gracilis
## 54   306792              Hippocampus gracilissimus
## 55   275195                 Hippocampus grandiceps
## 56   154776                 Hippocampus guttulatus
## 57   248042  Hippocampus guttulatus microstephanus
## 58   323136  Hippocampus guttulatus multiannularis
## 59  1376208                      Hippocampus haema
## 60   275196                   Hippocampus hendriki
## 61   154458                 Hippocampus heptagonus
## 62   306793                Hippocampus hildebrandi
## 63   306794                    Hippocampus hilonis
## 64   127380                Hippocampus hippocampus
## 65   322937 Hippocampus hippocampus microcoronatus
## 66   322938 Hippocampus hippocampus microstephanus
## 67   212239                    Hippocampus histrix
## 68   306795                      Hippocampus horai
## 69   306796                  Hippocampus hudsonius
## 70   400949                    Hippocampus hystrix
## 71   275197                     Hippocampus ingens
## 72  1288522                   Hippocampus japapigu
## 73   306797                  Hippocampus japonicus
## 74   275198                   Hippocampus jayakari
## 75   275199                    Hippocampus jugumus
## 76   306798           Hippocampus kampylotrachelos
## 77   306799                     Hippocampus kaupii
## 78   212236                   Hippocampus kelloggi
## 79   306800                   Hippocampus kincaidi
## 80   212237                       Hippocampus kuda
## 81   323205        Hippocampus kuda multiannularis
## 82   306801              Hippocampus laevicaudatus
## 83   713217                      Hippocampus lenis
## 84   275200             Hippocampus lichtensteinii
## 85   154777               Hippocampus longirostris
## 86   306802                 Hippocampus manadensis
## 87   306803                   Hippocampus mannulus
## 88   306804                 Hippocampus marginalis
## 89   306805               Hippocampus melanospilos
## 90   275201                   Hippocampus minotaur
## 91   275202                   Hippocampus mohnikei
## 92   306806                Hippocampus moluccensis
## 93   400952                    Hippocampus monckei
## 94   400951                   Hippocampus monickei
## 95   400953                    Hippocampus monikei
## 96   275203            Hippocampus montebelloensis
## 97   275204                Hippocampus multispinus
## 98  1437127                       Hippocampus nalu
## 99   306807                 Hippocampus natalensis
## 100  306808             Hippocampus novaehebudorum
## 101  400955             Hippocampus novaehollandae
## 102  306809            Hippocampus novaehollandiae
## 103  713212                   Hippocampus obscurus
## 104  713213                   Hippocampus obscurus
## 105  306810                    Hippocampus obtusus
## 106  712534                  Hippocampus paradoxus
## 107  275205                Hippocampus patagonicus
## 108  400947                 Hippocampus pentagonus
## 109  306811                 Hippocampus planifrons
## 110  306812                      Hippocampus poeyi
## 111  306813                 Hippocampus polytaenia
## 112  398433                    Hippocampus pontohi
## 113  275206                   Hippocampus procerus
## 114  306814                Hippocampus punctulatus
## 115  306815                Hippocampus punctulatus
## 116  388712                   Hippocampus pusillus
## 117  275207             Hippocampus queenslandicus
## 118  306816                       Hippocampus raji
## 119  127381                  Hippocampus ramulosus
## 120  306817                    Hippocampus regulus
## 121  159446                      Hippocampus reidi
## 122  306818               Hippocampus rhynchomacer
## 123  306819                 Hippocampus rosamondae
## 124  398434                   Hippocampus satomiae
## 125  275208               Hippocampus semispinosus
## 126  398435                   Hippocampus severnsi
## 127  275209                   Hippocampus sindonis
## 128  275210              Hippocampus spinosissimus
## 129  306820                   Hippocampus stylifer
## 130  306821               Hippocampus subcoronatus
## 131  275211               Hippocampus subelongatus
## 132  306822                  Hippocampus suezensis
## 133  306823                   Hippocampus taeniops
## 134  306824               Hippocampus taeniopterus
## 135  306825                  Hippocampus takakurae
## 136  306826                Hippocampus tetragonous
## 137  212232               Hippocampus trimaculatus
## 138  306827                    Hippocampus tristis
## 139  306828               Hippocampus tuberculatus
## 140  474956                       Hippocampus tyro
## 141  306829                   Hippocampus villosus
## 142  306830                   Hippocampus vulgaris
## 143  398436                 Hippocampus waleananus
## 144  212235                     Hippocampus whitei
## 145  275212                      Hippocampus zebra
## 146  275213                   Hippocampus zosterae
## 2    843450                                   <NA>
## 3    843451                                   <NA>
## 4    843452                                   <NA>
##                                                               authority
## 1                                                      Rafinesque, 1810
## 5                                                      Rafinesque, 1810
## 6                                                          Lesson, 1827
## 7                                                          Fowler, 1907
## 8                                                           Roule, 1916
## 9                                                           Roule, 1916
## 10                                                         Kuiter, 2001
## 11                                                           Kaup, 1856
## 12                                                        Günther, 1870
## 13                                                          Leach, 1814
## 14                                                          Risso, 1827
## 15                                                          Roule, 1916
## 16                                                          Roule, 1916
## 17                                                Jordan & Snyder, 1902
## 18                                            Jordan & Richardson, 1908
## 19                                                        Whitley, 1970
## 20                                                           Kaup, 1856
## 21                                                         Kuiter, 2001
## 22                                                         Fowler, 1907
## 23                                                        Duméril, 1870
## 24                                                        Duncker, 1914
## 25                                                         Peters, 1869
## 26                                                         Schinz, 1822
## 27                                                           Bean, 1906
## 28                                                       Bianconi, 1854
## 29                                                       Bianconi, 1854
## 30                                                       Bianconi, 1854
## 31                                                      Boulenger, 1900
## 32                                         Zhang, Qin, Wang & Lin, 2016
## 33                                                     Basilewsky, 1855
## 34                                                         Kuiter, 2003
## 35                                                         Cantor, 1849
## 36                                            Temminck & Schlegel, 1850
## 37                                                         Fricke, 2004
## 38                                                         Ogilby, 1908
## 39                                                        Duméril, 1861
## 40                                                 Gomon & Kuiter, 2009
## 41                                               Lourie & Randall, 2003
## 42                                                         Fowler, 1922
## 43                                                      Castelnau, 1873
## 44                                                          Perry, 1810
## 45                                                        Günther, 1870
## 46                                                       Ginsburg, 1933
## 47                                                           Kaup, 1856
## 48                                                        Duméril, 1870
## 49                                              Jordan & Evermann, 1903
## 50                                                        Rüppell, 1838
## 51                                                        Rüppell, 1838
## 52                                                      McCulloch, 1911
## 53                                                           Gill, 1862
## 54                                            Temminck & Schlegel, 1850
## 55                                                         Kuiter, 2001
## 56                                                         Cuvier, 1829
## 57                                                    Slastenenko, 1937
## 58                                                       Ginsburg, 1937
## 59                                          Han, Kim, Kai & Senou, 2017
## 60                                                         Kuiter, 2001
## 61                                                     Rafinesque, 1810
## 62                                                       Ginsburg, 1933
## 63                                              Jordan & Evermann, 1903
## 64                                                     (Linnaeus, 1758)
## 65                                                    Slastenenko, 1938
## 66                                                    Slastenenko, 1937
## 67                                                           Kaup, 1856
## 68                                                        Duncker, 1926
## 69                                                          DeKay, 1842
## 70                                                           Kaup, 1856
## 71                                                         Girard, 1858
## 72                     Short, Smith, Motomura, Harasti & Hamilton, 2018
## 73                                                           Kaup, 1856
## 74                                                      Boulenger, 1900
## 75                                                         Kuiter, 2001
## 76                                                        Bleeker, 1854
## 77                                                        Duméril, 1870
## 78                                                Jordan & Snyder, 1901
## 79                                             Townsend & Barbour, 1906
## 80                                                        Bleeker, 1852
## 81                                                            Raj, 1941
## 82                                                           Kaup, 1856
## 83                                                         De Vis, 1908
## 84                                                           Kaup, 1856
## 85                                                         Schinz, 1822
## 86                                                        Bleeker, 1856
## 87                                                         Cantor, 1849
## 88                                                           Kaup, 1856
## 89                                                        Bleeker, 1854
## 90                                                          Gomon, 1997
## 91                                                        Bleeker, 1853
## 92                                                        Bleeker, 1852
## 93                                                        Bleeker, 1853
## 94                                                        Bleeker, 1853
## 95                                                        Bleeker, 1853
## 96                                                         Kuiter, 2001
## 97                                                         Kuiter, 2001
## 98  Short, Claassens, Smith, De Brauwer, Hamilton, Stat & Harasti, 2020
## 99                                                      von Bonde, 1923
## 100                                                        Fowler, 1944
## 101                                                  Steindachner, 1866
## 102                                                  Steindachner, 1866
## 103                                          Hemprich & Ehrenberg, 1856
## 104                                                     Ehrenberg, 1871
## 105                                                      Ginsburg, 1933
## 106                                                Foster & Gomon, 2010
## 107                                         Piacentino & Luzzatto, 2004
## 108                                                    Rafinesque, 1810
## 109                                                        Peters, 1877
## 110                                                 Howell Rivero, 1934
## 111                                                       Bleeker, 1854
## 112                                               Lourie & Kuiter, 2008
## 113                                                        Kuiter, 2001
## 114                                                     Guichenot, 1853
## 115                                                          Kaup, 1856
## 116                                                        Fricke, 2004
## 117                                                         Horne, 2001
## 118                                                       Whitley, 1955
## 119                                                         Leach, 1814
## 120                                                      Ginsburg, 1933
## 121                                                      Ginsburg, 1933
## 122                                                       Duméril, 1870
## 123                                                       Borodin, 1928
## 124                                               Lourie & Kuiter, 2008
## 125                                                        Kuiter, 2001
## 126                                               Lourie & Kuiter, 2008
## 127                                               Jordan & Snyder, 1901
## 128                                                         Weber, 1913
## 129                                              Jordan & Gilbert, 1882
## 130                                                       Günther, 1866
## 131                                                     Castelnau, 1873
## 132                                                       Duncker, 1940
## 133                                                        Fowler, 1904
## 134                                                       Bleeker, 1852
## 135                                                        Tanaka, 1916
## 136                                                    (Mitchill, 1814)
## 137                                                         Leach, 1814
## 138                                                     Castelnau, 1872
## 139                                                     Castelnau, 1875
## 140                                              Randall & Lourie, 2009
## 141                                                       Günther, 1880
## 142                                                       Cloquet, 1821
## 143                                                Gomon & Kuiter, 2009
## 144                                                       Bleeker, 1855
## 145                                                       Whitley, 1964
## 146                                              Jordan & Gilbert, 1882
## 2                                                                  <NA>
## 3                                                                  <NA>
## 4                                                                  <NA>
##          status
## 1      accepted
## 5      accepted
## 6      accepted
## 7    unaccepted
## 8    unaccepted
## 9    unaccepted
## 10     accepted
## 11     accepted
## 12     accepted
## 13   unaccepted
## 14   unaccepted
## 15   unaccepted
## 16   unaccepted
## 17   unaccepted
## 18     accepted
## 19     accepted
## 20   unaccepted
## 21   unaccepted
## 22   unaccepted
## 23     accepted
## 24   unaccepted
## 25     accepted
## 26   unaccepted
## 27   unaccepted
## 28   unaccepted
## 29   unaccepted
## 30     accepted
## 31     accepted
## 32     accepted
## 33   unaccepted
## 34     accepted
## 35     accepted
## 36     accepted
## 37     accepted
## 38   unaccepted
## 39   unaccepted
## 40     accepted
## 41     accepted
## 42   unaccepted
## 43   unaccepted
## 44     accepted
## 45   unaccepted
## 46   unaccepted
## 47   unaccepted
## 48   unaccepted
## 49     accepted
## 50     accepted
## 51     accepted
## 52   unaccepted
## 53   unaccepted
## 54   unaccepted
## 55     accepted
## 56     accepted
## 57     accepted
## 58   unaccepted
## 59     accepted
## 60     accepted
## 61   unaccepted
## 62   unaccepted
## 63   unaccepted
## 64     accepted
## 65   unaccepted
## 66   unaccepted
## 67     accepted
## 68   unaccepted
## 69   unaccepted
## 70   unaccepted
## 71     accepted
## 72     accepted
## 73   unaccepted
## 74     accepted
## 75     accepted
## 76   unaccepted
## 77   unaccepted
## 78     accepted
## 79   unaccepted
## 80     accepted
## 81   unaccepted
## 82   unaccepted
## 83   unaccepted
## 84     accepted
## 85   unaccepted
## 86   unaccepted
## 87   unaccepted
## 88   unaccepted
## 89   unaccepted
## 90     accepted
## 91     accepted
## 92   unaccepted
## 93   unaccepted
## 94   unaccepted
## 95   unaccepted
## 96     accepted
## 97     accepted
## 98     accepted
## 99   unaccepted
## 100  unaccepted
## 101  unaccepted
## 102  unaccepted
## 103  unaccepted
## 104  unaccepted
## 105  unaccepted
## 106    accepted
## 107    accepted
## 108  unaccepted
## 109    accepted
## 110  unaccepted
## 111  unaccepted
## 112    accepted
## 113  unaccepted
## 114  unaccepted
## 115  unaccepted
## 116    accepted
## 117    accepted
## 118  unaccepted
## 119  unaccepted
## 120  unaccepted
## 121    accepted
## 122  unaccepted
## 123  unaccepted
## 124    accepted
## 125    accepted
## 126  unaccepted
## 127    accepted
## 128    accepted
## 129  unaccepted
## 130  unaccepted
## 131    accepted
## 132    accepted
## 133  unaccepted
## 134  unaccepted
## 135  unaccepted
## 136  unaccepted
## 137    accepted
## 138  unaccepted
## 139  unaccepted
## 140    accepted
## 141  unaccepted
## 142  unaccepted
## 143    accepted
## 144    accepted
## 145    accepted
## 146    accepted
## 2   quarantined
## 3   quarantined
## 4   quarantined
## 
## More than one WORMS ID found for taxon 'Hippocampus'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
# Combine the cleaned source rows, the parsed canonical names and the WoRMS IDs
# into one data frame with one row per row of cleaned_data.
# seq_len() avoids iterating over c(1, 0) when cleaned_data is empty, and the
# per-row frames are bound once at the end instead of growing the result with
# rbind() on every iteration.
combined_rows <- lapply(seq_len(nrow(cleaned_data)), function(i) {
  canonical_value <- parsed_names[[i]]$canonical$full
  # A name with no parsed canonical form is kept, with NA as its canonical name.
  if (is.null(canonical_value)) {
    canonical_value <- NA
  }
  data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    # Only the first WoRMS ID returned for the name is kept.
    WormsIDs = worms_ids[[i]][1]
  )
})

combined_dataframe <- if (length(combined_rows) > 0) {
  do.call(rbind, combined_rows)
} else {
  data.frame()
}

knitr::kable(head(combined_dataframe))
CleanedData.sciname.x CleanedData.w1 CleanedData.w2 CleanedData.w3 CleanedData.w5 CleanedData.w7 CleanedData.w8 CleanedData.s1 CleanedData.s2 CleanedData.s3 CleanedData.s4 CleanedData.s5 CleanedData.s6 CleanedData.s7 CleanedData.s8 CleanedData.s9 CanonicalFull WormsIDs
Atherina boyeri NA NA NA NA NA NA 95 700 5.1 39 4.5 335 NA NA Atherina boyeri 272027
Blennidae NA NA NA 1 NA NA 25 NA 12 5 NA NA Blennidae NA
Callinectes marginatus NA NA NA NA NA NA NA NA NA NA Callinectes marginatus 241106
Citharichthys stampfilii NA NA NA NA NA 13 NA NA NA NA Citharichthys stampfilii NA
Coptodon guineensis NA 1 1 NA NA NA 5 1 26.8 NA NA 8 Coptodon guineensis 1021112
Cynoglossus senegalensis NA NA NA NA NA NA NA NA 19 7.3 NA NA Cynoglossus senegalensis 274226

Human Verification

Sometimes there are misspellings in the original text or OCR errors that can be found and fixed by hand. To do this, view the combined dataframe, search WoRMS for the unmatched species and add their IDs, and remove any rows that were not automatically removed in the earlier cleaning steps.

# Hand-verified corrections for names that were misspelled in the source or
# not matched automatically. Each entry gives the row of combined_dataframe to
# fix, the corrected name and its WoRMS ID.
# The target columns are addressed by name rather than by position (17:18) so
# the corrections still land in the right place if the abundance columns
# change. IDs are written as strings because the original c(name, id) vectors
# coerced them to character anyway.
manual_fixes <- data.frame(
  row = c(2, 4, 15, 17, 20, 22, 29, 40, 48),
  name = c(
    "Blenniidae",
    "Citharichthys stampflii",
    "Ephippion guttifer",
    "Eucinostomus melanopterus",
    "Gobiidae",
    "Hippocampus",
    "Lithognathus mormyrus",
    "Rhinobatos rhinobatos",
    "Stephanolepis hispidus"
  ),
  worms_id = c(
    "125519", "275695", "127413", "276423", "125537",
    "126224", "127055", "105898", "127409"
  ),
  stringsAsFactors = FALSE
)
verified_cols <- c("CanonicalFull", "WormsIDs")

for (k in seq_len(nrow(manual_fixes))) {
  combined_dataframe[manual_fixes$row[k], verified_cols] <-
    c(manual_fixes$name[k], manual_fixes$worms_id[k])
}

Locality data

Locality data was retrieved from the paper as below:

1 Mamghar (mangrove) sand - 19°22’16”N 16°31’52”W 2 Mamghar (baie saint jean) vegetation - 19°25’09”N 16°22’23”W 3 Iwik (center) vegetation - 19°53’00”N 16°17’34”W 4 Iwik (center) vegetation - 19°53’25”N 16°17’20”W 5 Iwik (center) sand - 19°54’18”N 16°18’35”W 6 Muzan vegetation - 19°54’01”N 16°30’11”W 7 Kiji Vegetation - 19°43’19”N 16°30’05”W 8 Nair Vegetation - 19°52’07”N 16°23’29”W 9 Agnefour Vegetation - 19°51’42”N 16°24’36”W

# Build one occurrence record per non-empty abundance cell of
# combined_dataframe. Abundance columns are named "<prefix>.<season><site>",
# e.g. "CleanedData.w3": season "w" (winter) or "s" (spring), site 1-9.

# Empty template describing the occurrence schema; returned unchanged when no
# cell yields a record.
occ_data <- data.frame(
  canonicalFull = character(),
  wormsIDs = numeric(),
  eventDate = character(),
  locality = character(),
  fieldNumber = character(),
  decimalLongitude = numeric(),
  decimalLatitude = numeric(),
  coordinateUncertaintyInMeters = numeric()
)

# Site lookup: the degree/minute/second coordinates listed in the text above,
# converted to decimal degrees (west longitudes are negative). Coordinates are
# numeric, matching the schema above, rather than character strings.
site_lookup <- data.frame(
  fieldNumber = as.character(1:9),
  locality = c(
    "Banc d'Arguin: Mamghar (mangrove) sand",
    "Banc d'Arguin: Mamghar (baie saint jean) vegetation",
    "Banc d'Arguin: Iwik (center) vegetation",
    "Banc d'Arguin: Iwik (center) vegetation extra",
    "Banc d'Arguin: Iwik (center) sand",
    "Banc d'Arguin: Muzan vegetation",
    "Banc d'Arguin: Kiji Vegetation",
    "Banc d'Arguin: Nair Vegetation",
    "Banc d'Arguin: Agnefour Vegetation"
  ),
  decimalLatitude = c(
    19.371111, 19.419167, 19.883333, 19.890278, 19.905,
    19.900278, 19.721944, 19.868611, 19.861667
  ),
  decimalLongitude = c(
    -16.531111, -16.373056, -16.292778, -16.288889, -16.309722,
    -16.503056, -16.501389, -16.391389, -16.41
  ),
  stringsAsFactors = FALSE
)

# Sampling window assigned to each season code.
season_dates <- c(
  w = "2020-12-01/2020-12-31",
  s = "2020-04-01/2020-04-30"
)

occ_rows <- list()

for (i in seq_len(nrow(combined_dataframe))) {
  for (j in 2:16) {
    cell <- combined_dataframe[i, j]
    # Only cells holding a value represent an observation at that site/season.
    if (is.na(cell) || cell == "") {
      next
    }

    # Strip the "<prefix>." part of the column name, then split the remainder
    # into its season letter(s) and site number.
    site_code <- sub("^[^.]+\\.", "", colnames(combined_dataframe)[j])
    matches <- str_match(site_code, "([A-Za-z]+)([0-9]+)")
    winter_or_spring <- matches[1, 2]
    site <- matches[1, 3]

    # Stop on an unrecognised column instead of silently reusing the values
    # left over from the previous iteration.
    if (is.na(winter_or_spring) ||
        !(winter_or_spring %in% names(season_dates)) ||
        !(site %in% site_lookup$fieldNumber)) {
      stop(
        "Unrecognised site/season column: ", colnames(combined_dataframe)[j],
        call. = FALSE
      )
    }

    # eventDate is assigned at top level on purpose: a later chunk reads it.
    eventDate <- season_dates[[winter_or_spring]]
    site_info <- site_lookup[site_lookup$fieldNumber == site, ]

    occ_rows[[length(occ_rows) + 1]] <- data.frame(
      canonicalFull = combined_dataframe[i, "CanonicalFull"],
      wormsIDs = combined_dataframe[i, "WormsIDs"],
      locality = site_info$locality,
      eventDate = eventDate,
      fieldNumber = site_info$fieldNumber,
      decimalLongitude = site_info$decimalLongitude,
      decimalLatitude = site_info$decimalLatitude,
      coordinateUncertaintyInMeters = 50
    )
  }
}

# Bind all records once rather than growing occ_data inside the loop.
if (length(occ_rows) > 0) {
  occ_data <- do.call(rbind, occ_rows)
}

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

Rename the name and WoRMS ID columns to their Darwin Core terms and convert each WoRMS ID to an LSID. This will be our primary DarwinCore data frame.

# Map the working columns onto their Darwin Core terms and turn each WoRMS ID
# into an LSID string; rows without an ID keep NA.
lsid_prefix <- "urn:lsid:marinespecies.org:taxname:"

occurrence <- occ_data %>%
  rename(
    scientificName = canonicalFull,
    scientificNameID = wormsIDs
  ) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0(lsid_prefix, scientificNameID)
    )
  )

occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name, locality and event date.

# digest() hashes its whole argument as a single object, so a vector input
# yields one hash. Vectorize() wraps it to return one hash per element.
vdigest <- Vectorize(digest)

# Generate occurrenceID: "<short_name>:occurrence:<md5>", where the md5 is
# taken over the record's scientific name, locality and event date.
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(paste(scientificName, locality, eventDate), algo = "md5"),
      sep = ":"
    )
  )

decimalLongitude/decimalLatitude

Locality data was retrieved via georeferencing the included site maps from the paper. These maps have been saved as TIFs and points saved as a csv. First we will use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. See above.

The calculations below are used to calculate the boundaries for the EML file.

# Fetch the Marine World Heritage site polygons once and read them with sf.
gpkg_path <- paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep = "/")

if (!file.exists(gpkg_path)) {
  # A GeoPackage is a binary file: mode = "wb" stops download.file() from
  # writing it in text mode, which corrupts the file on Windows.
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}

shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326
# Some sites appear in the GeoPackage as several features (core and buffer
# areas). Grouping by site name and summarising leaves one row per site.
shapes_by_site <- group_by(shapes, name)
shapes_processed <- summarize(shapes_by_site)

# Pick out the geometry for Banc d'Arguin National Park.
park_rows <- which(shapes_processed$name == "Banc d'Arguin National Park")
ind_shape <- shapes_processed$geom[park_rows]

occurrenceStatus

# Every record in this dataset is a presence; add it as a constant column.
occurrenceStatus <- "present"
occurrence <- occurrence %>%
  mutate(occurrenceStatus = occurrenceStatus)

basisOfRecord

# Add the same basisOfRecord value to every record as a constant column.
basisOfRecord <- "HumanObservation"
occurrence <- occurrence %>%
  mutate(basisOfRecord = basisOfRecord)

Extra Terms

geodeticDatum

# Add the geodetic datum of the coordinates as a constant column.
geodeticDatum <- "WGS84"
occurrence <- occurrence %>%
  mutate(geodeticDatum = geodeticDatum)

country

# Add the country as a constant column on every record.
country <- "Mauritania"
occurrence <- occurrence %>%
  mutate(country = country)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

# Keep only the Darwin Core columns, in the order they are written to
# occurrence.csv.
dwc_columns <- c(
  "occurrenceID", "scientificName", "scientificNameID", "eventDate",
  "country", "locality", "fieldNumber", "decimalLatitude",
  "decimalLongitude", "coordinateUncertaintyInMeters", "geodeticDatum",
  "occurrenceStatus", "basisOfRecord"
)
occurrence <- occurrence %>%
  select(all_of(dwc_columns))

# Report missing required OBIS fields or values in the occurrence table.
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 0 × 0

Create the EML file

This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.

# Set the EML schema version to 2.1.1, the same version named in the GBIF
# schemaLocation that is written to eml.xml later in this notebook.
emld::eml_version("eml-2.1.1")
## [1] "eml-2.1.1"
# Dataset title (the title of the source thesis).
title <- "Biodiversity and community assemblage of shallow habitats of the National Park of Banc d'Arguin (Mauritania): influence of habitat, season and site"

# Alternate identifier: the IPT resource URL built from the dataset short name.
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)

# Abstract text for the dataset.
abstract_text <- "The biodiversity and community assemblages of seagrass and sand habitats located in shallow water habitats of the Banc d’Arguin were sampled with a beach seine at 3 sites, during two sampling missions in December and April. The objectives were to test if the community assemblages in term of abundance, species diversity and assemblage structure were different between habitats, as well as the effect of the season and the site."
abstract <- eml$abstract(para = abstract_text)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use, interpretation of a data set.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

# Creator: the author of the source thesis.
author_name <- eml$individualName(givenName = "Nicolas", surName = "Compain")
creator <- list(eml$creator(individualName = author_name))

# Contact: the OBIS Secretariat help desk.
contact <- eml$creator(
  individualName = eml$individualName(givenName = "OBIS", surName = "Secretariat"),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# The metadata provider and the data curator are the same person, so their
# shared details are defined once.
curator_name <- eml$individualName(givenName = "Chandra", surName = "Earl")
curator_email <- "c.earl@unesco.org"
curator_organization <- "UNESCO"
curator_position <- "eDNA Scientific Officer"

metadataProvider <- eml$metadataProvider(
  individualName = curator_name,
  electronicMailAddress = curator_email,
  organizationName = curator_organization,
  positionName = curator_position
)

# Associated party: the data curator, recorded with the role "processor".
associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = curator_name,
  electronicMailAddress = curator_email,
  organizationName = curator_organization,
  positionName = curator_position
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Creation timestamp of the EML: local time with millisecond precision followed
# by the UTC offset written as +hh:mm. The clock is read once so that the
# time and the offset come from the same instant (the original called
# Sys.time() three times).
eml_created <- Sys.time()
utc_offset <- format(eml_created, "%z")
eml_date_stamp <- paste0(
  format(eml_created, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

# GBIF block of the EML: timestamp, hierarchy level, a placeholder dataset
# citation, and the citation of the original resource.
additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = eml_date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Compain, N. (2021). Biodiversity and community assemblage of shallow habitats of the National Park of Banc d'Arguin (Mauritania): influence of habitat, season and site. MSc thesis. University of Algarve.")
    )
  )
)

# URL of the source thesis (a URL rather than a DOI, despite the name); it is
# attached to the bibliography citation as its identifier when the EML is
# edited later.
citationdoi <- "https://sapientia.ualg.pt/bitstream/10400.1/17835/1/Thesis%20%28digital%20version%29%20Nicolas%20COMPAIN%20a68831.pdf"

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

# Geographic, taxonomic and temporal coverage of the dataset.
# The bounding box of the site polygon is computed once. West is the minimum
# longitude (xmin) and east the maximum (xmax); the two were swapped before.
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Banc d'Arguin National Park",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin)
    ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes")
      )
  ),
  # Spans both sampling windows used for eventDate (April and December 2020).
  temporalCoverage = eml$temporalCoverage(
    rangeOfDates = eml$rangeOfDates(
      beginDate = eml$beginDate(
        calendarDate = "2020-04-01"
      ),
      endDate = eml$endDate(
        calendarDate = "2020-12-31"
      )
    )
   )
)

Extra MetaData

These fields are not required, though they make the metadata more complete.

# Methods: links to the GitHub project and to this dataset's rendered notebook.
notebook_url <- paste0(
  "https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name, "/", dataset_dir_name
)
methods_para <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"",
  notebook_url,
  "\"> R Notebook</a> for dataset construction methods"
)
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(para = methods_para)
  )
)

# Publication date of the dataset.
pubDate <- "2023-10-15"

# Language of the original document.
language <- "eng"

# Keyword from the GBIF dataset type vocabulary.
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# No updates are planned; the description is left empty.
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

# Licence statement: CC0 1.0 public domain dedication.
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)

purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)

Create and Validate EML

# Assemble the dataset element from the pieces defined above.
dataset_metadata <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)

# Wrap the dataset and the GBIF additional metadata in the root EML element.
# The package ID is the IPT resource URL for this dataset at version 1.0.
my_eml <- eml$eml(
  packageId = paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0"),
  system = "http://gbif.org",
  scope = "system",
  dataset = dataset_metadata,
  additionalMetadata = additionalMetadata
)

# Validate the assembled document against the EML schema.
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”

# Load the meta.xml template that describes the archive's data file.
meta_template <- file.path(path_to_project_root, "scripts_data/meta_occurrence_occurrence_template.xml")
meta <- read_xml(meta_template)

# Default values to write onto the matching <field> elements, keyed by the
# full Darwin Core term URI.
# NOTE(review): eventDate here is whatever value the occurrence-building loop
# assigned last, although records carry two different event dates. Confirm
# that a single default is intended for this field.
field_defaults <- list(
  eventDate = eventDate,
  country = country,
  geodeticDatum = geodeticDatum,
  occurrenceStatus = occurrenceStatus,
  basisOfRecord = basisOfRecord
)
names(field_defaults) <- paste0("http://rs.tdwg.org/dwc/terms/", names(field_defaults))

fields <- xml_find_all(meta, "//d1:field")

# Set the "default" attribute on every field whose term has a default above;
# all other fields are left untouched.
for (field in fields) {
  term <- xml_attr(field, "term")
  if (term %in% names(field_defaults)) {
    xml_set_attr(field, "default", field_defaults[[term]])
  }
}

Save outputs

# Write the three archive components (occurrence.csv, meta.xml, eml.xml) to
# the dataset's output directory.
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep = "/")

# Create the output directory (and any missing parents) so the writes below do
# not fail on a fresh checkout.
if (!dir.exists(dwc_output_dir)) {
  dir.create(dwc_output_dir, recursive = TRUE)
}

# Missing values are written as empty cells rather than "NA".
write.csv(occurrence, file.path(dwc_output_dir, "occurrence.csv"), na = "", row.names = FALSE)
write_xml(meta, file = file.path(dwc_output_dir, "meta.xml"))
write_eml(my_eml, file.path(dwc_output_dir, "eml.xml"))

Edit EML

We have to further edit the eml file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the children nodes to alphabetical order.

#edit the schemaLocation and rearrange gbif node for gbif specific eml file
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))

#change schemaLocation attributes for GBIF
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


#rearrange children nodes under the GBIF element
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
xml_set_attr(bibcitation, "identifier", citationdoi)

eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)

write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))

Zip files to DwC-A

# Bundle everything in the output directory into DwC-A.zip.
output_zip <- file.path(dwc_output_dir, "DwC-A.zip")

# Remove an archive left by a previous run so it is not listed below and
# zipped into the new one.
if (file.exists(output_zip)) {
  unlink(output_zip)
}

# mode = "cherry-pick" is passed so the listed files are added to the archive
# individually.
file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip(
  zipfile = output_zip,
  files = file_paths,
  mode = "cherry-pick"
)

# Delete the loose files only once the archive exists.
if (file.exists(output_zip)) {
  unlink(file_paths)
}