Bundling Compain 2021 to a DwC Archive
This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:
Setup
Call the necessary libraries and variables. Suppresses loading messages.
library(magrittr) # To use %<>% pipes
suppressMessages(library(janitor)) # To clean input data
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
suppressMessages(library(rgnparser)) # To clean species names
suppressMessages(library(taxize)) # To get WoRMS IDs
library(worrms) # To get WoRMS IDs
library(digest) # To generate hashes
suppressMessages(library(obistools)) # To generate centroid lat/long and uncertainty
suppressMessages(library(sf)) # To generate wkt polygon
suppressMessages(library(EML)) # To create eml.xml file
library(xml2) # To create the meta.xml file
suppressMessages(library(zip)) # To zip DwC file
suppressMessages(library(tidyr))
Input Parameters and Paths
Parsing PDF table to CSV
The data for this reference is formatted as an image-based table inside a PDF across multiple sheets. First, we use pdf_to_table to OCR and parse out the table to a CSV.
# Run the project's pdf_to_table.py script through conda to parse the tables on
# PDF pages 24-26 into CSV files. File paths are shell-quoted so that
# directories containing spaces do not split the command line.
# Conda environment used to run the script
condaenv <- "mwhs-data-mobilization"
# Path to the Python script
script <- file.path(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py")
# Input PDF file path
input_pdf <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf)
# Output directory for OCR/table files
output_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")
# One entry per page: -a is the table area, -p the page number (see documentation)
page_args <- c(
  "-a 64.713,15.258,520.338,726.579 -p 24",
  "-a 65.766,12.101,526.651,841.274 -p 25",
  "-a 52.086,16.31,509.815,580.316 -p 26"
)
# Define run parameters (see documentation)
run_parameters <- "-s"
# Combine page arguments and execute
page_args_combined <- paste(page_args, collapse = " ")
command <- paste(
  "conda run -n", condaenv, "python", shQuote(script),
  "-i", shQuote(input_pdf), run_parameters, page_args_combined,
  "-o", shQuote(output_dir)
)
# intern = TRUE returns the script's console output so it is shown in the notebook
system(command, intern = TRUE)
## [1] ""
## [2] "Script Execution Summary"
## [3] "Date and Time: 2023-10-03 18:35:03"
## [4] "------------------------------"
## [5] ""
## [6] "PDF input: ../../../datasets/banc_darguin_national_park/Compain_2021/raw/Thesis_Nicolas_COMPAIN_a68831.pdf"
## [7] "Perform Table Parsing: TRUE"
## [8] "Selected Areas:"
## [9] " Area 1: [64.713, 15.258, 520.338, 726.579]"
## [10] " Area 2: [65.766, 12.101, 526.651, 841.274]"
## [11] " Area 3: [52.086, 16.31, 509.815, 580.316]"
## [12] "Pages: 24, 25, 26"
## [13] "Concatenate: False"
## [14] "Concatenate across headers: False"
## [15] "Stream Extraction: True"
## [16] "Lattice Extraction: False"
## [17] ""
## [18] "Parsing Tables"
## [19] "------------------------------"
## [20] ""
## [21] ""
## [22] "Saving to CSV"
## [23] "CSV file(s):"
## [24] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_1.csv"
## [25] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_2.csv"
## [26] "\t../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_tables_parsed_3.csv"
## [27] "------------------------------"
## [28] ""
## [29] ""
## [30] "Run Details: ../../../datasets/banc_darguin_national_park/Compain_2021/processed/Thesis_Nicolas_COMPAIN_a68831_parameters.txt"
## [31] "Finished"
## [32] ""
Read source data
Now we’ll read in the csv tables outputted from the previous step.
# File names of the three tables written by the PDF parsing step
processed_csv1 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_1.csv"
processed_csv2 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_2.csv"
processed_csv3 <- "Thesis_Nicolas_COMPAIN_a68831_tables_parsed_3.csv"
# Directory that holds the processed CSV files for this dataset
processed_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")
# Read each parsed table into its own data frame
input_data1 <- read.csv(file.path(processed_dir, processed_csv1))
input_data2 <- read.csv(file.path(processed_dir, processed_csv2))
input_data3 <- read.csv(file.path(processed_dir, processed_csv3))
Preprocessing
Here we tidy the data up.
Tidy Data
# Tidy the three parsed tables the same way: drop empty rows/columns, clean the
# header names, keep the selected columns, relabel them with site codes and add
# a running row index (ID) that is used as the merge key below.
input_data1 <- input_data1 %>%
  remove_empty(c("rows", "cols")) %>% # Drop rows and columns that are entirely empty
  clean_names() %>%
  select(unnamed_0, n, n_1, n_2, n_3, n_4, n_tl)
colnames(input_data1) <- c("sciname", "w1", "w2", "w3", "w5", "w7", "w8")
input_data1[["ID"]] <- seq_len(nrow(input_data1))

input_data2 <- input_data2 %>%
  remove_empty(c("rows", "cols")) %>% # Drop rows and columns that are entirely empty
  clean_names() %>%
  select(unnamed_0, n, n_1, n_tl, n_tl_1, n_2, n_tl_2)
colnames(input_data2) <- c("sciname", "s1", "s2", "s3", "s4", "s5", "s6")
input_data2[["ID"]] <- seq_len(nrow(input_data2))

input_data3 <- input_data3 %>%
  remove_empty(c("rows", "cols")) %>% # Drop rows and columns that are entirely empty
  clean_names() %>%
  select(unnamed_0, n, n_tl, n_1)
colnames(input_data3) <- c("sciname", "s7", "s8", "s9")
input_data3[["ID"]] <- seq_len(nrow(input_data3))

# Join the tables side by side on the row index.
# NOTE(review): this assumes the three tables list the same taxa in the same
# row order - confirm against the PDF.
input_data <- merge(input_data1, input_data2, by = "ID")
input_data <- merge(input_data, input_data3, by = "ID")

# Keep only the name column from the first table (sciname.x); drop the merge
# key and the duplicate name columns contributed by the other two tables
kept_columns <- setdiff(names(input_data), c("ID", "sciname.y", "sciname"))
input_data <- input_data[, kept_columns, drop = FALSE]

# Strip a leading run of digits plus one whitespace character from the names
input_data[["sciname.x"]] <- gsub("^\\d+\\s", "", input_data[["sciname.x"]])
cleaned_data <- input_data

#to preview pretty table
knitr::kable(head(cleaned_data))
sciname.x | w1 | w2 | w3 | w5 | w7 | w8 | s1 | s2 | s3 | s4 | s5 | s6 | s7 | s8 | s9 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Atherina boyeri | NA | NA | NA | NA | NA | NA | 95 | 700 5.1 | 39 4.5 | 335 | NA | NA | |||
Blennidae | NA | NA | NA | 1 | NA | NA | 25 | NA | 12 5 | NA | NA | ||||
Callinectes marginatus | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | |||||
Citharichthys stampfilii | NA | NA | NA | NA | NA | 13 | NA | NA | NA | NA | |||||
Coptodon guineensis | NA | 1 | 1 | NA | NA | NA | 5 | 1 26.8 | NA | NA | 8 | ||||
Cynoglossus senegalensis | NA | NA | NA | NA | NA | NA | NA | NA | 19 7.3 | NA | NA |
Get WoRMS IDs
Auto matching
First we will try to do this automatically by cleaning the species names using gnparser and then using the taxize library to query the WoRMS database.
# Parse the raw names (first column of cleaned_data) with gnparser. Each parsed
# record exposes the `parsed` flag and the `canonical$full` name that the WoRMS
# lookup code reads.
parsed_names <- rgnparser::gn_parse(cleaned_data[,1])
# Look up the WoRMS ID for one parsed name.
#
# element: one gnparser record; only element$canonical$full is read.
# Returns the result of get_wormsid(), or NA when both searches report
# "not found".
#
# Accepted names are searched first. If that search reports "not found", the
# search is repeated without the accepted-only restriction. No further
# fallback is attempted; names that stay unmatched are fixed by hand later.
get_worms_id_from_element <- function(element) {
  # identical() instead of `==` so a result without a "match" attribute is
  # treated as "not 'not found'" rather than aborting the if() with a
  # zero-length condition
  not_found <- function(result) {
    identical(attr(result, "match"), "not found")
  }
  query <- element$canonical$full
  worms_id <- get_wormsid(query, searchtype = "scientific", fuzzy = TRUE, messages = FALSE, accepted = TRUE)
  if (not_found(worms_id)) {
    worms_id <- get_wormsid(query, searchtype = "scientific", messages = FALSE, fuzzy = TRUE)
    if (not_found(worms_id)) {
      worms_id <- NA
    }
  }
  worms_id
}
# Look up a WoRMS ID for every name gnparser managed to parse; names it could
# not parse get NA without querying WoRMS
worms_ids <- lapply(parsed_names, function(element) {
  if (!element$parsed) {
    return(NA)
  }
  get_worms_id_from_element(element)
})
##
## id target
## 1 126224 Hippocampus
## 5 1525460 Hippocampus
## 6 275182 Hippocampus abdominalis
## 10 275183 Hippocampus alatus
## 11 275184 Hippocampus algiricus
## 12 275185 Hippocampus angustus
## 18 275186 Hippocampus barbouri
## 19 275187 Hippocampus bargibanti
## 23 212238 Hippocampus borboniensis
## 25 275189 Hippocampus breviceps
## 30 212233 Hippocampus camelopardalis
## 31 212234 Hippocampus capensis
## 32 886550 Hippocampus casscsio
## 34 275190 Hippocampus colemani
## 35 275191 Hippocampus comes
## 36 275192 Hippocampus coronatus
## 37 388711 Hippocampus curvicuspis
## 40 398432 Hippocampus debelius
## 41 275193 Hippocampus denise
## 44 159445 Hippocampus erectus
## 49 275194 Hippocampus fisheri
## 50 212230 Hippocampus fuscus
## 51 212230 Hippocampus fuscus
## 55 275195 Hippocampus grandiceps
## 56 154776 Hippocampus guttulatus
## 57 248042 Hippocampus guttulatus microstephanus
## 59 1376208 Hippocampus haema
## 60 275196 Hippocampus hendriki
## 64 127380 Hippocampus hippocampus
## 67 212239 Hippocampus histrix
## 71 275197 Hippocampus ingens
## 72 1288522 Hippocampus japapigu
## 74 275198 Hippocampus jayakari
## 75 275199 Hippocampus jugumus
## 78 212236 Hippocampus kelloggi
## 80 212237 Hippocampus kuda
## 84 275200 Hippocampus lichtensteinii
## 90 275201 Hippocampus minotaur
## 91 275202 Hippocampus mohnikei
## 96 275203 Hippocampus montebelloensis
## 97 275204 Hippocampus multispinus
## 98 1437127 Hippocampus nalu
## 106 712534 Hippocampus paradoxus
## 107 275205 Hippocampus patagonicus
## 109 306811 Hippocampus planifrons
## 112 398433 Hippocampus pontohi
## 116 388712 Hippocampus pusillus
## 117 275207 Hippocampus queenslandicus
## 121 159446 Hippocampus reidi
## 124 398434 Hippocampus satomiae
## 125 275208 Hippocampus semispinosus
## 127 275209 Hippocampus sindonis
## 128 275210 Hippocampus spinosissimus
## 131 275211 Hippocampus subelongatus
## 132 306822 Hippocampus suezensis
## 137 212232 Hippocampus trimaculatus
## 140 474956 Hippocampus tyro
## 143 398436 Hippocampus waleananus
## 144 212235 Hippocampus whitei
## 145 275212 Hippocampus zebra
## 146 275213 Hippocampus zosterae
## authority
## 1 Rafinesque, 1810
## 5 Rafinesque, 1810
## 6 Lesson, 1827
## 10 Kuiter, 2001
## 11 Kaup, 1856
## 12 Günther, 1870
## 18 Jordan & Richardson, 1908
## 19 Whitley, 1970
## 23 Duméril, 1870
## 25 Peters, 1869
## 30 Bianconi, 1854
## 31 Boulenger, 1900
## 32 Zhang, Qin, Wang & Lin, 2016
## 34 Kuiter, 2003
## 35 Cantor, 1849
## 36 Temminck & Schlegel, 1850
## 37 Fricke, 2004
## 40 Gomon & Kuiter, 2009
## 41 Lourie & Randall, 2003
## 44 Perry, 1810
## 49 Jordan & Evermann, 1903
## 50 Rüppell, 1838
## 51 Rüppell, 1838
## 55 Kuiter, 2001
## 56 Cuvier, 1829
## 57 Slastenenko, 1937
## 59 Han, Kim, Kai & Senou, 2017
## 60 Kuiter, 2001
## 64 (Linnaeus, 1758)
## 67 Kaup, 1856
## 71 Girard, 1858
## 72 Short, Smith, Motomura, Harasti & Hamilton, 2018
## 74 Boulenger, 1900
## 75 Kuiter, 2001
## 78 Jordan & Snyder, 1901
## 80 Bleeker, 1852
## 84 Kaup, 1856
## 90 Gomon, 1997
## 91 Bleeker, 1853
## 96 Kuiter, 2001
## 97 Kuiter, 2001
## 98 Short, Claassens, Smith, De Brauwer, Hamilton, Stat & Harasti, 2020
## 106 Foster & Gomon, 2010
## 107 Piacentino & Luzzatto, 2004
## 109 Peters, 1877
## 112 Lourie & Kuiter, 2008
## 116 Fricke, 2004
## 117 Horne, 2001
## 121 Ginsburg, 1933
## 124 Lourie & Kuiter, 2008
## 125 Kuiter, 2001
## 127 Jordan & Snyder, 1901
## 128 Weber, 1913
## 131 Castelnau, 1873
## 132 Duncker, 1940
## 137 Leach, 1814
## 140 Randall & Lourie, 2009
## 143 Gomon & Kuiter, 2009
## 144 Bleeker, 1855
## 145 Whitley, 1964
## 146 Jordan & Gilbert, 1882
## status
## 1 accepted
## 5 accepted
## 6 accepted
## 10 accepted
## 11 accepted
## 12 accepted
## 18 accepted
## 19 accepted
## 23 accepted
## 25 accepted
## 30 accepted
## 31 accepted
## 32 accepted
## 34 accepted
## 35 accepted
## 36 accepted
## 37 accepted
## 40 accepted
## 41 accepted
## 44 accepted
## 49 accepted
## 50 accepted
## 51 accepted
## 55 accepted
## 56 accepted
## 57 accepted
## 59 accepted
## 60 accepted
## 64 accepted
## 67 accepted
## 71 accepted
## 72 accepted
## 74 accepted
## 75 accepted
## 78 accepted
## 80 accepted
## 84 accepted
## 90 accepted
## 91 accepted
## 96 accepted
## 97 accepted
## 98 accepted
## 106 accepted
## 107 accepted
## 109 accepted
## 112 accepted
## 116 accepted
## 117 accepted
## 121 accepted
## 124 accepted
## 125 accepted
## 127 accepted
## 128 accepted
## 131 accepted
## 132 accepted
## 137 accepted
## 140 accepted
## 143 accepted
## 144 accepted
## 145 accepted
## 146 accepted
##
## More than one WORMS ID found for taxon 'Hippocampus'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target
## 1 126224 Hippocampus
## 5 1525460 Hippocampus
## 6 275182 Hippocampus abdominalis
## 7 306774 Hippocampus agnesae
## 8 306775 Hippocampus aimei
## 9 716772 Hippocampus aimei
## 10 275183 Hippocampus alatus
## 11 275184 Hippocampus algiricus
## 12 275185 Hippocampus angustus
## 13 306776 Hippocampus antiquorum
## 14 306777 Hippocampus antiquus
## 15 400954 Hippocampus arnei
## 16 716773 Hippocampus arnei
## 17 306778 Hippocampus aterrimus
## 18 275186 Hippocampus barbouri
## 19 275187 Hippocampus bargibanti
## 20 713214 Hippocampus bicuspis
## 21 275188 Hippocampus biocellatus
## 22 306779 Hippocampus bleekeri
## 23 212238 Hippocampus borboniensis
## 24 306780 Hippocampus brachyrhynchus
## 25 275189 Hippocampus breviceps
## 26 306781 Hippocampus brevirostris
## 27 306782 Hippocampus brunneus
## 28 400945 Hippocampus cameleopardalis
## 29 400946 Hippocampus cameleopardalus
## 30 212233 Hippocampus camelopardalis
## 31 212234 Hippocampus capensis
## 32 886550 Hippocampus casscsio
## 33 306783 Hippocampus chinensis
## 34 275190 Hippocampus colemani
## 35 275191 Hippocampus comes
## 36 275192 Hippocampus coronatus
## 37 388711 Hippocampus curvicuspis
## 38 306784 Hippocampus dahli
## 39 306785 Hippocampus deanei
## 40 398432 Hippocampus debelius
## 41 275193 Hippocampus denise
## 42 306786 Hippocampus ecuadorensis
## 43 306787 Hippocampus elongatus
## 44 159445 Hippocampus erectus
## 45 306788 Hippocampus erinaceus
## 46 154815 Hippocampus europaeus
## 47 306789 Hippocampus fascicularis
## 48 713215 Hippocampus filamentosus
## 49 275194 Hippocampus fisheri
## 50 212230 Hippocampus fuscus
## 51 212230 Hippocampus fuscus
## 52 306790 Hippocampus graciliformis
## 53 306791 Hippocampus gracilis
## 54 306792 Hippocampus gracilissimus
## 55 275195 Hippocampus grandiceps
## 56 154776 Hippocampus guttulatus
## 57 248042 Hippocampus guttulatus microstephanus
## 58 323136 Hippocampus guttulatus multiannularis
## 59 1376208 Hippocampus haema
## 60 275196 Hippocampus hendriki
## 61 154458 Hippocampus heptagonus
## 62 306793 Hippocampus hildebrandi
## 63 306794 Hippocampus hilonis
## 64 127380 Hippocampus hippocampus
## 65 322937 Hippocampus hippocampus microcoronatus
## 66 322938 Hippocampus hippocampus microstephanus
## 67 212239 Hippocampus histrix
## 68 306795 Hippocampus horai
## 69 306796 Hippocampus hudsonius
## 70 400949 Hippocampus hystrix
## 71 275197 Hippocampus ingens
## 72 1288522 Hippocampus japapigu
## 73 306797 Hippocampus japonicus
## 74 275198 Hippocampus jayakari
## 75 275199 Hippocampus jugumus
## 76 306798 Hippocampus kampylotrachelos
## 77 306799 Hippocampus kaupii
## 78 212236 Hippocampus kelloggi
## 79 306800 Hippocampus kincaidi
## 80 212237 Hippocampus kuda
## 81 323205 Hippocampus kuda multiannularis
## 82 306801 Hippocampus laevicaudatus
## 83 713217 Hippocampus lenis
## 84 275200 Hippocampus lichtensteinii
## 85 154777 Hippocampus longirostris
## 86 306802 Hippocampus manadensis
## 87 306803 Hippocampus mannulus
## 88 306804 Hippocampus marginalis
## 89 306805 Hippocampus melanospilos
## 90 275201 Hippocampus minotaur
## 91 275202 Hippocampus mohnikei
## 92 306806 Hippocampus moluccensis
## 93 400952 Hippocampus monckei
## 94 400951 Hippocampus monickei
## 95 400953 Hippocampus monikei
## 96 275203 Hippocampus montebelloensis
## 97 275204 Hippocampus multispinus
## 98 1437127 Hippocampus nalu
## 99 306807 Hippocampus natalensis
## 100 306808 Hippocampus novaehebudorum
## 101 400955 Hippocampus novaehollandae
## 102 306809 Hippocampus novaehollandiae
## 103 713212 Hippocampus obscurus
## 104 713213 Hippocampus obscurus
## 105 306810 Hippocampus obtusus
## 106 712534 Hippocampus paradoxus
## 107 275205 Hippocampus patagonicus
## 108 400947 Hippocampus pentagonus
## 109 306811 Hippocampus planifrons
## 110 306812 Hippocampus poeyi
## 111 306813 Hippocampus polytaenia
## 112 398433 Hippocampus pontohi
## 113 275206 Hippocampus procerus
## 114 306814 Hippocampus punctulatus
## 115 306815 Hippocampus punctulatus
## 116 388712 Hippocampus pusillus
## 117 275207 Hippocampus queenslandicus
## 118 306816 Hippocampus raji
## 119 127381 Hippocampus ramulosus
## 120 306817 Hippocampus regulus
## 121 159446 Hippocampus reidi
## 122 306818 Hippocampus rhynchomacer
## 123 306819 Hippocampus rosamondae
## 124 398434 Hippocampus satomiae
## 125 275208 Hippocampus semispinosus
## 126 398435 Hippocampus severnsi
## 127 275209 Hippocampus sindonis
## 128 275210 Hippocampus spinosissimus
## 129 306820 Hippocampus stylifer
## 130 306821 Hippocampus subcoronatus
## 131 275211 Hippocampus subelongatus
## 132 306822 Hippocampus suezensis
## 133 306823 Hippocampus taeniops
## 134 306824 Hippocampus taeniopterus
## 135 306825 Hippocampus takakurae
## 136 306826 Hippocampus tetragonous
## 137 212232 Hippocampus trimaculatus
## 138 306827 Hippocampus tristis
## 139 306828 Hippocampus tuberculatus
## 140 474956 Hippocampus tyro
## 141 306829 Hippocampus villosus
## 142 306830 Hippocampus vulgaris
## 143 398436 Hippocampus waleananus
## 144 212235 Hippocampus whitei
## 145 275212 Hippocampus zebra
## 146 275213 Hippocampus zosterae
## 2 843450 <NA>
## 3 843451 <NA>
## 4 843452 <NA>
## authority
## 1 Rafinesque, 1810
## 5 Rafinesque, 1810
## 6 Lesson, 1827
## 7 Fowler, 1907
## 8 Roule, 1916
## 9 Roule, 1916
## 10 Kuiter, 2001
## 11 Kaup, 1856
## 12 Günther, 1870
## 13 Leach, 1814
## 14 Risso, 1827
## 15 Roule, 1916
## 16 Roule, 1916
## 17 Jordan & Snyder, 1902
## 18 Jordan & Richardson, 1908
## 19 Whitley, 1970
## 20 Kaup, 1856
## 21 Kuiter, 2001
## 22 Fowler, 1907
## 23 Duméril, 1870
## 24 Duncker, 1914
## 25 Peters, 1869
## 26 Schinz, 1822
## 27 Bean, 1906
## 28 Bianconi, 1854
## 29 Bianconi, 1854
## 30 Bianconi, 1854
## 31 Boulenger, 1900
## 32 Zhang, Qin, Wang & Lin, 2016
## 33 Basilewsky, 1855
## 34 Kuiter, 2003
## 35 Cantor, 1849
## 36 Temminck & Schlegel, 1850
## 37 Fricke, 2004
## 38 Ogilby, 1908
## 39 Duméril, 1861
## 40 Gomon & Kuiter, 2009
## 41 Lourie & Randall, 2003
## 42 Fowler, 1922
## 43 Castelnau, 1873
## 44 Perry, 1810
## 45 Günther, 1870
## 46 Ginsburg, 1933
## 47 Kaup, 1856
## 48 Duméril, 1870
## 49 Jordan & Evermann, 1903
## 50 Rüppell, 1838
## 51 Rüppell, 1838
## 52 McCulloch, 1911
## 53 Gill, 1862
## 54 Temminck & Schlegel, 1850
## 55 Kuiter, 2001
## 56 Cuvier, 1829
## 57 Slastenenko, 1937
## 58 Ginsburg, 1937
## 59 Han, Kim, Kai & Senou, 2017
## 60 Kuiter, 2001
## 61 Rafinesque, 1810
## 62 Ginsburg, 1933
## 63 Jordan & Evermann, 1903
## 64 (Linnaeus, 1758)
## 65 Slastenenko, 1938
## 66 Slastenenko, 1937
## 67 Kaup, 1856
## 68 Duncker, 1926
## 69 DeKay, 1842
## 70 Kaup, 1856
## 71 Girard, 1858
## 72 Short, Smith, Motomura, Harasti & Hamilton, 2018
## 73 Kaup, 1856
## 74 Boulenger, 1900
## 75 Kuiter, 2001
## 76 Bleeker, 1854
## 77 Duméril, 1870
## 78 Jordan & Snyder, 1901
## 79 Townsend & Barbour, 1906
## 80 Bleeker, 1852
## 81 Raj, 1941
## 82 Kaup, 1856
## 83 De Vis, 1908
## 84 Kaup, 1856
## 85 Schinz, 1822
## 86 Bleeker, 1856
## 87 Cantor, 1849
## 88 Kaup, 1856
## 89 Bleeker, 1854
## 90 Gomon, 1997
## 91 Bleeker, 1853
## 92 Bleeker, 1852
## 93 Bleeker, 1853
## 94 Bleeker, 1853
## 95 Bleeker, 1853
## 96 Kuiter, 2001
## 97 Kuiter, 2001
## 98 Short, Claassens, Smith, De Brauwer, Hamilton, Stat & Harasti, 2020
## 99 von Bonde, 1923
## 100 Fowler, 1944
## 101 Steindachner, 1866
## 102 Steindachner, 1866
## 103 Hemprich & Ehrenberg, 1856
## 104 Ehrenberg, 1871
## 105 Ginsburg, 1933
## 106 Foster & Gomon, 2010
## 107 Piacentino & Luzzatto, 2004
## 108 Rafinesque, 1810
## 109 Peters, 1877
## 110 Howell Rivero, 1934
## 111 Bleeker, 1854
## 112 Lourie & Kuiter, 2008
## 113 Kuiter, 2001
## 114 Guichenot, 1853
## 115 Kaup, 1856
## 116 Fricke, 2004
## 117 Horne, 2001
## 118 Whitley, 1955
## 119 Leach, 1814
## 120 Ginsburg, 1933
## 121 Ginsburg, 1933
## 122 Duméril, 1870
## 123 Borodin, 1928
## 124 Lourie & Kuiter, 2008
## 125 Kuiter, 2001
## 126 Lourie & Kuiter, 2008
## 127 Jordan & Snyder, 1901
## 128 Weber, 1913
## 129 Jordan & Gilbert, 1882
## 130 Günther, 1866
## 131 Castelnau, 1873
## 132 Duncker, 1940
## 133 Fowler, 1904
## 134 Bleeker, 1852
## 135 Tanaka, 1916
## 136 (Mitchill, 1814)
## 137 Leach, 1814
## 138 Castelnau, 1872
## 139 Castelnau, 1875
## 140 Randall & Lourie, 2009
## 141 Günther, 1880
## 142 Cloquet, 1821
## 143 Gomon & Kuiter, 2009
## 144 Bleeker, 1855
## 145 Whitley, 1964
## 146 Jordan & Gilbert, 1882
## 2 <NA>
## 3 <NA>
## 4 <NA>
## status
## 1 accepted
## 5 accepted
## 6 accepted
## 7 unaccepted
## 8 unaccepted
## 9 unaccepted
## 10 accepted
## 11 accepted
## 12 accepted
## 13 unaccepted
## 14 unaccepted
## 15 unaccepted
## 16 unaccepted
## 17 unaccepted
## 18 accepted
## 19 accepted
## 20 unaccepted
## 21 unaccepted
## 22 unaccepted
## 23 accepted
## 24 unaccepted
## 25 accepted
## 26 unaccepted
## 27 unaccepted
## 28 unaccepted
## 29 unaccepted
## 30 accepted
## 31 accepted
## 32 accepted
## 33 unaccepted
## 34 accepted
## 35 accepted
## 36 accepted
## 37 accepted
## 38 unaccepted
## 39 unaccepted
## 40 accepted
## 41 accepted
## 42 unaccepted
## 43 unaccepted
## 44 accepted
## 45 unaccepted
## 46 unaccepted
## 47 unaccepted
## 48 unaccepted
## 49 accepted
## 50 accepted
## 51 accepted
## 52 unaccepted
## 53 unaccepted
## 54 unaccepted
## 55 accepted
## 56 accepted
## 57 accepted
## 58 unaccepted
## 59 accepted
## 60 accepted
## 61 unaccepted
## 62 unaccepted
## 63 unaccepted
## 64 accepted
## 65 unaccepted
## 66 unaccepted
## 67 accepted
## 68 unaccepted
## 69 unaccepted
## 70 unaccepted
## 71 accepted
## 72 accepted
## 73 unaccepted
## 74 accepted
## 75 accepted
## 76 unaccepted
## 77 unaccepted
## 78 accepted
## 79 unaccepted
## 80 accepted
## 81 unaccepted
## 82 unaccepted
## 83 unaccepted
## 84 accepted
## 85 unaccepted
## 86 unaccepted
## 87 unaccepted
## 88 unaccepted
## 89 unaccepted
## 90 accepted
## 91 accepted
## 92 unaccepted
## 93 unaccepted
## 94 unaccepted
## 95 unaccepted
## 96 accepted
## 97 accepted
## 98 accepted
## 99 unaccepted
## 100 unaccepted
## 101 unaccepted
## 102 unaccepted
## 103 unaccepted
## 104 unaccepted
## 105 unaccepted
## 106 accepted
## 107 accepted
## 108 unaccepted
## 109 accepted
## 110 unaccepted
## 111 unaccepted
## 112 accepted
## 113 unaccepted
## 114 unaccepted
## 115 unaccepted
## 116 accepted
## 117 accepted
## 118 unaccepted
## 119 unaccepted
## 120 unaccepted
## 121 accepted
## 122 unaccepted
## 123 unaccepted
## 124 accepted
## 125 accepted
## 126 unaccepted
## 127 accepted
## 128 accepted
## 129 unaccepted
## 130 unaccepted
## 131 accepted
## 132 accepted
## 133 unaccepted
## 134 unaccepted
## 135 unaccepted
## 136 unaccepted
## 137 accepted
## 138 unaccepted
## 139 unaccepted
## 140 accepted
## 141 unaccepted
## 142 unaccepted
## 143 accepted
## 144 accepted
## 145 accepted
## 146 accepted
## 2 quarantined
## 3 quarantined
## 4 quarantined
##
## More than one WORMS ID found for taxon 'Hippocampus'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
# Combine the original table rows, the parsed canonical names and the WoRMS IDs
# into one data frame with one row per row of cleaned_data.
# Rows are built in a list and bound once: this avoids the quadratic cost of
# rbind() inside a loop, and seq_len() copes with an empty input table.
combined_rows <- lapply(seq_len(nrow(cleaned_data)), function(i) {
  canonical_value <- parsed_names[[i]]$canonical$full
  # Names gnparser could not parse have no canonical form
  if (is.null(canonical_value)) {
    canonical_value <- NA
  }
  data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    # Keep only the first element of the lookup result (the ID itself)
    WormsIDs = worms_ids[[i]][1]
  )
})
if (length(combined_rows) > 0) {
  combined_dataframe <- do.call(rbind, combined_rows)
} else {
  combined_dataframe <- data.frame()
}
knitr::kable(head(combined_dataframe))
CleanedData.sciname.x | CleanedData.w1 | CleanedData.w2 | CleanedData.w3 | CleanedData.w5 | CleanedData.w7 | CleanedData.w8 | CleanedData.s1 | CleanedData.s2 | CleanedData.s3 | CleanedData.s4 | CleanedData.s5 | CleanedData.s6 | CleanedData.s7 | CleanedData.s8 | CleanedData.s9 | CanonicalFull | WormsIDs |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Atherina boyeri | NA | NA | NA | NA | NA | NA | 95 | 700 5.1 | 39 4.5 | 335 | NA | NA | Atherina boyeri | 272027 | |||
Blennidae | NA | NA | NA | 1 | NA | NA | 25 | NA | 12 5 | NA | NA | Blennidae | NA | ||||
Callinectes marginatus | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Callinectes marginatus | 241106 | |||||
Citharichthys stampfilii | NA | NA | NA | NA | NA | 13 | NA | NA | NA | NA | Citharichthys stampfilii | NA | |||||
Coptodon guineensis | NA | 1 | 1 | NA | NA | NA | 5 | 1 26.8 | NA | NA | 8 | Coptodon guineensis | 1021112 | ||||
Cynoglossus senegalensis | NA | NA | NA | NA | NA | NA | NA | NA | 19 7.3 | NA | NA | Cynoglossus senegalensis | 274226 |
Human Verification
Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search for unmatched species in WoRMS and add the ID, and remove rows that were not automatically removed in the earlier cleaning steps.
# Hand-checked corrections: row number in combined_dataframe, corrected name
# and WoRMS ID. Columns 17 and 18 hold the canonical name and the WoRMS ID.
manual_matches <- list(
  list(row = 2, name = "Blenniidae", id = 125519),
  list(row = 4, name = "Citharichthys stampflii", id = 275695),
  list(row = 15, name = "Ephippion guttifer", id = 127413),
  list(row = 17, name = "Eucinostomus melanopterus", id = 276423),
  list(row = 20, name = "Gobiidae", id = 125537),
  list(row = 22, name = "Hippocampus", id = 126224),
  list(row = 29, name = "Lithognathus mormyrus", id = 127055),
  list(row = 40, name = "Rhinobatos rhinobatos", id = 105898),
  list(row = 48, name = "Stephanolepis hispidus", id = 127409)
)
for (fix in manual_matches) {
  combined_dataframe[fix$row, 17:18] <- c(fix$name, fix$id)
}
Locality data
Locality data was retrieved from the paper as below:
1 Mamghar (mangrove) sand - 19°22’16”N 16°31’52”W 2 Mamghar (baie saint jean) vegetation - 19°25’09”N 16°22’23”W 3 Iwik (center) vegetation - 19°53’00”N 16°17’34”W 4 Iwik (center) vegetation - 19°53’25”N 16°17’20”W 5 Iwik (center) sand - 19°54’18”N 16°18’35”W 6 Muzan vegetation - 19°54’01”N 16°30’11”W 7 Kiji Vegetation - 19°43’19”N 16°30’05”W 8 Nair Vegetation - 19°52’07”N 16°23’29”W 9 Agnefour Vegetation - 19°51’42”N 16°24’36”W
# Build one occurrence record per non-empty cell of the per-site columns
# (columns 2 to 16 of combined_dataframe).
# Empty template; it is returned unchanged when no cell yields a record.
occ_data <- data.frame(
  canonicalFull = character(),
  wormsIDs = numeric(),
  eventDate = character(),
  locality = character(),
  fieldNumber = character(),
  decimalLongitude = numeric(),
  decimalLatitude = numeric(),
  coordinateUncertaintyInMeters = numeric()
)

# Site metadata keyed by site number.
# NOTE(review): coordinates are kept as character strings, as before, even
# though the template above declares them numeric - confirm the intended type.
site_info <- data.frame(
  fieldNumber = as.character(1:9),
  locality = c(
    "Banc d'Arguin: Mamghar (mangrove) sand",
    "Banc d'Arguin: Mamghar (baie saint jean) vegetation",
    "Banc d'Arguin: Iwik (center) vegetation",
    "Banc d'Arguin: Iwik (center) vegetation extra",
    "Banc d'Arguin: Iwik (center) sand",
    "Banc d'Arguin: Muzan vegetation",
    "Banc d'Arguin: Kiji Vegetation",
    "Banc d'Arguin: Nair Vegetation",
    "Banc d'Arguin: Agnefour Vegetation"
  ),
  decimalLatitude = c(
    "19.371111", "19.419167", "19.883333", "19.890278", "19.905",
    "19.900278", "19.721944", "19.868611", "19.861667"
  ),
  decimalLongitude = c(
    "-16.531111", "-16.373056", "-16.292778", "-16.288889", "-16.309722",
    "-16.503056", "-16.501389", "-16.391389", "-16.41"
  ),
  stringsAsFactors = FALSE
)

# Event date interval per column prefix ("w" and "s"; the original code called
# this value winter_or_spring)
season_dates <- c(w = "2020-12-01/2020-12-31", s = "2020-04-01/2020-04-30")

# Decode every site column name once, e.g. "CleanedData.w1" -> prefix "w", site "1"
count_cols <- 2:16
col_keys <- sub("^[^.]+\\.", "", colnames(combined_dataframe)[count_cols])
key_parts <- str_match(col_keys, "([A-Za-z]+)([0-9]+)")
col_event_date <- unname(season_dates[key_parts[, 2]])
col_site_row <- match(key_parts[, 3], site_info$fieldNumber)

occ_rows <- list()
for (i in seq_len(nrow(combined_dataframe))) {
  for (k in seq_along(count_cols)) {
    cell <- combined_dataframe[i, count_cols[k]]
    # Skip missing and blank cells
    if (is.na(cell) || cell == "") {
      next
    }
    # Fail loudly instead of silently reusing values from a previous iteration
    if (is.na(col_event_date[k]) || is.na(col_site_row[k])) {
      stop("Unrecognised site column: ", col_keys[k], call. = FALSE)
    }
    site <- site_info[col_site_row[k], ]
    occ_rows[[length(occ_rows) + 1L]] <- data.frame(
      canonicalFull = combined_dataframe[i, "CanonicalFull"],
      wormsIDs = combined_dataframe[i, "WormsIDs"],
      locality = site$locality,
      eventDate = col_event_date[k],
      fieldNumber = site$fieldNumber,
      decimalLongitude = site$decimalLongitude,
      decimalLatitude = site$decimalLatitude,
      coordinateUncertaintyInMeters = 50
    )
  }
}
# Bind all records in one call rather than growing occ_data row by row
if (length(occ_rows) > 0) {
  occ_data <- do.call(rbind, occ_rows)
}
Darwin Core mapping
Required Terms
OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.
scientificName/scientificNameID
Create a dataframe with unique taxa only (though this should already be unique). This will be our primary DarwinCore data frame.
# Rename the name/ID columns to their Darwin Core terms and turn each WoRMS ID
# into an LSID; rows without an ID keep NA
occurrence <- occ_data %>%
  rename(
    scientificName = canonicalFull,
    scientificNameID = wormsIDs
  ) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )
occurrenceID
OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name, locality and event date.
# digest() is not vectorized: given a vector it returns one hash for the whole
# vector. Wrap it so each element gets its own hash.
vdigest <- Vectorize(digest)
# Generate occurrenceID: <short_name>:occurrence:<md5 of name, locality and event date>
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(paste(scientificName, locality, eventDate), algo = "md5"),
      sep = ":"
    )
  )
decimalLongitude/decimalLatitude
Locality data was retrieved via georeferencing the included site maps from the paper. These maps have been saved as TIFs and points saved as a csv. First we will use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. See above.
The calculations below are used to calculate the boundaries for the EML file.
# Local copy of the marine World Heritage site shapes (GeoPackage)
gpkg_path <- file.path(path_to_project_root, "scripts_data/marine_world_heritage.gpkg")
# Download it once if it is not already present. mode = "wb" is required
# because the GeoPackage is a binary file; the default text mode corrupts it
# on Windows.
if (!file.exists(gpkg_path)) {
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}
shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source
## `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg'
## using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS: 4326
Post-processing
Check data
Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.
# Keep the Darwin Core columns, in the order they should appear in the archive
occurrence <- occurrence %>%
  select(
    occurrenceID,
    scientificName,
    scientificNameID,
    eventDate,
    country,
    locality,
    fieldNumber,
    decimalLatitude,
    decimalLongitude,
    coordinateUncertaintyInMeters,
    geodeticDatum,
    occurrenceStatus,
    basisOfRecord
  )
# Report missing required fields or values (an empty result means none were found)
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 0 × 0
Create the EML file
This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.
## [1] "eml-2.1.1"
# Dataset title
title <- "Biodiversity and community assemblage of shallow habitats of the National Park of Banc d'Arguin (Mauritania): influence of habitat, season and site"
# Alternate identifier: resource URL on the OBIS IPT, built from the dataset short name
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)
# Abstract, as a single paragraph
abstract_text <- "The biodiversity and community assemblages of seagrass and sand habitats located in shallow water habitats of the Banc d’Arguin were sampled with a beach seine at 3 sites, during two sampling missions in December and April. The objectives were to test if the community assemblages in term of abundance, species diversity and assemblage structure were different between habitats, as well as the effect of the season and the site."
abstract <- eml$abstract(para = abstract_text)
People
Here we add the people involved in the project:
The creator is the person or organization responsible for creating the resource itself.
The contact is the person or institution to contact with questions about the use or interpretation of a data set.
The metadataProvider is the person responsible for providing the metadata documentation for the resource.
The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.
# Creator: author of the original resource (kept as a list of creators)
creator <- list(
  eml$creator(
    individualName = eml$individualName(
      givenName = "Nicolas",
      surName = "Compain"
    )
  )
)

# Contact: who to reach with questions about the dataset.
# Built with eml$contact() so that every party uses its matching constructor.
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"
  ),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# Metadata provider: person who wrote the metadata documentation
metadataProvider <- eml$metadataProvider(
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"
  ),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

# Associated party: data curator who mobilized the data (role "processor")
associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"
  ),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)
Additional Metadata
Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.
# GBIF citation format, for reference:
#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Take the clock reading once so the date/time part and the UTC-offset part of
# dateStamp are guaranteed to come from the same instant
eml_created <- Sys.time()
utc_offset <- format(eml_created, "%z") # offset as "+hhmm" / "-hhmm"
# Timestamp with millisecond precision followed by the offset as "+hh:mm"
date_stamp <- paste0(
  format(eml_created, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Compain, N. (2021). Biodiversity and community assemblage of shallow habitats of the National Park of Banc d'Arguin (Mauritania): influence of habitat, season and site. MSc thesis. University of Algarve."
      )
    )
  )
)

# URL of the original thesis; set later as the identifier of the bibliography citation
citationdoi <- "https://sapientia.ualg.pt/bitstream/10400.1/17835/1/Thesis%20%28digital%20version%29%20Nicolas%20COMPAIN%20a68831.pdf"
Coverage
Here we describe the dataset’s geographic, taxonomic and temporal coverage.
#Coverage
# Bounding box of the site polygon, computed once and reused for all four edges
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Banc d'Arguin National Park",
    boundingCoordinates = eml$boundingCoordinates(
      # West is the minimum longitude and east the maximum (previously swapped)
      westBoundingCoordinate = site_bbox[["xmin"]],
      eastBoundingCoordinate = site_bbox[["xmax"]],
      northBoundingCoordinate = site_bbox[["ymax"]],
      southBoundingCoordinate = site_bbox[["ymin"]]
    )
  ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes"
      )
    )
  ),
  temporalCoverage = eml$temporalCoverage(
    rangeOfDates = eml$rangeOfDates(
      beginDate = eml$beginDate(
        calendarDate = "2020-04-01"
      ),
      endDate = eml$endDate(
        calendarDate = "2020-12-31"
      )
    )
  )
)
Extra MetaData
These fields are not required, though they make the metadata more complete.
# Methods text: links to the project repository and to this dataset's notebook
methods_text <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name,
  "/",
  dataset_dir_name,
  "\"> R Notebook</a> for dataset construction methods"
)
methods_description <- eml$description(para = methods_text)
methods <- eml$methods(
  methodStep = eml$methodStep(description = methods_description)
)
# Publication date of the dataset
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

# Keyword set: dataset type keyword and the thesaurus it comes from
keyword_thesaurus <- "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = keyword_thesaurus
)

# Maintenance: empty description, no updates planned
maintenance_description <- eml$description(para = "")
maintenance <- eml$maintenance(
  description = maintenance_description,
  maintenanceUpdateFrequency = "notPlanned"
)

# Licence statement (CC0 1.0 public domain dedication)
rights_text <- "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
intellectualRights <- eml$intellectualRights(para = rights_text)

# Purpose of publishing the dataset
purpose_text <- "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
purpose <- eml$purpose(para = purpose_text)

# Additional free-text information
additionalInfo <- eml$additionalInfo(para = "marine, harvested by iOBIS")
Create and Validate EML
# Assemble the dataset element from the pieces defined above
eml_dataset <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)

# Package identifier: IPT resource URL plus the version suffix
package_id <- paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0")

# Wrap the dataset and the GBIF additional metadata in the root EML element
my_eml <- eml$eml(
  packageId = package_id,
  system = "http://gbif.org",
  scope = "system",
  dataset = eml_dataset,
  additionalMetadata = additionalMetadata
)

# Validate the assembled document against the EML schema
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)
Create meta.xml file
This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”.
# Read the meta.xml template shipped with the project
meta_template <- paste(
  path_to_project_root,
  "scripts_data/meta_occurrence_occurrence_template.xml",
  sep = "/"
)
meta <- read_xml(meta_template)

# Walk over every <field> element and, for the terms that are constant across
# the dataset, write the constant into the element's "default" attribute
fields <- xml_find_all(meta, "//d1:field")
for (idx in seq_along(fields)) {
  node <- fields[[idx]]
  dwc_term <- xml_attr(node, "term")
  if (dwc_term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    xml_set_attr(node, "default", eventDate)
  } else if (dwc_term == "http://rs.tdwg.org/dwc/terms/country") {
    xml_set_attr(node, "default", country)
  } else if (dwc_term == "http://rs.tdwg.org/dwc/terms/geodeticDatum") {
    xml_set_attr(node, "default", geodeticDatum)
  } else if (dwc_term == "http://rs.tdwg.org/dwc/terms/occurrenceStatus") {
    xml_set_attr(node, "default", occurrenceStatus)
  } else if (dwc_term == "http://rs.tdwg.org/dwc/terms/basisOfRecord") {
    xml_set_attr(node, "default", basisOfRecord)
  }
}
Save outputs
# Output directory for the three Darwin Core Archive files
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep = "/")
occurrence_csv <- paste0(dwc_output_dir, "/occurrence.csv")
meta_xml <- paste0(dwc_output_dir, "/meta.xml")
eml_xml <- paste0(dwc_output_dir, "/eml.xml")

# Occurrence table: missing values written as empty strings, no row names
write.csv(occurrence, occurrence_csv, na = "", row.names = FALSE)
# Archive descriptor
write_xml(meta, file = meta_xml)
# Dataset metadata
write_eml(my_eml, eml_xml)
Edit EML
We have to further edit the EML file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the child nodes in alphabetical order.
# Post-process the eml.xml written above: adjust the root attributes for the
# GBIF EML profile and reorder the children of the <gbif> element.
# Re-read the file that write_eml() produced
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))
# Point schemaLocation at the GBIF profile schema and set the remaining root attributes
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
# A NULL value removes the attribute (here: the stmml namespace declaration)
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")
# Grab the nodes that have to be moved, before they are detached from the tree
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
# Attach the source document URL as the identifier of the bibliography citation
xml_set_attr(bibcitation, "identifier", citationdoi)
# Detach dateStamp, hierarchyLevel and the gbif-level citation from the document
eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
# Re-insert each node as the first child of <gbif> (.where = 0). Inserting in the
# order citation, hierarchyLevel, dateStamp leaves them as dateStamp,
# hierarchyLevel, citation, ahead of the children that were not moved.
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)
# Overwrite eml.xml with the edited document
write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))