Bundling Zajonz et al. 2019 to a DwC Archive
This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:
Setup
Call the necessary libraries and variables. Suppresses loading messages.
library(magrittr) # To use %<>% pipes
suppressMessages(library(janitor)) # To clean input data
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
suppressMessages(library(rgnparser)) # To clean species names
suppressMessages(library(taxize)) # To get WoRMS IDs
library(worrms) # To get WoRMS IDs
library(digest) # To generate hashes
suppressMessages(library(obistools)) # To generate centroid lat/long and uncertainty
suppressMessages(library(sf)) # To generate wkt polygon
suppressMessages(library(EML)) # To create eml.xml file
library(xml2) # To create the meta.xml file
suppressMessages(library(zip)) # To zip DwC file
Input Parameters and Paths
I pulled out the table pages and rotated them in a separate PDF.
Parsing PDF table to CSV
The data for this reference is formatted as an image-based table inside a PDF across multiple sheets. First, we use pdf_to_table to OCR and parse out the table to a CSV.
# Conda environment used to run the Python table parser
condaenv <- "mwhs-data-mobilization"
# Location of the pdf_to_table.py parser inside the project
script <- file.path(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py")
# PDF to parse, taken from the dataset's "raw" folder
input_pdf <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf)
# Dataset's "processed" folder, which receives the OCR/table output files
output_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")
# Page numbers and table areas handed to pdf_to_table.py (see its documentation).
# One entry per PDF page: "-a" carries the four comma-separated coordinates of
# the table area and "-p" the page that area applies to.
# NOTE(review): the coordinate order and units are not stated here -- confirm
# against the script's documentation before editing any value.
page_args <- c(
"-a 247.864,89.989,510.989,261.546 -p 1",
"-a 186.819,102.619,487.834,279.439 -p 2",
"-a 174.189,86.831,510.989,279.439 -p 3",
"-a 162.611,87.884,506.779,279.439 -p 4",
"-a 160.506,102.619,501.516,291.016 -p 5",
"-a 162.611,94.199,498.359,279.439 -p 6",
"-a 168.926,96.304,506.779,287.859 -p 7",
"-a 173.136,102.619,501.516,288.911 -p 8",
"-a 172.084,86.831,508.884,271.019 -p 9",
"-a 177.346,86.831,506.779,274.176 -p 10",
"-a 175.241,93.146,516.251,279.439 -p 11",
"-a 163.664,89.989,501.516,279.439 -p 12",
"-a 168.926,99.461,497.306,286.806 -p 13",
"-a 162.611,103.671,465.731,289.964 -p 14",
"-a 165.769,99.461,502.569,281.544 -p 15",
"-a 164.716,102.619,501.516,279.439 -p 16",
"-a 160.506,102.619,504.674,279.439 -p 17",
"-a 161.559,102.619,505.726,284.701 -p 18",
"-a 162.611,99.461,503.621,288.911 -p 19",
"-a 160.506,99.461,498.359,283.649 -p 20",
"-a 171.031,100.514,481.519,279.439 -p 21",
"-a 171.031,80.516,487.834,279.439 -p 22",
"-a 164.716,102.619,499.411,285.754 -p 23",
"-a 165.769,96.304,497.306,286.806 -p 24",
"-a 169.979,85.779,467.836,279.439 -p 25",
"-a 165.769,91.041,514.146,279.439 -p 26",
"-a 177.346,102.619,484.676,295.226 -p 27",
"-a 171.031,81.569,478.361,279.439 -p 28",
"-a 169.979,88.936,509.936,282.596 -p 29",
"-a 165.769,102.619,503.621,289.964 -p 30",
"-a 162.611,102.619,504.674,295.226 -p 31",
"-a 165.769,102.619,501.516,286.806 -p 32",
"-a 163.664,95.251,505.726,287.859 -p 33",
"-a 165.769,102.619,502.569,279.439 -p 34",
"-a 173.136,88.936,502.569,279.439 -p 35",
"-a 152.086,85.779,509.936,279.439 -p 36",
"-a 171.031,96.304,503.621,279.439 -p 37",
"-a 162.611,102.619,505.726,293.121 -p 38",
"-a 166.821,102.619,510.989,287.859 -p 39",
"-a 173.136,89.989,506.779,279.439 -p 40",
"-a 164.716,99.461,474.151,279.439 -p 41",
"-a 166.821,96.304,510.989,279.439 -p 42",
"-a 167.874,86.831,513.094,282.596 -p 43",
"-a 167.874,96.304,501.516,287.859 -p 44",
"-a 180.504,86.831,510.989,279.439 -p 45",
"-a 164.716,102.619,336.274,289.964 -p 46"
)
# Flags forwarded verbatim to pdf_to_table.py (see its documentation).
# NOTE(review): the run summary reports stream extraction and concatenation as
# enabled, presumably via these flags -- confirm against the script's help.
run_parameters <- "-s -c -nh"
# Collapse the per-page area/page arguments into one space-separated string
page_args_combined <- paste(page_args, collapse = " ")
# Quote the file-system paths so that a space or shell metacharacter in the
# project path reaches the script as a single argument instead of splitting
# the command line.
command <- paste(
  "conda run -n", condaenv, "python", shQuote(script),
  "-i", shQuote(input_pdf),
  run_parameters, page_args_combined,
  "-o", shQuote(output_dir)
)
# intern = TRUE captures the script's console output as a character vector
system(command, intern = TRUE)
## [1] ""
## [2] "Script Execution Summary"
## [3] "Date and Time: 2023-09-12 10:11:10"
## [4] "------------------------------"
## [5] ""
## [6] "PDF input: ../../../datasets/socotra_archipelago/Zajonz_et_al_2019/raw/Zajonz_et_al._2019-58-108_rotated.pdf"
## [7] "Perform Table Parsing: TRUE"
## [8] "Selected Areas:"
## [9] " Area 1: [247.864, 89.989, 510.989, 261.546]"
## [10] " Area 2: [186.819, 102.619, 487.834, 279.439]"
## [11] " Area 3: [174.189, 86.831, 510.989, 279.439]"
## [12] " Area 4: [162.611, 87.884, 506.779, 279.439]"
## [13] " Area 5: [160.506, 102.619, 501.516, 291.016]"
## [14] " Area 6: [162.611, 94.199, 498.359, 279.439]"
## [15] " Area 7: [168.926, 96.304, 506.779, 287.859]"
## [16] " Area 8: [173.136, 102.619, 501.516, 288.911]"
## [17] " Area 9: [172.084, 86.831, 508.884, 271.019]"
## [18] " Area 10: [177.346, 86.831, 506.779, 274.176]"
## [19] " Area 11: [175.241, 93.146, 516.251, 279.439]"
## [20] " Area 12: [163.664, 89.989, 501.516, 279.439]"
## [21] " Area 13: [168.926, 99.461, 497.306, 286.806]"
## [22] " Area 14: [162.611, 103.671, 465.731, 289.964]"
## [23] " Area 15: [165.769, 99.461, 502.569, 281.544]"
## [24] " Area 16: [164.716, 102.619, 501.516, 279.439]"
## [25] " Area 17: [160.506, 102.619, 504.674, 279.439]"
## [26] " Area 18: [161.559, 102.619, 505.726, 284.701]"
## [27] " Area 19: [162.611, 99.461, 503.621, 288.911]"
## [28] " Area 20: [160.506, 99.461, 498.359, 283.649]"
## [29] " Area 21: [171.031, 100.514, 481.519, 279.439]"
## [30] " Area 22: [171.031, 80.516, 487.834, 279.439]"
## [31] " Area 23: [164.716, 102.619, 499.411, 285.754]"
## [32] " Area 24: [165.769, 96.304, 497.306, 286.806]"
## [33] " Area 25: [169.979, 85.779, 467.836, 279.439]"
## [34] " Area 26: [165.769, 91.041, 514.146, 279.439]"
## [35] " Area 27: [177.346, 102.619, 484.676, 295.226]"
## [36] " Area 28: [171.031, 81.569, 478.361, 279.439]"
## [37] " Area 29: [169.979, 88.936, 509.936, 282.596]"
## [38] " Area 30: [165.769, 102.619, 503.621, 289.964]"
## [39] " Area 31: [162.611, 102.619, 504.674, 295.226]"
## [40] " Area 32: [165.769, 102.619, 501.516, 286.806]"
## [41] " Area 33: [163.664, 95.251, 505.726, 287.859]"
## [42] " Area 34: [165.769, 102.619, 502.569, 279.439]"
## [43] " Area 35: [173.136, 88.936, 502.569, 279.439]"
## [44] " Area 36: [152.086, 85.779, 509.936, 279.439]"
## [45] " Area 37: [171.031, 96.304, 503.621, 279.439]"
## [46] " Area 38: [162.611, 102.619, 505.726, 293.121]"
## [47] " Area 39: [166.821, 102.619, 510.989, 287.859]"
## [48] " Area 40: [173.136, 89.989, 506.779, 279.439]"
## [49] " Area 41: [164.716, 99.461, 474.151, 279.439]"
## [50] " Area 42: [166.821, 96.304, 510.989, 279.439]"
## [51] " Area 43: [167.874, 86.831, 513.094, 282.596]"
## [52] " Area 44: [167.874, 96.304, 501.516, 287.859]"
## [53] " Area 45: [180.504, 86.831, 510.989, 279.439]"
## [54] " Area 46: [164.716, 102.619, 336.274, 289.964]"
## [55] "Pages: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46"
## [56] "Concatenate: True"
## [57] "Concatenate across headers: True"
## [58] "Stream Extraction: True"
## [59] "Lattice Extraction: False"
## [60] ""
## [61] "Parsing Tables"
## [62] "------------------------------"
## [63] ""
## [64] ""
## [65] "Saving to CSV"
## [66] "CSV file: ../../../datasets/socotra_archipelago/Zajonz_et_al_2019/processed/Zajonz_et_al._2019-58-108_rotated_tables_parsed_concatenated.csv"
## [67] "------------------------------"
## [68] ""
## [69] ""
## [70] "Run Details: ../../../datasets/socotra_archipelago/Zajonz_et_al_2019/processed/Zajonz_et_al._2019-58-108_rotated_parameters.txt"
## [71] "Finished"
## [72] ""
Read source data
Now we’ll read in the csv table outputted from the previous step
# File name of the concatenated table written by the PDF parsing step
processed_csv <- "Zajonz_et_al._2019-58-108_rotated_tables_parsed_concatenated.csv"
# Load it from the dataset's "processed" folder
input_data <- read.csv(
  file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", processed_csv)
)
# Show the first rows as a formatted table
knitr::kable(head(input_data))
X0 | X1 |
---|---|
Orectolobiformes | NA |
Rhincodontidae:. 1 sp. | NA |
Rhincodon typus Smith, 1828 | NA |
Hemiscylliidae: 1 sp. | NA |
Chiloscyllium arabicum Gubanov, 1980 | NA |
Stegostomatidae: 1 sp. | NA |
Preprocessing
Here we tidy the data up, since OCR and table parsing errors are common, and keep only the list of species, since this is a checklist.
Tidy Data
# Drop fully empty rows/columns, then normalise the column names to lowercase
input_data <- clean_names(remove_empty(input_data, c("rows", "cols")))
# Keep only candidate species lines by discarding, in turn:
#   - lines containing a colon
#   - lines made of a single word
#   - lines that start with "("
# and finally any row or column left empty by the filtering
cleaned_data <- input_data %>%
  filter(!str_detect(x0, ":")) %>%
  filter(str_count(x0, "\\S+") > 1) %>%
  filter(!str_starts(x0, "^\\(")) %>%
  remove_empty(c("rows", "cols"))
# Show the first rows as a formatted table
knitr::kable(head(cleaned_data))
x0 |
---|
Rhincodon typus Smith, 1828 |
Chiloscyllium arabicum Gubanov, 1980 |
Stegostoma fasciatum (Hermann, 1783) |
Nebrius ferrugineus (Lesson, 1831) |
Isurus oxyrinchus Rafinesque, 1810 |
Mustelus mosis Hemprich & Ehrenberg, 1899 |
Get WoRMS IDs
Auto matching
First we will try to do this automatically by cleaning the species names using gnparser and then using the taxize library to query the WoRMS database.
# Parse the authorship out of each name with gnparser.
# The name column is selected explicitly: `cleaned_data[,]` only yields a
# character vector while the data frame happens to have exactly one column,
# and would hand gn_parse() a data frame as soon as a second column survives.
parsed_names <- rgnparser::gn_parse(cleaned_data[["x0"]])
# Look up the WoRMS ID for one parsed name.
# `element` is a single gnparser result; its canonical full name is searched
# with fuzzy matching, first restricted to accepted names and, if that lookup
# reports "not found", again without that restriction.
# Returns the get_wormsid() result of the first lookup that is not reported as
# "not found", or NA when both lookups are.
get_worms_id_from_element <- function(element) {
  taxon_name <- element$canonical$full
  # get_wormsid() reports the lookup outcome in the "match" attribute
  was_not_found <- function(lookup) attr(lookup, "match") == "not found"

  accepted_hit <- get_wormsid(taxon_name, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (!was_not_found(accepted_hit)) {
    return(accepted_hit)
  }

  any_hit <- get_wormsid(taxon_name, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
  if (was_not_found(any_hit)) {
    return(NA)
  }
  any_hit
}
# Look up a WoRMS ID for every parsed name; names that gnparser could not
# parse are not searched and get NA
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (!parsed_name$parsed) {
    return(NA)
  }
  get_worms_id_from_element(parsed_name)
})
##
## id target authority status
## 1 439446 Randallia Stimpson, 1857 accepted
## 5 441327 Randallia bulligera Rathbun, 1898 accepted
## 7 441328 Randallia curacaoensis Rathbun, 1922 accepted
## 14 1663147 Randallia jingomao Hu & Tao, 2004 accepted
## 15 441331 Randallia laevis (Borradaile, 1916) accepted
## 18 441332 Randallia minuta Rathbun, 1935 accepted
## 21 441333 Randallia ornata (Randall, 1840) accepted
## 23 1660821 Randallia pleistocenica Rathbun, 1926 accepted
## 24 1675261 Randallia prolanata Hu & Tao, 1996 accepted
## 28 1635436 Randallia saitoensis Karasawa, 1993 accepted
## 36 270695 Randallichthys Anderson, Kami & Johnson, 1977 accepted
## 37 282547 Randallichthys filamentosus (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target
## 1 439446 Randallia
## 2 441325 Randallia agaricias
## 3 441326 Randallia americana
## 4 1380547 Randallia angelica
## 5 441327 Randallia bulligera
## 6 455328 Randallia coronata
## 7 441328 Randallia curacaoensis
## 8 455329 Randallia distincta
## 9 455330 Randallia eburnea
## 10 441329 Randallia gilberti
## 11 441330 Randallia granulata
## 12 455333 Randallia granuloides
## 13 455334 Randallia japonica
## 14 1663147 Randallia jingomao
## 15 441331 Randallia laevis
## 16 455335 Randallia lamellidentata
## 17 455337 Randallia mesjatzevi
## 18 441332 Randallia minuta
## 19 455338 Randallia mirabilis
## 20 455339 Randallia nana
## 21 441333 Randallia ornata
## 22 455340 Randallia pila
## 23 1660821 Randallia pleistocenica
## 24 1675261 Randallia prolanata
## 25 455341 Randallia pustulilabris
## 26 455342 Randallia pustuloides
## 27 455343 Randallia pustulosa
## 28 1635436 Randallia saitoensis
## 29 455344 Randallia serenei
## 30 455345 Randallia speciosa
## 31 1646091 Randallia strouhali
## 32 1675259 Randallia trinucloidea
## 33 455346 Randallia trituberculata
## 34 455347 Randallia villosa
## 35 455348 Randallia vitjazi
## 36 270695 Randallichthys
## 37 282547 Randallichthys filamentosus
## authority status
## 1 Stimpson, 1857 accepted
## 2 Rathbun, 1898 superseded combination
## 3 (Rathbun, 1894) superseded combination
## 4 Garth, 1940 junior subjective synonym
## 5 Rathbun, 1898 accepted
## 6 Alcock & Anderson, 1894 superseded combination
## 7 Rathbun, 1922 accepted
## 8 Rathbun, 1894 superseded combination
## 9 Alcock, 1896 superseded combination
## 10 Rathbun, 1906 unaccepted
## 11 Miers in Tizard, Moseley, Buchanan & Murray, 1885 superseded combination
## 12 Sakai, 1961 superseded combination
## 13 Yokoya, 1933 junior subjective synonym
## 14 Hu & Tao, 2004 accepted
## 15 (Borradaile, 1916) accepted
## 16 Wood-Mason, 1892 superseded combination
## 17 Zarenkov, 1990 superseded combination
## 18 Rathbun, 1935 accepted
## 19 Zarenkov, 1969 superseded combination
## 20 Zarenkov, 1990 superseded combination
## 21 (Randall, 1840) accepted
## 22 Tan, 1996 superseded combination
## 23 Rathbun, 1926 accepted
## 24 Hu & Tao, 1996 accepted
## 25 Alcock, 1896 junior subjective synonym
## 26 Sakai, 1961 superseded combination
## 27 Wood-Mason in Wood-Mason & Alcock, 1891 superseded combination
## 28 Karasawa, 1993 accepted
## 29 Richer de Forges, 1983 superseded combination
## 30 Chen, 1989 superseded combination
## 31 Bachmayer, 1953 superseded combination
## 32 Hu & Tao, 1996 superseded combination
## 33 Sakai, 1961 superseded combination
## 34 Chen, 1989 superseded combination
## 35 Zarenkov, 1994 junior subjective synonym
## 36 Anderson, Kami & Johnson, 1977 accepted
## 37 (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target authority status
## 1 439446 Randallia Stimpson, 1857 accepted
## 5 441327 Randallia bulligera Rathbun, 1898 accepted
## 7 441328 Randallia curacaoensis Rathbun, 1922 accepted
## 14 1663147 Randallia jingomao Hu & Tao, 2004 accepted
## 15 441331 Randallia laevis (Borradaile, 1916) accepted
## 18 441332 Randallia minuta Rathbun, 1935 accepted
## 21 441333 Randallia ornata (Randall, 1840) accepted
## 23 1660821 Randallia pleistocenica Rathbun, 1926 accepted
## 24 1675261 Randallia prolanata Hu & Tao, 1996 accepted
## 28 1635436 Randallia saitoensis Karasawa, 1993 accepted
## 36 270695 Randallichthys Anderson, Kami & Johnson, 1977 accepted
## 37 282547 Randallichthys filamentosus (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target
## 1 439446 Randallia
## 2 441325 Randallia agaricias
## 3 441326 Randallia americana
## 4 1380547 Randallia angelica
## 5 441327 Randallia bulligera
## 6 455328 Randallia coronata
## 7 441328 Randallia curacaoensis
## 8 455329 Randallia distincta
## 9 455330 Randallia eburnea
## 10 441329 Randallia gilberti
## 11 441330 Randallia granulata
## 12 455333 Randallia granuloides
## 13 455334 Randallia japonica
## 14 1663147 Randallia jingomao
## 15 441331 Randallia laevis
## 16 455335 Randallia lamellidentata
## 17 455337 Randallia mesjatzevi
## 18 441332 Randallia minuta
## 19 455338 Randallia mirabilis
## 20 455339 Randallia nana
## 21 441333 Randallia ornata
## 22 455340 Randallia pila
## 23 1660821 Randallia pleistocenica
## 24 1675261 Randallia prolanata
## 25 455341 Randallia pustulilabris
## 26 455342 Randallia pustuloides
## 27 455343 Randallia pustulosa
## 28 1635436 Randallia saitoensis
## 29 455344 Randallia serenei
## 30 455345 Randallia speciosa
## 31 1646091 Randallia strouhali
## 32 1675259 Randallia trinucloidea
## 33 455346 Randallia trituberculata
## 34 455347 Randallia villosa
## 35 455348 Randallia vitjazi
## 36 270695 Randallichthys
## 37 282547 Randallichthys filamentosus
## authority status
## 1 Stimpson, 1857 accepted
## 2 Rathbun, 1898 superseded combination
## 3 (Rathbun, 1894) superseded combination
## 4 Garth, 1940 junior subjective synonym
## 5 Rathbun, 1898 accepted
## 6 Alcock & Anderson, 1894 superseded combination
## 7 Rathbun, 1922 accepted
## 8 Rathbun, 1894 superseded combination
## 9 Alcock, 1896 superseded combination
## 10 Rathbun, 1906 unaccepted
## 11 Miers in Tizard, Moseley, Buchanan & Murray, 1885 superseded combination
## 12 Sakai, 1961 superseded combination
## 13 Yokoya, 1933 junior subjective synonym
## 14 Hu & Tao, 2004 accepted
## 15 (Borradaile, 1916) accepted
## 16 Wood-Mason, 1892 superseded combination
## 17 Zarenkov, 1990 superseded combination
## 18 Rathbun, 1935 accepted
## 19 Zarenkov, 1969 superseded combination
## 20 Zarenkov, 1990 superseded combination
## 21 (Randall, 1840) accepted
## 22 Tan, 1996 superseded combination
## 23 Rathbun, 1926 accepted
## 24 Hu & Tao, 1996 accepted
## 25 Alcock, 1896 junior subjective synonym
## 26 Sakai, 1961 superseded combination
## 27 Wood-Mason in Wood-Mason & Alcock, 1891 superseded combination
## 28 Karasawa, 1993 accepted
## 29 Richer de Forges, 1983 superseded combination
## 30 Chen, 1989 superseded combination
## 31 Bachmayer, 1953 superseded combination
## 32 Hu & Tao, 1996 superseded combination
## 33 Sakai, 1961 superseded combination
## 34 Chen, 1989 superseded combination
## 35 Zarenkov, 1994 junior subjective synonym
## 36 Anderson, Kami & Johnson, 1977 accepted
## 37 (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target authority status
## 1 439446 Randallia Stimpson, 1857 accepted
## 5 441327 Randallia bulligera Rathbun, 1898 accepted
## 7 441328 Randallia curacaoensis Rathbun, 1922 accepted
## 14 1663147 Randallia jingomao Hu & Tao, 2004 accepted
## 15 441331 Randallia laevis (Borradaile, 1916) accepted
## 18 441332 Randallia minuta Rathbun, 1935 accepted
## 21 441333 Randallia ornata (Randall, 1840) accepted
## 23 1660821 Randallia pleistocenica Rathbun, 1926 accepted
## 24 1675261 Randallia prolanata Hu & Tao, 1996 accepted
## 28 1635436 Randallia saitoensis Karasawa, 1993 accepted
## 36 270695 Randallichthys Anderson, Kami & Johnson, 1977 accepted
## 37 282547 Randallichthys filamentosus (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
## id target
## 1 439446 Randallia
## 2 441325 Randallia agaricias
## 3 441326 Randallia americana
## 4 1380547 Randallia angelica
## 5 441327 Randallia bulligera
## 6 455328 Randallia coronata
## 7 441328 Randallia curacaoensis
## 8 455329 Randallia distincta
## 9 455330 Randallia eburnea
## 10 441329 Randallia gilberti
## 11 441330 Randallia granulata
## 12 455333 Randallia granuloides
## 13 455334 Randallia japonica
## 14 1663147 Randallia jingomao
## 15 441331 Randallia laevis
## 16 455335 Randallia lamellidentata
## 17 455337 Randallia mesjatzevi
## 18 441332 Randallia minuta
## 19 455338 Randallia mirabilis
## 20 455339 Randallia nana
## 21 441333 Randallia ornata
## 22 455340 Randallia pila
## 23 1660821 Randallia pleistocenica
## 24 1675261 Randallia prolanata
## 25 455341 Randallia pustulilabris
## 26 455342 Randallia pustuloides
## 27 455343 Randallia pustulosa
## 28 1635436 Randallia saitoensis
## 29 455344 Randallia serenei
## 30 455345 Randallia speciosa
## 31 1646091 Randallia strouhali
## 32 1675259 Randallia trinucloidea
## 33 455346 Randallia trituberculata
## 34 455347 Randallia villosa
## 35 455348 Randallia vitjazi
## 36 270695 Randallichthys
## 37 282547 Randallichthys filamentosus
## authority status
## 1 Stimpson, 1857 accepted
## 2 Rathbun, 1898 superseded combination
## 3 (Rathbun, 1894) superseded combination
## 4 Garth, 1940 junior subjective synonym
## 5 Rathbun, 1898 accepted
## 6 Alcock & Anderson, 1894 superseded combination
## 7 Rathbun, 1922 accepted
## 8 Rathbun, 1894 superseded combination
## 9 Alcock, 1896 superseded combination
## 10 Rathbun, 1906 unaccepted
## 11 Miers in Tizard, Moseley, Buchanan & Murray, 1885 superseded combination
## 12 Sakai, 1961 superseded combination
## 13 Yokoya, 1933 junior subjective synonym
## 14 Hu & Tao, 2004 accepted
## 15 (Borradaile, 1916) accepted
## 16 Wood-Mason, 1892 superseded combination
## 17 Zarenkov, 1990 superseded combination
## 18 Rathbun, 1935 accepted
## 19 Zarenkov, 1969 superseded combination
## 20 Zarenkov, 1990 superseded combination
## 21 (Randall, 1840) accepted
## 22 Tan, 1996 superseded combination
## 23 Rathbun, 1926 accepted
## 24 Hu & Tao, 1996 accepted
## 25 Alcock, 1896 junior subjective synonym
## 26 Sakai, 1961 superseded combination
## 27 Wood-Mason in Wood-Mason & Alcock, 1891 superseded combination
## 28 Karasawa, 1993 accepted
## 29 Richer de Forges, 1983 superseded combination
## 30 Chen, 1989 superseded combination
## 31 Bachmayer, 1953 superseded combination
## 32 Hu & Tao, 1996 superseded combination
## 33 Sakai, 1961 superseded combination
## 34 Chen, 1989 superseded combination
## 35 Zarenkov, 1994 junior subjective synonym
## 36 Anderson, Kami & Johnson, 1977 accepted
## 37 (Fourmanoir, 1970) accepted
##
## More than one WORMS ID found for taxon 'Randall'!
##
## Enter rownumber of taxon (other inputs will return 'NA'):
# Combine the original names, the parsed canonical names and the WoRMS IDs
# into one data frame with one row per cleaned name.
# The columns are built as whole vectors instead of rbind()-ing one row per
# loop iteration: this avoids quadratic copying, handles zero rows (a
# 1:nrow() loop would iterate over c(1, 0)), and no longer depends on
# `cleaned_data[i,]` collapsing to a scalar for a one-column data frame.
canonical_full <- vapply(
  parsed_names,
  function(parsed_name) {
    full_name <- parsed_name$canonical$full
    # Entries without a canonical form are recorded as NA
    if (is.null(full_name)) NA_character_ else as.character(full_name)
  },
  character(1),
  USE.NAMES = FALSE
)
first_worms_id <- vapply(
  worms_ids,
  # Keep only the first ID of each lookup; failed lookups are NA
  function(worms_id) as.character(worms_id[1]),
  character(1),
  USE.NAMES = FALSE
)
combined_dataframe <- data.frame(
  CleanedData = cleaned_data[["x0"]],
  CanonicalFull = canonical_full,
  WormsIDs = first_worms_id,
  stringsAsFactors = FALSE
)
# Show the first rows as a formatted table
knitr::kable(head(combined_dataframe))
CleanedData | CanonicalFull | WormsIDs |
---|---|---|
Rhincodon typus Smith, 1828 | Rhincodon typus | 105847 |
Chiloscyllium arabicum Gubanov, 1980 | Chiloscyllium arabicum | 277827 |
Stegostoma fasciatum (Hermann, 1783) | Stegostoma fasciatum | 220032 |
Nebrius ferrugineus (Lesson, 1831) | Nebrius ferrugineus | 220030 |
Isurus oxyrinchus Rafinesque, 1810 | Isurus oxyrinchus | 105839 |
Mustelus mosis Hemprich & Ehrenberg, 1899 | Mustelus mosis | 214558 |
Human Verification
Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search for unmatched species in WoRMS and add the ID, and remove rows that were not automatically removed in the earlier cleaning steps.
# Manual corrections, addressed by row position in combined_dataframe, so this
# block depends on the exact row order produced by the preceding steps.
# Three-value assignments set CanonicalFull, identificationQualifier and
# WormsIDs; the first of them creates the identificationQualifier column.
# Two-value assignments address columns 2:3 by position (CanonicalFull and
# WormsIDs in the column order used when the data frame was built).
# Because c() mixes strings and numbers, the IDs are stored as character.
# NOTE(review): for qualified records (cf., aff., sp., hybrids) CanonicalFull is
# a single-word name, presumably the genus with a genus-level WoRMS ID -- verify.
combined_dataframe[30, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gymnura", "cf. poecilura", 105754)
combined_dataframe[31, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gymnura", "aff. tentaculata", 105754)
combined_dataframe[47, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gymnothorax", "cf. chilospilus", 125636)
combined_dataframe[57, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gymnothorax", "cf. pseudothyrsoideus", 125636)
combined_dataframe[65, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Anodontostoma", "cf. chacunda", 268322)
combined_dataframe[83, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Crenimugil", "cf. buchanani", 151497)
combined_dataframe[87, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Osteomugil", "cf. cunnesius", 1042878)
combined_dataframe[96, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Cheilopogon", "cf. spilopterus", 125691)
combined_dataframe[120, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Scorpaenopsis", "cf. lactomaculata", 204563)
combined_dataframe[146, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Epinephelus", "cf. chlorostigma", 126068)
combined_dataframe[147, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Epinephelus", "cf. coioides", 126068)
combined_dataframe[154, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Epinephelus", "cf. indistinctus", 126068)
combined_dataframe[156, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Epinephelus", "cf. malabaricus", 126068)
combined_dataframe[178, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pseudochromis", "cf. omanensis", 205515)
combined_dataframe[179, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pseudochromis", "cf. punctatus", 205515)
combined_dataframe[187, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Plesiops", "cf. mystaxus", 203918)
combined_dataframe[194, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Priacanthus", "cf. tayenus", 126049)
combined_dataframe[197,2:3] = c("Apogonichthyoides pseudotaeniatus", 475091)
combined_dataframe[198, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Apogonichthyoides", "cf. taeniatus", 204673)
combined_dataframe[199, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Apogonichthyoides", "cf. timorensis", 204673)
combined_dataframe[201, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Cheilodipterus", "cf. artus", 206449)
combined_dataframe[207,2:3] = c("Jaydia queketti", 209280)
combined_dataframe[219, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Sillago", "cf. sihama", 126072)
combined_dataframe[252, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pristipomoides", "cf. filamentosus", 159804)
combined_dataframe[260, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Lutjanus", "cf. indicus", 159791)
combined_dataframe[280, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gerres", "cf. infasciatus", 204064)
combined_dataframe[292, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Plectorhinchus", "cf. chubbi", 126010)
combined_dataframe[304, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Argyrops", "cf. spinifer", 206644)
combined_dataframe[306, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Diplodus", "cf. kotschyi", 126076)
combined_dataframe[315, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Lethrinus", "cf. olivaceus", 206059)
combined_dataframe[326,2:3] = c("Mulloidichthys flavolineatus flavicaudus", 881626)
combined_dataframe[363, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chaetodon", "cf. mesoleucos", 125954)
combined_dataframe[370, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chaetodon", "Chaetodon collare × Chaetodon lunula", 125954)
combined_dataframe[384, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pomacanthus", "Pomacanthus asfur × Pomacanthus maculosus", 159286)
combined_dataframe[385, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pomacanthus", "Pomacanthus semicirculatus × Pomacanthus maculosus", 159286)
combined_dataframe[388, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Amphiprion", "cf. chagosensis", 205723)
combined_dataframe[390, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chromis", "cf. acares", 126045)
combined_dataframe[393, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chromis", "cf. nigrura", 126045)
combined_dataframe[402, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Dascyllus", "Dascyllus carneus × Dascyllus marginatus", 205736)
#combination not in WoRMs
combined_dataframe[420,2:3] = c("Plectroglyphidodon leucozonus cingulum", NA)
combined_dataframe[421, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pomacentrus", "cf. aquilus", 204256)
combined_dataframe[424, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pomacentrus", "sp. 2 [aff. leptus]", 204256)
combined_dataframe[425, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pomacentrus", "cf. sulfureus", 204256)
combined_dataframe[432, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Cheilinus", "cf. fasciatus", 204502)
combined_dataframe[437, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Cirrhilabrus", "cf. cyanopleura", 204972)
combined_dataframe[443, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Oxycheilinus", "cf. mentalis", 205227)
combined_dataframe[466, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Halichoeres", "cf. stigmaticus", 158813)
combined_dataframe[467, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Halichoeres", "cf. zeylonicus", 158813)
combined_dataframe[476, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Leptojulis", "cf. cyanopleura", 269660)
combined_dataframe[480, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Stethojulis", "cf. strigiventer", 204183)
combined_dataframe[483, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Thalassoma", "cf. hardwicke", 126024)
combined_dataframe[489, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Thalassoma", "cf. rueppellii", 126024)
combined_dataframe[492, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chlorurus", "cf. gibbus", 204543)
combined_dataframe[501, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Scarus", "cf. persicus", 159299)
combined_dataframe[504, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Scarus", "cf. scaber", 159299)
combined_dataframe[505, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Scarus", "cf. tricolor", 159299)
combined_dataframe[513, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Helcogramma", "cf. obtusirostris", 206858)
combined_dataframe[516, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Ecsenius", "cf. bicolor", 204781)
combined_dataframe[520, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Ecsenius", "n. sp. [pulcher-complex]", 204781)
combined_dataframe[524, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Scartella", "cf. emarginata", 125923)
combined_dataframe[532, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Meiacanthus", "cf. mossambicus", 206901)
combined_dataframe[537, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Alloblennius", "cf. parvus", 205245)
combined_dataframe[547, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gnatholepis", "cf. cauerensis", 204522)
combined_dataframe[548, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Awaous", "cf. aeneofuscus", 826711)
combined_dataframe[563, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Eviota", "cf. pardalota", 205965)
combined_dataframe[564, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Eviota", "cf. prasina", 205965)
combined_dataframe[566, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Fusigobius", "cf. duospilus", 206282)
combined_dataframe[569, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Glossogobius", "sp. 1 [aff. tenuiformis]", 203910)
combined_dataframe[570, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Glossogobius", "sp. 2 [aff. tenuiformis]", 203910)
combined_dataframe[572, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gobiodon", "cf. reticulatus", 204038)
combined_dataframe[583, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Trimma", "sp. 1", 205799)
combined_dataframe[584, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Trimma", "sp. 2", 205799)
combined_dataframe[585, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Trimma", "sp. 3", 205799)
combined_dataframe[588, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Xenisthmus", "cf. balius", 205125)
combined_dataframe[590, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Gunnellichthys", "cf. viridescens", 205532)
combined_dataframe[591, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Ptereleotris", "cf. arabica", 204246)
combined_dataframe[598, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Siganus", "cf. luridus", 126071)
combined_dataframe[607, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Acanthurus", "cf. nigricans", 125908)
combined_dataframe[614, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Ctenochaetus", "cf. binotatus", 204635)
combined_dataframe[618, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Zebrasoma", "cf. scopas", 204630)
combined_dataframe[627, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Naso", "cf. tuberosus", 205230)
# Drop, by position, the rows that the earlier automatic cleaning left behind.
# NOTE(review): presumably non-species lines -- verify the indices whenever the
# upstream parsing or filtering changes.
combined_dataframe <- combined_dataframe[-c(27, 270, 327, 340, 679),]
Darwin Core mapping
Required Terms
OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.
scientificName/scientificNameID
Create a dataframe with unique taxa only (though this should already be unique). This will be our primary DarwinCore data frame.
# Build the primary Darwin Core table: keep one row per distinct
# name / qualifier / WoRMS ID combination, rename the columns to their DwC
# terms, and expand each WoRMS ID into an LSID (rows without an ID stay NA).
occurrence <- combined_dataframe %>%
  distinct(CanonicalFull, identificationQualifier, WormsIDs) %>%
  rename(
    scientificName = CanonicalFull,
    scientificNameID = WormsIDs
  ) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )
occurrenceID
OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name.
# digest() is not vectorized: given a vector it returns a single hash for the
# whole vector. Wrapping it in Vectorize() yields one hash per element instead.
vdigest <- Vectorize(digest)
# Generate occurrenceID as "<short_name>:occurrence:<md5 hash>", where the hash
# is taken over the scientific name and identification qualifier pasted together.
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(paste(scientificName, identificationQualifier), algo = "md5"),
      sep = ":"
    )
  )
eventDate
This is NULL since this is technically a checklist and we do not know the collection date.
decimalLongitude/decimalLatitude
Use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. The WKT strings are from https://github.com/iobis/mwhs-shapes.
# Local path of the Marine World Heritage site shapes (GeoPackage).
gpkg_path <- paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep = "/")
# Download the shapes only when no local copy exists yet.
if (!file.exists(gpkg_path)) {
  # mode = "wb": a GeoPackage is a binary file, and download.file()'s default
  # text mode can corrupt binary downloads on Windows.
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}
# Read the site polygons into an sf object.
shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source
## `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg'
## using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS: 4326
# For some sites the GeoPackage holds core as well as buffer areas, so collapse
# the features to a single geometry per site name.
shapes_processed <- summarize(group_by(shapes, name))
# Pick the geometry of the Socotra Archipelago site
ind_shape <- shapes_processed$geom[shapes_processed$name %in% "Socotra Archipelago"]
# Convert the geometry to a WKT string
wkt <- st_as_text(ind_shape, digits = 6)
# Centroid (and radius) of the WKT footprint, used as the record coordinates
localities <- calculate_centroid(wkt)
occurrence <- occurrence %>%
  mutate(
    decimalLatitude = localities$decimalLatitude,
    decimalLongitude = localities$decimalLongitude
  )
Extra Terms
coordinateUncertaintyInMeters
Post-processing
Check data
Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.
# Keep only the Darwin Core columns, in their final output order
occurrence <- occurrence %>%
  select(
    occurrenceID, scientificName, identificationQualifier, scientificNameID,
    eventDate, country, locality, decimalLatitude, decimalLongitude,
    coordinateUncertaintyInMeters, footprintWKT, geodeticDatum,
    occurrenceStatus, basisOfRecord
  )
# Report missing required fields and empty values
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 683 × 4
## level field row message
## <chr> <chr> <int> <chr>
## 1 error eventDate 1 Empty value for required field eventDate
## 2 error eventDate 2 Empty value for required field eventDate
## 3 error eventDate 3 Empty value for required field eventDate
## 4 error eventDate 4 Empty value for required field eventDate
## 5 error eventDate 5 Empty value for required field eventDate
## 6 error eventDate 6 Empty value for required field eventDate
## 7 error eventDate 7 Empty value for required field eventDate
## 8 error eventDate 8 Empty value for required field eventDate
## 9 error eventDate 9 Empty value for required field eventDate
## 10 error eventDate 10 Empty value for required field eventDate
## # ℹ 673 more rows
Create the EML file
This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.
## [1] "eml-2.1.1"
# Dataset title
title <- "Coastal fish diversity of the Socotra Archipelago, Yemen: Fishes Checklist"
# Alternate identifier: the IPT resource URL for this dataset's short name
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)
# Abstract text for the dataset
abstract <- eml$abstract(
  para = "The Socotra Archipelago, located in the eastern Gulf of Aden, has a unique marine environment which combines tropical and ‘pseudo-temperate’ elements. Studies on the fish biogeography of the archipelago, partially framed in regional studies, have substantially outpaced critical elementary research on the archipelago’s fish diversity. The present study seeks to close this gap and identifies the Socotra Archipelago as a major hotspot of coastal fish diversity in the Indian Ocean. The archipelago supports unique coastal fish assemblages which are predominantly composed of coral-associated (“reef”) species, in spite of the limited biogenic reef frameworks. A Preliminary Checklist comprises 682 species with confirmed records and a “Working List” includes an additional 51 records, totalling 733 faunal records in 108 families."
)
People
Here we add the people involved in the project:
The creator is the person or organization responsible for creating the resource itself.
The contact is the person or institution to contact with questions about the use or interpretation of a data set.
The metadataProvider is the person responsible for providing the metadata documentation for the resource.
The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.
# Creators of the resource: one eml$creator() entry per author of the source
# publication, each with name, organization and (where given) e-mail address.
creator <- list(
  # Uwe Zajonz
  eml$creator(
    individualName = eml$individualName(givenName = "Uwe", surName = "Zajonz"),
    organizationName = "Senckenberg Research Institute and Museum of Nature",
    electronicMailAddress = "uzajonz@senckenberg.de"
  ),
  # Edouard Lavergne
  eml$creator(
    individualName = eml$individualName(givenName = "Edouard", surName = "Lavergne"),
    organizationName = "Kyoto University",
    electronicMailAddress = "edouard.lavergne@gmail.com"
  ),
  # Sergey Y. Bogorodsky
  eml$creator(
    individualName = eml$individualName(givenName = "Sergey Y.", surName = "Bogorodsky"),
    organizationName = "Station of Naturalists",
    electronicMailAddress = "ic187196@yandex.ru"
  ),
  # Fouad Naseeb Saeed
  eml$creator(
    individualName = eml$individualName(givenName = "Fouad Naseeb", surName = "Saeed"),
    organizationName = "Environmental Protection Authority, Socotra Branch",
    electronicMailAddress = "imhor.fouad@gmail.com"
  ),
  # Moteah Sheikh Aideed
  eml$creator(
    individualName = eml$individualName(givenName = "Moteah Sheikh", surName = "Aideed"),
    organizationName = "Hadhramout University",
    electronicMailAddress = "sh79mo@yahoo.com"
  ),
  # Friedhelm Krupp (no e-mail address recorded)
  eml$creator(
    individualName = eml$individualName(givenName = "Friedhelm", surName = "Krupp"),
    organizationName = "Senckenberg Research Institute and Museum of Nature"
  )
)
# Contact for questions about the dataset: the OBIS Secretariat helpdesk.
# Built with eml$contact(), the constructor for the element this object fills
# in the dataset, in line with how metadataProvider and associatedParty are built.
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"
  ),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)
# Person who supplied the metadata documentation for this resource
metadataProvider <- eml$metadataProvider(
  individualName = eml$individualName(givenName = "Chandra", surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)
# Associated party with role "processor": the person who mobilized the data
# from the original resource
associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = eml$individualName(givenName = "Chandra", surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)
Additional Metadata
Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.
# Citation pattern, for reference:
#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}
# Read the clock once so that the timestamp and its UTC offset are derived
# from the same instant.
eml_created <- Sys.time()
# "%z" gives the offset as "+hhmm"; it is split below to insert the colon ("+hh:mm").
utc_offset <- format(eml_created, "%z")
additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      # Creation timestamp with millisecond precision and colon-separated offset
      dateStamp = paste0(
        format(eml_created, "%Y-%m-%dT%H:%M:%OS3"),
        substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
      ),
      hierarchyLevel = "dataset",
      # Placeholder text for the dataset citation
      citation = "IPT will autogenerate this",
      # Citation of the original publication
      bibliography = list(
        citation = "Zajonz U, Lavergne E, Bogorodsky S, Saeed F, Aideed M, Krupp F. (2019). Coastal fish diversity of the Socotra Archipelago, Yemen. Zootaxa. 4636. 001-108. "
      )
    )
  )
)
# DOI of the original publication; attached to the bibliography citation when
# the written eml.xml is post-processed
citationdoi <- "https://doi.org/10.11646/zootaxa.4636.1.1"
Coverage
Here we describe the dataset’s geographic, taxonomic and temporal coverage.
#Coverage
# Bounding box of the site geometry, computed once and reused for all four edges
site_bbox <- st_bbox(ind_shape)
coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Socotra Archipelago",
    boundingCoordinates = eml$boundingCoordinates(
      # West is the minimum longitude and east the maximum (the two were
      # previously swapped). This holds for a box that does not cross the
      # antimeridian.
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin
    )
  ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes"
      )
    )
  # Temporal coverage is left disabled (kept below, commented out, as in the
  # original):
  # ),
  # temporalCoverage = eml$temporalCoverage(
  #   rangeOfDates = eml$rangeOfDates(
  #     beginDate = eml$beginDate(
  #       calendarDate = "2019-05-01"
  #     ),
  #     endDate = eml$endDate(
  #       calendarDate = "2016-05-06"
  #     )
  #   )
  )
)
Extra MetaData
These fields are not required, though they make the metadata more complete.
# Methods: a single step whose description links to the GitHub project and to
# the rendered notebook for this site/dataset.
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(
      para = paste0(
        "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
        site_dir_name,
        "/",
        dataset_dir_name,
        "\"> R Notebook</a> for dataset construction methods"
      )
    )
  )
)
# Publication date recorded in the metadata
pubDate <- "2023-10-15"
# Language of the original document
language <- "eng"
# Keyword "Occurrence", with the GBIF dataset type vocabulary named as thesaurus
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)
# Maintenance: empty description, update frequency "notPlanned"
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)
# Licence statement: public domain dedication (CC0 1.0)
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)
# Why the dataset was published
purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)
# Free-text additional information
additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)
Create and Validate EML
# Assemble the complete EML document from the pieces defined above
my_eml <- eml$eml(
  packageId = paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0"),
  system = "http://gbif.org",
  scope = "system",
  dataset = eml$dataset(
    alternateIdentifier = alternateIdentifier,
    title = title,
    creator = creator,
    metadataProvider = metadataProvider,
    associatedParty = associatedParty,
    pubDate = pubDate,
    coverage = coverage,
    language = language,
    abstract = abstract,
    keywordSet = keywordSet,
    contact = contact,
    methods = methods,
    intellectualRights = intellectualRights,
    purpose = purpose,
    maintenance = maintenance,
    additionalInfo = additionalInfo
  ),
  additionalMetadata = additionalMetadata
)
# Validate the assembled document; the result is printed
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)
Create meta.xml file
This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_checklist_template.xml”
# Load the meta.xml template and collect its <field> elements
meta_template <- paste(path_to_project_root, "scripts_data/meta_occurrence_checklist_template.xml", sep = "/")
meta <- read_xml(meta_template)
fields <- xml_find_all(meta, "//d1:field")
# Set the "default" attribute on each field whose term is listed below.
# switch() evaluates only the matching arm; fields with any other term are
# left untouched.
for (field in fields) {
  term <- xml_attr(field, "term")
  switch(
    term,
    "http://rs.tdwg.org/dwc/terms/eventDate" =
      xml_set_attr(field, "default", eventDate),
    "http://rs.tdwg.org/dwc/terms/country" =
      xml_set_attr(field, "default", country),
    "http://rs.tdwg.org/dwc/terms/locality" =
      xml_set_attr(field, "default", locality),
    "http://rs.tdwg.org/dwc/terms/decimalLatitude" =
      xml_set_attr(field, "default", localities$decimalLatitude),
    "http://rs.tdwg.org/dwc/terms/decimalLongitude" =
      xml_set_attr(field, "default", localities$decimalLongitude),
    "http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters" =
      xml_set_attr(field, "default", localities$coordinateUncertaintyInMeters),
    "http://rs.tdwg.org/dwc/terms/footprintWKT" =
      xml_set_attr(field, "default", wkt),
    "http://rs.tdwg.org/dwc/terms/geodeticDatum" =
      xml_set_attr(field, "default", geodeticDatum),
    "http://rs.tdwg.org/dwc/terms/occurrenceStatus" =
      xml_set_attr(field, "default", occurrenceStatus),
    "http://rs.tdwg.org/dwc/terms/basisOfRecord" =
      xml_set_attr(field, "default", basisOfRecord)
  )
}
# Add identificationQualifier: insert a new <field> element as a sibling of the
# third template field and give it its index and term.
# NOTE(review): index "3" should equal the 0-based column position of
# identificationQualifier in occurrence.csv. The column reordering earlier in
# this notebook places it third (0-based 2), ahead of scientificNameID, so
# confirm this against the template's field indices.
new_field <- xml_add_sibling(fields[[3]], "field")
xml_set_attr(new_field, "index", "3")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/identificationQualifier")
fields <- append(fields, list(new_field))
Save outputs
# Output folder for this dataset's Darwin Core Archive files
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep = "/")
# Create the folder (and any missing parents) so the writes below do not fail
# when it does not exist yet; an existing folder is left as it is.
dir.create(dwc_output_dir, recursive = TRUE, showWarnings = FALSE)
# Occurrence table: NA values are written as empty strings, without row names
write.csv(occurrence, paste0(dwc_output_dir, "/occurrence.csv"), na = "", row.names = FALSE)
# Archive descriptor
write_xml(meta, file = paste0(dwc_output_dir, "/meta.xml"))
# Dataset metadata
write_eml(my_eml, paste0(dwc_output_dir, "/eml.xml"))
Edit EML
We have to further edit the EML file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the children nodes in alphabetical order.
# Re-open the written eml.xml to apply the GBIF-specific edits
eml_file <- paste0(dwc_output_dir, "/eml.xml")
eml_content <- read_xml(eml_file)
# Root element: set the GBIF profile schemaLocation, add the dc namespace,
# remove the stmml namespace attribute and set the language
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")
# Locate the children of the gbif element that need to be reordered
hierarchyLevel <- xml_find_all(eml_content, ".//hierarchyLevel")
dateStamp <- xml_find_all(eml_content, ".//dateStamp")
citation <- xml_find_all(eml_content, "./additionalMetadata/metadata/gbif/citation")
# Attach the publication DOI to the bibliography citation
bibcitation <- xml_find_all(eml_content, "./additionalMetadata/metadata/gbif/bibliography/citation")
xml_set_attr(bibcitation, "identifier", citationdoi)
# Detach the three elements from the document
xml_remove(hierarchyLevel)
xml_remove(dateStamp)
xml_remove(citation)
# Re-insert each one as the first child of gbif. Inserting in reverse order
# leaves them as dateStamp, hierarchyLevel, citation, ahead of the remaining
# children.
gbif_nodes <- xml_find_all(eml_content, ".//gbif")
xml_add_child(gbif_nodes, citation, .where = 0)
xml_add_child(gbif_nodes, hierarchyLevel, .where = 0)
xml_add_child(gbif_nodes, dateStamp, .where = 0)
# Overwrite the file with the edited document
write_xml(eml_content, eml_file)