Bundling Bakker et al. 2017 to a DwC Archive
This is an R Markdown Notebook for converting the eDNA Data found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:
Setup
Call the necessary libraries and variables. Suppresses loading messages.
library(magrittr) # To use %<>% pipes
suppressMessages(library(janitor)) # To clean input data
suppressMessages(library(dplyr)) # To clean input data
library(stringr) # To clean input data
suppressMessages(library(rgnparser)) # To clean species names
suppressMessages(library(taxize)) # To get WoRMS IDs
library(worrms) # To get WoRMS IDs
library(digest) # To generate hashes
suppressMessages(library(obistools)) # To generate centroid lat/long and uncertainty
suppressMessages(library(sf)) # To generate wkt polygon
suppressMessages(library(EML)) # To create eml.xml file
library(xml2) # To create the meta.xml file
suppressMessages(library(zip)) # To zip DwC file
Input Parameters and Paths
Parsing PDF table to CSV
We don’t have to do this since we have the raw data.
Read source data
Now we’ll read in the csv files. There are two datasets, the sites data and the number of reads per taxon per site.
# Load the two raw CSV files of this dataset: the site table and the table of
# reads per taxon per site. Both live in the dataset's "raw" directory.
sites_csv <- "41598_2017_17150_MOESM1_ESM.csv"
sites_data <- read.csv(
  file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", sites_csv)
)
reads_csv <- "41598_2017_17150_MOESM2_ESM.csv"
reads_data <- read.csv(
  file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", reads_csv)
)
Preprocessing
Here we tidy the data up and concatenate the tables together into one.
Tidy Data
# Site table: drop empty rows/columns, normalise the column names, and remove
# the summary columns that are not carried into the occurrence records.
sites_data <- sites_data %>%
  remove_empty(which = c("rows", "cols")) %>%
  clean_names() %>%
  select(
    -c(
      raw_reads,
      elasmobranch_reads,
      x_elasmo_reads_per_total_reads,
      elasmobranchs_observed,
      elasmobranch_e_dna_detected
    )
  )
# NOTE(review): rows 77-86 are dropped by position; presumably they are
# non-data rows of the raw sheet - confirm against the CSV.
sites_data <- sites_data[-seq(77, 86), ]

# Reads table: drop empty rows/columns and the taxonomy/summary columns.
# Column names are deliberately left as read, because the loop that follows
# matches them against sites_data$code.
reads_data <- reads_data %>%
  remove_empty(which = c("rows", "cols")) %>%
  select(
    -c(
      rank, best_identity, total_reads, superkingdom, kingdom, phylum,
      class, order, family, genus, species
    )
  )
# NOTE(review): row 65 is dropped by position - confirm it is not a data row.
reads_data <- reads_data[-65, ]
# Reshape the wide read-count table into one record per (taxon, site) pair
# that has a non-zero read count, attaching the metadata of the matching site.
# NOTE(review): columns 3..(ncol - 1) of reads_data are assumed to be the
# per-site read counts, named by site code - confirm against the raw CSV.
sample_cols <- 3:(ncol(reads_data) - 1)

# Collect the records in a preallocated list and bind them once at the end;
# growing a data frame with rbind() inside the loop copies it every iteration.
records <- vector("list", nrow(reads_data) * length(sample_cols))
n_records <- 0L

for (i in seq_len(nrow(reads_data))) {
  for (j in sample_cols) {
    read_count <- reads_data[i, j]
    # Skip non-detections. A missing count is skipped as well instead of
    # aborting with "missing value where TRUE/FALSE needed".
    if (is.na(read_count) || read_count == "0") {
      next
    }

    # The column name of the reads table is the site code.
    fieldnumber <- names(reads_data)[j]
    row_index <- which(sites_data$code == fieldnumber)
    if (length(row_index) == 0) {
      stop(
        "No row in sites_data has code '", fieldnumber,
        "' (column ", j, " of reads_data)",
        call. = FALSE
      )
    }

    n_records <- n_records + 1L
    records[[n_records]] <- data.frame(
      species = reads_data$scientific_name[i],
      organismQuantity = read_count,
      fieldnumber = fieldnumber,
      # Strip the digits (and any whitespace before them) from column 1 of
      # the site table to obtain the locality.
      locality = gsub("\\s*[0-9]", "", sites_data[row_index, 1]),
      decimalLatitude = sites_data[row_index, 3],
      decimalLongitude = sites_data[row_index, 4],
      eventDate = sites_data[row_index, 5],
      sampleSizeValue = sites_data[row_index, 6],
      eventID = reads_data[i, 1]
    )
  }
}

all_data <- if (n_records > 0L) {
  do.call(rbind, records[seq_len(n_records)])
} else {
  data.frame()
}
Get WoRMS IDs
Auto matching
We will first try to do this automatically, by cleaning the species names with gnparser and then using the taxize library to query the WoRMS database.
# Run gnparser over the verbatim names (first column of all_data) so that the
# canonical name can be separated from authorship.
parsed_names <- rgnparser::gn_parse(all_data[[1]])
# Look up the WoRMS ID for one parsed name.
#
# `element` is one entry of the gnparser result; its `canonical$full` value is
# the search string. The lookup is tried twice with fuzzy scientific-name
# matching: first restricted to accepted names, then without that restriction.
# Returns the result of the first lookup whose "match" attribute is not
# "not found", or NA when both lookups fail.
get_worms_id_from_element <- function(element) {
  canonical_name <- element$canonical$full

  accepted_hit <- get_wormsid(canonical_name, searchtype = "scientific", fuzzy = TRUE, messages = FALSE, accepted = TRUE)
  if (attr(accepted_hit, "match") != "not found") {
    return(accepted_hit)
  }

  any_hit <- get_wormsid(canonical_name, searchtype = "scientific", messages = FALSE, fuzzy = TRUE)
  if (attr(any_hit, "match") != "not found") {
    return(any_hit)
  }

  NA
}
# Resolve every name that gnparser managed to parse; unparsed names get NA.
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (!parsed_name$parsed) {
    return(NA)
  }
  get_worms_id_from_element(parsed_name)
})
# Combine the original records, the parsed canonical name and the WoRMS ID
# into one data frame (one row per row of all_data).
combined_rows <- vector("list", nrow(all_data))
for (i in seq_len(nrow(all_data))) {
  canonical_value <- parsed_names[[i]]$canonical$full
  # An unparsed name has no canonical form. The check must be on this value
  # (not on all_data): a NULL argument would be dropped by data.frame() and
  # leave the row without a CanonicalFull column.
  if (is.null(canonical_value)) {
    canonical_value <- NA
  }
  combined_rows[[i]] <- data.frame(
    all_data[i, ],
    CanonicalFull = canonical_value,
    WormsIDs = worms_ids[[i]][1]
  )
}
# Bind once instead of growing the data frame inside the loop.
combined_dataframe <- if (length(combined_rows) > 0) {
  do.call(rbind, combined_rows)
} else {
  data.frame()
}
knitr::kable(head(combined_dataframe))

| species | organismQuantity | fieldnumber | locality | decimalLatitude | decimalLongitude | eventDate | sampleSizeValue | eventID | CanonicalFull | WormsIDs |
|---|---|---|---|---|---|---|---|---|---|---|
| Timarete caribous | 1 | BE5 | Belize | 16.73402 | -87.82374 | 1/23/2015 | 15 | SK34_000000029 | Timarete caribous | 761956 |
| Timarete caribous | 1 | BE7 | Belize | 16.71980 | -87.87290 | 1/24/2015 | 572 | SK34_000000029 | Timarete caribous | 761956 |
| Timarete caribous | 1 | BE8 | Belize | 16.71538 | -87.87599 | 1/20/2015 | 137518 | SK34_000000029 | Timarete caribous | 761956 |
| Timarete caribous | 4725 | JA1 | Jamaica | 18.47084 | -77.40258 | 1/30/2015 | 6549 | SK34_000000029 | Timarete caribous | 761956 |
| Timarete caribous | 1 | JA2 | Jamaica | 18.47186 | -77.40148 | 2/2/2015 | 11878 | SK34_000000029 | Timarete caribous | 761956 |
| Timarete caribous | 1 | JA6 | Jamaica | 18.47363 | -77.42023 | 2/2/2015 | 2987 | SK34_000000029 | Timarete caribous | 761956 |
Human Verification
Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search WoRMS for the unmatched species and add their IDs, and remove any rows that were not automatically removed in the earlier cleaning steps.
Occurrence Core mapping
Required Terms
OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.
scientificName/scientificNameID
Create a dataframe with unique taxa only (though this should already be unique). This will be our primary DarwinCore data frame.
# Give the matching columns their Darwin Core names and turn each WoRMS ID
# into an LSID string; rows without a WoRMS ID keep NA.
occurrence <- combined_dataframe %>%
  rename(
    scientificName = CanonicalFull,
    scientificNameID = WormsIDs,
    verbatimIdentification = species
  ) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )
occurrenceID
OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and an MD5 hash of the verbatim identification, field number, organism quantity and decimal latitude of the record.
# digest() hashes its whole argument as one object, so it is wrapped with
# Vectorize() to obtain one hash per element of a vector.
vdigest <- Vectorize(digest)
# Generate the occurrenceID: "<short_name>:occurrence:<md5>", where the hash is
# taken over the verbatim name, site code, read count and latitude of the row.
# NOTE(review): eventID is not part of the hashed string, so two rows that
# share all four hashed values would get the same ID - run the duplicate check.
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(
        paste(verbatimIdentification, fieldnumber, organismQuantity, decimalLatitude),
        algo = "md5"
      ),
      sep = ":"
    )
  )
# Check for duplicates
#duplicated(occurrence$occurrenceID)
eventDate
decimalLongitude/decimalLatitude
Use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. The WKT strings are from https://github.com/iobis/mwhs-shapes.
# Fetch the Marine World Heritage site shapes once; later runs reuse the local
# copy under scripts_data/.
gpkg_path <- paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep = "/")
if (!file.exists(gpkg_path)) {
  # A GeoPackage is a binary file. Without mode = "wb", download.file() writes
  # in text mode, which corrupts binary downloads on Windows.
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}
shapes <- st_read(paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep="/"))
## Reading layer `marine_world_heritage' from data source
## `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg'
## using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS: 4326
# Some sites are stored as several features (core plus buffer areas).
# Summarising per site name merges them into one geometry per site.
shapes_processed <- summarize(group_by(shapes, name))
# Pick the geometry of the site used for this dataset:
# Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems
site_row <- which(shapes_processed$name == "Lagoons of New Caledonia: Reef Diversity and Associated Ecosystems")
ind_shape <- shapes_processed$geom[site_row]
Extra Terms
country
locality
recordedBy
organismQuantity
organismQuantityType
sampleSizeValue
DNA Derived Data Extension Mapping
Here, we’ll fill in the data for the DNA Derived Data Extension
# Start the extension table with one row per occurrence, keyed by occurrenceID.
dnaderivedextension <- data.frame(occurrenceID = occurrence$occurrenceID)
pcr_primer_name_forward
pcr_primer_name_reverse
pcr_primer_reference
env_broad_scale
DNA_sequence
pcr_primer_forward
Post-processing
Check data
Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.
# Put the columns of both tables into their final output order.
occurrence <- occurrence %>%
  select(
    occurrenceID, scientificName, scientificNameID, eventDate, country,
    locality, decimalLatitude, decimalLongitude, occurrenceStatus,
    basisOfRecord, organismQuantity, organismQuantityType, sampleSizeValue,
    sampleSizeUnit, verbatimIdentification, fieldnumber, eventID, geodeticDatum
  )
dnaderivedextension <- dnaderivedextension %>%
  select(
    occurrenceID, env_broad_scale, lib_layout, target_gene, seq_meth,
    pcr_primer_name_forward, pcr_primer_name_reverse, pcr_primer_reference,
    DNA_sequence, pcr_primer_forward, pcr_primer_reverse
  )
#Check fields
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 130 × 4
## level field row message
## <chr> <chr> <int> <chr>
## 1 error scientificNameID 54 Empty value for required field scientificNameID
## 2 error scientificNameID 55 Empty value for required field scientificNameID
## 3 error scientificNameID 56 Empty value for required field scientificNameID
## 4 error scientificNameID 184 Empty value for required field scientificNameID
## 5 error scientificNameID 185 Empty value for required field scientificNameID
## 6 error scientificNameID 186 Empty value for required field scientificNameID
## 7 error scientificNameID 187 Empty value for required field scientificNameID
## 8 error scientificNameID 188 Empty value for required field scientificNameID
## 9 error scientificNameID 189 Empty value for required field scientificNameID
## 10 error scientificNameID 190 Empty value for required field scientificNameID
## # ℹ 120 more rows
Create the EML file
This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.
## [1] "eml-2.1.1"
# Dataset title (title of the source publication)
title <- "Environmental DNA reveals tropical shark diversity in contrasting levels of anthropogenic impact"
# Alternate identifier: the IPT resource URL built from the dataset short name
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)
# Abstract
abstract <- eml$abstract(
  para = "Here we pilot a novel, rapid and non-invasive environmental DNA (eDNA) metabarcoding approach specifically targeted to infer shark presence, diversity and eDNA read abundance in tropical habitats. We identified at least 21 shark species, from both Caribbean and Pacific Coral Sea water samples, whose geographical patterns of diversity and read abundance coincide with geographical differences in levels of anthropogenic pressure and conservation effort. We demonstrate that eDNA metabarcoding can be effectively employed to study shark diversity."
)
People
Here we add the people involved in the project:
The creator is the person or organization responsible for creating the resource itself.
The contact is the person or institution to contact with questions about the use, interpretation of a data set.
The metadataProvider is the person responsible for providing the metadata documentation for the resource.
The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.
# Authors of the source publication, in order, as
# (given name, surname, organisation).
creator_details <- list(
  c("Judith", "Bakker", "University of Salford"),
  c("Owen S.", "Wangensteen", "University of Salford"),
  c("Demian D.", "Chapman", "Florida International University"),
  c("Germain", "Boussarie", "Université Montpellier"),
  c("Dayne", "Buddo", "University of the West Indies"),
  c("Tristan L.", "Guttridge", "Bimini Biological Field Station Foundation"),
  c("Heidi", "Hertler", "The SFS Centre for Marine Resource Studies"),
  c("David", "Mouillot", "Université Montpellier"),
  c("Laurent", "Vigliola", "Laboratoire d’Excellence Labex Corail"),
  c("Stefano", "Mariani", "University of Salford")
)
# One EML creator element per author, kept in the order listed above.
creator <- lapply(creator_details, function(person) {
  eml$creator(
    individualName = eml$individualName(
      givenName = person[1],
      surName = person[2]
    ),
    organizationName = person[3]
  )
})
# Point of contact for questions about the dataset. Built with the contact
# constructor (not the creator one) so the element matches its role, in line
# with metadataProvider and associatedParty, which use their own constructors.
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"
  ),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)
# The same person provides the metadata and acted as data curator
# (associated party with role "processor"), so the name element is built once.
curator_name <- eml$individualName(
  givenName = "Chandra",
  surName = "Earl"
)

metadataProvider <- eml$metadataProvider(
  individualName = curator_name,
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = curator_name,
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)
Additional Metadata
Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.
# GBIF citation template, for reference:
#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Read the clock once so that the timestamp and its UTC offset always come
# from the same instant.
eml_created_at <- Sys.time()
# "%z" yields the offset as +hhmm; it is rewritten below as +hh:mm.
utc_offset <- format(eml_created_at, "%z")
date_stamp <- paste0(
  format(eml_created_at, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Bakker, J., Wangensteen, O.S., Chapman, D.D. et al. Environmental DNA reveals tropical shark diversity in contrasting levels of anthropogenic impact. Sci Rep 7, 16886 (2017)."
      )
    )
  )
)
# DOI of the source publication; attached to the bibliography citation when
# the written EML file is edited afterwards.
citationdoi <- "https://doi.org/10.1038/s41598-017-17150-2"
Coverage
Here we describe the dataset’s geographic, taxonomic and temporal coverage.
#Coverage
# Bounding box of the site geometry, computed once and reused below.
site_bbox <- st_bbox(ind_shape)
coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Lagoons of New Caledonia",
    boundingCoordinates = eml$boundingCoordinates(
      # West is the minimum longitude and east the maximum; these two were
      # previously swapped.
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin
    )
  ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes"
      )
    )
    # Temporal coverage is disabled. NOTE(review): the begin date below is
    # later than the end date; fix the dates before re-enabling this.
    # ),
    # temporalCoverage = eml$temporalCoverage(
    #   rangeOfDates = eml$rangeOfDates(
    #     beginDate = eml$beginDate(
    #       calendarDate = "2019-05-01"
    #     ),
    #     endDate = eml$endDate(
    #       calendarDate = "2016-05-06"
    #     )
    #   )
  )
)
Extra MetaData
These fields are not required, though they make the metadata more complete.
# Methods: a pointer to the project repository and to the rendered notebook
# of this dataset.
methods_para <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name,
  "/",
  dataset_dir_name,
  "\"> R Notebook</a> for dataset construction methods"
)
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(para = methods_para)
  )
)

# Other data
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# No maintenance is planned, hence the empty description.
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

# Universal CC (CC0 1.0 public domain dedication)
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)

purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)
Create and Validate EML
# Assemble the EML document from the pieces built above.
package_id <- paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0")
dataset_element <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)
my_eml <- eml$eml(
  packageId = package_id,
  system = "http://gbif.org",
  scope = "system",
  dataset = dataset_element,
  additionalMetadata = additionalMetadata
)
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)
Create meta.xml file
This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_edna_template.xml”
# Load the meta.xml template and adjust the <field> entries that differ for
# this dataset: column indices for eventDate and country, and default values
# for geodeticDatum, occurrenceStatus and basisOfRecord. Fields with any other
# term are left untouched.
meta_template <- file.path(path_to_project_root, "scripts_data/meta_occurrence_edna_template.xml")
meta <- read_xml(meta_template)
fields <- xml_find_all(meta, "//d1:field")
for (field in fields) {
  switch(
    xml_attr(field, "term"),
    "http://rs.tdwg.org/dwc/terms/eventDate" = xml_set_attr(field, "index", "3"),
    "http://rs.tdwg.org/dwc/terms/country" = xml_set_attr(field, "index", "4"),
    "http://rs.tdwg.org/dwc/terms/geodeticDatum" = xml_set_attr(field, "default", geodeticDatum),
    "http://rs.tdwg.org/dwc/terms/occurrenceStatus" = xml_set_attr(field, "default", occurrenceStatus),
    "http://rs.tdwg.org/dwc/terms/basisOfRecord" = xml_set_attr(field, "default", basisOfRecord)
  )
}
Save outputs
# Write the four files of the Darwin Core Archive into the dataset's output
# directory.
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep = "/")
# The writers below do not create missing directories, so make sure the
# target exists before the first write.
if (!dir.exists(dwc_output_dir)) {
  dir.create(dwc_output_dir, recursive = TRUE)
}
write.csv(occurrence, file.path(dwc_output_dir, "occurrence.csv"), na = "", row.names = FALSE)
write.csv(dnaderivedextension, file.path(dwc_output_dir, "dnaderiveddata.csv"), na = "", row.names = FALSE)
write_xml(meta, file = file.path(dwc_output_dir, "meta.xml"))
write_eml(my_eml, file.path(dwc_output_dir, "eml.xml"))
Edit EML
We have to further edit the eml file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the children nodes to alphabetical order.
# Re-open the eml.xml that was just written and patch it in place for GBIF:
# adjust the root attributes, attach the DOI to the bibliography citation and
# reorder the children of the <gbif> element. The result overwrites eml.xml.
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))
# Change the schemaLocation and namespace attributes on the root element for
# GBIF. Setting an attribute to NULL removes it (xmlns:stmml).
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")
# Rearrange the children nodes under the GBIF element.
# Step 1: keep references to the three nodes that will be moved. `citation`
# is the direct child of <gbif>; `bibcitation` is the one inside
# <bibliography> and stays where it is.
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
# Attach the source publication's DOI to the bibliography citation.
xml_set_attr(bibcitation, "identifier", citationdoi)
# Step 2: detach the three nodes from the document.
eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
# Step 3: re-insert each node as the first child of <gbif> (.where=0). Because
# every insert goes to the front, the nodes are added in reverse of the wanted
# order, which ends up as dateStamp, hierarchyLevel, citation.
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)
# Overwrite the original eml.xml with the edited document.
write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))