Bundling Hollarsmith et al. 2020 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Hollarsmith, Jordan & Ramírez-Ortiz, Georgina & Winquist, Tallulah & Velasco-Lozano, Manuel & DuBois, Katherine & Reyes-Bonilla, Héctor & Neumann, Kyle & Grosholz, Edwin. (2020). Habitats and fish communities at mesophotic depths in the Mexican Pacific. Journal of Biogeography. 47.

Setup

Call the necessary libraries and variables. Suppresses loading messages.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file
suppressMessages(library(tidyr))

Input Parameters and Paths

# Path to the repository root; prefixed to the datasets/, scripts_data/ and
# output/ paths built further down.
path_to_project_root <- "../../.."
# Directory names identifying the site and the dataset under datasets/ and output/.
site_dir_name <- "archipielago_de_revillagigedo"
dataset_dir_name <- "Hollarsmith_et_al_2020"
# Left empty: no PDF is parsed for this dataset (the raw data tables are read instead).
original_pdf <- ""
# Dataset short name; used in the occurrenceID prefix and the IPT resource identifiers.
short_name <- "revillagigedo-hollarsmith-2020"

Parsing PDF table to CSV

We don’t have to do this since we have the raw data.

Read source data

Now we’ll read in the raw data tables.

species_csv <- "Data_Hollarsmith_et_al_Jbio2020.csv"
locality_csv <- "Site_summary_Hollarsmith_et_al_Jbio2020.csv"

# Both raw tables sit in datasets/<site>/<dataset>/raw below the project root.
raw_data_dir <- paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", sep = "/")

input_species_data <- read.csv(paste(raw_data_dir, species_csv, sep = "/"))
input_locality_data <- read.csv(paste(raw_data_dir, locality_csv, sep = "/"))

Preprocessing

Here we tidy the data up.

Tidy Data

# Left join: keep every species record and attach the matching site summary row.
input_data <- merge(input_species_data, input_locality_data, by = "Video_code", all.x = TRUE)

# Columns that are not carried into the Darwin Core table.
unused_columns <- c(
  "Video_code", "Video_name.x", "Video_name.y", "Group", "Code", "Pilot",
  "Temp_avg", "Substrate_abiotic", "Encrusting.red", "Filamentous.algae",
  "Rhodolith", "Red.blade", "Palmophyllum"
)
input_data <- input_data %>%
  select(-all_of(unused_columns))

cleaned_data <- input_data

# Preview the first rows as a formatted table
knitr::kable(head(cleaned_data))
Location Observer Species Latitude Longitude Depth_avg Habitat_biotic
LaPaz Manuel #N/A 24.32603 -109.742 83 Burrows
LaPaz Manuel #N/A 24.32603 -109.742 83 Burrows
LaPaz Alexia Muraena argus 24.32603 -109.742 85 Burrows
LaPaz Manuel Muraena argus 24.32603 -109.742 85 Burrows
LaPaz Manuel Seriola rivoliana 24.32603 -109.742 85 Burrows
LaPaz Alexia Liopropoma fasciatum 24.32603 -109.742 85 Sponge

Get WoRMS IDs

Auto matching

First we will try to do this automatically by cleaning the species names with gnparser and then using the taxize library to query the WoRMS database.

# Parse the verbatim species names (strips authorship and yields canonical forms).
# The Species column is selected by name rather than by position so that a
# change in column order upstream cannot silently feed another column to the parser.
parsed_names <- rgnparser::gn_parse(cleaned_data[["Species"]])

# Look up the WoRMS ID for one parsed name.
#   element: one entry of the gn_parse() result; its canonical$full name is queried.
# Accepted names are searched first; if nothing is found the search is repeated
# without the accepted-only restriction. Returns the get_wormsid() result of the
# first search that finds something, or NA when both searches report "not found".
get_worms_id_from_element <- function(element) {
  taxon_name <- element$canonical$full

  accepted_hit <- get_wormsid(taxon_name, searchtype = "scientific", fuzzy = TRUE, messages = FALSE, accepted = TRUE)
  if (attr(accepted_hit, "match") != "not found") {
    return(accepted_hit)
  }

  any_hit <- get_wormsid(taxon_name, searchtype = "scientific", messages = FALSE, fuzzy = TRUE)
  if (attr(any_hit, "match") != "not found") {
    return(any_hit)
  }

  NA
}

# Resolve a WoRMS ID for every name the parser could handle; unparsed names get NA.
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (!parsed_name$parsed) {
    return(NA)
  }
  get_worms_id_from_element(parsed_name)
})
## 
##       id                           target                  authority   status
## 1 513266              Eucidaris thouarsii (L. Agassiz & Desor, 1846) accepted
## 2 513653 Eucidaris thouarsii galapagensis          (Döderlein, 1887) accepted
## 3 608134    Eucidaris thouarsii thouarsii (L. Agassiz & Desor, 1846) accepted
## 
## More than one WORMS ID found for taxon 'Eucidaris thouarsi'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                           target                  authority   status
## 1 513266              Eucidaris thouarsii (L. Agassiz & Desor, 1846) accepted
## 2 513653 Eucidaris thouarsii galapagensis          (Döderlein, 1887) accepted
## 3 608134    Eucidaris thouarsii thouarsii (L. Agassiz & Desor, 1846) accepted
## 
## More than one WORMS ID found for taxon 'Eucidaris thouarsi'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
# Combine the original record, the parsed canonical name and the WoRMS ID into
# one data frame with one row per input record.
# Rows are built with seq_len() (safe when cleaned_data has zero rows) and bound
# in a single rbind call instead of growing the data frame inside the loop.
combined_rows <- lapply(seq_len(nrow(cleaned_data)), function(i) {
  canonical_value <- parsed_names[[i]]$canonical$full
  if (is.null(canonical_value)) {
    # Names the parser could not handle have no canonical form
    canonical_value <- NA
  }
  data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    WormsIDs = worms_ids[[i]][1]
  )
})

if (length(combined_rows) > 0) {
  combined_dataframe <- do.call(rbind, combined_rows)
} else {
  combined_dataframe <- data.frame()
}

knitr::kable(head(combined_dataframe))
CleanedData.Location CleanedData.Observer CleanedData.Species CleanedData.Latitude CleanedData.Longitude CleanedData.Depth_avg CleanedData.Habitat_biotic CanonicalFull WormsIDs
LaPaz Manuel #N/A 24.32603 -109.742 83 Burrows NA NA
LaPaz Manuel #N/A 24.32603 -109.742 83 Burrows NA NA
LaPaz Alexia Muraena argus 24.32603 -109.742 85 Burrows Muraena argus 271896
LaPaz Manuel Muraena argus 24.32603 -109.742 85 Burrows Muraena argus 271896
LaPaz Manuel Seriola rivoliana 24.32603 -109.742 85 Burrows Seriola rivoliana 126818
LaPaz Alexia Liopropoma fasciatum 24.32603 -109.742 85 Sponge Liopropoma fasciatum 275942

Human Verification

Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search for unmatched species in WoRMS and add the ID, and remove rows that were not autoremoved in the earlier cleaning steps

# Manual name/ID corrections. Each entry lists the row positions to patch and the
# replacement written into columns 8:9 (CanonicalFull, WormsIDs). Row positions
# refer to combined_dataframe BEFORE the duplicate rows are dropped below.
manual_matches <- list(
  list(
    rows = c(
      8, 18, 23, 29, 30, 37, 62, 65, 146, 149, 164, 169, 180, 183, 185, 194,
      197, 211, 223, 321, 329, 339, 395, 481, 539, 540, 541, 547, 550, 555,
      679, 684, 702, 727, 793
    ),
    value = c("Prognathodes falcifer", 273374)
  ),
  list(
    rows = c(125, 129, 287, 363, 373, 376, 377, 379, 384, 385),
    value = c("Pentaceraster cumingi", 370899)
  ),
  list(rows = 201, value = c("Holothuria", 123456)),
  list(rows = 714, value = c("Eucidaris thouarsii", 513266))
)

for (manual_match in manual_matches) {
  for (row_index in manual_match$rows) {
    combined_dataframe[row_index, 8:9] <- manual_match$value
  }
}

# Drop the rows flagged by hand as duplicates (by row position)
duplicate_rows <- c(1, 2, 167, 190, 558, 579, 600, 613, 617, 644, 661, 716, 294, 317, 354, 479, 612)
combined_dataframe <- combined_dataframe[-duplicate_rows, ]

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

Create a dataframe of distinct occurrence records, i.e. one row per unique combination of location, observer, coordinates, depth, habitat and taxon. This will be our primary DarwinCore data frame.

# Keep distinct records, rename the name/ID columns to their Darwin Core terms
# and turn each WoRMS ID into an LSID (missing IDs stay NA).
lsid_prefix <- "urn:lsid:marinespecies.org:taxname:"

occurrence <- combined_dataframe %>%
  distinct(
    CleanedData.Location, CleanedData.Observer, CleanedData.Latitude,
    CleanedData.Longitude, CleanedData.Depth_avg, CleanedData.Habitat_biotic,
    CanonicalFull, WormsIDs
  ) %>%
  rename(scientificName = CanonicalFull, scientificNameID = WormsIDs) %>%
  mutate(scientificNameID = ifelse(is.na(scientificNameID), NA, paste0(lsid_prefix, scientificNameID)))

occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name together with the location, observer, coordinates, depth and habitat of the record.

# digest() hashes its whole argument as one object, so wrap it with Vectorize()
# to get one hash per element of a vector.
vdigest <- Vectorize(digest)

# Generate occurrenceID: "<short_name>:occurrence:<md5 of the record's fields>"
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(
        paste(
          scientificName, CleanedData.Location, CleanedData.Observer,
          CleanedData.Latitude, CleanedData.Longitude, CleanedData.Depth_avg,
          CleanedData.Habitat_biotic
        ),
        algo = "md5"
      ),
      sep = ":"
    )
  )

eventDate

eventDate is assigned per location as a one-month ISO 8601 interval: 2018-10-01/2018-10-31 for La Paz records and 2018-12-01/2018-12-31 for all other records.

# One-month date interval per location: October 2018 for La Paz, December 2018 otherwise.
la_paz_dates <- "2018-10-01/2018-10-31"
other_dates <- "2018-12-01/2018-12-31"
occurrence %<>%
  mutate(eventDate = ifelse(CleanedData.Location == "LaPaz", la_paz_dates, other_dates))

decimalLongitude/decimalLatitude

Locality data was retrieved via georeferencing the included site maps from the paper. These maps have been saved as TIFs and points saved as a csv. First we will use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. See above.

The calculations below are used to calculate the boundaries for the EML file.

# Local copy of the Marine World Heritage site polygons; downloaded once if absent.
gpkg_path <- paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep = "/")

if (!file.exists(gpkg_path)) {
  # A GeoPackage is a binary file: mode = "wb" keeps the download intact on
  # Windows, where the default text mode would corrupt it.
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}

shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326
# The GeoPackage can hold several features per site (core and buffer areas);
# grouping by site name and summarizing merges them into one geometry per site.
shapes_processed <- summarize(group_by(shapes, name))

# Geometry of the Archipiélago de Revillagigedo site
site_position <- which(shapes_processed$name == "Archipiélago de Revillagigedo")
ind_shape <- shapes_processed$geom[site_position]


# The coordinates carried over from the source table become the Darwin Core coordinate terms.
occurrence %<>%
  rename(
    decimalLatitude = CleanedData.Latitude,
    decimalLongitude = CleanedData.Longitude
  )

occurrenceStatus

# Same value for every record; the variable is reused when filling in meta.xml.
occurrenceStatus <- "present"
occurrence <- occurrence %>%
  mutate(occurrenceStatus = occurrenceStatus)

basisOfRecord

# Same value for every record; the variable is reused when filling in meta.xml.
basisOfRecord <- "HumanObservation"
occurrence <- occurrence %>%
  mutate(basisOfRecord = basisOfRecord)

Extra Terms

footprintWKT

coordinateUncertaintyInMeters

geodeticDatum

# Same value for every record; the variable is reused when filling in meta.xml.
geodeticDatum <- "WGS84"
occurrence <- occurrence %>%
  mutate(geodeticDatum = geodeticDatum)

country

# Same value for every record; the variable is reused when filling in meta.xml.
country <- "Mexico"
occurrence <- occurrence %>%
  mutate(country = country)

locality

# Translate the source location code into a readable locality, then drop the code column.
occurrence %<>%
  mutate(
    locality = ifelse(
      CleanedData.Location == "LaPaz",
      "Bay of La Paz",
      "Revillagigedo National Park"
    )
  ) %>%
  select(-CleanedData.Location)

recordedBy

# Observer codes used in the source table and the full names they stand for.
# Codes missing from this table (and missing values) yield NA.
observer_names <- c(
  "Alexia" = "Alexia Uribe",
  "Benjamin" = "Benjamín Garza",
  "Hollarsmith" = "Jordan A. Hollarsmith",
  "KCN" = "Kyle C. Neumann",
  "Manuel" = "Manuel Velasco-Lozano",
  "PAUL PRECIADO" = "Paul Preciado-González",
  "Twinquist" = "Tallulah Winquist"
)

occurrence %<>%
  mutate(recordedBy = unname(observer_names[as.character(CleanedData.Observer)])) %>%
  select(-CleanedData.Observer)

verbatimDepth

# The average depth column is exposed as verbatimDepth.
occurrence %<>% rename(verbatimDepth = CleanedData.Depth_avg)

habitat

# The biotic habitat column is exposed as habitat.
occurrence %<>% rename(habitat = CleanedData.Habitat_biotic)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

# Final column order of occurrence.csv (eventDate ends up at zero-based index 3)
dwc_column_order <- c(
  "occurrenceID", "scientificName", "scientificNameID", "eventDate", "country",
  "locality", "decimalLatitude", "decimalLongitude", "verbatimDepth", "habitat",
  "recordedBy", "geodeticDatum", "occurrenceStatus", "basisOfRecord"
)
occurrence <- occurrence %>%
  select(all_of(dwc_column_order))

# Report missing required fields and empty required values
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 8 × 4
##   level field              row message                                        
##   <chr> <chr>            <int> <chr>                                          
## 1 error decimalLongitude   259 Empty value for required field decimalLongitude
## 2 error decimalLongitude   260 Empty value for required field decimalLongitude
## 3 error decimalLongitude   261 Empty value for required field decimalLongitude
## 4 error decimalLongitude   262 Empty value for required field decimalLongitude
## 5 error decimalLatitude    259 Empty value for required field decimalLatitude 
## 6 error decimalLatitude    260 Empty value for required field decimalLatitude 
## 7 error decimalLatitude    261 Empty value for required field decimalLatitude 
## 8 error decimalLatitude    262 Empty value for required field decimalLatitude

Create the EML file

This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.

# Set the EML schema version to 2.1.1 before the EML objects are built.
emld::eml_version("eml-2.1.1")
## [1] "eml-2.1.1"
# Dataset title (title of the source publication)
title <- "Habitats and fish communities at mesophotic depths in the Mexican Pacific"

# Alternate identifier: the IPT resource URL for this dataset's short name
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)

# Abstract
abstract_text <- "Mesophotic ecosystems, found at the limit of light penetration in the ocean, are rich in biodiversity and harbour unique ecological communities. However, they remain among the least studied habitat zones on earth due to the high costs and technological limitations. Here, we characterize mesophotic communities in two marine reserves across a range of habitat types, depths and temperatures using submersible technologies, with the goal of understanding the processes that structure these communities across biogeographical regions."
abstract <- eml$abstract(para = abstract_text)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use or interpretation of a data set.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

# Build one EML creator entry from a given name, surname and organization.
make_creator <- function(given_name, sur_name, organization) {
  eml$creator(
    individualName = eml$individualName(
      givenName = given_name,
      surName = sur_name),
    organizationName = organization
  )
}

# Authors of the source publication, in author order.
creator <- list(
  make_creator("Jordan A.", "Hollarsmith", "University of California Bodega Marine Laboratory"),
  make_creator("Georgina", "Ramírez-Ortiz", "Centro de Investigaciones Biológicas del Noroeste"),
  make_creator("Tallulah", "Winquist", "University of California Bodega Marine Laboratory"),
  make_creator("Manuel", "Velasco-Lozano", "Universidad Autónoma de Baja California Sur"),
  make_creator("Katherine", "DuBois", "University of California Bodega Marine Laboratory"),
  make_creator("Héctor", "Reyes-Bonilla", "Universidad Autónoma de Baja California Sur"),
  make_creator("Kyle C.", "Neumann", "University of California"),
  make_creator("Edwin D.", "Grosholz", "University of California Bodega Marine Laboratory")
)


# Contact for questions about the dataset. Built with the contact constructor
# (the original used eml$creator), so the object matches the element it is
# assigned to in eml$dataset(contact = ...).
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS", 
    surName = "Secretariat"),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# The same person provides the metadata and acts as data curator ("processor"),
# so the shared details are defined once.
curator_name <- eml$individualName(
  givenName = "Chandra", 
  surName = "Earl")
curator_email <- "c.earl@unesco.org"
curator_organization <- "UNESCO"
curator_position <- "eDNA Scientific Officer"

metadataProvider <- eml$metadataProvider(
  individualName = curator_name,
  electronicMailAddress = curator_email,
  organizationName = curator_organization,
  positionName = curator_position
)

associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = curator_name,
  electronicMailAddress = curator_email,
  organizationName = curator_organization,
  positionName = curator_position
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Creation timestamp of the EML, e.g. 2023-10-15T12:34:56.789+02:00.
# The clock is read once so that the date/time part and the UTC offset always
# come from the same instant (the original called Sys.time() three times).
eml_created_at <- Sys.time()
utc_offset <- format(eml_created_at, "%z")  # "+0200" style, reshaped to "+02:00" below
eml_date_stamp <- paste0(
  format(eml_created_at, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = eml_date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Hollarsmith, Jordan & Ramírez-Ortiz, Georgina & Winquist, Tallulah & Velasco-Lozano, Manuel & DuBois, Katherine & Reyes-Bonilla, Héctor & Neumann, Kyle & Grosholz, Edwin. (2020). Habitats and fish communities at mesophotic depths in the Mexican Pacific. Journal of Biogeography. 47.")
    )
  )
)

# DOI of the source publication; attached to the bibliography citation when the EML is post-processed.
citationdoi <- "http://dx.doi.org/10.1111/jbi.13842"

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

# Geographic, taxonomic and temporal coverage of the dataset.
# The bounding box comes from the site polygon. West is the minimum x
# (longitude) and east the maximum; the original had the two swapped.
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Archipiélago de Revillagigedo",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin)
    ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes")
      )
    
  ),
  # Spans the earliest and latest eventDate intervals assigned to the records
  temporalCoverage = eml$temporalCoverage(
    rangeOfDates = eml$rangeOfDates(
      beginDate = eml$beginDate(
        calendarDate = "2018-10-01"
      ),
      endDate = eml$endDate(
        calendarDate = "2018-12-31"
      )
    )
   )
)

Extra MetaData

These fields are not required, though they make the metadata more complete.

# Methods: point to the project repository and the rendered notebook for this dataset.
methods_text <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name, "/", dataset_dir_name,
  "\"> R Notebook</a> for dataset construction methods"
)
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(para = methods_text)
  )
)

# Publication date of this dataset
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# No maintenance planned; the description is intentionally left empty.
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

# Licence statement (CC0 1.0)
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)

purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)

Create and Validate EML

# Assemble the dataset element from the pieces defined above ...
eml_dataset <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)

# ... and wrap it, together with the GBIF additional metadata, in the EML root.
my_eml <- eml$eml(
  packageId = paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0"),
  system = "http://gbif.org",
  scope = "system",
  dataset = eml_dataset,
  additionalMetadata = additionalMetadata
)

# Validate the assembled document
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”.

meta_template <- paste(path_to_project_root, "scripts_data/meta_occurrence_occurrence_template.xml", sep="/")
meta <- read_xml(meta_template)

fields <- xml_find_all(meta, "//d1:field")

# Terms whose value is constant for the whole dataset: written as a default on the field.
constant_terms <- c(
  "http://rs.tdwg.org/dwc/terms/country" = country,
  "http://rs.tdwg.org/dwc/terms/geodeticDatum" = geodeticDatum,
  "http://rs.tdwg.org/dwc/terms/occurrenceStatus" = occurrenceStatus,
  "http://rs.tdwg.org/dwc/terms/basisOfRecord" = basisOfRecord
)

for (field in fields) {
  term <- xml_attr(field, "term")
  if (term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    # eventDate is read from a column: zero-based index 3 of occurrence.csv
    xml_set_attr(field, "index", "3")
  } else if (term %in% names(constant_terms)) {
    xml_set_attr(field, "default", constant_terms[[term]])
  }
}

Save outputs

# Directory that receives the three DwC-A components for this dataset.
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep="/")

# Create the directory (and any missing parents) first; on a fresh checkout it
# does not exist yet and the writes below would fail.
if (!dir.exists(dwc_output_dir)) {
  dir.create(dwc_output_dir, recursive = TRUE)
}

# Missing values are written as empty strings in occurrence.csv
write.csv(occurrence, paste(dwc_output_dir, "/occurrence.csv", sep = ""), na = "", row.names=FALSE)
write_xml(meta, file = paste(dwc_output_dir, "/meta.xml", sep = ""))
write_eml(my_eml, paste(dwc_output_dir, "/eml.xml", sep = ""))

Edit EML

We have to further edit the eml file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the children nodes to alphabetical order.

# Re-read the eml.xml written earlier, adjust the root attributes and reorder the
# children of the gbif element, then overwrite the file in place.
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))

# Change the schemaLocation and namespace attributes on the root for GBIF
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
# A NULL value removes the attribute (xml2 semantics)
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


# Rearrange the children nodes under the GBIF element.
# First capture the nodes that will be moved ...
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
# The bibliography citation stays where it is; it only gains the DOI as identifier
xml_set_attr(bibcitation, "identifier", citationdoi)

# ... then remove them from the document and re-insert each one as the first
# child of gbif. Since every insert goes to the front, inserting citation,
# hierarchyLevel and dateStamp in that order leaves them ordered
# dateStamp, hierarchyLevel, citation, ahead of the remaining children.
eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)

# Overwrite eml.xml with the edited document
write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))

Zip files to DwC-A

output_zip <- paste(dwc_output_dir, "DwC-A.zip", sep="/")

# Remove an archive left over from a previous run so it is not zipped into the new one.
if (file.exists(output_zip)) unlink(output_zip)

# Everything currently in the output directory goes into the archive; "cherry-pick"
# mode is passed so the files are stored without their directory structure.
file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip::zip(zipfile = output_zip, files = file_paths, mode = "cherry-pick")

# Once the archive exists, delete the loose files that were packed into it.
if (file.exists(output_zip)) unlink(file_paths)