ISA DeepData

This notebook processes data from the International Seabed Authority (ISA) DeepData database into Darwin Core archives. The resulting datasets are hosted at https://datasets.obis.org/hosted/isa/index.html.

The code for this notebook is hosted at https://github.com/iobis/notebook-deepdata.

Data flow

Reading the data from S3

The DeepData dataset is delivered to OBIS via S3. Credentials are stored in env.txt. Earlier versions of the file were encoded in ISO-8859-1 (non-standard for JSON), so the file had to be read with readLines before it could be parsed with the jsonlite package, but that has since been fixed.

require(RCurl)
library(stringr)
library(dplyr)
library("aws.s3")

# Read the environment variables defined in env.txt (the S3 credentials), then
# fetch the DeepData export from the obis-datasets bucket.
readRenviron("env.txt")
json <- get_object(
  object = "uploads/isa/deepdata.json",
  bucket = "obis-datasets",
  as = "text"
)

Parsing the JSON file

library(jsonlite)
library(purrr)

# Parse the JSON text and keep its DEEPDATA element as a tibble.
records <- as_tibble(
  fromJSON(json, simplifyDataFrame = TRUE)$DEEPDATA
)

Generating Darwin Core data files

We can now extract a list of distinct datasets from the data frame, and generate a Darwin Core archive for each dataset. Let’s first generate dataset identifiers from the dataset titles for later use in the dataset URLs. To account for possible future changes to dataset titles, I’m removing or fixing some words in the titles. The result should be that identifiers do not change when typos are fixed in the future.

library(knitr)

# Distinct dataset titles, in order of first appearance in the records.
titles <- unique(records$Metadata$title)

# Ordered corrections (regex pattern = replacement). Each one is applied to the
# first match in a title only. Known typos are normalised and the word
# "template" is dropped, so identifiers stay the same if titles are corrected
# upstream later.
title_fixes <- c(
  "meiofaun$" = "meiofauna",
  "templaye" = "template",
  "template" = "",
  "biodiveristy" = "biodiversity",
  "macrfaun$" = "macrofauna",
  "meofauna" = "meiofauna",
  "meiobent$" = "meiobenthos",
  "-" = " "
)

identifiers <- tolower(titles)
for (pattern in names(title_fixes)) {
  identifiers <- str_replace(identifiers, pattern, title_fixes[[pattern]])
}

# Collapse runs of whitespace and join the remaining words with underscores.
identifiers <- str_replace_all(str_squish(identifiers), "\\s", "_")

# Two different titles must never collapse onto the same identifier.
stopifnot(length(unique(titles)) == length(unique(identifiers)))
records$dataset_id <- identifiers[match(records$Metadata$title, titles)]

kable(data.frame(titles, identifiers))

titles	identifiers
BGRPMN12017 Biodiveristy	bgrpmn12017_biodiversity
BGRPMN12017 Env Template MANGAN2014	bgrpmn12017_env_mangan2014
BGRPMN12017 Env Template FLUM	bgrpmn12017_env_flum
GSRPMN12017 Env Template BIOSO239	gsrpmn12017_env_bioso239
UKSRLPMN12019 NHM-NORCE	uksrlpmn12019_nhm_norce
UKSRLPMN12019 NOC NERC	uksrlpmn12019_noc_nerc
KOREAPMN12019 Macrofauna2018	koreapmn12019_macrofauna2018
KOREAPMN12019 macrofauna2019	koreapmn12019_macrofauna2019
GSRPMN12019 P03-MBL 1	gsrpmn12019_p03_mbl_1
COMRACFRC120015 Env Template 2015 demersal scavenger	comracfrc120015_env_2015_demersal_scavenger
COMRACFRC120015 Env Template 2015 macrofauna	comracfrc120015_env_2015_macrofauna
GSRPMN12019 P03-MBL 2	gsrpmn12019_p03_mbl_2
OMSPMN12019 NHM Env	omspmn12019_nhm_env
BGRPMS12015 ROV Biology	bgrpms12015_rov_biology
COMRACRFC12017 Env Template DY36 meiofauna	comracrfc12017_env_dy36_meiofauna
COMRACRFC12016 Env Template DY37 megafauna	comracrfc12016_env_dy37_megafauna
KOREAPMN12010 Env Template 2010 biomass	koreapmn12010_env_2010_biomass
KOREAPMN12011 Env Template 2011 abundance	koreapmn12011_env_2011_abundance
KOREAPMN12013 Env Template 2013 abundance	koreapmn12013_env_2013_abundance
RUSMNRCRFC12015 Env Template Biodata	rusmnrcrfc12015_env_biodata
JOGMECCRFC12018 Env Template HK17 NEMA	jogmeccrfc12018_env_hk17_nema
COMRAPMS12017 Env Template phytoplankton	comrapms12017_env_phytoplankton
COMRAPMS12017 Env Template zooplankton	comrapms12017_env_zooplankton
JOGMECCRFC12018 Env Template JK18 picoplankton	jogmeccrfc12018_env_jk18_picoplankton
JOGMECCRFC12018 Env Template HK17-01 phyto	jogmeccrfc12018_env_hk17_01_phyto
COMRAPMS12016 Env Template phytoplanton	comrapms12016_env_phytoplanton
IOMPMN12018 Env Template BIOL	iompmn12018_env_biol
COMRAPMS12018 zooplankton	comrapms12018_zooplankton
COMRAPMN12016 Sequences	comrapmn12016_sequences
COMRAPMN12018 Lander2017	comrapmn12018_lander2017
UKSRLPMN12015 Env Template Scavengers 032016	uksrlpmn12015_env_scavengers_032016
IFREMERPMN12015 AR ENV	ifremerpmn12015_ar_env
IFREMERPMN12018 Nodinaut Nematoda Copepoda	ifremerpmn12018_nodinaut_nematoda_copepoda
UKSRLPMN12016 Senkenberg	uksrlpmn12016_senkenberg
UKSRLPMN12016 Megafauna	uksrlpmn12016_megafauna
UKSRLPMN12016 AB02 NOCS	uksrlpmn12016_ab02_nocs
UKSRLPMN12017 Senkenberg Macrofauna	uksrlpmn12017_senkenberg_macrofauna
YUZHPMN12015 Biodata B6	yuzhpmn12015_biodata_b6
YUZPMN12016 Biodata	yuzpmn12016_biodata
UKSRLPMN12017 NOC NERC	uksrlpmn12017_noc_nerc
DORDPMN12018 Env Mn Bio	dordpmn12018_env_mn_bio
GSRPMN12016 MarBiol UGent	gsrpmn12016_marbiol_ugent
BGRPMS12020 2019 DR Biology	bgrpms12020_2019_dr_biology
OMSPMN12018 Env NHM NORCE	omspmn12018_env_nhm_norce
UKSRLPMN12016 NHM-UNI	uksrlpmn12016_nhm_uni
OMSPMN12017 Macro Senckenberg	omspmn12017_macro_senckenberg
OMSPMN12017 Scavengers Senckenberg	omspmn12017_scavengers_senckenberg
KOREAPMN12012 Env Template 2012 abundance	koreapmn12012_env_2012_abundance
BGRPMS12020 2019 GC Biology	bgrpms12020_2019_gc_biology
KOREAPMN12012 Env Template 2012 biomass	koreapmn12012_env_2012_biomass
KOREAPMN12012 Env Template 2012 macrofauna	koreapmn12012_env_2012_macrofauna
JOGMECCRFC12016 Env Template 2016 Edokko data	jogmeccrfc12016_env_2016_edokko_data
JOGMECCRFC12016 Env Template 2016 nematoda DNA	jogmeccrfc12016_env_2016_nematoda_dna
JOGMECCRFC12016 Env Template 2016 ROV	jogmeccrfc12016_env_2016_rov
JOGMECCRFC120017 Env Template 2017 abundance	jogmeccrfc120017_env_2017_abundance
DORDPMN12016 Mn2016 ENV	dordpmn12016_mn2016_env
BGRPMN12017 Env Template BIONOD2012	bgrpmn12017_env_bionod2012
BGRPMN12017 Env Template ECORESPONSE	bgrpmn12017_env_ecoresponse
BGRPMN12017 Env Template MANGAN2010	bgrpmn12017_env_mangan2010
BGRPMN12017 Env Template MANGAN2013	bgrpmn12017_env_mangan2013
BGRPMN12017 Env Template MANGAN2016	bgrpmn12017_env_mangan2016
IOMPMN12015 Env Template annex 1	iompmn12015_env_annex_1
IOMPMN12015 Env Template annex 11	iompmn12015_env_annex_11
KOREAPMN12014 Env Template 2014 abundance	koreapmn12014_env_2014_abundance
KOREAPMN12014 Env Template 2014 megafauna	koreapmn12014_env_2014_megafauna
BGRPMS12020 2019 WC Biology	bgrpms12020_2019_wc_biology
KOREAPMN12014 Env Template 2014 biomass	koreapmn12014_env_2014_biomass
KOREAPMN12011 Env Template 2011 biomass	koreapmn12011_env_2011_biomass
COMRAPMS12016 Env Template phytoplankton	comrapms12016_env_phytoplankton
KOREAPMN12012 Meiofauna	koreapmn12012_meiofauna
IOMPMN12014 Env Bio	iompmn12014_env_bio
DORDPMN12020 Env	dordpmn12020_env
GSRPMN12020 MarBiol UGent	gsrpmn12020_marbiol_ugent
IFREMERPMN12018 Polynoids	ifremerpmn12018_polynoids
NORIPMN12022 Env Template BIO	noripmn12022_env_bio
COMRAPMN12014 Env Template W1101	comrapmn12014_env_w1101
COMRAPMN12014 Env Template W1102	comrapmn12014_env_w1102
COMRAPMN12014 Env Template WS1102	comrapmn12014_env_ws1102
COMRAPMN12014 Env Template WS1104	comrapmn12014_env_ws1104
UKSRLPMN12015 Env Template AB01 NHM	uksrlpmn12015_env_ab01_nhm
COMRAPMS12015 ENV	comrapms12015_env
COMRAPMS12018 Phytoplankton	comrapms12018_phytoplankton
UKSRLPMN12015 Env Template Macrofauna 032016	uksrlpmn12015_env_macrofauna_032016
UKSRLPMN12015 Env Template Senckenberg 032016	uksrlpmn12015_env_senckenberg_032016
UKSRLPMN12015 Env Templaye Megafauna 032016	uksrlpmn12015_env_megafauna_032016
UKSRLPMN12015 Env Template GG 032020163	uksrlpmn12015_env_gg_032020163
IFREMERPMN12018 SO239	ifremerpmn12018_so239
OMSPMN12018 NUS Data	omspmn12018_nus_data
UKSRLPMN12017 Senkenberg Meofauna	uksrlpmn12017_senkenberg_meiofauna
COMRAPMS12015 Env Template Meiobent	comrapms12015_env_meiobenthos
IFREMERPMN12017 Env Template BIO1 2017	ifremerpmn12017_env_bio1_2017
COMRACRFC12017 Env Template DY29 zooplankton	comracrfc12017_env_dy29_zooplankton

Extracting occurrence data

Let’s first create a new ID column, this will be used later to link together the measurements and occurrences, and to select records by dataset. We cannot use occurrenceID here because these are not unique within the dataset.

library(uuid)

# Give every record its own generated UUID and verify there are no collisions
# before the ids are used to link occurrences and measurements.
record_count <- nrow(records)
records$id <- UUIDgenerate(use.time = NA, n = record_count)
stopifnot(length(unique(records$id)) == record_count)

Now we can select and process the columns that will go into the occurrence table.

# Build the flat occurrence table from the nested records.
#
# Keeps the id and dataset_id columns plus the nested Occurrence, Event,
# Location, Identification, Record-level and Taxon groups, flattens the nested
# data frames into plain columns, and strips the group prefix (everything up to
# the last dot) from the resulting column names.
#
# df: data frame of records with the nested groups listed above.
# Returns a tibble with one row per record.
extract_occurrences <- function(df) {
  occurrence_groups <- c(
    "id", "dataset_id", "Occurrence", "Event", "Location",
    "Identification", "Record-level", "Taxon"
  )
  nested <- select(df, all_of(occurrence_groups))
  flat <- rename_with(
    jsonlite::flatten(nested),
    ~ str_replace(.x, ".*\\.", "")
  )
  as_tibble(flat)
}

occ <- extract_occurrences(records)

Initial cleanup of occurrence data

First clean up any escaped newlines, empty strings, and placeholder values. Also fix basisOfRecord and convert coordinates to numeric values:

library(stringr)

# Values that carry no information and are treated as missing.
placeholder_values <- c("indet", "Not Reported", "indet.")

# Clean every occurrence column as text:
#   1. drop newline characters (gsub also converts each column to character),
#   2. squish whitespace,
#   3. turn empty strings into NA,
#   4. turn placeholder values into NA.
# Squishing happens BEFORE the empty-string check so that whitespace-only
# cells become NA rather than "" (the same order extract_mof uses).
# Finally, basisOfRecord is overwritten and the coordinates are made numeric.
occ <- occ %>%
  mutate(across(everything(), ~ gsub("\\n", "", .x))) %>%
  mutate(across(everything(), str_squish)) %>%
  mutate(across(everything(), ~ na_if(.x, ""))) %>%
  mutate(across(everything(), ~ replace(.x, .x %in% placeholder_values, NA))) %>%
  mutate(
    basisOfRecord = "HumanObservation",
    decimalLongitude = as.numeric(decimalLongitude),
    decimalLatitude = as.numeric(decimalLatitude)
  )

Let’s check for coordinate issues:

robis::map_ggplot(occ)

Let’s take a look at scientificName and scientificNameID.

# Number of records per scientificName, most frequent first.
occ %>%
  count(scientificName, name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()

# Number of records per scientificNameID, most frequent first.
occ %>%
  count(scientificNameID, name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()

So at least in the current version at the time of writing (June 2021) there are some quality issues for scientificName.

Fixing taxonomy

Let’s try to clean up the scientific names before we do taxon matching with WoRMS. Here I’m using the gni_parse() function from the taxize package, which connects to the GNI name parser. If a name cannot be parsed, I’m keeping the original.

The first step is to create a list of all distinct names in the taxonomy columns.

# Taxonomy columns that take part in name matching.
taxonomy <- select(occ, phylum, class, order, family, genus, scientificName)

# Every distinct, non-missing value found anywhere in those columns.
distinct_values <- unique(unlist(taxonomy))
names <- na.omit(distinct_values)

Then pass through the name parser:

library(taxize)

# Parse a single name with gni_parse() and return its canonical form.
#
# name: one name (character scalar).
# Returns a character scalar: the canonical name when the parser yields
# exactly one row with a usable `canonical` value, otherwise `name` unchanged.
# Any parser error also falls back to `name`, so this function never fails.
clean_name <- function(name) {
  tryCatch(
    {
      res <- gni_parse(name)
      stopifnot(nrow(res) == 1)
      canonical <- res$canonical[1]
      # A missing or NA canonical form counts as "could not be parsed".
      if (is.null(canonical) || is.na(canonical)) {
        name
      } else {
        as.character(canonical)
      }
    },
    error = function(cond) {
      name
    }
  )
}

# Clean every distinct name. vapply() guarantees a character vector named by
# the original names, even for empty input or an unexpected return value
# (sapply() would silently switch to a list in those cases).
names_clean <- vapply(
  names,
  function(name) as.character(clean_name(name))[1],
  character(1)
)

Now use the cleaned names for taxon matching:

library(worrms)

# Look up a name with wm_records_names() and return the LSID of its first
# exact match.
#
# name: one name (character scalar).
# Returns a character scalar: the `lsid` of the first record whose match_type
# is "exact" or "exact_genus", or NA_character_ when there is no such record
# or the lookup raises an error. A message is emitted when several exact
# matches exist; the first one is used.
match_name <- function(name) {
  tryCatch(
    {
      res <- wm_records_names(name)
      candidates <- res[[1]]
      is_exact <- candidates$match_type %in% c("exact", "exact_genus")
      matches <- candidates[is_exact, , drop = FALSE]
      if (nrow(matches) > 1) {
        message(paste0("Multiple exact matches for ", name))
      }
      # as.character(...)[1] yields NA_character_ for zero rows or no lsid.
      as.character(matches$lsid)[1]
    },
    error = function(cond) {
      # Lookup errors are deliberately treated as "no match".
      NA_character_
    }
  )
}

# Match each distinct cleaned name once and key the result by the CLEANED
# name. names_clean carries the original names as its names attribute, and
# sapply()/vapply() would reuse those as keys; a later lookup by cleaned name
# would then miss whenever cleaning changed the name. Dropping the names and
# de-duplicating first also avoids repeating identical lookups.
unique_names_clean <- unique(as.character(unlist(names_clean, use.names = FALSE)))
unique_names_clean <- unique_names_clean[!is.na(unique_names_clean)]
lsids <- setNames(
  vapply(
    unique_names_clean,
    function(name) as.character(match_name(name))[1],
    character(1),
    USE.NAMES = FALSE
  ),
  unique_names_clean
)

Now we need to find the lowest taxonomic level at which we find a name. Note that this will result in records with less taxonomic resolution than intended. Ideally we would only match on scientificName. First translate the taxonomy columns to LSIDs:

# Translate every taxonomy cell in two lookups: the provided name is first
# replaced by its cleaned form (names_clean is indexed by the provided name),
# and that cleaned form is then looked up in lsids. Cells without an entry
# become NA.
taxonomy_clean <- taxonomy %>%
  mutate(across(everything(), ~ names_clean[.x])) %>%
  mutate(across(everything(), ~ lsids[.x]))

taxonomy_clean

## # A tibble: 132,928 × 6
##    phylum                                class order family genus scientificName
##    <chr>                                 <chr> <chr> <chr>  <chr> <chr>         
##  1 urn:lsid:marinespecies.org:taxname:2… <NA>  <NA>  <NA>   <NA>  urn:lsid:mari…
##  2 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  <NA>          
##  3 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  urn:lsid:mari…
##  4 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  <NA>          
##  5 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  urn:lsid:mari…
##  6 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  urn:lsid:mari…
##  7 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  <NA>          
##  8 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  urn:lsid:mari…
##  9 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  urn:lsid:mari…
## 10 urn:lsid:marinespecies.org:taxname:8… urn:… <NA>  <NA>   <NA>  <NA>          
## # ℹ 132,918 more rows

Then find the most specific one for each row:

# For each row, take the first non-missing LSID going from the most specific
# column (scientificName) up to class.
# NOTE(review): phylum is part of taxonomy_clean but is not used as a
# fallback here - confirm that this is intentional.
taxonomy_clean$best <- with(
  taxonomy_clean,
  coalesce(scientificName, genus, family, order, class)
)

I’ll use the resulting LSIDs to replace the provided scientificNameIDs.

occ$scientificNameID <- taxonomy_clean$best

Let’s take another look at the top scientificName and scientificNameID after matching:

# The 30 most frequent scientificName / scientificNameID combinations.
occ %>%
  count(scientificName, scientificNameID, name = "records") %>%
  arrange(desc(records)) %>%
  head(30) %>%
  knitr::kable()

scientificName	scientificNameID	records
monothalamea	NA	29185
hymenopenaeus nereus	urn:lsid:marinespecies.org:taxname:1071	2761
polychaeta	urn:lsid:marinespecies.org:taxname:883	2587
abyssoprimnoa	NA	2351
scleralcyonacea	NA	2278
nematoda	urn:lsid:marinespecies.org:taxname:799	2249
plesiopenaeus armatus	urn:lsid:marinespecies.org:taxname:1071	2005
amphipoda	urn:lsid:marinespecies.org:taxname:1071	1900
isopoda	urn:lsid:marinespecies.org:taxname:1071	1856
coryphaenoides armatus or yaquinae	NA	1591
leptochiton	NA	1568
plesiodiadema	urn:lsid:marinespecies.org:taxname:123082	1411
reophax	NA	1308
thalassomonhystera	NA	1280
astrorhizida	NA	1226
copepoda	NA	1191
actiniaria	NA	1125
ostracoda	urn:lsid:marinespecies.org:taxname:1078	1095
malacostraca	urn:lsid:marinespecies.org:taxname:1071	1025
acantholaimus	NA	966
porifera	urn:lsid:marinespecies.org:taxname:558	951
ophiuroidea	urn:lsid:marinespecies.org:taxname:123084	860
harpacticoida	NA	859
tanaidacea	urn:lsid:marinespecies.org:taxname:1071	849
thenea	NA	824
trochammina	NA	745
callozostron	NA	721
halalaimus	NA	714
turbellaria	NA	696
manganonema	NA	640

Extracting MeasurementOrFact data

# Build the flat MeasurementOrFact table from the nested records.
#
# Keeps id, dataset_id and the nested MeasurementOrFact group, flattens it,
# strips the group prefix (everything up to the last dot) from the column
# names, squishes whitespace in character columns, turns empty strings into
# NA, and drops rows that lack a measurementType or a measurementValue.
#
# df: data frame of records with a nested MeasurementOrFact column.
# Returns a tibble of the remaining measurement rows.
extract_mof <- function(df) {
  nested <- select(df, all_of(c("id", "dataset_id", "MeasurementOrFact")))
  flat <- rename_with(
    jsonlite::flatten(nested),
    ~ str_replace(.x, ".*\\.", "")
  )
  cleaned <- flat %>%
    mutate(across(where(is.character), str_squish)) %>%
    mutate(across(everything(), ~ na_if(.x, "")))
  as_tibble(
    filter(cleaned, !is.na(measurementType), !is.na(measurementValue))
  )
}

mof <- extract_mof(records)
mof

## # A tibble: 14,360 × 6
##    id                  dataset_id measurementID measurementType measurementValue
##    <chr>               <chr>      <chr>         <chr>           <chr>           
##  1 7085c2c3-e5a9-4118… bgrpmn120… BD12421641988 Relative abund… 1.8             
##  2 107d43f2-227e-45aa… bgrpmn120… BD12761642022 Relative abund… 0.72            
##  3 ccba49e6-0746-4f88… bgrpmn120… BD12771642023 Relative abund… 3.14            
##  4 c35699df-1c7f-4c79… bgrpmn120… BD12781642024 Relative abund… 0.72            
##  5 5d94738f-8c6b-445a… bgrpmn120… BD12791642025 Relative abund… 0.45            
##  6 0e859e7d-2618-4164… bgrpmn120… BD12801642026 Relative abund… 0.18            
##  7 788eb9b6-4b47-4020… bgrpmn120… BD12811642027 Relative abund… 13.66           
##  8 4db4b13a-cd63-443a… bgrpmn120… BD12821642028 Relative abund… 0.09            
##  9 9d7fb500-08f9-431e… bgrpmn120… BD12831642029 Relative abund… 5.12            
## 10 44aa97a6-e7aa-4810… bgrpmn120… BD12841642030 Relative abund… 31.81           
## # ℹ 14,350 more rows
## # ℹ 1 more variable: measurementUnit <chr>

A number of records appear to have empty values. To demonstrate this, let’s take a look at the most common combinations of measurementType and measurementValue:

# The 10 most frequent measurementType / measurementValue combinations.
mof %>%
  count(measurementType, measurementValue, name = "records") %>%
  arrange(desc(records)) %>%
  head(10) %>%
  knitr::kable()

measurementType	measurementValue	records
Relative abundance	ns	4504
Relative abundance	0	462
Relative abundance	in progress	141
Relative abundance	0.01	107
Relative abundance	0.854700854700855	97
Relative abundance	0.0462962962962963	79
Relative abundance	0.09765625	78
Relative abundance	0.03	74
Relative abundance	0.282485875706215	72
Relative abundance	0.07	69

Generating Darwin Core Archives

Generating EML

For demonstration purposes, I’m working with the dataset pertaining to the first record here. The EML template is read from templates/eml.xml:

library(readr)
library(glue)

# Render the EML document for a dataset from templates/eml.xml.
#
# df: data frame of records; only the Metadata of its first row is used.
# Returns the filled-in template as produced by glue().
#
# The template is interpolated by glue() against the local variables of this
# function, so every variable assigned below is part of the template contract,
# even where it looks unused. Do not rename or remove them.
generate_eml <- function(df) {
  eml <- read_file("templates/eml.xml")
  metadata <- df$Metadata[1, ]

  # Split the name once: the first word is the given name, everything after it
  # is the surname (so multi-word surnames are kept whole). A missing name or a
  # single-word name yields empty strings instead of "NA" in the output.
  full_name <- metadata$Creator$name
  if (length(full_name) != 1 || is.na(full_name)) {
    full_name <- ""
  }
  name_parts <- strsplit(str_squish(full_name), " ", fixed = TRUE)[[1]]
  firstname <- if (length(name_parts) >= 1) name_parts[1] else ""
  lastname <- paste(name_parts[-1], collapse = " ")
  organization <- metadata$Creator$organisation
  email <- metadata$Creator$email
  position <- metadata$Creator$position

  # NOTE(review): the creator_* values come from Metadata$Contact while the
  # unprefixed ones above come from Metadata$Creator - confirm against the
  # template that this mapping is intended.
  creator_firstname <- ""
  creator_lastname <- ""
  creator_organization <- metadata$Contact$organisation
  creator_email <- ""
  creator_position <- ""

  abstract <- metadata$abstract
  title <- metadata$title
  citation <- metadata$citation
  packageid <- "https://datasets.obis.org/deepdata"
  pubdate <- format(Sys.time(), "%Y-%m-%d")
  datestamp <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
  glue(eml)
}

generate_eml(records)

## <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1"
##   xmlns:dc="http://purl.org/dc/terms/"
##   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
##   xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
##   packageId="https://datasets.obis.org/deepdata" system="http://gbif.org" scope="system"
##   xml:lang="eng">
## 
## <dataset>
##   <title xml:lang="eng">BGRPMN12017 Biodiveristy</title>
##   <pubDate>2023-06-14</pubDate>
##   <language>eng</language>
##   <abstract>
##     <para>Sampling data captured in Oceanic Exploration Research Based on geochemical and physical data we identified differences in the polymetallic nodule facies within the eastern German license area. Nodules in Prospective Areas # 1 are generally larger and show significantly higher contents of Ni and Cu as compared to the nodules in Prospective Area #2 which are smaller and are characterised by higher contents of Co. Furthermore, our study shows that sediment, which mainly consists of Si and Al oxides, contributes 13 % to the dry weight of the nodules on average. The method we used is quantitative and can thus be used for habitat mapping and be related to biodiversity and other environmental parameters</para>
##   </abstract>
##   <keywordSet>
##     <keyword>Occurrence</keyword>
##     <keywordThesaurus>GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type.xml</keywordThesaurus>
##   </keywordSet>
##   <intellectualRights>
##     <para>This work is licensed under a <ulink url="http://creativecommons.org/licenses/by/4.0/legalcode"><citetitle>Creative Commons Attribution (CC-BY) 4.0 License</citetitle></ulink>.</para>
##   </intellectualRights>
##   <maintenance>
##     <description>
##       <para></para>
##     </description>
##     <maintenanceUpdateFrequency>unkown</maintenanceUpdateFrequency>
##   </maintenance>
##   <creator>
##     <individualName>
##     <givenName></givenName>
##     <surName></surName>
##     </individualName>
##     <organizationName>Federal Institute for Geosciences and Natural Resources of Germany</organizationName>
##     <positionName></positionName>
##     <electronicMailAddress></electronicMailAddress>
##   </creator>
##   <metadataProvider>
##     <individualName>
##     <givenName>Sheldon</givenName>
##     <surName>Carter</surName>
##     </individualName>
##     <organizationName>International Seabed Authority</organizationName>
##     <positionName>Database Manager</positionName>
##     <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
##   </metadataProvider>
##   <contact>
##     <individualName>
##     <givenName>Sheldon</givenName>
##     <surName>Carter</surName>
##     </individualName>
##     <organizationName>International Seabed Authority</organizationName>
##     <positionName>Database Manager</positionName>
##     <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
##   </contact>
## </dataset>
## <additionalMetadata>
## <metadata>
## <gbif>
##   <dateStamp>2023-06-14T12:43:17+0200</dateStamp>
##   <hierarchyLevel>dataset</hierarchyLevel>
##   <citation>Federal Institute for Geosciences and Natural Resources of Germany, (2017). BGRPMN12017 Biodiveristy. Available : DeepData, International Seabed Authority https://data.isa.org.jm/ Accessed: [YYYY-MM-DD].</citation>
## </gbif>
## </metadata>
## </additionalMetadata>
## </eml:eml>

Generating an archive descriptor file

The archive also needs to include a meta.xml file which describes the files in the archive and their relationships.

Let’s first get a list of terms including their qualName.

library(xml2)

# Read an XML term definition and list its properties.
#
# url: location of the XML document, passed to read_xml().
# Returns (invisibly) a data frame with one row per <property> element and two
# columns: `name` (the name attribute) and `qual` (the qualName attribute).
get_terms <- function(url) {
  doc <- xml_ns_strip(read_xml(url))
  property_nodes <- xml_find_all(doc, ".//property")
  terms <- map_df(property_nodes, function(node) {
    list(
      name = xml_attr(node, "name"),
      qual = xml_attr(node, "qualName")
    )
  })
  invisible(terms)
}

occurrence_terms <- get_terms("https://rs.gbif.org/core/dwc_occurrence_2020-07-15.xml")
mof_terms <- get_terms("https://rs.gbif.org/extension/obis/extended_measurement_or_fact.xml")

Using these we can generate a list of terms to go into the meta.xml file for each table.

# Render the meta.xml archive descriptor from templates/meta.xml.
#
# occ: occurrence table; its column order defines the field indexes.
# mof: measurement table; its column order defines the field indexes.
# Returns the filled-in template as produced by glue().
#
# Column 0 of each table is declared as <id> / <coreid>. Every other column is
# declared as a <field> only when its name has a qualified term in
# occurrence_terms / mof_terms. Columns without a term (such as dataset_id)
# are left undeclared instead of being written as term="NA".
#
# The template is interpolated by glue() against the local variables
# occurrence_lines and mof_lines, so those names must be kept.
generate_meta <- function(occ, mof) {
  # Build the id line plus one <field> line per mapped column of `df`.
  build_lines <- function(df, terms, id_tag) {
    fields <- tibble(name = names(df)) %>%
      # One row per term name, so the join cannot shift the column indexes.
      left_join(distinct(terms, name, .keep_all = TRUE), by = "name") %>%
      mutate(index = seq_len(n()) - 1)
    mapped <- filter(fields, index > 0, !is.na(qual))
    lines <- c(
      paste0("<", id_tag, " index=\"0\" />"),
      # sprintf() returns character(0) when no column is mapped.
      sprintf(
        "<field index=\"%d\" term=\"%s\"/>",
        as.integer(mapped$index),
        mapped$qual
      )
    )
    paste0(lines, collapse = "\n")
  }

  occurrence_lines <- build_lines(occ, occurrence_terms, "id")
  mof_lines <- build_lines(mof, mof_terms, "coreid")

  meta <- read_file("templates/meta.xml")
  glue(meta)
}

generate_meta(occ, mof)

## <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
##   <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
##     <files>
##       <location>occurrence.txt</location>
##     </files>
##     <id index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/individualCount"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/organismQuantity"/>
## <field index="6" term="http://rs.tdwg.org/dwc/terms/organismQuantityType"/>
## <field index="7" term="http://rs.tdwg.org/dwc/terms/sex"/>
## <field index="8" term="http://rs.tdwg.org/dwc/terms/occurrenceStatus"/>
## <field index="9" term="http://rs.tdwg.org/dwc/terms/associatedSequences"/>
## <field index="10" term="http://rs.tdwg.org/dwc/terms/occurrenceRemarks"/>
## <field index="11" term="http://rs.tdwg.org/dwc/terms/eventID"/>
## <field index="12" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
## <field index="13" term="http://rs.tdwg.org/dwc/terms/eventTime"/>
## <field index="14" term="http://rs.tdwg.org/dwc/terms/year"/>
## <field index="15" term="http://rs.tdwg.org/dwc/terms/month"/>
## <field index="16" term="http://rs.tdwg.org/dwc/terms/day"/>
## <field index="17" term="http://rs.tdwg.org/dwc/terms/habitat"/>
## <field index="18" term="http://rs.tdwg.org/dwc/terms/samplingProtocol"/>
## <field index="19" term="http://rs.tdwg.org/dwc/terms/eventRemarks"/>
## <field index="20" term="http://rs.tdwg.org/dwc/terms/locationID"/>
## <field index="21" term="http://rs.tdwg.org/dwc/terms/minimumDepthInMeters"/>
## <field index="22" term="http://rs.tdwg.org/dwc/terms/maximumDepthInMeters"/>
## <field index="23" term="http://rs.tdwg.org/dwc/terms/verbatimDepth"/>
## <field index="24" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
## <field index="25" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
## <field index="26" term="http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem"/>
## <field index="27" term="http://rs.tdwg.org/dwc/terms/verbatimSRS"/>
## <field index="28" term="http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters"/>
## <field index="29" term="http://rs.tdwg.org/dwc/terms/identificationID"/>
## <field index="30" term="http://rs.tdwg.org/dwc/terms/typeStatus"/>
## <field index="31" term="http://rs.tdwg.org/dwc/terms/dateIdentified"/>
## <field index="32" term="http://rs.tdwg.org/dwc/terms/identificationVerificationStatus"/>
## <field index="33" term="http://purl.org/dc/terms/type"/>
## <field index="34" term="http://purl.org/dc/terms/license"/>
## <field index="35" term="http://purl.org/dc/terms/rightsHolder"/>
## <field index="36" term="http://purl.org/dc/terms/accessRights"/>
## <field index="37" term="http://purl.org/dc/terms/bibliographicCitation"/>
## <field index="38" term="http://rs.tdwg.org/dwc/terms/institutionID"/>
## <field index="39" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
## <field index="40" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
## <field index="41" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
## <field index="42" term="http://rs.tdwg.org/dwc/terms/scientificNameID"/>
## <field index="43" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
## <field index="44" term="http://rs.tdwg.org/dwc/terms/phylum"/>
## <field index="45" term="http://rs.tdwg.org/dwc/terms/class"/>
## <field index="46" term="http://rs.tdwg.org/dwc/terms/order"/>
## <field index="47" term="http://rs.tdwg.org/dwc/terms/family"/>
## <field index="48" term="http://rs.tdwg.org/dwc/terms/genus"/>
## <field index="49" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
## <field index="50" term="http://rs.tdwg.org/dwc/terms/taxonomicStatus"/>
## <field index="51" term="http://rs.tdwg.org/dwc/terms/taxonRemarks"/>
##   </core>
##   <extension encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact">
##     <files>
##       <location>extendedmeasurementorfact.txt</location>
##     </files>
##     <coreid index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/measurementID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/measurementType"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/measurementValue"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/measurementUnit"/>
##   </extension>
## </archive>

Bringing it all together

Now we can generate an archive for each dataset. While I’m generating datasets I’m also populating the RSS feed and creating dataset landing pages.

# Values shared by all datasets: the public base URL, the publication
# timestamp, and the list that collects one RSS item per dataset.
baseurl <- "https://datasets.obis.org/hosted/isa/"
pubdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %z")
items <- list()

# Templates for the RSS items and the dataset landing pages.
item_template <- read_file("templates/rss_item.xml")
landing_template <- read_file("templates/index_dataset.html")

# Start from an empty output directory.
unlink("output", recursive = TRUE)
dir.create("output")

datasetids <- unique(records$dataset_id)

# Escape every occurrence of the five XML-reserved characters. The ampersand
# is handled first so the entities produced by the later steps stay intact.
escape_xml <- function(x) {
  x %>%
    str_replace_all("&", "&amp;") %>%
    str_replace_all(">", "&gt;") %>%
    str_replace_all("<", "&lt;") %>%
    str_replace_all("'", "&apos;") %>%
    str_replace_all("\"", "&quot;")
}

# The descriptor depends only on the column layout of occ and mof, which is
# the same for every dataset, so it is rendered once outside the loop.
meta <- generate_meta(occ, mof)

for (datasetid in datasetids) {

  # The first record of the dataset supplies the dataset-level metadata.
  dataset <- records %>%
    filter(dataset_id == datasetid) %>%
    head(1)

  dataset$Metadata$abstract <- escape_xml(dataset$Metadata$abstract)

  # title, abstract, link, dwca (and the global pubdate) are picked up by
  # glue() when the templates below are rendered, so these names must be kept.
  title <- dataset$Metadata$title
  abstract <- dataset$Metadata$abstract
  link <- paste0(baseurl, datasetid, "/index.html")
  dwca <- paste0(baseurl, datasetid, "/", datasetid, ".zip")

  # clear dataset directory

  dataset_dir <- paste0("output/", datasetid)
  unlink(dataset_dir, recursive = TRUE)
  dir.create(dataset_dir)

  # RSS feed items

  item <- glue(item_template)
  items[[datasetid]] <- item

  # dataset landing page

  landing <- glue(landing_template)
  writeLines(landing, paste0(dataset_dir, "/index.html"))

  # archive

  dataset_occ <- occ %>% filter(dataset_id == datasetid)
  dataset_mof <- mof %>% filter(dataset_id == datasetid)

  eml <- generate_eml(dataset)

  write.table(dataset_occ, file = paste0(dataset_dir, "/occurrence.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  write.table(dataset_mof, file = paste0(dataset_dir, "/extendedmeasurementorfact.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  writeLines(eml, paste0(dataset_dir, "/eml.xml"))
  writeLines(meta, paste0(dataset_dir, "/meta.xml"))

  # Zip from inside the dataset directory so the archive holds bare file
  # names, then remove the loose files. The previous working directory is
  # restored even if zipping fails, so later iterations are not affected.
  files <- c("occurrence.txt", "extendedmeasurementorfact.txt", "eml.xml", "meta.xml")
  previous_wd <- setwd(dataset_dir)
  tryCatch(
    {
      zip(paste0(datasetid, ".zip"), files)
      file.remove(files)
    },
    finally = setwd(previous_wd)
  )

}

Data publishing

In this section all files are uploaded to an S3 bucket. A list of datasets is visible at https://datasets.obis.org/hosted/isa/index.html, and an RSS file is available for the OBIS harvester.

Generate RSS file

# Join the per-dataset RSS items into one string and render the feed.
# items, title, description and link are read by glue() from the template, so
# these names must be kept.
rss_template <- read_file("templates/rss.xml")
items <- paste0(items, collapse = "\n")

title <- "International Seabed Authority (ISA)"
description <- title
link <- paste0(baseurl, "index.html")

rss <- glue(rss_template)
writeLines(rss, "output/rss.xml")

Generate landing page

# Render the dataset overview page: one list item per dataset, each linking to
# that dataset's landing page. `content` is read by glue() from the template.
index_template <- read_file("templates/index.html")
dataset_links <- paste0(
  "<li><a href=\"", datasetids, "/index.html\">", datasetids, "</a></li>"
)
content <- paste0(dataset_links, collapse = "\n")
index <- glue(index_template)
writeLines(index, "output/index.html")

Uploading to S3

# NOTE(review): delete_object() is called with the single key "hosted/isa/";
# whether that also removes the objects stored under this prefix is not
# evident from here - confirm that stale datasets are actually cleaned up.
delete_object("hosted/isa/", bucket = "obis-datasets")
files <- list.files("output", full.names = TRUE, recursive = TRUE, include.dirs = FALSE)

# Upload every generated file, mapping the local output/ root onto the
# hosted/isa prefix. The pattern is anchored so that only the leading
# directory is rewritten, never an "output" appearing later in a path.
for (file in files) {
  target <- str_replace(file, "^output", "hosted/isa")
  message(target)
  put_object(file, object = target, bucket = "obis-datasets", acl = "public-read")
}