ISA DeepData
This notebook processes data from the International Seabed Authority (ISA) DeepData database into Darwin Core archives. The resulting datasets are hosted at https://datasets.obis.org/hosted/isa/index.html.
The code for this notebook is hosted at https://github.com/iobis/notebook-deepdata.
Data flow
Reading the data from S3
The DeepData dataset is delivered to OBIS via S3. Credentials are
stored in env.txt
. Earlier versions of the file were
encoded in the non-standard ISO-8859-1
, which made it necessary
to use readLines
before parsing the data with the jsonlite
package, but that is fixed now.
Parsing the JSON file
Generating Darwin Core data files
We can now extract a list of distinct datasets from the data frame, and generate a Darwin Core archive for each dataset. Let’s first generate dataset identifiers from the dataset titles for later use in the dataset URLs. To account for possible future changes to dataset titles, I’m removing or fixing some words in the titles. The result should be that identifiers do not change when typos are fixed in the future.
library(knitr)

# Distinct dataset titles found in the records.
titles <- records %>%
  distinct(Metadata$title) %>%
  pull("Metadata$title")

# Derive stable, URL-friendly identifiers from the titles:
#   1. lower-case the title,
#   2. repair known typos and drop the filler word "template", so that a
#      future correction of the title does not change the identifier,
#   3. turn every hyphen into a space (all occurrences, not just the first,
#      so no hyphen can survive in an identifier),
#   4. collapse whitespace and join the words with underscores.
identifiers <- titles %>%
  tolower(.) %>%
  str_replace(., "meiofaun$", "meiofauna") %>%
  str_replace(., "templaye", "template") %>%
  str_replace(., "template", "") %>%
  str_replace(., "biodiveristy", "biodiversity") %>%
  str_replace(., "macrfaun$", "macrofauna") %>%
  str_replace(., "meofauna", "meiofauna") %>%
  str_replace(., "meiobent$", "meiobenthos") %>%
  str_replace_all(., "-", " ") %>%
  str_squish(.) %>%
  str_replace_all(., "\\s", "_")

# Two different titles must never collapse into the same identifier.
stopifnot(length(unique(titles)) == length(unique(identifiers)))

# Attach the identifier of its dataset to every record.
records$dataset_id <- identifiers[match(records$Metadata$title, titles)]

data.frame(titles, identifiers) %>%
  kable()
titles | identifiers |
---|---|
BGRPMN12017 Biodiveristy | bgrpmn12017_biodiversity |
BGRPMN12017 Env Template MANGAN2014 | bgrpmn12017_env_mangan2014 |
BGRPMN12017 Env Template FLUM | bgrpmn12017_env_flum |
GSRPMN12017 Env Template BIOSO239 | gsrpmn12017_env_bioso239 |
UKSRLPMN12019 NHM-NORCE | uksrlpmn12019_nhm_norce |
UKSRLPMN12019 NOC NERC | uksrlpmn12019_noc_nerc |
KOREAPMN12019 Macrofauna2018 | koreapmn12019_macrofauna2018 |
KOREAPMN12019 macrofauna2019 | koreapmn12019_macrofauna2019 |
GSRPMN12019 P03-MBL 1 | gsrpmn12019_p03_mbl_1 |
COMRACFRC120015 Env Template 2015 demersal scavenger | comracfrc120015_env_2015_demersal_scavenger |
COMRACFRC120015 Env Template 2015 macrofauna | comracfrc120015_env_2015_macrofauna |
GSRPMN12019 P03-MBL 2 | gsrpmn12019_p03_mbl_2 |
OMSPMN12019 NHM Env | omspmn12019_nhm_env |
BGRPMS12015 ROV Biology | bgrpms12015_rov_biology |
COMRACRFC12017 Env Template DY36 meiofauna | comracrfc12017_env_dy36_meiofauna |
COMRACRFC12016 Env Template DY37 megafauna | comracrfc12016_env_dy37_megafauna |
KOREAPMN12010 Env Template 2010 biomass | koreapmn12010_env_2010_biomass |
KOREAPMN12011 Env Template 2011 abundance | koreapmn12011_env_2011_abundance |
KOREAPMN12013 Env Template 2013 abundance | koreapmn12013_env_2013_abundance |
RUSMNRCRFC12015 Env Template Biodata | rusmnrcrfc12015_env_biodata |
JOGMECCRFC12018 Env Template HK17 NEMA | jogmeccrfc12018_env_hk17_nema |
COMRAPMS12017 Env Template phytoplankton | comrapms12017_env_phytoplankton |
COMRAPMS12017 Env Template zooplankton | comrapms12017_env_zooplankton |
JOGMECCRFC12018 Env Template JK18 picoplankton | jogmeccrfc12018_env_jk18_picoplankton |
JOGMECCRFC12018 Env Template HK17-01 phyto | jogmeccrfc12018_env_hk17_01_phyto |
COMRAPMS12016 Env Template phytoplanton | comrapms12016_env_phytoplanton |
IOMPMN12018 Env Template BIOL | iompmn12018_env_biol |
COMRAPMS12018 zooplankton | comrapms12018_zooplankton |
COMRAPMN12016 Sequences | comrapmn12016_sequences |
COMRAPMN12018 Lander2017 | comrapmn12018_lander2017 |
UKSRLPMN12015 Env Template Scavengers 032016 | uksrlpmn12015_env_scavengers_032016 |
IFREMERPMN12015 AR ENV | ifremerpmn12015_ar_env |
IFREMERPMN12018 Nodinaut Nematoda Copepoda | ifremerpmn12018_nodinaut_nematoda_copepoda |
UKSRLPMN12016 Senkenberg | uksrlpmn12016_senkenberg |
UKSRLPMN12016 Megafauna | uksrlpmn12016_megafauna |
UKSRLPMN12016 AB02 NOCS | uksrlpmn12016_ab02_nocs |
UKSRLPMN12017 Senkenberg Macrofauna | uksrlpmn12017_senkenberg_macrofauna |
YUZHPMN12015 Biodata B6 | yuzhpmn12015_biodata_b6 |
YUZPMN12016 Biodata | yuzpmn12016_biodata |
UKSRLPMN12017 NOC NERC | uksrlpmn12017_noc_nerc |
DORDPMN12018 Env Mn Bio | dordpmn12018_env_mn_bio |
GSRPMN12016 MarBiol UGent | gsrpmn12016_marbiol_ugent |
BGRPMS12020 2019 DR Biology | bgrpms12020_2019_dr_biology |
OMSPMN12018 Env NHM NORCE | omspmn12018_env_nhm_norce |
UKSRLPMN12016 NHM-UNI | uksrlpmn12016_nhm_uni |
OMSPMN12017 Macro Senckenberg | omspmn12017_macro_senckenberg |
OMSPMN12017 Scavengers Senckenberg | omspmn12017_scavengers_senckenberg |
KOREAPMN12012 Env Template 2012 abundance | koreapmn12012_env_2012_abundance |
BGRPMS12020 2019 GC Biology | bgrpms12020_2019_gc_biology |
KOREAPMN12012 Env Template 2012 biomass | koreapmn12012_env_2012_biomass |
KOREAPMN12012 Env Template 2012 macrofauna | koreapmn12012_env_2012_macrofauna |
JOGMECCRFC12016 Env Template 2016 Edokko data | jogmeccrfc12016_env_2016_edokko_data |
JOGMECCRFC12016 Env Template 2016 nematoda DNA | jogmeccrfc12016_env_2016_nematoda_dna |
JOGMECCRFC12016 Env Template 2016 ROV | jogmeccrfc12016_env_2016_rov |
JOGMECCRFC120017 Env Template 2017 abundance | jogmeccrfc120017_env_2017_abundance |
DORDPMN12016 Mn2016 ENV | dordpmn12016_mn2016_env |
BGRPMN12017 Env Template BIONOD2012 | bgrpmn12017_env_bionod2012 |
BGRPMN12017 Env Template ECORESPONSE | bgrpmn12017_env_ecoresponse |
BGRPMN12017 Env Template MANGAN2010 | bgrpmn12017_env_mangan2010 |
BGRPMN12017 Env Template MANGAN2013 | bgrpmn12017_env_mangan2013 |
BGRPMN12017 Env Template MANGAN2016 | bgrpmn12017_env_mangan2016 |
IOMPMN12015 Env Template annex 1 | iompmn12015_env_annex_1 |
IOMPMN12015 Env Template annex 11 | iompmn12015_env_annex_11 |
KOREAPMN12014 Env Template 2014 abundance | koreapmn12014_env_2014_abundance |
KOREAPMN12014 Env Template 2014 megafauna | koreapmn12014_env_2014_megafauna |
BGRPMS12020 2019 WC Biology | bgrpms12020_2019_wc_biology |
KOREAPMN12014 Env Template 2014 biomass | koreapmn12014_env_2014_biomass |
KOREAPMN12011 Env Template 2011 biomass | koreapmn12011_env_2011_biomass |
COMRAPMS12016 Env Template phytoplankton | comrapms12016_env_phytoplankton |
KOREAPMN12012 Meiofauna | koreapmn12012_meiofauna |
IOMPMN12014 Env Bio | iompmn12014_env_bio |
DORDPMN12020 Env | dordpmn12020_env |
GSRPMN12020 MarBiol UGent | gsrpmn12020_marbiol_ugent |
IFREMERPMN12018 Polynoids | ifremerpmn12018_polynoids |
NORIPMN12022 Env Template BIO | noripmn12022_env_bio |
COMRAPMN12014 Env Template W1101 | comrapmn12014_env_w1101 |
COMRAPMN12014 Env Template W1102 | comrapmn12014_env_w1102 |
COMRAPMN12014 Env Template WS1102 | comrapmn12014_env_ws1102 |
COMRAPMN12014 Env Template WS1104 | comrapmn12014_env_ws1104 |
UKSRLPMN12015 Env Template AB01 NHM | uksrlpmn12015_env_ab01_nhm |
COMRAPMS12015 ENV | comrapms12015_env |
COMRAPMS12018 Phytoplankton | comrapms12018_phytoplankton |
UKSRLPMN12015 Env Template Macrofauna 032016 | uksrlpmn12015_env_macrofauna_032016 |
UKSRLPMN12015 Env Template Senckenberg 032016 | uksrlpmn12015_env_senckenberg_032016 |
UKSRLPMN12015 Env Templaye Megafauna 032016 | uksrlpmn12015_env_megafauna_032016 |
UKSRLPMN12015 Env Template GG 032020163 | uksrlpmn12015_env_gg_032020163 |
IFREMERPMN12018 SO239 | ifremerpmn12018_so239 |
OMSPMN12018 NUS Data | omspmn12018_nus_data |
UKSRLPMN12017 Senkenberg Meofauna | uksrlpmn12017_senkenberg_meiofauna |
COMRAPMS12015 Env Template Meiobent | comrapms12015_env_meiobenthos |
IFREMERPMN12017 Env Template BIO1 2017 | ifremerpmn12017_env_bio1_2017 |
COMRACRFC12017 Env Template DY29 zooplankton | comracrfc12017_env_dy29_zooplankton |
Extracting occurrence data
Let’s first create a new ID column, which will be used later to link
together the measurements and occurrences, and to select records by
dataset. We cannot use occurrenceID
here because these are
not unique within the dataset.
library(uuid)

# Give every record a generated UUID; it serves as the row key shared by the
# occurrence and measurement tables.
n_records <- nrow(records)
records$id <- UUIDgenerate(use.time = NA, n = n_records)

# Abort if any identifier was generated twice.
stopifnot(anyDuplicated(records$id) == 0)
Now we can select and process the columns that will go into the occurrence table.
Initial cleanup of occurrence data
First clean up any escaped newlines, empty strings, and placeholder
values. Also fix basisOfRecord
and convert coordinates to
numeric values:
library(stringr)

# Values that carry no information and are treated as missing.
placeholders <- c("indet", "Not Reported", "indet.")

occ <- occ %>%
  # gsub() also converts every column to character
  mutate(across(everything(), ~gsub("\\n", "", .x))) %>%
  # empty strings become NA
  mutate(across(everything(), ~na_if(.x, ""))) %>%
  # trim and collapse whitespace
  mutate(across(where(is.character), str_squish)) %>%
  # placeholder values become NA
  mutate(across(everything(), ~replace(.x, .x %in% placeholders, NA))) %>%
  # fixed basisOfRecord, numeric coordinates
  mutate(
    basisOfRecord = "HumanObservation",
    decimalLongitude = as.numeric(decimalLongitude),
    decimalLatitude = as.numeric(decimalLatitude)
  )
Let’s check for coordinate issues:
Let’s take a look at scientificName
and
scientificNameID
.
# Number of records per scientificName, most frequent first.
occ %>%
  count(scientificName, name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()

# Number of records per scientificNameID, most frequent first.
occ %>%
  count(scientificNameID, name = "records") %>%
  arrange(desc(records)) %>%
  rmarkdown::paged_table()
So at least in the current version at the time of writing (June 2021)
there are some quality issues for scientificName
.
Fixing taxonomy
Let’s try to clean up the scientific names before we do taxon
matching with WoRMS. Here I’m using the gni_parse()
function from the taxize
package, which connects to the GNI name parser. If a name cannot
be parsed, I’m keeping the original.
The first step is to create a list of all distinct names in the taxonomy columns.
# The taxonomy columns of the occurrence table, from phylum down to the name.
taxonomy <- select(occ, phylum, class, order, family, genus, scientificName)

# Every distinct non-missing name appearing in any of those columns.
names <- taxonomy %>%
  unlist() %>%
  unique() %>%
  na.omit()
Then pass through the name parser:
library(taxize)
# Clean a single scientific name with the GNI name parser.
#
# @param name A scientific name (character scalar).
# @return The canonical form reported by gni_parse(). The original `name` is
#   returned unchanged when the parser fails, does not return exactly one
#   row, or reports no canonical name.
clean_name <- function(name) {
  tryCatch({
    res <- gni_parse(name)
    stopifnot(nrow(res) == 1)
    canonical <- as.character(res$canonical[1])
    # A missing or empty canonical name counts as "could not be parsed":
    # keep the original instead of propagating NA to the taxon matching.
    if (length(canonical) != 1 || is.na(canonical) || !nzchar(canonical)) {
      name
    } else {
      canonical
    }
  },
  error = function(cond) {
    name
  })
}
names_clean <- sapply(names, clean_name)
Now use the cleaned names for taxon matching:
library(worrms)
# Look up the WoRMS LSID for a single name.
#
# Only matches of type "exact" or "exact_genus" are accepted; when several
# qualify, a message is emitted and the first one is used.
#
# @param name A (cleaned) scientific name.
# @return The LSID as a character scalar, or NA_character_ when the name is
#   missing/empty, nothing matches, or the lookup raises an error.
match_name <- function(name) {
  # Do not query the service for missing or empty names.
  if (length(name) == 0 || is.na(name[1]) || !nzchar(name[1])) {
    return(NA_character_)
  }
  tryCatch({
    res <- wm_records_names(name)
    candidates <- res[[1]]
    accepted <- candidates$match_type %in% c("exact", "exact_genus")
    matches <- candidates[accepted, , drop = FALSE]
    if (nrow(matches) > 1) {
      message(paste0("Multiple exact matches for ", name))
    }
    # Indexing an empty vector with [1] yields NA, i.e. "no match".
    as.character(matches$lsid[1])
  }, error = function(cond) {
    # Any lookup error is deliberately treated as "no match"; the typed NA
    # keeps the sapply() result a character vector.
    NA_character_
  })
}
lsids <- sapply(names_clean, match_name)
Now we need to find the lowest taxonomic level at which we find a
name. Note that this will result in records with less taxonomic
resolution than intended. Ideally we would only match on
scientificName
. First translate the taxonomy columns to
LSIDs:
## # A tibble: 132,928 × 6
## phylum class order family genus scientificName
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 urn:lsid:marinespecies.org:taxname:2… <NA> <NA> <NA> <NA> urn:lsid:mari…
## 2 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> <NA>
## 3 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> urn:lsid:mari…
## 4 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> <NA>
## 5 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> urn:lsid:mari…
## 6 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> urn:lsid:mari…
## 7 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> <NA>
## 8 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> urn:lsid:mari…
## 9 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> urn:lsid:mari…
## 10 urn:lsid:marinespecies.org:taxname:8… urn:… <NA> <NA> <NA> <NA>
## # ℹ 132,918 more rows
Then find the most specific one for each row:
# For each row, take the first non-missing LSID, going from the most specific
# column (scientificName) towards class.
# NOTE(review): phylum is not part of this fallback chain, so a row matched
# only at phylum level gets no LSID - confirm whether that is intentional.
taxonomy_clean <- mutate(
  taxonomy_clean,
  best = coalesce(scientificName, genus, family, order, class)
)
I’ll use the resulting LSIDs to replace the provided
scientificNameIDs
.
Let’s take another look at the top scientificName
and
scientificNameID
after matching:
# Record counts per (scientificName, scientificNameID) pair, largest first.
name_counts <- occ %>%
  group_by(scientificName, scientificNameID) %>%
  summarize(records = n()) %>%
  arrange(desc(records))

# Show the 30 most frequent combinations.
knitr::kable(head(name_counts, 30))
scientificName | scientificNameID | records |
---|---|---|
monothalamea | NA | 29185 |
hymenopenaeus nereus | urn:lsid:marinespecies.org:taxname:1071 | 2761 |
polychaeta | urn:lsid:marinespecies.org:taxname:883 | 2587 |
abyssoprimnoa | NA | 2351 |
scleralcyonacea | NA | 2278 |
nematoda | urn:lsid:marinespecies.org:taxname:799 | 2249 |
plesiopenaeus armatus | urn:lsid:marinespecies.org:taxname:1071 | 2005 |
amphipoda | urn:lsid:marinespecies.org:taxname:1071 | 1900 |
isopoda | urn:lsid:marinespecies.org:taxname:1071 | 1856 |
coryphaenoides armatus or yaquinae | NA | 1591 |
leptochiton | NA | 1568 |
plesiodiadema | urn:lsid:marinespecies.org:taxname:123082 | 1411 |
reophax | NA | 1308 |
thalassomonhystera | NA | 1280 |
astrorhizida | NA | 1226 |
copepoda | NA | 1191 |
actiniaria | NA | 1125 |
ostracoda | urn:lsid:marinespecies.org:taxname:1078 | 1095 |
malacostraca | urn:lsid:marinespecies.org:taxname:1071 | 1025 |
acantholaimus | NA | 966 |
porifera | urn:lsid:marinespecies.org:taxname:558 | 951 |
ophiuroidea | urn:lsid:marinespecies.org:taxname:123084 | 860 |
harpacticoida | NA | 859 |
tanaidacea | urn:lsid:marinespecies.org:taxname:1071 | 849 |
thenea | NA | 824 |
trochammina | NA | 745 |
callozostron | NA | 721 |
halalaimus | NA | 714 |
turbellaria | NA | 696 |
manganonema | NA | 640 |
Extracting MeasurementOrFact data
# Build the MeasurementOrFact table from the raw records.
#
# @param df Data frame with `id`, `dataset_id` and a nested
#   `MeasurementOrFact` column.
# @return A tibble with the nested measurement columns flattened, restricted
#   to rows that have both a measurementType and a measurementValue.
extract_mof <- function(df) {
  # Keep the linking keys plus the nested measurements, then un-nest them.
  flat <- jsonlite::flatten(select(df, "id", "dataset_id", "MeasurementOrFact"))

  # Strip everything up to the last dot from the flattened column names.
  flat <- rename_with(flat, ~str_replace(.x, ".*\\.", ""))

  cleaned <- flat %>%
    mutate(across(where(is.character), str_squish)) %>%
    mutate(across(everything(), ~na_if(.x, ""))) %>%
    filter(!is.na(measurementType), !is.na(measurementValue))

  as_tibble(cleaned)
}
mof <- extract_mof(records)
mof
## # A tibble: 14,360 × 6
## id dataset_id measurementID measurementType measurementValue
## <chr> <chr> <chr> <chr> <chr>
## 1 7085c2c3-e5a9-4118… bgrpmn120… BD12421641988 Relative abund… 1.8
## 2 107d43f2-227e-45aa… bgrpmn120… BD12761642022 Relative abund… 0.72
## 3 ccba49e6-0746-4f88… bgrpmn120… BD12771642023 Relative abund… 3.14
## 4 c35699df-1c7f-4c79… bgrpmn120… BD12781642024 Relative abund… 0.72
## 5 5d94738f-8c6b-445a… bgrpmn120… BD12791642025 Relative abund… 0.45
## 6 0e859e7d-2618-4164… bgrpmn120… BD12801642026 Relative abund… 0.18
## 7 788eb9b6-4b47-4020… bgrpmn120… BD12811642027 Relative abund… 13.66
## 8 4db4b13a-cd63-443a… bgrpmn120… BD12821642028 Relative abund… 0.09
## 9 9d7fb500-08f9-431e… bgrpmn120… BD12831642029 Relative abund… 5.12
## 10 44aa97a6-e7aa-4810… bgrpmn120… BD12841642030 Relative abund… 31.81
## # ℹ 14,350 more rows
## # ℹ 1 more variable: measurementUnit <chr>
A number of records appear to have empty values. To demonstrate this,
let’s take a look at the most common combinations of
measurementType
and measurementValue
:
# Record counts per (measurementType, measurementValue) pair, largest first.
value_counts <- mof %>%
  group_by(measurementType, measurementValue) %>%
  summarize(records = n()) %>%
  arrange(desc(records))

# Show the 10 most frequent combinations.
knitr::kable(head(value_counts, 10))
measurementType | measurementValue | records |
---|---|---|
Relative abundance | ns | 4504 |
Relative abundance | 0 | 462 |
Relative abundance | in progress | 141 |
Relative abundance | 0.01 | 107 |
Relative abundance | 0.854700854700855 | 97 |
Relative abundance | 0.0462962962962963 | 79 |
Relative abundance | 0.09765625 | 78 |
Relative abundance | 0.03 | 74 |
Relative abundance | 0.282485875706215 | 72 |
Relative abundance | 0.07 | 69 |
Generating Darwin Core Archives
Generating EML
For demonstration purposes, I’m working with the dataset pertaining
to the first record here. The EML template is read from
templates/eml.xml
:
library(readr)
library(glue)
# Render the EML metadata document for a dataset.
#
# The template in templates/eml.xml is filled in by glue(), which looks up
# its placeholders among the local variables of this function - do not rename
# them without updating the template.
#
# @param df Records of one dataset; only the first row of the nested
#   `Metadata` column is used.
# @return The rendered EML document as returned by glue().
generate_eml <- function(df) {
  # glue() renders NA as "NA" and yields an empty result when a placeholder
  # is NULL, so missing metadata is normalised to an empty string first.
  text_or_empty <- function(x) {
    if (is.null(x) || length(x) == 0 || is.na(x[1])) "" else as.character(x[1])
  }

  eml <- read_file("templates/eml.xml")
  metadata <- df$Metadata[1,]

  # First word is the given name, everything after it the surname (so
  # multi-word surnames are kept and single-word names give an empty surname).
  name_parts <- strsplit(trimws(text_or_empty(metadata$Creator$name)), "\\s+")[[1]]
  firstname <- text_or_empty(name_parts[1])
  lastname <- paste(name_parts[-1], collapse = " ")
  organization <- text_or_empty(metadata$Creator$organisation)
  email <- text_or_empty(metadata$Creator$email)
  position <- text_or_empty(metadata$Creator$position)

  # Only the organisation of the contact is filled in for the creator_* set.
  creator_firstname <- ""
  creator_lastname <- ""
  creator_organization <- text_or_empty(metadata$Contact$organisation)
  creator_email <- ""
  creator_position <- ""

  abstract <- text_or_empty(metadata$abstract)
  title <- text_or_empty(metadata$title)
  citation <- text_or_empty(metadata$citation)

  # NOTE(review): the same packageId is used for every dataset - confirm
  # that this is intended.
  packageid <- "https://datasets.obis.org/deepdata"
  pubdate <- format(Sys.time(), "%Y-%m-%d")
  datestamp <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")

  glue(eml)
}
generate_eml(records)
## <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1"
## xmlns:dc="http://purl.org/dc/terms/"
## xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
## xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
## packageId="https://datasets.obis.org/deepdata" system="http://gbif.org" scope="system"
## xml:lang="eng">
##
## <dataset>
## <title xml:lang="eng">BGRPMN12017 Biodiveristy</title>
## <pubDate>2023-06-14</pubDate>
## <language>eng</language>
## <abstract>
## <para>Sampling data captured in Oceanic Exploration Research Based on geochemical and physical data we identified differences in the polymetallic nodule facies within the eastern German license area. Nodules in Prospective Areas # 1 are generally larger and show significantly higher contents of Ni and Cu as compared to the nodules in Prospective Area #2 which are smaller and are characterised by higher contents of Co. Furthermore, our study shows that sediment, which mainly consists of Si and Al oxides, contributes 13 % to the dry weight of the nodules on average. The method we used is quantitative and can thus be used for habitat mapping and be related to biodiversity and other environmental parameters</para>
## </abstract>
## <keywordSet>
## <keyword>Occurrence</keyword>
## <keywordThesaurus>GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type.xml</keywordThesaurus>
## </keywordSet>
## <intellectualRights>
## <para>This work is licensed under a <ulink url="http://creativecommons.org/licenses/by/4.0/legalcode"><citetitle>Creative Commons Attribution (CC-BY) 4.0 License</citetitle></ulink>.</para>
## </intellectualRights>
## <maintenance>
## <description>
## <para></para>
## </description>
## <maintenanceUpdateFrequency>unkown</maintenanceUpdateFrequency>
## </maintenance>
## <creator>
## <individualName>
## <givenName></givenName>
## <surName></surName>
## </individualName>
## <organizationName>Federal Institute for Geosciences and Natural Resources of Germany</organizationName>
## <positionName></positionName>
## <electronicMailAddress></electronicMailAddress>
## </creator>
## <metadataProvider>
## <individualName>
## <givenName>Sheldon</givenName>
## <surName>Carter</surName>
## </individualName>
## <organizationName>International Seabed Authority</organizationName>
## <positionName>Database Manager</positionName>
## <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
## </metadataProvider>
## <contact>
## <individualName>
## <givenName>Sheldon</givenName>
## <surName>Carter</surName>
## </individualName>
## <organizationName>International Seabed Authority</organizationName>
## <positionName>Database Manager</positionName>
## <electronicMailAddress>scarter@isa.org.jm</electronicMailAddress>
## </contact>
## </dataset>
## <additionalMetadata>
## <metadata>
## <gbif>
## <dateStamp>2023-06-14T12:43:17+0200</dateStamp>
## <hierarchyLevel>dataset</hierarchyLevel>
## <citation>Federal Institute for Geosciences and Natural Resources of Germany, (2017). BGRPMN12017 Biodiveristy. Available : DeepData, International Seabed Authority https://data.isa.org.jm/ Accessed: [YYYY-MM-DD].</citation>
## </gbif>
## </metadata>
## </additionalMetadata>
## </eml:eml>
Generating an archive descriptor file
The archive also needs to include a meta.xml
file which
describes the files in the archive and their relationships.
Let’s first get a list of terms including their
qualName
.
library(xml2)
# Read the term definitions from an extension/core definition XML.
#
# @param url Location of the XML document (passed to read_xml()).
# @return A tibble with one row per <property> element and the columns
#   `name` (attribute "name") and `qual` (attribute "qualName"). The two
#   columns are present even when the document has no <property> elements.
get_terms <- function(url) {
  doc <- read_xml(url)
  # Namespaces are stripped so the plain ".//property" XPath matches.
  properties <- xml_find_all(xml_ns_strip(doc), ".//property")
  # xml_attr() is applied to the whole node set at once.
  tibble(
    name = xml_attr(properties, "name"),
    qual = xml_attr(properties, "qualName")
  )
}
occurrence_terms <- get_terms("https://rs.gbif.org/core/dwc_occurrence_2020-07-15.xml")
mof_terms <- get_terms("https://rs.gbif.org/extension/obis/extended_measurement_or_fact.xml")
Using these we can generate a list of terms to go into the
meta.xml
file for each table.
# Render the meta.xml archive descriptor.
#
# The column names of both tables are looked up in the global term lists
# `occurrence_terms` and `mof_terms` (columns `name` and `qual`). The
# template in templates/meta.xml is filled in by glue(), which reads the
# local variables `occurrence_lines` and `mof_lines`.
#
# @param occ Occurrence table; its first column is the core id.
# @param mof Measurement table; its first column is the core id reference.
# @return The rendered meta.xml document as returned by glue().
generate_meta <- function(occ, mof) {
  # One descriptor line per column. The index is the zero-based column
  # position, so it always matches the written text file.
  field_lines <- function(columns, terms, id_tag) {
    # match() takes the first term with that name, so a duplicated term name
    # cannot add rows and shift the indices.
    qual <- terms$qual[match(columns, terms$name)]
    index <- seq_along(columns) - 1
    lines <- paste0("<field index=\"", index, "\" term=\"", qual, "\"/>")
    # Columns without a known term are left out of the descriptor instead of
    # being declared with the bogus term "NA".
    keep <- !is.na(qual)
    # The first column is always the identifier column.
    lines[1] <- paste0("<", id_tag, " index=\"0\" />")
    keep[1] <- TRUE
    paste0(lines[keep], collapse = "\n")
  }

  occurrence_lines <- field_lines(names(occ), occurrence_terms, "id")
  mof_lines <- field_lines(names(mof), mof_terms, "coreid")

  meta <- read_file("templates/meta.xml")
  glue(meta)
}
generate_meta(occ, mof)
## <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
## <core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
## <files>
## <location>occurrence.txt</location>
## </files>
## <id index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/occurrenceID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/catalogNumber"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/individualCount"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/organismQuantity"/>
## <field index="6" term="http://rs.tdwg.org/dwc/terms/organismQuantityType"/>
## <field index="7" term="http://rs.tdwg.org/dwc/terms/sex"/>
## <field index="8" term="http://rs.tdwg.org/dwc/terms/occurrenceStatus"/>
## <field index="9" term="http://rs.tdwg.org/dwc/terms/associatedSequences"/>
## <field index="10" term="http://rs.tdwg.org/dwc/terms/occurrenceRemarks"/>
## <field index="11" term="http://rs.tdwg.org/dwc/terms/eventID"/>
## <field index="12" term="http://rs.tdwg.org/dwc/terms/eventDate"/>
## <field index="13" term="http://rs.tdwg.org/dwc/terms/eventTime"/>
## <field index="14" term="http://rs.tdwg.org/dwc/terms/year"/>
## <field index="15" term="http://rs.tdwg.org/dwc/terms/month"/>
## <field index="16" term="http://rs.tdwg.org/dwc/terms/day"/>
## <field index="17" term="http://rs.tdwg.org/dwc/terms/habitat"/>
## <field index="18" term="http://rs.tdwg.org/dwc/terms/samplingProtocol"/>
## <field index="19" term="http://rs.tdwg.org/dwc/terms/eventRemarks"/>
## <field index="20" term="http://rs.tdwg.org/dwc/terms/locationID"/>
## <field index="21" term="http://rs.tdwg.org/dwc/terms/minimumDepthInMeters"/>
## <field index="22" term="http://rs.tdwg.org/dwc/terms/maximumDepthInMeters"/>
## <field index="23" term="http://rs.tdwg.org/dwc/terms/verbatimDepth"/>
## <field index="24" term="http://rs.tdwg.org/dwc/terms/decimalLatitude"/>
## <field index="25" term="http://rs.tdwg.org/dwc/terms/decimalLongitude"/>
## <field index="26" term="http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem"/>
## <field index="27" term="http://rs.tdwg.org/dwc/terms/verbatimSRS"/>
## <field index="28" term="http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters"/>
## <field index="29" term="http://rs.tdwg.org/dwc/terms/identificationID"/>
## <field index="30" term="http://rs.tdwg.org/dwc/terms/typeStatus"/>
## <field index="31" term="http://rs.tdwg.org/dwc/terms/dateIdentified"/>
## <field index="32" term="http://rs.tdwg.org/dwc/terms/identificationVerificationStatus"/>
## <field index="33" term="http://purl.org/dc/terms/type"/>
## <field index="34" term="http://purl.org/dc/terms/license"/>
## <field index="35" term="http://purl.org/dc/terms/rightsHolder"/>
## <field index="36" term="http://purl.org/dc/terms/accessRights"/>
## <field index="37" term="http://purl.org/dc/terms/bibliographicCitation"/>
## <field index="38" term="http://rs.tdwg.org/dwc/terms/institutionID"/>
## <field index="39" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
## <field index="40" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
## <field index="41" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
## <field index="42" term="http://rs.tdwg.org/dwc/terms/scientificNameID"/>
## <field index="43" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
## <field index="44" term="http://rs.tdwg.org/dwc/terms/phylum"/>
## <field index="45" term="http://rs.tdwg.org/dwc/terms/class"/>
## <field index="46" term="http://rs.tdwg.org/dwc/terms/order"/>
## <field index="47" term="http://rs.tdwg.org/dwc/terms/family"/>
## <field index="48" term="http://rs.tdwg.org/dwc/terms/genus"/>
## <field index="49" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
## <field index="50" term="http://rs.tdwg.org/dwc/terms/taxonomicStatus"/>
## <field index="51" term="http://rs.tdwg.org/dwc/terms/taxonRemarks"/>
## </core>
## <extension encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact">
## <files>
## <location>extendedmeasurementorfact.txt</location>
## </files>
## <coreid index="0" />
## <field index="1" term="NA"/>
## <field index="2" term="http://rs.tdwg.org/dwc/terms/measurementID"/>
## <field index="3" term="http://rs.tdwg.org/dwc/terms/measurementType"/>
## <field index="4" term="http://rs.tdwg.org/dwc/terms/measurementValue"/>
## <field index="5" term="http://rs.tdwg.org/dwc/terms/measurementUnit"/>
## </extension>
## </archive>
Bringing it all together
Now we can generate an archive for each dataset. While I’m generating datasets I’m also populating the RSS feed and creating dataset landing pages.
# Generate, for every dataset, a Darwin Core archive plus a landing page and
# an RSS item. The RSS and landing page templates are filled in by glue()
# from the variables assigned inside the loop (title, abstract, link, ...),
# so those names must stay in sync with the templates.
baseurl <- "https://datasets.obis.org/hosted/isa/"
item_template <- read_file("templates/rss_item.xml")
landing_template <- read_file("templates/index_dataset.html")
items <- list()
pubdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %z")

# Start from an empty output directory.
unlink("output", recursive = TRUE)
dir.create("output")

datasetids <- unique(records$dataset_id)

for (datasetid in datasetids) {

  # The first record of the dataset provides the dataset level metadata.
  dataset <- records %>%
    filter(dataset_id == datasetid) %>%
    head(1)

  # Escape the XML special characters in the abstract. Every occurrence is
  # replaced, and "&" is handled first so that the entities introduced by the
  # later steps are not escaped again.
  # NOTE(review): the entity strings were reconstructed, the rendered source
  # showed them decoded - confirm "&apos;" against the original notebook.
  dataset$Metadata$abstract <- dataset$Metadata$abstract %>%
    str_replace_all(., "&", "&amp;") %>%
    str_replace_all(., ">", "&gt;") %>%
    str_replace_all(., "<", "&lt;") %>%
    str_replace_all(., "'", "&apos;") %>%
    str_replace_all(., "\"", "&quot;")

  title <- dataset$Metadata$title
  abstract <- dataset$Metadata$abstract
  link <- paste0(baseurl, datasetid, "/index.html")
  dwca <- paste0(baseurl, datasetid, "/", datasetid, ".zip")

  # clear dataset directory
  outdir <- paste0("output/", datasetid)
  unlink(outdir, recursive = TRUE)
  dir.create(outdir)

  # RSS feed items

  item <- glue(item_template)
  items[[datasetid]] <- item

  # dataset landing page

  landing <- glue(landing_template)
  writeLines(landing, paste0(outdir, "/index.html"))

  # archive

  dataset_occ <- occ %>% filter(dataset_id == datasetid)
  dataset_mof <- mof %>% filter(dataset_id == datasetid)

  eml <- generate_eml(dataset)
  meta <- generate_meta(occ, mof)

  write.table(dataset_occ, file = paste0(outdir, "/occurrence.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  write.table(dataset_mof, file = paste0(outdir, "/extendedmeasurementorfact.txt"), sep = "\t", row.names = FALSE, na = "", quote = FALSE)
  writeLines(eml, paste0(outdir, "/eml.xml"))
  writeLines(meta, paste0(outdir, "/meta.xml"))

  files <- c("occurrence.txt", "extendedmeasurementorfact.txt", "eml.xml", "meta.xml")

  # Zip from inside the dataset directory so the archive entries carry no
  # directory prefix. The previous working directory is restored even when
  # zipping fails.
  previous_wd <- setwd(outdir)
  tryCatch({
    zip(glue("{datasetid}.zip"), files)
    # Only the zip file is kept next to the landing page.
    file.remove(files)
  }, finally = setwd(previous_wd))

}
Data publishing
In this section all files are uploaded to an S3 bucket. A list of datasets is visible at https://datasets.obis.org/hosted/isa/index.html, and an RSS file is available for the OBIS harvester.
Generate RSS file
Generate landing page
Uploading to S3
# Upload everything below output/ to the obis-datasets bucket under
# hosted/isa/, as publicly readable objects.
# NOTE(review): this presumably deletes only the object whose key is exactly
# "hosted/isa/", not the objects stored under that prefix - confirm that
# stale files from removed datasets do not need to be cleaned up.
delete_object("hosted/isa/", bucket = "obis-datasets")

files <- list.files("output", full.names = TRUE, recursive = TRUE, include.dirs = FALSE)

for (path in files) {
  # Only the leading "output" directory is swapped for the bucket prefix.
  target <- str_replace(path, "^output", "hosted/isa")
  message(target)
  put_object(path, object = target, bucket = "obis-datasets", acl = "public-read")
}