Bundling van Etten 2002 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Etten, J.P.C van (2002) Banc d’Arguin a Nursery for fish species. Master’s Thesis / Essay, Biology.

Setup

Call the necessary libraries and variables. Suppresses loading messages.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file

Input Parameters and Paths

# Locations and identifiers for this dataset. Paths further down are built
# relative to the project root as datasets/<site>/<dataset>/...
path_to_project_root <- "../../.."
site_dir_name        <- "banc_darguin_national_park"
dataset_dir_name     <- "van_Etten_2002"
# File name of the source PDF inside the dataset's "raw" directory
original_pdf         <- "Biol_Msc_2002_JPCvanEtten.CV.pdf"
# Short name used as the prefix of occurrence IDs and in the IPT resource URLs
short_name           <- "van-etten-2002"

Parsing PDF table to CSV

The data for this reference is formatted as an image-based table inside a PDF across multiple sheets. First, we use pdf_to_table to OCR and parse out the table to a CSV.

# Conda environment in which the Python table-extraction script is run
condaenv <- "mwhs-data-mobilization"

# Path to the Python script
script <- file.path(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py")

# Input PDF file path
input_pdf <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf)

# Output directory for OCR/table files
output_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")

# Define page numbers and table areas (see documentation)
page_args <- c(
  "-a 464.354,96.626,720.199,386.897 -p 10"
)

# Define run parameters (see documentation)
run_parameters <- "-s -nh"

# Combine page arguments and execute. The three file-system paths are quoted
# so that a project root or file name containing spaces or shell
# metacharacters is still passed to the script as a single argument. The flag
# strings are deliberately left unquoted because each holds several arguments.
page_args_combined <- paste(page_args, collapse = " ")
command <- paste(
  "conda run -n", condaenv, "python", shQuote(script),
  "-i", shQuote(input_pdf),
  run_parameters, page_args_combined,
  "-o", shQuote(output_dir)
)
system(command, intern = TRUE)
##  [1] ""                                                                                                                              
##  [2] "Script Execution Summary"                                                                                                      
##  [3] "Date and Time: 2023-10-03 16:04:08"                                                                                            
##  [4] "------------------------------"                                                                                                
##  [5] ""                                                                                                                              
##  [6] "PDF input: ../../../datasets/banc_darguin_national_park/van_Etten_2002/raw/Biol_Msc_2002_JPCvanEtten.CV.pdf"                   
##  [7] "Perform Table Parsing: TRUE"                                                                                                   
##  [8] "Selected Areas:"                                                                                                               
##  [9] "  Area 1: [464.354, 96.626, 720.199, 386.897]"                                                                                 
## [10] "Pages: 10"                                                                                                                     
## [11] "Concatenate: False"                                                                                                            
## [12] "Concatenate across headers: True"                                                                                              
## [13] "Stream Extraction: True"                                                                                                       
## [14] "Lattice Extraction: False"                                                                                                     
## [15] ""                                                                                                                              
## [16] "Parsing Tables"                                                                                                                
## [17] "------------------------------"                                                                                                
## [18] ""                                                                                                                              
## [19] ""                                                                                                                              
## [20] "Saving to CSV"                                                                                                                 
## [21] "CSV file(s):"                                                                                                                  
## [22] "\t../../../datasets/banc_darguin_national_park/van_Etten_2002/processed/Biol_Msc_2002_JPCvanEtten.CV_tables_parsed_1.csv"      
## [23] "------------------------------"                                                                                                
## [24] ""                                                                                                                              
## [25] ""                                                                                                                              
## [26] "Run Details: ../../../datasets/banc_darguin_national_park/van_Etten_2002/processed/Biol_Msc_2002_JPCvanEtten.CV_parameters.txt"
## [27] "Finished"                                                                                                                      
## [28] ""

Read source data

Now we’ll read in the csv table outputted from the previous step

# CSV produced by the table-parsing step
processed_csv <- "Biol_Msc_2002_JPCvanEtten.CV_tables_parsed_1.csv"
processed_csv_path <- file.path(
  path_to_project_root, "datasets", site_dir_name, dataset_dir_name,
  "processed", processed_csv
)

# Every line is read as data (no header row), so columns get default names
input_data <- read.csv(processed_csv_path, header = FALSE)

# Preview the first rows as a formatted table
knitr::kable(head(input_data))
V1 V2 V3
Anus Iatisculatus Unnamed: 1 0.000
Athennasp. 0021 0.013
Bathysolea p0111 0 000 0.000
Boops boops 0 001
Dicentrarchus punctatus 0.001 0.00
DvIodussa,gus 0.014 0.014

Preprocessing

Here we tidy up the data, since OCR and table-parsing errors are common. The species names are the main interest; the two site columns (A and F) are kept only so that we can later tell at which site each species was recorded.

Tidy Data

# Name the columns: the scientific name plus one column per sampling site
colnames(input_data) <- c("sciname", "A", "F")

# Blank out two cells by hand. Row 1 / column A holds the parser artefact
# "Unnamed: 1"; row 14 / column F is cleared as well
# (NOTE(review): presumably another OCR artefact - confirm against the PDF).
input_data[1, "A"] <- ""
input_data[14, "F"] <- ""

# No rows or columns are dropped at this stage; the table is carried forward
# as-is under a new name.
cleaned_data <- input_data

# Preview the first rows as a formatted table
knitr::kable(head(cleaned_data))
sciname A F
Anus Iatisculatus 0.000
Athennasp. 0021 0.013
Bathysolea p0111 0 000 0.000
Boops boops 0 001
Dicentrarchus punctatus 0.001 0.00
DvIodussa,gus 0.014 0.014

Get WoRMS IDs

Auto matching

First we will try to do this automatically by cleaning the species names with gnparser and then using the taxize library to query the WoRMS database.

# Run gnparser over the verbatim names (first column of the cleaned table);
# the canonical form of each parsed name is used for the WoRMS lookup below.
parsed_names <- rgnparser::gn_parse(cleaned_data[[1]])

# Look up the WoRMS ID for one parsed name.
#
# The canonical name is searched with fuzzy matching, first restricted to
# accepted names and then, if that finds nothing, without the restriction.
#
# element: one entry of the gnparser result; `element$canonical$full` is the
#          name that is searched.
# Returns the value returned by get_wormsid() for the first search that finds
# a match, or a plain NA when both searches report "not found".
get_worms_id_from_element <- function(element) {
  canonical_name <- element$canonical$full
  is_not_found <- function(hit) attr(hit, "match") == "not found"

  accepted_hit <- get_wormsid(canonical_name, searchtype = "scientific", fuzzy = TRUE, messages = FALSE, accepted = TRUE)
  if (!is_not_found(accepted_hit)) {
    return(accepted_hit)
  }

  any_hit <- get_wormsid(canonical_name, searchtype = "scientific", messages = FALSE, fuzzy = TRUE)
  if (is_not_found(any_hit)) NA else any_hit
}

# Look up every name that gnparser managed to parse; unparsed names get NA
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (!parsed_name$parsed) {
    return(NA)
  }
  get_worms_id_from_element(parsed_name)
})
## 
##         id                           target                     authority
## 1   315777            Tilapia aequatorialis                   Roman, 1971
## 2   315778                  Tilapia affinis                 Duméril, 1861
## 3   315779               Tilapia amphimelas              Hilgendorf, 1905
## 4   405482                Tilapia andersoni             (Castelnau, 1861)
## 5   315780               Tilapia andersonii             (Castelnau, 1861)
## 6   315781               Tilapia angolensis Thys van den Audenaerde, 1969
## 7   315782                  Tilapia arnoldi    Gilchrist & Thompson, 1917
## 8  1531221                Tilapia athiensis             (Boulenger, 1916)
## 9   315783                    Tilapia aurea          (Steindachner, 1864)
## 10 1016990                 Tilapia borkuana               Pellegrin, 1919
## 11 1018764               Tilapia boulengeri               Pellegrin, 1903
## 12  315784                   Tilapia browni                 Nichols, 1923
## 13  405520        Tilapia caeruleomaculatus            (Rochebrune, 1880)
## 14  315785                 Tilapia calciati             Gianferrari, 1924
## 15  315786               Tilapia cancellata                 Nichols, 1923
## 16  315787                 Tilapia christyi               Boulenger, 1915
## 17 1529182              Tilapia crassispina               Arambourg, 1948
## 18  315789                   Tilapia druryi    Gilchrist & Thompson, 1917
## 19  315790                    Tilapia dubia                Lönnberg, 1904
## 20  405499                 Tilapia dumerili          (Steindachner, 1864)
## 21  315791                Tilapia dumerilii          (Steindachner, 1864)
## 22 1017654               Tilapia eduardiana               Boulenger, 1912
## 23 1620002              Tilapia esduardiana               Boulenger, 1912
## 24  405522                Tilapia faidherbi            (Rochebrune, 1880)
## 25  315792                 Tilapia galilaea              (Linnaeus, 1758)
## 26 1626622        Tilapia galilaea borkuana               Pellegrin, 1919
## 27 1626623      Tilapia galilaea boulengeri               Pellegrin, 1903
## 28 1626904   Tilapia galilaea multifasciata               (Günther, 1903)
## 29  315793                Tilapia gefuensis Thys van den Audenaerde, 1964
## 30  315794              Tilapia grandidieri               (Sauvage, 1882)
## 31  282977               Tilapia guineensis               (Bleeker, 1863)
## 32  367733               Tilapia guineensis               (Günther, 1862)
## 33  405509                Tilapia guinensis               (Günther, 1862)
## 34  405508                Tilapia heudeloti                 Duméril, 1861
## 35  315795               Tilapia heudelotii                 Duméril, 1861
## 36  315796                 Tilapia hornorum                Trewavas, 1966
## 37 1015667                  Tilapia inducta                Trewavas, 1933
## 38  315798                Tilapia kafuensis               Boulenger, 1912
## 39  315799                  Tilapia kashabi                  Elster, 1958
## 40  315800                 Tilapia kirkhami    Gilchrist & Thompson, 1917
## 41  315801                  Tilapia korogwe                  (Lowe, 1955)
## 42  315802                     Tilapia lata               (Günther, 1862)
## 43  315803                Tilapia lateralis                 Duméril, 1861
## 44  315804                Tilapia latifrons               Boulenger, 1906
## 45  315805                Tilapia lemassoni          Blache & Miton, 1960
## 46  315806                Tilapia leonensis Thys van den Audenaerde, 1971
## 47 1048608                    Tilapia louka Thys van den Audenaerde, 1969
## 48  315808                 Tilapia mackeani    Gilchrist & Thompson, 1917
## 49  315809              Tilapia macrocentra                 Duméril, 1861
## 50  315810             Tilapia macrocephala               (Bleeker, 1862)
## 51  315810             Tilapia macrocephala               (Bleeker, 1862)
## 52  315811         Tilapia madagascariensis               (Liénard, 1891)
## 53  315812                 Tilapia manyarae              Hilgendorf, 1905
## 54  282978                   Tilapia mariae               Boulenger, 1899
## 55  315813                    Tilapia meeki               Pellegrin, 1911
## 56  315814             Tilapia melanopleura                 Duméril, 1861
## 57  315815             Tilapia melanotheron               (Rüppell, 1852)
## 58  315817             Tilapia microcephala               (Günther, 1862)
## 59  315818               Tilapia microstoma                (Lortet, 1883)
## 60  315819                   Tilapia monodi                   Daget, 1954
## 61  405491                Tilapia mosambica                (Peters, 1852)
## 62  315820               Tilapia mossambica                (Peters, 1852)
## 63  324268       Tilapia mossambica korogwe                    Lowe, 1955
## 64  315821              Tilapia mossambicus                (Peters, 1852)
## 65  405498               Tilapia mozambique                (Peters, 1852)
## 66  323880 Tilapia multifasciata macrostoma               Pellegrin, 1941
## 67 1622281           Tilapia multifasciatus               (Günther, 1903)
## 68 1627261              Tilapia nigra nigra               (Günther, 1894)
## 69  315824              Tilapia nigripinnis               Guichenot, 1861
## 70  315825                 Tilapia nilotica              (Linnaeus, 1758)
## 71 1626075       Tilapia nilotica athiensis               Boulenger, 1916
## 72  324270      Tilapia nilotica cancellata                 Nichols, 1923
## 73 1626612      Tilapia nilotica eduardiana               Boulenger, 1912
## 74 1626613          Tilapia nilotica regani                    Poll, 1932
## 75  405501                Tilapia nilotious              (Linnaeus, 1758)
## 76  315826                  Tilapia nyirica                Lönnberg, 1911
## 77  315827             Tilapia oligacanthus                 Bleeker, 1868
## 78 1017494                Tilapia percivali               Boulenger, 1912
## 79 1619970                  Tilapia placida                Trewavas, 1941
## 80  315829              Tilapia pleuromelas                 Duméril, 1861
## 81  315830               Tilapia polycentra                 Duméril, 1861
## 82  315831                   Tilapia rangii                 Duméril, 1861
## 83 1015754                   Tilapia regani                    Poll, 1932
## 84  282979                 Tilapia rendalli             (Boulenger, 1897)
## 85  315832                  Tilapia ruvumae                Trewavas, 1966
## 86 1010624              Tilapia sanagaensis Thys van den Audenaerde, 1966
## 87  315834               Tilapia shariensis                  Fowler, 1949
## 88  324273          Tilapia shirana chilwae                Trewavas, 1966
## 89 1626906           Tilapia spilurus nigra               (Günther, 1894)
## 90  315836               Tilapia swierstrae    Gilchrist & Thompson, 1917
## 91  315837                  Tilapia sykesii    Gilchrist & Thompson, 1917
## 92  315839                Tilapia tristrami               (Günther, 1860)
## 93  315840                 Tilapia urolepis                  Norman, 1922
## 94 1015679                  Tilapia vulcani                Trewavas, 1933
## 95 1620043                    Tilapia zilii               (Gervais, 1848)
## 96  405516                   Tilapia zillei               (Gervais, 1848)
## 97  405519                    Tilapia zilli               (Gervais, 1848)
## 98  282980                   Tilapia zillii               (Gervais, 1848)
## 99  323689        Tilapia zillii guineensis               (Günther, 1862)
##         status
## 1   unaccepted
## 2   unaccepted
## 3   unaccepted
## 4   unaccepted
## 5   unaccepted
## 6   unaccepted
## 7   unaccepted
## 8   unaccepted
## 9   unaccepted
## 10  unaccepted
## 11  unaccepted
## 12  unaccepted
## 13  unaccepted
## 14  unaccepted
## 15  unaccepted
## 16  unaccepted
## 17  unaccepted
## 18  unaccepted
## 19  unaccepted
## 20  unaccepted
## 21  unaccepted
## 22  unaccepted
## 23 misspelling
## 24  unaccepted
## 25  unaccepted
## 26  unaccepted
## 27  unaccepted
## 28  unaccepted
## 29  unaccepted
## 30  unaccepted
## 31  unaccepted
## 32  unaccepted
## 33  unaccepted
## 34  unaccepted
## 35  unaccepted
## 36  unaccepted
## 37  unaccepted
## 38  unaccepted
## 39  unaccepted
## 40  unaccepted
## 41  unaccepted
## 42  unaccepted
## 43  unaccepted
## 44  unaccepted
## 45  unaccepted
## 46  unaccepted
## 47  unaccepted
## 48  unaccepted
## 49  unaccepted
## 50  unaccepted
## 51  unaccepted
## 52  unaccepted
## 53  unaccepted
## 54  unaccepted
## 55  unaccepted
## 56  unaccepted
## 57  unaccepted
## 58  unaccepted
## 59  unaccepted
## 60  unaccepted
## 61  unaccepted
## 62  unaccepted
## 63  unaccepted
## 64  unaccepted
## 65  unaccepted
## 66  unaccepted
## 67  unaccepted
## 68  unaccepted
## 69  unaccepted
## 70  unaccepted
## 71  unaccepted
## 72  unaccepted
## 73  unaccepted
## 74  unaccepted
## 75  unaccepted
## 76  unaccepted
## 77  unaccepted
## 78  unaccepted
## 79  unaccepted
## 80  unaccepted
## 81  unaccepted
## 82  unaccepted
## 83  unaccepted
## 84  unaccepted
## 85  unaccepted
## 86  unaccepted
## 87  unaccepted
## 88  unaccepted
## 89  unaccepted
## 90  unaccepted
## 91  unaccepted
## 92  unaccepted
## 93  unaccepted
## 94  unaccepted
## 95 misspelling
## 96  unaccepted
## 97  unaccepted
## 98  unaccepted
## 99  unaccepted
## 
## More than one WORMS ID found for taxon 'Tilapia'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
# Combine original names, parsed data and WoRMS ID into one data frame.
# One single-row frame is built per input row and all of them are bound in one
# call, which avoids re-copying the growing frame on every iteration.
# seq_len() keeps an empty input from being iterated as rows 1 and 0.
combined_rows <- lapply(seq_len(nrow(cleaned_data)), function(i) {
  canonical_value <- parsed_names[[i]]$canonical$full
  # Names gnparser could not parse have no canonical form
  if (is.null(canonical_value)) {
    canonical_value <- NA
  }
  # Passing the source row under one name prefixes its columns with
  # "CleanedData." in the result; only the first returned ID is kept.
  data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    WormsIDs = worms_ids[[i]][1]
  )
})

combined_dataframe <- if (length(combined_rows) > 0) {
  do.call(rbind, combined_rows)
} else {
  data.frame()
}

knitr::kable(head(combined_dataframe))
CleanedData.sciname CleanedData.A CleanedData.F CanonicalFull WormsIDs
Anus Iatisculatus 0.000 Anus NA
Athennasp. 0021 0.013 NA NA
Bathysolea p0111 0 000 0.000 Bathysolea 126126
Boops boops 0 001 Boops boops 127047
Dicentrarchus punctatus 0.001 0.00 Dicentrarchus punctatus 126976
DvIodussa,gus 0.014 0.014 NA NA

Human Verification

Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search for unmatched species in WoRMS and add the ID, and remove rows that were not autoremoved in the earlier cleaning steps

# Hand-verified corrections: for each listed row, overwrite columns 4 and 5
# (CanonicalFull and WormsIDs) with the checked name and its WoRMS ID.
# Rows not listed here keep their automatically matched values.
combined_dataframe[1, 4:5] <- c("Arius latiscutatus", 275576)
combined_dataframe[2, 4:5] <- c("Atherina", 125659)
combined_dataframe[3, 4:5] <- c("Bathysolea polli", 274297)
combined_dataframe[6, 4:5] <- c("Diplodus sargus", 127053)
combined_dataframe[7, 4:5] <- c("Ephippion guttiferum", 403504)
combined_dataframe[8, 4:5] <- c("Epinephelus aeneus", 127032)
combined_dataframe[9, 4:5] <- c("Ethmalosa fimbriata", 280725)
combined_dataframe[10, 4:5] <- c("Gobius microps", 151516)
combined_dataframe[12, 4:5] <- c("Hippocampus hippocampus", 127380)
combined_dataframe[13, 4:5] <- c("Liza falcipinnis", 273639)
combined_dataframe[14, 4:5] <- c("Loligo", 138139)
combined_dataframe[15, 4:5] <- c("Mugil cephalus", 126983)
combined_dataframe[16, 4:5] <- c("Eucinostomus melanopterus", 276423)
combined_dataframe[17, 4:5] <- c("Sardinella", 125721)
combined_dataframe[18, 4:5] <- c("Sardinella aurita", 126422)
combined_dataframe[20, 4:5] <- c("Solea senegalensis", 127159)
combined_dataframe[21, 4:5] <- c("Solea vulgaris", 154712)
combined_dataframe[22, 4:5] <- c("Stephanolepis hispidus", 127409)
combined_dataframe[24, 4:5] <- c("Syngnathus typhle", 127393)
combined_dataframe[25, 4:5] <- c("Tilapia", 271096)

Locality data

Locality data was retrieved via georeferencing the included site maps from the paper. These maps have been saved as TIFs and points saved as a csv.

# Empty result with the intended column schema; returned unchanged when no
# species has a value at either site.
occ_data <- data.frame(
  canonicalFull = character(),
  wormsIDs = numeric(),
  locality = character(),
  fieldNumber = character(),
  decimalLongitude = numeric(),
  decimalLatitude = numeric(),
  coordinateUncertaintyInMeters = numeric()
)

# The two sampling sites. `source_column` is the position in
# `combined_dataframe` of the column holding that site's values (2 = A, 3 = F).
# Coordinates and uncertainty are numeric, matching the schema above.
sampling_sites <- data.frame(
  fieldNumber = c("A", "F"),
  source_column = c(2L, 3L),
  locality = c("Baie d'Aouatif, Arie flat", "Baie d'Aouatif, Francesc flat"),
  decimalLongitude = c(-16.2790723, -16.2864898),
  decimalLatitude = c(19.8839, 19.8765081),
  coordinateUncertaintyInMeters = c(50, 50)
)

# A cell counts as a record when it is neither missing nor empty. The NA check
# comes first so that a missing value cannot reach the string comparison.
has_record <- function(value) {
  !is.na(value) && value != ""
}

# Build the occurrence rows for each species: one row per site with a record,
# site A before site F, which keeps the original row order.
occ_rows <- lapply(seq_len(nrow(combined_dataframe)), function(i) {
  recorded <- vapply(
    sampling_sites$source_column,
    function(column) has_record(combined_dataframe[i, column]),
    logical(1)
  )
  if (!any(recorded)) {
    return(NULL)
  }
  sites <- sampling_sites[recorded, ]
  data.frame(
    canonicalFull = combined_dataframe[i, "CanonicalFull"],
    wormsIDs = combined_dataframe[i, "WormsIDs"],
    locality = sites$locality,
    fieldNumber = sites$fieldNumber,
    decimalLongitude = sites$decimalLongitude,
    decimalLatitude = sites$decimalLatitude,
    coordinateUncertaintyInMeters = sites$coordinateUncertaintyInMeters
  )
})
occ_rows <- Filter(Negate(is.null), occ_rows)

# Bind everything in one call instead of growing the frame row by row
if (length(occ_rows) > 0) {
  occ_data <- do.call(rbind, occ_rows)
}

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

Create a dataframe with unique taxa only (though this should already be unique). This will be our primary DarwinCore data frame.

# Rename to the Darwin Core term names and turn each WoRMS ID into an LSID;
# rows without an ID keep NA.
occurrence <- occ_data %>%
  rename(scientificName = canonicalFull, scientificNameID = wormsIDs) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )

occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and a hash based on the scientific name and locality.

# digest() is not vectorized: given a vector it returns a single hash for the
# whole vector. Wrap it so that each element gets its own hash.
vdigest <- Vectorize(digest)

# Generate occurrenceID: <short_name>:occurrence:<md5 of "name locality">
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(paste(scientificName, locality), algo = "md5"),
      sep = ":"
    )
  )

eventDate

These specimens were collected between 27 January 2002 and 18 March 2002.

# Sampling period as a start/end date interval, applied to every record
eventDate <- "2002-01-27/2002-03-18"
occurrence <- occurrence %>%
  mutate(eventDate = eventDate)

decimalLongitude/decimalLatitude

Locality data was retrieved via georeferencing the included site maps from the paper. These maps have been saved as TIFs and points saved as a csv. First we will use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. See above.

The calculations below are used to calculate the boundaries for the EML file.

# Local copy of the marine World Heritage site shapes
gpkg_path <- file.path(path_to_project_root, "scripts_data/marine_world_heritage.gpkg")

# Download the GeoPackage only when it is not already present. A GeoPackage is
# a binary file, so it must be written in binary mode ("wb"); with the default
# text mode the download is corrupted on Windows.
if (!file.exists(gpkg_path)) {
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}

shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326
# For some sites the GeoPackage has core as well as buffer areas, so collapse
# the features to one merged geometry per site name.
shapes_processed <- summarise(group_by(shapes, name))

# Geometry of Banc d'Arguin National Park
park_name <- "Banc d'Arguin National Park"
park_rows <- which(shapes_processed$name == park_name)
ind_shape <- shapes_processed$geom[park_rows]

occurrenceStatus

# Every record is a presence
occurrenceStatus <- "present"
occurrence <- occurrence %>%
  mutate(occurrenceStatus = occurrenceStatus)

basisOfRecord

# Same basis of record for every row
basisOfRecord <- "HumanObservation"
occurrence <- occurrence %>%
  mutate(basisOfRecord = basisOfRecord)

Extra Terms

geodeticDatum

# Datum of the coordinates, applied to every record
geodeticDatum <- "WGS84"
occurrence <- occurrence %>%
  mutate(geodeticDatum = geodeticDatum)

country

# Country of the sampling sites, applied to every record
country <- "Mauritania"
occurrence <- occurrence %>%
  mutate(country = country)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

# Final column order of the occurrence table; columns not listed are dropped
dwc_column_order <- c(
  "occurrenceID", "scientificName", "scientificNameID", "eventDate",
  "country", "locality", "fieldNumber", "decimalLatitude",
  "decimalLongitude", "coordinateUncertaintyInMeters", "geodeticDatum",
  "occurrenceStatus", "basisOfRecord"
)
occurrence <- occurrence %>%
  select(all_of(dwc_column_order))

# Report missing required fields or values
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 0 × 0

Create the EML file

This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.

# Set the EML schema version to 2.1.1 before the metadata is assembled
emld::eml_version("eml-2.1.1")
## [1] "eml-2.1.1"
# Title of the dataset
title <- "Banc d'Arguin a Nursery for fish species"

# Alternate identifier: the IPT resource URL for this dataset
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)

# Abstract. The OCR misreading "Cvmodocea" is corrected to "Cymodocea" so the
# published metadata carries the intended name.
abstract <- eml$abstract(
  para = "In the period from 27-01-2002 until 18-03-2002 126 samples, totalling an area of 6139.58 m2, on different substrates (Zostera, Tidal pool, Sand, Gully Zostera and Gully Cymodocea) were collected with a Beam trawl in the Baie d'Aouatif, located in the Parc National du Banc d'Arguin, Mauritania. This was done to test the hypothesis that the Banc d'Arguin has an ecological function as a nursery for juvenile fish. 347 individuals of 25 different species were caught, 2 of those species belonged to the class of Cephalopoda. Of the twenty-three fish species, those species belonging to the family of Gobiidae and the Sub-family Syngnathinae were the most common species."
)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use, interpretation of a data set.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

# Author of the original thesis
creator <- list(eml$creator(
    individualName = eml$individualName(
      givenName = "J.P.C.",
      surName = "van Etten"),
    organizationName = "Rijks Universiteit Groningen"
  )
)

# Contact for questions about the dataset. Built with the contact constructor
# (rather than the creator one) so that the element matches its role.
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# The same person provided the metadata and mobilized the data, so the name
# is defined once and reused for both roles.
curator_name <- eml$individualName(
  givenName = "Chandra",
  surName = "Earl"
)

metadataProvider <- eml$metadataProvider(
  individualName = curator_name,
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = curator_name,
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Creation timestamp with milliseconds and a "+HH:MM" UTC offset. The clock is
# read once so that the time and the offset always come from the same instant.
eml_created_at <- Sys.time()
utc_offset <- format(eml_created_at, "%z")
date_stamp <- paste0(
  format(eml_created_at, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        # Citation of the original resource ("Banc", not the OCR typo "Bane")
        citation = "Etten, J.P.C van (2002) Banc d'Arguin a Nursery for fish species. Master's Thesis / Essay, Biology.")
    )
  )
)

# Link to the original thesis record
citationdoi <- "https://fse.studenttheses.ub.rug.nl/id/eprint/9188"

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

# Coverage: geographic, taxonomic and temporal extent of the dataset.

# Bounding box of the site polygon, computed once. West/east are the minimum/
# maximum x (longitude) and south/north the minimum/maximum y (latitude).
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Banc d'Arguin National Park",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin)
    ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes")
      )

  ),
  temporalCoverage = eml$temporalCoverage(
    rangeOfDates = eml$rangeOfDates(
      beginDate = eml$beginDate(
        calendarDate = "2002-01-27"
      ),
      endDate = eml$endDate(
        calendarDate = "2002-03-18"
      )
    )
   )
)

Extra MetaData

These fields are not required, though they make the metadata more complete.

# Method description: links to the project repository and to the rendered
# notebook for this site and dataset.
notebook_url <- paste0(
  "https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name, "/", dataset_dir_name
)
methods_text <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"",
  notebook_url,
  "\"> R Notebook</a> for dataset construction methods"
)

methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(para = methods_text)
  )
)

# Publication date of the dataset
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

# Dataset type keyword and the vocabulary it is taken from
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# No further updates are planned; the description is left empty
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

# Rights statement: public-domain dedication (CC0 1.0)
cc0_waiver_text <- "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
intellectualRights <- eml$intellectualRights(para = cc0_waiver_text)

# Why the dataset was published
purpose_text <- "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
purpose <- eml$purpose(para = purpose_text)

additionalInfo <- eml$additionalInfo(para = "marine, harvested by iOBIS")

Create and Validate EML

# Assemble the <dataset> node from the metadata pieces defined earlier in the
# notebook, then wrap it (together with the additional metadata) in the EML
# root element.
dataset_node <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)

my_eml <- eml$eml(
  packageId = paste0(
    "https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0"
  ),
  system = "http://gbif.org",
  scope = "system",
  dataset = dataset_node,
  additionalMetadata = additionalMetadata
)

# Check the assembled document against the EML schema
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This file describes the archive and data file structure and is required in a Darwin Core Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”.

# Load the meta.xml template and fill in dataset-wide default values for the
# Darwin Core terms handled below.
meta_template <- file.path(
  path_to_project_root,
  "scripts_data/meta_occurrence_occurrence_template.xml"
)
meta <- read_xml(meta_template)

fields <- xml_find_all(meta, "//d1:field")

# Write `value` into the "default" attribute of a single <field> node
set_field_default <- function(node, value) {
  xml_set_attr(node, "default", value)
}

for (i in seq_along(fields)) {
  field <- fields[[i]]
  term <- xml_attr(field, "term")
  if (term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    set_field_default(field, eventDate)
  } else if (term == "http://rs.tdwg.org/dwc/terms/country") {
    set_field_default(field, country)
  } else if (term == "http://rs.tdwg.org/dwc/terms/geodeticDatum") {
    set_field_default(field, geodeticDatum)
  } else if (term == "http://rs.tdwg.org/dwc/terms/occurrenceStatus") {
    set_field_default(field, occurrenceStatus)
  } else if (term == "http://rs.tdwg.org/dwc/terms/basisOfRecord") {
    set_field_default(field, basisOfRecord)
  }
}

Save outputs

# Directory that receives the three components of the Darwin Core Archive
dwc_output_dir <- file.path(
  path_to_project_root, "output", site_dir_name, dataset_dir_name
)

occurrence_csv_path <- paste0(dwc_output_dir, "/occurrence.csv")
meta_xml_path <- paste0(dwc_output_dir, "/meta.xml")
eml_xml_path <- paste0(dwc_output_dir, "/eml.xml")

# Occurrence table: missing values are written as empty strings, no row names
write.csv(occurrence, occurrence_csv_path, na = "", row.names = FALSE)
# Archive descriptor and dataset metadata
write_xml(meta, file = meta_xml_path)
write_eml(my_eml, eml_xml_path)

Edit EML

We have to further edit the EML file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the children of the GBIF element, since the construction automatically arranges child nodes in alphabetical order.

# Re-open the eml.xml written above and post-process it for GBIF: adjust the
# root attributes and reorder the children of the <gbif> element. The result
# is written back to the same file at the end of this chunk.
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))

# Root element attributes: point schemaLocation at the GBIF EML profile, add
# the dc namespace, drop the stmml namespace (setting an attribute to NULL
# removes it) and set the document language.
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


# Capture the nodes to be moved BEFORE removing them from the document.
# NOTE(review): relies on the captured node sets staying usable after
# xml_remove() so they can be re-inserted below -- confirm against xml2 docs.
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
# The bibliography citation stays in place; only its identifier attribute is set
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
xml_set_attr(bibcitation, "identifier", citationdoi)

# Detach the three nodes from their current positions ...
eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
# ... and re-insert each one at position 0 of <gbif>. Because every insert
# goes to the front, the statements run in reverse of the intended order:
# the resulting leading children are dateStamp, hierarchyLevel, citation.
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)

# Overwrite eml.xml with the edited document
write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))

Zip files to DwC-A

# Bundle the contents of the output directory into a single DwC-A.zip.
output_zip <- file.path(dwc_output_dir, "DwC-A.zip")

# Remove any archive left over from a previous run so it is not picked up by
# the directory listing below.
if (file.exists(output_zip)) {
  unlink(output_zip)
}

# Everything currently in the output directory goes into the archive
file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip::zip(zipfile = output_zip, files = file_paths, mode = "cherry-pick")

# Delete the loose files only once the archive is confirmed to exist
if (file.exists(output_zip)) {
  unlink(file_paths)
}