Bundling Hutchins 1990 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Hutchins, J. (1990). Fish survey of South Passage Shark Bay, Western Australia. In Berry, P., Bradshaw, S. & Wilson, B. (eds), Research in Shark Bay. Report of the France-Australe Bicentenary Expedition Committee. Western Australia Museum. Pp. 263-278.

Setup

Load the necessary libraries, suppressing the startup messages of the noisier packages.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file
library(rmapshaper)                     # To simplify shapefiles

Input Parameters and Paths

# Relative path from this notebook to the project root
path_to_project_root <- "../../.."
# Directory names that locate this dataset under <root>/datasets/
site_dir_name <- "shark_bay_western_australia"
dataset_dir_name <- "Hutchins_1990"
# File name of the source PDF inside the dataset's raw/ directory
original_pdf <- "391311.pdf"
# Dataset short name; used as the prefix of occurrence IDs and in the IPT resource URLs
short_name <- "shark-bay-hutchins-1990"

Parsing PDF table to CSV

The data for this reference is formatted as an image-based table inside a PDF across multiple sheets. First, we use pdf_to_table to OCR and parse out the table to a CSV.

# Conda environment in which the Python script is run
condaenv <- "mwhs-data-mobilization"

# Path to the Python script
script <- file.path(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py")

# Input PDF file path
input_pdf <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf)

# Output directory for OCR/table files
output_dir <- file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed")

# Define page numbers and table areas (see documentation).
# Each entry pairs one table area (-a) with the page (-p) it is read from.
page_args <- c(
  "-a 552.977,67.201,587.331,229.93 -p 5",
  "-a 102.617,38.063,582.509,276.791 -p 6",
  "-a 97.904,59.581,580.537,297.903 -p 7",
  "-a 104.794,36.754,584.111,264.566 -p 8",
  "-a 101.1,62.1,584.1,320.1 -p 9",
  "-a 104.649,38.138,587.318,267.573 -p 10",
  "-a 101.482,47.318,587.165,275.28 -p 11",
  "-a 106.244,59.162,590.693,310.06 -p 12",
  "-a 103.862,41.962,585.379,290.16 -p 13",
  "-a 103.226,49.634,285.317,315.158 -p 14"
)

# Define run parameters (see documentation)
run_parameters <- "-s -c -f -nh -ocr -# 8"

# Combine page arguments and execute.
# The environment name and every path are shell-quoted so that directory or
# file names containing spaces or shell metacharacters stay single arguments.
# run_parameters and page_args hold several flags each and are passed as-is.
page_args_combined <- paste(page_args, collapse = " ")
command <- paste(
  "conda run -n", shQuote(condaenv),
  "python", shQuote(script),
  "-i", shQuote(input_pdf),
  run_parameters,
  page_args_combined,
  "-o", shQuote(output_dir)
)
system(command, intern = TRUE)

##  [1] ""                                                                                                                     
##  [2] "Script Execution Summary"                                                                                             
##  [3] "Date and Time: 2023-09-13 12:56:58"                                                                                   
##  [4] "------------------------------"                                                                                       
##  [5] ""                                                                                                                     
##  [6] "PDF input: ../../../datasets/shark_bay_western_australia/Hutchins_1990/raw/391311.pdf"                                
##  [7] "Perform OCR: True"                                                                                                    
##  [8] "Number of Cores: 8"                                                                                                   
##  [9] "Perform Table Parsing: TRUE"                                                                                          
## [10] "Selected Areas:"                                                                                                      
## [11] "  Area 1: [552.977, 67.201, 587.331, 229.93]"                                                                         
## [12] "  Area 2: [102.617, 38.063, 582.509, 276.791]"                                                                        
## [13] "  Area 3: [97.904, 59.581, 580.537, 297.903]"                                                                         
## [14] "  Area 4: [104.794, 36.754, 584.111, 264.566]"                                                                        
## [15] "  Area 5: [101.1, 62.1, 584.1, 320.1]"                                                                                
## [16] "  Area 6: [104.649, 38.138, 587.318, 267.573]"                                                                        
## [17] "  Area 7: [101.482, 47.318, 587.165, 275.28]"                                                                         
## [18] "  Area 8: [106.244, 59.162, 590.693, 310.06]"                                                                         
## [19] "  Area 9: [103.862, 41.962, 585.379, 290.16]"                                                                         
## [20] "  Area 10: [103.226, 49.634, 285.317, 315.158]"                                                                       
## [21] "Pages: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14"                                                                             
## [22] "Concatenate: True"                                                                                                    
## [23] "Concatenate across headers: True"                                                                                     
## [24] "Stream Extraction: True"                                                                                              
## [25] "Lattice Extraction: False"                                                                                            
## [26] ""                                                                                                                     
## [27] "OCRing PDF"                                                                                                           
## [28] "------------------------------"                                                                                       
## [29] ""                                                                                                                     
## [30] ""                                                                                                                     
## [31] "Parsing Tables"                                                                                                       
## [32] "------------------------------"                                                                                       
## [33] ""                                                                                                                     
## [34] ""                                                                                                                     
## [35] "Saving to CSV"                                                                                                        
## [36] "CSV file: ../../../datasets/shark_bay_western_australia/Hutchins_1990/processed/391311_tables_parsed_concatenated.csv"
## [37] "------------------------------"                                                                                       
## [38] ""                                                                                                                     
## [39] ""                                                                                                                     
## [40] "Run Details: ../../../datasets/shark_bay_western_australia/Hutchins_1990/processed/391311_parameters.txt"             
## [41] "Finished"                                                                                                             
## [42] ""

Read source data

Now we’ll read in the csv table outputted from the previous step

# File name of the concatenated table written by the parsing step
processed_csv <- "391311_tables_parsed_concatenated.csv"

# Load it from <root>/datasets/<site>/<dataset>/processed/
input_data <- read.csv(
  file.path(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", processed_csv)
)

# Preview the first rows as a formatted table
input_data %>%
  head() %>%
  knitr::kable()

X0	X1	X2
DASYATIDIDAE	Unnamed: 1
Dasyatis brevicaudata Trion.	1875)
[limantura uarnak (Forsskal, 1775)
MYLIOBATIDIDAE	Unnamed: 1	Unnamed: 2
Aetobatus narinart (Euphrasen, 1790)
MOBULIDAE

Preprocessing

Here we tidy the data up, since OCR and table-parsing errors are common. Because this is a checklist, we keep only the list of species.

Tidy Data

# Drop empty rows/columns and give the columns sensible (lowercase) names
input_data <- input_data %>%
  remove_empty(c("rows", "cols")) %>%
  clean_names()

# Manual fix for row 274: write the species name into the first column by hand
# (the original note singles this record out as a one-off parsing case)
input_data[274, 1] <- "Macropharyngodon ornatus"

# Take only the first column
input_data <- input_data %>%
  select(x0)

# Remove lines with only one word and lines with all uppercase letters
cleaned_data <- input_data %>%
  filter(
    str_count(x0, "\\S+") > 1,
    !str_detect(x0, "^[A-Z\\s]+$")
  )

# For elements with an abbreviated genus name (a single letter followed by "."
# or "," at the start of the line), look at the previous element and grab its
# genus. Rows are processed in order so that a run of abbreviated names all
# inherit the genus that was spelled out above them.
# The guard uses the same anchored pattern as the substitution, and
# seq_len(...)[-1] yields an empty sequence when there are fewer than two rows
# (2:nrow() would count backwards and fail).
abbreviated_genus <- "^[A-Za-z][.,]"
for (i in seq_len(nrow(cleaned_data))[-1]) {
  if (grepl(abbreviated_genus, cleaned_data$x0[i])) {
    previous_genus <- strsplit(cleaned_data$x0[i - 1], " ")[[1]][1]
    cleaned_data$x0[i] <- sub(abbreviated_genus, paste0(previous_genus, " "), cleaned_data$x0[i])
  }
}


# Preview the first rows as a formatted table
knitr::kable(head(cleaned_data))

x0
Dasyatis brevicaudata Trion.
[limantura uarnak (Forsskal, 1775)
Aetobatus narinart (Euphrasen, 1790)
Manta birostris (Donndorff, 1798)
Rhynchobatus djiddensis (Forsskal,
Carcharhinus brevipinna (Midler

Get WoRMS IDs

Auto matching

First we will try to do this automatically by cleaning the species names with gnparser and then using the taxize library to query the WoRMS database.

# Run gnparser over the cleaned name strings to separate the canonical name
# from the authorship
parsed_names <- rgnparser::gn_parse(cleaned_data[["x0"]])

# Look up the WoRMS ID for one parsed name.
#   element: a single gnparser result; its canonical$full field is the query.
# The first lookup is restricted to accepted names. If that finds nothing, the
# lookup is repeated without the restriction. Returns the get_wormsid() result
# of whichever lookup matched, or NA when neither did.
get_worms_id_from_element <- function(element) {
  query <- element$canonical$full

  worms_id <- get_wormsid(query, searchtype = "scientific", fuzzy = TRUE, messages = FALSE, accepted = TRUE)
  if (attr(worms_id, "match") != "not found") {
    return(worms_id)
  }

  worms_id <- get_wormsid(query, searchtype = "scientific", messages = FALSE, fuzzy = TRUE)
  if (attr(worms_id, "match") == "not found") {
    NA
  } else {
    worms_id
  }
}

# Resolve a WoRMS ID for every parsed name; names that gnparser could not
# parse are given NA without querying WoRMS
worms_ids <- lapply(parsed_names, function(element) {
  if (!element$parsed) {
    return(NA)
  }
  get_worms_id_from_element(element)
})

##

##       id                  target        authority     status
## 1 209375 Cheilodipterus lineatus (Forsskål, 1775) unaccepted
## 2 401515 Cheilodipterus lineatus   Lacepède, 1801 unaccepted

## 
## More than one WORMS ID found for taxon 'Cheilodipterus lineatus'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):

# Combine original names, parsed data and WoRMS ID into one data frame.
# One single-row data frame is built per cleaned name and all of them are
# bound in a single rbind() call; growing the data frame inside the loop
# re-copies it on every iteration. seq_len() also keeps the zero-row case
# from iterating over c(1, 0).
combined_rows <- lapply(seq_len(nrow(cleaned_data)), function(i) {
  canonical_value <- parsed_names[[i]]$canonical$full
  if (is.null(canonical_value)) {
    # gnparser produced no canonical form for this string
    canonical_value <- NA
  }
  data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    WormsIDs = worms_ids[[i]][1]
  )
})

combined_dataframe <- if (length(combined_rows) > 0) {
  do.call(rbind, combined_rows)
} else {
  data.frame()
}

knitr::kable(head(combined_dataframe))

CleanedData	CanonicalFull	WormsIDs
Dasyatis brevicaudata Trion.	Dasyatis brevicaudata	212249
[limantura uarnak (Forsskal, 1775)	NA	NA
Aetobatus narinart (Euphrasen, 1790)	Aetobatus narinart	NA
Manta birostris (Donndorff, 1798)	Manta birostris	105857
Rhynchobatus djiddensis (Forsskal,	Rhynchobatus djiddensis	217378
Carcharhinus brevipinna (Midler	Carcharhinus brevipinna	105788

Human Verification

Sometimes there are misspellings in the original text or OCR errors that can be searched for and fixed by hand. To do this, view the combined dataframe, search WoRMS for the unmatched species and add their IDs, and remove any rows that were not automatically removed in the earlier cleaning steps.

# Hand-curated corrections, keyed by row number of combined_dataframe.
# Columns 2:3 are CanonicalFull and WormsIDs; each assignment stores the
# corrected name and its WoRMS ID (both are stored as character because they
# are combined with c()).
# The row numbers depend on the exact output of the cleaning steps above.
combined_dataframe[2, 2:3] <- c("Himantura uarnak", 105854)
combined_dataframe[3, 2:3] <- c("Aetobatus narinari", 217426)
combined_dataframe[9, 2:3] <- c("Chiloscyllium punctatum", 277833)
combined_dataframe[12, 2:3] <- c("Gymnothorax eurostus", 217496)
combined_dataframe[13, 2:3] <- c("Gymnothorax prasinus", 271873)
combined_dataframe[14, 2:3] <- c("Gymnothorax undulatus", 217508)
combined_dataframe[15, 2:3] <- c("Siderea thyrsoidea", 311404)
combined_dataframe[16, 2:3] <- c("Herklotsichthys quadrimaculatus", 212264)
combined_dataframe[23, 2:3] <- c("Abantennarius nummifer", 1605260)
combined_dataframe[26, 2:3] <- c("Atherinomorus ogilbyi", 299910)
combined_dataframe[29, 2:3] <- c("Sargocentron rubrum", 126400)
combined_dataframe[30, 2:3] <- c("Fistularia commersonii", 217966)
combined_dataframe[33, 2:3] <- c("Hippocampus", 1525460)
combined_dataframe[35, 2:3] <- c("Pterois volitans", 159559)
combined_dataframe[37, 2:3] <- c("Scorpaena", 126171)
combined_dataframe[43, 2:3] <- c("Psammoperca waigiensis", 282395)
combined_dataframe[45, 2:3] <- c("Anthias cooperi", 304621)
combined_dataframe[52, 2:3] <- c("Epinephelus fasciatus", 276099)
combined_dataframe[53, 2:3] <- c("Epinephelus lanceolatus", 218224)
combined_dataframe[54, 2:3] <- c("Epinephelus multinotatus", 218237)
combined_dataframe[55, 2:3] <- c("Epinephelus quoyanus", 218252)
combined_dataframe[56, 2:3] <- c("Epinephelus rivulatus", 218216)
combined_dataframe[57, 2:3] <- c("Epinephelus suillus", 304778)
combined_dataframe[71, 2:3] <- c("Apogon cookii", 159588)
combined_dataframe[73, 2:3] <- c("Apogon doederleini", 273008)
combined_dataframe[75, 2:3] <- c("Apogon rueppelli", 302002)
combined_dataframe[79, 2:3] <- c("Apogon", 125913)
combined_dataframe[81, 2:3] <- c("Cheilodipterus lineatus", 401515)
combined_dataframe[83, 2:3] <- c("Sillago schomburgkii", 273953)
combined_dataframe[87, 2:3] <- c("Caranx ignobilis", 218414)
combined_dataframe[89, 2:3] <- c("Decapterus macrosoma", 218426)
combined_dataframe[99, 2:3] <- c("Caesio cuning", 278536)
combined_dataframe[100, 2:3] <- c("Pterocaesio diagramma", 401778)
combined_dataframe[105, 2:3] <- c("Lutjanus kasmira", 218482)
combined_dataframe[109, 2:3] <- c("Scaevius milii", 282666)
combined_dataframe[111, 2:3] <- c("Diagramma pictum", 218536)
combined_dataframe[113, 2:3] <- c("Plectorhinchus multivittatum", 303418)
combined_dataframe[114, 2:3] <- c("Plectorhinchus schotaf", 218551)
combined_dataframe[121, 2:3] <- c("Chrysophrys auratus", 367239)
combined_dataframe[123, 2:3] <- c("Argyrosomus hololepidotus", 218632)
combined_dataframe[132, 2:3] <- c("Pempheris klunzingeri", 277060)
combined_dataframe[134, 2:3] <- c("Pempheris schwenkii", 218699)
combined_dataframe[135, 2:3] <- c("Kyphosus cornelii", 273520)
combined_dataframe[136, 2:3] <- c("Kyphosus gibsoni", 303483)
combined_dataframe[140, 2:3] <- c("Scorpis aequipinnis", 282728)
combined_dataframe[141, 2:3] <- c("Platax teira", 218710)
combined_dataframe[143, 2:3] <- c("Chaetodon auriga", 218730)
combined_dataframe[144, 2:3] <- c("Chaetodon lineolatus", 218734)
combined_dataframe[145, 2:3] <- c("Chaetodon lunula", 218733)
combined_dataframe[146, 2:3] <- c("Chaetodon plebeius", 273354)
combined_dataframe[147, 2:3] <- c("Chaetodon speculum", 218740)
combined_dataframe[148, 2:3] <- c("Chaetodon trifascialis", 218719)
combined_dataframe[149, 2:3] <- c("Chelmon rostratus", 218758)
# NOTE(review): row 155 is assigned the same values twice, so the second
# statement is a no-op - confirm it was not meant for a neighbouring row.
combined_dataframe[155, 2:3] <- c("Heniochus singularius", 218766)
combined_dataframe[155, 2:3] <- c("Heniochus singularius", 218766)
combined_dataframe[157, 2:3] <- c("Chaetodontoplus duboulayi", 280118)
combined_dataframe[159, 2:3] <- c("Pomacanthus imperator", 220001)
combined_dataframe[169, 2:3] <- c("Chromis margaritifer", 273739)
combined_dataframe[170, 2:3] <- c("Chromis weberi", 212817)
combined_dataframe[171, 2:3] <- c("Chromis westaustralis", 273760)
combined_dataframe[183, 2:3] <- c("Pomacentrus milleri", 277158)
combined_dataframe[185, 2:3] <- c("Pomacentrus vaiuli", 277176)
combined_dataframe[189, 2:3] <- c("Cyprinocirrhites polyactis", 218864)
combined_dataframe[190, 2:3] <- c("Paracirrhites forsteri", 218867)
combined_dataframe[196, 2:3] <- c("Anampses lennardi", 279172)
combined_dataframe[197, 2:3] <- c("Anampses meleagrides", 218923)
combined_dataframe[205, 2:3] <- c("Choerodon jordani", 277272)
combined_dataframe[207, 2:3] <- c("Choerodon schoenleinii", 218951)
combined_dataframe[208, 2:3] <- c("Cirrhilabrus temmincki", 402401)
combined_dataframe[209, 2:3] <- c("Coris auricularis", 273548)
combined_dataframe[210, 2:3] <- c("Coris aygula", 218957)
combined_dataframe[211, 2:3] <- c("Coris caudimacula", 218959)
combined_dataframe[214, 2:3] <- c("Halichoeres marginatus", 218991)
combined_dataframe[215, 2:3] <- c("Halichoeres nebulosus", 218986)
combined_dataframe[216, 2:3] <- c("Hemigymnus fasciatus", 218999)
combined_dataframe[217, 2:3] <- c("Hemigymnus melapterus", 218998)
combined_dataframe[224, 2:3] <- c("Stethojulis bandanensis", 277106)
combined_dataframe[225, 2:3] <- c("Stethojulis interrupta", 219063)
combined_dataframe[229, 2:3] <- c("Thalassoma hardwichei", 1578174)
combined_dataframe[230, 2:3] <- c("Thalassoma jansenii", 273582)
combined_dataframe[233, 2:3] <- c("Thalassoma purpureum", 212213)
combined_dataframe[234, 2:3] <- c("Thalassoma septemfasciata", 273592)
combined_dataframe[240, 2:3] <- c("Scarus rivulatus", 367210)
combined_dataframe[242, 2:3] <- c("Scarus schlegeli", 276060)
combined_dataframe[246, 2:3] <- c("Parapercis nebulosa", 219154)
combined_dataframe[250, 2:3] <- c("Cirripectes hutchinsi", 276719)
combined_dataframe[252, 2:3] <- c("Ecsenius oculus", 277671)
combined_dataframe[256, 2:3] <- c("Laiphognathus multimaculatus", 219305)
combined_dataframe[261, 2:3] <- c("Plagiotremus rhinorhynchos", 219334)
combined_dataframe[262, 2:3] <- c("Plagiotremus tapeinosoma", 219335)
combined_dataframe[264, 2:3] <- c("Stanulus talboti", 277937)
combined_dataframe[265, 2:3] <- c("Helcogramma decurrens", 279231)
combined_dataframe[266, 2:3] <- c("Helcogramma", 206858)
combined_dataframe[268, 2:3] <- c("Enneapterygius", 204407)
combined_dataframe[269, 2:3] <- c("Heteroclinus", 269393)
combined_dataframe[270, 2:3] <- c("Amblygobius phalaena", 278655)
combined_dataframe[273, 2:3] <- c("Eviota bimaculata", 278582)
combined_dataframe[274, 2:3] <- c("Eviota smaragdus", 278608)
combined_dataframe[275, 2:3] <- c("Eviota storthynx", 278611)
# Two unnamed Eviota species: recorded at genus level and told apart by an
# identificationQualifier column, which these assignments add to the data frame.
combined_dataframe[276, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Eviota", "species 1", 205965)
combined_dataframe[277, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Eviota", "species 2", 205965)
combined_dataframe[279, 2:3] <- c("Fusigobius", 206282)
combined_dataframe[288, 2:3] <- c("Priolepis", 203905)
combined_dataframe[291, 2:3] <- c("Valenciennea immaculata", 277030)
combined_dataframe[292, 2:3] <- c("Valenciennea longipinnis", 277032)
combined_dataframe[293, 2:3] <- c("Valenciennea puellaris", 219595)
combined_dataframe[294, 2:3] <- c("Valenciennea", 204096)
combined_dataframe[296, 2:3] <- c("Acanthurus mata", 219651)
combined_dataframe[297, 2:3] <- c("Acanthurus olivaceus", 219625)
combined_dataframe[298, 2:3] <- c("Acanthurus triostegus", 219630)
combined_dataframe[301, 2:3] <- c("Zebrasoma veliferum", 219684)
combined_dataframe[308, 2:3] <- c("Engyprosopon", 204480)
combined_dataframe[314, 2:3] <- c("Colurodontis paxmani", 280370)
combined_dataframe[318, 2:3] <- c("Stephanolepis", 126236)
combined_dataframe[319, 2:3] <- c("Anoplocapros robustus", 219900)

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

Create a dataframe with unique taxa only (though this should already be unique). This will be our primary DarwinCore data frame.

# Keep one row per distinct taxon and rename the columns to their Darwin Core
# terms. WoRMS IDs are turned into LSIDs; rows without an ID stay NA.
occurrence <- combined_dataframe %>%
  distinct(CanonicalFull, identificationQualifier, WormsIDs) %>%
  rename(scientificName = CanonicalFull, scientificNameID = WormsIDs) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )

occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and an MD5 hash of the scientific name together with the identification qualifier.

# digest() hashes its whole argument as one object, so wrap it with Vectorize()
# to get one hash per element instead of a single hash for the entire vector
vdigest <- Vectorize(digest)

# Generate occurrenceID as <short_name>:occurrence:<md5>, where the hashed text
# is the scientific name pasted with the identification qualifier
occurrence <- occurrence %>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(paste(scientificName, identificationQualifier), algo = "md5"),
      sep = ":"
    )
  )

eventDate

This is left as an empty string since this is technically a checklist and we do not know the collection date.

# Same (empty) eventDate for every record
eventDate <- ""
occurrence <- occurrence %>%
  mutate(eventDate = eventDate)

decimalLongitude/decimalLatitude

Use obistools::calculate_centroid to calculate a centroid and radius for WKT strings. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters. The WKT strings are from https://github.com/iobis/mwhs-shapes.

# Local copy of the marine World Heritage GeoPackage
gpkg_path <- file.path(path_to_project_root, "scripts_data/marine_world_heritage.gpkg")

# Download the GeoPackage once. It is a binary file, so it must be written in
# binary mode ("wb"); the default text mode corrupts it on Windows.
if (!file.exists(gpkg_path)) {
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}

shapes <- st_read(gpkg_path)

## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326

# The GeoPackage can hold several features per site (core as well as buffer
# areas); merge the geometries by site name
shapes_processed <- summarise(group_by(shapes, name))

# Pick out the geometry of Shark Bay, Western Australia
is_site <- shapes_processed$name == "Shark Bay, Western Australia"
ind_shape <- shapes_processed$geom[which(is_site)]

# The full-resolution WKT is too long for GBIF, so simplify the shape first
smoothed_shape <- ms_simplify(ind_shape)

# Serialise the simplified shape as WKT
wkt <- st_as_text(smoothed_shape, digits = 6)

# Centroid of the WKT; every occurrence is placed at this point
localities <- calculate_centroid(wkt)

occurrence <- occurrence %>%
  mutate(
    decimalLatitude = localities$decimalLatitude,
    decimalLongitude = localities$decimalLongitude
  )

occurrenceStatus

# Every taxon on the checklist is recorded as present
occurrenceStatus <- "present"
occurrence <- occurrence %>%
  mutate(occurrenceStatus = occurrenceStatus)

basisOfRecord

# Same basisOfRecord for every record
basisOfRecord <- "HumanObservation"
occurrence <- occurrence %>%
  mutate(basisOfRecord = basisOfRecord)

Extra Terms

footprintWKT

# Attach the simplified site polygon (WKT) to every record
occurrence <- occurrence %>%
  mutate(footprintWKT = wkt)

coordinateUncertaintyInMeters

# Uncertainty radius returned alongside the centroid
occurrence <- occurrence %>%
  mutate(coordinateUncertaintyInMeters = localities$coordinateUncertaintyInMeters)

geodeticDatum

# Datum of the coordinates, same for every record
geodeticDatum <- "WGS84"
occurrence <- occurrence %>%
  mutate(geodeticDatum = geodeticDatum)

country

# Same country for every record
country <- "Australia"
occurrence <- occurrence %>%
  mutate(country = country)

locality

# Same locality for every record
locality <- "Shark Bay"
occurrence <- occurrence %>%
  mutate(locality = locality)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

# Put the Darwin Core columns into their final order
occurrence <- occurrence %>%
  select(
    occurrenceID,
    scientificName,
    identificationQualifier,
    scientificNameID,
    eventDate,
    country,
    locality,
    decimalLatitude,
    decimalLongitude,
    coordinateUncertaintyInMeters,
    footprintWKT,
    geodeticDatum,
    occurrenceStatus,
    basisOfRecord
  )

# Report OBIS required fields that are missing or empty
check_fields(occurrence)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## # A tibble: 324 × 4
##    level field       row message                                 
##    <chr> <chr>     <int> <chr>                                   
##  1 error eventDate     1 Empty value for required field eventDate
##  2 error eventDate     2 Empty value for required field eventDate
##  3 error eventDate     3 Empty value for required field eventDate
##  4 error eventDate     4 Empty value for required field eventDate
##  5 error eventDate     5 Empty value for required field eventDate
##  6 error eventDate     6 Empty value for required field eventDate
##  7 error eventDate     7 Empty value for required field eventDate
##  8 error eventDate     8 Empty value for required field eventDate
##  9 error eventDate     9 Empty value for required field eventDate
## 10 error eventDate    10 Empty value for required field eventDate
## # ℹ 314 more rows

Create the EML file

This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.

# Set the EML schema version to 2.1.1
# NOTE(review): presumably this applies to the EML built and validated below - confirm
emld::eml_version("eml-2.1.1")

## [1] "eml-2.1.1"

# Dataset title
title <- "Fish survey of South Passage Shark Bay, Western Australia: Fishes Checklist"

# Alternate identifier: the IPT resource URL built from the dataset short name
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)


# Abstract text for the EML
abstract <- eml$abstract(
  para = "In April 1979, 323 species of fish were recorded from the South Passage area of Shark Bay. The majority of these are tropical species (83%), with smaller numbers of warm temperate (11%) and cool temperate (6%) species. Many of the tropical species, however, were found to be present in only low numbers, while some warm temperate and one cool temperate species were abundant. The fishes of South Passage, therefore, are considered to belong to an impoverished tropical fauna. Furthermore, South Passage is the southernmost mainland area of Western Australia which supports a predominantly tropical fish fauna. Its fauna is even more diverse than that of the Houtman Abrolhos, a very much larger area of offshore islands and coral reefs located to the south off Geraldton."
)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use or interpretation of a data set.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

# Creator: author of the original resource
creator <- eml$creator(
  individualName = eml$individualName(
    givenName = "J. Barry",
    surName = "Hutchins"),
  organizationName = "Western Australian Museum"
)

# Contact: who to ask about use or interpretation of the dataset.
# Built with the contact constructor so that the helper matches the EML
# element it fills (it was previously built with eml$creator).
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# Metadata provider: who wrote this metadata record
metadataProvider <- eml$metadataProvider(
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

# Associated party: who mobilized the data from the original resource
associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

# Citation template:
#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# GBIF-specific metadata: creation timestamp of the EML, the dataset citation
# placeholder and the citation of the original resource.
additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      # Timestamp with a "+hh:mm" UTC offset. The clock is read once so that
      # the date-time part and the offset always describe the same instant.
      dateStamp = local({
        now <- Sys.time()
        utc_offset <- format(now, "%z")
        paste0(
          format(now, "%Y-%m-%dT%H:%M:%OS3"),
          substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
        )
      }),
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Hutchins, J. (1990). Fish survey of South Passage Shark Bay, Western Australia. In Berry, P., Bradshaw, S. & Wilson, B. (eds), Research in Shark Bay. Report of the France-Australe Bicentenary Expedition Committee. Western Australia Museum. Pp. 263-278.")
    )
  )
)

citationdoi <- ""

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

# Coverage: geographic and taxonomic extent of the dataset.
# The bounding box comes from the unsimplified site geometry. West is the
# minimum longitude (xmin) and east the maximum (xmax); these two were
# previously swapped.
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Shark Bay, Western Australia",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin)
    ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes")
      )

# Temporal coverage is disabled.
# NOTE(review): the template dates below have beginDate after endDate and do
# not match the April 1979 survey named in the abstract - fix before enabling.
#  ),
#  temporalCoverage = eml$temporalCoverage(
#    rangeOfDates = eml$rangeOfDates(
#      beginDate = eml$beginDate(
#        calendarDate = "2019-05-01"
#      ),
#      endDate = eml$endDate(
#        calendarDate = "2016-05-06"
#      )
#    )
   )
)

Extra MetaData

These fields are not required, though they make the metadata more complete.

# Methods: point readers to the project repository and this notebook
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(
      para = paste0(
        "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
        site_dir_name,
        "/",
        dataset_dir_name,
        "\"> R Notebook</a> for dataset construction methods"
      )
    )
  )
)


# Publication date of the dataset
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

# Dataset type keyword from the GBIF vocabulary
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# No further updates are planned
maintenance <- eml$maintenance(
  description = eml$description(
    para = ""
  ),
  maintenanceUpdateFrequency = "notPlanned"
)

# Licence: public domain dedication (CC0 1.0)
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)


# Why the dataset was published
purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)

Create and Validate EML

# Assemble the full EML document from the metadata pieces built earlier in
# this notebook. The package id is derived from the dataset's short name.
my_eml <- eml$eml(
  packageId = paste0(
    "https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0"
  ),
  system = "http://gbif.org",
  scope = "system",
  dataset = eml$dataset(
    alternateIdentifier = alternateIdentifier,
    title = title,
    creator = creator,
    metadataProvider = metadataProvider,
    associatedParty = associatedParty,
    pubDate = pubDate,
    coverage = coverage,
    language = language,
    abstract = abstract,
    keywordSet = keywordSet,
    contact = contact,
    methods = methods,
    intellectualRights = intellectualRights,
    purpose = purpose,
    maintenance = maintenance,
    additionalInfo = additionalInfo
  ),
  additionalMetadata = additionalMetadata
)

# Validate the assembled document; a successful run prints TRUE with an empty
# "errors" attribute.
eml_validate(my_eml)

## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This file describes the structure of the archive and its data file, and is required in a Darwin Core Archive. It is based on the template file “meta_occurrence_checklist_template.xml”.

# Read the meta.xml template that describes the archive's data file layout.
meta_template <- paste(path_to_project_root, "scripts_data/meta_occurrence_checklist_template.xml", sep="/")
meta <- read_xml(meta_template)

# Collect every <field> element in the template. "d1" is the prefix xml2
# assigns to the document's default namespace.
fields <- xml_find_all(meta, "//d1:field")

# For each field whose term is one of the dataset-wide Darwin Core terms
# listed below, set its "default" attribute to the corresponding value.
# Fields with any other term are left untouched.
# NOTE(review): presumably `localities` holds a single row, so the
# coordinate and uncertainty values are scalars; confirm against where
# `localities` is built.
for (field in fields) {
  term <- xml_attr(field, "term")
  if (term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    xml_set_attr(field, "default", eventDate)
  } else if (term == "http://rs.tdwg.org/dwc/terms/country") {
    xml_set_attr(field, "default", country)
  } else if (term == "http://rs.tdwg.org/dwc/terms/locality") {
    xml_set_attr(field, "default", locality)
  } else if (term == "http://rs.tdwg.org/dwc/terms/decimalLatitude") {
    xml_set_attr(field, "default", localities$decimalLatitude)
  } else if (term == "http://rs.tdwg.org/dwc/terms/decimalLongitude") {
    xml_set_attr(field, "default", localities$decimalLongitude)
  } else if (term == "http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters") {
    xml_set_attr(field, "default", localities$coordinateUncertaintyInMeters)
  } else if (term == "http://rs.tdwg.org/dwc/terms/footprintWKT") {
    xml_set_attr(field, "default", wkt)
  } else if (term == "http://rs.tdwg.org/dwc/terms/geodeticDatum") {
    xml_set_attr(field, "default", geodeticDatum)
  } else if (term == "http://rs.tdwg.org/dwc/terms/occurrenceStatus") {
    xml_set_attr(field, "default", occurrenceStatus)
  } else if (term == "http://rs.tdwg.org/dwc/terms/basisOfRecord") {
    xml_set_attr(field, "default", basisOfRecord)
  }
}

# Add identificationQualifier: create a new <field> element as a sibling of
# the third collected field and point it at column index 3.
# NOTE(review): this assumes the template's existing index values and the
# column order of the occurrence table leave index 3 for
# identificationQualifier; confirm against the template and occurrence.csv.
new_field <- xml_add_sibling(fields[[3]], "field")
xml_set_attr(new_field, "index", "3")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/identificationQualifier")

# Keep the new element together with the previously collected fields.
# `fields` is not read again in the visible remainder of this notebook.
fields <- append(fields, list(new_field))

Save outputs

# Output directory for this site and dataset, under the project root.
dwc_output_dir <- file.path(
  path_to_project_root, "output", site_dir_name, dataset_dir_name
)

# Write the three archive components: the occurrence table (NA values written
# as empty strings, no row names), the archive descriptor and the EML metadata.
write.csv(
  occurrence,
  file.path(dwc_output_dir, "occurrence.csv"),
  na = "",
  row.names = FALSE
)
write_xml(meta, file = file.path(dwc_output_dir, "meta.xml"))
write_eml(my_eml, file.path(dwc_output_dir, "eml.xml"))

Edit EML

We have to further edit the EML file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the children of the GBIF element, since the construction automatically arranges child nodes in alphabetical order.

# Re-open the EML file that was just written so it can be adjusted for GBIF.
eml_content <- read_xml(paste0(dwc_output_dir, "/eml.xml"))

# Root element attributes: point schemaLocation at the GBIF profile schema,
# declare the Dublin Core namespace, drop the stmml namespace declaration
# (setting an attribute to NULL removes it) and set the document language.
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


# Capture the nodes that have to move before they are detached, and stamp the
# bibliographic citation with the citation DOI.
hierarchyLevel <- xml_find_all(eml_content, ".//hierarchyLevel")
dateStamp <- xml_find_all(eml_content, ".//dateStamp")
citation <- xml_find_all(eml_content, "./additionalMetadata/metadata/gbif/citation")
bibcitation <- xml_find_all(eml_content, "./additionalMetadata/metadata/gbif/bibliography/citation")
xml_set_attr(bibcitation, "identifier", citationdoi)

# Detach the three nodes from their current positions.
xml_remove(xml_find_all(eml_content, ".//hierarchyLevel"))
xml_remove(xml_find_all(eml_content, ".//dateStamp"))
xml_remove(xml_find_all(eml_content, "./additionalMetadata/metadata/gbif/citation"))

# Re-insert each one as the first child of <gbif>. Every insert goes to
# position 0, so the resulting order is dateStamp, hierarchyLevel, citation,
# followed by the remaining children.
xml_add_child(xml_find_all(eml_content, ".//gbif"), citation, .where = 0)
xml_add_child(xml_find_all(eml_content, ".//gbif"), hierarchyLevel, .where = 0)
xml_add_child(xml_find_all(eml_content, ".//gbif"), dateStamp, .where = 0)

# Overwrite the EML file with the adjusted document.
write_xml(eml_content, paste0(dwc_output_dir, "/eml.xml"))

Zip files to DwC-A

# Destination of the bundled Darwin Core Archive.
output_zip <- file.path(dwc_output_dir, "DwC-A.zip")

# Remove any archive left from a previous run so that it is not included in
# the directory listing below.
if (file.exists(output_zip)) unlink(output_zip)

# Bundle everything currently in the output directory into the archive.
file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip(zipfile = output_zip, files = file_paths, mode = "cherry-pick")

# Delete the loose source files only if the archive now exists.
if (file.exists(output_zip)) unlink(file_paths)