Bundling Robertson et al. 2003 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to Darwin Core format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Robertson, D R. (2003). Survey of reef fishes in the Coiba National Park, Panama, May 5-22, 2003.

Setup

Load the necessary libraries and set the input variables. Loading messages are suppressed.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file
suppressMessages(library(tidyr))        # To unnest centroid columns

Input Parameters and Paths

path_to_project_root <- "../../.."
site_dir_name <- "coiba_national_park_and_its_special_zone_of_marine_protection"
dataset_dir_name <- "Robertson_et_al_2003"
original_pdf <- "coiba2003.pdf"
short_name <- "coiba-robertson-2003"

Dive Data

Parsing PDF table to CSV

The data for this reference is formatted as an image-based table inside a PDF, spread across multiple pages. First, we use pdf_to_table to OCR the pages and parse the table out to a CSV. We do this twice, since there are two tables we want from the PDF: one for dives and one for trawls.

#conda environment
condaenv <- "mwhs-data-mobilization"

# Path to the Python script
script <- paste(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py", sep="/")

# Input PDF file path
input_pdf <- paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf, sep="/")

# Output directory for OCR/table files
output_dir <- paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed/dive", sep="/")

# Define page numbers and table areas (see documentation)
page_args <- c(
"-a 111.623,17.528,1965.848,1432.643 -p 5",
"-a 96.863,15.683,1379.138,1430.798 -p 6"

)

# Define run parameters (see documentation)
run_parameters <- "-l -c"

# Combine page arguments and execute
page_args_combined <- paste(page_args, collapse = " ")
command <- paste("conda run -n", condaenv, "python", script, "-i", input_pdf, run_parameters, page_args_combined, "-o", output_dir)
system(command, intern=TRUE)
##  [1] ""                                                                                                                                                                      
##  [2] "Script Execution Summary"                                                                                                                                              
##  [3] "Date and Time: 2023-10-03 04:38:16"                                                                                                                                    
##  [4] "------------------------------"                                                                                                                                        
##  [5] ""                                                                                                                                                                      
##  [6] "PDF input: ../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/raw/coiba2003.pdf"                                     
##  [7] "Perform Table Parsing: TRUE"                                                                                                                                           
##  [8] "Selected Areas:"                                                                                                                                                       
##  [9] "  Area 1: [111.623, 17.528, 1965.848, 1432.643]"                                                                                                                       
## [10] "  Area 2: [96.863, 15.683, 1379.138, 1430.798]"                                                                                                                        
## [11] "Pages: 5, 6"                                                                                                                                                           
## [12] "Concatenate: True"                                                                                                                                                     
## [13] "Concatenate across headers: False"                                                                                                                                     
## [14] "Stream Extraction: False"                                                                                                                                              
## [15] "Lattice Extraction: True"                                                                                                                                              
## [16] ""                                                                                                                                                                      
## [17] "Parsing Tables"                                                                                                                                                        
## [18] "------------------------------"                                                                                                                                        
## [19] ""                                                                                                                                                                      
## [20] ""                                                                                                                                                                      
## [21] "Saving to CSV"                                                                                                                                                         
## [22] "CSV file: ../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/processed/dive/coiba2003_tables_parsed_concatenated.csv"
## [23] "------------------------------"                                                                                                                                        
## [24] ""                                                                                                                                                                      
## [25] ""                                                                                                                                                                      
## [26] "Run Details: ../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/processed/dive/coiba2003_parameters.txt"             
## [27] "Finished"                                                                                                                                                              
## [28] ""

Read source data

Now we’ll read in the CSV table output by the previous step.

processed_csv <- "dive/coiba2003_tables_parsed_concatenated.csv"

input_data <- read.csv(paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", processed_csv, sep="/"))

#to preview pretty table
knitr::kable(head(input_data))
Family Species.Name X1 X2 X3 X4 X5 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19_.20 X21 X22 X23_24 X25 X26_27 X28 X29 X30 X31 X32_33 X34 X35 X36 LT2 X37 LT3 X38 LT4 X39 LT5 X40 X41 X42 X43 X44 X45 X46 X47 X48 X49
Ginglymostomatidae Ginglymostoma cirratum r nuc nuc r
Carcharhinidae Carcharhinus limbatus r
Triaenodon obesus nuc r r c nuc r r r nuc r r r r r r
Narcinidae Diplobatis ommata r r r
Narcine entemedor r
Dasyatidae Dasyatis dipterura r

Preprocessing

Here we tidy the data up.

Tidy Data

input_data %<>%
  remove_empty(c("rows", "cols")) %>%       # Remove empty rows and columns
  clean_names()

cleaned_data <- input_data

#to preview pretty table
knitr::kable(head(cleaned_data))
family species_name x1 x2 x3 x4 x5 x7 x8 x9 x10 x11 x12 x13 x14 x15 x16 x17 x18 x19_20 x21 x22 x23_24 x25 x26_27 x28 x29 x30 x31 x32_33 x34 x35 x36 lt2 x37 lt3 x38 lt4 x39 lt5 x40 x41 x42 x43 x44 x45 x46 x47 x48 x49
Ginglymostomatidae Ginglymostoma cirratum r nuc nuc r
Carcharhinidae Carcharhinus limbatus r
Triaenodon obesus nuc r r c nuc r r r nuc r r r r r r
Narcinidae Diplobatis ommata r r r
Narcine entemedor r
Dasyatidae Dasyatis dipterura r

Get WoRMS IDs

Auto matching

First we will try to do this automatically: we clean the species names using gnparser and then use the taxize library to query the WoRMS database.

#Parse author names out
parsed_names <- rgnparser::gn_parse(cleaned_data[,2])

#Function to get WoRMS IDs. Search for accepted names first; if not found, search again including unaccepted names. If still not found, return NA.
get_worms_id_from_element <- function(element) {
  worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (attr(worms_id, "match") == "not found") {
    worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
    if (attr(worms_id, "match") == "not found") {
      worms_id <- NA
    }
  }
  return(worms_id)
}

#Call the function
worms_ids <- lapply(parsed_names, function(element) {
  if (element$parsed) {
    return(get_worms_id_from_element(element))
  } else {
    return(NA)
  }
})

#combine original names, parsed data and WoRMS ID into one data frame
combined_dataframe <- data.frame()

for (i in 1:nrow(cleaned_data)) {
  cleaned_value <- cleaned_data[i,]
  canonical_value <- parsed_names[[i]]$canonical$full
  worms_id_value <- worms_ids[[i]][1]
  if (is.null(canonical_value)){
    canonical_value <- NA
  }
  temp_row <- data.frame(CleanedData = cleaned_value, CanonicalFull = canonical_value, WormsIDs = worms_id_value)
  combined_dataframe <- rbind(combined_dataframe, temp_row)
}

knitr::kable(head(combined_dataframe))
CleanedData.family CleanedData.species_name CleanedData.x1 CleanedData.x2 CleanedData.x3 CleanedData.x4 CleanedData.x5 CleanedData.x7 CleanedData.x8 CleanedData.x9 CleanedData.x10 CleanedData.x11 CleanedData.x12 CleanedData.x13 CleanedData.x14 CleanedData.x15 CleanedData.x16 CleanedData.x17 CleanedData.x18 CleanedData.x19_20 CleanedData.x21 CleanedData.x22 CleanedData.x23_24 CleanedData.x25 CleanedData.x26_27 CleanedData.x28 CleanedData.x29 CleanedData.x30 CleanedData.x31 CleanedData.x32_33 CleanedData.x34 CleanedData.x35 CleanedData.x36 CleanedData.lt2 CleanedData.x37 CleanedData.lt3 CleanedData.x38 CleanedData.lt4 CleanedData.x39 CleanedData.lt5 CleanedData.x40 CleanedData.x41 CleanedData.x42 CleanedData.x43 CleanedData.x44 CleanedData.x45 CleanedData.x46 CleanedData.x47 CleanedData.x48 CleanedData.x49 CanonicalFull WormsIDs
Ginglymostomatidae Ginglymostoma cirratum r nuc nuc r Ginglymostoma cirratum 105846
Carcharhinidae Carcharhinus limbatus r Carcharhinus limbatus 105793
Triaenodon obesus nuc r r c nuc r r r nuc r r r r r r Triaenodon obesus 214557
Narcinidae Diplobatis ommata r r r Diplobatis ommata 280551
Narcine entemedor r Narcine entemedor 275389
Dasyatidae Dasyatis dipterura r Dasyatis dipterura 271430

Human Verification

Sometimes there are misspellings in the original text or OCR errors that need to be found and fixed by hand. To do this, view the combined dataframe, search WoRMS for any unmatched species and add their IDs, and remove any rows that were not removed automatically in the earlier cleaning steps.
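
A quick way to surface the records that need hand checking is to list the rows with no automatic WoRMS match. This is a minimal sketch; the column names follow the combined dataframe built above.

#List species that did not get an automatic WoRMS match
combined_dataframe %>%
  filter(is.na(WormsIDs)) %>%
  select(CleanedData.species_name, CanonicalFull, WormsIDs)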

combined_dataframe[49, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Encheliophis", "new species", 206890)
combined_dataframe[50, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Ogilbia", "species A", 270080)
combined_dataframe[64, c("CanonicalFull", "WormsIDs")] <- c("Tylosurus imperialis melanotus", NA)
combined_dataframe[195, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Chaenopsis", "species A", 268702)
combined_dataframe[249, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Symphurus", "new species", 126113)

Locality data

Locality data was retrieved by georeferencing the site maps included in the paper. These maps have been saved as TIFs and the georeferenced points saved as a CSV. First we will use obistools::calculate_centroid to calculate a centroid and uncertainty radius from each site’s WKT string. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters.
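
For reference, a minimal, hypothetical example of calculate_centroid on a single WKT string; the coordinates below are made up and only illustrate the returned columns.

#Illustrative only: returns decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters
obistools::calculate_centroid("MULTIPOINT ((-81.78 7.45), (-81.76 7.47))")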

locality_points_file <- "dive/Robertson_et_al_2003_dive_localities.csv"

data <- read.csv(paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", locality_points_file, sep="/"))
sf_data <- st_as_sf(data, coords = c("Latitude", "Longitude"), crs = 4326)

wkts <- aggregate(sf_data["geometry"], by = list(sf_data$Name), function(x) st_union(x))
wkts$geometry <- st_as_text(wkts$geometry, digits = 6)
wkts <- as.data.frame(wkts)


#set uncertainty to ~200 meters for those with just one point
all_wkts <- wkts %>%
  rowwise() %>%
  mutate(centroids = list(calculate_centroid(geometry))) %>%
  unnest(centroids) %>%
  mutate(coordinateUncertaintyInMeters = ifelse(coordinateUncertaintyInMeters == 0, 200, coordinateUncertaintyInMeters))

Now we can combine the cleaned names and localities into one dataframe.

occ_data_dive <- data.frame(
  canonicalFull = character(),
  wormsIDs = numeric(),
  locality = character(),
  fieldNumber = character(),
  decimalLongitude = numeric(),
  decimalLatitude = numeric(),
  coordinateUncertaintyInMeters = numeric()
)

for (i in 1:nrow(combined_dataframe)) {
  for (j in 3:50) {
    if (combined_dataframe[i, j] != "" | is.na(combined_dataframe[i, j])) {
      # Create a new row in occ_data
      site <- sub("^[^.]+\\.x?", "", colnames(combined_dataframe)[j])
      fieldNumber <- paste("Dive Site", site, sep=" ")
      row_index <- which(all_wkts$Group.1 == site)
      if (length(row_index) != 0) {
        new_row <- data.frame(
          canonicalFull = combined_dataframe[i, "CanonicalFull"],
          wormsIDs = combined_dataframe[i, "WormsIDs"],
          identificationQualifier = combined_dataframe[i, "identificationQualifier"],
          fieldNumber = fieldNumber,
          decimalLongitude = all_wkts[row_index, "decimalLongitude"],
          decimalLatitude = all_wkts[row_index, "decimalLatitude"],
          coordinateUncertaintyInMeters = all_wkts[row_index, "coordinateUncertaintyInMeters"]
        )        
      } else {
        new_row <- data.frame(
          canonicalFull = combined_dataframe[i, "CanonicalFull"],
          wormsIDs = combined_dataframe[i, "WormsIDs"],
          fieldNumber = fieldNumber,
          identificationQualifier = combined_dataframe[i, "identificationQualifier"],
          decimalLongitude = 0,
          decimalLatitude = 0,
          coordinateUncertaintyInMeters = 0
        )
      }

      # Append the new row to occ_data
      occ_data_dive <- rbind(occ_data_dive, new_row)
    }
  }
}

EventDate

Event dates are given in the paper and have been reproduced here:

locality <- c("Isla Brincanco", "Isla Brincanco", "Isla Brincanco", "Isla Brincanco", 
                "Isla Uva", "Isla Uva", "Isla Uva", "Isla Uva",
                "Isla Afuerita", "Isla Afuerita",
                "Isla Canal de Afuera", "Isla Canal de Afuera", "Isla Canal de Afuera", "Isla Canal de Afuera",
                "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba",
                "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", 
                "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", 
                "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", "Isla Coiba", 
                "Isla Coiba", "Isla Coiba", "Isla Coibita",
                "Isla Rancheria", "Isla Rancheria", "Isla Cocos", "Isla Jicaron", 
                "Isla Jicaron", "Isla Jicaron", "Isla Jicaron", "Isla Jicaron", "Isla Jicarita", 
                "Isla Jicarita", "Isla Jicarita")

fieldNumber <- c("Dive Site 10", "Dive Site 11", "Dive Site 12", "Dive Site 31", "Dive Site 7", "Dive Site 8", "Dive Site 9", "Dive Site 32_33", "Dive Site 48", "Dive Site 49", 
                  "Dive Site 16", "Dive Site 17", "Dive Site 18", "Dive Site 47", "Dive Site 3", "Dive Site 13", "Dive Site 14", "Dive Site 15", "Dive Site 21", "Dive Site 25", 
                  "Dive Site 26_27", "Dive Site 30", "Dive Site 34", "Dive Site 35", "Dive Site 36", "Dive Site lt2", "Dive Site lt3", "Dive Site 38", "Dive Site lt4", 
                  "Dive Site 41", "Dive Site 42", "Dive Site 43", "Dive Site 44", "Dive Site 45", "Dive Site 46", "Dive Site 19_20", "Dive Site 28", "Dive Site 37", 
                  "Dive Site 29", "Dive Site lt1", "Dive Site 5", "Dive Site 22", "Dive Site lt5", "Dive Site 40", "Dive Site 1", "Dive Site 2", "Dive Site 23_24")

date <- c("2003-05-05", "2003-05-05", "2003-05-05", "2003-05-13", "2003-05-04", 
           "2003-05-04", "2003-05-04", "2003-05-13", "2003-05-22", "2003-05-22", "2003-05-07", "2003-05-07", 
           "2003-05-07", "2003-05-22", "2003-05-03", "2003-05-06", "2003-05-06", "2003-05-06", 
           "2003-05-08", "2003-05-11", "2003-05-11", "2003-05-12", 
           "2003-05-14", "2003-05-14", "2003-05-14", "2003-05-16", "2003-05-17", 
           "2003-05-17", "2003-05-18", "2003-05-20", "2003-05-20", "2003-05-20", 
           "2003-05-21", "2003-05-21", "2003-05-21", "2003-05-08", 
           "2003-05-12", "2003-05-16", "2003-05-12", "2003-05-03", "2003-05-03",
           "2003-05-22", "2003-05-19", "2003-05-19", "2003-05-02", "2003-05-02", "2003-05-10")

dive_dates <- data.frame(locality = locality, fieldNumber = fieldNumber, date = date)

occ_data_dive_event_locality <- merge(occ_data_dive, dive_dates, by = "fieldNumber", sort = FALSE, all.x = TRUE)
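
Because the merge keeps all occurrence rows (all.x = TRUE), it is worth checking whether any dive sites failed to match a date. A simple sanity check, assuming the column names above:

#Field numbers with no matching event date
unique(occ_data_dive_event_locality$fieldNumber[is.na(occ_data_dive_event_locality$date)])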

Trawl Data

Parsing PDF table to CSV

#conda environment
condaenv <- "mwhs-data-mobilization"

# Path to the Python script
script <- paste(path_to_project_root, "scripts_data/pdf_to_tables/pdf_to_table.py", sep="/")

# Input PDF file path
input_pdf <- paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "raw", original_pdf, sep="/")

# Output directory for OCR/table files
output_dir <- paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed/trawl", sep="/")

# Define page numbers and table areas (see documentation)
page_args <- c(
"-a 94.478,15.683,1049.198,573.368 -p 7"

)

# Define run parameters (see documentation)
run_parameters <- "-l"

# Combine page arguments and execute
page_args_combined <- paste(page_args, collapse = " ")
command <- paste("conda run -n", condaenv, "python", script, "-i", input_pdf, run_parameters, page_args_combined, "-o", output_dir)
system(command, intern=TRUE)
##  [1] ""                                                                                                                                                          
##  [2] "Script Execution Summary"                                                                                                                                  
##  [3] "Date and Time: 2023-10-03 04:40:36"                                                                                                                        
##  [4] "------------------------------"                                                                                                                            
##  [5] ""                                                                                                                                                          
##  [6] "PDF input: ../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/raw/coiba2003.pdf"                         
##  [7] "Perform Table Parsing: TRUE"                                                                                                                               
##  [8] "Selected Areas:"                                                                                                                                           
##  [9] "  Area 1: [94.478, 15.683, 1049.198, 573.368]"                                                                                                             
## [10] "Pages: 7"                                                                                                                                                  
## [11] "Concatenate: False"                                                                                                                                        
## [12] "Concatenate across headers: False"                                                                                                                         
## [13] "Stream Extraction: False"                                                                                                                                  
## [14] "Lattice Extraction: True"                                                                                                                                  
## [15] ""                                                                                                                                                          
## [16] "Parsing Tables"                                                                                                                                            
## [17] "------------------------------"                                                                                                                            
## [18] ""                                                                                                                                                          
## [19] ""                                                                                                                                                          
## [20] "Saving to CSV"                                                                                                                                             
## [21] "CSV file(s):"                                                                                                                                              
## [22] "\t../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/processed/trawl/coiba2003_tables_parsed_1.csv"      
## [23] "------------------------------"                                                                                                                            
## [24] ""                                                                                                                                                          
## [25] ""                                                                                                                                                          
## [26] "Run Details: ../../../datasets/coiba_national_park_and_its_special_zone_of_marine_protection/Robertson_et_al_2003/processed/trawl/coiba2003_parameters.txt"
## [27] "Finished"                                                                                                                                                  
## [28] ""

Read source data

Now we’ll read in the CSV table output by the previous step.

processed_csv <- "trawl/coiba2003_tables_parsed_1.csv"

input_data <- read.csv(paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", processed_csv, sep="/"), check.names = FALSE)

#to preview pretty table
knitr::kable(head(input_data))
Family Species Name 1 2 3 4 5 6 7 8 9 10 11 1.1 2.1 3.1 4.1 5.1 6.1 7.1 8.1 9.1 10.1 11.1 12
Narcinidae Diplobatis ommata x NO Data
Rhinobatidae Zapteryx xyster x x x x
Rajidae Raja velezi x
Moringuidae Neoconger vermiformis x
Chlopsidae Chlopsis apterus x x x
Muraenidae Gymnothorax equatorialis x x x x x x x x x x x x

Preprocessing

Here we tidy the data up.

Tidy Data

input_data %<>%
  remove_empty(c("rows", "cols")) %>%       # Remove empty rows and columns
  clean_names() %>%
  select(-c(x3_1))

cleaned_data <- input_data

#to preview pretty table
knitr::kable(head(cleaned_data))
family species_name x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x1_1 x2_1 x4_1 x5_1 x6_1 x7_1 x8_1 x9_1 x10_1 x11_1 x12
Narcinidae Diplobatis ommata x
Rhinobatidae Zapteryx xyster x x x x
Rajidae Raja velezi x
Moringuidae Neoconger vermiformis x
Chlopsidae Chlopsis apterus x x x
Muraenidae Gymnothorax equatorialis x x x x x x x x x x x x

Get WoRMS IDs

Auto matching

First we will try to do this automatically: we clean the species names using gnparser and then use the taxize library to query the WoRMS database.

#Parse author names out
parsed_names <- rgnparser::gn_parse(cleaned_data[,2])

#Function to get WoRMS IDs. Search for accepted names first; if not found, search again including unaccepted names. If still not found, return NA.
get_worms_id_from_element <- function(element) {
  worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", fuzzy=TRUE, messages = FALSE, accepted = TRUE)
  if (attr(worms_id, "match") == "not found") {
    worms_id <- get_wormsid(element$canonical$full, searchtype="scientific", messages = FALSE, fuzzy=TRUE)
    if (attr(worms_id, "match") == "not found") {
      worms_id <- NA
    }
  }
  return(worms_id)
}

#Call the function
worms_ids <- lapply(parsed_names, function(element) {
  if (element$parsed) {
    return(get_worms_id_from_element(element))
  } else {
    return(NA)
  }
})

#combine original names, parsed data and WoRMS ID into one data frame
combined_dataframe <- data.frame()

for (i in 1:nrow(cleaned_data)) {
  cleaned_value <- cleaned_data[i,]
  canonical_value <- parsed_names[[i]]$canonical$full
  worms_id_value <- worms_ids[[i]][1]
  if (is.null(canonical_value)){
    canonical_value <- NA
  }
  temp_row <- data.frame(CleanedData = cleaned_value, CanonicalFull = canonical_value, WormsIDs = worms_id_value)
  combined_dataframe <- rbind(combined_dataframe, temp_row)
}

knitr::kable(head(combined_dataframe))
CleanedData.family CleanedData.species_name CleanedData.x1 CleanedData.x2 CleanedData.x3 CleanedData.x4 CleanedData.x5 CleanedData.x6 CleanedData.x7 CleanedData.x8 CleanedData.x9 CleanedData.x10 CleanedData.x11 CleanedData.x1_1 CleanedData.x2_1 CleanedData.x4_1 CleanedData.x5_1 CleanedData.x6_1 CleanedData.x7_1 CleanedData.x8_1 CleanedData.x9_1 CleanedData.x10_1 CleanedData.x11_1 CleanedData.x12 CanonicalFull WormsIDs
Narcinidae Diplobatis ommata x Diplobatis ommata 280551
Rhinobatidae Zapteryx xyster x x x x Zapteryx xyster 283214
Rajidae Raja velezi x Raja velezi 271585
Moringuidae Neoconger vermiformis x Neoconger vermiformis 275441
Chlopsidae Chlopsis apterus x x x Chlopsis apterus 271712
Muraenidae Gymnothorax equatorialis x x x x x x x x x x x x Gymnothorax equatorialis 271832

Human Verification

Sometimes there are misspellings in the original text or OCR errors that need to be found and fixed by hand. To do this, view the combined dataframe, search WoRMS for any unmatched species and add their IDs, and remove any rows that were not removed automatically in the earlier cleaning steps.
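
As with the dive data, the rows in question can be previewed before editing them by hand. A minimal sketch using the row numbers corrected below:

#Inspect the rows that will be corrected by hand
combined_dataframe[c(29, 53), c("CleanedData.species_name", "CanonicalFull", "WormsIDs")]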

combined_dataframe[29, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Pontinus", "species A", 126170)
combined_dataframe[53, c("CanonicalFull", "identificationQualifier", "WormsIDs")] <- c("Bollmannia", "species A", 268559)

Locality data

Locality data was retrieved by georeferencing the site maps included in the paper. These maps have been saved as TIFs and the georeferenced points saved as a CSV. First we will use obistools::calculate_centroid to calculate a centroid and uncertainty radius from each site’s WKT string. This is useful for populating decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters.

locality_points_file <- "trawl/Robertson_et_al_2003_trawl_localities.csv"

data <- read.csv(paste(path_to_project_root, "datasets", site_dir_name, dataset_dir_name, "processed", locality_points_file, sep="/"))
sf_data <- st_as_sf(data, coords = c("Latitude", "Longitude"), crs = 4326)

wkts <- aggregate(sf_data["geometry"], by = list(sf_data$Name), function(x) st_union(x))
wkts$geometry <- st_as_text(wkts$geometry, digits = 6)
wkts <- as.data.frame(wkts)


#set uncertainty to ~200 meters for those with just one point
all_wkts <- wkts %>%
  rowwise() %>%
  mutate(centroids = list(calculate_centroid(geometry))) %>%
  unnest(centroids) %>%
  mutate(coordinateUncertaintyInMeters = ifelse(coordinateUncertaintyInMeters == 0, 200, coordinateUncertaintyInMeters))

Now we can combine the cleaned names and localities into one dataframe.

occ_data_trawl <- data.frame(
  canonicalFull = character(),
  wormsIDs = numeric(),
  locality = character(),
  fieldNumber = character(),
  decimalLongitude = numeric(),
  decimalLatitude = numeric(),
  coordinateUncertaintyInMeters = numeric()
)

for (i in 1:nrow(combined_dataframe)) {
  for (j in 3:24) {
    if (combined_dataframe[i, j] != "" | is.na(combined_dataframe[i, j])) {
      # Create a new row in occ_data
      site <- sub("^[^.]+\\.x?", "", colnames(combined_dataframe)[j])
      fieldNumber <- paste("Trawl Site", site, sep=" ")
      row_index <- which(all_wkts$Group.1 == site)
      if (length(row_index) != 0) {
        new_row <- data.frame(
          canonicalFull = combined_dataframe[i, "CanonicalFull"],
          wormsIDs = combined_dataframe[i, "WormsIDs"],
          identificationQualifier = combined_dataframe[i, "identificationQualifier"],
          fieldNumber = fieldNumber,
          decimalLongitude = all_wkts[row_index, "decimalLongitude"],
          decimalLatitude = all_wkts[row_index, "decimalLatitude"],
          coordinateUncertaintyInMeters = all_wkts[row_index, "coordinateUncertaintyInMeters"]
        )        
      } else {
        new_row <- data.frame(
          canonicalFull = combined_dataframe[i, "CanonicalFull"],
          wormsIDs = combined_dataframe[i, "WormsIDs"],
          identificationQualifier = combined_dataframe[i, "identificationQualifier"],
          fieldNumber = fieldNumber,
          decimalLongitude = 0,
          decimalLatitude = 0,
          coordinateUncertaintyInMeters = 0
        )
      }

      # Append the new row to occ_data
      occ_data_trawl <- rbind(occ_data_trawl, new_row)
    }
  }
}

EventDate

Event dates are given in the paper and have been reproduced here:

locality <- c("South of Coiba", "South of Coiba", "South of Coiba", "South of Coiba", 
                "South of Coiba", "South of Coiba", "South of Coiba", "South of Coiba",
                "South of Coiba", "South of Coiba",
                "South of Coiba", "North of Coiba", "North of Coiba", "North of Coiba",
                "North of Coiba", "North of Coiba", "North of Coiba", "North of Coiba", "North of Coiba",
                "North of Coiba", "North of Coiba", "North of Coiba", "North of Coiba")

fieldNumber <- c("Trawl Site 1", "Trawl Site 2", "Trawl Site 3", "Trawl Site 4", "Trawl Site 5", "Trawl Site 6", "Trawl Site 7", "Trawl Site 8", "Trawl Site 9", "Trawl Site 10", 
                  "Trawl Site 11", "Trawl Site 1_1", "Trawl Site 2_1", "Trawl Site 3_1", "Trawl Site 4_1", "Trawl Site 5_1", "Trawl Site 6_1", "Trawl Site 7_1", "Trawl Site 8_1", "Trawl Site 9_1", 
                  "Trawl Site 10_1", "Trawl Site 11_1", "Trawl Site 12")

date <- c("2003-05-09", "2003-05-09", "2003-05-09", "2003-05-09", "2003-05-09", 
           "2003-05-09", "2003-05-09", "2003-05-09", "2003-05-09", "2003-05-09", "2003-05-09", "2003-05-15", 
           "2003-05-15", "2003-05-15", "2003-05-15", "2003-05-15", "2003-05-15", "2003-05-15", 
           "2003-05-15", "2003-05-15", "2003-05-15", "2003-05-15", 
           "2003-05-15")

trawl_dates <- data.frame(locality = locality, fieldNumber = fieldNumber, date = date)

occ_data_trawl_event_locality <- merge(occ_data_trawl, trawl_dates, by = "fieldNumber", sort = FALSE, all.x = TRUE)

Merge Dive and Trawl

occ_data <- rbind(occ_data_dive_event_locality, occ_data_trawl_event_locality)
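
A quick check that the dive and trawl records were stacked as expected; this is only an inspection step, not part of the mapping.

#Row counts before and after combining
nrow(occ_data_dive_event_locality)
nrow(occ_data_trawl_event_locality)
nrow(occ_data)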

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

#rename and restructure WoRMSIDs to OBIS requirements
occurrence <- occ_data %>%
  rename(scientificName = canonicalFull) %>%
  rename(scientificNameID = wormsIDs) %>%
  rename(eventDate = date) %>%
  mutate(scientificNameID = ifelse(!is.na(scientificNameID), paste("urn:lsid:marinespecies.org:taxname:", scientificNameID, sep = ""), NA))
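
A lightweight check that the identifiers follow the expected WoRMS LSID pattern. This is a sketch; the regular expression simply mirrors the format built above.

#All non-missing scientificNameIDs should be WoRMS LSIDs
all(grepl("^urn:lsid:marinespecies\\.org:taxname:\\d+$",
          occurrence$scientificNameID[!is.na(occurrence$scientificNameID)]))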

occurrenceID

occurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and an MD5 hash of the scientific name, locality, event date and field number.

# Vectorize the digest function (The digest() function isn't vectorized. So if you pass in a vector, you get one value for the whole vector rather than a digest for each element of the vector):
vdigest <- Vectorize(digest)

# Generate occurrenceID:
occurrence %<>% mutate(occurrenceID = paste(short_name, "occurrence", vdigest(paste(scientificName, locality, eventDate, fieldNumber), algo="md5"), sep=":"))
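
Since the hash is derived from the name, locality, event date and field number, any duplicated combination would collapse to the same identifier. A quick uniqueness check (a simple sketch):

#occurrenceIDs should be unique across the dataset
sum(duplicated(occurrence$occurrenceID))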

decimalLongitude/decimalLatitude

Locality data was retrieved by georeferencing the site maps included in the paper and converted to centroids and uncertainty radii with obistools::calculate_centroid, as described in the Locality data sections above. This is how decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters were populated.

The code below derives the site’s bounding coordinates, which are used for the geographic coverage in the EML file.

if (!file.exists(paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep="/"))) {
  download.file("https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true", paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep="/"))
}

shapes <- st_read(paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep="/"))
## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326
#For some sites, the GeoPackage has core as well as buffer areas. Merge the geometries by site.
shapes_processed <- shapes %>%
  group_by(name) %>%
  summarize()

#Coiba National Park and its Special Zone of Marine Protection
ind_shape <- shapes_processed$geom[which(shapes_processed$name == "Coiba National Park and its Special Zone of Marine Protection")]

occurrenceStatus

occurrenceStatus <- "present"
occurrence %<>% mutate(occurrenceStatus)

basisOfRecord

basisOfRecord <- "HumanObservation"
occurrence %<>% mutate(basisOfRecord)

Extra Terms

footprintWKT

coordinateUncertaintyInMeters

geodeticDatum

geodeticDatum <- "WGS84"
occurrence %<>% mutate(geodeticDatum)

country

country <- "Panama"
occurrence %<>% mutate(country)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

#Reorganize columns
occurrence = occurrence %>% select(occurrenceID, scientificName, identificationQualifier, scientificNameID, eventDate, country, locality, fieldNumber, decimalLatitude, decimalLongitude, coordinateUncertaintyInMeters, geodeticDatum, occurrenceStatus, basisOfRecord)

#Check fields
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 109 × 4
##    level field       row message                                 
##    <chr> <chr>     <int> <chr>                                   
##  1 error eventDate  2817 Empty value for required field eventDate
##  2 error eventDate  2818 Empty value for required field eventDate
##  3 error eventDate  2819 Empty value for required field eventDate
##  4 error eventDate  2820 Empty value for required field eventDate
##  5 error eventDate  2821 Empty value for required field eventDate
##  6 error eventDate  2822 Empty value for required field eventDate
##  7 error eventDate  2823 Empty value for required field eventDate
##  8 error eventDate  2824 Empty value for required field eventDate
##  9 error eventDate  2825 Empty value for required field eventDate
## 10 error eventDate  2826 Empty value for required field eventDate
## # ℹ 99 more rows

Create the EML file

This is a file which contains the dataset’s metadata and is required in a Darwin Core Archive.

emld::eml_version("eml-2.1.1")
## [1] "eml-2.1.1"
#Title
title <- "Survey of reef fishes in the Coiba National Park, Panama, May 5-22, 2003"

#AlternateIdentifier
alternateIdentifier <- paste("https://ipt.obis.org/secretariat/resource?r=", short_name, sep="")

#Abstract
abstract <- eml$abstract(
  para = "STRI Survey of Fishes in the Coiba National Park May 5-22, 2003"
)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use or interpretation of the dataset.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

creator <- list(eml$creator(
    individualName = eml$individualName(
      givenName = "D. Ross", 
      surName = "Robertson"),
    organizationName = "Smithsonian Tropical Research Institution"
  ), eml$creator(
    individualName = eml$individualName(
      givenName = "James", 
      surName = "Van Tassell"),
    organizationName = "Hofstra University"
  ), eml$creator(
    individualName = eml$individualName(
      givenName = "John", 
      surName = "Earle"),
    organizationName = "Bishop Museum"
  ), eml$creator(
    individualName = eml$individualName(
      givenName = "Edgardo", 
      surName = "Ochoa"),
    organizationName = "Smithsonian Tropical Research Institution"
  )
)


contact <- eml$creator(
  individualName = eml$individualName(
    givenName = "OBIS", 
    surName = "Secretariat"),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

metadataProvider <- eml$metadataProvider(
  individualName = eml$individualName(
    givenName = "Chandra", 
    surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = eml$individualName(
    givenName = "Chandra", 
    surName = "Earl"),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = paste0(format(Sys.time(), "%Y-%m-%dT%H:%M:%OS3"), paste0(substr(format(Sys.time(), "%z"), 1, 3), ":", paste0(substr(format(Sys.time(), "%z"), 4, 5)))),
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Robertson, D R. (2003). Survey of reef fishes in the Coiba National Park, Panama, May 5-22, 2003.")
    )
  )
)

citationdoi <- "http://dx.doi.org/10.13140/RG.2.1.3983.6008"

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

#Coverage
coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Coiba National Park",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = st_bbox(ind_shape)$xmin,
      eastBoundingCoordinate = st_bbox(ind_shape)$xmax,
      northBoundingCoordinate = st_bbox(ind_shape)$ymax,
      southBoundingCoordinate = st_bbox(ind_shape)$ymin)
    ),
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes")
      )
    
#  ),
#  temporalCoverage = eml$temporalCoverage(
#    rangeOfDates = eml$rangeOfDates(
#      beginDate = eml$beginDate(
#        calendarDate = "2019-05-01"
#      ),
#      endDate = eml$endDate(
#        calendarDate = "2016-05-06"
#      )
#    )
   )
)

Extra MetaData

These fields are not required, though they make the metadata more complete.

methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(
      para = paste("See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/", site_dir_name, "/", dataset_dir_name, "\"> R Notebook</a> for dataset construction methods", sep="")
    )
  )
)

#Other Data
pubDate <- "2023-10-15"

#language of original document
language <- "eng"

keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

maintenance <- eml$maintenance(
  description = eml$description(
    para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

#Universal CC
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)


purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)

Create and Validate EML

#Put it all together
my_eml <- eml$eml(
           packageId = paste("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0", sep = ""),  
           system = "http://gbif.org",
           scope = "system",
           dataset = eml$dataset(
               alternateIdentifier = alternateIdentifier,
               title = title,
               creator = creator,
               metadataProvider = metadataProvider,
               associatedParty = associatedParty,
               pubDate = pubDate,
               coverage = coverage,
               language = language,
               abstract = abstract,
               keywordSet = keywordSet,
               contact = contact,
               methods = methods,
               intellectualRights = intellectualRights,
               purpose = purpose,
               maintenance = maintenance,
               additionalInfo = additionalInfo),
           additionalMetadata = additionalMetadata
)

eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This is a file which describes the archive and data file structure and is required in a Darwin Core Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”.

meta_template <- paste(path_to_project_root, "scripts_data/meta_occurrence_occurrence_template.xml", sep="/")
meta <- read_xml(meta_template)

fields <- xml_find_all(meta, "//d1:field")

for (field in fields) {
  term <- xml_attr(field, "term")
  if (term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    xml_set_attr(field, "index", 3)
    xml_set_attr(field, "default", NULL)
  } else if (term == "http://rs.tdwg.org/dwc/terms/country") {
    xml_set_attr(field, "default", country)
  } else if (term == "http://rs.tdwg.org/dwc/terms/geodeticDatum") {
    xml_set_attr(field, "default", geodeticDatum)
  } else if (term == "http://rs.tdwg.org/dwc/terms/occurrenceStatus") {
    xml_set_attr(field, "default", occurrenceStatus)
  } else if (term == "http://rs.tdwg.org/dwc/terms/basisOfRecord") {
    xml_set_attr(field, "default", basisOfRecord)
  }
}

#Add identificationQualifier
new_field <- xml_add_sibling(fields[[2]], "field")
xml_set_attr(new_field, "index", "2")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/identificationQualifier")

Save outputs

dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep="/")

write.csv(occurrence, paste(dwc_output_dir, "/occurrence.csv", sep = ""), na = "", row.names=FALSE)
write_xml(meta, file = paste(dwc_output_dir, "/meta.xml", sep = ""))
write_eml(my_eml, paste(dwc_output_dir, "/eml.xml", sep = ""))

Edit EML

We have to further edit the eml file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges child nodes in alphabetical order.

#edit the schemaLocation and rearrange gbif node for gbif specific eml file
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))

#change schemaLocation attributes for GBIF
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


#rearrange children nodes under the GBIF element
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
xml_set_attr(bibcitation, "identifier", citationdoi)

eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)

write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))

Zip files to DwC-A

output_zip <- paste(dwc_output_dir, "DwC-A.zip", sep="/")

if (file.exists(output_zip)) {
  unlink(output_zip)
}

file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip(zipfile = output_zip, files = file_paths, mode = "cherry-pick")

if (file.exists(output_zip)) {
  unlink(file_paths)
}
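
As a final check, the contents of the archive can be listed with the zip package loaded above; this is a minimal verification sketch.

#List the files inside the DwC-A zip
zip::zip_list(output_zip)$filename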