Bundling CCMAR - Centre of Marine Sciences 2023 to a DwC Archive

This is an R Markdown Notebook for converting the species checklist found in the following reference to DarwinCore format for upload into OBIS as part of UNESCO’s eDNA Expeditions project:

Project Marafrica: a network monitoring, integrating and assessing marine biodiversity data along the west africa to understand, predict and mitigate climatic / oceanographic changes.

Data were downloaded on 2023-10-17.

Setup

Call the necessary libraries and variables. Suppresses loading messages.

library(magrittr)                       # To use %<>% pipes
suppressMessages(library(janitor))      # To clean input data
suppressMessages(library(dplyr))        # To clean input data
library(stringr)                        # To clean input data
suppressMessages(library(rgnparser))    # To clean species names
suppressMessages(library(taxize))       # To get WoRMS IDs
library(worrms)                         # To get WoRMS IDs
library(digest)                         # To generate hashes
suppressMessages(library(obistools))    # To generate centroid lat/long and uncertainty
suppressMessages(library(sf))           # To generate wkt polygon
suppressMessages(library(EML))          # To create eml.xml file
library(xml2)                           # To create the meta.xml file
suppressMessages(library(zip))          # To zip DwC file

Input Parameters and Paths

# Relative path to the project root; the raw input path further down is
# built from it.
path_to_project_root <- "../../.."
# Site and dataset directory names; the raw CSV is read from
# <root>/datasets/<site_dir_name>/<dataset_dir_name>/raw/.
site_dir_name <- "banc_darguin_national_park"
dataset_dir_name <- "CCMAR_2023"
# Left empty; presumably because this dataset came as a CSV rather than a
# PDF -- TODO confirm.
original_pdf <- ""
# NOTE(review): not used in the visible part of the notebook; presumably the
# short identifier for the resulting archive -- confirm.
short_name <- "banc-darguin-ccmar-2023"

Parsing PDF table to CSV

For references whose data are formatted as an image-based table inside a PDF across multiple sheets, we would first use pdf_to_table to OCR and parse the table out to a CSV. We don’t have to do this here since we already have the CSV.

Read source data

Now we’ll read in the csv table outputted from the previous step

# File name of the raw CSV for this dataset
processed_csv <- "biodiversity-data.csv"

# The raw file sits under <root>/datasets/<site>/<dataset>/raw/
input_data <- read.csv(
  file.path(
    path_to_project_root,
    "datasets",
    site_dir_name,
    dataset_dir_name,
    "raw",
    processed_csv
  )
)

# Preview the first rows as a formatted table
knitr::kable(head(input_data))
scientificName decimalLongitude decimalLatitude year month day associatedMedia datasetName references
Bryopsis corymbosa -17.05011 20.76984 1977 NA NA MarAfrica
Bryopsis plumosa -17.05011 20.76984 1966 NA NA MarAfrica
Chaetomorpha brachygona -17.05011 20.76984 1952 NA NA MarAfrica
Cladophora prolifera -17.05011 20.76984 1977 NA NA MarAfrica
Codium decorticatum -17.05011 20.76984 1911 NA NA MarAfrica
Codium tomentosum -17.05011 20.76984 1977 NA NA MarAfrica

Preprocessing

Here we tidy the data up, since OCR and table-parsing errors are common, and keep only the list of species, since this is a checklist.

Tidy Data

# Discard rows and columns that hold no data at all
input_data <- remove_empty(input_data, which = c("rows", "cols"))

# Leave out the iNaturalist records, since they would be duplicated.
# which() skips rows whose comparison is NA, just as subset() does.
cleaned_data <- input_data[
  with(input_data, which(datasetName != "iNaturalist")),
  ,
  drop = FALSE
]

# Preview the first rows as a formatted table
knitr::kable(head(cleaned_data))
scientificName decimalLongitude decimalLatitude year month day associatedMedia datasetName references
Bryopsis corymbosa -17.05011 20.76984 1977 NA NA MarAfrica
Bryopsis plumosa -17.05011 20.76984 1966 NA NA MarAfrica
Chaetomorpha brachygona -17.05011 20.76984 1952 NA NA MarAfrica
Cladophora prolifera -17.05011 20.76984 1977 NA NA MarAfrica
Codium decorticatum -17.05011 20.76984 1911 NA NA MarAfrica
Codium tomentosum -17.05011 20.76984 1977 NA NA MarAfrica

Get WoRMS IDs

Auto matching

First we will try to do this automatically by cleaning the species names using gnparser and then using the taxize library to call the WoRMS database.

# Parse the scientific names with gnparser so that author names are split out.
# The column is selected by name rather than by position, so the right values
# are parsed even if the column order of the input changes (for example when
# an empty leading column is dropped during tidying).
parsed_names <- rgnparser::gn_parse(cleaned_data[["scientificName"]])

# Look up the WoRMS ID for one parsed name.
#
# The canonical name is first searched with the accepted-only filter switched
# on. Only when that search reports "not found" is it repeated without the
# filter, so that unaccepted names can match as well. If the second search
# also reports "not found", NA is returned; otherwise the result of the
# search that matched is returned unchanged.
get_worms_id_from_element <- function(element) {
  canonical_name <- element$canonical$full

  accepted_hit <- get_wormsid(
    canonical_name,
    searchtype = "scientific",
    fuzzy = TRUE,
    messages = FALSE,
    accepted = TRUE
  )
  if (attr(accepted_hit, "match") != "not found") {
    return(accepted_hit)
  }

  # Nothing among accepted names: retry without the accepted-only filter
  any_status_hit <- get_wormsid(
    canonical_name,
    searchtype = "scientific",
    messages = FALSE,
    fuzzy = TRUE
  )
  if (attr(any_status_hit, "match") != "not found") {
    return(any_status_hit)
  }

  NA
}

# Resolve each parsed name in turn. Entries that gnparser flagged as not
# parsed are not looked up and get NA directly.
worms_ids <- lapply(parsed_names, function(parsed_name) {
  if (parsed_name$parsed) get_worms_id_from_element(parsed_name) else NA
})
## 
##        id                                   target               authority
## 2  691418        Enteromorpha flexuosa f. crispata               Schiffner
## 3  691416        Enteromorpha flexuosa f. crispate               Schiffner
## 8  416363 Enteromorpha flexuosa subsp. linziformis (Bliding) Bliding, 1963
## 10 680044      Enteromorpha flexuosa var. angulosa       angulosa Kjellman
##      status
## 2  accepted
## 3  accepted
## 8  accepted
## 10 accepted
## 
## More than one WORMS ID found for taxon 'Enteromorpha flexuosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                        target           authority   status
## 3  691355   Ulva fasciata f. caespitosa            Setchell accepted
## 7  691356        Ulva fasciata f. major              Tilden accepted
## 8  691357        Ulva fasciata f. minor              Tilden accepted
## 10 679899   Ulva fasciata var. concolor            Montagne accepted
## 11 679900     Ulva fasciata var. lobata   (Kützing) Piccone accepted
## 12 679901    Ulva fasciata var. palmata (C.Agardh) Montagne accepted
## 13 679902 Ulva fasciata var. subsimplax            Montagne accepted
## 
## More than one WORMS ID found for taxon 'Ulva fasciata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                        target                               authority
## 1  145982                 Ulva fasciata                            Delile, 1813
## 2  548159                 Ulva fasciata                          S.F.Gray, 1821
## 3  691355   Ulva fasciata f. caespitosa                                Setchell
## 4  698075      Ulva fasciata f. costata                            M.Howe, 1914
## 5  698052      Ulva fasciata f. expansa                          Setchell, 1905
## 6  698137       Ulva fasciata f. lobata                          Setchell, 1901
## 7  691356        Ulva fasciata f. major                                  Tilden
## 8  691357        Ulva fasciata f. minor                                  Tilden
## 9  698073     Ulva fasciata f. taeniata                          Setchell, 1901
## 10 679899   Ulva fasciata var. concolor                                Montagne
## 11 679900     Ulva fasciata var. lobata                       (Kützing) Piccone
## 12 679901    Ulva fasciata var. palmata                     (C.Agardh) Montagne
## 13 679902 Ulva fasciata var. subsimplax                                Montagne
## 14 696822   Ulva fasciata var. taeniata (Setchell) Saifullah & Nizamuddin, 1977
##        status
## 1  unaccepted
## 2  unaccepted
## 3    accepted
## 4  unaccepted
## 5  unaccepted
## 6  unaccepted
## 7    accepted
## 8    accepted
## 9  unaccepted
## 10   accepted
## 11   accepted
## 12   accepted
## 13   accepted
## 14 unaccepted
## 
## More than one WORMS ID found for taxon 'Ulva fasciata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                                   target                   authority
## 7  693749 Ceramium strictum f. corticatulostrictum            (Kylin) Sjöstedt
## 9  693750         Ceramium strictum f. divaricatum            Holmes & Batters
## 11 693752               Ceramium strictum f. nanum                   Schiffner
## 12 693753          Ceramium strictum f. proliferum                     Collins
## 13 693754          Ceramium strictum f. proliferum                 F.S.Collins
## 15 693755  Ceramium strictum f. strictotenuissimum                H.E.Petersen
## 17 693756          Ceramium strictum f. vertebrale (H.E.Petersen) H.E.Petersen
## 20 550951        Ceramium strictum var. acrocarpum   (Kützing) Schiffner, 1943
## 21 685961  Ceramium strictum var. breviarticulatum                   Ardissone
## 22 552869         Ceramium strictum var. delicatum                    J.Agardh
## 23 685962          Ceramium strictum var. strictum                    J.Agardh
##      status
## 7  accepted
## 9  accepted
## 11 accepted
## 12 accepted
## 13 accepted
## 15 accepted
## 17 accepted
## 20 accepted
## 21 accepted
## 22 accepted
## 23 accepted
## 
## More than one WORMS ID found for taxon 'Ceramium strictum'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##         id                                    target
## 1   146786                         Ceramium strictum
## 2   548115                         Ceramium strictum
## 3   381136                         Ceramium strictum
## 4   548093                         Ceramium strictum
## 5  1359644                         Ceramium strictum
## 6   554274           Ceramium strictum f. acrocarpum
## 8   554086 Ceramium strictum f. corticatulo-strictum
## 7   693749  Ceramium strictum f. corticatulostrictum
## 9   693750          Ceramium strictum f. divaricatum
## 10  693751           Ceramium strictum f. minusculum
## 11  693752                Ceramium strictum f. nanum
## 12  693753           Ceramium strictum f. proliferum
## 13  693754           Ceramium strictum f. proliferum
## 16  554089  Ceramium strictum f. stricto-tenuissimum
## 14  554088          Ceramium strictum f. strictoides
## 15  693755   Ceramium strictum f. strictotenuissimum
## 17  693756           Ceramium strictum f. vertebrale
## 18  554090                Ceramium strictum f. verum
## 19  554764       Ceramium strictum subsp. tenuicorne
## 20  550951         Ceramium strictum var. acrocarpum
## 21  685961   Ceramium strictum var. breviarticulatum
## 22  552869          Ceramium strictum var. delicatum
## 23  685962           Ceramium strictum var. strictum
## 24  552460        Ceramium strictum var. zostericola
## 25 1309297        Ceramium strictum var. zostericola
##                              authority     status
## 1           (Kützing) Rabenhorst, 1847 unaccepted
## 2               (Kützing) Harvey, 1849 unaccepted
## 3              Greville & Harvey, 1846 unaccepted
## 4                           Roth, 1806  uncertain
## 5    (Mertens ex Dillwyn) Poiret, 1811 unaccepted
## 6                        Mazoyer, 1938 unaccepted
## 8               (Kylin) Sjöstedt, 1928 unaccepted
## 7                     (Kylin) Sjöstedt   accepted
## 9                     Holmes & Batters   accepted
## 10                       Mazoyer, 1938 unaccepted
## 11                           Schiffner   accepted
## 12                             Collins   accepted
## 13                         F.S.Collins   accepted
## 16                      Petersen, 1908 unaccepted
## 14           (Petersen) Sjöstedt, 1928 unaccepted
## 15                        H.E.Petersen   accepted
## 17         (H.E.Petersen) H.E.Petersen   accepted
## 18                      Petersen, 1908 unaccepted
## 19 (Kützing) Rueness & Kornfeldt, 1992 unaccepted
## 20           (Kützing) Schiffner, 1943   accepted
## 21                           Ardissone   accepted
## 22                            J.Agardh   accepted
## 23                            J.Agardh   accepted
## 24                        Thuret, 1863 unaccepted
## 25              Feldmann-Mazoyer, 1941 unaccepted
## 
## More than one WORMS ID found for taxon 'Ceramium strictum'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target              authority   status
## 2 552943 Gracilaria verrucosa var. procerrima (Esper) M.P.Reis, 1981 accepted
## 3 551240   Gracilaria verrucosa var. ramulosa    (C.Agardh) M.P.Reis accepted
## 
## More than one WORMS ID found for taxon 'Gracilaria verrucosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                                   target                   authority
## 7  693749 Ceramium strictum f. corticatulostrictum            (Kylin) Sjöstedt
## 9  693750         Ceramium strictum f. divaricatum            Holmes & Batters
## 11 693752               Ceramium strictum f. nanum                   Schiffner
## 12 693753          Ceramium strictum f. proliferum                     Collins
## 13 693754          Ceramium strictum f. proliferum                 F.S.Collins
## 15 693755  Ceramium strictum f. strictotenuissimum                H.E.Petersen
## 17 693756          Ceramium strictum f. vertebrale (H.E.Petersen) H.E.Petersen
## 20 550951        Ceramium strictum var. acrocarpum   (Kützing) Schiffner, 1943
## 21 685961  Ceramium strictum var. breviarticulatum                   Ardissone
## 22 552869         Ceramium strictum var. delicatum                    J.Agardh
## 23 685962          Ceramium strictum var. strictum                    J.Agardh
##      status
## 7  accepted
## 9  accepted
## 11 accepted
## 12 accepted
## 13 accepted
## 15 accepted
## 17 accepted
## 20 accepted
## 21 accepted
## 22 accepted
## 23 accepted
## 
## More than one WORMS ID found for taxon 'Ceramium strictum'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##         id                                    target
## 1   146786                         Ceramium strictum
## 2   548115                         Ceramium strictum
## 3   381136                         Ceramium strictum
## 4   548093                         Ceramium strictum
## 5  1359644                         Ceramium strictum
## 6   554274           Ceramium strictum f. acrocarpum
## 8   554086 Ceramium strictum f. corticatulo-strictum
## 7   693749  Ceramium strictum f. corticatulostrictum
## 9   693750          Ceramium strictum f. divaricatum
## 10  693751           Ceramium strictum f. minusculum
## 11  693752                Ceramium strictum f. nanum
## 12  693753           Ceramium strictum f. proliferum
## 13  693754           Ceramium strictum f. proliferum
## 16  554089  Ceramium strictum f. stricto-tenuissimum
## 14  554088          Ceramium strictum f. strictoides
## 15  693755   Ceramium strictum f. strictotenuissimum
## 17  693756           Ceramium strictum f. vertebrale
## 18  554090                Ceramium strictum f. verum
## 19  554764       Ceramium strictum subsp. tenuicorne
## 20  550951         Ceramium strictum var. acrocarpum
## 21  685961   Ceramium strictum var. breviarticulatum
## 22  552869          Ceramium strictum var. delicatum
## 23  685962           Ceramium strictum var. strictum
## 24  552460        Ceramium strictum var. zostericola
## 25 1309297        Ceramium strictum var. zostericola
##                              authority     status
## 1           (Kützing) Rabenhorst, 1847 unaccepted
## 2               (Kützing) Harvey, 1849 unaccepted
## 3              Greville & Harvey, 1846 unaccepted
## 4                           Roth, 1806  uncertain
## 5    (Mertens ex Dillwyn) Poiret, 1811 unaccepted
## 6                        Mazoyer, 1938 unaccepted
## 8               (Kylin) Sjöstedt, 1928 unaccepted
## 7                     (Kylin) Sjöstedt   accepted
## 9                     Holmes & Batters   accepted
## 10                       Mazoyer, 1938 unaccepted
## 11                           Schiffner   accepted
## 12                             Collins   accepted
## 13                         F.S.Collins   accepted
## 16                      Petersen, 1908 unaccepted
## 14           (Petersen) Sjöstedt, 1928 unaccepted
## 15                        H.E.Petersen   accepted
## 17         (H.E.Petersen) H.E.Petersen   accepted
## 18                      Petersen, 1908 unaccepted
## 19 (Kützing) Rueness & Kornfeldt, 1992 unaccepted
## 20           (Kützing) Schiffner, 1943   accepted
## 21                           Ardissone   accepted
## 22                            J.Agardh   accepted
## 23                            J.Agardh   accepted
## 24                        Thuret, 1863 unaccepted
## 25              Feldmann-Mazoyer, 1941 unaccepted
## 
## More than one WORMS ID found for taxon 'Ceramium strictum'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                                   target               authority
## 2  691418        Enteromorpha flexuosa f. crispata               Schiffner
## 3  691416        Enteromorpha flexuosa f. crispate               Schiffner
## 8  416363 Enteromorpha flexuosa subsp. linziformis (Bliding) Bliding, 1963
## 10 680044      Enteromorpha flexuosa var. angulosa       angulosa Kjellman
##      status
## 2  accepted
## 3  accepted
## 8  accepted
## 10 accepted
## 
## More than one WORMS ID found for taxon 'Enteromorpha flexuosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target              authority   status
## 2 552943 Gracilaria verrucosa var. procerrima (Esper) M.P.Reis, 1981 accepted
## 3 551240   Gracilaria verrucosa var. ramulosa    (C.Agardh) M.P.Reis accepted
## 
## More than one WORMS ID found for taxon 'Gracilaria verrucosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                        target           authority   status
## 3  691355   Ulva fasciata f. caespitosa            Setchell accepted
## 7  691356        Ulva fasciata f. major              Tilden accepted
## 8  691357        Ulva fasciata f. minor              Tilden accepted
## 10 679899   Ulva fasciata var. concolor            Montagne accepted
## 11 679900     Ulva fasciata var. lobata   (Kützing) Piccone accepted
## 12 679901    Ulva fasciata var. palmata (C.Agardh) Montagne accepted
## 13 679902 Ulva fasciata var. subsimplax            Montagne accepted
## 
## More than one WORMS ID found for taxon 'Ulva fasciata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                        target                               authority
## 1  145982                 Ulva fasciata                            Delile, 1813
## 2  548159                 Ulva fasciata                          S.F.Gray, 1821
## 3  691355   Ulva fasciata f. caespitosa                                Setchell
## 4  698075      Ulva fasciata f. costata                            M.Howe, 1914
## 5  698052      Ulva fasciata f. expansa                          Setchell, 1905
## 6  698137       Ulva fasciata f. lobata                          Setchell, 1901
## 7  691356        Ulva fasciata f. major                                  Tilden
## 8  691357        Ulva fasciata f. minor                                  Tilden
## 9  698073     Ulva fasciata f. taeniata                          Setchell, 1901
## 10 679899   Ulva fasciata var. concolor                                Montagne
## 11 679900     Ulva fasciata var. lobata                       (Kützing) Piccone
## 12 679901    Ulva fasciata var. palmata                     (C.Agardh) Montagne
## 13 679902 Ulva fasciata var. subsimplax                                Montagne
## 14 696822   Ulva fasciata var. taeniata (Setchell) Saifullah & Nizamuddin, 1977
##        status
## 1  unaccepted
## 2  unaccepted
## 3    accepted
## 4  unaccepted
## 5  unaccepted
## 6  unaccepted
## 7    accepted
## 8    accepted
## 9  unaccepted
## 10   accepted
## 11   accepted
## 12   accepted
## 13   accepted
## 14 unaccepted
## 
## More than one WORMS ID found for taxon 'Ulva fasciata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target                 authority
## 1 239032                 Laurencia paniculata (C.Agardh) J.Agardh, 1852
## 2 548034                 Laurencia paniculata             Kützing, 1849
## 3 554650 Laurencia paniculata f. patentiramea    (Montagne) Hauck, 1883
## 4 551654   Laurencia paniculata var. snackeyi     Weber-van Bosse, 1923
##       status
## 1 unaccepted
## 2 unaccepted
## 3 unaccepted
## 4 unaccepted
## 
## More than one WORMS ID found for taxon 'Laurencia paniculata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target                 authority
## 1 239032                 Laurencia paniculata (C.Agardh) J.Agardh, 1852
## 2 548034                 Laurencia paniculata             Kützing, 1849
## 3 554650 Laurencia paniculata f. patentiramea    (Montagne) Hauck, 1883
## 4 551654   Laurencia paniculata var. snackeyi     Weber-van Bosse, 1923
##       status
## 1 unaccepted
## 2 unaccepted
## 3 unaccepted
## 4 unaccepted
## 
## More than one WORMS ID found for taxon 'Laurencia paniculata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                              target                        authority
## 2 553197 Laurencia papillosa f. australasica Kützing ex Weber-van Bosse, 1913
## 4 550833 Laurencia papillosa var. thyrsoides                          Kützing
##     status
## 2 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Laurencia papillosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target                 authority
## 1 239032                 Laurencia paniculata (C.Agardh) J.Agardh, 1852
## 2 548034                 Laurencia paniculata             Kützing, 1849
## 3 554650 Laurencia paniculata f. patentiramea    (Montagne) Hauck, 1883
## 4 551654   Laurencia paniculata var. snackeyi     Weber-van Bosse, 1923
##       status
## 1 unaccepted
## 2 unaccepted
## 3 unaccepted
## 4 unaccepted
## 
## More than one WORMS ID found for taxon 'Laurencia paniculata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                               target                 authority
## 1 239032                 Laurencia paniculata (C.Agardh) J.Agardh, 1852
## 2 548034                 Laurencia paniculata             Kützing, 1849
## 3 554650 Laurencia paniculata f. patentiramea    (Montagne) Hauck, 1883
## 4 551654   Laurencia paniculata var. snackeyi     Weber-van Bosse, 1923
##       status
## 1 unaccepted
## 2 unaccepted
## 3 unaccepted
## 4 unaccepted
## 
## More than one WORMS ID found for taxon 'Laurencia paniculata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                              target                        authority
## 2 553197 Laurencia papillosa f. australasica Kützing ex Weber-van Bosse, 1913
## 4 550833 Laurencia papillosa var. thyrsoides                          Kützing
##     status
## 2 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Laurencia papillosa'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                 target            authority   status
## 1 145496 Erythrotrichia simplex P.J.L.Dangeard, 1968 accepted
## 2 838682 Erythrotrichia simplex     B.F.Zheng & J.Li accepted
## 
## More than one WORMS ID found for taxon 'Erythrotrichia simplex'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                                               target
## 5  690658                   Enteromorpha clathrata f. denudata
## 7  690660               Enteromorpha clathrata f. foeniculacea
## 10 690657                   Enteromorpha clathrata f. linkiana
## 13 690663                 Enteromorpha clathrata f. procerrima
## 14 690655                  Enteromorpha clathrata f. prostrata
## 16 690659                 Enteromorpha clathrata f. ramulifera
## 18 690661                    Enteromorpha clathrata f. robusta
## 19 690662                     Enteromorpha clathrata f. tenuis
## 20 678042               Enteromorpha clathrata var. agardhiana
## 21 702722    Enteromorpha clathrata var. agardhiana f. ambigua
## 22 702723 Enteromorpha clathrata var. agardhiana f. nudiuscula
## 23 678048                  Enteromorpha clathrata var. angusta
## 25 678049          Enteromorpha clathrata var. angustimembrana
## 27 678052              Enteromorpha clathrata var. confervacea
## 29 678055             Enteromorpha clathrata var. confervoidea
## 30 678043             Enteromorpha clathrata var. confervoidea
## 33 678056                   Enteromorpha clathrata var. erecta
## 34 678051                  Enteromorpha clathrata var. genuina
## 46 678050                 Enteromorpha clathrata var. rothiana
## 45 678044                 Enteromorpha clathrata var. Rothiana
## 47 678045                 Enteromorpha clathrata var. Rothiena
## 48 706785                 Enteromorpha clathrata var. Rothiene
## 49 706949 Enteromorpha clathrata var. Rothiene f. faeniculacea
## 50 678054                  Enteromorpha clathrata var. spinosa
## 51 678054                  Enteromorpha clathrata var. spinosa
## 53 702724       Enteromorpha clathrata var. uncinata f. tenuis
## 54 706786                 Enteromorpha clathrata var. uncinate
## 55 706950      Enteromorpha clathrata var. uncinate f. robuste
## 56 678046                 Enteromorpha clathrata var. uncinota
##                 authority   status
## 5          (Ahlner) Kylin accepted
## 7       (Le Jolis) Chalon accepted
## 10 (Greville) V.J.Chapman accepted
## 13                 Wollny accepted
## 14     (Le Jolis) Batters accepted
## 16         (Ahlner) Kylin accepted
## 18      (Le Jolis) Chalon accepted
## 19      (Le Jolis) Chalon accepted
## 20      (Le Jolis) Chalon accepted
## 21                   <NA> accepted
## 22                   <NA> accepted
## 23            V.J.Chapman accepted
## 25            V.J.Chapman accepted
## 27    Areschoug ex Ahlner accepted
## 29    (C.Agardh) Montagne accepted
## 30         (Ag.) Montagne accepted
## 33     (Lyngbye) Greville accepted
## 34                Batters accepted
## 46                 Chalon accepted
## 45      (Le Jolis) Chalon accepted
## 47         Ef.] prostrata accepted
## 48                   <NA> accepted
## 49                   <NA> accepted
## 50   (Kützing) Rabenhorst accepted
## 51   (Kützing) Rabenhorst accepted
## 53             (Le Jolis) accepted
## 54                   <NA> accepted
## 55             (Le Jolis) accepted
## 56                   <NA> accepted
## 
## More than one WORMS ID found for taxon 'Enteromorpha clathrata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                                               target
## 5  690658                   Enteromorpha clathrata f. denudata
## 7  690660               Enteromorpha clathrata f. foeniculacea
## 10 690657                   Enteromorpha clathrata f. linkiana
## 13 690663                 Enteromorpha clathrata f. procerrima
## 14 690655                  Enteromorpha clathrata f. prostrata
## 16 690659                 Enteromorpha clathrata f. ramulifera
## 18 690661                    Enteromorpha clathrata f. robusta
## 19 690662                     Enteromorpha clathrata f. tenuis
## 20 678042               Enteromorpha clathrata var. agardhiana
## 21 702722    Enteromorpha clathrata var. agardhiana f. ambigua
## 22 702723 Enteromorpha clathrata var. agardhiana f. nudiuscula
## 23 678048                  Enteromorpha clathrata var. angusta
## 25 678049          Enteromorpha clathrata var. angustimembrana
## 27 678052              Enteromorpha clathrata var. confervacea
## 29 678055             Enteromorpha clathrata var. confervoidea
## 30 678043             Enteromorpha clathrata var. confervoidea
## 33 678056                   Enteromorpha clathrata var. erecta
## 34 678051                  Enteromorpha clathrata var. genuina
## 46 678050                 Enteromorpha clathrata var. rothiana
## 45 678044                 Enteromorpha clathrata var. Rothiana
## 47 678045                 Enteromorpha clathrata var. Rothiena
## 48 706785                 Enteromorpha clathrata var. Rothiene
## 49 706949 Enteromorpha clathrata var. Rothiene f. faeniculacea
## 50 678054                  Enteromorpha clathrata var. spinosa
## 51 678054                  Enteromorpha clathrata var. spinosa
## 53 702724       Enteromorpha clathrata var. uncinata f. tenuis
## 54 706786                 Enteromorpha clathrata var. uncinate
## 55 706950      Enteromorpha clathrata var. uncinate f. robuste
## 56 678046                 Enteromorpha clathrata var. uncinota
##                 authority   status
## 5          (Ahlner) Kylin accepted
## 7       (Le Jolis) Chalon accepted
## 10 (Greville) V.J.Chapman accepted
## 13                 Wollny accepted
## 14     (Le Jolis) Batters accepted
## 16         (Ahlner) Kylin accepted
## 18      (Le Jolis) Chalon accepted
## 19      (Le Jolis) Chalon accepted
## 20      (Le Jolis) Chalon accepted
## 21                   <NA> accepted
## 22                   <NA> accepted
## 23            V.J.Chapman accepted
## 25            V.J.Chapman accepted
## 27    Areschoug ex Ahlner accepted
## 29    (C.Agardh) Montagne accepted
## 30         (Ag.) Montagne accepted
## 33     (Lyngbye) Greville accepted
## 34                Batters accepted
## 46                 Chalon accepted
## 45      (Le Jolis) Chalon accepted
## 47         Ef.] prostrata accepted
## 48                   <NA> accepted
## 49                   <NA> accepted
## 50   (Kützing) Rabenhorst accepted
## 51   (Kützing) Rabenhorst accepted
## 53             (Le Jolis) accepted
## 54                   <NA> accepted
## 55             (Le Jolis) accepted
## 56                   <NA> accepted
## 
## More than one WORMS ID found for taxon 'Enteromorpha clathrata'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##       id                                target                  authority
## 3 552942   Gigartina acicularis var. compressa (Kützing) Frauenfeld, 1854
## 4 552983 Gigartina acicularis var. ornithopoda                Mazza, 1904
##     status
## 3 accepted
## 4 accepted
## 
## More than one WORMS ID found for taxon 'Gigartina acicularis'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
##        id                               target               authority   status
## 2  553472   Sargassum vulgare f. diversifolium                  Grunow accepted
## 3  847596     Sargassum vulgare f. ercegovicii            A.Span, 2005 accepted
## 5  553473          Sargassum vulgare f. humile                  Grunow accepted
## 6 1311351     Sargassum vulgare f. leptocarpum  (Kützing) Grunow, 1889 accepted
## 7  708576   Sargassum vulgare f. linearifolium          J.Agardh, 1889 accepted
## 8  689026          Sargassum vulgare f. ovatum           Collins, 1901 accepted
## 9  551067 Sargassum vulgare var. angustifolium (Turner) C.Agardh, 1820 accepted
## 
## More than one WORMS ID found for taxon 'Sargassum vulgare'!
## 
##                   Enter rownumber of taxon (other inputs will return 'NA'):
# Combine the cleaned source rows, the parsed canonical names and the WoRMS IDs
# into one data frame (one output row per row of `cleaned_data`).
#
# Rows are collected in a preallocated list and bound once at the end rather
# than growing the data frame with rbind() on every iteration, and seq_len()
# is used so that an empty `cleaned_data` produces no iterations (1:0 would
# iterate over c(1, 0)).
combined_rows <- vector("list", nrow(cleaned_data))

for (i in seq_len(nrow(cleaned_data))) {
  canonical_value <- parsed_names[[i]]$canonical$full
  if (is.null(canonical_value)) {
    # No canonical form was returned for this name; keep the row with NA
    canonical_value <- NA
  }
  combined_rows[[i]] <- data.frame(
    CleanedData = cleaned_data[i, ],
    CanonicalFull = canonical_value,
    # Only the first element of each WoRMS lookup result is kept
    WormsIDs = worms_ids[[i]][1]
  )
}

combined_dataframe <- do.call(rbind, combined_rows)
if (is.null(combined_dataframe)) {
  # do.call(rbind, list()) returns NULL; keep the empty-data-frame contract
  combined_dataframe <- data.frame()
}

knitr::kable(head(combined_dataframe))
CleanedData.scientificName CleanedData.decimalLongitude CleanedData.decimalLatitude CleanedData.year CleanedData.month CleanedData.day CleanedData.associatedMedia CleanedData.datasetName CleanedData.references CanonicalFull WormsIDs
Bryopsis corymbosa -17.05011 20.76984 1977 NA NA MarAfrica Bryopsis corymbosa 144447
Bryopsis plumosa -17.05011 20.76984 1966 NA NA MarAfrica Bryopsis plumosa 144457
Chaetomorpha brachygona -17.05011 20.76984 1952 NA NA MarAfrica Chaetomorpha brachygona 157103
Cladophora prolifera -17.05011 20.76984 1977 NA NA MarAfrica Cladophora prolifera 145060
Codium decorticatum -17.05011 20.76984 1911 NA NA MarAfrica Codium decorticatum 145083
Codium tomentosum -17.05011 20.76984 1977 NA NA MarAfrica Codium tomentosum 145092

Human Verification

Sometimes there are misspellings in the original text or incorrect OCR that can be searched for and fixed by hand. To do this, view the combined dataframe, search for unmatched species in WoRMS and add the ID, and remove rows that were not autoremoved in the earlier cleaning steps.

Darwin Core mapping

Required Terms

OBIS currently has eight required DwC terms: scientificName, scientificNameID, occurrenceID, eventDate, decimalLongitude, decimalLatitude, occurrenceStatus, basisOfRecord.

scientificName/scientificNameID

Create a data frame with one row per distinct combination of coordinates, date, scientific name and WoRMS ID (though the source rows should already be unique). This will be our primary DarwinCore data frame.

# Build the primary Darwin Core table: keep one row per distinct
# coordinate/date/name/WoRMS ID combination, switch to Darwin Core term names,
# and turn each WoRMS ID into an LSID (rows without an ID stay NA).
occurrence <- combined_dataframe %>%
  distinct(
    CleanedData.decimalLongitude,
    CleanedData.decimalLatitude,
    CleanedData.year,
    CleanedData.month,
    CleanedData.day,
    CanonicalFull,
    WormsIDs
  ) %>%
  rename(
    scientificName = CanonicalFull,
    scientificNameID = WormsIDs,
    decimalLongitude = CleanedData.decimalLongitude,
    decimalLatitude = CleanedData.decimalLatitude,
    year = CleanedData.year,
    month = CleanedData.month,
    day = CleanedData.day
  ) %>%
  mutate(
    scientificNameID = ifelse(
      is.na(scientificNameID),
      NA,
      paste0("urn:lsid:marinespecies.org:taxname:", scientificNameID)
    )
  )

occurrenceID

OccurrenceID is an identifier for the occurrence record and should be persistent and globally unique. It is a combination of dataset-shortname:occurrence: and an MD5 hash of the record's scientific name, coordinates and date (year, month, day).

# digest() hashes its whole argument as a single object, so a vector input
# would give one hash for the entire column. Wrapping it in Vectorize() yields
# one hash per element instead.
vdigest <- Vectorize(digest)

# Generate occurrenceID: "<short_name>:occurrence:<md5>", where the hash input
# is the record's name, coordinates and date pasted together.
occurrence %<>%
  mutate(
    occurrenceID = paste(
      short_name,
      "occurrence",
      vdigest(
        paste(scientificName, decimalLongitude, decimalLatitude, year, month, day),
        algo = "md5"
      ),
      sep = ":"
    )
  )

eventDate

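eventDate is built from the year, month and day columns at whatever precision is available (YYYY, YYYY-MM or YYYY-MM-DD). It is left empty for records where the collection year is unknown.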

# Build eventDate as an ISO 8601 string at the precision available:
# "YYYY", "YYYY-MM" or "YYYY-MM-DD"; NA when the year is unknown.
#
# Each finer component is only appended when every coarser component is
# present. Previously a missing year with a known month or day produced
# strings such as "NA-05", and the "NA-NA-NA" cleanup could never match
# because NA components were never pasted in the first place.
has_year <- !is.na(occurrence$year)
has_month <- has_year & !is.na(occurrence$month)
has_day <- has_month & !is.na(occurrence$day)

# Always character, so the column type does not depend on whether any record
# happens to carry a month
event_date <- ifelse(has_year, as.character(occurrence$year), NA_character_)
event_date <- ifelse(
  has_month,
  paste(event_date, sprintf("%02d", as.integer(occurrence$month)), sep = "-"),
  event_date
)
event_date <- ifelse(
  has_day,
  paste(event_date, sprintf("%02d", as.integer(occurrence$day)), sep = "-"),
  event_date
)

occurrence$eventDate <- event_date

decimalLongitude/decimalLatitude

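The site polygon is read from the Marine World Heritage GeoPackage published at https://github.com/iobis/mwhs-shapes. In this notebook the record coordinates come from the source data, so the polygon is only loaded here (it is used later for the EML bounding box). When decimalLongitude, decimalLatitude and coordinateUncertaintyInMeters have to be derived from the shape instead, obistools::calculate_centroid can be used to calculate a centroid and radius from a WKT string.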

# Local cache of the Marine World Heritage site polygons
gpkg_path <- paste(path_to_project_root, "scripts_data/marine_world_heritage.gpkg", sep = "/")

# Download the GeoPackage once. A GeoPackage is a binary file, so it must be
# fetched with mode = "wb"; with the default text mode the file is corrupted
# on Windows (R only switches to binary automatically for a few extensions,
# and .gpkg is not one of them).
if (!file.exists(gpkg_path)) {
  download.file(
    "https://github.com/iobis/mwhs-shapes/blob/master/output/marine_world_heritage.gpkg?raw=true",
    gpkg_path,
    mode = "wb"
  )
}

shapes <- st_read(gpkg_path)
## Reading layer `marine_world_heritage' from data source 
##   `/mnt/c/Users/Chandra Earl/Desktop/Labs/UNESCO/mwhs-data-mobilization/scripts_data/marine_world_heritage.gpkg' 
##   using driver `GPKG'
## Simple feature collection with 60 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -55.32282 xmax: 180 ymax: 71.81381
## Geodetic CRS:  4326
# Some sites have both core and buffer polygons in the GeoPackage; grouping by
# site name and summarising merges them into one geometry per site.
shapes_processed <- summarise(group_by(shapes, name))

# Geometry of Banc d'Arguin National Park
is_target_site <- shapes_processed$name == "Banc d'Arguin National Park"
ind_shape <- shapes_processed$geom[which(is_target_site)]

occurrenceStatus

# Constant applied to every row; the variable is reused for meta.xml defaults
occurrenceStatus <- "present"
occurrence <- occurrence %>% mutate(occurrenceStatus = occurrenceStatus)

basisOfRecord

# Constant applied to every row; the variable is reused for meta.xml defaults
basisOfRecord <- "HumanObservation"
occurrence <- occurrence %>% mutate(basisOfRecord = basisOfRecord)

Extra Terms

footprintWKT

coordinateUncertaintyInMeters

# No uncertainty value is set for these records; the column is added empty
occurrence <- mutate(occurrence, coordinateUncertaintyInMeters = NA)

geodeticDatum

# Constant applied to every row; the variable is reused for meta.xml defaults
geodeticDatum <- "WGS84"
occurrence <- occurrence %>% mutate(geodeticDatum = geodeticDatum)

country

# Constant applied to every row; the variable is reused for meta.xml defaults
country <- "Mauritania"
occurrence <- occurrence %>% mutate(country = country)

locality

# Constant applied to every row; the variable is reused for meta.xml defaults
locality <- "Banc d'Arguin National Park"
occurrence <- occurrence %>% mutate(locality = locality)

Post-processing

Check data

Use the check_fields command from obistools to check if all OBIS required fields are present in an occurrence table and if any values are missing.

# Final column order of the occurrence table (and of occurrence.csv)
dwc_columns <- c(
  "occurrenceID", "scientificName", "scientificNameID",
  "eventDate", "year", "month", "day",
  "country", "locality",
  "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters",
  "geodeticDatum", "occurrenceStatus", "basisOfRecord"
)
occurrence <- occurrence %>% select(all_of(dwc_columns))

# Report required OBIS fields that are missing or contain empty values
check_fields(occurrence)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## ℹ The deprecated feature was likely used in the obistools package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 166 × 4
##    level field       row message                                 
##    <chr> <chr>     <int> <chr>                                   
##  1 error eventDate   204 Empty value for required field eventDate
##  2 error eventDate   205 Empty value for required field eventDate
##  3 error eventDate   206 Empty value for required field eventDate
##  4 error eventDate   207 Empty value for required field eventDate
##  5 error eventDate   208 Empty value for required field eventDate
##  6 error eventDate   209 Empty value for required field eventDate
##  7 error eventDate   210 Empty value for required field eventDate
##  8 error eventDate   211 Empty value for required field eventDate
##  9 error eventDate   212 Empty value for required field eventDate
## 10 error eventDate   213 Empty value for required field eventDate
## # ℹ 156 more rows

Create the EML file

This is a file which contains the dataset’s metadata and is required in a DarwinCore-Archive.

# Set the EML schema version to 2.1.1 before building the document; the call
# echoes the version that is now active.
emld::eml_version("eml-2.1.1")
## [1] "eml-2.1.1"
# Dataset title
title <- "Biodiversity data of the Banc D'Arguin National Park. Information for education, conservation and management."

# Alternate identifier: the dataset's resource URL on the OBIS IPT
alternateIdentifier <- paste0("https://ipt.obis.org/secretariat/resource?r=", short_name)

# Abstract (single paragraph)
abstract_text <- "The Parc National du Banc d′Arguin (PNBA, comprising 12,000 km2 of which half are marine) is the largest marine protected area in western Africa, recognized for its unique universal value by the UNESCO’s Marine World Heritage Programme. It is a globally significant hotspot for blue carbon, as the third largest area of seagrass and the third largest carbon stock among these UNESCO sites. PNBA is also a Ramsar Wetland of international importance and a WWF “Gift to the Earth”, and is the most important habitat of the Western Atlantic for nesting birds and Palaearctic migratory waders. The PNBA’s extensive marine vegetation provides major shelter, feeding and/or breeding sites for a wide range of marine species, including seabirds, turtles, endangered elasmobranchs and many other species of conservation and/or commercial interest (adapted from https://doi.org/10.1016/j.gecco.2021.e01890)"
abstract <- eml$abstract(para = abstract_text)

People

Here we add the people involved in the project:

The creator is the person or organization responsible for creating the resource itself.

The contact is the person or institution to contact with questions about the use or interpretation of a data set.

The metadataProvider is the person responsible for providing the metadata documentation for the resource.

The associatedParty (in this case the Data Curator) is the person who mobilized the data from the original resource.

# Creator: the person/organization responsible for the original resource.
# Wrapped in a list so that further creators can be appended.
creator <- list(
  eml$creator(
    individualName = eml$individualName(
      givenName = "Ester",
      surName = "Serrao"
    ),
    organizationName = "Universidade do Algarve"
  )
)

# Contact: who to ask about use or interpretation of the dataset.
# Built with the contact constructor so that each party is created by the
# constructor matching the element it is placed in (this one previously
# reused eml$creator).
contact <- eml$contact(
  individualName = eml$individualName(
    givenName = "OBIS",
    surName = "Secretariat"
  ),
  electronicMailAddress = "helpdesk@obis.org",
  organizationName = "OBIS",
  positionName = "Secretariat"
)

# Metadata provider: the person who wrote the metadata documentation.
metadataProvider <- eml$metadataProvider(
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"
  ),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

# Associated party: the data curator who mobilized the data, with role
# "processor".
associatedParty <- eml$associatedParty(
  role = "processor",
  individualName = eml$individualName(
    givenName = "Chandra",
    surName = "Earl"
  ),
  electronicMailAddress = "c.earl@unesco.org",
  organizationName = "UNESCO",
  positionName = "eDNA Scientific Officer"
)

Additional Metadata

Here we add the additionalMetadata element, which is required for a GBIF-type EML file and contains information such as the citation of the dataset, the citation of the original resource and the creation timestamp of the EML.

#{dataset.authors} ({dataset.pubDate}) {dataset.title}. [Version {dataset.version}]. {organization.title}. {dataset.type} Dataset {dataset.doi}, {dataset.url}

# Timestamp of EML creation, formatted as YYYY-MM-DDTHH:MM:SS.mmm+hh:mm.
# The clock is read once so that the date/time part and the UTC offset always
# describe the same instant (previously Sys.time() was called three times).
stamp_time <- Sys.time()
utc_offset <- format(stamp_time, "%z")  # "+hhmm"; the colon is inserted below
date_stamp <- paste0(
  format(stamp_time, "%Y-%m-%dT%H:%M:%OS3"),
  substr(utc_offset, 1, 3), ":", substr(utc_offset, 4, 5)
)

# GBIF-specific metadata: creation timestamp, hierarchy level, dataset
# citation (placeholder text) and the citation of the original resource.
additionalMetadata <- eml$additionalMetadata(
  metadata = list(
    gbif = list(
      dateStamp = date_stamp,
      hierarchyLevel = "dataset",
      citation = "IPT will autogenerate this",
      bibliography = list(
        citation = "Project Marafrica: a network monitoring, integrating and assessing marine biodiversity data along the west africa to understand, predict and mitigate climatic / oceanographic changes. Aga-Khan Foundation and FCT-Portugal and Project STM - Survi des Tortues Marines. MAVA Foundation and PRCM. 2023. Data Downloaded 2023-10-17. "
      )
    )
  )
)

# Identifier attached to the bibliography citation when the EML file is edited
# after writing. Note that this is the project URL rather than a DOI.
citationdoi <- "https://www.marafrica.net/pnba/"

Coverage

Here we describe the dataset’s geographic, taxonomic and temporal coverage.

# Coverage: geographic and taxonomic extent of the dataset.
#
# The bounding box is computed once from the site polygon. The west edge is
# the minimum longitude (xmin) and the east edge the maximum (xmax); these two
# were previously swapped.
site_bbox <- st_bbox(ind_shape)

coverage <- eml$coverage(
  geographicCoverage = eml$geographicCoverage(
    geographicDescription = "Banc d'Arguin National Park",
    boundingCoordinates = eml$boundingCoordinates(
      westBoundingCoordinate = site_bbox$xmin,
      eastBoundingCoordinate = site_bbox$xmax,
      northBoundingCoordinate = site_bbox$ymax,
      southBoundingCoordinate = site_bbox$ymin
    )
  ),
  # NOTE(review): this taxonomic coverage lists fishes only, while the first
  # records shown for this dataset are macroalgae (Bryopsis, Codium, ...).
  # It looks like it was carried over from a template; confirm and update.
  taxonomicCoverage = eml$taxonomicCoverage(
    generalTaxonomicCoverage = "Fishes",
    taxonomicClassification = list(
      eml$taxonomicClassification(
        taxonRankName = "Superclass",
        taxonRankValue = "Agnatha"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Chondrichthyes"
      ),
      eml$taxonomicClassification(
        taxonRankName = "unranked",
        taxonRankValue = "Osteichthyes"
      )
    )
  )
)

# Temporal coverage is intentionally not set. Disabled template kept for
# reference (note that its begin date is later than its end date):
#  temporalCoverage = eml$temporalCoverage(
#    rangeOfDates = eml$rangeOfDates(
#      beginDate = eml$beginDate(
#        calendarDate = "2019-05-01"
#      ),
#      endDate = eml$endDate(
#        calendarDate = "2016-05-06"
#      )
#    )
#  )

Extra MetaData

These fields are not required, though they make the metadata more complete.

# Methods: points readers to the project repository and this notebook
methods_text <- paste0(
  "See Github <a href=\"https://github.com/iobis/mwhs-data-mobilization\">Project</a> and <a href=\"https://iobis.github.io/mwhs-data-mobilization/notebooks/",
  site_dir_name,
  "/",
  dataset_dir_name,
  "\"> R Notebook</a> for dataset construction methods"
)
methods <- eml$methods(
  methodStep = eml$methodStep(
    description = eml$description(para = methods_text)
  )
)

# Publication date
# NOTE(review): this is earlier than the 2023-10-17 download date given in the
# bibliography citation; confirm which date is intended.
pubDate <- "2023-10-15"

# Language of the original document
language <- "eng"

# Dataset type keyword from the GBIF vocabulary
keywordSet <- eml$keywordSet(
  keyword = "Occurrence",
  keywordThesaurus = "GBIF Dataset Type Vocabulary: http://rs.gbif.org/vocabulary/gbif/dataset_type_2015-07-10.xml"
)

# Maintenance: no updates planned; the description is left empty
maintenance <- eml$maintenance(
  description = eml$description(para = ""),
  maintenanceUpdateFrequency = "notPlanned"
)

# Licence: CC0 1.0 public domain dedication
intellectualRights <- eml$intellectualRights(
  para = "To the extent possible under law, the publisher has waived all rights to these data and has dedicated them to the <ulink url=\"http://creativecommons.org/publicdomain/zero/1.0/legalcode\"><citetitle>Public Domain (CC0 1.0)</citetitle></ulink>. Users may copy, modify, distribute and use the work, including for commercial purposes, without restriction."
)

# Why the dataset was mobilized
purpose <- eml$purpose(
  para = "These data were made accessible through UNESCO's eDNA Expeditions project to mobilize available marine species and occurrence datasets from World Heritage Sites."
)

additionalInfo <- eml$additionalInfo(
  para = "marine, harvested by iOBIS"
)

Create and Validate EML

# Assemble the dataset element from the pieces defined above
dataset_element <- eml$dataset(
  alternateIdentifier = alternateIdentifier,
  title = title,
  creator = creator,
  metadataProvider = metadataProvider,
  associatedParty = associatedParty,
  pubDate = pubDate,
  coverage = coverage,
  language = language,
  abstract = abstract,
  keywordSet = keywordSet,
  contact = contact,
  methods = methods,
  intellectualRights = intellectualRights,
  purpose = purpose,
  maintenance = maintenance,
  additionalInfo = additionalInfo
)

# Package identifier: IPT resource URL for this dataset plus a version suffix
package_id <- paste0("https://ipt.obis.org/secretariat/resource?id=", short_name, "/v1.0")

# Full EML document: dataset plus the GBIF additional metadata
my_eml <- eml$eml(
  packageId = package_id,
  system = "http://gbif.org",
  scope = "system",
  dataset = dataset_element,
  additionalMetadata = additionalMetadata
)

# Validate the document against the EML schema
eml_validate(my_eml)
## [1] TRUE
## attr(,"errors")
## character(0)

Create meta.xml file

This is a file which describes the archive and data file structure and is required in a DarwinCore-Archive. It is based on the template file “meta_occurrence_occurrence_template.xml”.

# Adapt the generic meta.xml template so that its <field> entries describe the
# columns of this dataset's occurrence table.
meta_template <- paste(path_to_project_root, "scripts_data/meta_occurrence_occurrence_template.xml", sep="/")
meta <- read_xml(meta_template)

# All <field> elements; "d1" is the prefix xml2 assigns to the document's
# default namespace.
fields <- xml_find_all(meta, "//d1:field")

# Adjust each template field in place, keyed on its Darwin Core term URI.
# "index" presumably is the zero-based column position in occurrence.csv (the
# values used here agree with the column order chosen when the occurrence
# columns were reorganized) and "default" a constant for the whole file.
# Setting an attribute to NULL removes it.
for (field in fields) {
  term <- xml_attr(field, "term")
  if (term == "http://rs.tdwg.org/dwc/terms/eventDate") {
    # eventDate is a real column here, so give it an index and drop any default
    xml_set_attr(field, "index", "3")
    xml_set_attr(field, "default", NULL)
  } else if (term == "http://rs.tdwg.org/dwc/terms/country") {
    xml_set_attr(field, "default", country)
  } else if (term == "http://rs.tdwg.org/dwc/terms/geodeticDatum") {
    xml_set_attr(field, "default", geodeticDatum)
  } else if (term == "http://rs.tdwg.org/dwc/terms/occurrenceStatus") {
    xml_set_attr(field, "default", occurrenceStatus)
  } else if (term == "http://rs.tdwg.org/dwc/terms/basisOfRecord") {
    xml_set_attr(field, "default", basisOfRecord)
  } else if (term == "http://rs.tdwg.org/dwc/terms/locality") {
    # locality is described by its default only; its index attribute is removed
    xml_set_attr(field, "default", locality)
    xml_set_attr(field, "index", NULL)
  } else if (term == "http://rs.tdwg.org/dwc/terms/decimalLatitude") {
    xml_set_attr(field, "index", "9")
  } else if (term == "http://rs.tdwg.org/dwc/terms/decimalLongitude") {
    xml_set_attr(field, "index", "10")
  } else if (term == "http://rs.tdwg.org/dwc/terms/coordinateUncertaintyInMeters") {
    xml_set_attr(field, "index", "11")
  } else if (term == "http://rs.tdwg.org/dwc/terms/fieldNumber") {
    # fieldNumber is not part of this dataset; drop the field from the document
    xml_remove(field)
  }
}

#Add year, month, day
# Each new <field> is inserted as a sibling of an existing entry of `fields`
# and then appended to the end of `fields`.
# NOTE(review): `fields` was captured before the loop above, so positions 4-6
# refer to the template's original ordering and may include a node that was
# removed; verify against the template.
new_field <- xml_add_sibling(fields[[4]], "field")
xml_set_attr(new_field, "index", "4")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/year")

fields <- append(fields, list(new_field))

new_field <- xml_add_sibling(fields[[5]], "field")
xml_set_attr(new_field, "index", "5")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/month")

fields <- append(fields, list(new_field))

new_field <- xml_add_sibling(fields[[6]], "field")
xml_set_attr(new_field, "index", "6")
xml_set_attr(new_field, "term", "http://rs.tdwg.org/dwc/terms/day")

fields <- append(fields, list(new_field))

Save outputs

# Directory that receives the three Darwin Core Archive components
dwc_output_dir <- paste(path_to_project_root, "output", site_dir_name, dataset_dir_name, sep = "/")

# Create the directory (and any missing parents) first; without this, the
# writes below fail on a fresh checkout where the output folder does not exist.
if (!dir.exists(dwc_output_dir)) {
  dir.create(dwc_output_dir, recursive = TRUE)
}

# Occurrence table: NA is written as an empty cell, no row-name column
write.csv(occurrence, file.path(dwc_output_dir, "occurrence.csv"), na = "", row.names = FALSE)
# Archive descriptor and dataset metadata
write_xml(meta, file = file.path(dwc_output_dir, "meta.xml"))
write_eml(my_eml, file.path(dwc_output_dir, "eml.xml"))

Edit EML

We have to further edit the eml file to conform to GBIF-specific requirements that cannot be included in the original EML construction. This includes changing the schemaLocation and rearranging the GBIF element, since the construction automatically arranges the children nodes to alphabetical order.

#edit the schemaLocation and rearrange gbif node for gbif specific eml file
# Re-read the EML file that was just written so it can be edited as raw XML.
eml_content <- read_xml(paste(dwc_output_dir, "/eml.xml", sep = ""))

#change schemaLocation attributes for GBIF
# NOTE(review): the namespace part of schemaLocation uses the
# "https://eml.ecoinformatics.org/..." form; confirm this is what the GBIF
# profile expects for EML 2.1.1.
root_node <- xml_root(eml_content)
xml_set_attr(root_node, "xsi:schemaLocation", "https://eml.ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.2/eml.xsd")
xml_set_attr(root_node, "xmlns:dc", "http://purl.org/dc/terms/")
# Setting an attribute to NULL removes it
xml_set_attr(root_node, "xmlns:stmml", NULL)
xml_set_attr(root_node, "xml:lang", "eng")


#rearrange children nodes under the GBIF element
# Keep handles to the nodes that have to move before detaching them.
hierarchyLevel <- eml_content %>% xml_find_all(".//hierarchyLevel")
dateStamp <- eml_content %>% xml_find_all(".//dateStamp")
citation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation")
bibcitation <- eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/bibliography/citation")
# Attach the source URL to the bibliography citation as its identifier
xml_set_attr(bibcitation, "identifier", citationdoi)

# Detach the three nodes, then re-insert each one at the front of <gbif>.
# Because every insert goes to position 0, the last one inserted ends up
# first: the resulting order is dateStamp, hierarchyLevel, citation, followed
# by whatever remained under <gbif>.
eml_content %>% xml_find_all(".//hierarchyLevel") %>% xml_remove()
eml_content %>% xml_find_all(".//dateStamp") %>% xml_remove()
eml_content %>% xml_find_all("./additionalMetadata/metadata/gbif/citation") %>% xml_remove()
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(citation, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(hierarchyLevel, .where=0)
eml_content %>% xml_find_all(".//gbif") %>% xml_add_child(dateStamp, .where=0)

# Overwrite the EML file with the edited document
write_xml(eml_content, paste(dwc_output_dir, "/eml.xml", sep = ""))

Zip files to DwC-A

# Bundle everything in the output directory into a Darwin Core Archive
output_zip <- file.path(dwc_output_dir, "DwC-A.zip")

# Discard an archive left over from a previous run
if (file.exists(output_zip)) {
  unlink(output_zip)
}

# The listing is taken before the archive is created, so the archive itself is
# never one of its own inputs. "cherry-pick" mode stores the files without
# their directory structure.
file_paths <- list.files(dwc_output_dir, full.names = TRUE)
zip::zip(zipfile = output_zip, files = file_paths, mode = "cherry-pick")

# Delete the loose files only once the archive exists
if (file.exists(output_zip)) {
  unlink(file_paths)
}