GBIF marine datasets
This notebook uses the GBIF API to generate statistics on marine data in GBIF. Note that this is a preliminary analysis which only uses species. The process looks like this:
- Get all marine species names from WoRMS
- Using the species names from WoRMS, get all profiles of marine species from GBIF
- Using the nubKeys from the species profiles, get record counts by dataset from GBIF
Get all marine species from WoRMS
This uses a full export of the WoRMS database. This file is not included in the repository, contact the WoRMS team to get access to such an export.
library(dplyr)
library(data.table)
# Read the WoRMS export files (tab separated, empty strings as NA, quoting
# disabled because the dump is not quote-escaped).
taxon <- as_tibble(
  fread("worms_export/taxon.txt", sep = "\t", na.strings = "", quote = "")
)
speciesprofile <- as_tibble(
  fread("worms_export/speciesprofile.txt", sep = "\t", na.strings = "", quote = "")
)

# Attach the species profile flags to each taxon and keep only accepted
# marine taxa at species rank.
taxon_marine <- taxon %>%
  left_join(speciesprofile, by = "taxonID", suffix = c("", ".y")) %>%
  filter(taxonomicStatus == "accepted", isMarine == 1, taxonRank == "Species")
species <- unique(taxon_marine$scientificName)

Fetch species profiles from GBIF
Here we use the GBIF API to search for species by name. The results are stored as CSV files in the profiles folder. Once all species have been processed, read the CSV files.
library(stringr)
library(jsonlite)
library(progress)
library(purrr)
# Query the GBIF species API for every marine species name and cache each
# response as one CSV per species in the profiles folder, so interrupted
# runs can resume. The whole step is skipped when a consolidated
# profiles.rds already exists.
if (!file.exists("profiles.rds")) {
  pb <- progress_bar$new(total = length(species), format = "[:bar] :current/:total (:percent) ETA: :eta")
  for (sp in species) {
    # Cache file name derived from the species name, e.g. "Gadus morhua"
    # -> "gadus_morhua.csv". str_replace_all handles names with more than
    # one whitespace gap; the variable is renamed from `key` to avoid
    # shadowing the `key` column selected from the API response below.
    file_key <- str_replace_all(tolower(sp), "\\s", "_")
    filename <- paste0("profiles/", file_key, ".csv")
    if (!file.exists(filename)) {
      url <- URLencode(paste0("https://api.gbif.org/v1/species?name=", sp))
      res <- fromJSON(url)$results
      # Both operands are scalar, so use the short-circuiting && (the
      # vectorized & in a scalar if is an error on R >= 4.3 if either
      # side ever has length > 1).
      if (length(res) > 0 && "nubKey" %in% names(res)) {
        # any_of() keeps whichever of these columns the API returned
        # instead of erroring when one is absent from the response.
        species_names <- res %>%
          select(any_of(c("key", "nubKey", "nameKey", "taxonID")))
        write.csv(species_names, filename, row.names = FALSE, na = "")
      } else {
        # No usable match: write an empty placeholder so this species is
        # not queried again on the next run.
        write.csv(data.frame(nubKey = character(0)), filename, row.names = FALSE, na = "")
      }
    }
    pb$tick()
  }
  # list.files() takes a regular expression, not a glob: "\\.csv$" anchors
  # the extension, whereas the original "*.csv" is not a valid regex.
  files <- list.files(path = "profiles", pattern = "\\.csv$", full.names = TRUE)
  profiles <- map(files, ~read.csv(.)) %>%
    bind_rows()
} else {
  profiles <- readRDS("profiles.rds")
}

Get occurrence counts by dataset for each species
Here we use another API endpoint to get the number of records per dataset for each species. Results are stored in the statistics folder as CSV files.
# For every GBIF backbone key (nubKey) found in the profiles, fetch the
# per-dataset occurrence counts and cache each response as one CSV per key
# in the statistics folder, so interrupted runs can resume. The whole step
# is skipped when a consolidated statistics.rds already exists.
if (!file.exists("statistics.rds")) {
  nubkeys <- na.omit(unique(profiles$nubKey))
  pb <- progress_bar$new(total = length(nubkeys), format = "[:bar] :current/:total (:percent) ETA: :eta")
  for (nubkey in nubkeys) {
    filename <- paste0("statistics/", nubkey, ".csv")
    if (!file.exists(filename)) {
      url <- URLencode(paste0("https://api.gbif.org/v1/occurrence/counts/datasets?nubKey=", nubkey))
      # Response parses to a named list: dataset UUID -> record count.
      res <- fromJSON(url)
      if (length(res) > 0) {
        df <- data.frame(dataset = names(res), records = unlist(res))
        write.csv(df, filename, row.names = FALSE, na = "")
      } else {
        # Empty placeholder so this key is not queried again next run.
        write.csv(data.frame(dataset = character(0), records = integer(0)), filename, row.names = FALSE, na = "")
      }
    }
    pb$tick()
  }
  # list.files() takes a regular expression, not a glob: "\\.csv$" anchors
  # the extension, whereas the original "*.csv" is not a valid regex.
  files <- list.files(path = "statistics", pattern = "\\.csv$", full.names = TRUE)
  statistics <- map(files, ~read.csv(., colClasses = c("character", "integer"))) %>%
    bind_rows()
} else {
  statistics <- readRDS("statistics.rds")
}

Calculate statistics
# Total records per dataset, largest first. (`stats` is kept as the name
# because the paged table below displays it.)
stats <- statistics %>%
  group_by(dataset) %>%
  summarize(records = sum(records)) %>%
  ungroup() %>%
  arrange(desc(records))

# Comma-formatted totals for the summary sentence below.
n_datasets <- format(nrow(stats), big.mark = ",")
n_records <- format(sum(stats$records), big.mark = ",")
stats %>%
  rmarkdown::paged_table()

In total we have found 15,364 datasets containing marine species, for a total of 223,870,046 marine species records.