Identifying duplicate datasets in OBIS
This notebook tries to identify duplicate datasets in the OBIS system by aggregating occurrence records by geohash, species, and year, and calculating cosine similarities between the resulting dataset vectors. The shortlist of suspect dataset pairs is in the Results section at the end.
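For reference, cosine similarity compares how records are distributed over cells, not how many records a dataset has, so a dataset and a proportionally subsampled copy of it still score 1. A toy illustration (not part of the pipeline below):

x <- c(10, 0, 5)
y <- c(20, 0, 10)
sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))

## [1] 1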
Dependencies
library(tidyr)
library(glue)
library(Matrix)
library(dplyr)
library(arrow)
library(geohashTools)
library(data.table)
library(reactable)
Fetch occurrences from S3
This fetches just the necessary columns from the OBIS export on S3.
space <- S3FileSystem$create(
  anonymous = TRUE,
  scheme = "https",
  endpoint_override = "ams3.digitaloceanspaces.com"
)

occ <- open_dataset(space$path("obis-datasets/exports/obis_20220208.parquet")) %>%
  select(dataset_id, decimalLongitude, decimalLatitude, AphiaID, year = date_year) %>%
  as_tibble()
Aggregate datasets by geohash, species, and year
stats <- occ %>%
  filter(!is.na(year) & decimalLatitude < 90 & decimalLongitude < 180) %>%
  mutate(geohash = gh_encode(decimalLatitude, decimalLongitude, 2)) %>%
  mutate(cell = factor(paste(geohash, AphiaID, year, sep = "_"))) %>%
  group_by(dataset_id, cell) %>%
  summarize(records = n())
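Precision 2 geohash cells measure roughly 1,250 by 625 km, so small positional differences between near-duplicate datasets still map to the same cell. A quick illustration (coordinates chosen for the example):

gh_encode(48.86, 2.35, 2)

## [1] "u0"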
Generate vectors and calculate similarity
First generate a vector for each dataset and store it as a sparseVector.
n_cells <- length(levels(stats$cell))
dataset_ids <- unique(stats$dataset_id)

vectors <- list()

for (id in dataset_ids) {
  message(id)
  vector <- rep(0, n_cells)
  dataset <- stats %>%
    filter(dataset_id == id)
  for (i in 1:nrow(dataset)) {
    # the factor level index of the cell doubles as the position in the vector
    vector[as.numeric(dataset$cell[i])] <- dataset$records[i]
  }
  vectors[[id]] <- as(vector, "sparseVector")
}
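As an aside (not in the original notebook), the same dataset by cell structure can be built without the loop in a single Matrix::sparseMatrix() call; this sketch assumes rows should follow the order of dataset_ids:

m <- sparseMatrix(
  i = as.integer(factor(stats$dataset_id, levels = dataset_ids)),  # one row per dataset
  j = as.numeric(stats$cell),                                      # one column per cell
  x = stats$records,
  dims = c(length(dataset_ids), n_cells),
  dimnames = list(dataset_ids, NULL)
)

Each row of m then corresponds to one of the sparseVectors built above.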
Calculate similarities using parallelization and write the results to a text file.
write("x y similarity", file = "similarity.txt", append = FALSE)
::mclapply(1:length(dataset_ids), function(i) {
parallel<- dataset_ids[i]
dataset_x <- as.vector(vectors[[dataset_x]])
x for (j in (i + 1):length(dataset_ids)) {
<- dataset_ids[j]
dataset_y <- as.vector(vectors[[dataset_y]])
y <- coop::cosine(x, y)
similarity <- paste(dataset_x, dataset_y, format(similarity, scientific = FALSE))
line write(line, file = "similarity.txt", append = TRUE)
}mc.cores = 6) },
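Note that coop::cosine() can also take a matrix and return all pairwise similarities between its columns in one call. With thousands of datasets and potentially millions of cells, the densified matrix would likely not fit in memory, which is presumably why the similarities are streamed pairwise to disk here. The one-call variant, as a sketch assuming the sparse matrix m from the aside above and enough RAM:

sim <- coop::cosine(t(as.matrix(m)))  # dataset x dataset similarity matrix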
Plot the similarities to get an idea of the distribution.
<- fread("similarity.txt", sep = " ", header = TRUE)
similarity plot(similarity$similarity)
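To see where the tail of the distribution starts before picking a cutoff (an extra check, not in the original notebook), quantiles are more informative than the index plot:

quantile(similarity$similarity, c(0.5, 0.9, 0.99, 0.999), na.rm = TRUE)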
Create a shortlist of suspect dataset pairs.
datasets <- robis::dataset() %>%
  tidyr::unnest(statistics) %>%
  select(id, url, title, records = Occurrence)

suspect <- similarity %>%
  filter(similarity > 0.85) %>%
  left_join(datasets, by = c("x" = "id")) %>%
  left_join(datasets, by = c("y" = "id"), suffix = c("_x", "_y")) %>%
  arrange(desc(similarity), desc(records_x + records_y)) %>%
  as_tibble()

suspect
## # A tibble: 426 × 9
## x y similarity url_x title_x records_x url_y title_y records_y
## <chr> <chr> <dbl> <chr> <chr> <int> <chr> <chr> <int>
## 1 3851d… fdfb2… 1 http:/… Movemen… 156333 http:… Movemen… 4
## 2 01f35… bad2f… 1 http:/… Movemen… 143568 http:… Movemen… 2
## 3 3e0cb… d9a55… 1 http:/… Phytopl… 25736 http:… Phytopl… 25736
## 4 54bf1… a4d4e… 1 https:… Whale c… 7122 http:… Whale c… 7122
## 5 7d610… f98b4… 1 https:… Snow Pe… 7139 http:… Snow Pe… 6697
## 6 2c261… e3ab9… 1 http:/… SPF Mar… 4609 http:… SPF Mar… 9218
## 7 c2d77… e3009… 1 http:/… Macro- … 6402 http:… Macro- … 6402
## 8 6c853… f6638… 1 https:… Macquar… 5771 http:… Macquar… 5771
## 9 33dc2… 4b273… 1 http:/… Snow Pe… 5588 https… Snow Pe… 5588
## 10 08fb2… 40e33… 1 http:/… Nest ce… 5056 https… Nest ce… 5056
## # … with 416 more rows
Results
suspect %>%
  mutate(
    title = glue("<a href=\"https://obis.org/dataset/{x}\" target=\"_blank\">{title_x}</a><br/><br/><a href=\"https://obis.org/dataset/{y}\" target=\"_blank\">{title_y}</a>"),
    url = glue("<a href=\"{url_x}\" target=\"_blank\">{url_x}</a><br/><br/><a href=\"{url_y}\" target=\"_blank\">{url_y}</a>"),
    records = glue("{records_x}<br/><br/>{records_y}")
  ) %>%
  select(similarity, title, url, records) %>%
  reactable(
    columns = list(
      similarity = colDef(width = 100),
      title = colDef(html = TRUE),
      url = colDef(html = TRUE),
      records = colDef(html = TRUE, width = 100)
    ),
    pagination = FALSE
  )