Identifying duplicate datasets in OBIS
This notebook tries to identify duplicate datasets in the OBIS
system. This is done by aggregating datasets by geohash, species, and
year, and calculating cosine
similarities Jaccard similarities between datasets. Skip to
results here.
Dependencies
Calculate similarities
Aggregate datasets by geohash, species, and year. Then calculate Jaccard similarity.
con <- dbConnect(duckdb())
similarity <- dbGetQuery(con, glue("
install h3 from community;
load h3;
WITH cells_per_dataset AS (
SELECT DISTINCT
dataset_id,
concat_ws(
'_',
interpreted.AphiaID,
interpreted.date_year,
h3_latlng_to_cell_string(interpreted.decimalLatitude, interpreted.decimalLongitude, 2)
) as cell
FROM read_parquet('/Volumes/acasis/occurrence/*')
),
dataset_cell_counts AS (
SELECT
dataset_id,
COUNT(*) as total_cells
FROM cells_per_dataset
GROUP BY dataset_id
),
intersections AS (
SELECT
a.dataset_id as x,
b.dataset_id as y,
COUNT(*) as intersection_size
FROM cells_per_dataset a
JOIN cells_per_dataset b ON a.cell = b.cell
WHERE a.dataset_id < b.dataset_id
GROUP BY a.dataset_id, b.dataset_id
)
SELECT
i.x,
i.y,
i.intersection_size,
a.total_cells + b.total_cells - i.intersection_size as union_size,
i.intersection_size::FLOAT / (a.total_cells + b.total_cells - i.intersection_size) as similarity,
1 - (i.intersection_size::FLOAT / (a.total_cells + b.total_cells - i.intersection_size)) as distance
FROM intersections i
JOIN dataset_cell_counts a ON i.x = a.dataset_id
JOIN dataset_cell_counts b ON i.y = b.dataset_id;
"))
Plot the similarities to get an idea of the distribution.
Create a shortlist of suspect dataset pairs.
datasets <- robis::dataset() %>%
tidyr::unnest(statistics) %>%
select(id, url, title, records = Occurrence)
suspect <- similarity %>%
filter(similarity > 0.5) %>%
left_join(datasets, by = c("x" = "id")) %>%
left_join(datasets, by = c("y" = "id"), suffix = c("_x", "_y")) %>%
arrange(desc(similarity), desc(records_x + records_y)) %>%
as_tibble()
suspect
## # A tibble: 678 × 12
## x y intersection_size union_size similarity distance url_x title_x records_x url_y title_y records_y
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <int> <chr> <chr> <int>
## 1 5f1a9890-b71f-4d40-bd4c-f200674dfba8 7776642c-95ee-466c-890a-f… 2365 2365 1 0 http… ANEMOO… 1625592 http… ANEMOO… 1625592
## 2 6d7b384f-090b-4c15-b68d-4c1192c0f8a6 d2caf73d-7e52-4113-b2f6-b… 82676 82676 1 0 http… Marine… 772791 http… Marine… 772791
## 3 1471a49e-37df-4312-95cd-135ac00d25cd e7e9e58c-6c4c-4cc0-9df3-d… 2880 2880 1 0 http… Plankt… 460805 http… Plankt… 460805
## 4 b18829a5-eec2-4978-a88d-1edba34f1377 c87cb4d8-0757-442f-9166-f… 28199 28199 1 0 http… Marine… 415442 http… Marine… 415442
## 5 9fd58901-44bf-4e78-adc6-d040eec324af df6977ea-a704-4c48-bd46-e… 41720 41720 1 0 http… Marine… 353496 http… Marine… 353496
## 6 5a39f604-b31f-49c1-acb6-e9a4f97e236f c7243636-85ff-4c99-8b5e-b… 1978 1978 1 0 http… TPWD H… 84624 http… TPWD H… 179052
## 7 25f252e8-a2a6-4847-bda4-6045650b761f a8d9746d-562b-40f5-b786-6… 913 913 1 0 http… SHARK … 121103 http… SHARK … 121103
## 8 442b676b-65b2-4c90-a8b9-2fad5f27b08d 9378019b-6657-4c21-81cd-1… 2172 2172 1 0 http… Plankt… 108078 http… Plankt… 108078
## 9 67bac074-3445-4fd8-8e3c-4836fa0736dd b88b8e7b-cbf1-4c04-a2c2-a… 58 58 1 0 http… 2013 U… 69945 http… 2013 U… 69945
## 10 5132b63a-4d7a-41e7-87eb-b5a9d6edc05f cfa6809d-50cf-4f29-99d1-9… 11723 11723 1 0 http… Irish … 62604 http… Irish … 62604
## # ℹ 668 more rows
Results
suspect %>%
mutate(
title = glue("<a href=\"https://obis.org/dataset/{x}\" target=\"_blank\">{title_x}</a><br/><br/><a href=\"https://obis.org/dataset/{y}\" target=\"_blank\">{title_y}</a>"),
url = glue("<a href=\"{url_x}\" target=\"_blank\">{url_x}</a><br/><br/><a href=\"{url_y}\" target=\"_blank\">{url_y}</a>"),
records = glue("{records_x}<br/><br/>{records_y}")) %>%
select(similarity, title, url, records) %>%
reactable(columns = list(similarity = colDef(width = 100), title = colDef(html = TRUE), url = colDef(html = TRUE), records = colDef(html = TRUE, width = 100)), pagination = FALSE)