Identifying duplicate datasets in OBIS

This notebook tries to identify duplicate datasets in the OBIS system. This is done by aggregating datasets by geohash, species, and year, and calculating cosine similarities Jaccard similarities between datasets. Skip to results here.

grids

Dependencies

library(tidyr)
library(glue)
library(Matrix)
library(dplyr)
library(arrow)
library(dplyr)
library(data.table)
library(duckdb)
library(DBI)
library(reactable)

Calculate similarities

Aggregate datasets by geohash, species, and year. Then calculate Jaccard similarity.

con <- dbConnect(duckdb())

similarity <- dbGetQuery(con, glue("
  install h3 from community;
  load h3;

  WITH cells_per_dataset AS (
    SELECT DISTINCT
      dataset_id,
      concat_ws(
        '_',
        interpreted.AphiaID,
        interpreted.date_year,
        h3_latlng_to_cell_string(interpreted.decimalLatitude, interpreted.decimalLongitude, 2)
      ) as cell
    FROM read_parquet('/Volumes/acasis/occurrence/*')
  ),

  dataset_cell_counts AS (
    SELECT
      dataset_id,
      COUNT(*) as total_cells
    FROM cells_per_dataset
    GROUP BY dataset_id
  ),

  intersections AS (
    SELECT
      a.dataset_id as x,
      b.dataset_id as y,
      COUNT(*) as intersection_size
    FROM cells_per_dataset a
    JOIN cells_per_dataset b ON a.cell = b.cell
    WHERE a.dataset_id < b.dataset_id
    GROUP BY a.dataset_id, b.dataset_id
  )

  SELECT
    i.x,
    i.y,
    i.intersection_size,
    a.total_cells + b.total_cells - i.intersection_size as union_size,
    i.intersection_size::FLOAT / (a.total_cells + b.total_cells - i.intersection_size) as similarity,
    1 - (i.intersection_size::FLOAT / (a.total_cells + b.total_cells - i.intersection_size)) as distance
  FROM intersections i
  JOIN dataset_cell_counts a ON i.x = a.dataset_id
  JOIN dataset_cell_counts b ON i.y = b.dataset_id;
"))

Plot the similarities to get an idea of the distribution.

plot(similarity$similarity)

Create a shortlist of suspect dataset pairs.

datasets <- robis::dataset() %>%
  tidyr::unnest(statistics) %>%
  select(id, url, title, records = Occurrence)

suspect <- similarity %>%
  filter(similarity > 0.5) %>%
  left_join(datasets, by = c("x" = "id")) %>%
  left_join(datasets, by = c("y" = "id"), suffix = c("_x", "_y")) %>%
  arrange(desc(similarity), desc(records_x + records_y)) %>%
  as_tibble()

suspect 
## # A tibble: 678 × 12
##    x                                    y                          intersection_size union_size similarity distance url_x title_x records_x url_y title_y records_y
##    <chr>                                <chr>                                  <dbl>      <dbl>      <dbl>    <dbl> <chr> <chr>       <int> <chr> <chr>       <int>
##  1 5f1a9890-b71f-4d40-bd4c-f200674dfba8 7776642c-95ee-466c-890a-f…              2365       2365          1        0 http… ANEMOO…   1625592 http… ANEMOO…   1625592
##  2 6d7b384f-090b-4c15-b68d-4c1192c0f8a6 d2caf73d-7e52-4113-b2f6-b…             82676      82676          1        0 http… Marine…    772791 http… Marine…    772791
##  3 1471a49e-37df-4312-95cd-135ac00d25cd e7e9e58c-6c4c-4cc0-9df3-d…              2880       2880          1        0 http… Plankt…    460805 http… Plankt…    460805
##  4 b18829a5-eec2-4978-a88d-1edba34f1377 c87cb4d8-0757-442f-9166-f…             28199      28199          1        0 http… Marine…    415442 http… Marine…    415442
##  5 9fd58901-44bf-4e78-adc6-d040eec324af df6977ea-a704-4c48-bd46-e…             41720      41720          1        0 http… Marine…    353496 http… Marine…    353496
##  6 5a39f604-b31f-49c1-acb6-e9a4f97e236f c7243636-85ff-4c99-8b5e-b…              1978       1978          1        0 http… TPWD H…     84624 http… TPWD H…    179052
##  7 25f252e8-a2a6-4847-bda4-6045650b761f a8d9746d-562b-40f5-b786-6…               913        913          1        0 http… SHARK …    121103 http… SHARK …    121103
##  8 442b676b-65b2-4c90-a8b9-2fad5f27b08d 9378019b-6657-4c21-81cd-1…              2172       2172          1        0 http… Plankt…    108078 http… Plankt…    108078
##  9 67bac074-3445-4fd8-8e3c-4836fa0736dd b88b8e7b-cbf1-4c04-a2c2-a…                58         58          1        0 http… 2013 U…     69945 http… 2013 U…     69945
## 10 5132b63a-4d7a-41e7-87eb-b5a9d6edc05f cfa6809d-50cf-4f29-99d1-9…             11723      11723          1        0 http… Irish …     62604 http… Irish …     62604
## # ℹ 668 more rows

Results

suspect %>%
  mutate(
    title = glue("<a href=\"https://obis.org/dataset/{x}\" target=\"_blank\">{title_x}</a><br/><br/><a href=\"https://obis.org/dataset/{y}\" target=\"_blank\">{title_y}</a>"),
    url = glue("<a href=\"{url_x}\" target=\"_blank\">{url_x}</a><br/><br/><a href=\"{url_y}\" target=\"_blank\">{url_y}</a>"),
    records = glue("{records_x}<br/><br/>{records_y}")) %>%
  select(similarity, title, url, records) %>%
  reactable(columns = list(similarity = colDef(width = 100), title = colDef(html = TRUE), url = colDef(html = TRUE), records = colDef(html = TRUE, width = 100)), pagination = FALSE)