diff --git a/data-raw/itis.R b/data-raw/itis.R index a08bb2e..6e4e50c 100644 --- a/data-raw/itis.R +++ b/data-raw/itis.R @@ -1,7 +1,7 @@ ## apt-get -y install mariadb-client postgresql-client -library(taxizedb) +#library(taxizedb) library(tidyverse) - +library(piggyback) #itis_store <- db_download_itis() ## Need to fix locale issue #db_load_itis(itis_store, user = "postgres", pwd = "password", host = "postgres") @@ -96,25 +96,9 @@ itis_long <- itis %>% path_rank, path_id, path_rank_id, name_usage, update_date) %>% mutate(update_date = as_date(update_date)) - -## encode language with common name(?) - -## Some langauage names have the same language code. -## ISOcodes puts duplicates in same column, we need a tidy look-up table -iso <- ISOcodes::ISO_639_2 %>% - select(language = Name, code = Alpha_2) %>% - na.omit() %>% - separate(language, c("name", "name2", "name3", "name4", "name5"), - sep = ";", extra="warn", fill = "right") %>% - gather(key, language, -code) %>% - select(-key) %>% - na.omit() - -itis_for_rdf <- - itis_long %>% - left_join(iso) %>% - unite("common_name", common_name, code, sep = "@") - +system.time({ + write_tsv(itis_long, bzfile("data/itis_long.tsv.bz2", compression=9)) +}) #itis_long <- read_tsv("data/itis_long.tsv.bz2") @@ -127,9 +111,7 @@ itis_hierarchy <- spread(path_rank, path) -system.time({ - write_tsv(itis_long, bzfile("data/itis_long.tsv.bz2", compression=9)) -}) + ## write at compression 9 for best compression system.time({ write_tsv(itis_hierarchy, bzfile("data/itis_hierarchy.tsv.bz2", compression=9)) diff --git a/notebook/taxa-rdf.R b/notebook/taxa-rdf.R index 77e0f3c..3a5f26b 100644 --- a/notebook/taxa-rdf.R +++ b/notebook/taxa-rdf.R @@ -1,11 +1,41 @@ remotes::install_github("cboettig/virtuoso") remotes::install_github("cboettig/rdftools") + library(virtuoso) library(readr) library(rdftools) library(dplyr) library(taxald) +library(ISOcodes) + + +itis_long <- read_tsv("data/itis_long.tsv.bz2") + +## Some langauage names have the same language code. +## ISOcodes puts duplicates in same column, we need a tidy look-up table +iso <- ISOcodes::ISO_639_2 %>% + select(language = Name, code = Alpha_2) %>% + na.omit() %>% + separate(language, c("name", "name2", "name3", "name4", "name5"), + sep = ";", extra="warn", fill = "right") %>% + gather(key, language, -code) %>% + select(-key) %>% + na.omit() + +itis_for_rdf <- + itis_long %>% + left_join(iso) %>% + unite("common_name", common_name, code, sep = "@") + +species <- itis_long %>% select(id, name, rank, common_name, update_date) %>% distinct() +classif <- itis_long %>% select(path_id, path, path_rank, path_rank_id, species_id = id) %>% distinct() +#readr::write_tsv(species, "species.tsv") +#readr::write_tsv(classif, "classif.tsv") +rdftools::write_nquads(species, "itis_species.nq.gz", prefix = "taxald:", key = "id") +rdftools::write_nquads(classif, "itis_classif.nq.gz", prefix = "taxald:", key = "path_id") + + taxa_tbl("itis", "taxonid") %>% collect() %>% @@ -42,6 +72,8 @@ taxa_tbl("wd", "taxonid") %>% collect() %>% key_column = "id", prefix = "taxald:") #virtuoso:::vos_delete_db() + +library(virtuoso) vos_start() con <- vos_connect() files <- fs::dir_ls(glob="*.nq.gz") @@ -65,18 +97,69 @@ vos_query(con, ") vos_query(con, -'SELECT ?id ?rank +'SELECT ?id ?rank ?common WHERE { ?id "Homo sapiens" . - ?id ?rank + ?id ?rank . + OPTIONAL { ?id ?common . } +} +') + +vos_query(con, + 'SELECT ?id ?species ?rank + WHERE { + ?id ?species . + ?id ?rank . + ?id "Human" . + } + ') + +vos_query(con, 'SELECT ?id ?species ?rank + WHERE { + ?id ?species . + ?id ?rank . + ?id ?common_name . + ?id "Human" . + FILTER langMatches(lang(?common_name), "en") +} +') + +vos_query(con, 'SELECT ?id ?species ?common_name ?rank + WHERE { + ?id ?species . + ?id ?rank . + ?id ?common_name . + FILTER( ?common_name LIKE "%Cod%" ). +} +') + + +vos_query(con, + 'SELECT ?path ?rank + WHERE { + ?path_id ?path . + ?path_id ?rank + } LIMIT 20 ') vos_query(con, - 'SELECT ?id ?name + 'SELECT ?id ?name ?common ?rank ?update_date WHERE { ?id ?name . - ?id "kingdom" + ?id ?common . + ?id ?rank . + ?id ?update_date + } LIMIT 20 ') + +vos_query(con, + 'SELECT DISTINCT ?p +FROM + WHERE { + ?s ?p ?o + } +LIMIT 20 +')