ZoTFIDF: Adding custom tf-idf tags to your Zotero database

Image credit: Source: Zotero

I wanted to add my own custom tags to my Zotero citation management database based on the term frequency—inverse document frequency (tf-idf) formula. While not perfect, this is a nice way to pick out what is distinct and important about individual texts.

“external access to the SQLite database (including direct access via the mozStorage API) should be done only in a read-only manner. Modifying the database while Zotero is running can easily result in a corrupted database, as mozStorage caching breaks the normal file-locking in SQLite that allows for safe concurrent file access. And even if Firefox is shut down before accessing the file, modifying the database directly bypasses the data validation and referential integrity checks performed by Zotero and the Zotero server that are required for Zotero to function properly. Generally, the SQLite database should be viewed as an internal database that has the benefit of being externally readable for people who want to get the data out in other ways.”1

How did I do this? It turns out that Zotero stores all the indexed text from attachments (e.g., .pdf files) in a file named .zotero-ft-cache inside each individual item’s folder. Amazingly, the R package, readtext allows you to import text from plaintext files that are organized in this way using:

textDF <- readtext(paste0("~/Zotero/storage", "/*/.zotero-ft-cache"), 
             docvarsfrom = "filepaths")

Once I did this, I then was able to use the DBI and RSQLite packages to interact directly with the Zotero database, zotero.sqlite. With a few tricks from the quanteda package and a few left-joins, I was able to add new tags to all my existing items with indexed text that represent the 5 features with the greatest tf-idf score! Now I’m slowly adding those tags to justdeserts.org, success!

Code

# to avoid this error on caching later on: Error in lazyLoadDBinsertVariable(vars[i], from, datafile, ascii, compress, :long vectors not supported yet: connections.c:5984
knitr::opts_chunk$set(cache.lazy = FALSE)
library(magrittr)
library(DBI)
library(RSQLite)

NEW_TAGS <- FALSE

con = dbConnect(drv = RSQLite::SQLite(), 
                dbname = "~/Zotero/zotero.sqlite")

alltables = dbListTables(con)

table.tags <- dbGetQuery(con, 'select * from tags')
table.itemAttachments <- dbGetQuery(con, 'select * from itemAttachments')
table.items <- dbGetQuery(con, 'select * from items')
table.itemNotes <- dbGetQuery(con, 'select * from itemNotes')
table.itemTags <- dbGetQuery(con, 'select * from itemTags')
library(quanteda)
library(readtext)

# bring in Zotero fulltext cache plaintext
textDF <- readtext(paste0("~/Zotero/storage", "/*/.zotero-ft-cache"), 
                 docvarsfrom = "filepaths")

# isolate "key" in docvar1
textDF$docvar1 <- gsub(pattern = "^.*storage\\/", replacement = "", x = textDF$docvar1)
textDF$docvar1 <- gsub(pattern = "\\/.*", replacement = "", x = textDF$docvar1)

# bring in itemID
textDF <- textDF %>%
  dplyr::rename(key = docvar1) %>%
  dplyr::left_join(table.items) %>%
  dplyr::filter(!is.na(itemID), !itemID %in% table.itemNotes$itemID)

# make a corpus to tokens to document feature matrix without punctuation, symbols, numbers, or 1-letter features
my_corpus <- corpus(textDF, docid_field = "itemID")
my_tokens <- tokens_ngrams(tokens(my_corpus, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE), n = 1)
my_dfm <- dfm(my_tokens, tolower = FALSE, remove = stopwords())
my_dfm <- dfm(my_dfm, tolower = FALSE, remove = stopwords(language = "es"))
my_dfm <- dfm_select(my_dfm, min_nchar = 2)
# my_dfm <- dfm_select(my_dfm, "\\b[a-zA-Z0-9]{1,3}\\b", selection = "remove", valuetype = "regex")
# my_dfm <- dfm_select(my_dfm, "�", selection = "remove", valuetype = "regex")

# calculate tf-idf weights
tf_idf_weights <- dfm_tfidf(my_dfm, force = TRUE)
# topfeatures(tf_idf_weights, n = 15, groups = docnames(tf_idf_weights))
# Error: will not group a weighted dfm; use force = TRUE to override

# grab the weights from the dfm object
tf_idf_df <- 
  as.data.frame(
    matrix(tf_idf_weights[
      seq_len(length(tf_idf_weights@Dimnames$docs)), 
      seq_len(length(tf_idf_weights@Dimnames$features))], 
      nrow = length(tf_idf_weights@Dimnames$docs)))

# add the features to the tf_idf_df
colnames(tf_idf_df) <- tf_idf_weights@Dimnames$features

# add top 5 tf-idf features in small batches (docs.number documents at a time)
docs.number <- 100
rounds <- nrow(tf_idf_df) %/% docs.number + 1
remainder <- nrow(tf_idf_df) %% docs.number
startRow <- 1
endRow <- docs.number
maxTagID <- max(table.tags$tagID)

for(i in 1:rounds) {
  # rank the tf-idf weights with greatest weight = 1
  topTF_idf_df_Temp <- tf_idf_df[startRow:endRow, ]
  topTF_idf_df_Temp_Rank <- apply(X = topTF_idf_df_Temp, 
                                  MARGIN = 1, 
                                  FUN = function(x) {rank(-1 * x)})
  # add features to Temp_Rank
  topTF_idf_df_Temp_Rank <- tibble::rownames_to_column(as.data.frame(topTF_idf_df_Temp_Rank), "term")
  
  # make empty df_Final_Rank
  topTF_idf_df_Final_Rank <- data.frame(matrix(nrow = nrow(topTF_idf_df_Temp), ncol = 5))
  # fill it with features ranked 1 to 5
  for(r in 1:nrow(topTF_idf_df_Temp)) {
    for(c in 1:5) {
      if(length(topTF_idf_df_Temp_Rank$term[topTF_idf_df_Temp_Rank[ , r + 1] == c]) > 0) {
        topTF_idf_df_Final_Rank[r, c] <- topTF_idf_df_Temp_Rank$term[topTF_idf_df_Temp_Rank[ , r + 1] == c][1]
      }
    }
  }
  
  # add itemIDs
  topTF_idf_df_Final_Rank$itemID <- tf_idf_weights@Dimnames$docs[startRow:endRow]
  
  # convert features to tagIDs and organize into three columns with same names as SQLite itemTags
  topTF_idf_df_Final_Rank <- tidyr::gather(topTF_idf_df_Final_Rank, -itemID, key = "type", value = "name") %>%
    dplyr::left_join(table.tags) %>%
    dplyr::mutate(type = 1)
  
  for(r in 1:nrow(topTF_idf_df_Final_Rank)) {
    if(is.na(topTF_idf_df_Final_Rank$tagID[r])) {
      maxTagID <- maxTagID + 1
      topTF_idf_df_Final_Rank$tagID[r] <- maxTagID
    }
  }
  
  startRow <- startRow + docs.number
  endRow <- endRow + docs.number
  
  if(i == 1) {
    topTF_idf_df_itemTags <- topTF_idf_df_Final_Rank
  } else {
    topTF_idf_df_itemTags <- rbind(topTF_idf_df_itemTags,
                                   topTF_idf_df_Final_Rank)
  }
}
topTF_idf_df_itemTags_Final <- dplyr::mutate(topTF_idf_df_itemTags, itemID = as.integer(itemID)) %>%
  dplyr::left_join(table.itemAttachments) %>%
  dplyr::arrange(parentItemID, name) %>%
  dplyr::mutate(itemIDOriginal = itemID) %>%
  dplyr::mutate(itemID = parentItemID) %>%
  dplyr::distinct(itemID, tagID, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(itemID), !is.na(tagID), !is.na(name)) %>%
  dplyr::anti_join(table.itemTags, by = c("itemID", "tagID")) %>%
  # as we did work in batches above, we may have assigned new tags multiple new tagIDs. reduce to one new tagID
  dplyr::group_by(name) %>% 
  dplyr::mutate(tagID = min(tagID)) %>%
  dplyr::distinct(itemID, tagID, .keep_all = TRUE)

topTF_idf_df_itemTags_Final
## # A tibble: 0 x 13
## # Groups:   name [0]
## # … with 13 variables: itemID <int>, type <dbl>, name <chr>, tagID <dbl>,
## #   parentItemID <int>, linkMode <int>, contentType <chr>, charsetID <int>,
## #   path <chr>, syncState <int>, storageModTime <int64>, storageHash <chr>,
## #   itemIDOriginal <int>
if(NEW_TAGS) {
  dbReadTable(con, "itemTags")
  
  # add new records to itemTags with custom tf-idf tagIDs
  dbExecute(
    con,
    "INSERT INTO itemTags (itemID, tagID, type) VALUES (?, ?, ?)",
    param = list(topTF_idf_df_itemTags_Final$parentItemID, 
                 topTF_idf_df_itemTags_Final$tagID, 
                 topTF_idf_df_itemTags_Final$type)
  )
  
  dbReadTable(con, "itemTags")
  
  dbReadTable(con, "tags")
  
  topTF_idf_df_itemTags_Final_NewTags <- topTF_idf_df_itemTags_Final %>%
    dplyr::select(tagID, name) %>%
    dplyr::anti_join(table.tags, by = c("tagID","name")) %>%
    dplyr::arrange(tagID) %>%
    dplyr::distinct(tagID, name)
  
  # add new records to tags with custom tf-idf tagIDs not previously in database
  dbExecute(
    con,
    "INSERT INTO tags (tagID, name) VALUES (?, ?)",
    param = list(topTF_idf_df_itemTags_Final_NewTags$tagID, 
                 topTF_idf_df_itemTags_Final_NewTags$name)
  )
  
  dbReadTable(con, "tags")
}

table.itemTags.NEW <- dbGetQuery(con, 'select * from itemTags')
table.itemData <- dbGetQuery(con, 'select * from itemData')
table.itemDataValues <- dbGetQuery(con, 'select * from itemDataValues')
table.collectionItems <- dbGetQuery(con, 'select * from collectionItems')

dbDisconnect(con)
topTF_idf_df_itemTags_Final %>% 
  dplyr::left_join(table.collectionItems) %>%
  dplyr::filter(collectionID %in% c(16, 16441)) %>%
  dplyr::left_join(table.itemData) %>%
  dplyr::left_join(table.itemDataValues) %>%
  dplyr::select(itemID, name, collectionID, fieldID, value) %>%
  dplyr::filter(fieldID %in% c("12", "28", "85", "86", "91", "104", "107", "110", "114", "115", "116", "119")) %>%
  dplyr::filter(fieldID == 110)
## # A tibble: 0 x 5
## # Groups:   name [0]
## # … with 5 variables: itemID <int>, name <chr>, collectionID <int>,
## #   fieldID <int>, value <chr>

Edit this page

Avatar
Joseph de la Torre Dwyer
Researcher

My research interests include distributive justice; the principles of responsibility, desert, and control; and reproducible research with R.

Related