fork(1) download
  # Extract author-year citations from free text.
  #
  # Handles parenthetical citations such as "(Smith 2010)", semicolon-separated
  # lists such as "(Smith 2010; Jones 2001)", and narrative citations such as
  # "Cooper (2015)", which is reported as "Cooper 2015". Parentheticals that do
  # not end in a four-digit number, e.g. "(abbr)", are ignored.

  library(stringr)

  # Sample input exercising every supported citation shape.
  x <- "This is a test. I only want to select the (cites) in parenthesis. I do not want it to return words in parenthesis that do not have years attached, such as abbreviations (abbr). For example, citing (Smith 2010) is something I would want to be returned. I would also want multiple citations returned separately such as (Smith 2010; Jones 2001; Brown 2020). I would also want Cooper (2015) returned as Cooper 2015, and not just 2015."

  # Group 1 (optional): run of capitalised words right before the parenthesis.
  # Group 2: parenthesised text without nested parentheses, ending in 4 digits.
  rx <- "(?:\\b(\\p{Lu}\\w*(?:\\s+\\p{Lu}\\w*)*))?\\s*\\(([^()]*\\d{4})\\)"

  # One matrix per element of `x`: column 1 is the full match, column 2 is
  # group 1 (leading name, NA when absent), column 3 is group 2.
  res <- str_match_all(x, rx)

  # Per input string, a character vector with one entry per matched
  # parenthetical. The leading name is prepended only when the parenthesis
  # holds nothing but digits ("Cooper (2015)"); otherwise the capitalised word
  # in front is ordinary prose and the parenthesised text is used as is.
  result <- lapply(res, function(z) {
    author <- trimws(z[, 2])
    inner <- trimws(z[, 3])
    bare_year <- !is.na(author) & str_detect(inner, "^\\d+$")
    inner[bare_year] <- paste(author[bare_year], inner[bare_year])
    inner
  })

  # Split semicolon-separated lists into individual citations.
  # as.character() keeps the result a character vector when nothing matched.
  pieces <- trimws(as.character(
    unlist(lapply(result, strsplit, split = "\\s*;\\s*"))
  ))

  # Keep only pieces that actually carry a four-digit year, so that
  # "(see appendix; Smith 2010)" does not yield "see appendix".
  citations <- pieces[!is.na(pieces) & str_detect(pieces, "\\d{4}")]
  citations
Success #stdin #stdout 0.28s 42372KB
stdin
Standard input is empty
stdout
[1] "Smith 2010"  "Smith 2010"  "Jones 2001"  "Brown 2020"  "Cooper 2015"