# Extract author-year citations from free text.
#
# Parenthesised text counts as a citation only when it ends in a four-digit
# year, so "(abbr)" is ignored and "(Smith 2010)" is kept. A capitalised name
# directly in front of a year-only parenthesis is joined to the year
# ("Cooper (2015)" -> "Cooper 2015"). Several citations inside one pair of
# parentheses, separated by ";", are returned as separate elements.
library(stringr)

# Sample text to search.
x <- "This is a test. I only want to select the (cites) in parenthesis. I do not want it to return words in parenthesis that do not have years attached, such as abbreviations (abbr). For example, citing (Smith 2010) is something I would want to be returned. I would also want multiple citations returned separately such as (Smith 2010; Jones 2001; Brown 2020). I would also want Cooper (2015) returned as Cooper 2015, and not just 2015. I would also want John Granger et al. (2015)."

# Capture group 1 (optional): one or more capitalised words, optionally
#   followed by "et al.", sitting immediately before the parenthesis.
# Capture group 2: the text inside the parentheses, which must end in four
#   digits and may not contain nested parentheses.
# The escapes must be written without a space after the backslash: "\\ d" is a
# literal space followed by the letter d, not the digit class.
rx <- paste0(
  "(?:\\b(\\p{Lu}\\w*(?:\\s+\\p{Lu}\\w*)*(?:\\s+et\\s+al\\.)?)?)",
  "\\s*\\(([^()]*\\d{4})\\)"
)

# One matrix per input string: column 1 is the full match, columns 2 and 3
# are the two capture groups.
res <- str_match_all(x, rx)

result <- lapply(res, function(z) {
  author <- z[, 2]
  inside <- z[, 3]
  # Prefix the author only when one was captured AND the parenthesis holds
  # nothing but digits; otherwise the parenthesis already names its authors.
  year_only <- !is.na(author) & str_detect(inside, "^\\d+$")
  ifelse(year_only, paste(trimws(author), trimws(inside)), inside)
})

# Split multi-citation parentheses on ";" (with optional surrounding
# whitespace) and flatten everything into one character vector.
citations <- unlist(lapply(result, function(z) {
  strsplit(paste(z, collapse = ";"), "\\s*;\\s*")
}))
citations
eCA8LSAiVGhpcyBpcyBhIHRlc3QuIEkgb25seSB3YW50IHRvIHNlbGVjdCB0aGUgKGNpdGVzKSBpbiBwYXJlbnRoZXNpcy4gSSBkbyBub3Qgd2FudCBpdCB0byByZXR1cm4gd29yZHMgaW4gcGFyZW50aGVzaXMgdGhhdCBkbyBub3QgaGF2ZSB5ZWFycyBhdHRhY2hlZCwgc3VjaCBhcyBhYmJyZXZpYXRpb25zIChhYmJyKS4gRm9yIGV4YW1wbGUsIGNpdGluZyAoU21pdGggMjAxMCkgaXMgc29tZXRoaW5nIEkgd291bGQgd2FudCB0byBiZSByZXR1cm5lZC4gSSB3b3VsZCBhbHNvIHdhbnQgbXVsdGlwbGUgY2l0YXRpb25zIHJldHVybmVkIHNlcGFyYXRlbHkgc3VjaCBhcyAoU21pdGggMjAxMDsgSm9uZXMgMjAwMTsgQnJvd24gMjAyMCkuIEkgd291bGQgYWxzbyB3YW50IENvb3BlciAoMjAxNSkgcmV0dXJuZWQgYXMgQ29vcGVyIDIwMTUsIGFuZCBub3QganVzdCAyMDE1LiAgSSB3b3VsZCBhbHNvIHdhbnQgSm9obiBHcmFuZ2VyIGV0IGFsLiAoMjAxNSkuIgpyeCA8LSAiKD86XFxiKFxccHtMdX1cXHcqKD86XFxzK1xccHtMdX1cXHcqKSooPzpcXHMrZXRcXHMrYWxcXC4pPyk/KVxccypcXCgoW14oKV0qXFxkezR9KVxcKSIKbGlicmFyeShzdHJpbmdyKQpyZXMgPC0gc3RyX21hdGNoX2FsbCh4LCByeCkKcmVzdWx0IDwtIGxhcHBseShyZXMsIGZ1bmN0aW9uKHopIHtpZmVsc2UoIWlzLm5hKHpbLDJdKSAmIHN0cl9kZXRlY3QoelssM10sIl5cXGQrJCIpLCBwYXN0ZSh0cmltd3MoelssMl0pLCAgdHJpbXdzKHpbLDNdKSksIHpbLDNdKX0pCQp1bmxpc3Qoc2FwcGx5KHJlc3VsdCwgZnVuY3Rpb24oeikgc3Ryc3BsaXQocGFzdGUoeiwgY29sbGFwc2U9IjsiKSwgIlxccyo7XFxzKiIpKSkK