library(stringr)

# Sample text: contains parenthesised citations with years, parenthesised
# words without years, a multi-citation group separated by ";", and an
# author name followed by a parenthesised year.
x <- "This is a test. I only want to select the (cites) in parenthesis. I do not want it to return words in parenthesis that do not have years attached, such as abbreviations (abbr). For example, citing (Smith 2010) is something I would want to be returned. I would also want multiple citations returned separately such as (Smith 2010; Jones 2001; Brown 2020). I would also want Cooper (2015) returned as Cooper 2015, and not just 2015."

# Citation pattern (ICU regex, as used by stringr):
#   group 1 (optional): one or more capitalised words directly before the
#                       opening parenthesis, e.g. the "Cooper" in
#                       "Cooper (2015)"
#   group 2:            parenthesis content that has no nested parentheses
#                       and ends in four digits
# The escapes must not contain a space after the backslash: "\\ (" means
# "literal space, then open a group", not "literal parenthesis".
rx <- "(?:\\b(\\p{Lu}\\w*(?:\\s+\\p{Lu}\\w*)*))?\\s*\\(([^()]*\\d{4})\\)"

# One matrix per input string: column 1 is the full match, columns 2 and 3
# are capture groups 1 and 2.
res <- str_match_all(x, rx)

# When the parentheses hold only digits and a capitalised name precedes
# them, glue the name to the year ("Cooper 2015"); otherwise keep the
# parenthesis content unchanged.
result <- lapply(res, function(m) {
  author <- trimws(m[, 2])
  cited <- m[, 3]
  bare_year <- which(!is.na(author) & str_detect(cited, "^\\d+$"))
  cited[bare_year] <- paste(author[bare_year], trimws(cited[bare_year]))
  cited
})

# Split ";"-separated groups into individual citations. as.character()
# keeps the result a character vector even when nothing matched.
pieces <- strsplit(as.character(unlist(result)), "\\s*;\\s*")
citations <- trimws(as.character(unlist(pieces)))
citations
eCA8LSAiVGhpcyBpcyBhIHRlc3QuIEkgb25seSB3YW50IHRvIHNlbGVjdCB0aGUgKGNpdGVzKSBpbiBwYXJlbnRoZXNpcy4gSSBkbyBub3Qgd2FudCBpdCB0byByZXR1cm4gd29yZHMgaW4gcGFyZW50aGVzaXMgdGhhdCBkbyBub3QgaGF2ZSB5ZWFycyBhdHRhY2hlZCwgc3VjaCBhcyBhYmJyZXZpYXRpb25zIChhYmJyKS4gRm9yIGV4YW1wbGUsIGNpdGluZyAoU21pdGggMjAxMCkgaXMgc29tZXRoaW5nIEkgd291bGQgd2FudCB0byBiZSByZXR1cm5lZC4gSSB3b3VsZCBhbHNvIHdhbnQgbXVsdGlwbGUgY2l0YXRpb25zIHJldHVybmVkIHNlcGFyYXRlbHkgc3VjaCBhcyAoU21pdGggMjAxMDsgSm9uZXMgMjAwMTsgQnJvd24gMjAyMCkuIEkgd291bGQgYWxzbyB3YW50IENvb3BlciAoMjAxNSkgcmV0dXJuZWQgYXMgQ29vcGVyIDIwMTUsIGFuZCBub3QganVzdCAyMDE1LiIKcnggPC0gIig/OlxcYihcXHB7THV9XFx3Kig/OlxccytcXHB7THV9XFx3KikqKSk/XFxzKlxcKChbXigpXSpcXGR7NH0pXFwpIgpsaWJyYXJ5KHN0cmluZ3IpCnJlcyA8LSBzdHJfbWF0Y2hfYWxsKHgsIHJ4KQpyZXN1bHQgPC0gbGFwcGx5KHJlcywgZnVuY3Rpb24oeikge2lmZWxzZSghaXMubmEoelssMl0pICYgc3RyX2RldGVjdCh6WywzXSwiXlxcZCskIiksIHBhc3RlKHRyaW13cyh6WywyXSksICB0cmltd3MoelssM10pKSwgelssM10pfSkJCnVubGlzdChzYXBwbHkocmVzdWx0LCBmdW5jdGlvbih6KSBzdHJzcGxpdChwYXN0ZSh6LCBjb2xsYXBzZT0iOyIpLCAiXFxzKjtcXHMqIikpKQo=