library(XML)
# Parse the XML document into an internal (C-level) tree so that XPath
# queries can be run against it.
# NOTE(review): the name `PDF` suggests the XML is an export of a PDF
# document -- confirm against the data source.
PDF <- xmlTreeParse("CRF/Datasets/test.xml", useInternalNodes = TRUE)

# Collect every <Page> element that carries a `number` attribute.
pages <- getNodeSet(PDF, "//Page[@number]")

# Per-page `number` attribute, kept as a list of named character vectors.
page <- lapply(pages, function(x) xmlAttrs(x)["number"])

# Per-page word text and per-page word-box attributes.
# The paths are deliberately relative ("./..."): an XPath that starts with
# "//" is evaluated from the document root even when a node is supplied as
# the context, so the absolute form would return the words of *every* page
# for each individual page.
values <- lapply(
  pages, xpathApply,
  path = "./Content/Para/Box/Word/Text", xmlValue
)
pos <- lapply(
  pages, xpathApply,
  path = "./Content/Para/Box/Word/Box[@*]", xmlAttrs
)
bGlicmFyeShYTUwpCgojUmVhZCBpbiBQREYgZmlsZQoKUERGIDwtIHhtbFRyZWVQYXJzZSgiQ1JGL0RhdGFzZXRzL3Rlc3QueG1sIiwgdXNlSW50ZXJuYWxOb2Rlcz1UUlVFKQoKI0dldCB0aGUgcGFnZS90ZXh0L2xvY2F0aW9uIGluZm9ybWF0aW9uCgpwYWdlcyA8LSBnZXROb2RlU2V0KFBERiwgIi8vUGFnZVtAbnVtYmVyXSIpCnBhZ2UgPC0gbGFwcGx5KHBhZ2VzLCBmdW5jdGlvbih4KSB4bWxBdHRycyh4KVsibnVtYmVyIl0pCgp2YWx1ZXMgPC0gbGFwcGx5KHBhZ2VzLCB4cGF0aEFwcGx5LCBwYXRoPSIvL1BhZ2UvQ29udGVudC9QYXJhL0JveC9Xb3JkL1RleHQiLCB4bWxWYWx1ZSkKCnBvcyA8LSBsYXBwbHkocGFnZXMsIHhwYXRoQXBwbHksIHBhdGg9Ii8vUGFnZS9Db250ZW50L1BhcmEvQm94L1dvcmQvQm94W0AqXSIsIHhtbEF0dHJzKQ==