library(XML)

# Read in the PDF file (stored as XML). useInternalNodes = TRUE keeps the
# document as a C-level tree, which is required for the XPath queries below.

PDF <- xmlTreeParse("CRF/Datasets/test.xml", useInternalNodes = TRUE)

# Get the page/text/location information

# One node per <Page> element that carries a "number" attribute.
pages <- getNodeSet(PDF, "//Page[@number]")

# The "number" attribute of each page, in the same order as `pages`.
page <- lapply(pages, function(x) xmlAttrs(x)["number"])

# NOTE: the XPath expressions below are deliberately relative ("./...").
# An expression starting with "//" is evaluated from the document root even
# when the context is a single page node, which would make every list element
# contain the words of the whole document instead of just its own page.

# Text of every word, grouped by page.
values <- lapply(
  pages,
  xpathApply,
  path = "./Content/Para/Box/Word/Text",
  xmlValue
)

# Attributes of every word's <Box> element (those having at least one
# attribute), grouped by page.
pos <- lapply(
  pages,
  xpathApply,
  path = "./Content/Para/Box/Word/Box[@*]",
  xmlAttrs
)