I have a table full of XML values (all in character format) but unable to use read_xml with it. Have tried using as.list and str to help but get the following errors
Error in UseMethod("read_xml") : no applicable method for 'read_xml' applied to an object of class "list"
The table has a column full of XMLs in the below format simply titled "XML_M":
<Root>
<Product>
<Test_ID value="1" />
<Effective_Date value="2022-01-01" />
<Membership value="Yes" />
<Request>
<Request_ID value="1" />
<Request_type value="Simple" />
</Request>
<Request>
<Request_ID value="2" />
<Request_type value="Complex" />
</Request>
</Product>
</Root>
The objective is to get all the XML values and convert them into another table but that has the elements as columns. What I'm really asking for is what do I need to do to allow read_xml to read this column of XMLs.
edit: have attached a sample table here: Table
CodePudding user response:
While your question was effectively answered with R methods in previous question by @r2evans where you simply have to iterate down a column of a data frame instead of a single XML file, consider an SQL method by shredding the data with below XML query that you can call from R. Below assumes XML column is stored as XML type.
R (adjust my_table and my_xml_column)
...
xml_query <- (
"SELECT
Test_ID = prod.value('(Test_ID/@value)[1]', 'integer'),
Effective_Date = prod.value('(Effective_Date/@value)[1]', 'datetime'),
Membership = prod.value('(Membership/@value)[1]', 'varchar(50)'),
Request_ID = req.value('(Request_ID/@value)[1]', 'integer'),
Request_type = req.value('(Request_type/@value)[1]', 'varchar(50)')
FROM myTable
CROSS APPLY
my_xml_column.nodes('/Root/Product') AS x1(prod)
CROSS APPLY
my_xml_column.nodes('/Root/Product/Request') AS x2(req)"
)
xml_df <- DBI::dbGetQuery(conn, xml_query)
CodePudding user response:
Perhaps to seal this question, using code from the previous answer.
Sample data:
myframe <- structure(list(id = 1:2, xml = c("<Root>\n<Product>\n <Test_ID value=\"1\" />\n <Effective_Date value=\"2022-01-01\" />\n <Membership value=\"Yes\" />\n <Request>\n <Request_ID value=\"1\" />\n <Request_type value=\"Simple\" />\n </Request>\n <Request>\n <Request_ID value=\"2\" />\n <Request_type value=\"Complex\" />\n </Request>\n</Product>\n</Root>", "<Root>\n<Product>\n <Test_ID value=\"1\" />\n <Effective_Date value=\"2022-01-01\" />\n <Membership value=\"Yes\" />\n <Request>\n <Request_ID value=\"3\" />\n <Request_type value=\"Simple\" />\n </Request>\n <Request>\n <Request_ID value=\"4\" />\n <Request_type value=\"Complex\" />\n </Request>\n</Product>\n</Root>" )), class = "data.frame", row.names = c(NA, -2L))
myframe2 <- structure(list(id = 1:2, xml = c("<Root>\n<Product>\n <Test_ID value=\"1\" />\n <Effective_Date value=\"2022-01-01\" />\n <Membership value=\"Yes\" />\n <Request>\n <Request_ID value=\"1\" />\n <Request_type value=\"Simple\" />\n </Request>\n <Request>\n <Request_ID value=\"2\" />\n <Request_type value=\"Complex\" />\n </Request>\n</Product>\n</Root>", "<Root>\n<Product>\n <Test_ID value=\"1\" />\n <Effective_Date value=\"2022-01-01\" />\n <Request>\n <Request_ID value=\"3\" />\n <Request_type value=\"Simple\" />\n </Request>\n <Request>\n <Request_ID value=\"4\" />\n <Request_type value=\"Complex\" />\n </Request>\n</Product>\n</Root>" )), class = "data.frame", row.names = c(NA, -2L))
Functions from the previous answer:
func1 <- function(z) if (is.null(names(z))) attr(z, "value") else lapply(z, func1)
merge.list <- function(A, B) {
# normalize lengths, just in case, since I think you have more than one $Product
A <- lapply(A, `length<-`, max(lengths(A)))
B <- lapply(B, `length<-`, max(lengths(B)))
BnotA <- setdiff(names(B), names(A))
AnotB <- setdiff(names(A), names(B))
inboth <- intersect(names(A), names(B))
A[BnotA] <- replicate(length(BnotA), rep(NA, max(lengths(A))), simplify = FALSE)
A[AnotB] <- lapply(A[AnotB], function(z) c(z, rep(NA, max(lengths(B)))))
A[inboth] <- Map(c, A[inboth], B[inboth])
A
}
Processing the column of xmls:
intermediate <- lapply(myframe$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final <- lapply(intermediate, function(L) {
do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
}))
})
final
# [[1]]
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1 2022-01-01 Yes 1 Simple 1
# Product.2 2022-01-01 Yes 2 Complex 1
# [[2]]
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1 2022-01-01 Yes 3 Simple 1
# Product.2 2022-01-01 Yes 4 Complex 1
Depending on the structures, you might be able to do:
do.call(rbind, final)
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1 2022-01-01 Yes 1 Simple 1
# Product.2 2022-01-01 Yes 2 Complex 1
# Product.11 2022-01-01 Yes 3 Simple 1
# Product.21 2022-01-01 Yes 4 Complex 1
If there are any differences (missing columns), then you might need one of the variants to rbind
provided by other packages. For instance, if the second value of myframe$xml
did not have "Membership"
(as in myframe2
above), then
intermediate2 <- lapply(myframe2$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final2 <- lapply(intermediate2, function(L) {
do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
}))
})
final2
# [[1]]
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1 2022-01-01 Yes 1 Simple 1
# Product.2 2022-01-01 Yes 2 Complex 1
# [[2]]
# Effective_Date Request.Request_ID Request.Request_type Test_ID
# Product.1 2022-01-01 3 Simple 1
# Product.2 2022-01-01 4 Complex 1
and unfortunately
do.call(rbind, final2)
# Error in rbind(deparse.level, ...) :
# numbers of columns of arguments do not match
but we can do
dplyr::bind_rows(final2)
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1...1 2022-01-01 Yes 1 Simple 1
# Product.2...2 2022-01-01 Yes 2 Complex 1
# Product.1...3 2022-01-01 <NA> 3 Simple 1
# Product.2...4 2022-01-01 <NA> 4 Complex 1
data.table::rbindlist(final2, fill = TRUE, use.names = TRUE)
# Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# <char> <char> <char> <char> <char>
# 1: 2022-01-01 Yes 1 Simple 1
# 2: 2022-01-01 Yes 2 Complex 1
# 3: 2022-01-01 <NA> 3 Simple 1
# 4: 2022-01-01 <NA> 4 Complex 1