Home > other >  Converting a table of XMLs into XMLs to use with read_xml R
Converting a table of XMLs into XMLs to use with read_xml R

Time:02-17

I have a table full of XML values (all in character format) but unable to use read_xml with it. Have tried using as.list and str to help but get the following errors

Error in UseMethod("read_xml") : no applicable method for 'read_xml' applied to an object of class "list"

The table has a column full of XMLs in the below format simply titled "XML_M":

<Root>
<Product>
    <Test_ID value="1" />
    <Effective_Date value="2022-01-01" />
    <Membership value="Yes" />
    <Request>
      <Request_ID value="1" />
      <Request_type value="Simple" />
    </Request>
    <Request>
      <Request_ID value="2" />
      <Request_type value="Complex" />
    </Request>
</Product>
</Root>

The objective is to get all the XML values and convert them into another table but that has the elements as columns. What I'm really asking for is what do I need to do to allow read_xml to read this column of XMLs.

edit: have attached a sample table here: Table

CodePudding user response:

While your question was effectively answered with R methods in previous question by @r2evans where you simply have to iterate down a column of a data frame instead of a single XML file, consider an SQL method by shredding the data with below XML query that you can call from R. Below assumes XML column is stored as XML type.

R (adjust my_table and my_xml_column)

...

xml_query <- (
    "SELECT
        Test_ID = prod.value('(Test_ID/@value)[1]', 'integer'), 
        Effective_Date = prod.value('(Effective_Date/@value)[1]', 'datetime'), 
        Membership = prod.value('(Membership/@value)[1]', 'varchar(50)'),
        Request_ID = req.value('(Request_ID/@value)[1]', 'integer'),
        Request_type = req.value('(Request_type/@value)[1]', 'varchar(50)')
    FROM myTable
    CROSS APPLY
        my_xml_column.nodes('/Root/Product') AS x1(prod)
    CROSS APPLY
        my_xml_column.nodes('/Root/Product/Request') AS x2(req)"
)

xml_df <- DBI::dbGetQuery(conn, xml_query)

SQL Fiddle Demo

CodePudding user response:

Perhaps to seal this question, using code from the previous answer.

Sample data:

myframe <- structure(list(id = 1:2, xml = c("<Root>\n<Product>\n    <Test_ID value=\"1\" />\n    <Effective_Date value=\"2022-01-01\" />\n    <Membership value=\"Yes\" />\n    <Request>\n      <Request_ID value=\"1\" />\n      <Request_type value=\"Simple\" />\n    </Request>\n    <Request>\n      <Request_ID value=\"2\" />\n      <Request_type value=\"Complex\" />\n    </Request>\n</Product>\n</Root>", "<Root>\n<Product>\n    <Test_ID value=\"1\" />\n    <Effective_Date value=\"2022-01-01\" />\n    <Membership value=\"Yes\" />\n    <Request>\n      <Request_ID value=\"3\" />\n      <Request_type value=\"Simple\" />\n    </Request>\n    <Request>\n      <Request_ID value=\"4\" />\n      <Request_type value=\"Complex\" />\n    </Request>\n</Product>\n</Root>" )), class = "data.frame", row.names = c(NA, -2L))
myframe2 <- structure(list(id = 1:2, xml = c("<Root>\n<Product>\n    <Test_ID value=\"1\" />\n    <Effective_Date value=\"2022-01-01\" />\n    <Membership value=\"Yes\" />\n    <Request>\n      <Request_ID value=\"1\" />\n      <Request_type value=\"Simple\" />\n    </Request>\n    <Request>\n      <Request_ID value=\"2\" />\n      <Request_type value=\"Complex\" />\n    </Request>\n</Product>\n</Root>", "<Root>\n<Product>\n    <Test_ID value=\"1\" />\n    <Effective_Date value=\"2022-01-01\" />\n    <Request>\n      <Request_ID value=\"3\" />\n      <Request_type value=\"Simple\" />\n    </Request>\n    <Request>\n      <Request_ID value=\"4\" />\n      <Request_type value=\"Complex\" />\n    </Request>\n</Product>\n</Root>" )), class = "data.frame", row.names = c(NA, -2L))

Functions from the previous answer:

func1 <- function(z) if (is.null(names(z))) attr(z, "value") else lapply(z, func1)
merge.list <- function(A, B) {
  # normalize lengths, just in case, since I think you have more than one $Product
  A <- lapply(A, `length<-`, max(lengths(A)))
  B <- lapply(B, `length<-`, max(lengths(B)))
  BnotA <- setdiff(names(B), names(A))
  AnotB <- setdiff(names(A), names(B))
  inboth <- intersect(names(A), names(B))
  A[BnotA] <- replicate(length(BnotA), rep(NA, max(lengths(A))), simplify = FALSE)
  A[AnotB] <- lapply(A[AnotB], function(z) c(z, rep(NA, max(lengths(B)))))
  A[inboth] <- Map(c, A[inboth], B[inboth])
  A
}

Processing the column of xmls:

intermediate <- lapply(myframe$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final <- lapply(intermediate, function(L) {
  do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
    as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
  }))
})
final
# [[1]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  1               Simple       1
# Product.2     2022-01-01        Yes                  2              Complex       1
# [[2]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  3               Simple       1
# Product.2     2022-01-01        Yes                  4              Complex       1

Depending on the structures, you might be able to do:

do.call(rbind, final)
#            Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1      2022-01-01        Yes                  1               Simple       1
# Product.2      2022-01-01        Yes                  2              Complex       1
# Product.11     2022-01-01        Yes                  3               Simple       1
# Product.21     2022-01-01        Yes                  4              Complex       1

If there are any differences (missing columns), then you might need one of the variants to rbind provided by other packages. For instance, if the second value of myframe$xml did not have "Membership" (as in myframe2 above), then

intermediate2 <- lapply(myframe2$xml, function(X) xml2::as_list(xml2::read_xml(X)))
final2 <- lapply(intermediate2, function(L) {
  do.call(rbind.data.frame, lapply(func1(L$Root), function(pr) {
    as.data.frame(lapply(split(pr, names(pr)), function(Y) Reduce(merge.list, Y)))
  }))
})
final2
# [[1]]
#           Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01        Yes                  1               Simple       1
# Product.2     2022-01-01        Yes                  2              Complex       1
# [[2]]
#           Effective_Date Request.Request_ID Request.Request_type Test_ID
# Product.1     2022-01-01                  3               Simple       1
# Product.2     2022-01-01                  4              Complex       1

and unfortunately

do.call(rbind, final2)
# Error in rbind(deparse.level, ...) : 
#   numbers of columns of arguments do not match

but we can do

dplyr::bind_rows(final2)
#               Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
# Product.1...1     2022-01-01        Yes                  1               Simple       1
# Product.2...2     2022-01-01        Yes                  2              Complex       1
# Product.1...3     2022-01-01       <NA>                  3               Simple       1
# Product.2...4     2022-01-01       <NA>                  4              Complex       1

data.table::rbindlist(final2, fill = TRUE, use.names = TRUE)
#    Effective_Date Membership Request.Request_ID Request.Request_type Test_ID
#            <char>     <char>             <char>               <char>  <char>
# 1:     2022-01-01        Yes                  1               Simple       1
# 2:     2022-01-01        Yes                  2              Complex       1
# 3:     2022-01-01       <NA>                  3               Simple       1
# 4:     2022-01-01       <NA>                  4              Complex       1
  • Related