I am trying to convert an XML file into a data frame. The XML file was downloaded from here (for the year 2020):
https://prtr.defra.gov.uk/full-dataset
library(XML)
# Build the path to the downloaded XML file in the current working directory.
fileName <- file.path(getwd(), 'uk_prtr_dataset_2020.xml')
# Parse the file into an XML document object.
xmlData <- xmlInternalTreeParse(fileName)
# print xml data
print(xmlData)
# convert xml data into a dataframe
# NOTE(review): this is the call that produces the error shown below. The
# error message shows names(nodes[[i]]) being used as column subscripts, so
# presumably a record has several child elements with the same name --
# TODO confirm against the file.
xmlDataFrame <- xmlToDataFrame(xmlData)
Error in `[<-.data.frame`(`*tmp*`, i, names(nodes[[i]]), value = c(NationalID = "Wales_NP3037AF", :
duplicate subscripts for columns
I can't figure out what this error means or how to fix it.
CodePudding user response:
Here's how I went about it after doing some exploration.
# Load the packages this answer uses: xml2 for the XML functions and the
# tidyverse for the pipe and the data-frame helpers.
library(xml2)
library(tidyverse)

# Parse the 2020 dataset into an xml2 document.
# NOTE(review): `dir_ls` is not defined in this snippet -- presumably a
# user-defined list of project directories; substitute your own path.
xml_file <- read_xml(
  file.path(dir_ls$input, "raw_data", "uk_prtr_dataset_2020.xml")
)
I figured out which columns I need:
# Names of the XML elements to extract; each becomes one column of the result.
node_vec <- c(
  "ParentCompanyName",
  "FacilityName",
  "LongitudeMeasure",
  "LatitudeMeasure",
  "RiverBasinDistrictID",
  "MainEconomicActivityName"
)
And then wrote a loop to create a data frame:
# Extract the text of every element named in `node_vec` and combine the
# results into one data frame with one column per element name.
#
# xml2::xml_text() reads the element text directly. This replaces the
# as_list() -> simplify() -> enframe() -> unnest_wider() -> select(...1)
# round trip, which depended on the auto-generated column name `...1`.
temp_list <- lapply(node_vec, function(node_ref) {
  var_ref <- paste0(".//rsm:", node_ref)
  values <- xml2::xml_text(xml2::xml_find_all(xml_file, var_ref))
  # Record empty elements as missing rather than as empty strings.
  values[!nzchar(values)] <- NA_character_
  # Progress/diagnostic output: how many matches each element name produced.
  print(paste0(node_ref, ": ", length(values)))
  values
})
names(temp_list) <- node_vec

# Binding columns of different lengths can silently recycle the shorter ones
# (when the lengths divide evenly) and misalign records, so require that
# every element occurs the same number of times before combining.
node_counts <- lengths(temp_list)
if (length(unique(node_counts)) > 1L) {
  stop(
    "Elements occur a different number of times and cannot be aligned ",
    "row by row: ",
    paste0(names(node_counts), " = ", node_counts, collapse = ", "),
    call. = FALSE
  )
}

results <- data.frame(temp_list, check.names = FALSE, stringsAsFactors = FALSE)
head(results)
CodePudding user response:
This is how to get a table containing one address per row:
library(tidyverse)
library(xml2)
# Step 1: parse the file and collect every rsm:Address node.
address_nodes <- xml_find_all(
  read_xml("~/Downloads/uk_prtr_dataset_2007.xml"),
  ".//rsm:Address"
)

# Step 2: convert the nodes to nested R lists and put one address per row
# (columns `name` and `value`).
address_tbl <- enframe(simplify(as_list(address_nodes)))

# Step 3: spread the children of each address into columns, then spread
# CityName the same way.
address_wide <- unnest_wider(unnest_wider(address_tbl, value), CityName)

# Step 4: StreetName and PostcodeCode are each unnested twice, exactly as in
# the original pipeline (presumably because each is a list nested inside a
# list). The result is left as the final expression so it prints.
address_wide %>%
  unnest(StreetName) %>%
  unnest(StreetName) %>%
  unnest(PostcodeCode) %>%
  unnest(PostcodeCode)
#name StreetName ...1 PostcodeCode BuildingNumber
#<int> <chr> <chr> <chr> <list>
# 1 1 "Klondyke Building Cromac Street" "Belfast" BT7 2JA <NULL>
# 2 2 "Heriot Watt Research Park, Avenue North, Riccaton" "Edinburgh" EH14 4AP <list [1]>
# 3 3 "Parkway Avenue" "Sheffield" S9 4WF <list [1]>
# 4 4 "3 Whitehall Place" "London" SW1A 2AW <NULL>
# 5 5 "All correspondence to: Defra, Industrial Pollution, 5F Ergon House, Horseferry Rd. London. " "East Riding of Yorkshire" SW1P 2AL <NULL>
# 6 6 "All correspondence to: Defra, Industrial Pollution, 5F Ergon House, Horseferry Rd. London. " "East Staffordshire " SW1P 2AL <NULL>