I am trying to append XML data into empty lists to create a dataframe. I was able to create all lists besides three because some tags have null values. I have tried an xpath function to grab all the text from the desired tag.
import requests
from lxml import objectify
from lxml import etree
from bs4 import BeautifulSoup
URL = 'https://data.virginia.gov/api/views/xvir-sctz/rows.xml?accessType=DOWNLOAD'
response = requests.get(URL).content
import requests
from lxml import objectify
root = objectify.fromstring(response)
Here are some empty lists I would like to append
households_served = []
individuals_served = []
pounds_of_food_distributed = []
month = []
I tried this to give me lists, and it works.
pounds_of_food_distributed = root.xpath('//response/row/row/pounds_of_food_distributed/text()')
individuals_served = root.xpath('//response/row/row/individuals_served/text()')
households_served = root.xpath('//response/row/row/households_served/text()')
month = root.xpath('//response/row/row/month/text()')
But then I try to use pd.DataFrame with this code and I get an error.
table = pd.DataFrame(
{'Month': month,
'House': households_served,
'People': individuals_served,
'Pounds' : pounds_of_food_distributed
})
Any suggestions?
CodePudding user response:
Your problem is that some elements do not exist in the row
, but months always exist.
One idea is to fill the missing data with 0s or anything you want.
import requests
from lxml import objectify
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://data.virginia.gov/api/views/xvir-sctz/rows.xml?accessType=DOWNLOAD'
response = requests.get(URL).content
root = objectify.fromstring(response)
households_served = []
individuals_served = []
pounds_of_food_distributed = []
month = []
for element in root.xpath('//row/row'):
month.append(element["month"]) # month always exists
individuals_served.append(element["individuals_served"] if hasattr(element, "individuals_served") else 0)
households_served.append(element["households_served"] if hasattr(element, "households_served") else 0)
pounds_of_food_distributed.append(element["pounds_of_food_distributed"] if hasattr(element, "pounds_of_food_distributed") else 0)
print(len(month))
print(len(individuals_served))
print(len(pounds_of_food_distributed))
print(len(households_served))
table = pd.DataFrame(
{'Month': month,
'House': households_served,
'People': individuals_served,
'Pounds' : pounds_of_food_distributed
})
print(table)
The output:
CodePudding user response:
An alternative would be to use pandas
directly with pandas.read_xml
, set xpath
to all children row
of row
element and just slice output to your needs - This will also take care of the missing / empty elements in XML structure:
import pandas as pd
df = pd.read_xml('https://data.virginia.gov/api/views/xvir-sctz/rows.xml?accessType=DOWNLOAD', xpath='row//row')[['month','individuals_served','households_served','pounds_of_food_distributed']]
df.columns = ['Month','House','People','Pounds']
df
Month | House | People | Pounds | |
---|---|---|---|---|
0 | October | nan | nan | 156644 |
1 | April | nan | nan | 21602 |
2 | August | nan | nan | 51338 |
3 | May | 627 | 270 | 67633 |
4 | May | nan | nan | 54561 |
... | ||||
4254 | August | 37 | 17 | 482661 |
4255 | August | 1974 | 783 | 29211 |
4256 | April | 485 | 259 | 16254.5 |
4257 | August | 34986 | 8583 | 561709 |
4258 | June | 749 | 258 | 31560.7 |