I scraped a website with multiple pages which contain table data. Now I have the data as multiple lists. How do I convert the lists to a dataframe?
# Search endpoint; both the initial POST and the paged GET requests target it.
url = 'https://www.insolvencydirect.bis.gov.uk/fip1/Home/Search'
# One session object shared by every request in this script
# (presumably so the search state carries over between pages — confirm).
session = requests.Session()
def make_initial_request():
    """Post the search form with every field blank and process the first page.

    The request goes to ``url`` through the shared ``session``; the response
    is handed to ``get_page_content`` with the header row included.

    Returns:
        Whatever ``get_page_content`` returns for the first page.
    """
    blank_search = {
        'IPForename': '',
        'IPSurname': '',
        'IPCompany': '',
        'IPTown': '',
        'IPNumber': '',
        'IPCounty': '',
    }
    page = session.post(url, blank_search)
    # Pass the callee's result through so an assignment such as
    # ``First_page = make_initial_request()`` receives it.
    return get_page_content(page, True)
def get_page_content(page, show_header):
    """Parse every table row of a response, print it, and return the rows.

    Args:
        page: Response object whose ``content`` attribute holds the page HTML.
        show_header: When false, the first ``<tr>`` of the page is skipped.

    Returns:
        A list holding the ``get_attributes`` result for each processed row,
        in page order.
    """
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('tr')
    lists = results
    if not show_header:
        # presumably the first <tr> is the column-title row — confirm
        lists = results[1:]
    parsed_rows = []
    for row in lists:
        IP_data = get_attributes(row)
        print(IP_data)
        # Keep each row so the caller can build a table from them instead of
        # only seeing them on stdout.
        parsed_rows.append(IP_data)
    return parsed_rows
def make_requests_for_pages(i):
    """Fetch result page number ``i`` and process it without its header row.

    Args:
        i: Page number appended to ``url`` as the ``Page`` query parameter.

    Returns:
        Whatever ``get_page_content`` returns for that page.
    """
    # The pieces must be joined with "+"; writing them side by side is a
    # syntax error.
    page = session.get(url + "?Page=" + str(i))
    return get_page_content(page, False)
def get_attributes(table_row):
    """Split the text of one table row into a list of strings.

    The row is re-parsed from its string form, its text is stripped and split
    on newlines, and the list is padded with empty strings until it has at
    least 8 entries. Longer lists are returned unchanged.

    Returns:
        The list of strings, or ``None`` when the row contains no text.
    """
    # NOTE(review): the snippet lost its indentation; this assumes the
    # ``return`` belonged inside the ``if`` — confirm.
    row_text = BeautifulSoup(str(table_row), 'html.parser').get_text()
    if not row_text:
        return None
    fields = row_text.strip().split('\n')
    missing = 8 - len(fields)
    if missing > 0:
        fields.extend([''] * missing)
    return fields
# Run the blank search and process the first result page (header included).
First_page = make_initial_request()
# range(2, 3) yields only page 2; raise the stop value to walk more pages.
for i in range(2, 3):
    # The loop body must be indented under the ``for``. Note that
    # ``Other_pages`` is rebound on every pass, so it only ever refers to the
    # most recently fetched page.
    Other_pages = make_requests_for_pages(i)
This is the output below. There are more lists, and each list contains the data for one person. The first list, with "Name", "Company", etc., is what I would like to use as the headers in the dataframe.
['Name', '', '', 'Company', '', '', 'Address', '', '', 'Telephone', '', '', 'Fax', '', '', 'Email', '', '', 'IP No', '', '', 'Authorising Body']
['Mr ROBERT SCHNEIDERMAN', '', '18 Downage, LONDON, NW4 1AH', '07770 783757', '', '[email protected]', '8733', 'ICAEW']
['Mr SHAGUN S DUBEY', '', '63 Stanhope Avenue, LONDON, N3 3LY', '', '', '[email protected]', '9216', 'ICAEW']
['Mr NICK T C HILL', '11/F', 'Greenville, 2 Glenealy, HONG KONG', '07726 631 405', '', '[email protected]', '7239', 'ICAEW']
['Mr Daniel Allen', '360 Insolvency Limited', '1 Castle Hill Court, Castle Hill, ROCHESTER, Kent, ME1 1LF', '01634 475546', '', '[email protected]', '21334', 'ICAEW']
['Mr ANTHONY JOHN SARGEANT', 'A J Sargeant & Co Limited', '7 Newfield Court, 586 Fulwood Road, Sheffield, S10 3QE', '0114 268 1862', '', '[email protected]', '9659', 'ICAEW']
['Mr ALAN STUART BRADSTOCK', 'AABRS Limited', 'Langley House, Park Road, LONDON, N2 8EY', '0208 444 2000', '0208 444 3400', '[email protected]', '5956', 'IPA']
['Mr KEVIN MCLEOD', 'AABRS Limited', 'Langley House, Park Road, LONDON, N2 8EY', '0208 444 3400', '', '[email protected]', '9438', 'ICAS']
['Mr DAVID SIMON MATTHEW EDWARDS', 'Aaron & Partners LLP', '5-7 Grosvenor Court, Foregate Street, CHESTER, CH1 1HG', '01244 405555', '01244 405566', '[email protected]', '8244', 'ICAEW']
['Mrs Janette Louise Chillery-Belcher', 'Aaron & Partners LLP', '5-7 Grosvenor Court, Foregate Street, CHESTER, CH1 1HG', '01244 405555', '', '[email protected]', '22172', 'ICAEW']
['Miss TRACY ANN TAYLOR', 'Abbey Taylor Ltd', "Unit 6, Twelve O'Clock Court, 21 Attercliffe Road, SHEFFIELD, S4 7WW", '0114 292 2402', '0114 292 2403', '[email protected]', '8899', 'ICAEW']
CodePudding user response:
If you want to build a CSV-like "dataframe" (a string laid out like the contents of a CSV file), you can do it like this:
# All printed rows collected into one list of lists: the first row holds the
# column titles and each following row is one person's record. With the data
# in this shape the script below runs as-is.
lists = [
    ['Name', '', '', 'Company', '', '', 'Address', '', '', 'Telephone', '', '', 'Fax', '', '', 'Email', '', '', 'IP No', '', '', 'Authorising Body'],
    ['Mr ROBERT SCHNEIDERMAN', '', '18 Downage, LONDON, NW4 1AH', '07770 783757', '', '[email protected]', '8733', 'ICAEW'],
    ['Mr SHAGUN S DUBEY', '', '63 Stanhope Avenue, LONDON, N3 3LY', '', '', '[email protected]', '9216', 'ICAEW'],
    ['Mr NICK T C HILL', '11/F', 'Greenville, 2 Glenealy, HONG KONG', '07726 631 405', '', '[email protected]', '7239', 'ICAEW'],
    ['Mr Daniel Allen', '360 Insolvency Limited', '1 Castle Hill Court, Castle Hill, ROCHESTER, Kent, ME1 1LF', '01634 475546', '', '[email protected]', '21334', 'ICAEW'],
    ['Mr ANTHONY JOHN SARGEANT', 'A J Sargeant & Co Limited', '7 Newfield Court, 586 Fulwood Road, Sheffield, S10 3QE', '0114 268 1862', '', '[email protected]', '9659', 'ICAEW'],
    ['Mr ALAN STUART BRADSTOCK', 'AABRS Limited', 'Langley House, Park Road, LONDON, N2 8EY', '0208 444 2000', '0208 444 3400', '[email protected]', '5956', 'IPA'],
    ['Mr KEVIN MCLEOD', 'AABRS Limited', 'Langley House, Park Road, LONDON, N2 8EY', '0208 444 3400', '', '[email protected]', '9438', 'ICAS'],
    ['Mr DAVID SIMON MATTHEW EDWARDS', 'Aaron & Partners LLP', '5-7 Grosvenor Court, Foregate Street, CHESTER, CH1 1HG', '01244 405555', '01244 405566', '[email protected]', '8244', 'ICAEW'],
    ['Mrs Janette Louise Chillery-Belcher', 'Aaron & Partners LLP', '5-7 Grosvenor Court, Foregate Street, CHESTER, CH1 1HG', '01244 405555', '', '[email protected]', '22172', 'ICAEW'],
    ['Miss TRACY ANN TAYLOR', 'Abbey Taylor Ltd', "Unit 6, Twelve O'Clock Court, 21 Attercliffe Road, SHEFFIELD, S4 7WW", '0114 292 2402', '0114 292 2403', '[email protected]', '8899', 'ICAEW'],
]


def quotedStr(x):
    """Return ``x`` wrapped in double quotes, or ``''`` when ``x`` is empty.

    Double quotes inside ``x`` are doubled so the quoted value stays intact.
    """
    return '"' + x.replace('"', '""') + '"' if x != '' else ''


def rows_to_csv(rows):
    """Render ``rows`` (a list of lists of strings) as CSV-style text.

    Each non-empty row becomes one line: its values are passed through
    ``quotedStr``, joined with commas, and terminated with a newline.
    """
    # Joining places the separators by position. Comparing each value with
    # the row's last value would end the line early whenever an earlier
    # field happens to equal the last one (e.g. two empty strings).
    return ''.join(
        ','.join(quotedStr(element) for element in row) + '\n'
        for row in rows
        if row  # an empty row contributes nothing to the output
    )


dataFrame = rows_to_csv(lists)
# dataFrame is a plain string laid out like the contents of a CSV file
print(dataFrame)
CodePudding user response:
You could change these lines:
# the loop in get_page_content that only prints each parsed row
for row in lists:
    IP_data = get_attributes(row)
    print(IP_data)
to
# Column titles, taken from the header row shown in the question's output.
COLUMNS = ['Name', 'Company', 'Address', 'Telephone', 'Fax', 'Email',
           'IP No', 'Authorising Body']
records = []  # declare somewhere before the loop (once, not once per page)
for row in lists:
    attributes = get_attributes(row)
    # get_attributes returns None for a row without text; skip those.
    if attributes:
        # Keep exactly one value per column.
        # NOTE(review): on the first page `lists` still contains the header
        # row itself — confirm it is skipped before it reaches this point.
        records.append(attributes[:len(COLUMNS)])
# Build the frame once, after all rows have been collected: calling
# pd.concat for every row copies the whole frame each time and leaves every
# row with the same index label 0.
IP_data = pd.DataFrame(records, columns=COLUMNS)