I have a spreadsheet with a column of CIK numbers and a column of url. I want to create a for loop for url column and get an excel output for every url which is named as the corresponding CIK number. I get stuck on this problem for a long time. Thank you for your help!
import urllib.request as urllib2
from bs4 import BeautifulSoup
import re
import requests
import os
import time
from pandas import DataFrame
import pandas as pd
from urllib.request import urlopen
#headers={"Content-Type":"text"}
headers = {'User-Agent': '[email protected]'}
# open the file
data=pd.read_excel('/content/experiment.xlsx')
# get the urls
urls=data.URL
CIK=data.CIK
for CIK, url in CIK, urls:
Cash=[]
response = requests.get(url, headers = headers)
response.raise_for_status()
time.sleep(0.1)
soup = BeautifulSoup(response.text,'lxml')
for table in soup.find_all('table'):
for tr in table.find_all('tr'):
for row_num in range(len(tr)):
row = []
for td in tr.find_all('td'):
for row_item in td.get_text(strip=True):
try:
if row_item[0] == 'Cash and cash equivalent':
aa = re.sub("(\xa0)|(\n)|,","",row_item.text)
row.append(aa)
except IndexError:
continue
Cash.append(row)
df = pd.DataFrame(data=Cash)
df.to_excel(f'/content/{CIK}.xlsx')
CodePudding user response:
for CIK, url in CIK, urls
is not a valid Python construction. Did you mean
for single_CIK, url in zip(CIK, urls)