I'm trying to extract information from the HTML texts that I get from URLs that I create from a For loop and then use beautiful soup.
I get to isolate the information correctly but when I'm trying to export the data I get an error message "All arrays must be of the same length"
weblink = []
filing_type = []
company_name = []
date = []
#Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')
#Getting companie's names into list
companies_column=list(df.columns.values)[4]
name_ = df[companies_column].tolist()
#Formatting company's names for creating URLs
for CompanyName in name_:
company_name.append(CompanyName.lower().replace(" ",'_'))
company_name
for item in range(0, len(company_name)):
link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' company_name[item] '&type=10-K&dateb=&owner=exclude&count=100'
#Getting the HTML text
headers = random.choice(headers_list)
r = requests.Session()
r.headers = headers
html = r.get(link).text
#Calling beautiful soup for better HTML text
soup = bs.BeautifulSoup(html)
tet_ = soup.find_all("a", id = "documentsbutton")
#Get the links
for link in tet_:
weblink.append('https://www.sec.gov' link.get('href'))
test11 = soup.find_all("table", class_= "tableFile2")
for link in test11:
row = link.find_all("td", nowrap = "nowrap")
for i in range(0, len(row), 3):
filing_type.append(row[i].getText())
date.append(link.find("td", class_ = "small").find_next_sibling("td").text)
name.append(company_name[item])
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
CodePudding user response:
data = {'Company Name':name,'Filing Date': date,'Filing Type':filing_type,"Weblink":weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')