I'm trying to input zip codes taken from a predefined excel file into a website to fetch populated results and write them back to the same file in a new column. To be specific, each zip code should produce a household number, which is what I want to record and enter in the final column of the excel file.
With my current attempt, the script is doing fine. What I wish to do now is apply concurrent.futures
library within the script to make it faster.
import concurrent.futures as futures
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'accept': 'text/html,application/xhtml xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'referer': 'https://www.unitedstateszipcodes.org/60634/'
}
link = 'https://www.unitedstateszipcodes.org/{}/'
def get_content(company,url):
res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text,"lxml")
household = int(soup.select_one("th:-soup-contains('Occupied Housing Units') td").get_text(strip=True).replace(",",""))
company.offset(0,3).value = household
return household
if __name__ == '__main__':
wb = load_workbook('input.xlsx')
ws = wb['result']
for company in ws['B']:
if company.value == "Zip Code": continue
if not company.value: break
content_url = link.format(company.value)
print(get_content(company,content_url))
wb.save('input.xlsx')
# if __name__ == "__main__":
# wb = load_workbook('input.xlsx')
# ws = wb['result']
# with futures.ThreadPoolExecutor(max_workers=6) as executor:
# future_to_url = {
# executor.submit(get_content, company, link.format(company.value)): company for company in ws['B']
# }
# futures.as_completed(future_to_url)
# wb.save('input.xlsx')
Here is the excel file for your consideration and this is how it looks like.
CodePudding user response:
How about trying this?
Note: I've refactored your code for clarity and simplicity.
import concurrent.futures as futures
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'referer': 'https://www.unitedstateszipcodes.org/60634/'
}
def get_content(url: str) -> int:
return int(
BeautifulSoup(session.get(url, headers=headers).text, "lxml")
.select_one("th:-soup-contains('Occupied Housing Units') td")
.get_text(strip=True)
.replace(",", "")
)
if __name__ == '__main__':
df = pd.read_excel('input.xlsx')
with requests.Session() as session:
with futures.ThreadPoolExecutor(max_workers=6) as executor:
future_to_url = {
executor.submit(
get_content,
f'https://www.unitedstateszipcodes.org/{zip_code}/'
): zip_code for zip_code in df['Zip Code']
}
futures.as_completed(future_to_url)
for future in future_to_url:
df.loc[df['Zip Code'] == future_to_url[future], 'Total households'] = future.result()
print(df.to_markdown(index=False))
df.to_excel('output.xlsx', index=False)
This should print:
| State | Zip Code | High Speed | 100M | Total households |
|:--------|-----------:|-------------:|-------:|-------------------:|
| IL | 60618 | 32198 | 3038 | 35407 |
| IL | 60647 | 29868 | 3526 | 34330 |
| IL | 60641 | 18340 | 1811 | 24770 |
| IL | 60634 | 17743 | 1544 | 25712 |
| IL | 60402 | 14554 | 1533 | 21338 |
| CA | 95969 | 12074 | 0 | 12475 |
| NC | 28269 | 13071 | 209 | 27076 |
| TX | 78245 | 10094 | 326 | 17691 |
| IL | 60622 | 15794 | 6519 | 24124 |
| IL | 60630 | 11730 | 1852 | 20497 |
| CA | 90280 | 12637 | 92 | 23278 |
| CA | 90026 | 9878 | 191 | 24956 |
| TX | 78207 | 9140 | 14 | 15851 |
| CA | 94112 | 11608 | 2129 | 21975 |
| TX | 78228 | 8957 | 147 | 18890 |
| CA | 90201 | 7450 | 11 | 24104 |
| FL | 32828 | 7586 | 338 | 20057 |
| GA | 30043 | 9045 | 189 | 25997 |
| GA | 30349 | 10507 | 656 | 25422 |
| FL | 33027 | 7457 | 54 | 21028 |
And you should also get an Excel file output.xlsx
with the same data as above.