I am quite new to Python but I need to scrape several local html files. I managed to make the code work for one file like this:
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
# Parse one saved HTML page and collect every <div class="message"> element.
# The with-block closes the file handle as soon as parsing has finished;
# BeautifulSoup reads the whole file while it is being constructed.
with open("file_01.html", "r") as myFile:
    doc = BeautifulSoup(myFile, "html.parser")
messages = doc.select('div.message')
...
Now I need to make the code run through all html files in a specific directory. I tried it like this:
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import os
# NOTE(review): the indentation of this snippet was lost when it was pasted.
# Going by the answers below, the `messages = ...` line presumably sat outside
# the for loop, so it ran only once, on the `doc` left over from the last file.
for filename in os.listdir("/Users/xyz/Documents/testdirectory"):
if filename.endswith('.html'):
fname = os.path.join("/Users/xyz/Documents/testdirectory", filename)
print("Current file name ..", os.path.abspath(fname))
with open(fname, 'r') as file:
doc = BeautifulSoup(file.read(), 'html.parser')
messages = doc.select('div.message')
The code prints out the correct number and names of all html files in the directory.
BUT: it only scrapes one of them, no matter which files or how many files I put in the directory.
Could somebody tell me what I am doing wrong? Thanks.
CodePudding user response:
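Try indenting the `messages` assignment so that it sits inside the loop, at the same level as `doc`; otherwise it runs only once, after the loop has finished, and therefore only acts on the last file. Example: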
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import os
# Parse every .html file in the directory. The select() call is inside the
# loop so that each file is handled, not just the last one that was opened.
directory = "/Users/xyz/Documents/testdirectory"
for filename in os.listdir(directory):
    if not filename.endswith('.html'):
        continue  # skip anything that is not an HTML file
    fname = os.path.join(directory, filename)
    print("Current file name ..", os.path.abspath(fname))
    with open(fname, 'r') as file:
        doc = BeautifulSoup(file.read(), 'html.parser')
    messages = doc.select('div.message')
    print(messages)
CodePudding user response:
# Collect one row per <div class="message"> across every .html file in the
# directory, then build a single DataFrame from all of the rows.
directory = "/Users/xyz/Documents/testdirectory"
rows = []  # created once, before the loop, so rows from every file accumulate
for filename in os.listdir(directory):
    if not filename.endswith('.html'):
        continue  # skip anything that is not an HTML file
    fname = os.path.join(directory, filename)
    print("Current file name ..", os.path.abspath(fname))
    with open(fname, 'r') as file:
        doc = BeautifulSoup(file.read(), 'html.parser')
    messages = doc.select('div.message')
    for message in messages:
        print('---')
        row = {}
        row['id_number'] = message['id']
        # select_one() returns None when no nested div carries a title
        # attribute; check for that explicitly instead of a bare except.
        time_div = message.select_one('div[title]')
        if time_div is None:
            print("Couldn't find the time")
        else:
            row['time'] = time_div.get('title')
        print(row)
        rows.append(row)
# Built once, after the loop, so the frame holds the rows of all files.
df = pd.DataFrame(rows)
CodePudding user response:
`doc.select` is not indented properly: it has to be inside the loop so that it runs for every file. You can also simplify this code with `glob` as follows:
from glob import glob
from bs4 import BeautifulSoup
GPATH = '/Users/xyz/Documents/testdirectory/*.html'

# Visit every file matched by the pattern and walk its message divs. The
# select() loop is nested inside the file loop so that each file is processed.
for file in glob(GPATH):
    with open(file) as html_file:
        soup = BeautifulSoup(html_file.read(), 'html.parser')
    for message in soup.select('div.message'):
        pass  # process elements here