Scrape through multiple local html files in one directory-CodePudding

I am quite new to Python but I need to scrape several local html files. I managed to make the code work for one file like this:

from bs4 import BeautifulSoup
import html5lib
import pandas as pd

# Parse one saved HTML page. The context manager closes the file handle
# once BeautifulSoup has consumed it, even if parsing raises.
with open("file_01.html", "r") as myFile:
    doc = BeautifulSoup(myFile, "html.parser")

# Every <div> element carrying the CSS class "message".
messages = doc.select('div.message')

...

Now I need to make the code run through all html files in a specific directory. I tried it like this:

from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import os

# Walk the directory and parse every file whose name ends in ".html".
for filename in os.listdir("/Users/xyz/Documents/testdirectory"):
      
    if filename.endswith('.html'):
          
        # Build the full path from the directory and the bare file name.
        fname = os.path.join("/Users/xyz/Documents/testdirectory", filename)
        print("Current file name ..", os.path.abspath(fname))
        
        with open(fname, 'r') as file:
            
            # `doc` is rebound on every iteration, so once the loop ends it
            # holds only the soup of the last file that was parsed.
            doc = BeautifulSoup(file.read(), 'html.parser')

# NOTE(review): this statement is at column 0, i.e. outside the for loop.
# It runs once, after the loop has finished, so only the last file's `doc`
# is queried -- which is why only one file appears to be scraped.
messages = doc.select('div.message')

The code prints out the correct number and names of all html files in the directory.

BUT: It scrapes only through one of them. No matter which or how many files I put in the directory.

Could somebody tell me what I am doing wrong? Thanks.

CodePudding user response：

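Try putting the `messages` assignment at the same indentation level as `doc` (inside the `with` block); otherwise it runs only once, after the loop has finished, and acts only on the last file. Example: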


from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import os

# Directory holding the saved HTML pages.
directory = "/Users/xyz/Documents/testdirectory"

for filename in os.listdir(directory):
    # Skip anything that is not an HTML file.
    if not filename.endswith('.html'):
        continue

    fname = os.path.join(directory, filename)
    print("Current file name ..", os.path.abspath(fname))

    with open(fname, 'r') as file:
        markup = file.read()
        # Parse and query inside the loop so that every file is handled,
        # not just the last one.
        doc = BeautifulSoup(markup, 'html.parser')
        messages = doc.select('div.message')
        print(messages)

CodePudding user response：

# Accumulate one dict per message across ALL files. The list must be created
# before the loop: creating it per file would discard every file but the
# last, and would leave `rows` undefined when no .html file is found.
rows = []

for filename in os.listdir("/Users/xyz/Documents/testdirectory"):

    if filename.endswith('.html'):

        fname = os.path.join("/Users/xyz/Documents/testdirectory", filename)
        print("Current file name ..", os.path.abspath(fname))

        with open(fname, 'r') as file:

            doc = BeautifulSoup(file.read(), 'html.parser')
            messages = doc.select('div.message')

            for message in messages:
                print('---')

                row = {}

                # Subscripting a tag raises KeyError if the attribute is
                # absent; this assumes every message div carries an id.
                row['id_number'] = message['id']

                try:
                    row['time'] = message.select_one('div[title]').get('title')
                except AttributeError:
                    # select_one returned None: no nested div has a title.
                    print("Couldn't find the time")

                print(row)

                rows.append(row)

# One DataFrame row per message, covering every parsed file.
df = pd.DataFrame(rows)

CodePudding user response：

The `doc.select` call is not indented properly: it sits outside the loop, so it runs only once, on the last file parsed. You can also simplify this code with `glob` as follows:

from glob import glob
from bs4 import BeautifulSoup

# Glob pattern matching every saved HTML page in the target directory.
GPATH = '/Users/xyz/Documents/testdirectory/*.html'

for html_path in glob(GPATH):
    # Read the whole file first so the handle is released before parsing.
    with open(html_path) as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, 'html.parser')
    for message in soup.select('div.message'):
        pass # process elements here