Home > Back-end >  Convert scraped data to JSON format using pandas library
Convert scraped data to JSON format using pandas library

Time:03-16

My code works fine and scrapes the data, but when I try to output to JSON it doesn't work. I have created a dictionary and tried to use pandas to convert to JSON. I get the following error: return dumps( OverflowError: Maximum recursion level reached

/usr/local/Cellar/[email protected]/3.9.5/Frameworks/Python.framework/Versions/3.9/lib/python3.9/subprocess.py:1052: ResourceWarning: subprocess 40103 is still running sys:1: ResourceWarning: unclosed file <_io.BufferedWriter name=4> sys:1: ResourceWarning: unclosed <socket.socket fd=6, family=AddressFamily.AF_INET6, type=SocketKind.SOCK_STREAM, proto=6, laddr=('::1', 57427, 0, 0), raddr=('::1', 57423, 0, 0)>

Here is my code:

import unittest
import time
import pandas as pd
import json
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

test_url = "https://brocku.ca/guides-and-timetables/timetables/?session=fw&type=ug&level=all&program=ACTG"

before_XPath = "/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr["
aftertd_XPath = "]/td["
aftertr_XPath = "]"

course_code = []
course_title = []

class WebTableTest(unittest.TestCase):

    def setUp(self):
        self.driver = webdriver.Chrome('/Users/Downloads/chromedriver')
        self.driver.maximize_window()

    # def test_1_get_num_rows_(self):
    #     driver = self.driver
    #     driver.get(test_url)
    #
    #     WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="7858101"]')))
    #     # time.sleep(10)
    #     # num_rows = len (driver.find_elements(By.XPATH, '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]'));
    #     num_rows = len(driver.find_elements(By.XPATH,
    #                                         '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr'));
    #
    #     # num_rows = len (driver.find_elements_by_xpath("//*[@id='datatable-1547']/tbody/tr"))
    #     print("Rows in table are "   repr(num_rows))

    # def test_1_get_num_columns_(self):
    #     driver = self.driver
    #     driver.get(test_url)
    #
    #     WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="7858101"]')))
    #     # time.sleep(10)
    #     # num_rows = len (driver.find_elements(By.XPATH, '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]'));
    #     num_rows = len(driver.find_elements(By.XPATH,
    #                                         '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr[2]/td'));
    #
    #     # num_rows = len (driver.find_elements_by_xpath("//*[@id='datatable-1547']/tbody/tr"))
    #     print("Columns in table are "   repr(num_rows))

    def test_get_row_col_info_(self):
        driver = self.driver
        driver.get(test_url)


        # course_code.append(driver.find_elements(By.XPATH,
        #                                 '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr'));
        #
        # course_title.append(driver.find_elements(By.XPATH,
        #                                    '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr[2]/td'));

        # time.sleep(30)
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '//*[@id="7858101"]')))

        rows = len(driver.find_elements(By.XPATH,
                                        '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr'));
        # print (rows)
        columns = len(driver.find_elements(By.XPATH,
                                           '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr[2]/td'));
        # print(columns)

        # print("Company" "               " "Contact" "               " "Country")

        for t_row in range(2, (rows   1)):
            for t_column in range(1, (columns   1)):
                course_code.append(driver.find_elements(By.XPATH,
                                                        '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr'));

                course_title.append(driver.find_elements(By.XPATH,
                                                         '/html/body/div/div[8]/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div/div[2]/div[4]/table/tbody/tr[2]/td'));
                FinalXPath = before_XPath   str(t_row)   aftertd_XPath   str(t_column)   aftertr_XPath
                cell_text = driver.find_element_by_xpath(FinalXPath).text
                # print(cell_text, end = '               ')
                print(cell_text)
            print()
        df = pd.DataFrame({'course_code': course_code, 'course_title': course_title})
        df.to_json('ACTG.json')


if __name__ == "__main__":
    unittest.main()

CodePudding user response:

The problem is that course_code and course_title both contain HTML elements from Selenium.

You can't serialize an entire Selenium HTML element to JSON. Instead, you must pick out the relevant values from the text content (or attributes) of these elements and put those in your dataframe instead of the elements before serializing it to JSON.

  • Related