Proper way of doing asyncio.to_thread for selenium.webdriver


I get almost zero performance gain.

I think I'm missing something here...

# imports used by the snippets below
import asyncio
import io

from PIL import Image
from selenium import webdriver


def take_page_scr(dict_item, driver) -> None:
    print(dict_item['id'])
    driver.get(dict_item['url'])
    driver.set_window_size(500, 900)
    (
        Image.open(
            io.BytesIO(
                driver.get_screenshot_as_png()
            )
        )
        .convert("RGB")
        .save(f"./dst/{dict_item['id']}.jpg", quality=85)
    )
    driver.quit()


def main_async(data):
    async def main(data):
        options = get_options()
        await asyncio.gather(
            *(
                asyncio.to_thread(
                    take_page_scr,
                    # note: a fresh Chrome instance is launched for every item
                    i, webdriver.Chrome(options=options)
                )
                for i in data
            )
        )
        print()
        print('#DONE')
    asyncio.run(
        main(
            data
        )
    )
    # 13.397236824035645
    # 13.26906943321228

Here is the basic synchronous setup for comparison:

def main_sync(data):
    options = get_options()
    driver = webdriver.Chrome(options=options)
    for i in data:
        print(i['id'])
        try:
            driver.get(i['url'])
            driver.set_window_size(500, 900)
            image_bytes = io.BytesIO(driver.get_screenshot_as_png())
            img = Image.open(image_bytes).convert("RGB")
            img.save(f"./dst/{i['id']}.jpg", quality=85)
        except Exception:
            pass
    driver.quit()
    print()
    print('#DONE')
    # 16.04508686065674
    # 16.138192653656006

I guess the problem is in webdriver.Chrome(options=options):

def main_sync_bad(data):
    options = get_options()
    [
        take_page_scr(
            i, webdriver.Chrome(options=options)
        ) for i in data
    ]
    print()
    print('#DONE')
    # 76.43093585968018
    # 78.09915900230408

but I do not know how to propagate it across many threads.
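
One hedged sketch of how that could look (not from the original post; take_page_scr_pooled, main_pooled and num_workers are illustrative names, and get_options() is assumed to be the same helper used in the snippets above): keep a small, fixed pool of drivers in a thread-safe queue.Queue and let each asyncio.to_thread call borrow a driver instead of launching a new Chrome.

import asyncio
import io
import queue

from PIL import Image
from selenium import webdriver


def take_page_scr_pooled(dict_item, drivers) -> None:
    # borrow a driver from the pool and always return it
    driver = drivers.get()
    try:
        driver.get(dict_item['url'])
        driver.set_window_size(500, 900)
        (
            Image.open(io.BytesIO(driver.get_screenshot_as_png()))
            .convert("RGB")
            .save(f"./dst/{dict_item['id']}.jpg", quality=85)
        )
    finally:
        drivers.put(driver)


def main_pooled(data, num_workers=4):
    async def main(data):
        options = get_options()  # same helper as in the snippets above
        drivers = queue.Queue()
        for _ in range(num_workers):
            drivers.put(webdriver.Chrome(options=options))
        try:
            await asyncio.gather(
                *(
                    asyncio.to_thread(take_page_scr_pooled, i, drivers)
                    for i in data
                )
            )
        finally:
            # quit the drivers that have been returned to the pool
            while not drivers.empty():
                drivers.get_nowait().quit()
        print()
        print('#DONE')

    asyncio.run(main(data))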

CodePudding user response:

And the answer is: just don't use poorly designed things like lxml or Selenium; use well-documented, well-designed and well-supported libraries such as Playwright, even if they are from Microsoft...

import asyncio
from pathlib import Path

from playwright.async_api import BrowserContext, async_playwright

# TestData and the data() helper are the answerer's own definitions and are not shown here


async def coro(dict_item: TestData, browser: BrowserContext) -> None:
    print(dict_item["hash"])
    page = await browser.new_page()
    await page.goto(dict_item["url"])
    await page.screenshot(path=f'./res/{dict_item["hash"]}.jpg', type="jpeg")


async def main() -> None:
    async with async_playwright() as p:
        browser: BrowserContext = await p.chromium.launch_persistent_context(
            user_data_dir=f"{Path.home()}/.config/chromium",
            executable_path="/usr/bin/chromium",
            viewport={
                "width": 500,
                "height": 900
            },
        )
        await asyncio.gather(*(coro(i, browser) for i in data()))
        await browser.close()
    # 7.495748519897461
    # 2.4119105339050293
    # 2.3972327709198

It does the job more than 2-3 times faster, and working with it is pure pleasure!
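
For completeness, one minimal way to drive the coroutine above, assuming Playwright is installed (pip install playwright) and a Chromium binary actually exists at the executable_path shown; the timing print is only an illustration of how numbers like the ones above can be taken:

if __name__ == "__main__":
    import time

    start = time.time()
    asyncio.run(main())
    print(time.time() - start)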
