Home > Mobile >  scraping a dictionary website with puppeteer
scraping a dictionary website with puppeteer

Time:02-23

I'm trying to scrape a dictionary website (http://rjecnik.hr/) to collect all the words under every letter, and so far I have managed it only partially. I can loop through the pages of a single letter, but I can't work out how to loop through every letter and then save the results to a file. I have searched all over the internet and just can't find a solution to my problem. I should add that I'm a complete beginner at programming and still learning, so there is probably a simple solution I can't see. Here is the code; I didn't write it myself, but I understand what every part does.

// Browser
const puppeteer = require('puppeteer');

// funkcija se odnosi na puppeteer
(async () => {
    // Extract the words from a page, then keep following "?page=N+1" of the same URL
    // until a page contains no words. Resolves to every word found, in page order.
    // Relies on the `browser` instance created in the enclosing scope.
    const izvuciRijeci = async (url) =>
    {
        const brojStranice = /page=(\d+)$/;
        const sveRijeci = [];
        let trenutniUrl = url;

        for (;;)
        {
            // Scraping the data we want. The tab is closed even if navigation or evaluation fails.
            const page = await browser.newPage();
            let rijeciNaStranici;
            try
            {
                await page.goto(trenutniUrl);
                //console.log(`Scraping: ${trenutniUrl}`); // Debugging
                rijeciNaStranici = await page.evaluate(() => Array.from(document.querySelectorAll('.word')).map((rijeci) => rijeci.innerText.trim())); // Getting the words from a page.
            }
            finally
            {
                await page.close();
            }

            // Stop if there are no more words.
            if (rijeciNaStranici.length < 1)
            {
                return sveRijeci;
            }
            sveRijeci.push(...rijeciNaStranici);

            // Get the next page using "?page=X+1", keeping the rest of the URL (including the letter) intact.
            const pogodak = trenutniUrl.match(brojStranice);
            if (pogodak === null)
            {
                throw new Error(`URL must end with "page=<number>": ${trenutniUrl}`);
            }
            const nextPageNumber = parseInt(pogodak[1], 10) + 1;
            trenutniUrl = trenutniUrl.replace(brojStranice, `page=${nextPageNumber}`);
        }
    }

    const fs = require('fs');

    const browser = await puppeteer.launch();
    try
    {
        // Only the letter "a" is scraped here, starting from its first page.
        const url = "http://rjecnik.hr/?letter=a&page=1";
        const rijec = await izvuciRijeci(url);

        // Todo: update the database with the words.
        console.log(rijec);

        // Save to file. Array#toString joins the words with commas.
        const content = rijec.toString();
        try
        {
            await fs.promises.writeFile("rijeci.txt", content);
            console.log("File saved");
        }
        catch (err)
        {
            console.log(err);
        }
    }
    finally
    {
        // Always shut the browser down, even when scraping throws, so the process can exit.
        await browser.close();
    }
})();

CodePudding user response:

Please select this solution as the right answer if you find it's useful and helpful.

First, you don't need to open and close a page every time you load a new URL. You can simply reuse the page that is already open once the browser has launched.

// Opening a fresh page for every URL is not efficient:
//   const page = await browser.newPage()
// ...and closing it after every navigation is unnecessary and heavy:
//   await page.close()
// Instead, reuse the first page the browser already has open:
const page = (await browser.pages())[0] // <= lighter: no page is created or destroyed per URL

Then you need to list all of available letters in a single array:

// Load `url` in the browser's first page and resolve to the text of every
// `.alphabet ul > li` element. Labels are trimmed so they can be compared
// with `===` and interpolated into URLs without stray whitespace.
// Relies on the `browser` instance from the enclosing scope.
const getLettersArray = async (url) => {
    const page = (await browser.pages())[0] // Use the first page already opened, to keep it light
    await page.goto(url)
    return page.evaluate(() =>
        Array.from(document.querySelectorAll('.alphabet ul > li'), (element) => element.innerText.trim())
    )
}

Then, to find out which letter is currently selected, you can use a regular expression like the one below. (NOTE: since the dictionary uses some non-English characters, I've used the quantifier {1,6} so that the letter may be up to six characters long.)

// Expects a URL ending in "letter=<letter>&page=<digits>"; the letter may be one to six characters long.
const letterInUse = url.match(/letter=(.{1,6})&page=(\d+)$/)[1] // Get the letter used in the page

I've added a few more pieces as well, so you can run the fully functional script below:

// Browser
const puppeteer = require('puppeteer')
const fs = require('fs')

// funkcija se odnosi na puppeteer
;(async () => {
    // Load `url` in the browser's first page and resolve to the text of every
    // `.alphabet ul > li` element. Labels are trimmed so they can be compared
    // with `===` and interpolated into URLs without stray whitespace.
    const getLettersArray = async (url) => {
        const page = (await browser.pages())[0] // Use the first page already opened, to keep it light
        await page.goto(url)
        return page.evaluate(() =>
            Array.from(document.querySelectorAll('.alphabet ul > li'), (element) => element.innerText.trim())
        )
    }
    // Extract the words from the site, starting at the letter and page named in `url`.
    //
    // Pages of the current letter are visited in order ("?page=N", "?page=N+1", ...) until
    // one has no words; then the crawl moves on to page 1 of the next entry in `allLetters`.
    // A letter whose first page is empty is skipped, it does not end the crawl.
    //
    // url        - must carry "letter" and "page" query parameters.
    // allLetters - ordered letter labels; when the starting letter is not among them
    //              (or the list is omitted) only the starting letter is scraped.
    // Resolves to every word found, in visiting order. Throws an Error when `url` lacks
    // a letter or a numeric page.
    const izvuciRijeci = async (url, allLetters = []) => {
        const start = new URL(url)
        const startLetter = start.searchParams.get('letter')
        const startPage = Number.parseInt(start.searchParams.get('page'), 10)
        if (!startLetter || Number.isNaN(startPage)) {
            throw new Error(`Expected "letter" and "page" query parameters in: ${url}`)
        }

        // Compare letters case-insensitively and ignoring stray whitespace.
        const normalise = (letter) => letter.trim().toUpperCase()
        const startIndex = allLetters.findIndex((letter) => normalise(letter) === normalise(startLetter))
        // An unknown starting letter has no "next" letter; never wrap around to the first one.
        const remainingLetters = startIndex === -1 ? [] : allLetters.slice(startIndex + 1).map((letter) => letter.trim())
        const lettersToVisit = [startLetter, ...remainingLetters]

        // Use the first page already opened, to keep it light; open one only if none exists.
        const openPages = await browser.pages()
        const page = openPages[0] ?? (await browser.newPage())

        const sveRijeci = []
        for (const [position, letter] of lettersToVisit.entries()) {
            // Only the starting letter resumes from the requested page; the rest begin at page 1.
            let pageNumber = position === 0 ? startPage : 1
            for (;;) {
                const target = new URL(start)
                target.searchParams.set('letter', letter)
                target.searchParams.set('page', String(pageNumber))
                await page.goto(target.href)
                // console.log(`Scraping: ${target.href}`) // Debugging
                const rijeciNaStranici = await page.evaluate(() => Array.from(document.querySelectorAll('.word')).map((rijeci) => rijeci.innerText.trim())) // Getting the words from a page.

                // Stop paging through this letter once a page has no words.
                if (rijeciNaStranici.length < 1) {
                    break
                }
                sveRijeci.push(...rijeciNaStranici)
                pageNumber += 1
            }
        }
        return sveRijeci
    }

    const browser = await puppeteer.launch({headless: true})
    try {
        const url = "http://rjecnik.hr/?letter=a&page=1"
        const allLetters = await getLettersArray(url)
        const rijec = await izvuciRijeci(url, allLetters)

        // Todo: update the database with the words.
        console.log(rijec)

        // Save to file. Array#toString joins the words with commas.
        const content = rijec.toString()
        try {
            await fs.promises.writeFile('rijeci.txt', content)
            console.log('File saved')
        } catch (error) {
            console.log(error)
        }
    } finally {
        // Always shut the browser down, even when scraping throws, so the process can exit.
        await browser.close()
    }
})()
  • Related