Data scraping to get not only the h3 tag text but also the link and picture that go with it-CodePudding

I currently have this code that is able to retrieve the headlines of articles from und.com but I am also trying to get the a tag to the website page and the picture as well. I am struggling to get the code to display the correct data to the server and was wondering what I can do to get it to display the correct information. Below is the code I am trying right now.

    const express = require("express");
    const cheerio = require("cheerio");
    const axios = require("axios");

    const app = express();

    // CORS headers for the client served from http://localhost:3000.
    // Express runs middleware and routes in registration order, and a route
    // handler that ends the response without calling next() stops the chain.
    // This middleware is therefore registered BEFORE the routes so that every
    // response, including the redirect sent by POST /post, carries the headers.
    app.use((req, res, next) => {
        res.header('Access-Control-Allow-Origin', 'http://localhost:3000');
        res.header('Access-Control-Allow-Methods', 'GET');
        res.header('Access-Control-Allow-Headers', 'Content-Type');
        next();
    });

    // POST /post: log that the client reached the server, then redirect to "/".
    app.post("/post", (req, res) => {
        console.log("Connected to react");
        res.redirect("/");
    });
    // Listening port: the PORT environment variable when it is set, otherwise 3002.
    const PORT = process.env.PORT || 3002;

    // Page that is fetched and scraped.
    const website = "https://und.com";

    // Options passed to axios along with the URL; the request is sent with a
    // desktop Chrome user-agent header.
    const options = {
        headers: {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        },
    };

    // GET /: fetch `website`, extract the article headlines, and return them as JSON.
    app.get("/", (req, res) => {
        // Make the HTTP request using axios
        axios(website, options)
            .then(({ data }) => {
                // Parse the fetched HTML so it can be queried with CSS selectors.
                const $ = cheerio.load(data);

                // Headline text: the last link inside the <h3> of every .post__meta element.
                const result = $(".post__meta h3 a:last-child")
                    .map((_, e) => $(e).text().trim())
                    .get();
                console.log(result);

                // `result` is an array of strings, not HTML markup. res.json() serialises
                // it and labels the response as JSON, instead of sending a JSON body
                // under a "text/html" Content-Type.
                res.json(result);
            })
            .catch((error) => {
                // Fetching or parsing failed: log the cause and answer with a 500.
                console.log("An error occurred:", error);
                res.status(500).send("An error occurred while making the HTTP request.");
            });
    });

    // Start accepting connections and report the port in use.
    app.listen(PORT, function onListening() {
        console.log(`server is running on PORT: ${PORT} `);
    });

CodePudding user response：

Cheerio is a fast way to scrape the web. Its limitation is that it cannot handle dynamic sites.

Loading a full browser would take a lot of resources because it has to load a lot of other things like the toolbar and buttons. These UI elements are not needed when everything is being controlled with code. Fortunately, there are better solutions – headless browsers.

A headless browser is simply a browser but without a graphical user interface. Think of it as a hidden browser.

Puppeteer is one of the best tools for controlling a headless browser.

I made a program with Express and Puppeteer for your https://und.com website.

It shows article list with title, image link, tag, link and category

Example

  {
    "title": "Together Irish",
    "image_link": "https://und.com/imgproxy/BPEVHq4HccPML1rEeouDDHSwrg2Kd_uUryxDl6o1b-Q/fit/1024/619/ce/0/aHR0cHM6Ly91bmQuY29tL3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDIyLzA5L3dlYi1oZWFkZXJfdG9nZXRoZXJpcmlzaC5wbmc.png",
    "tag": "Together Irish",
    "link": "https://und.com/stand-together-notre-dame-athletics-commitment-to-change/",
    "category": "Athletics Communications"
  },

Puppeteer CSS selector idea for scraping

Title and image path

"div.post__thumb a span.image.lazy"

Title and category path

".post__meta h3 a:last-child"

Full code

const express = require("express")
const cors = require('cors');
const puppeteer = require("puppeteer");

// Express application instance that the routes below are registered on.
const app = express()
// Port the HTTP server listens on.
const port = 3002

// Register the cors middleware, with its default options, for every route.
// NOTE(review): the defaults presumably allow requests from any origin — confirm against the cors package docs.
app.use(cors());

/**
 * Scrape the article cards from a page with a Puppeteer-controlled browser.
 *
 * Titles/categories come from ".post__meta h3 a:last-child" and images from
 * "div.post__thumb a span.image.lazy"; the two lists are paired by index.
 *
 * @param {string} url - Page to open and scrape.
 * @returns {Promise<Array<{title: ?string, image_link: ?string, tag: string,
 *   link: string, category: ?string}>>} One entry per title link. Fields whose
 *   source element is missing are null.
 * @throws Rejects with the underlying error when launching the browser,
 *   navigating, or evaluating the page fails.
 */
async function getArticles (url) {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);

        // The callback runs inside the page, so it can only use page globals.
        return await page.evaluate(() => {
            const titleLinks = Array.from(document.querySelectorAll(".post__meta h3 a:last-child"));
            const imageSpans = Array.from(document.querySelectorAll("div.post__thumb a span.image.lazy"));

            return titleLinks.map((titleLink, i) => {
                // Guard every lookup: one card without an image or category
                // must not make the whole scrape throw.
                const imageSpan = imageSpans[i];
                const caption = imageSpan ? imageSpan.nextElementSibling : null;
                const categoryLink = titleLink.previousElementSibling;

                return {
                    title: caption ? caption.textContent.trim() : null,
                    image_link: imageSpan ? imageSpan.getAttribute('data-bg') : null,
                    tag: titleLink.textContent.trim(),
                    // The anchor's `href` property is already resolved to an absolute
                    // URL by the browser, so no host needs to be hard-coded here.
                    link: titleLink.href,
                    category: categoryLink ? categoryLink.textContent.trim() : null
                };
            });
        });
    } finally {
        // Always release the browser process, including when scraping fails.
        await browser.close();
    }
}

// GET /info: scrape https://und.com/ and respond with the article list as JSON.
app.get("/info", (req, res) => {
    getArticles('https://und.com/')
        .then((articles) => {
            console.log(articles);
            console.log(articles.length);
            res.status(200).json(articles);
        })
        .catch((error) => {
            // Without a rejection handler a failed scrape would leave the request
            // hanging and surface as an unhandled promise rejection.
            console.error("Failed to scrape articles:", error);
            res.status(500).json({ error: "Failed to scrape articles." });
        });
});

// Bind the server to the configured port and log once it is listening.
app.listen(port, () => {
    console.log(`Server started, port: ${port}`);
});

Install and run it

npm install express cors puppeteer
node server.js

Result - when you access the Express server

http://localhost:3002/info

CodePudding user response：

Here is a pure cheerio solution:

/**
 * Extract the post cards from an HTML document.
 *
 * @param {string} input - HTML source to parse.
 * @param {object} cheerio - The cheerio module (anything exposing `load`).
 * @returns {Array<{category: string, title: string, img: (string|undefined)}>}
 *   One entry per ".post__meta" element, in document order.
 */
function extract(input, cheerio) {
    const $ = cheerio.load(input);

    return $(".post__meta")
        .map((_index, meta) => {
            // The <h3> holds the links: first child is the category, second the title.
            const heading = $("h3", meta);
            const category = heading.find("a:nth-child(1)").text().trim();
            const title = heading.find("a:nth-child(2)").text().trim();

            // The image URL is the data-bg attribute of the ".image" element
            // inside the sibling that precedes the meta block.
            const img = $(meta).prev().find(".image").attr("data-bg");

            return { category, title, img };
        })
        .toArray();
}

results:

[
    {
        "category": "Athletics Communications",
        "title": "Together Irish",
        "img": "https://und.com/imgproxy/BPEVHq4HccPML1rEeouDDHSwrg2Kd_uUryxDl6o1b-Q/fit/1024/619/ce/0/aHR0cHM6Ly91bmQuY29tL3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDIyLzA5L3dlYi1oZWFkZXJfdG9nZXRoZXJpcmlzaC5wbmc.png"
    },
    {
        "category": "Football",
        "title": "Mayer, Alt and Foskey Named Associated Press All-Americans",
        "img": "https://und.com/imgproxy/hGlq4hjeum4XvBCd69FPHAKHMz1sbhondxvzePmvYdk/fit/1024/576/ce/0/aHR0cHM6Ly91bmQuY29tL3dwLWNvbnRlbnQvdXBsb2Fkcy8yMDIyLzEyL2Fzc29jaWF0ZWQtcHJlc3MtMTZ4OS0xLmpwZw.jpg"
    }
]

run this cheerio extractor on test HTML here: https://scrapeninja.net/cheerio-sandbox?slug=eaa3eeb5c55284274880b4c2714715a1ffe6839c

Example

puppeteer xpath selector idea for scrapping

Full code

Install and run it

Result - if access express server

puppeteer `xpath` selector idea for scrapping