Home > Back-end >  I am trying to scrape a website, but it returns 404 not found error
I am trying to scrape a website, but it returns 404 not found error

Time:12-03

Here I am trying to retrieve all the internship offers ("stage" in French) from LinkedIn.

If I do the same on a simple website and change my search parameters, it works. I cannot see what I am doing wrong.

// Port number handed to app.listen() when the server starts.
const PORT = 8000
// Web framework that provides the app and its route registration.
const express = require('express')
// HTTP client used to download the page that is scraped.
const axios = require('axios')
// HTML parser that exposes a jQuery-style selector API over the downloaded markup.
const cheerio = require('cheerio')

// Express application instance; the route handlers are registered on it.
const app = express()
// Module-level array, created once when the file is loaded.
// NOTE(review): anything stored here outlives a single request — confirm that is intended.
const articles = []

// Root route: answers every GET / with the JSON string "Scraping".
app.get('/', function handleRoot(_req, res) {
    res.json('Scraping')
})

// GET /news: downloads the LinkedIn jobs page and responds with a JSON array
// of { title, url } objects, one per <a> element whose text contains "stage".
// If the download or the parsing fails, the error is logged and the client
// receives a 502 response with a JSON error body.
app.get('/news', (req, res) => {

    axios.get('https://www.linkedin.com/jobs/')
         .then((response) => {
            const $ = cheerio.load(response.data)

            // Built fresh for every request so repeated calls do not return
            // entries left over from earlier requests.
            const links = []

            // Query the loaded document directly. The raw HTML string must not
            // be passed as the second (context) argument of the selector call.
            $('a:contains("stage")').each(function () {
                const title = $(this).text()
                const url = $(this).attr('href')
                links.push({ title, url })
            })

            res.json(links)
         })
         .catch((err) => {
            console.log(err)
            // Always answer the client; without this the request would hang
            // until it times out whenever the upstream fetch fails.
            if (!res.headersSent) {
                res.status(502).json({ error: 'Failed to scrape the jobs page' })
            }
         })
})


// Start the HTTP server. The message is a template literal (backticks) so the
// port number is actually interpolated instead of printing "${8000}" verbatim.
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))

CodePudding user response:

I was able to scrape for data engineers with this: I removed the `, html` argument from the selector call, so it now reads `$('a:contains("Data")').each`. I added a console.log to the handler, requested http://localhost:8000/news, and it printed some URLs.

    // Port number handed to app.listen() when the server starts.
    const PORT = 8000
    // Web framework that provides the app and its route registration.
    const express = require('express')
    // HTTP client used to download the page that is scraped.
    const axios = require('axios')
    // HTML parser that exposes a jQuery-style selector API over the downloaded markup.
    const cheerio = require('cheerio')

    // Express application instance; the route handlers are registered on it.
    const app = express()
    // Module-level array, created once when the file is loaded.
    // NOTE(review): anything stored here outlives a single request — confirm that is intended.
    const articles = []

    // Root route: answers every GET / with the JSON string "Scraping".
    app.get('/', function handleRoot(_req, res) {
        res.json('Scraping')
    })

// GET /news: downloads the LinkedIn jobs page and responds with a JSON array
// of { title, url } objects, one per <a> element whose text contains "Data".
// If the download or the parsing fails, the error is logged and the client
// receives a 502 response with a JSON error body.
app.get('/news', (req, res) => {
    axios.get('https://www.linkedin.com/jobs/')
        .then((response) => {
            const $ = cheerio.load(response.data)

            // Built fresh for every request so repeated calls do not return
            // entries left over from earlier requests.
            const links = []

            // Find <a> elements whose text contains the word "Data".
            $('a:contains("Data")').each(function () {
                const title = $(this).text()
                const url = $(this).attr('href')
                links.push({ title, url })
            })

            console.log(links)
            res.json(links)
        })
        .catch((err) => {
            console.log(err)
            // Always answer the client; without this the request would hang
            // until it times out whenever the upstream fetch fails.
            if (!res.headersSent) {
                res.status(502).json({ error: 'Failed to scrape the jobs page' })
            }
        })
})

  // Start the HTTP server. The message is a template literal (backticks) so the
  // port number is actually interpolated instead of printing "${8000}" verbatim.
  app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))
  • Related