Empty list as json response although code is running

Time: 11-27

I am trying to run the following Python script to extract data from Google Scholar. However, when I run the code, I get an empty list as the JSON response. Note that all the necessary libraries are installed.

import requests
from bs4 import BeautifulSoup
import json

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}

html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
soup = BeautifulSoup(html, 'lxml')

# JSON data will be collected here
data = []

# Container where all needed data is located
for result in soup.select('.gs_r.gs_or.gs_scl'):
    title = result.select_one('.gs_rt').text
    title_link = result.select_one('.gs_rt a')['href']
    publication_info = result.select_one('.gs_a').text
    snippet = result.select_one('.gs_rs').text
    cited_by = result.select_one('#gs_res_ccl_mid .gs_nph  a')['href']
    related_articles = result.select_one('a:nth-child(4)')['href']
    try:
        all_article_versions = result.select_one('a~ a  .gs_nph')['href']
    except:
        all_article_versions = None
    
    try:
        pdf_link = result.select_one('.gs_or_ggsm a:nth-child(1)')['href']
    except: 
        pdf_link = None

    data.append({
        'title': title,
        'title_link': title_link,
        'publication_info': publication_info,
        'snippet': snippet,
        'cited_by': f'https://scholar.google.com{cited_by}',
        'related_articles': f'https://scholar.google.com{related_articles}',
        'all_article_versions': f'https://scholar.google.com{all_article_versions}',
        "pdf_link": pdf_link
    })

print(json.dumps(data, indent = 2, ensure_ascii = False))

Output: []
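
A quick check of whether the container selector matches anything in the fetched HTML (reusing the same `headers` and `params` as above) looks like this; an empty list from `select()` usually means Scholar returned a page without result markup, for example a consent or CAPTCHA page:

import requests
from bs4 import BeautifulSoup

# Reuses the headers/params defined above (assumption: same query)
response = requests.get('https://scholar.google.com/scholar',
                        headers=headers, params=params)
print(response.status_code)  # a 200 alone does not guarantee result markup

soup = BeautifulSoup(response.text, 'lxml')
print(len(soup.select('.gs_r.gs_or.gs_scl')))  # 0 means the container selector matched nothing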

CodePudding user response:

Your code is working fine; the problem is saving the scraped data in JSON format correctly. You can use pandas, whose DataFrame is a powerful and easy tool for storing data as JSON:

from bs4 import BeautifulSoup
import requests
#import json
import pandas as pd

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}

html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
soup = BeautifulSoup(html, 'lxml')
#print(soup.prettify())

# JSON data will be collected here
data = []

# Container where all needed data is located
for result in soup.select('.gs_r.gs_or.gs_scl'):
    title = result.select_one('.gs_rt').text
    title_link = result.select_one('.gs_rt a')['href']
    publication_info = result.select_one('.gs_a').text
    snippet = result.select_one('.gs_rs').text
    cited_by = result.select_one('#gs_res_ccl_mid .gs_nph  a')['href']
    related_articles = result.select_one('a:nth-child(4)')['href']
    try:
        all_article_versions = result.select_one('a~ a  .gs_nph')['href']
    except:
        all_article_versions = None
    
    try:
        pdf_link = result.select_one('.gs_or_ggsm a:nth-child(1)')['href']
    except: 
        pdf_link = None

    data.append({
        'title': title,
        'title_link': title_link,
        'publication_info': publication_info,
        'snippet': snippet,
        'cited_by': f'https://scholar.google.com{cited_by}',
        'related_articles': f'https://scholar.google.com{related_articles}',
        'all_article_versions': f'https://scholar.google.com{all_article_versions}',
        "pdf_link": pdf_link
    })

#print(json.dumps(data, indent = 2, ensure_ascii = False))

# to_json writes the file directly when given a path, so no assignment is needed
pd.DataFrame(data).to_json('out.json', indent=4)

Output:

{
    "title": {
        "0": "[BOOK][B] Machine learning",
        "1": "[BOOK][B] Machine learning",
        "2": "Machine learning",
        "3": "Machine learning: Trends, perspectives, and prospects",
        "4": "[PDF][PDF] Machine learning algorithms-a review",
        "5": "What is machine learning?",
        "6": "[PDF][PDF] Machine learning basics",
        "7": "What is machine learning? A primer for the epidemiologist",
        "8": "[BOOK][B] Readings in machine learning",
        "9": "[BOOK][B] Encyclopedia of machine learning"
    },
    "title_link": {
        "0": "https:\/\/books.google.com\/books?hl=en&lr=&id=ctM-EAAAQBAJ&oi=fnd&pg=PR6&dq=Machine learning&ots=oZOqY0Vw_r&sig=Ide7KdAOWXxQwQKPxJKaps4Ag0g",
        "1": "https:\/\/profs.info.uaic.ro\/~ciortuz\/SLIDES\/2017s\/ml0.pdf",
        "2": "https:\/\/www.annualreviews.org\/doi\/pdf\/10.1146\/annurev.cs.04.060190.001351",
        "3": "https:\/\/www.science.org\/doi\/abs\/10.1126\/science.aaa8415",
        "4": "https:\/\/www.researchgate.net\/profile\/Batta-Mahesh\/publication\/344717762_Machine_Learning_Algorithms_-A_Review\/links\/5f8b2365299bf1b53e2d243a\/Machine-Learning-Algorithms-A-Review.pdf?eid=5082902844932096",
        "5": "https:\/\/link.springer.com\/chapter\/10.1007\/978-3-319-18305-3_1",
        "6": "http:\/\/whdeng.cn\/Teaching\/PPT_01_Machine learning Basics.pdf",
        "7": "https:\/\/academic.oup.com\/aje\/article-abstract\/188\/12\/2222\/5567515",
        "8": "https:\/\/books.google.com\/books?hl=en&lr=&id=UgC33U2KMCsC&oi=fnd&pg=PA1&dq=Machine learning&ots=Thlmkd7Io7&sig=8wkVF31S9nKRAOY8a-OOF8DWRGI",
        "9": "https:\/\/books.google.com\/books?hl=en&lr=&id=i8hQhp1a62UC&oi=fnd&pg=PT29&dq=Machine learning&ots=91ogCqhE8N&sig=7yz-s1SuD_e6HZe_-_5jF8lbld8"
    },
    "publication_info": {
        "0": "ZH Zhou - 2021 - books.google.com",
        "1": "TM Mitchell, TM Mitchell - 1997 - profs.info.uaic.ro",
        "2": "TG Dietterich\u00a0- Annual review of computer science, 1990 - annualreviews.org",
        "3": "MI Jordan, TM Mitchell\u00a0- Science, 2015 - science.org",
        "4": "B Mahesh\u00a0- International Journal of Science and Research (IJSR)\u00a0\u2026, 2020 - researchgate.net",
        "5": "I El Naqa, MJ Murphy\u00a0- machine learning in radiation oncology, 2015 - Springer",
        "6": "H Wang, Z Lei, X Zhang, B Zhou, J Peng\u00a0- Deep Learn, 2016 - whdeng.cn",
        "7": "Q Bi, KE Goodman, J Kaminsky\u2026\u00a0- American journal of\u00a0\u2026, 2019 - academic.oup.com",
        "8": "JW Shavlik, T Dietterich, TG Dietterich - 1990 - books.google.com",
        "9": "C Sammut, GI Webb - 2011 - books.google.com"
    },
    "snippet": {
        "0": "\u2026 machine learning. The second part includes Chapters 4\u201310, which presents some classic and \npopular machine learning \u2026 cover the core topics of machine learning in one semester, and \u2026",
        "1": "\u2026 Tom Mitchell (Definition of the [general] learning problem): \u201cA computer program is said \nto learn from experience E with respect to some class of tasks T and performance measure P\u00a0\u2026",
        "2": "Recent progress in the study of machine learning methods has taken many directions. First, \nin the area of inductive learning, a new formal definition of learning introduced by Leslie \u2026",
        "3": "\u2026 Machine learning addresses the question of how to build computers that improve \u2026 Recent \nprogress in machine learning has been driven both by the development of new learning \u2026",
        "4": "\u2026 Here\u201fsa quick look at some of the commonly used algorithms in machine learning (ML) \nSupervised Learning Supervised learning is the machine learning task of learning a function \u2026",
        "5": "\u2026 A machine learning algorithm is a computational process that \u2026 This training is the \u201clearning\u201d \npart of machine learning. The \u2026 can practice \u201clifelong\u201d learning as it processes new data and \u2026",
        "6": "\u2026 To obtain theoretical guarantees about generalization of a machine learning algorithm, we \n\u2026 Why does deep learning have different behavior than other machine learning methods for \u2026",
        "7": "\u2026 We provide a brief introduction to 5 common machine learning \u2026 of machine learning \ntechniques in the published literature. We recommend approaches to incorporate machine learning \u2026",
        "8": "\u2026 in machine learning. We have taught from these readings in our own machine learning \u2026 \nFurthermore, we in machine learning believe that learning techniques provide important con\u2026",
        "9": "\u2026 Machine Learning came to be identified as a research field in \u2026 machine learning appeared. \nAlthough the field coalesced in the \uf6dc\uf641\uf640\uf639s, research on what we now call machine learning \u2026"
    },
    "cited_by": {
        "0": "https:\/\/scholar.google.com\/scholar?cites=3387547533016043281&as_sdt=2005&sciodt=0,5&hl=en",
        "1": "https:\/\/scholar.google.com\/scholar?cites=5160851211484945804&as_sdt=2005&sciodt=0,5&hl=en",
        "2": "https:\/\/scholar.google.com\/scholar?cites=7073378272324684978&as_sdt=2005&sciodt=0,5&hl=en",
        "3": "https:\/\/scholar.google.com\/scholar?cites=10883068066968164261&as_sdt=2005&sciodt=0,5&hl=en",
        "4": "https:\/\/scholar.google.com\/scholar?cites=15194857180303073201&as_sdt=2005&sciodt=0,5&hl=en",
        "5": "https:\/\/scholar.google.com\/scholar?cites=13248080025875046634&as_sdt=2005&sciodt=0,5&hl=en",
        "6": "https:\/\/scholar.google.com\/scholar?cites=2537307997858018983&as_sdt=2005&sciodt=0,5&hl=en",
        "7": "https:\/\/scholar.google.com\/scholar?cites=16719333272424362284&as_sdt=2005&sciodt=0,5&hl=en",
        "8": "https:\/\/scholar.google.com\/scholar?cites=2031020440241972606&as_sdt=2005&sciodt=0,5&hl=en",
        "9": "https:\/\/scholar.google.com\/scholar?cites=16791323098365028130&as_sdt=2005&sciodt=0,5&hl=en"
    },
    "related_articles": {
        "0": "https:\/\/scholar.google.com\/scholar?q=related:EQ8shYj8Ai8J:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "1": "https:\/\/scholar.google.com\/scholar?q=related:jF00X9UGn0cJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "2": "https:\/\/scholar.google.com\/scholar?q=related:sgzh8w-wKWIJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "3": "https:\/\/scholar.google.com\/scholar?q=related:pdcI9r5sCJcJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "4": "https:\/\/scholar.google.com\/scholar?q=related:sR_ChBn63tIJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "5": "https:\/\/scholar.google.com\/scholar?q=related:6uA6mpei2rcJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "6": "https:\/\/scholar.google.com\/scholar?q=related:p7YVSi5UNiMJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "7": "https:\/\/scholar.google.com\/scholar?q=related:LDE5SAcBB-gJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "8": "https:\/\/scholar.google.com\/scholar?q=related:fiUuYFSiLxwJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5",
        "9": "https:\/\/scholar.google.com\/scholar?q=related:IufbymTDBukJ:scholar.google.com\/&scioq=Machine learning&hl=en&as_sdt=0,5"
    },
    "all_article_versions": {
        "0": "https:\/\/scholar.google.comNone",
        "1": "https:\/\/scholar.google.com\/scholar?cluster=5160851211484945804&hl=en&as_sdt=0,5",
        "2": "https:\/\/scholar.google.com\/scholar?cluster=7073378272324684978&hl=en&as_sdt=0,5",
        "3": "https:\/\/scholar.google.com\/scholar?cluster=10883068066968164261&hl=en&as_sdt=0,5",
        "4": "https:\/\/scholar.google.com\/scholar?cluster=15194857180303073201&hl=en&as_sdt=0,5",
        "5": "https:\/\/scholar.google.com\/scholar?cluster=13248080025875046634&hl=en&as_sdt=0,5",
        "6": "https:\/\/scholar.google.com\/scholar?cluster=2537307997858018983&hl=en&as_sdt=0,5",
        "7": "https:\/\/scholar.google.com\/scholar?cluster=16719333272424362284&hl=en&as_sdt=0,5",
        "8": "https:\/\/scholar.google.com\/scholar?cluster=2031020440241972606&hl=en&as_sdt=0,5",
        "9": "https:\/\/scholar.google.com\/scholar?cluster=16791323098365028130&hl=en&as_sdt=0,5"
    },
    "pdf_link": {
        "0": null,
        "1": "https:\/\/profs.info.uaic.ro\/~ciortuz\/SLIDES\/2017s\/ml0.pdf",
        "2": "https:\/\/web.engr.oregonstate.edu\/~tgd\/publications\/arcs.ps.gz",
        "3": "http:\/\/www.cs.cmu.edu\/~tom\/pubs\/Science-ML-2015.pdf",
        "4": "https:\/\/www.researchgate.net\/profile\/Batta-Mahesh\/publication\/344717762_Machine_Learning_Algorithms_-A_Review\/links\/5f8b2365299bf1b53e2d243a\/Machine-Learning-Algorithms-A-Review.pdf?eid=5082902844932096",
        "5": null,
        "6": "http:\/\/whdeng.cn\/Teaching\/PPT_01_Machine learning Basics.pdf",
        "7": null,
        "8": null,
        "9": null
    }
}
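
Note that `DataFrame.to_json` defaults to `orient='columns'`, which is why the file above is keyed by column name and then by row index. If you prefer the same list-of-objects layout that `json.dumps(data, ...)` prints, a minimal variant (assuming the same `data` list built in the loop) is:

import pandas as pd

# orient='records' keeps the list-of-dicts shape; force_ascii=False keeps
# non-ASCII characters readable instead of \uXXXX escapes
pd.DataFrame(data).to_json('out.json', orient='records', indent=4, force_ascii=False)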

CodePudding user response:

This will let you write results to a JSON file.

import requests
from bs4 import BeautifulSoup

import json

start_url = 'https://scholar.google.com/scholar'

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

params = {
    'q': 'Machine learning',
    'hl': 'en'
}

res = requests.get(start_url, headers=headers, params=params)
soup = BeautifulSoup(res.text, 'lxml')
data = []
for result in soup.select('#gs_res_ccl_mid > [data-lid]'):
    item_dict = {}
    item_dict['title'] = result.select_one('h3 > a[href]').text
    item_dict['title_link'] = result.select_one('h3 > a[href]')['href']
    item_dict['publication_info'] = result.select_one('.gs_a').text
    item_dict['snippet'] = result.select_one('.gs_rs').text
    item_dict['cited_by'] = result.select_one("a:-soup-contains('Cited by')")['href']
    item_dict['related_articles'] = result.select_one("a:-soup-contains('Related articles')")['href']
    try:
        item_dict['all_article_versions'] = result.select_one("a.gs_nph:-soup-contains('versions')")['href']
    except TypeError:
        item_dict['all_article_versions'] = ""
    
    try:
        item_dict['pdf_link'] = result.select_one('.gs_or_ggsm > a[href]')['href']
    except TypeError: 
        item_dict['pdf_link'] = ""

    data.append(item_dict)


print(json.dumps(data, indent=4))

with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)