Is there a way to retrieve the number of citations per year of an article using the inspirehep website? - CodePudding

I have a database of scientific articles with their authors, the date of publication (on arXiv) and their respective arXiv id. Now, I want to add to this database the number of citations each article received in each year after it was published.

For instance, I would like to retrieve the graph on the right-hand side (example).

Is there an API that could help me?

I could use this method here opencitationAPI, but I wondered if there was a more straightforward way using the inspirehep data.

CodePudding user response：

I figured out how to do that by using the inspirehep API. A pause (sleep) between requests should also be considered.

import pandas as pd
import requests
from collections import defaultdict

# INSPIRE-HEP endpoint that resolves an arXiv id to its literature record.
ihep_search_arxiv = "https://inspirehep.net/api/arxiv/"
# Literature search for records that cite a given record; the record's
# control number is appended to the end of this query string.
# NOTE(review): size=100&page=1 requests only the first page of results —
# confirm whether articles with more than 100 citing records need paging.
ihep_search_article = "https://inspirehep.net/api/literature?sort=mostcited&size=100&page=1&q=refersto:recid:"

# Years (as strings) used as the citation-count columns: "2010" .. "2022".
# The original line read `str(x 1)`, which is not valid Python; the `+`
# operator was restored on the assumption it was lost when the snippet
# was copied.
year = [str(x + 1) for x in range(2009, 2022)]

def count_year(year, input_list):
    """Count how many entries of ``input_list`` fall in each requested year.

    Args:
        year: Iterable of year labels (strings such as ``"2015"``) to report.
        input_list: List of year strings, one per citation. A list whose
            first element is the sentinel ``'NaN'`` means "no citations".

    Returns:
        A dict mapping every label in ``year`` to its number of occurrences
        in ``input_list``. All counts are 0 when ``input_list`` is empty or
        starts with ``'NaN'``.
    """
    # An empty list used to raise IndexError on input_list[0]; treat it the
    # same as the explicit 'NaN' sentinel. The check is done once up front
    # instead of once per year.
    if not input_list or input_list[0] == 'NaN':
        return {y: 0 for y in year}

    return {y: input_list.count(y) for y in year}

def get_cnumber(timeout=30):
    """Build the citing-records search URL for every id in ``arxiv_id``.

    For each entry of the module-level ``arxiv_id`` list, the arXiv lookup
    endpoint is queried, the record's ``metadata.control_number`` is read
    from the JSON response, and that number is appended to
    ``ihep_search_article``.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of URLs, one per arXiv id, in the same order as ``arxiv_id``.

    Raises:
        requests.HTTPError: If a lookup returns an error status (for
            example an id the server does not know).
        requests.Timeout: If a lookup does not answer within ``timeout``.
    """
    citation_url = []

    # `arxiv_identifier` rather than `id`, to avoid shadowing the builtin.
    for arxiv_identifier in arxiv_id:
        response = requests.get(
            f"{ihep_search_arxiv}{arxiv_identifier}", timeout=timeout
        )
        # Fail with the real HTTP error instead of a KeyError on "metadata".
        response.raise_for_status()
        control_number = response.json()["metadata"]["control_number"]
        citation_url.append(f"{ihep_search_article}{control_number}")

    return citation_url


def get_citations(timeout=30):
    """Return a table of citation counts per year for every arXiv id.

    Each URL produced by ``get_cnumber()`` is fetched, the first four
    characters of every hit's ``"created"`` field are taken as the year,
    and ``count_year`` tallies them against the module-level ``year`` list.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of ``arxiv_id``: an
        ``"arxiv_id"`` column followed by one count column per label in
        ``year``.

    Raises:
        requests.HTTPError: If a search request returns an error status.
        requests.Timeout: If a search does not answer within ``timeout``.
    """
    rows = []

    for url in get_cnumber():
        response = requests.get(url, timeout=timeout)
        # Fail with the real HTTP error instead of a KeyError on "hits".
        response.raise_for_status()
        hits = response.json()["hits"]["hits"]

        # NOTE(review): only the hits contained in this single response are
        # counted; the search URL asks for one page of results — confirm
        # whether heavily cited articles need additional pages.
        citation_years = [hit["created"][:4] for hit in hits]
        if not citation_years:
            # count_year interprets a leading 'NaN' as "no citations".
            citation_years = ['NaN']

        rows.append(count_year(year, citation_years))

    # Build the frame in one step: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    citation_per_year = pd.DataFrame(rows, columns=year)
    citation_per_year.insert(0, "arxiv_id", arxiv_id, True)

    return citation_per_year


# arxiv_id is read as a module-level global by get_cnumber() and
# get_citations(), so it must be assigned before get_citations() is called.
# recollect_data() is not defined in this snippet.
arxiv_id = recollect_data()  # list of arXiv ids, collected in a separate way

# Fetch the per-year citation table for every id and print it.
print(get_citations())