I have a database of scientific articles with their authors, their date of publication (on arXiv) and their respective arXiv ids. Now I want to add to this database the number of citations each article received in each year since it was published.
For instance, I would like to retrieve the data behind the graph on the right-hand side (example).
Is there an API that could help me?
I could use the OpenCitations API method described here, but I wondered whether there is a more straightforward way using the INSPIRE-HEP data.
CodePudding user response:
I figured out how to do this using the INSPIRE-HEP API. A pause (sleep) between requests should also be considered.
import pandas as pd
import requests
from collections import defaultdict
# INSPIRE endpoint that resolves an arXiv id to its literature record.
ihep_search_arxiv = "https://inspirehep.net/api/arxiv/"
# INSPIRE literature search for records that cite a given record; the record's
# control number is appended to this prefix. Only the first page of up to 100
# results is requested.
ihep_search_article = "https://inspirehep.net/api/literature?sort=mostcited&size=100&page=1&q=refersto:recid:"
# Year labels "2010" .. "2022", used as citation-count keys / DataFrame columns.
# The "+" was missing in the original (`str(x 1)`), which is a syntax error.
year = [str(x + 1) for x in range(2009, 2022)]
def count_year(year, input_list):
    """Count the number of citations recorded for each year.

    Args:
        year: Iterable of year labels (e.g. ``"2015"``) to report on.
        input_list: List of citation years, in the same representation as the
            labels in ``year``. An empty list, or a list whose first element
            is the sentinel ``'NaN'``, means "no citations".

    Returns:
        dict mapping every label in ``year`` to the number of times it occurs
        in ``input_list`` (0 when it does not occur or there are no citations).
    """
    # Decide once, before looping, whether there is anything to count. The
    # emptiness check also avoids an IndexError on ``input_list[0]``.
    if not input_list or input_list[0] == 'NaN':
        return {y: 0 for y in year}
    return {y: input_list.count(y) for y in year}
def get_cnumber():
    """Build the INSPIRE citation-search URL for every arXiv id.

    Reads the module-level ``arxiv_id`` list. Each id is looked up through the
    ``ihep_search_arxiv`` endpoint, and the ``metadata.control_number`` of the
    returned record is appended to ``ihep_search_article``.

    Returns:
        list of URL strings, one per entry of ``arxiv_id``, in the same order.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError: If a response has no ``metadata``/``control_number`` entry.
    """
    citation_url = []
    # Loop variable renamed so it no longer shadows the builtin ``id``.
    for arxiv_identifier in arxiv_id:
        inspirehep_url_arxiv = f"{ihep_search_arxiv}{arxiv_identifier}"
        # Without a timeout, requests can block indefinitely on a stalled
        # connection.
        response = requests.get(inspirehep_url_arxiv, timeout=30)
        control_number = response.json()["metadata"]["control_number"]
        citation_url.append(f"{ihep_search_article}{control_number}")
    return citation_url
def get_citations():
    """Return a table of per-year citation counts for every arXiv id.

    For each URL produced by ``get_cnumber()`` the search result is fetched
    and the first four characters of every hit's ``"created"`` field are
    taken as that citation's year. The years are tallied with
    ``count_year`` against the module-level ``year`` labels.

    NOTE(review): the query URL asks for a single page of at most 100 hits,
    so articles with more citing records appear to be undercounted. Confirm
    whether pagination is needed.

    Returns:
        pandas.DataFrame with one row per entry of the module-level
        ``arxiv_id`` list: an ``"arxiv_id"`` column followed by one column
        per label in ``year``.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError: If a response lacks the expected ``hits``/``created`` keys.
    """
    citation_url = get_cnumber()
    rows = []
    for url in citation_url:
        # Without a timeout, requests can block indefinitely on a stalled
        # connection.
        hits = requests.get(url, timeout=30).json()["hits"]["hits"]
        if hits:
            citation_years = [hit["created"][:4] for hit in hits]
        else:
            # Sentinel understood by count_year as "no citations".
            citation_years = ['NaN']
        rows.append(count_year(year, citation_years))
    # Build the frame once from all rows: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    citation_per_year = pd.DataFrame(rows, columns=year)
    citation_per_year.insert(0, "arxiv_id", arxiv_id, True)
    return citation_per_year
# Module-level list of arXiv ids; get_cnumber() and get_citations() read this
# global, so it must be assigned before get_citations() is called.
# NOTE(review): recollect_data() is not defined in this snippet and must be
# supplied by the user.
arxiv_id = recollect_data() #list of arxiv ids collected in a separate way
# Fetch the per-year citation table for those ids and print it.
print(get_citations())