Skip to content

Commit

Permalink
Merge pull request #8 from vishalmhjn/vishal_developer
Browse files Browse the repository at this point in the history
Developer
  • Loading branch information
vishalmhjn authored Oct 17, 2023
2 parents 9b06647 + 37686db commit 453e333
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ dmypy.json
# input data and models
input/*
data/*
src/cache/*

# exception to the rule
!input/.gitkeep
Expand Down
6 changes: 2 additions & 4 deletions src/call_scopus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# MIT License

# Copyright (c) 2021 Santhanakrishnan Narayanan

# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -25,12 +24,11 @@
import argparse
from datetime import datetime


API_FILE = "../input/.API"


def create_article_dataframe(allentries):
"create data frame from the extracted json from API response"
"create data frame from the extracted JSON from API response"
articles = pd.DataFrame(
columns=["title", "creator", "publisher", "date", "doi", "citations"]
)
Expand Down Expand Up @@ -145,6 +143,7 @@ def wrapper(api_key, keywords, year):
# query += '&subj=ENGI' # This is commented because many results might not be covered under ENGI
query += "&start=%d" % (start)
# query += '&count=%d' % (count)

r = requests.get(url + query, headers=headers, timeout=30)
if "entry" in r.json()["search-results"]:
if "error" in r.json()["search-results"]["entry"][0]:
Expand All @@ -162,7 +161,6 @@ def wrapper(api_key, keywords, year):

if __name__ == "__main__":
YEAR, API_KEY, KEYWORDS = get_arguments()

print(f"Current year is set to {YEAR}")
file_name = "_".join(KEYWORDS)
articles_extracted = wrapper(API_KEY, KEYWORDS, YEAR)
Expand Down
75 changes: 41 additions & 34 deletions src/call_semanticscholar.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import requests
import aiohttp
import asyncio
import sys
import time
import pandas as pd
from random import choice

# Add your desktop user agent here. To find it, go to:
# https://www.whatismybrowser.com/detect/what-is-my-user-agent/ and copy your agent string here.
# Example ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36']
desktop_agents = [""]
BASE_API_URL = "http://api.semanticscholar.org/v1/paper/"


def random_headers():
Expand All @@ -17,41 +15,50 @@ def random_headers():
}


async def call_api_async(session, doi):
    """Fetch Semantic Scholar metadata for *doi* over the shared session.

    Returns the decoded JSON payload (includes unknown references).
    """
    url = f"{BASE_API_URL}{doi}?include_unknown_references=true"
    async with session.get(url, headers=random_headers()) as resp:
        return await resp.json()

if __name__ == "__main__":
df = pd.read_csv("../data/" + sys.argv[1])
filename = sys.argv[2]

print(len(df))
df = df[df.doi != "No Doi"]
print(len(df))
async def fetch_articles_async(df):
    """Concurrently fetch abstract and topic metadata for every DOI in *df*.

    Returns two lists (abstracts, topics) aligned with the order of
    ``df["doi"]``. A failed request or a response missing the expected
    keys yields the string sentinel "None", matching the convention the
    rest of the pipeline expects.
    """
    timeout = aiohttp.ClientTimeout(total=10 * 60)
    # Cap concurrent connections to stay polite to the API.
    connector = aiohttp.TCPConnector(limit=5)

    list_doi = list(df["doi"])
    list_abstracts = []
    list_topics = []

    async with aiohttp.ClientSession(
        connector=connector, headers=random_headers(), timeout=timeout
    ) as session:
        tasks = [call_api_async(session, doi) for doi in list_doi]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for content in results:
        # gather(return_exceptions=True) puts the exception OBJECT in the
        # results list on failure; subscripting it would raise TypeError
        # and discard every other result. Fall back to the sentinel instead.
        if isinstance(content, Exception):
            print(content)
            list_abstracts.append("None")
            list_topics.append("None")
        else:
            list_abstracts.append(content.get("abstract", "None"))
            list_topics.append(content.get("topics", "None"))

    return list_abstracts, list_topics


if __name__ == "__main__":
    # Usage: python call_semanticscholar.py <path/to/articles.csv>
    input_path = sys.argv[1]
    df = pd.read_csv(input_path)

    print(f"Total articles: {len(df)}")

    # Drop rows where the upstream Scopus step could not resolve a DOI.
    df = df[df.doi != "No Doi"]
    print(f"Articles with abstracts: {len(df)}")

    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pair (get_event_loop is deprecated outside a running loop since 3.10).
    list_abstracts, list_topics = asyncio.run(fetch_articles_async(df))

    df["abstract"] = list_abstracts
    df["topics"] = list_topics

    # Derive the output name from the input file name (strip ".csv").
    output_file = "../data/abstracts_" + input_path.split("/")[-1][:-4] + ".csv"
    df.to_csv(output_file, index=False)

0 comments on commit 453e333

Please sign in to comment.