Skip to content

Commit

Permalink
Merge pull request #8 from vishalmhjn/vishal_developer
Browse files Browse the repository at this point in the history
Developer
  • Loading branch information
vishalmhjn authored Oct 17, 2023
2 parents 9b06647 + 37686db commit 453e333
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ dmypy.json
# input data and models
input/*
data/*
src/cache/*

# exception to the rule
!input/.gitkeep
Expand Down
6 changes: 2 additions & 4 deletions src/call_scopus.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# MIT License

# Copyright (c) 2021 Santhanakrishnan Narayanan

# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -25,12 +24,11 @@
import argparse
from datetime import datetime


API_FILE = "../input/.API"


def create_article_dataframe(allentries):
"create data frame from the extracted json from API response"
"create data frame from the extracted JSON from API response"
articles = pd.DataFrame(
columns=["title", "creator", "publisher", "date", "doi", "citations"]
)
Expand Down Expand Up @@ -145,6 +143,7 @@ def wrapper(api_key, keywords, year):
# query += '&subj=ENGI' # This is commented because many results might not be covered under ENGI
query += "&start=%d" % (start)
# query += '&count=%d' % (count)

r = requests.get(url + query, headers=headers, timeout=30)
if "entry" in r.json()["search-results"]:
if "error" in r.json()["search-results"]["entry"][0]:
Expand All @@ -162,7 +161,6 @@ def wrapper(api_key, keywords, year):

if __name__ == "__main__":
YEAR, API_KEY, KEYWORDS = get_arguments()

print(f"Current year is set to {YEAR}")
file_name = "_".join(KEYWORDS)
articles_extracted = wrapper(API_KEY, KEYWORDS, YEAR)
Expand Down
75 changes: 41 additions & 34 deletions src/call_semanticscholar.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import requests
import aiohttp
import asyncio
import sys
import time
import pandas as pd
from random import choice

# Add your desktop user agent here. To find it, go to:
# https://www.whatismybrowser.com/detect/what-is-my-user-agent/ and copy your agent string here.
# Example ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36']
desktop_agents = [""]
BASE_API_URL = "http://api.semanticscholar.org/v1/paper/"


def random_headers():
Expand All @@ -17,41 +15,50 @@ def random_headers():
}


async def call_api_async(session, doi):
    """Fetch Semantic Scholar metadata for *doi* over the shared session.

    Returns the decoded JSON payload (includes unknown references).
    """
    url = f"{BASE_API_URL}{doi}?include_unknown_references=true"
    async with session.get(url, headers=random_headers()) as resp:
        return await resp.json()

if __name__ == "__main__":
df = pd.read_csv("../data/" + sys.argv[1])
filename = sys.argv[2]

print(len(df))
df = df[df.doi != "No Doi"]
print(len(df))
async def fetch_articles_async(df):
    """Concurrently fetch abstract and topic metadata for every DOI in *df*.

    Returns two lists (abstracts, topics) aligned with the order of
    ``df["doi"]``. A failed request or a response missing the expected
    keys yields the string sentinel "None", matching the convention the
    rest of the pipeline expects.
    """
    timeout = aiohttp.ClientTimeout(total=10 * 60)
    # Cap concurrent connections to stay polite to the API.
    connector = aiohttp.TCPConnector(limit=5)

    list_doi = list(df["doi"])
    list_abstracts = []
    list_topics = []

    async with aiohttp.ClientSession(
        connector=connector, headers=random_headers(), timeout=timeout
    ) as session:
        tasks = [call_api_async(session, doi) for doi in list_doi]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for content in results:
        # gather(return_exceptions=True) puts the exception OBJECT in the
        # results list on failure; subscripting it would raise TypeError
        # and discard every other result. Fall back to the sentinel instead.
        if isinstance(content, Exception):
            print(content)
            list_abstracts.append("None")
            list_topics.append("None")
        else:
            list_abstracts.append(content.get("abstract", "None"))
            list_topics.append(content.get("topics", "None"))

    return list_abstracts, list_topics


if __name__ == "__main__":
    # Usage: python call_semanticscholar.py <path/to/articles.csv>
    input_path = sys.argv[1]
    df = pd.read_csv(input_path)

    print(f"Total articles: {len(df)}")

    # Drop rows where the upstream Scopus step could not resolve a DOI.
    df = df[df.doi != "No Doi"]
    print(f"Articles with abstracts: {len(df)}")

    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pair (get_event_loop is deprecated outside a running loop since 3.10).
    list_abstracts, list_topics = asyncio.run(fetch_articles_async(df))

    df["abstract"] = list_abstracts
    df["topics"] = list_topics

    # Derive the output name from the input file name (strip ".csv").
    output_file = "../data/abstracts_" + input_path.split("/")[-1][:-4] + ".csv"
    df.to_csv(output_file, index=False)

0 comments on commit 453e333

Please sign in to comment.