google_scholar.py
#!/usr/bin/env python3
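"""Fetch basic publication metadata from Google Scholar by arXiv id.

Google Scholar has no official public API, so this is a best-effort HTML
scraper; it may break if Scholar changes its markup or rate-limits requests.
"""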
from bs4 import BeautifulSoup
import requests
class ScholarInfo:
    """Scrape title, authors, citation count, and venue from Google Scholar."""

    def __init__(self, proxy=None):
        self.proxy = proxy

    def get_authors(self, soup):
        # Scholar renders the author list as one comma-separated string.
        return soup.find('div', class_='gs_fmaa').text.split(',')

    def get_title(self, soup):
        return soup.find('h3', class_='gs_rt').text
    def get_citation(self, soup):
        # Find the "Cited by N" link; its absence means no recorded citations.
        cite = soup.find('a', string=lambda t: t and 'Cited by' in t)
        if cite is None:
            return -1, None
        base_url = "https://scholar.google.com"
        url = base_url + cite.get('href')
        num = int(cite.text.split()[-1])  # last token of "Cited by N"
        return num, url
    def get_pub(self, soup):
        data = soup.find('div', class_='gs_a gs_fma_p')
        if data is None:
            return None
        input_text = str(data)
        # The venue/date string sits between the closing tag of the author
        # div and the following "gs_pdot" separator span.
        div_index = input_text.find("</div>")
        span_index = input_text.find('<span class="gs_pdot">')
        if div_index != -1 and span_index != -1:
            extracted_text = input_text[div_index + len("</div>"):span_index]
            return extracted_text.strip()
        print(f"Substring not found in input: {input_text}")
        return None
    def search_by_arxiv_id(self, arxiv_id):
        url = f"https://scholar.google.com/scholar_lookup?arxiv_id={arxiv_id}"
        data = requests.get(url, proxies=self.proxy, timeout=30)
        soup = BeautifulSoup(data.content, 'html.parser')
        authors = self.get_authors(soup)
        cite, cite_url = self.get_citation(soup)
        pub_info = self.get_pub(soup)
        if pub_info is None:
            # No venue found: assume an unpublished arXiv preprint.
            pub, date = 'arXiv', arxiv_id
        else:
            pub, date = self.publication_mapping(pub_info)
        title = self.get_title(soup)
        return {"title": title, "authors": authors,
                "citation": cite, "citation_url": cite_url,
                "publication": pub, "publication_date": date}
    def publication_mapping(self, pub_info):
        # The date is the last comma-separated field; the venue itself may
        # contain commas, so split from the right exactly once.
        pub, date = pub_info.rsplit(',', 1)
        pub = pub.lower()
        map_dict = {
            # ICML
            "International Conference on Machine Learning".lower(): "ICML",
            # CVPR
            "Proceedings of the IEEE conference on computer vision and".lower(): "CVPR",
            "IEEE/CVF Conference on Computer Vision".lower(): "CVPR",
            # ICCV
            "Proceedings of the IEEE/CVF International Conference on".lower(): "ICCV",
            # ECCV
            "European Conference on Computer Vision".lower(): "ECCV",
            # NeurIPS
            "Neural Information Processing Systems".lower(): "NeurIPS",
            # ICLR
            "International Conference on Learning Representations".lower(): "ICLR",
            # IJCAI
            "International Joint Conference on Artificial Intelligence".lower(): "IJCAI",
            # arXiv
            "arxiv": "arXiv",
            # AAAI
            "Proceedings of the AAAI Conference on Artificial Intelligence".lower(): "AAAI",
            # ACL
            "Proceedings of the Association for Computational Linguistics".lower(): "ACL",
            # EMNLP
            "Proceedings of the Conference on Empirical Methods in Natural".lower(): "EMNLP",
            # CoRL
            "Conference on Robot Learning".lower(): "CoRL",
        }
        for k, v in map_dict.items():
            if k in pub:
                return v, date.strip()
        print(f"Cannot find the publication for {pub_info}")
        return pub, date.strip()
if __name__ == '__main__':
    # Optional: route requests through a local proxy.
    proxy = {
        'http': '127.0.0.1:7890',
        'https': '127.0.0.1:7890',
    }
    gscholar = ScholarInfo(proxy=proxy)
    arxiv_id = "2106.06103"
    res = gscholar.search_by_arxiv_id(arxiv_id)
    print(res)

# vim: ts=4 sw=4 sts=4 expandtab