-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy paths01_pcname_to_dblp_person_id.py
81 lines (69 loc) · 3.34 KB
/
s01_pcname_to_dblp_person_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Project: ISCA 2021 Script
# Filename: s01_pcname_to_dblp_person_id.py
# Date: March 16, 2021
# Author: Bagus Hanindhito (hanindhito[at]bagus[dot]my[dot]id)
# Title: PC Member Name to DBLP Person ID Lookup Script
# Description:
## This script is used to look-up appropriate DBLP Person ID that match
## with a given name (first name and last name). The script uses fuzzy match
## to match the given person name with the DBLP person name. Because of homonym
## (i.e., multiple persons with the same name), the script may output multiple
## person with the same name and requires a bit of manual works to choose the
## correct person.
#%% Import some libraries that are needed
import pandas as pd
import numpy as np
import tqdm
import os
from fuzzywuzzy import fuzz
from s00_function import request_author_key
from s00_function import merge_affiliation
from s00_function import convert_to_dict
from s00_function import filter_name
from s00_function import filter_affiliation
#%% Define the input and output CSV filename
# Input CSV filename
## This is CSV file obtained from HotCRP by going to Users and select 'Program Committee' in the
## filter. At the bottom of page, click Select All then Download 'PC info' and click 'Go'
##
pc_info_hotcrp_filename = 'sample-data/input/isca2021-pcinfo.csv'
# Output CSV filename
pc_to_dblp_filename = 'sample-data/output/isca2021-pc-to-dblp.csv'
# %% Load Input CSV to Pandas Dataframe
# Load the PC Info from HotCRP
pc_info_hotcrp_df = pd.read_csv(pc_info_hotcrp_filename)
# %% Sanitize
# Construct and Sanitize First name and Last name
pc_members_name = pd.DataFrame(pc_info_hotcrp_df['first'] + " " + pc_info_hotcrp_df['last'], columns = ['full'])
pc_members_name['split'] = pc_members_name['full'].str.split()
pc_members_name['last'] = pc_members_name['split'].str[-1]
pc_members_name['first'] = pc_members_name.apply(lambda x: x['full'].replace(" " + x['last'],""), axis=1)
pc_members_name['affiliation'] = pc_info_hotcrp_df['affiliation']
pc_members_name['email'] = pc_info_hotcrp_df['email']
del pc_members_name['split']
# %% Request DBLP Person ID for each PC Members
pc_members_list = []
for index,pc_member in tqdm.tqdm(pc_members_name.iterrows(), total=pc_members_name.shape[0]):
pc_json = request_author_key(pc_member["first"], pc_member["last"])
pc_member_dblp_list = convert_to_dict(pc_member, pc_json)
confidence_level=100
pc_member_dblp_list_filtered = []
# Filtering by name with the highest confidence level
# then gradually reducing the confidence level until at least one item come up.
# This is not 100% perfect.
while(len(pc_member_dblp_list_filtered)<1):
pc_member_dblp_list_filtered=filter_name(pc_member_dblp_list, confidence_level)
confidence_level=confidence_level-1
pc_member_dblp_list = pc_member_dblp_list_filtered
pc_member_dblp_list_filtered = filter_affiliation(pc_member_dblp_list)
# Filtering by Affiliation
if(len(pc_member_dblp_list_filtered)>0):
pc_member_dblp_list = pc_member_dblp_list_filtered
# Add to the Final List
pc_members_list = pc_members_list + pc_member_dblp_list
# %% Write Output to CSV
# This CSV needs to be inspected manually.
## NOTE: To avoid overwritting, rename the final CSV file to something else.
pc_members_df = pd.DataFrame(pc_members_list)
pc_members_df.to_csv(pc_to_dblp_filename, index=False)
# %%