# DatasetPatternAnalyzer.py

from collections import Counter
from data.datasets import *
from eval import metrics
from nlp import chunker, tokenizer as tk
from utils import info
import logging
import nltk
import re

# LOGGING CONFIGURATION
logging.basicConfig(
    format='%(asctime)s\t%(levelname)s\t%(message)s',
    level=logging.DEBUG)
info.log_versions()
# END LOGGING CONFIGURATION

# PARAMETERS
DATASET = Hulth
PRINT_PATTERNS = True

if DATASET == Semeval2017:
    tokenizer = tk.tokenizers.nltk
    DATASET_FOLDER = "data/Semeval2017"
elif DATASET == Hulth:
    tokenizer = tk.tokenizers.nltk
    DATASET_FOLDER = "data/Hulth2003"
else:
    raise NotImplementedError("Can't set the hyperparameters: unknown dataset")
# END PARAMETERS

logging.info("Loading dataset...")
data = DATASET(DATASET_FOLDER)
train_doc_str, train_answer_str = data.load_train()
test_doc_str, test_answer_str = data.load_test()
val_doc_str, val_answer_str = data.load_validation()
train_doc, train_answer = tk.tokenize_set(train_doc_str, train_answer_str, tokenizer)
test_doc, test_answer = tk.tokenize_set(test_doc_str, test_answer_str, tokenizer)
val_doc, val_answer = tk.tokenize_set(val_doc_str, val_answer_str, tokenizer)
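
# At this point train_answer (and its test/val counterparts) presumably maps
# each document id to the list of tokenized gold keyphrases: the POS-tagging
# loops below rely on its values() being lists of token lists. This reading
# is inferred from how the dicts are used here, not from the tokenizer's docs.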
logging.info("Dataset loaded. Generating candidate keyphrases...")
train_candidates = chunker.extract_candidates_from_set(train_doc_str, tokenizer)
test_candidates = chunker.extract_candidates_from_set(test_doc_str, tokenizer)
val_candidates = chunker.extract_candidates_from_set(val_doc_str, tokenizer)
logging.debug("Candidates recall on training set : %.4f", metrics.recall(train_answer,train_candidates))
logging.debug("Candidates recall on test set : %.4f", metrics.recall(test_answer,test_candidates))
logging.debug("Candidates recall on validation set : %.4f", metrics.recall(val_answer,val_candidates))
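
# The three recall figures above bound what any downstream extractor can find:
# a gold keyphrase the chunker never proposes can never be recovered. A minimal
# sketch of such a recall computation, assuming answers and candidates are
# dicts mapping a document id to a list of token lists (recall_sketch is
# illustrative only, not the actual eval.metrics implementation):
def recall_sketch(answers, candidates):
    hits, total = 0, 0
    for doc_id, gold in answers.items():
        cand = {tuple(c) for c in candidates.get(doc_id, [])}
        total += len(gold)
        hits += sum(1 for kp in gold if tuple(kp) in cand)
    return hits / total if total else 0.0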

# POS-tag every tokenized gold keyphrase in each split
train_pos = []
for answers in train_answer.values():
    for answer in answers:
        train_pos.append(nltk.pos_tag(answer))

test_pos = []
for answers in test_answer.values():
    for answer in answers:
        test_pos.append(nltk.pos_tag(answer))

val_pos = []
for answers in val_answer.values():
    for answer in answers:
        val_pos.append(nltk.pos_tag(answer))
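
# For reference, nltk.pos_tag takes a token list and returns (token, tag)
# pairs, e.g. nltk.pos_tag(["neural", "networks"]) gives something like
# [('neural', 'JJ'), ('networks', 'NNS')], so each entry of train_pos is
# the tagged form of one gold keyphrase.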

# Collapse every noun tag (NN, NNS, NNP, NNPS...) into the single wildcard
# "NN*", then join each tag sequence into one pattern string per keyphrase.
train_seq = []
for seq in train_pos:
    pattern = []
    for pos in seq:
        pattern.append(re.sub(r"(NN).*", r"\1*", pos[1]))
    train_seq.append(' '.join(pattern))

test_seq = []
for seq in test_pos:
    pattern = []
    for pos in seq:
        pattern.append(re.sub(r"(NN).*", r"\1*", pos[1]))
    test_seq.append(' '.join(pattern))

val_seq = []
for seq in val_pos:
    pattern = []
    for pos in seq:
        pattern.append(re.sub(r"(NN).*", r"\1*", pos[1]))
    val_seq.append(' '.join(pattern))
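
# Worked example of the normalization above: re.sub(r"(NN).*", r"\1*", "NNS")
# and re.sub(r"(NN).*", r"\1*", "NNP") both yield "NN*", while non-noun tags
# such as "JJ" do not match and pass through unchanged. So the tagged
# keyphrase [('neural', 'JJ'), ('networks', 'NNS')] maps to the pattern
# "JJ NN*".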

counts = Counter(train_seq + test_seq + val_seq)
print("Total keyphrases %s" % sum(counts.values()))
print("Total patterns %s" % len(counts))

if PRINT_PATTERNS:
    for pattern, value in counts.items():
        print("%s \t %s \t occurrences" % (pattern, value))