-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_groups.py
141 lines (109 loc) · 5.17 KB
/
feature_groups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from abc import ABC,abstractmethod
from pandas import DataFrame
from typing import List
import os
import re
import textdistance as td
import pandas as pd
import sys
# Import shim: choose between the absolute and the relative import of
# ShortAnswerInstance.
# The import must be relative if script is called from elsewhere
# ROOT_DIR is the absolute path of the working directory at import time.
ROOT_DIR = os.path.abspath(os.curdir)
# If a file named feature_groups.py exists in the working directory, `data`
# is imported as a top-level module; otherwise it is imported relative to
# the enclosing package.
if os.path.isfile(ROOT_DIR + "/feature_groups.py"):
    from data import ShortAnswerInstance
else:
    from .data import ShortAnswerInstance
# Compiled alternation of the regex-escaped delimiters "•", "ODER" and " / ".
# Presumably used to split alternative target answers (inferred from the
# name only); it is not referenced by the definitions visible in this view.
TARGET_DELIMS_PATTERN = re.compile('|'.join(map(re.escape, ["•", "ODER", " / "])))
# Similarity measure instances from the textdistance package.
# SIMGroupExtractor.extract produces one feature column per entry, named
# after the entry's class.
MEASURES = [
    td.algorithms.edit_based.NeedlemanWunsch(),
    td.algorithms.edit_based.SmithWaterman(),
    td.algorithms.sequence_based.LCSSeq(),
    td.algorithms.sequence_based.LCSStr(),
    td.algorithms.simple.Length(),
    td.algorithms.phonetic.Editex(),
    td.algorithms.phonetic.MRA(),
    td.algorithms.token_based.Overlap(),
    td.algorithms.token_based.Cosine()
]
# Module-level memo of similarity values, filled by
# SIMGroupExtractor.similarity_features with keys built by sim_lookup_str.
# Nothing in the visible code ever removes entries from it.
simCache = {}
class FeatureGroupExtractor(ABC):
    """
    Abstract base class shared by all feature group extractors.

    Concrete subclasses are required to override extract().
    """

    @abstractmethod
    def extract(self, instances: List[ShortAnswerInstance]) -> DataFrame:
        """
        Turn a list of ShortAnswerInstances into a DataFrame of features.

        The base implementation only hands back an empty DataFrame.

        :param instances: a list of ShortAnswerInstances
        :return: an empty pd.DataFrame
        """
        placeholder = DataFrame()
        return placeholder
class SIMGroupExtractor(FeatureGroupExtractor):
    """
    A subclass of FeatureGroupExtractor which computes string-similarity
    features between the "answer" and the "itemTargets" of each
    ShortAnswerInstance.

    For every measure in MEASURES the best (maximum) normalized similarity
    between the answer and any of its targets becomes one feature column.
    The returned DataFrame contains only these similarity columns; no
    learner ID column is added.
    """

    def extract(self, instances: List[ShortAnswerInstance], reference='answer', compar='itemTargets') -> DataFrame:
        """
        Computes similarity values (see the MEASURES variable) between
        "answer" and "itemTargets" and stores them as features in a pandas DataFrame.

        :param instances: a list of ShortAnswerInstances
        :param reference: kept for interface compatibility; currently unused,
                          the "answer" attribute is always read
        :param compar: kept for interface compatibility; currently unused,
                       the "itemTargets" attribute is always read
        :return: a pd.DataFrame with one row per instance and one column per
                 entry of MEASURES, named after the measure's class
        """
        rows = [self.similarity_features(x.answer, x.itemTargets, MEASURES) for x in instances]
        return pd.DataFrame(rows, columns=[type(m).__name__ for m in MEASURES])

    def sim_lookup_str(self, response, a, m):
        """
        Build the simCache key for one (response, target, measure) triple.

        Both texts are embedded via repr() so the boundary between them is
        unambiguous. Plain concatenation would give ("a b", "c") and
        ("a", "b c") the same key and let them share one cached score.

        :param response: the learner answer
        :param a: one target answer
        :param m: the measure object; only its class name enters the key
        :return: the cache key as a string
        """
        return repr(response) + " " + repr(a) + " | " + type(m).__name__

    def similarity_features(self, response, targets, measures):
        """
        Compute, for each measure, the highest normalized similarity between
        the response and any of the targets.

        Results are memoized in the module-level simCache. A pair for which a
        measure raises is reported on stderr and skipped.

        :param response: the learner answer (converted with str())
        :param targets: an iterable of target answers (each converted with str())
        :param measures: objects providing normalized_similarity(a, b)
        :return: a list with one score per measure, in the order of measures;
                 a score stays at -1.0 when no target could be scored
        """
        response = str(response)
        # Stringify the targets once instead of once per measure.
        targets = [str(t) for t in targets]
        result_scores = []
        for m in measures:
            max_sim = -1.0
            for a in targets:
                lookup = self.sim_lookup_str(response, a, m)
                # Explicit membership test: dict.setdefault() would evaluate
                # the (expensive) similarity call even on a cache hit.
                if lookup not in simCache:
                    try:
                        simCache[lookup] = m.normalized_similarity(response, a)
                    except Exception:
                        # this should only happen for German answers and phonetic distance measures
                        print("Error with measure", m, "on strings '", response, "' and '", a, "'", file=sys.stderr)
                        continue
                sim = simCache[lookup]
                if sim > max_sim:
                    max_sim = sim
            result_scores.append(max_sim)
        return result_scores
class BOWGroupExtractor(FeatureGroupExtractor):
    """
    A subclass of FeatureGroupExtractor which extracts bag of words features from
    a list of ShortAnswerInstances and stores them as features in a pandas DataFrame.

    The returned DataFrame contains only the bag of words columns; no learner
    ID column is added.

    *** IMPORTANT ***: when instantiating the class, a list of ShortAnswerInstances has to be passed
    in order for the bag of words to be trained. Subsequent calls to
    extract() will return features which will be computed based on the trained bag.
    """

    def __init__(self, instances: List[ShortAnswerInstance]):
        """
        Train the bag of words on the answers of the given instances.

        :param instances: a list of ShortAnswerInstances used as training data
        """
        # str() mirrors SIMGroupExtractor, which also stringifies answers
        # before processing them.
        self.bag = self.train_bag(" ".join([str(x.answer) for x in instances]))

    def extract(self, instances: List[ShortAnswerInstance]) -> DataFrame:
        """
        Extract BOW features from a list of ShortAnswerInstances.

        :param instances: a list of ShortAnswerInstances
        :return: a pd.DataFrame with one row per instance and one 0.0/1.0
                 column per word of the trained bag
        """
        bow_feats = self.bow_features([x.answer for x in instances])
        return pd.DataFrame(bow_feats, columns=self.bag)

    def train_bag(self, text, n=500):
        """
        Select the n most frequent lower-cased words of a text.

        Words are separated on any whitespace, exactly as in
        bag_representation(); splitting on single spaces only would put
        words with attached tabs or newlines into the bag, and those could
        never match at extraction time.

        :param text: the training text
        :param n: maximum number of words to keep
        :return: a list of words, most frequent first; ties keep the order
                 of first occurrence
        """
        word_counts = {}
        for w in text.lower().split():
            word_counts[w] = word_counts.get(w, 0) + 1
        # sorted() is stable, so equally frequent words stay in insertion order.
        return sorted(word_counts, key=word_counts.get, reverse=True)[:n]

    def bag_representation(self, text):
        """
        Encode a text as presence flags over the trained bag.

        :param text: the answer text (converted with str())
        :return: a list of floats aligned with self.bag; 1.0 if the word
                 occurs in the lower-cased text, else 0.0
        """
        present = {w.lower() for w in str(text).split()}
        return [float(w in present) for w in self.bag]

    def bow_features(self, instances):
        """
        Encode several answer texts.

        :param instances: an iterable of answer texts
        :return: a list with one bag_representation() result per text
        """
        return [self.bag_representation(x) for x in instances]