Skip to content
This repository has been archived by the owner on Aug 15, 2020. It is now read-only.

Commit

Permalink
added some fixes to reader. this will be cliner's first build release
Browse files Browse the repository at this point in the history
  • Loading branch information
kwaco committed Nov 12, 2015
1 parent a52b266 commit da0bd6a
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 38 deletions.
97 changes: 80 additions & 17 deletions cliner/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def getConceptSpans(boundaries, classifications):
break
if possibleEnd == 'I':
end += 1

conceptSpans[lineIndex].update({(beginning,end):concept})

return conceptSpans
Expand All @@ -63,7 +64,7 @@ def evaluate(referenceSpans, predictedSpans, exactMatch=False, reportSeperately=
classes = [
"treatment",
"problem",
"test"
"test",
]

measures = {
Expand Down Expand Up @@ -247,21 +248,72 @@ def displayMatrix(out, name, confusion):
print >>out, "Analysis"
print >>out, " " * pad, "Precision\tRecall\tF1"

#print confusion

#print labels.items()
for lab, lab_v in labels.items():
if lab == 'none': continue

tp = confusion[lab_v][lab_v]
fp = sum(confusion[v][lab_v] for k, v in labels.items() if v != lab_v)
fn = sum(confusion[lab_v][v] for k, v in labels.items() if v != lab_v)

precision += [float(tp) / (tp + fp + 1e-100)]
recall += [float(tp) / (tp + fn + 1e-100)]
f1 += [float(2 * tp) / (2 * tp + fp + fn + 1e-100)]
"""
print lab
print tp
print fp
print fn
"""

p_num = tp
p_den = (tp + fp)

if p_num == p_den:
p = 1.0
else:
p = float(p_num) / p_den

r_num = tp
r_den = (tp + fn)

if r_num == r_den:
r = 1.0
else:
r = float(r_num) / r_den

"""
print "LAB: ", lab
print "R_NUM: ", r_num
print "R_DEN: ", r_den
print "RECALL: ", r
"""

precision += [p]
recall += [r]

if (p*r) == (p+r):
f = 2.0
else:
f = 2 * ((p * r) / (p + r))

f1 += [f]

"""
print precision
print recall
print f1
"""

print >>out, "%s %.4f\t%.4f\t%.4f" % (lab.rjust(pad), precision[-1], recall[-1], f1[-1])

print >>out, "--------"

"""
print "precision: ", precision
print "recall: ", recall
print "f1: ", f1
"""

precision = sum(precision) / len(precision)
recall = sum(recall) / len(recall)
f1 = sum(f1) / len(f1)
Expand All @@ -270,19 +322,32 @@ def displayMatrix(out, name, confusion):



def generateResultsForExactSpans(tp, fn, fp):
    """Compute recall, precision and F score (as percentages) from raw counts.

    Arguments:
        tp -- number of true positives
        fn -- number of false negatives
        fp -- number of false positives

    Returns:
        dict with keys "Recall", "Precision" and "F Score", each a float
        percentage.

    Conventions:
        * A measure whose denominator equals its numerator (in particular the
          empty 0/0 case) is reported as 1.0, so "nothing to find / nothing
          predicted" is not penalised and never divides by zero.
        * When precision and recall are both 0 the F score is 0.0. (The
          harmonic mean is undefined there; reporting 0 is the standard
          choice and avoids the division by zero.)
    """

    # Precision: fraction of predicted spans that are correct.
    p_den = tp + fp
    if tp == p_den:
        p = 1.0
    else:
        p = float(tp) / p_den

    # Recall: fraction of reference spans that were found.
    r_den = tp + fn
    if tp == r_den:
        r = 1.0
    else:
        r = float(tp) / r_den

    # Harmonic mean of precision and recall; guard the p == r == 0 case,
    # which would otherwise divide by zero.
    if (p + r) == 0:
        f = 0.0
    else:
        f = 2 * ((p * r) / (p + r))

    # convert to percent
    return {"Recall": (r * 100), "Precision": (p * 100), "F Score": (f * 100)}


def main():
Expand Down Expand Up @@ -348,17 +413,14 @@ def main():
txt_files_map = helper.map_files(txt_files)
wildcard = '*.' + Note.dictOfFormatToExtensions()[format]


# List of gold data
ref_files = glob.glob( os.path.join(args.ref, wildcard) )
ref_files_map = helper.map_files(ref_files)


# List of predictions
pred_files = glob.glob( os.path.join(args.con, wildcard) )
pred_files_map = helper.map_files(pred_files)


# Grouping of text, predictions, gold
files = []
for k in txt_files_map:
Expand All @@ -384,7 +446,8 @@ def main():
confusionMatrixExactSpan = deepcopy(confusion)
confusionMatrixInexactSpan = deepcopy(confusion)


if len(files) == 0:
exit("No files to be evaluated")

for txt, annotations, gold in files:

Expand Down
30 changes: 18 additions & 12 deletions cliner/notes/note.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,43 +176,47 @@ def getIOBLabels(self):

b_count = 0

seen_before = {}

# Add 'B's and 'I's from concept spans
for classification in self.derived_note.getClassificationTuples():
concept,char_spans = classification

#print '\n\n'
#print 'concept: ', concept
#print 'char_spans: ', char_spans
"""
print '\n\n'
print 'concept: ', concept
print 'char_spans: ', char_spans
"""

# Each span (could be noncontiguous span)
for span in char_spans:
start_ind,end_ind = span

#print '\tstart_ind, end_ind: ', start_ind, end_ind
#print '\ttext[start_ind:end_ind]: <%s>' % text[start_ind:end_ind]

lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span)
lineno, tokspan = lineno_and_tokspan(line_inds, data, text, span, seen_before)
start,end = tokspan

#print '\tlineno, tokspan: ', lineno, tokspan
#print '\t\tdata[linenp]: ', data[lineno]
#print '\t\ttokspan: ', data[lineno][tokspan[0]:tokspan[1]+1]
#print '\t\tiobs: ', iobs[lineno]
# print '\tlineno, tokspan: ', lineno, tokspan
# print '\t\tdata[linenp]: ', data[lineno]
# print '\t\ttokspan: ', data[lineno][tokspan[0]:tokspan[1]+1]
# print '\t\tiobs: ', iobs[lineno]

# Update concept tokens to 'B's and 'I's
assert iobs[lineno][start] == 'O'

iobs[lineno][start] = 'B'
b_count += 1
#print 'B: ', b_count
for i in range(start+1,end+1):
#print '\t\t\t', i

assert iobs[lineno][i] == 'O'
iobs[lineno][i] = 'I'

#print '\t\t', iobs[lineno]

#exit()
#exit()

# Memoize for next call
self.iob_labels = iobs
return iobs
Expand Down Expand Up @@ -359,6 +363,8 @@ def conlist(self):
tmp.append('none')
self.concepts.append(tmp)

seen_before = {}

# Use the classifications to correct all mislabeled 'none's
for classification in self.derived_note.getClassificationTuples():
concept = classification[0]
Expand All @@ -369,7 +375,7 @@ def conlist(self):
data = self.derived_note.getTokenizedSentences()
text = self.derived_note.getText()
for span in char_spans:
lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span)
lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span, seen_before)
start,end = tokspan

self.concepts[lineno][start] = concept
Expand Down
20 changes: 13 additions & 7 deletions cliner/notes/note_i2b2.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,22 +50,26 @@ def getClassificationTuples(self):
# return value
retVal = []

q = False

# Build list of standardized classification tuples
for classification in self.classifications:
concept,lineno,tok_start,tok_end = classification

#q = lineno==12 and tok_start==19
#if q:
# print '\n\n\n\n'
# print 'concept: ', concept
# print 'lineno: ', lineno
# print 'tok_start: ', tok_start
# print 'tok_end: ', tok_end
# print 'line: <%s>' % self.data[lineno-1]
if q:
print '\n\n\n\n'
print 'concept: ', concept
print 'lineno: ', lineno
print 'tok_start: ', tok_start
print 'tok_end: ', tok_end
print 'line: <%s>' % self.data[lineno-1]

# character offset of beginning of line
begin = self.line_inds[lineno-1][0]

if q: print "BEGIN: ", self.line_inds[lineno-1]

# Sweep through line to get character offsets from line start
start = 0
for word in self.data[lineno-1][:tok_start]:
Expand Down Expand Up @@ -101,6 +105,8 @@ def getClassificationTuples(self):

#exit()

# print retVal

return retVal


Expand Down
18 changes: 17 additions & 1 deletion cliner/notes/utilities_for_notes.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def concept_cmp(a,b):


# Helper function
def lineno_and_tokspan(line_inds, data, text, char_span):
def lineno_and_tokspan(line_inds, data, text, char_span, seen_before=None):
""" File character offsets => line number and index into line """

q = False
Expand All @@ -71,6 +71,22 @@ def lineno_and_tokspan(line_inds, data, text, char_span):
# Locate line number
for i,candidate_span in enumerate(line_inds):
if char_span[1] <= candidate_span[1]:

# Quick hack: if this exact char_span has already been matched on this line,
# skip the line and try the next candidate line instead.

if seen_before is not None:
if i in seen_before:
if char_span in seen_before[i]:
# try again.
continue

else:
seen_before[i].append(char_span)

else:
seen_before[i] = [char_span]

lineno = i
break

Expand Down
2 changes: 1 addition & 1 deletion config.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
GENIA None /home/rcampos/CliNER/cliner/features_dir/genia_dir/geniatagger-3.0.1/geniatagger
GENIA None
UMLS None

0 comments on commit da0bd6a

Please sign in to comment.