Added some fixes to the reader. This will be CliNER's first build release.

text-machine-lab · Nov 12, 2015 · da0bd6a · da0bd6a
1 parent a52b266
commit da0bd6a
Show file tree

Hide file tree

Showing 5 changed files with 129 additions and 38 deletions.
diff --git a/cliner/evaluate.py b/cliner/evaluate.py
@@ -52,6 +52,7 @@ def getConceptSpans(boundaries, classifications):
                         break
                     if possibleEnd == 'I':
                         end += 1
+
                 conceptSpans[lineIndex].update({(beginning,end):concept})
 
     return conceptSpans
@@ -63,7 +64,7 @@ def evaluate(referenceSpans, predictedSpans, exactMatch=False, reportSeperately=
     classes = [
                 "treatment",
                 "problem",
-                "test"
+                "test",
               ]
 
     measures = {
@@ -247,21 +248,72 @@ def displayMatrix(out, name, confusion):
     print >>out, "Analysis"
     print >>out, " " * pad, "Precision\tRecall\tF1"
 
+    #print confusion
 
+    #print labels.items()
     for lab, lab_v in labels.items():
         if lab == 'none': continue
 
         tp = confusion[lab_v][lab_v]
         fp = sum(confusion[v][lab_v] for k, v in labels.items() if v != lab_v)
         fn = sum(confusion[lab_v][v] for k, v in labels.items() if v != lab_v)
 
-        precision += [float(tp) / (tp + fp + 1e-100)]
-        recall += [float(tp) / (tp + fn + 1e-100)]
-        f1 += [float(2 * tp) / (2 * tp + fp + fn + 1e-100)]
+        """
+        print lab
+        print tp
+        print fp
+        print fn
+        """
+
+        p_num = tp
+        p_den = (tp + fp)
+
+        if p_num == p_den:
+            p = 1.0
+        else:
+            p = float(p_num) / p_den
+
+        r_num = tp
+        r_den = (tp + fn)
+
+        if r_num == r_den:
+            r = 1.0
+        else:
+            r = float(r_num) / r_den
+
+        """
+        print "LAB: ", lab
+        print "R_NUM: ", r_num
+        print "R_DEN: ", r_den
+        print "RECALL: ", r
+        """
+
+        precision += [p]
+        recall += [r]
+
+        if (p*r) == (p+r):
+            f = 2.0
+        else:
+            f = 2 * ((p * r) / (p + r))
+
+        f1 += [f]
+
+        """
+        print precision
+        print recall
+        print f1
+        """
+
         print >>out, "%s %.4f\t%.4f\t%.4f" % (lab.rjust(pad), precision[-1], recall[-1], f1[-1])
 
     print >>out, "--------"
 
+    """
+    print "precision: ", precision
+    print "recall: ", recall
+    print "f1: ", f1
+    """
+
     precision = sum(precision) / len(precision)
     recall = sum(recall) / len(recall)
     f1 = sum(f1) / len(f1)
@@ -270,19 +322,32 @@ def displayMatrix(out, name, confusion):
 
 
 
-def generateResultsForExactSpans(truePositive, falseNegative, falsePositive):
+def generateResultsForExactSpans(tp, fn, fp):
+    """Compute recall, precision and F score, as percentages, from span counts.
+
+    tp, fn and fp are the true-positive, false-negative and false-positive
+    counts. Returns a dict keyed by "Recall", "Precision" and "F Score".
+    """
 
-    #convert to float implicitly incase of any truncation
-    truePositive = float(truePositive)
-    flaseNegative = float(falseNegative)
-    falsePositive = float(falsePositive)
 
-    recall = truePositive / (truePositive + falseNegative)
-    precision = truePositive / (truePositive + falsePositive)
-    fScore = (2*truePositive) / (2*truePositive + falseNegative + falsePositive)
+    p_den = tp + fp
+    r_den = tp + fn
+
+    # With no false positives (including the empty 0/0 case) precision is 1.0.
+    p = 1.0 if tp == p_den else float(tp) / p_den
+
+    # Likewise, with no false negatives recall is 1.0.
+    r = 1.0 if tp == r_den else float(tp) / r_den
+
+    # p + r is zero only when both are zero (no true positives at all); the
+    # F score is then 0 rather than an undefined 0/0.
+    if p + r == 0:
+        f = 0.0
+    else:
+        f = 2 * ((p * r) / (p + r))
 
     #convert to percent
-    return {"Recall":(recall * 100), "Precision":(precision * 100), "F Score":(fScore * 100)}
+    return {"Recall":(r * 100), "Precision":(p * 100), "F Score":(f * 100)}
 
 
 def main():
@@ -348,17 +413,14 @@ def main():
     txt_files_map = helper.map_files(txt_files)
     wildcard = '*.' + Note.dictOfFormatToExtensions()[format]
 
-
     # List of gold data
     ref_files = glob.glob( os.path.join(args.ref, wildcard) )
     ref_files_map = helper.map_files(ref_files)
 
-
     # List of predictions
     pred_files = glob.glob( os.path.join(args.con, wildcard) )
     pred_files_map = helper.map_files(pred_files)
 
-
     # Grouping of text, predictions, gold
     files = []
     for k in txt_files_map:
@@ -384,7 +446,8 @@ def main():
     confusionMatrixExactSpan = deepcopy(confusion)
     confusionMatrixInexactSpan = deepcopy(confusion)
 
-
+    if len(files) == 0:
+        exit("No files to be evaluated")
 
     for txt, annotations, gold in files:
 

diff --git a/cliner/notes/note.py b/cliner/notes/note.py
@@ -176,43 +176,47 @@ def getIOBLabels(self):
 
         b_count = 0
 
+        seen_before = {}
+
         # Add 'B's and 'I's from concept spans
         for classification in self.derived_note.getClassificationTuples():
             concept,char_spans = classification
 
-            #print '\n\n'
-            #print 'concept: ', concept
-            #print 'char_spans: ', char_spans
+            """
+            print '\n\n'
+            print 'concept: ', concept
+            print 'char_spans: ', char_spans
+            """
 
             # Each span (could be noncontiguous span)
             for span in char_spans:
                 start_ind,end_ind = span
+
                 #print '\tstart_ind, end_ind: ', start_ind, end_ind
                 #print '\ttext[start_ind:end_ind]: <%s>' % text[start_ind:end_ind]
 
-                lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span)
+                lineno, tokspan = lineno_and_tokspan(line_inds, data, text, span, seen_before)
                 start,end = tokspan
 
-                #print '\tlineno, tokspan: ', lineno, tokspan
-                #print '\t\tdata[linenp]: ', data[lineno]
-                #print '\t\ttokspan: ', data[lineno][tokspan[0]:tokspan[1]+1]
-                #print '\t\tiobs: ', iobs[lineno]
+#                print '\tlineno, tokspan: ', lineno, tokspan
+#                print '\t\tdata[linenp]: ', data[lineno]
+#                print '\t\ttokspan: ', data[lineno][tokspan[0]:tokspan[1]+1]
+#                print '\t\tiobs: ', iobs[lineno]
 
                 # Update concept tokens to 'B's and 'I's
                 assert iobs[lineno][start] == 'O'
+
                 iobs[lineno][start] = 'B'
                 b_count += 1
                 #print 'B: ', b_count
                 for i in range(start+1,end+1):
                     #print '\t\t\t', i
+
                     assert iobs[lineno][i] == 'O'
                     iobs[lineno][i] = 'I'
 
                 #print '\t\t', iobs[lineno]
 
-            #exit()
-        #exit()
-
         # Memoize for next call
         self.iob_labels = iobs
         return iobs
@@ -359,6 +363,8 @@ def conlist(self):
                 tmp.append('none')
             self.concepts.append(tmp)
 
+        seen_before = {}
+
         # Use the classifications to correct all mislabled 'none's
         for classification in self.derived_note.getClassificationTuples():
             concept    = classification[0]
@@ -369,7 +375,7 @@ def conlist(self):
             data      = self.derived_note.getTokenizedSentences()
             text      = self.derived_note.getText()
             for span in char_spans:
-                lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span)
+                lineno,tokspan = lineno_and_tokspan(line_inds, data, text, span, seen_before)
                 start,end = tokspan
 
             self.concepts[lineno][start] = concept

diff --git a/cliner/notes/note_i2b2.py b/cliner/notes/note_i2b2.py
@@ -50,22 +50,26 @@ def getClassificationTuples(self):
         # return value
         retVal = []
 
+        q = False
+
         # Build list of standardized classification tuples
         for classification in self.classifications:
             concept,lineno,tok_start,tok_end = classification
 
             #q = lineno==12 and tok_start==19
-            #if q:
-            #    print '\n\n\n\n'
-            #    print 'concept: ', concept
-            #    print 'lineno: ', lineno
-            #    print 'tok_start: ', tok_start
-            #    print 'tok_end:   ', tok_end
-            #    print 'line: <%s>' % self.data[lineno-1]
+            if q:
+                print '\n\n\n\n'
+                print 'concept: ', concept
+                print 'lineno: ', lineno
+                print 'tok_start: ', tok_start
+                print 'tok_end:   ', tok_end
+                print 'line: <%s>' % self.data[lineno-1]
 
             # character offset of beginning of line
             begin = self.line_inds[lineno-1][0]
 
+            if q: print "BEGIN: ", self.line_inds[lineno-1]
+
             # Sweep through line to get character offsets from line start
             start = 0
             for word in self.data[lineno-1][:tok_start]:
@@ -101,6 +105,8 @@ def getClassificationTuples(self):
 
         #exit()
 
+#        print retVal
+
         return retVal
 
 

diff --git a/cliner/notes/utilities_for_notes.py b/cliner/notes/utilities_for_notes.py
@@ -59,7 +59,7 @@ def concept_cmp(a,b):
 
 
 # Helper function
-def lineno_and_tokspan(line_inds, data, text, char_span):
+def lineno_and_tokspan(line_inds, data, text, char_span, seen_before=None):
     """ File character offsets => line number and index into line """
 
     q = False
@@ -71,6 +71,22 @@ def lineno_and_tokspan(line_inds, data, text, char_span):
     # Locate line number
     for i,candidate_span in enumerate(line_inds):
         if char_span[1] <= candidate_span[1]:
+
+            # Quick hack: if this exact char_span has already been matched to this line,
+            # skip the line and try the next candidate line instead.
+
+            if seen_before is not None:
+                if i in seen_before:
+                    if char_span in seen_before[i]:
+                        # try again.
+                        continue
+
+                    else:
+                        seen_before[i].append(char_span)
+
+                else:
+                    seen_before[i] = [char_span]
+
             lineno = i
             break
 

diff --git a/config.txt b/config.txt
@@ -1,2 +1,2 @@
-GENIA None /home/rcampos/CliNER/cliner/features_dir/genia_dir/geniatagger-3.0.1/geniatagger
+GENIA None
 UMLS None