Sort dataframes by prediction_id beforehand
Ref: MLWave#5

Also shuffled the samples files.
lenguyenthedat committed Nov 18, 2015
1 parent 30176e1 commit 37bcdd4
Showing 7 changed files with 28 additions and 17 deletions.
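
Background on why the sort matters: both correlations.py (before this change) and the kaggle_*.py scripts effectively pair predictions across submission files by row position, so two files listing the same prediction_ids in different orders get combined incorrectly. A minimal sketch of that failure mode (the ids and values below are invented, not taken from this repository):

# Sketch only: pairing rows by position goes wrong when files are ordered differently.
a = [("5", 0.9), ("1", 0.2), ("2", 0.7)]   # one submission, shuffled
b = [("1", 0.3), ("2", 0.6), ("5", 0.8)]   # another submission, sorted by id
print([(x[0], y[0]) for x, y in zip(a, b)])                 # [('5', '1'), ('1', '2'), ('2', '5')] -- mismatched ids
# Sorting both by prediction_id first restores the correct pairing:
a_sorted = sorted(a, key=lambda r: r[0])
b_sorted = sorted(b, key=lambda r: r[0])
print([(x[0], y[0]) for x, y in zip(a_sorted, b_sorted)])   # [('1', '1'), ('2', '2'), ('5', '5')]
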
18 changes: 10 additions & 8 deletions correlations.py
@@ -5,13 +5,15 @@
 second_file = sys.argv[2]
 
 def corr(first_file, second_file):
-    first_df = pd.read_csv(first_file)
-    second_df = pd.read_csv(second_file)
-    goal = first_df.columns[1]
+    first_df = pd.read_csv(first_file,index_col=0)
+    second_df = pd.read_csv(second_file,index_col=0)
+    # assuming first column is `prediction_id` and second column is `prediction`
+    prediction = first_df.columns[0]
     # correlation
     print "Finding correlation between: %s and %s" % (first_file,second_file)
-    print "Column to be measured: %s" % goal
-    print "Pearson's correlation score: %0.5f" % first_df[goal].corr(second_df[goal],method='pearson')
-    print "Kendall's correlation score: %0.5f" % first_df[goal].corr(second_df[goal],method='kendall')
-    print "Spearman's correlation score: %0.5f" % first_df[goal].corr(second_df[goal],method='spearman')
+    print "Column to be measured: %s" % prediction
+    print "Pearson's correlation score: %0.5f" % first_df[prediction].corr(second_df[prediction],method='pearson')
+    print "Kendall's correlation score: %0.5f" % first_df[prediction].corr(second_df[prediction],method='kendall')
+    print "Spearman's correlation score: %0.5f" % first_df[prediction].corr(second_df[prediction],method='spearman')
 
-corr(first_file, second_file)
+corr(first_file, second_file)
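
With index_col=0, prediction_id becomes the DataFrame index, so the prediction columns can be lined up by id instead of by whatever order the rows happen to be in on disk. A rough, self-contained sketch of the same idea (the frames and values are invented; sort_index() is used here only to make the alignment explicit):

import pandas as pd

# Stand-ins for two submission files whose rows are in different orders.
first_df = pd.DataFrame({"prediction_id": [5, 1, 2],
                         "prediction": [0.9, 0.2, 0.7]}).set_index("prediction_id")
second_df = pd.DataFrame({"prediction_id": [1, 2, 5],
                          "prediction": [0.3, 0.6, 0.8]}).set_index("prediction_id")

# Sorting both frames by their prediction_id index makes row order irrelevant.
first_df = first_df.sort_index()
second_df = second_df.sort_index()

print(first_df["prediction"].corr(second_df["prediction"], method="pearson"))
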
5 changes: 4 additions & 1 deletion kaggle_avg.py
@@ -11,7 +11,10 @@ def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
   with open(loc_outfile,"wb") as outfile:
     for i, glob_file in enumerate( glob(glob_files) ):
       print "parsing:", glob_file
-      for e, line in enumerate( open(glob_file) ):
+      # sort glob_file by first column, ignoring the first line
+      lines = open(glob_file).readlines()
+      lines = [lines[0]] + sorted(lines[1:])
+      for e, line in enumerate( lines ):
         if i == 0 and e == 0:
           outfile.write(line)
         if e > 0:
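
A small aside on the sort added to the three kaggle_*.py scripts: sorted() on raw CSV lines is a plain string sort, which is all that is needed here, since the only requirement is that every submission file ends up in the same order. If the first column holds plain integers and a numeric order were ever wanted, an explicit key would do it. Illustrative example data only:

rows = ["2,0.7\n", "10,0.1\n", "1,0.9\n"]
print(sorted(rows))                                       # ['1,0.9\n', '10,0.1\n', '2,0.7\n'] (lexicographic)
print(sorted(rows, key=lambda l: int(l.split(",")[0])))   # ['1,0.9\n', '2,0.7\n', '10,0.1\n'] (numeric)
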
5 changes: 4 additions & 1 deletion kaggle_rankavg.py
@@ -12,7 +12,10 @@ def kaggle_bag(glob_files, loc_outfile):
     for i, glob_file in enumerate( glob(glob_files) ):
       file_ranks = []
       print "parsing:", glob_file
-      for e, line in enumerate( open(glob_file) ):
+      # sort glob_file by first column, ignoring the first line
+      lines = open(glob_file).readlines()
+      lines = [lines[0]] + sorted(lines[1:])
+      for e, line in enumerate( lines ):
         if e == 0 and i == 0:
           outfile.write( line )
         elif e > 0:
5 changes: 4 additions & 1 deletion kaggle_vote.py
@@ -11,7 +11,10 @@ def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"):
   with open(loc_outfile,"wb") as outfile:
     for i, glob_file in enumerate( glob(glob_files) ):
       print "parsing:", glob_file
-      for e, line in enumerate( open(glob_file) ):
+      # sort glob_file by first column, ignoring the first line
+      lines = open(glob_file).readlines()
+      lines = [lines[0]] + sorted(lines[1:])
+      for e, line in enumerate( lines ):
         if i == 0 and e == 0:
           outfile.write(line)
         if e > 0:
4 changes: 2 additions & 2 deletions samples/method1.csv
@@ -1,6 +1,6 @@
 ImageId,Label
-1,1
+5,3
 2,0
 3,9
 4,9
-5,3
+1,1
4 changes: 2 additions & 2 deletions samples/method2.csv
@@ -1,6 +1,6 @@
 ImageId,Label
 1,2
-2,0
-3,6
 4,2
+3,6
 5,3
+2,0
4 changes: 2 additions & 2 deletions samples/method3.csv
@@ -1,6 +1,6 @@
 ImageId,Label
 1,2
-2,0
 3,9
-4,2
+2,0
 5,3
+4,2
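
The three sample files were reshuffled so the new sorting is actually exercised: each method*.csv now lists the same ImageIds in a different order, and the scripts still have to line them up. A quick, illustrative sanity check that the shuffled samples cover identical ids (the samples/ path is assumed, run from the repository root):

from glob import glob

ids = []
for path in sorted(glob("samples/method*.csv")):
    with open(path) as f:
        rows = f.readlines()[1:]          # skip the ImageId,Label header
    ids.append(sorted(r.split(",")[0] for r in rows))

print(all(s == ids[0] for s in ids))      # expect True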
