10 changes: 7 additions & 3 deletions README.md
@@ -19,10 +19,14 @@ This package is written in C++ and Python. We require at least g++ version 5 and

3. Prerequisites

Software:
+ C++ compiler
+ Python 2.7

The following packages are needed in Python for the code to run:

```
- C++, Python 2, ngram, sklearn, numpy, scipy, matlib
+ ngram, sklearn, numpy, scipy, matlib
```

Remark: In order to install using pip, one may need to run the following commands if errors arise in the terminal due to recent SSL changes in pip (Linux and macOS):
@@ -36,7 +40,7 @@ pip2 install numpy scipy matplotlib

```
cd C++Codes
- g++ -std=c++11 *.cpp -fopenmp (on Windows and Linux)
+ g++ -o minhash -std=c++11 *.cpp -fopenmp (on Windows and Linux)
g++ *.cpp -fopenmp (on MacOS)
```

@@ -63,7 +67,7 @@ Use the C++ Package folder in this repository. This is a fast minhash package wh

1. Update the config file for minhash and run the program. (Remember to change the output file name option to Restaurant_pair.csv, or the particular name of your data set.) The second and third arguments are K and L, respectively.
```
- ./a.out Config.txt 1 10
+ ./C++Codes/minhash config_restaurant.txt 1 10
```

The output file `Restaurant_pair.csv` contains the candidate record pairs:
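For downstream use, that pair file can be loaded with a short helper. This is a sketch, not part of the repository; the function name is hypothetical, and it assumes each line holds two comma-separated record IDs, where (per the config file) an ID is the record's line number after the CSV header.

```python
import csv

# Hypothetical helper (not part of the repo): load candidate record pairs
# produced by the minhash step. Assumes each line of the pair file holds two
# comma-separated record IDs.
def load_candidate_pairs(pair_path):
    pairs = []
    with open(pair_path) as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                pairs.append((int(row[0]), int(row[1])))
    return pairs
```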
92 changes: 46 additions & 46 deletions config_restaurant.txt
@@ -1,47 +1,47 @@
##############################################################
# Config file for Restaurant dataset
# Any line containing `#' will be treated as a comment.
# The Typical Format is Variablename = Value
###############################################################
# Choose K and L wisely: try higher values of K and go down. Typically K is in the range 1-5 for light pruning;
# for rigorous pruning, use larger values of K
K=1
# More L increases the recall but also reports more pairs. Less sensitive than K
L=4
# ngrams length
shingles=2
# Threshold: only report a pair if it appears in at least this many buckets (cancels random noise). If you are missing pairs, decrease this
Thresh=3
#Give the input CSV file. The first line will be ignored (assumed to be the header). Every line will be treated as a record.
#The line number of a record is its ID; that is, the first line after the header is the record with ID 1, etc.
Input=data/restaurant.csv
#Output File: this will contain a pair of record IDs in each line indicating a possible match.
Output=restaurant_pair.csv
##############################################################################
#These are advanced parameters depending on memory
##############################################################################
# No of cells in each bucket. Decrease if it runs out of memory.
BucketSize=32
# No of buckets in each table is 2^{this number}. Too small will never finish. Decrease if it runs out of memory. Larger is better. Must be < 27
RangePow=20
# Increase if MinHashing takes a lot of time. Must be a power of 2.
MinHashChunkSize=32
# Processes this many records in parallel; larger is faster. Decrease if it runs out of memory
Chunk=500000
# Config file for Restaurant dataset
# Any line containing `#' will be treated as a comment.
# The Typical Format is Variablename = Value
###############################################################

# Choose K and L wisely: try higher values of K and go down. Typically K is in the range 1-5 for light pruning;
# for rigorous pruning, use larger values of K

K=1

# More L increases the recall but also reports more pairs. Less sensitive than K

L=4

# ngrams length

shingles=2

# Threshold: only report a pair if it appears in at least this many buckets (cancels random noise). If you are missing pairs, decrease this
Thresh=3


#Give the input CSV file. The first line will be ignored (assumed to be the header). Every line will be treated as a record.
#The line number of a record is its ID; that is, the first line after the header is the record with ID 1, etc.

Input=data/Restaurant.csv
#Output File: this will contain a pair of record IDs in each line indicating a possible match.

Output=Restaurant_pair.csv
##############################################################################
#These are advanced parameters depending on memory
##############################################################################
# No of cells in each bucket. Decrease if it runs out of memory.

BucketSize=32

# No of buckets in each table is 2^{this number}. Too small will never finish. Decrease if it runs out of memory. Larger is better. Must be < 27

RangePow=20

# Increase if MinHashing takes a lot of time. Must be a power of 2.

MinHashChunkSize=32

# Processes this many records in parallel; larger is faster. Decrease if it runs out of memory
Chunk=500000
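The comments above fully specify the file format: any line containing `#` is a comment, and settings take the form `Variablename=Value`. A minimal parser sketch (the function name is hypothetical, not part of the repo):

```python
# Hypothetical sketch of a parser for the `Variablename=Value` config format
# above. Any line containing '#' is skipped as a comment, per the file's own
# header; values are kept as strings.
def parse_config(text):
    params = {}
    for line in text.splitlines():
        if '#' in line or '=' not in line:
            continue  # comments, blank lines, and stray text are ignored
        name, _, value = line.partition('=')
        params[name.strip()] = value.strip()
    return params
```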
10 changes: 5 additions & 5 deletions run_script.sh
@@ -3,12 +3,12 @@

#!/bin/bash

- g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
+ g++-7 -std=c++11 C++Codes/*.cpp -o minhash -fopenmp

# For Restaurant
for ((i=6;i<=25;i+=6)) ;
do for ((j=1;j<=10; j++));
- do ./output config_restaurant.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.3 --input restaurant_pair.csv --goldstan data/restaurant.csv --output log-restaurant ;
+ do ./minhash config_restaurant.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.3 --input restaurant_pair.csv --goldstan data/restaurant.csv --output log-restaurant ;
done
done
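The restaurant sweep above runs L = 6, 12, 18, 24 with ten repetitions each (K fixed at 1). The same invocations can be generated in Python, e.g. for driving the sweep with `subprocess` instead of bash; `sweep_commands` is a hypothetical helper, and the `minhash` binary name comes from the compile line above.

```python
# Hypothetical sketch mirroring the shell loop above: build the minhash
# command lines for L = 6, 12, 18, 24, repeated ten times each, with K = 1
# as in the script.
def sweep_commands(l_values=(6, 12, 18, 24), repeats=10):
    cmds = []
    for l in l_values:
        for _ in range(repeats):
            cmds.append(['./minhash', 'config_restaurant.txt', '1', str(l)])
    return cmds
```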

@@ -17,7 +17,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
#For CD
# for ((i=6;i<=20;i+=4)) ;
# do for ((j=1;j<=3; j++));
- # do ./output config_cd.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.5 --input cd_pair.csv --goldstan data/cd.csv --delimiter ';' --output log-cd ;
+ # do ./minhash config_cd.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.5 --input cd_pair.csv --goldstan data/cd.csv --delimiter ';' --output log-cd ;
# done
# done

@@ -26,7 +26,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
#For Voter
# for ((i=25;i<=40;i+=5)) ;
# do for ((j=1;j<=10; j++));
- # do ./output config_voter.txt 4 $i; python pipeline.py --flag 0 --id $i --trainsize 0.1 --input voter_pair.csv --goldstan data/voter.csv --delimiter ',' --c 0.0001 --output log-voter ;
+ # do ./minhash config_voter.txt 4 $i; python pipeline.py --flag 0 --id $i --trainsize 0.1 --input voter_pair.csv --goldstan data/voter.csv --delimiter ',' --c 0.0001 --output log-voter ;
# done
# done

@@ -36,7 +36,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
# python preprocess.py

#for ((i=1;i<=10;i++)) ;
- # do ./output config_syria.txt 15 10; python pipeline_for_syria.py --input syria_pair.csv --output log-syria --rawdata data/syria.csv --goldstandpair data/syria_train.csv;
+ # do ./minhash config_syria.txt 15 10; python pipeline_for_syria.py --input syria_pair.csv --output log-syria --rawdata data/syria.csv --goldstandpair data/syria_train.csv;
#done

#python count.py --input log-syria
18 changes: 18 additions & 0 deletions setup.sh
@@ -0,0 +1,18 @@
# Setup script
# Assumes presence of Anaconda

# Create an environment
conda create --name LSH python=2.7
source activate LSH

# Install packages from Anaconda
conda install numpy
conda install scipy

# Install packages using pip
pip install --pre subprocess32
pip install ngram
pip install sklearn
pip install matlib

# Note: the matlib install may fail due to a dependency failure: matlib.h
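A quick way to verify the environment after running this script is to try importing each Python dependency. This is a sketch, not part of the repository; the function name is hypothetical.

```python
# Hypothetical check (not part of the repo): try importing each Python
# dependency installed above and return the names of any that are missing.
def missing_packages(names=('ngram', 'sklearn', 'numpy', 'scipy', 'matlib')):
    missing = []
    for name in names:
        try:
            __import__(name)
        except ImportError:
            missing.append(name)
    return missing
```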