Commit: updates to taxi

herbertli committed Dec 10, 2018
1 parent 0644d61 commit 19435fe

Showing 33 changed files with 124 additions and 28 deletions.
11 changes: 11 additions & 0 deletions PROJECT_README.txt
@@ -0,0 +1,11 @@
READ ME FIRST

In this folder we have provided readmes for each dataset:

taxi_readme.txt
weather_readme.txt
turnstile_readme.txt

the data should be browseable at: /user/hl1785/data/

see the individual readmes for more details
22 changes: 22 additions & 0 deletions data_ingest/taxi/README
@@ -0,0 +1,22 @@
Data Source:
-----------------------
http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

Data Ingest:
-----------------------
* For Green Cab Data (screenshot: "ingest"):
> curl -o green_tripdata_2018-06.csv https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-06.csv

* For Yellow Cab Data:
> curl -o yellow_tripdata_2018-05.csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-05.csv

* For FHV:
> curl -o fhv_tripdata_2017-11.csv https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2017-11.csv

All of these commands get one month of data; I ran each of them multiple times to get all the data I needed.
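
A sketch of one way to script those repeated fetches, using curl's numeric URL globbing (the month range below is illustrative; substitute the months you actually need):
> curl -o "green_tripdata_2018-0#1.csv" "https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-0[4-6].csv"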

* For taxi-zones (used for ETL):
> curl -o taxi-zone.csv https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

Finally, load all data into dumbo, e.g.
> hdfs dfs -put yellow* data/yellow/
9 files renamed without changes.
etl_code/taxi/nyc-taxi/main/java/DataProfiler.java
@@ -13,20 +13,20 @@

public class DataProfiler {

-   public static class ProfileMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+   public static class ProfileMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            int colInd = context.getConfiguration().getInt("colInd", 0);
            String[] rowSplit = value.toString().split(",");
-           context.write(new Text(rowSplit[colInd]), new IntWritable(1));
+           context.write(new Text(rowSplit[colInd]), new LongWritable(1));
        }
    }

-   public static class ProfileReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
-       public void reduce(Text key, Iterable<IntWritable> values, Context context)
+   public static class ProfileReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
+       public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
-           for (IntWritable value: values) {
+           for (LongWritable value: values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
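
The hunk above swaps the mapper's output value type from IntWritable to LongWritable; one plausible motivation is that the reducer's input and output value types now both match the mapper's output, which is a precondition for reusing the reducer as a combiner. The commit does not show the job driver, so the following is only a sketch of a standard driver consistent with the readme's usage line (DataProfiler <input path> <output path> <col Ind>), not the repository's actual code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DataProfiler {

    // ... ProfileMapper and ProfileReducer as in the diff above ...

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Column index that ProfileMapper reads back via getInt("colInd", 0)
        conf.setInt("colInd", Integer.parseInt(args[2]));
        Job job = Job.getInstance(conf, "data profiler");
        job.setJarByClass(DataProfiler.class);
        job.setMapperClass(ProfileMapper.class);
        // Only legal after this commit: mapper output and reducer input/output
        // value types all agree (Text, LongWritable), so the reducer can
        // double as a combiner.
        job.setCombinerClass(ProfileReducer.class);
        job.setReducerClass(ProfileReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
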
4 changes: 4 additions & 0 deletions profiling_code/taxi/README
@@ -0,0 +1,4 @@
For analyzing NYC taxi data, I used maven to build/package all of my MapReduce code

As a result, the code for profiling NYC taxi data is also bundled with the ETL code for taxi
in the directory /etl_code/taxi/nyc-taxi/
105 changes: 82 additions & 23 deletions taxi_readme.txt
@@ -1,14 +1,24 @@
-----------------------
Taxi Data:
-----------------------


-----------------------
Screenshots:
-----------------------
There are screenshots in the screenshots/taxi/ directory showing
-output of running code in order to analyze green taxi data
+the output of running code to analyze green taxi data in particular;
+the processes for analyzing yellow cab and FHV data are virtually identical, so screenshots are not provided.


-Source:
-----------------------
+Data Source:
-----------------------
http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml



-----------------------
Data Ingest:
-----------------------
* For Green Cab Data (screenshot: "ingest"):
@@ -24,17 +34,24 @@ All of these commands get one month of data; I ran each of them multiple times

* For taxi-zones (used for ETL):
> curl -o taxi-zone.csv https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
----------- End Data Ingest -------------

-Maven
-------------------------
+Finally, load all data into dumbo, e.g.
+> hdfs dfs -put yellow* data/yellow/




+-----------------------
+Data ETL
+-----------------------
+See under etl_code/taxi/nyc-taxi/main/java/ :
+IdToNeighborhoodJob, LocalTimeJob, LocalTimeMapper, LocalTimeReducer

I used maven to build/package all of my MapReduce source files,
so in order to run any of the below commands, please run (on dumbo):
-> cd nyc-taxi
+> cd etl_code/taxi/nyc-taxi
> mvn clean package

-Data Cleaning/Profiling
-------------------------
The following removes unnecessary columns and malformed data (screenshot: "cleaning"):
-> cd nyc-taxi
> hadoop jar target/nyc-taxi-1.0.jar LocationTimeJob data/green/*.csv data/green/cleaned
@@ -43,23 +60,52 @@ Usage:
hadoop jar target/nyc-taxi-1.0.jar LocationTimeJob <input path> <output path>
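
LocationTimeJob's source isn't shown in this commit, so the following is a hypothetical illustration of the usual shape of such a cleaning step: a map-only pass that keeps selected columns and drops rows that fail to parse. The column indices follow the raw green-cab CSV layout and are assumptions:

// needs: java.io.IOException, org.apache.hadoop.io.{LongWritable, NullWritable, Text},
//        org.apache.hadoop.mapreduce.Mapper
public static class CleanMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] f = value.toString().split(",");
        if (f.length < 7) return;          // malformed row: too few columns
        try {                              // malformed row: non-numeric location IDs
            Integer.parseInt(f[5]);        // PULocationID
            Integer.parseInt(f[6]);        // DOLocationID
        } catch (NumberFormatException e) {
            return;
        }
        // Keep only pickup/dropoff times and location IDs (illustrative choice)
        String kept = String.join(",", f[1], f[2], f[5], f[6]);
        context.write(new Text(kept), NullWritable.get());
    }
}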

The following adds borough and location/neighborhood information (screenshot: "addBoro"):
-> cd nyc-taxi
> hadoop jar target/nyc-taxi-1.0.jar IdToNeighborhoodJob data/green/cleaned data/green/withBoro data/taxi_zone.csv

Usage:
hadoop jar target/nyc-taxi-1.0.jar IdToNeighborhoodJob <input path> <output path> <taxi zone path>
----------- End Data Cleaning/Profiling -------------
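
IdToNeighborhoodJob's source isn't shown here either; a common shape for this kind of step is a map-side join that loads the small taxi-zone lookup once per task and tags every record with its borough/zone. A hypothetical sketch (the "zonePath" configuration key and the column positions are assumptions; the real lookup file is quoted CSV with a header row, which a robust version would handle):

// needs: java.io.{BufferedReader, IOException, InputStreamReader}, java.util.{HashMap, Map},
//        org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.{FileSystem, Path},
//        org.apache.hadoop.io.{LongWritable, NullWritable, Text}, org.apache.hadoop.mapreduce.Mapper
public static class ZoneJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private final Map<String, String> zones = new HashMap<>();

    @Override
    protected void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        Path zonePath = new Path(conf.get("zonePath")); // set by the driver from the third argument
        FileSystem fs = zonePath.getFileSystem(conf);
        try (BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(zonePath)))) {
            String line;
            while ((line = in.readLine()) != null) {
                // taxi-zone.csv: LocationID,Borough,Zone,service_zone
                String[] f = line.split(",");
                if (f.length >= 3) zones.put(f[0], f[1] + "," + f[2]);
            }
        }
    }

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] f = value.toString().split(",");
        // Assume the cleaned rows carry the pickup location ID in column 2
        String boroZone = zones.getOrDefault(f[2], "Unknown,Unknown");
        context.write(new Text(value.toString() + "," + boroZone), NullWritable.get());
    }
}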

-Spark
--------------------------
-I used sbt to build/package all of my spark source files,

+-----------------------
+Data Profiling
+-----------------------
+See under etl_code/taxi/nyc-taxi/main/java/ :
+DataProfiler

+The following outputs <k, v> pairs counting the # of occurrences of a particular key
+for a specified column of the data (screenshot: "profiling"):
+> hadoop jar target/nyc-taxi-1.0.jar DataProfiler data/green/*.csv data/green/profile 1

+Usage:
+hadoop jar target/nyc-taxi-1.0.jar DataProfiler <input path> <output path> <col Ind>

+Where <col Ind> is the zero-based index of the CSV column to profile (cf. rowSplit[colInd] in DataProfiler).



+-----------------------
+Data Iterations
+-----------------------
+See under etl_code/taxi/nyc-taxi/main/java/old/ :

+This directory contains code from previous iterations of the process.




+-----------------------
+Data Joining
+-----------------------
+See under etl_code/taxi/nyc-spark/ :
+DataSchema, JoinWeatherAndFHV, JoinWeatherAndGreen, JoinWeatherAndYellow

+For the next steps, I used sbt to build/package all of my spark source files,
so in order to run any of the below commands, please run (on dumbo):
> module load sbt
-> cd nyc-spark
+> cd etl_code/taxi/nyc-spark
> sbt package

-Data Joining
-------------------------

The following joins taxi (Green cab) and weather data (screenshot: "join1" and "join2"):

> cd nyc-spark
@@ -69,11 +115,17 @@ Usage:
spark2-submit --class JoinWeatherAndGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar <input path> <weather data> <output path>

Use JoinWeatherAndYellow and JoinWeatherAndFHV for yellow cab and FHV data, respectively.
----------- End Data Joining -------------
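
The Scala sources for these join jobs aren't included in this commit, so as a rough illustration only, here is the general shape of such a join (via Spark's Java API; the "date" join column and CSV options are assumptions):

// needs: org.apache.spark.sql.{SparkSession, Dataset, Row}
SparkSession spark = SparkSession.builder().appName("JoinWeatherAndGreen").getOrCreate();
Dataset<Row> taxi = spark.read().option("header", "true").csv(args[0]);    // cleaned taxi records
Dataset<Row> weather = spark.read().option("header", "true").csv(args[1]); // daily weather data
// Assuming both sides expose a yyyy-MM-dd "date" column to join on:
Dataset<Row> joined = taxi.join(weather, "date");
joined.write().csv(args[2]);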




+-----------------------
Linear Regression
-------------------------
-The following create a prediction model and saves it to some directory (screenshot: "linear_reg1" and "linear_reg2"):
+-----------------------
+See under etl_code/taxi/nyc-spark/ :
+PredGreen, RFGreen

+The following creates a prediction model and saves it to some directory (screenshots: "linear_reg1" and "linear_reg2"):

> cd nyc-spark
> spark2-submit --class RFGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar data/green/joined data/lr/output
@@ -90,11 +142,18 @@ Usage:
spark2-submit --class PredGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar <model path> <input path> <output path>

As of right now, only green taxi works, and predictions aren't entirely accurate; sorry about that!
----------- End Linear Regression -------------
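
PredGreen and RFGreen are likewise Scala sources not shown in this commit ("RFGreen" suggests a random-forest variant). As a rough illustration of the "create a model and save it" step in Spark ML's Java API (the column names and the use of LinearRegression are assumptions):

// needs: org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
LinearRegression lr = new LinearRegression()
        .setFeaturesCol("features")   // assembled feature vector
        .setLabelCol("label");        // e.g. the quantity to predict
LinearRegressionModel model = lr.fit(training);  // training: Dataset<Row> of joined data
model.save(args[1]);                             // persist the model to the output directory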





+-----------------------
Impala Queries
-------------------------
-See nyc-impala/ for the SQL commands used to create tables/views and queries for the taxi data
+-----------------------
+See under etl_code/taxi/nyc-impala/ :
+fhv.sql, greentaxi.sql, yellowtaxi.sql

+These files contain the SQL commands used to create tables/views and queries for the taxi data.
Screenshots (and what they show):
create_table - create table from joined data
create_view - create view with date fields
@@ -103,4 +162,4 @@ no_snow_by_boro - taxi usage on days where it didn't snow, grouped by borough
num_of_snow_days - count # of snow days
by_avg_temp - taxi usage vs. average temp
by_avg_temp_brooklyn - taxi usage vs. average temp for a particular borough (Brooklyn)
----------- End Impala Queries -------------
