diff --git a/PROJECT_README.txt b/PROJECT_README.txt new file mode 100644 index 0000000..a9f8eb7 --- /dev/null +++ b/PROJECT_README.txt @@ -0,0 +1,11 @@ +READ ME FIRST + +In this folder we have provided readmes for each dataset: + +taxi_readme.txt +weather_readme.txt +turnstile_readme.txt + +the data should be browseable at: /user/hl1785/data/ + +see the individual readmes for more details diff --git a/data_ingest/taxi/README b/data_ingest/taxi/README new file mode 100644 index 0000000..4f9169f --- /dev/null +++ b/data_ingest/taxi/README @@ -0,0 +1,22 @@ +Data Source: +----------------------- +http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml + +Data Ingest: +----------------------- +* For Green Cab Data (screenshot: "ingest"): +> curl -o green_tripdata_2018-06.csv https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2018-06.csv + +* For Yellow Cab Data: +> curl -o yellow_tripdata_2018-05.csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-05.csv + +* For FHV: +> curl -o fhv_tripdata_2017-11.csv https://s3.amazonaws.com/nyc-tlc/trip+data/fhv_tripdata_2017-11.csv + +All of these commands get one month of data, I ran each of these mutliple times to get all the data I needed. + +* For taxi-zones (used for ETL): +> curl -o taxi-zone.csv https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv + +Finally, load all data into dumbo, e.g. +> hdfs dfs -put yellow* data/yellow/ diff --git a/source_code/nyc-impala/fhv.sql b/etl_code/taxi/nyc-impala/fhv.sql similarity index 100% rename from source_code/nyc-impala/fhv.sql rename to etl_code/taxi/nyc-impala/fhv.sql diff --git a/source_code/nyc-impala/greentaxi.sql b/etl_code/taxi/nyc-impala/greentaxi.sql similarity index 100% rename from source_code/nyc-impala/greentaxi.sql rename to etl_code/taxi/nyc-impala/greentaxi.sql diff --git a/source_code/nyc-impala/yellowtaxi.sql b/etl_code/taxi/nyc-impala/yellowtaxi.sql similarity index 100% rename from source_code/nyc-impala/yellowtaxi.sql rename to etl_code/taxi/nyc-impala/yellowtaxi.sql diff --git a/source_code/nyc-spark/.gitignore b/etl_code/taxi/nyc-spark/.gitignore similarity index 100% rename from source_code/nyc-spark/.gitignore rename to etl_code/taxi/nyc-spark/.gitignore diff --git a/source_code/nyc-spark/build.sbt b/etl_code/taxi/nyc-spark/build.sbt similarity index 100% rename from source_code/nyc-spark/build.sbt rename to etl_code/taxi/nyc-spark/build.sbt diff --git a/source_code/nyc-spark/project/build.properties b/etl_code/taxi/nyc-spark/project/build.properties similarity index 100% rename from source_code/nyc-spark/project/build.properties rename to etl_code/taxi/nyc-spark/project/build.properties diff --git a/source_code/nyc-spark/src/main/scala/DataSchema.scala b/etl_code/taxi/nyc-spark/src/main/scala/DataSchema.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/DataSchema.scala rename to etl_code/taxi/nyc-spark/src/main/scala/DataSchema.scala diff --git a/source_code/nyc-spark/src/main/scala/JoinWeatherAndFHV.scala b/etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndFHV.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/JoinWeatherAndFHV.scala rename to etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndFHV.scala diff --git a/source_code/nyc-spark/src/main/scala/JoinWeatherAndGreen.scala b/etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndGreen.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/JoinWeatherAndGreen.scala rename to etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndGreen.scala diff --git a/source_code/nyc-spark/src/main/scala/JoinWeatherAndYellow.scala b/etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndYellow.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/JoinWeatherAndYellow.scala rename to etl_code/taxi/nyc-spark/src/main/scala/JoinWeatherAndYellow.scala diff --git a/source_code/nyc-spark/src/main/scala/PredGreen.scala b/etl_code/taxi/nyc-spark/src/main/scala/PredGreen.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/PredGreen.scala rename to etl_code/taxi/nyc-spark/src/main/scala/PredGreen.scala diff --git a/source_code/nyc-spark/src/main/scala/RFGreen.scala b/etl_code/taxi/nyc-spark/src/main/scala/RFGreen.scala similarity index 100% rename from source_code/nyc-spark/src/main/scala/RFGreen.scala rename to etl_code/taxi/nyc-spark/src/main/scala/RFGreen.scala diff --git a/source_code/nyc-spark/test_data.csv b/etl_code/taxi/nyc-spark/test_data.csv similarity index 100% rename from source_code/nyc-spark/test_data.csv rename to etl_code/taxi/nyc-spark/test_data.csv diff --git a/source_code/nyc-taxi/.gitignore b/etl_code/taxi/nyc-taxi/.gitignore similarity index 100% rename from source_code/nyc-taxi/.gitignore rename to etl_code/taxi/nyc-taxi/.gitignore diff --git a/source_code/nyc-taxi/README.md b/etl_code/taxi/nyc-taxi/README.md similarity index 100% rename from source_code/nyc-taxi/README.md rename to etl_code/taxi/nyc-taxi/README.md diff --git a/source_code/nyc-taxi/pom.xml b/etl_code/taxi/nyc-taxi/pom.xml similarity index 100% rename from source_code/nyc-taxi/pom.xml rename to etl_code/taxi/nyc-taxi/pom.xml diff --git a/source_code/nyc-taxi/src/main/java/DataConsolidate.java b/etl_code/taxi/nyc-taxi/src/main/java/DataConsolidate.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/DataConsolidate.java rename to etl_code/taxi/nyc-taxi/src/main/java/DataConsolidate.java diff --git a/source_code/nyc-taxi/src/main/java/DataProfiler.java b/etl_code/taxi/nyc-taxi/src/main/java/DataProfiler.java similarity index 84% rename from source_code/nyc-taxi/src/main/java/DataProfiler.java rename to etl_code/taxi/nyc-taxi/src/main/java/DataProfiler.java index 5fab7b6..53af808 100644 --- a/source_code/nyc-taxi/src/main/java/DataProfiler.java +++ b/etl_code/taxi/nyc-taxi/src/main/java/DataProfiler.java @@ -13,20 +13,20 @@ public class DataProfiler { - public static class ProfileMapper extends Mapper { + public static class ProfileMapper extends Mapper { public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { int colInd = context.getConfiguration().getInt("colInd", 0); String[] rowSplit = value.toString().split(","); - context.write(new Text(rowSplit[colInd]), new IntWritable(1)); + context.write(new Text(rowSplit[colInd]), new LongWritable(1)); } } - public static class ProfileReducer extends Reducer { - public void reduce(Text key, Iterable values, Context context) + public static class ProfileReducer extends Reducer { + public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { long sum = 0; - for (IntWritable value: values) { + for (LongWritable value: values) { sum += value.get(); } context.write(key, new LongWritable(sum)); diff --git a/source_code/nyc-taxi/src/main/java/DataSchema.java b/etl_code/taxi/nyc-taxi/src/main/java/DataSchema.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/DataSchema.java rename to etl_code/taxi/nyc-taxi/src/main/java/DataSchema.java diff --git a/source_code/nyc-taxi/src/main/java/IdToNeighborhoodJob.java b/etl_code/taxi/nyc-taxi/src/main/java/IdToNeighborhoodJob.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/IdToNeighborhoodJob.java rename to etl_code/taxi/nyc-taxi/src/main/java/IdToNeighborhoodJob.java diff --git a/source_code/nyc-taxi/src/main/java/LocationTimeJob.java b/etl_code/taxi/nyc-taxi/src/main/java/LocationTimeJob.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/LocationTimeJob.java rename to etl_code/taxi/nyc-taxi/src/main/java/LocationTimeJob.java diff --git a/source_code/nyc-taxi/src/main/java/LocationTimeMapper.java b/etl_code/taxi/nyc-taxi/src/main/java/LocationTimeMapper.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/LocationTimeMapper.java rename to etl_code/taxi/nyc-taxi/src/main/java/LocationTimeMapper.java diff --git a/source_code/nyc-taxi/src/main/java/LocationTimeReducer.java b/etl_code/taxi/nyc-taxi/src/main/java/LocationTimeReducer.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/LocationTimeReducer.java rename to etl_code/taxi/nyc-taxi/src/main/java/LocationTimeReducer.java diff --git a/source_code/nyc-taxi/src/main/java/old/DataCleaner.java b/etl_code/taxi/nyc-taxi/src/main/java/old/DataCleaner.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/DataCleaner.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/DataCleaner.java diff --git a/source_code/nyc-taxi/src/main/java/old/DateTimeFeaturizer.java b/etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeFeaturizer.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/DateTimeFeaturizer.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeFeaturizer.java diff --git a/source_code/nyc-taxi/src/main/java/old/DateTimeMapper.java b/etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeMapper.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/DateTimeMapper.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeMapper.java diff --git a/source_code/nyc-taxi/src/main/java/old/DateTimeReducer.java b/etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeReducer.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/DateTimeReducer.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/DateTimeReducer.java diff --git a/source_code/nyc-taxi/src/main/java/old/LocationIdMapper.java b/etl_code/taxi/nyc-taxi/src/main/java/old/LocationIdMapper.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/LocationIdMapper.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/LocationIdMapper.java diff --git a/source_code/nyc-taxi/src/main/java/old/LocationIdReducer.java b/etl_code/taxi/nyc-taxi/src/main/java/old/LocationIdReducer.java similarity index 100% rename from source_code/nyc-taxi/src/main/java/old/LocationIdReducer.java rename to etl_code/taxi/nyc-taxi/src/main/java/old/LocationIdReducer.java diff --git a/profiling_code/taxi/README b/profiling_code/taxi/README new file mode 100644 index 0000000..38f048c --- /dev/null +++ b/profiling_code/taxi/README @@ -0,0 +1,4 @@ +For analyzing NYC taxi data, I used maven to all of my MapReduce code + +As a result, the code for profiling NYC taxi data is also bundled with the ETL code for taxi +in the directory /etl_code/taxi/nyc-taxi/ \ No newline at end of file diff --git a/taxi_readme.txt b/taxi_readme.txt index ff76da9..7d07f7f 100644 --- a/taxi_readme.txt +++ b/taxi_readme.txt @@ -1,14 +1,24 @@ +----------------------- Taxi Data: ----------------------- + +----------------------- Screenshots: ----------------------- There are screenshots in the screenshots/taxi/ directory showing -output of running code in order to analyze green taxi data +the output of running code in order to analyze green taxi data (in particular) +the processes for analyzing yellow cab and FHV data are virtually identical so screenshots are not provided... + -Source: +----------------------- +Data Source: +----------------------- http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml + + +----------------------- Data Ingest: ----------------------- * For Green Cab Data (screenshot: "ingest"): @@ -24,17 +34,24 @@ All of these commands get one month of data, I ran each of these mutliple times * For taxi-zones (used for ETL): > curl -o taxi-zone.csv https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv ----------- End Data Ingest ------------- -Maven ------------------------- +Finally, load all data into dumbo, e.g. +> hdfs dfs -put yellow* data/yellow/ + + + + +----------------------- +Data ETL +----------------------- +See under etl_code/taxi/nyc-taxi/main/java/ : +IdToNeighborhoodJob, LocalTimeJob, LocalTimeMapper, LocalTimeReducer + I used maven to build/package all of my MapReduce source files, so in order to run any of the below commands, please run (on dumbo): -> cd nyc-taxi +> cd etl_code/taxi/nyc-taxi > mvn clean package -Data Cleaning/Profiling ------------------------- The following removes unnecessary columns and removes malformed data (screenshot: "cleaning"): > cd nyc-taxi > hadoop jar target/nyc-taxi-1.0.jar LocationTimeJob data/green/*.csv data/green/cleaned @@ -43,23 +60,52 @@ Usage: hadoop jar target/nyc-taxi-1.0.jar LocationTimeJob The following adds borough and location/neighborhood information (screenshot: "addBoro"): -> cd nyc-taxi > hadoop jar target/nyc-taxi-1.0.jar IdToNeighborhoodJob data/green/cleaned data/green/withBoro data/taxi_zone.csv Usage: hadoop jar target/nyc-taxi-1.0.jar IdToNeighborhoodJob ----------- End Data Cleaning/Profiling ------------- -Spark -------------------------- -I used sbt to build/package all of my spark source files, + +----------------------- +Data Profiling +----------------------- +See under etl_code/taxi/nyc-taxi/main/java/ : +DataProfiler + +The following output pairs counting the # of occurences of a particular key +for a specified column of the data (screenshot: "profiling"): +> hadoop jar target/nyc-taxi-1.0.jar DataProfiler data/green/*.csv data/green/profile 1 + +Usage: +hadoop jar target/nyc-taxi-1.0.jar IdToNeighborhoodJob + +Where is the + + + +----------------------- +Data Iterations +----------------------- +See under etl_code/taxi/nyc-taxi/main/java/old/ : + +This directory contains code from previous iterations of the process + + + + +----------------------- +Data Joining +----------------------- +See under etl_code/taxi/nyc-spark/ : +DataSchema, JoinWeatherAndFHV, JoinWeatherAndGreen, JoinWeatherAndYellow + +For the next steps, I used sbt to build/package all of my spark source files, so in order to run any of the below commands, please run (on dumbo): > module load sbt -> cd nyc-spark +> cd etl_code/taxi/nyc-spark > sbt package -Data Joining ------------------------- + The following joins taxi (Green cab) and weather data (screenshot: "join1" and "join2"): > cd nyc-spark @@ -69,11 +115,17 @@ Usage: spark2-submit --class JoinWeatherAndGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar Use JoinWeatherAndYellow and JoinWeatherAndFHV for yellow cab and FHV resp. ----------- End Data Joining ------------- + + + +----------------------- Linear Regression ------------------------- -The following create a prediction model and saves it to some directory (screenshot: "linear_reg1" and "linear_reg2"): +----------------------- +See under etl_code/taxi/nyc-spark/ : +PredGreen, RFGreen + +The following creates a prediction model and saves it to some directory (screenshot: "linear_reg1" and "linear_reg2"): > cd nyc-spark > spark2-submit --class RFGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar data/green/joined data/lr/output @@ -90,11 +142,18 @@ Usage: spark2-submit --class PredGreen --master yarn target/scala-2.11/nyc-spark_2.11-0.1.jar As of right now, only green taxi works, and predictions are entirely accurate sorry about that! ----------- End Linear Regression ------------- + + + + +----------------------- Impala Queries ------------------------- -See nyc-impala/ for the SQL commands used to create tables/views and queries for the taxi data +----------------------- +See under etl_code/taxi/nyc-impala/ : +fhv.sql, greentaxi.sql, yellowtaxi.sql + +These files contain SQL commands used to create tables/views and queries for the taxi data Screenshots (and what they show): create_table - create table from joined data create_view - create view with date fields @@ -103,4 +162,4 @@ no_snow_by_boro - taxi usage on days where it didn't snow, grouped by borough num_of_snow_days - count # of snow days by_avg_temp - taxi usage vs. average temp by_avg_temp_brooklyn - taxi usage vs. average temp for a particular borough (Brooklyn) ----------- End Impala Queries ------------- +