-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2-LR-Split-Datasets.sql
99 lines (86 loc) · 3.39 KB
/
2-LR-Split-Datasets.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Prepare Data for Linear Regression
## Create 3 tables for LR model
-- Segmenting the dataset
-- Extract the Month and Year from starttime to know the time when the bike trip started
-- Exclude the bad records from the exploration stage and choose year 2017 and 2018
-- Exclude `tripduration` column outliers by limit duration from min 3 mins and max 3 hours
-- Filter customers empty year of birth from `birth_year` and exclude customers born after 2007.
SELECT EXTRACT (YEAR FROM `starttime`) AS year,
EXTRACT (MONTH FROM starttime) AS month,
COUNT(*) AS total
FROM `bigquery-public-data.new_york_citibike.citibike_trips`
WHERE EXTRACT (YEAR FROM starttime)=2017 OR
EXTRACT (YEAR FROM starttime)=2018 AND
(tripduration>=3*60 AND tripduration <=3*60*60) AND
birth_year is not NULL AND
birth_year < 2007
GROUP BY year, month;
# Create ML model datasets
-- We will use the time frames as a data point to split dataset into 3 tables
# Training_table
-- Include all records from April 2017 to Feburary 2018.
CREATE OR REPLACE TABLE `01_nyc_bike_sharing.training_table` AS
SELECT tripduration/60 as tripduration,
starttime,
stoptime,
start_station_id, start_station_latitude, start_station_longitude, start_station_name,
end_station_id, end_station_latitude, end_station_longitude, end_station_name,
bikeid,
usertype,
birth_year,
gender,
customer_plan
FROM `bigquery-public-data.new_york_citibike.citibike_trips`
WHERE (
(EXTRACT (YEAR FROM starttime)=2017 AND
(EXTRACT (MONTH FROM starttime) >= 04 OR
EXTRACT (MONTH FROM starttime)<= 10))
OR(
EXTRACT (YEAR FROM starttime)=2018 AND
(EXTRACT (MONTH FROM starttime) >= 01 OR
EXTRACT (MONTH FROM starttime)<= 02))
)
AND (tripduration>= 3*60 AND tripduration<=3*60*60)
AND birth_year is not NULL
AND birth_year < 2007;
# Evaluattion_table
-- Include all records from March and April 2018
CREATE OR REPLACE TABLE `01_nyc_bike_sharing.evaluation_table` AS
SELECT tripduration/60 as tripduration,
starttime,
stoptime,
start_station_id, start_station_latitude, start_station_longitude, start_station_name,
end_station_id, end_station_latitude, end_station_longitude, end_station_name,
bikeid,
usertype,
birth_year,
gender,
customer_plan
FROM `bigquery-public-data.new_york_citibike.citibike_trips`
WHERE
(EXTRACT (YEAR FROM starttime)=2018 AND
(EXTRACT (MONTH FROM starttime) = 03 OR
EXTRACT (MONTH FROM starttime)= 04))
AND (tripduration>= 3*60 AND tripduration<=3*60*60)
AND birth_year is not NULL
AND birth_year < 2007;
# Prediction_table
-- Include all records only from the month of May 2018
CREATE OR REPLACE TABLE `01_nyc_bike_sharing.prediction_table` AS
SELECT tripduration/60 as tripduration,
starttime,
stoptime,
start_station_id, start_station_latitude, start_station_longitude, start_station_name,
end_station_id, end_station_latitude, end_station_longitude, end_station_name,
bikeid,
usertype,
birth_year,
gender,
customer_plan
FROM `bigquery-public-data.new_york_citibike.citibike_trips`
WHERE
(EXTRACT (YEAR FROM starttime)= 2018 AND
EXTRACT (MONTH FROM starttime) = 05 )
AND (tripduration>= 3*60 AND tripduration<=3*60*60)
AND birth_year is not NULL
AND birth_year < 2007;