From 00efef0fa04b5612fa4b7c7a4992584d9df37441 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Wed, 25 Dec 2019 11:57:58 +0900 Subject: [PATCH 01/23] initial import for blogpost example --- .../gender_age_prediction/blogpost.dig | 50 +++++++++++++++ .../gender_age_prediction/blogpost.md | 42 +++++++++++++ .../config/blogposts-params.yml | 27 ++++++++ .../queries/blogposts/gender_age.sql | 7 +++ .../queries/blogposts/tfidf_vec.sql | 61 +++++++++++++++++++ .../queries/blogposts/tokenize_en.sql | 32 ++++++++++ 6 files changed, 219 insertions(+) create mode 100644 machine-learning-box/gender_age_prediction/blogpost.dig create mode 100644 machine-learning-box/gender_age_prediction/blogpost.md create mode 100644 machine-learning-box/gender_age_prediction/config/blogposts-params.yml create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql diff --git a/machine-learning-box/gender_age_prediction/blogpost.dig b/machine-learning-box/gender_age_prediction/blogpost.dig new file mode 100644 index 00000000..c5279fbc --- /dev/null +++ b/machine-learning-box/gender_age_prediction/blogpost.dig @@ -0,0 +1,50 @@ +_export: + !include : config/params.yml + td: + database: ${target_db} + engine: hive + priority: ${job_priority} + ++preprocess: + _parallel: true + + +gender_age: + td>: queries/blogposts/gender_age.sql + create_table: gender_age + engine: presto + + +vectorize: + + +tokenize_en: + if>: ${english} + _do: + td>: queries/tokenize_en.sql + create_table: exploded + + +tokenize_ja: + if>: ${japanese} + _do: + td>: queries/tokenize_ja.sql + create_table: exploded + + +feature_vector: + _parallel: true + +# +count_vectorize: +# td>: queries/count_vec.sql +# create_table: cnt_features + + +tfidf_vectorize: + td>: queries/tfidf_vec.sql + create_table: features + ++prepare_input: + td>: queries/joined.sql + create_table: input + engine: hive + ++predict: + _parallel: true + + +rf_predict: + call>: rf_predict.dig diff --git a/machine-learning-box/gender_age_prediction/blogpost.md b/machine-learning-box/gender_age_prediction/blogpost.md new file mode 100644 index 00000000..3f598b1a --- /dev/null +++ b/machine-learning-box/gender_age_prediction/blogpost.md @@ -0,0 +1,42 @@ +Please download dataset from [Kaggle](https://www.kaggle.com/tomlisankie/blog-posts-labeled-with-age-and-gender/download) first. + +Then, you need a Kaggle account for download. Please set your kaggle API credentials in ~/.kaggle/kaggle.json following [this instruction](https://github.com/Kaggle/kaggle-api#api-credentials). + +## Prepare data + +Please download dataset from kaggle and run the following data preprocessing. + + +```sh +pip install kaggle + +chmod 600 ~/.kaggle/kaggle.json +kaggle datasets download tomlisankie/blog-posts-labeled-with-age-and-gender + +unzip blog-posts-labeled-with-age-and-gender.zip + +brew install jq + +echo -e "userid\tpost\tage\tgender" > train.tsv +jq -r -c '.[] | [.post,.age,.gender] | @tsv' train.json | awk '{print NR"\t"$0}' >> train.tsv + +echo -e "userid\tpost\tage\tgender" > test.tsv +jq -r -c '.[] | [.post,.age,.gender] | @tsv' test.json | awk '{print 526812+NR"\t"$0}' >> test.tsv +``` + +## Import data to Treasure Data + +Please import prepared blog post data to Treasure Data as follows: + +```sh +# create database +td db:create blogposts + +# load training data +td table:create blogposts train +td import:auto --auto-create blogposts.train --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./train.tsv + +# load test data +td table:create blogposts test +td import:auto --auto-create blogposts.test --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./test.tsv +``` \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/config/blogposts-params.yml b/machine-learning-box/gender_age_prediction/config/blogposts-params.yml new file mode 100644 index 00000000..3485373e --- /dev/null +++ b/machine-learning-box/gender_age_prediction/config/blogposts-params.yml @@ -0,0 +1,27 @@ +target_db: blogposts + +job_priority: -1 + +# RandomForest +rf_seed: 71 +rf_trees: 15 +rf_num_train_parallel: 4 +rf_max_depth: 30 +min_split: 3 +min_samples_leaf: 1 + +# over/down sampling +min_class_weight: 0.8 +max_class_weight: 4.0 + +# heuristic calibration +f15_factor: 1.0 +f20_factor: 1.0 +f25_factor: 1.0 +f35_factor: 1.0 +f50_factor: 1.0 +m15_factor: 1.0 +m20_factor: 1.0 +m25_factor: 1.0 +m35_factor: 1.0 +m50_factor: 1.0 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql new file mode 100644 index 00000000..de1c99b5 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql @@ -0,0 +1,7 @@ +select + userid, + (if(gender='male','M','F') || cast(cast(round(age / 5) as int) * 5 as varchar) + ) as gender_age +from + source +; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql new file mode 100644 index 00000000..db04b3ba --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql @@ -0,0 +1,61 @@ +WITH term_frequency as ( + select + t1.userid, + t2.word, + t2.freq + from ( + select + userid, + tf(word) as word2freq + from + exploded + group by + userid + ) t1 + LATERAL VIEW explode(word2freq) t2 as word, freq +), +document_frequency AS ( + select + word, + count(distinct userid) docs + from + exploded + group by + word +), +doc_len as ( + select + userid, + count(1) as dl, + avg(count(1)) over () as avgdl, + APPROX_COUNT_DISTINCT(userid) over () as total_docs + from + exploded + group by + userid +), +scores as ( + select + tf.userid, + tf.word, + bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25, + tfidf(tf.freq, df.docs, dl.total_docs) as tfidf + from + term_frequency tf + JOIN document_frequency df ON (tf.word = df.word) + JOIN doc_len dl ON (tf.userid = dl.userid) + where + df.docs >= 2 +) +-- DIGDAG_INSERT_LINE +select + userid, + to_ordered_list(feature(word,bm25),bm25,'-k ${num_features}') as features, + to_ordered_list(feature(word,tfidf),tfidf,'-k ${num_features}') as tfidf_features +-- to_ordered_list(word,bm25,'-k ${num_features}') as bm25_bow_features, +-- to_ordered_list(word,tfidf,'-k ${num_features}') as tfidf_bow_features +from + scores +group by + userid +; \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql new file mode 100644 index 00000000..7c6846d9 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -0,0 +1,32 @@ +WITH userlog as ( + select + td_client_id, + COALESCE(td_description,td_title) as contents + from + tracking + where + TD_TIME_RANGE( + time, + TD_DATE_TRUNC('day', TD_TIME_ADD(TD_SCHEDULED_TIME(),'-${past_ndays}d','JST'), 'JST'), + null, + 'JST' + ) +) +-- DIGDAG_INSERT_LINE +SELECT + l.td_client_id as userid, + translate(r.word,":","\;") as word +FROM + userlog l + LATERAL VIEW explode( + tokenize( + normalize_unicode( + translate(contents,":","\;"), + 'NFKC' + ), + true + ) + ) r as word +WHERE + NOT is_stopword(r.word) AND + length(r.word) >= 2 AND cast(r.word AS double) IS NULL \ No newline at end of file From 4aa1bce9741ee525b7da84bb298abcfe1b08b2e1 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 17:27:43 +0900 Subject: [PATCH 02/23] Update blogposts example --- .../gender_age_prediction/blogpost.dig | 50 ------------------- .../gender_age_prediction/blogposts.dig | 36 +++++++++++++ .../{blogpost.md => blogposts.md} | 26 ++++++---- .../config/blogposts-params.yml | 27 ---------- .../queries/blogposts/gender_age.sql | 2 +- .../queries/blogposts/joined.sql | 14 ++++++ .../queries/blogposts/tfidf_vec.sql | 3 +- .../queries/blogposts/tokenize_en.sql | 28 ++--------- 8 files changed, 71 insertions(+), 115 deletions(-) delete mode 100644 machine-learning-box/gender_age_prediction/blogpost.dig create mode 100644 machine-learning-box/gender_age_prediction/blogposts.dig rename machine-learning-box/gender_age_prediction/{blogpost.md => blogposts.md} (61%) delete mode 100644 machine-learning-box/gender_age_prediction/config/blogposts-params.yml create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql diff --git a/machine-learning-box/gender_age_prediction/blogpost.dig b/machine-learning-box/gender_age_prediction/blogpost.dig deleted file mode 100644 index c5279fbc..00000000 --- a/machine-learning-box/gender_age_prediction/blogpost.dig +++ /dev/null @@ -1,50 +0,0 @@ -_export: - !include : config/params.yml - td: - database: ${target_db} - engine: hive - priority: ${job_priority} - -+preprocess: - _parallel: true - - +gender_age: - td>: queries/blogposts/gender_age.sql - create_table: gender_age - engine: presto - - +vectorize: - - +tokenize_en: - if>: ${english} - _do: - td>: queries/tokenize_en.sql - create_table: exploded - - +tokenize_ja: - if>: ${japanese} - _do: - td>: queries/tokenize_ja.sql - create_table: exploded - - +feature_vector: - _parallel: true - -# +count_vectorize: -# td>: queries/count_vec.sql -# create_table: cnt_features - - +tfidf_vectorize: - td>: queries/tfidf_vec.sql - create_table: features - -+prepare_input: - td>: queries/joined.sql - create_table: input - engine: hive - -+predict: - _parallel: true - - +rf_predict: - call>: rf_predict.dig diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig new file mode 100644 index 00000000..e515e5d7 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -0,0 +1,36 @@ +_export: + !include : config/params.yml + td: + database: ${target_db} + engine: hive + engine_version: experimental + priority: ${job_priority} + ++preprocess: + _parallel: true + + +gender_age: + td>: queries/blogposts/gender_age.sql + create_table: gender_age + engine: presto + + +vectorize: + + +tokenize_en: + td>: queries/blogposts/tokenize_en.sql + create_table: exploded + + +feature_vector: + td>: queries/blogposts/tfidf_vec.sql + create_table: features + ++prepare_input: + td>: queries/blogposts/joined.sql + create_table: input + engine: hive + ++predict: + _parallel: true + + +rf_predict: + call>: rf_predict.dig diff --git a/machine-learning-box/gender_age_prediction/blogpost.md b/machine-learning-box/gender_age_prediction/blogposts.md similarity index 61% rename from machine-learning-box/gender_age_prediction/blogpost.md rename to machine-learning-box/gender_age_prediction/blogposts.md index 3f598b1a..37365951 100644 --- a/machine-learning-box/gender_age_prediction/blogpost.md +++ b/machine-learning-box/gender_age_prediction/blogposts.md @@ -17,11 +17,9 @@ unzip blog-posts-labeled-with-age-and-gender.zip brew install jq -echo -e "userid\tpost\tage\tgender" > train.tsv -jq -r -c '.[] | [.post,.age,.gender] | @tsv' train.json | awk '{print NR"\t"$0}' >> train.tsv - -echo -e "userid\tpost\tage\tgender" > test.tsv -jq -r -c '.[] | [.post,.age,.gender] | @tsv' test.json | awk '{print 526812+NR"\t"$0}' >> test.tsv +echo -e "userid\tpost\tage\tgender" > blogposts.tsv +jq -r -c '.[] | [.post,.age,.gender] | @tsv' train.json | awk '{print NR"\t"$0}' >> blogposts.tsv +jq -r -c '.[] | [.post,.age,.gender] | @tsv' test.json | awk '{print 526812+NR"\t"$0}' >> blogposts.tsv ``` ## Import data to Treasure Data @@ -30,13 +28,19 @@ Please import prepared blog post data to Treasure Data as follows: ```sh # create database -td db:create blogposts +td db:create td_test # load training data -td table:create blogposts train -td import:auto --auto-create blogposts.train --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./train.tsv +td table:create td_test blogposts +td import:auto --auto-create td_test.blogposts --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./blogposts.tsv +``` + +# Run gender-age prediction workflow + +```sh +# Push workflows to Treasure workflow +$ td wf push td_test -# load test data -td table:create blogposts test -td import:auto --auto-create blogposts.test --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./test.tsv +# Run workflow from command line (also runnable from GUI) +$ td wf run blogposts.dig ``` \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/config/blogposts-params.yml b/machine-learning-box/gender_age_prediction/config/blogposts-params.yml deleted file mode 100644 index 3485373e..00000000 --- a/machine-learning-box/gender_age_prediction/config/blogposts-params.yml +++ /dev/null @@ -1,27 +0,0 @@ -target_db: blogposts - -job_priority: -1 - -# RandomForest -rf_seed: 71 -rf_trees: 15 -rf_num_train_parallel: 4 -rf_max_depth: 30 -min_split: 3 -min_samples_leaf: 1 - -# over/down sampling -min_class_weight: 0.8 -max_class_weight: 4.0 - -# heuristic calibration -f15_factor: 1.0 -f20_factor: 1.0 -f25_factor: 1.0 -f35_factor: 1.0 -f50_factor: 1.0 -m15_factor: 1.0 -m20_factor: 1.0 -m25_factor: 1.0 -m35_factor: 1.0 -m50_factor: 1.0 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql index de1c99b5..5bb85df5 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql @@ -3,5 +3,5 @@ select (if(gender='male','M','F') || cast(cast(round(age / 5) as int) * 5 as varchar) ) as gender_age from - source + blogposts ; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql new file mode 100644 index 00000000..6bfdce40 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql @@ -0,0 +1,14 @@ +-- DIGDAG_INSERT_LINE +SELECT + l.userid, + l.features, + r.gender_age, + -- random sampling + rand(42) as rnd, + -- stratified sampling + count(1) over (partition by r.gender_age) as per_label_count, + rank() over (partition by r.gender_age order by rand(41)) as rank_in_label +FROM + features l + LEFT OUTER JOIN gender_age r ON (l.userid = r.userid) +CLUSTER BY rand(43) -- random shuffling with random seed diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql index db04b3ba..e477108d 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql @@ -1,3 +1,4 @@ +-- @TD distribute_strategy: aggressive WITH term_frequency as ( select t1.userid, @@ -52,8 +53,6 @@ select userid, to_ordered_list(feature(word,bm25),bm25,'-k ${num_features}') as features, to_ordered_list(feature(word,tfidf),tfidf,'-k ${num_features}') as tfidf_features --- to_ordered_list(word,bm25,'-k ${num_features}') as bm25_bow_features, --- to_ordered_list(word,tfidf,'-k ${num_features}') as tfidf_bow_features from scores group by diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql index 7c6846d9..0b87638d 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -1,31 +1,11 @@ -WITH userlog as ( - select - td_client_id, - COALESCE(td_description,td_title) as contents - from - tracking - where - TD_TIME_RANGE( - time, - TD_DATE_TRUNC('day', TD_TIME_ADD(TD_SCHEDULED_TIME(),'-${past_ndays}d','JST'), 'JST'), - null, - 'JST' - ) -) -- DIGDAG_INSERT_LINE SELECT - l.td_client_id as userid, - translate(r.word,":","\;") as word + userid, + translate(r.word,':','\;') as word FROM - userlog l + blogposts l LATERAL VIEW explode( - tokenize( - normalize_unicode( - translate(contents,":","\;"), - 'NFKC' - ), - true - ) + tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true) ) r as word WHERE NOT is_stopword(r.word) AND From d5cc7194998dbbf7a789532e9752a845cb9c8166 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 17:48:02 +0900 Subject: [PATCH 03/23] Update workflow --- machine-learning-box/gender_age_prediction/config/params.yml | 3 ++- .../gender_age_prediction/queries/blogposts/tokenize_en.sql | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml index 6de49538..d8430557 100644 --- a/machine-learning-box/gender_age_prediction/config/params.yml +++ b/machine-learning-box/gender_age_prediction/config/params.yml @@ -1,6 +1,7 @@ target_db: td_test -job_priority: -1 +job_priority: 0 +# job_priority: -1 topk_predict: 5 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql index 0b87638d..e87e56a1 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -1,3 +1,4 @@ +-- @TD distribute_strategy: aggressive -- DIGDAG_INSERT_LINE SELECT userid, From 23d0b66adc9a013635f033221655b4b1cd913544 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 18:36:22 +0900 Subject: [PATCH 04/23] commented out --- machine-learning-box/gender_age_prediction/blogposts.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index e515e5d7..02b7e8c0 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -3,7 +3,7 @@ _export: td: database: ${target_db} engine: hive - engine_version: experimental +# engine_version: experimental priority: ${job_priority} +preprocess: From 7abcd4213b7327d4801542932b1b5090fa7f87e4 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 21:22:39 +0900 Subject: [PATCH 05/23] Fixed a query bug --- .../gender_age_prediction/queries/rf/confusion_matrix.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql index f346f816..5fb3cac5 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql @@ -9,7 +9,7 @@ WITH test_data as ( AND rnd > ${train_rate} -- using 30% for testing GROUP BY userid -), +) select l.actual, r.label as predicted, From 972fbdd12ab9ff8e2d7c0e06bba190ffadcdc774 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 23:07:39 +0900 Subject: [PATCH 06/23] Fixed a query error --- machine-learning-box/gender_age_prediction/.ruby-version | 1 + .../gender_age_prediction/queries/rf/confusion_matrix.sql | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 machine-learning-box/gender_age_prediction/.ruby-version diff --git a/machine-learning-box/gender_age_prediction/.ruby-version b/machine-learning-box/gender_age_prediction/.ruby-version new file mode 100644 index 00000000..ec1cf33c --- /dev/null +++ b/machine-learning-box/gender_age_prediction/.ruby-version @@ -0,0 +1 @@ +2.6.3 diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql index 5fb3cac5..66217106 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix.sql @@ -7,8 +7,6 @@ WITH test_data as ( WHERE gender_age is not null AND rnd > ${train_rate} -- using 30% for testing - GROUP BY - userid ) select l.actual, From 9dbb56840074a38562e4d7f8d6237a44f2b52e57 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 23:08:38 +0900 Subject: [PATCH 07/23] Changed parameters --- .../gender_age_prediction/config/params.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml index d8430557..2f4b77e9 100644 --- a/machine-learning-box/gender_age_prediction/config/params.yml +++ b/machine-learning-box/gender_age_prediction/config/params.yml @@ -22,9 +22,9 @@ max_age: 95 # RandomForest rf_seed: 71 -rf_trees: 15 -rf_num_train_parallel: 4 -rf_max_depth: 30 +rf_trees: 10 +rf_num_train_parallel: 5 +rf_max_depth: 10 min_split: 3 min_samples_leaf: 1 From 126107daebbd1679adf13623978a22f34b0d4345 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 23:11:40 +0900 Subject: [PATCH 08/23] Update workflow --- .../gender_age_prediction/blogposts.dig | 4 +- .../queries/blogposts/joined.sql | 14 ----- .../queries/blogposts/tfidf_vec.sql | 60 ------------------- 3 files changed, 2 insertions(+), 76 deletions(-) delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index 02b7e8c0..c30de551 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -21,11 +21,11 @@ _export: create_table: exploded +feature_vector: - td>: queries/blogposts/tfidf_vec.sql + td>: queries/count_vec.sql create_table: features +prepare_input: - td>: queries/blogposts/joined.sql + td>: queries/joined.sql create_table: input engine: hive diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql deleted file mode 100644 index 6bfdce40..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/joined.sql +++ /dev/null @@ -1,14 +0,0 @@ --- DIGDAG_INSERT_LINE -SELECT - l.userid, - l.features, - r.gender_age, - -- random sampling - rand(42) as rnd, - -- stratified sampling - count(1) over (partition by r.gender_age) as per_label_count, - rank() over (partition by r.gender_age order by rand(41)) as rank_in_label -FROM - features l - LEFT OUTER JOIN gender_age r ON (l.userid = r.userid) -CLUSTER BY rand(43) -- random shuffling with random seed diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql deleted file mode 100644 index e477108d..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tfidf_vec.sql +++ /dev/null @@ -1,60 +0,0 @@ --- @TD distribute_strategy: aggressive -WITH term_frequency as ( - select - t1.userid, - t2.word, - t2.freq - from ( - select - userid, - tf(word) as word2freq - from - exploded - group by - userid - ) t1 - LATERAL VIEW explode(word2freq) t2 as word, freq -), -document_frequency AS ( - select - word, - count(distinct userid) docs - from - exploded - group by - word -), -doc_len as ( - select - userid, - count(1) as dl, - avg(count(1)) over () as avgdl, - APPROX_COUNT_DISTINCT(userid) over () as total_docs - from - exploded - group by - userid -), -scores as ( - select - tf.userid, - tf.word, - bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25, - tfidf(tf.freq, df.docs, dl.total_docs) as tfidf - from - term_frequency tf - JOIN document_frequency df ON (tf.word = df.word) - JOIN doc_len dl ON (tf.userid = dl.userid) - where - df.docs >= 2 -) --- DIGDAG_INSERT_LINE -select - userid, - to_ordered_list(feature(word,bm25),bm25,'-k ${num_features}') as features, - to_ordered_list(feature(word,tfidf),tfidf,'-k ${num_features}') as tfidf_features -from - scores -group by - userid -; \ No newline at end of file From 9d320656c88cd78593bc0a1340a8700759cc9589 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Thu, 26 Dec 2019 23:56:32 +0900 Subject: [PATCH 09/23] Changed age range --- .../queries/blogposts/gender_age.sql | 9 +++++++-- .../queries/blogposts/gender_age.sql~ | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql index 5bb85df5..9cb58f3b 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql @@ -1,7 +1,12 @@ select userid, - (if(gender='male','M','F') || cast(cast(round(age / 5) as int) * 5 as varchar) - ) as gender_age + (if(gender='male','M','F') || + CASE + WHEN age >= 35 THEN '35' + WHEN age >= 25 THEN '25' + ELSE cast(cast(round(age / 5) as int) * 5 as varchar) + END + ) as gender_age -- 35~, 25~, 20~, 15~, 10~ from blogposts ; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ new file mode 100644 index 00000000..5bb85df5 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ @@ -0,0 +1,7 @@ +select + userid, + (if(gender='male','M','F') || cast(cast(round(age / 5) as int) * 5 as varchar) + ) as gender_age +from + blogposts +; From 3eca0b9f8ef03f62c9cf9206f632f54a206a1543 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Dec 2019 00:06:43 +0900 Subject: [PATCH 10/23] Add engine version --- machine-learning-box/gender_age_prediction/blogposts.dig | 1 + 1 file changed, 1 insertion(+) diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index c30de551..e75894af 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -23,6 +23,7 @@ _export: +feature_vector: td>: queries/count_vec.sql create_table: features + engine_version: experimental +prepare_input: td>: queries/joined.sql From 8ca7821d5d35a0243d49c0ea4036ae75dd6faec9 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Dec 2019 00:18:29 +0900 Subject: [PATCH 11/23] Updated workflow --- .../gender_age_prediction/blogposts.dig | 26 +----- .../queries/blogposts/gender_age.sql~ | 7 -- .../queries/blogposts/tokenize_and_ftvec.sql | 80 +++++++++++++++++++ .../queries/blogposts/tokenize_en.sql | 13 --- 4 files changed, 83 insertions(+), 43 deletions(-) delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index e75894af..7d1e084a 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -3,32 +3,12 @@ _export: td: database: ${target_db} engine: hive -# engine_version: experimental priority: ${job_priority} -+preprocess: - _parallel: true - - +gender_age: - td>: queries/blogposts/gender_age.sql - create_table: gender_age - engine: presto - - +vectorize: - - +tokenize_en: - td>: queries/blogposts/tokenize_en.sql - create_table: exploded - - +feature_vector: - td>: queries/count_vec.sql - create_table: features - engine_version: experimental - -+prepare_input: - td>: queries/joined.sql ++vectorize: + td>: queries/blogposts/tokenize_and_ftvec.sql create_table: input - engine: hive + engine_version: experimental +predict: _parallel: true diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ deleted file mode 100644 index 5bb85df5..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql~ +++ /dev/null @@ -1,7 +0,0 @@ -select - userid, - (if(gender='male','M','F') || cast(cast(round(age / 5) as int) * 5 as varchar) - ) as gender_age -from - blogposts -; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql new file mode 100644 index 00000000..9822ca77 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql @@ -0,0 +1,80 @@ +-- @TD distribute_strategy: aggressive +WITH exploded as ( + SELECT + userid, + translate(r.word,':','\;') as word + FROM + blogposts l + LATERAL VIEW explode( + tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true) + ) r as word + WHERE + NOT is_stopword(r.word) AND + length(r.word) >= 2 AND cast(r.word AS double) IS NULL +), +document_frequency AS ( + select + word, + count(distinct userid) docs + from + exploded + group by + word + having + count(distinct userid) >= 2 +), +wordcnt as ( + select + l.userid, + l.word, + ln(1+count(1)) as cnt -- logscale count + from + exploded l + LEFT SEMI JOIN document_frequency r ON (l.word = r.word) + group by + l.userid, + l.word +), +rescaled as ( + select + userid, + word, + rescale(cnt, min(cnt) over (partition by word), max(cnt) over (partition by word)) as value + from + wordcnt +), +ftvec as ( + select + userid, + to_ordered_list(feature(word,value), value, '-k ${num_features}') as features + from + rescaled + group by + userid +), +ages as ( + select + userid, + (if(gender='male','M','F') || + CASE + WHEN age >= 35 THEN '35' + WHEN age >= 25 THEN '25' + ELSE cast(round(age / 5) as int) * 5 + END + ) as gender_age -- 35~, 25~, 20~, 15~, 10~ + from + blogposts +) +-- DIGDAG_INSERT_LINE +SELECT + l.userid, + l.features, + r.gender_age, + -- random sampling + rand(42) as rnd, + -- stratified sampling + count(1) over (partition by r.gender_age) as per_label_count, + rank() over (partition by r.gender_age order by rand(41)) as rank_in_label +FROM + ftvec l + JOIN ages r ON (l.userid = r.userid) \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql deleted file mode 100644 index e87e56a1..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql +++ /dev/null @@ -1,13 +0,0 @@ --- @TD distribute_strategy: aggressive --- DIGDAG_INSERT_LINE -SELECT - userid, - translate(r.word,':','\;') as word -FROM - blogposts l - LATERAL VIEW explode( - tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true) - ) r as word -WHERE - NOT is_stopword(r.word) AND - length(r.word) >= 2 AND cast(r.word AS double) IS NULL \ No newline at end of file From 20d203b33cbd98207b2e26e6ecd18a55fe5ae2f5 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Dec 2019 07:57:18 +0900 Subject: [PATCH 12/23] Fixed a query error --- .../gender_age_prediction/queries/rf/predict.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql index 08098af9..e2877092 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql @@ -32,7 +32,7 @@ SELECT l.predicted.probabilities[6] * ${m20_factor}, -- M20 l.predicted.probabilities[7] * ${m25_factor}, -- M25~ l.predicted.probabilities[8] * ${m35_factor}, -- M35~ - l.predicted.probabilities[9] * ${m50_factor}, -- M50~ + l.predicted.probabilities[9] * ${m50_factor} -- M50~ ) as probabilities FROM t2 l From 22936fff201466e3290f58ac82f7e8df3c2bd73d Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Fri, 27 Dec 2019 08:42:30 +0900 Subject: [PATCH 13/23] Fixed age range --- .../queries/blogposts/gender_age.sql | 8 ++++---- .../queries/blogposts/tokenize_and_ftvec.sql | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql index 9cb58f3b..4c4612a9 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql @@ -2,11 +2,11 @@ select userid, (if(gender='male','M','F') || CASE - WHEN age >= 35 THEN '35' - WHEN age >= 25 THEN '25' - ELSE cast(cast(round(age / 5) as int) * 5 as varchar) + WHEN age >= 33 THEN '33~48' + WHEN age >= 23 THEN '23~27' + ELSE '13-17' END - ) as gender_age -- 35~, 25~, 20~, 15~, 10~ + ) as gender_age from blogposts ; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql index 9822ca77..36ddc11b 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql @@ -57,11 +57,11 @@ ages as ( userid, (if(gender='male','M','F') || CASE - WHEN age >= 35 THEN '35' - WHEN age >= 25 THEN '25' - ELSE cast(round(age / 5) as int) * 5 + WHEN age >= 33 THEN '33~48' + WHEN age >= 23 THEN '23~27' + ELSE '13-17' END - ) as gender_age -- 35~, 25~, 20~, 15~, 10~ + ) as gender_age from blogposts ) From 7396b19d95eaf767d60c599bf9bd569deecf7401 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 14:51:32 +0900 Subject: [PATCH 14/23] Fixed age range --- .../gender_age_prediction/blogposts.dig | 2 +- .../blogposts_rf_predict.dig | 112 ++++++++++++++++++ .../gender_age_prediction/config/params.yml | 7 ++ .../blogposts/rf/confusion_matrix_pivot.sql | 25 ++++ .../queries/blogposts/rf/predict.sql | 40 +++++++ .../queries/blogposts/rf/topk_predict.sql | 68 +++++++++++ .../queries/blogposts/tokenize_and_ftvec.sql | 6 +- .../queries/rf/confusion_matrix_pivot.sql | 20 ++-- .../queries/rf/predict.sql | 16 +-- .../queries/rf/topk_predict.sql | 16 +-- 10 files changed, 276 insertions(+), 36 deletions(-) create mode 100644 machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index 7d1e084a..5f21e7d3 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -14,4 +14,4 @@ _export: _parallel: true +rf_predict: - call>: rf_predict.dig + call>: blogposts_rf_predict.dig diff --git a/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig b/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig new file mode 100644 index 00000000..e9b62210 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig @@ -0,0 +1,112 @@ +_export: + !include : config/params.yml + td: + database: ${target_db} + engine: hive +# engine_version: experimental + priority: ${job_priority} + ++preparation: + + +label_mapping: + td>: queries/rf/label_mapping.sql + create_table: label_mapping + + +rf_input: + td>: queries/rf/rf_input.sql + create_table: rf_input + + +compute_class_weight: + td>: queries/rf/compute_class_weight.sql + create_table: class_weight + engine: presto + + +prepare_model_tables: + td_ddl>: + empty_tables: ["rf_model_cv", "rf_model"] + ++store_weights: + td>: queries/rf/store_weights.sql + store_last_results: true + ++validate_and_train: + _export: + class_weights: ${td.last_results.weights} + + +cross_validation: + + +parallel_train: + for_range>: + from: 0 + to: ${rf_num_train_parallel} + step: 1 + _parallel: true + _do: + +train: + td>: queries/rf/train_cv.sql + insert_into: rf_model_cv + seed: ${rf_seed + range.index * 100} + + +predict: + td>: queries/rf/predict_cv.sql + create_table: rf_predicted_cv + +# +topk_predict: +# td>: queries/blogposts/rf/topk_predict.sql +# create_table: rf_topk_predict +# +# +evaluation_measures: +# td>: queries/rf/eval.sql + + +eval: + _parallel: true + + +confusion_matrix: + + +confusion_matrix_table: + td>: queries/rf/confusion_matrix.sql + engine: presto + create_table: rf_confusion_matrix + + +confusion_matrix_pivot: + td>: queries/blogposts/rf/confusion_matrix_pivot.sql + engine: presto + + +actual_predict_diff: + td>: queries/rf/actual_predict_diff.sql + engine: presto + +# +heuristic_calibration: +# +# +calibration_prediction: +# td>: queries/rf/calibrate_prediction.sql +# engine: presto +# create_table: rf_predicted_cv_calibrated +# +# +eval_calibration: +# td>: queries/rf/confusion_matrix_calibrated.sql +# engine: presto +# create_table: rf_confusion_matrix_calibrated + + +train_predict: + + +parallel_train: + for_range>: + from: 0 + to: ${rf_num_train_parallel} + step: 1 + _parallel: true + _do: + +train: + td>: queries/rf/train.sql + insert_into: rf_model + seed: ${rf_seed + range.index * 100} + + +prediction: + td>: queries/blogposts/rf/predict.sql + create_table: rf_predicted + + +complement_prediction: + td>: queries/rf/complement_prediction.sql + engine: presto + create_table: rf_complemented diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml index 2f4b77e9..89433453 100644 --- a/machine-learning-box/gender_age_prediction/config/params.yml +++ b/machine-learning-box/gender_age_prediction/config/params.yml @@ -44,6 +44,13 @@ m25_factor: 1.0 m35_factor: 1.0 m50_factor: 1.0 +f10x_factor: 1.0 +f20x_factor: 1.0 +f30x_factor: 1.0 +m10x_factor: 1.0 +m20x_factor: 1.0 +m30x_factor: 1.0 + #f15_factor: 1.1 #f20_factor: 1.4 #f25_factor: 0.75 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql new file mode 100644 index 00000000..a4a28e35 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql @@ -0,0 +1,25 @@ +-- DIGDAG_INSERT_LINE +WITH tmp as ( + SELECT + actual, + max(CASE WHEN predicted = 'F15' THEN cnt ELSE 0 END) AS F15, + max(CASE WHEN predicted = 'F20' THEN cnt ELSE 0 END) AS F20, + max(CASE WHEN predicted = 'F25' THEN cnt ELSE 0 END) AS F25, + max(CASE WHEN predicted = 'F35' THEN cnt ELSE 0 END) AS F35, + max(CASE WHEN predicted = 'F50' THEN cnt ELSE 0 END) AS F50, + max(CASE WHEN predicted = 'M15' THEN cnt ELSE 0 END) AS M15, + max(CASE WHEN predicted = 'M20' THEN cnt ELSE 0 END) AS M20, + max(CASE WHEN predicted = 'M25' THEN cnt ELSE 0 END) AS M25, + max(CASE WHEN predicted = 'M35' THEN cnt ELSE 0 END) AS M35, + max(CASE WHEN predicted = 'M50' THEN cnt ELSE 0 END) AS M50 + FROM rf_confusion_matrix + GROUP BY actual +) +select + actual, + F15,F20,F25,F35,F50, + M15,M20,M25,M35,M50 +from + tmp +order by + actual asc diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql new file mode 100644 index 00000000..e2877092 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql @@ -0,0 +1,40 @@ +-- @TD enable_cartesian_product: true +-- @TD autoconvertjoin: true +WITH t2 as ( + SELECT + userid, + rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted + FROM ( + SELECT + t.userid, + p.model_weight, + tree_predict(p.model_id, p.model, t.features, '-classification') as predicted + FROM + rf_model p + LEFT OUTER JOIN rf_input t + ) t1 + GROUP BY + userid +) +-- DIGDAG_INSERT_LINE +SELECT + l.userid, + r.label, + l.predicted.probability, + l.predicted.probabilities as raw_probability, + array( -- calibration + l.predicted.probabilities[0] * ${f15_factor}, -- F15 + l.predicted.probabilities[1] * ${f20_factor}, -- F20 + l.predicted.probabilities[2] * ${f25_factor}, -- F25~ + l.predicted.probabilities[3] * ${f35_factor}, -- F35~ + l.predicted.probabilities[4] * ${f50_factor}, -- F50~ + l.predicted.probabilities[5] * ${m15_factor}, -- M15 + l.predicted.probabilities[6] * ${m20_factor}, -- M20 + l.predicted.probabilities[7] * ${m25_factor}, -- M25~ + l.predicted.probabilities[8] * ${m35_factor}, -- M35~ + l.predicted.probabilities[9] * ${m50_factor} -- M50~ + ) as probabilities +FROM + t2 l + JOIN label_mapping r ON (l.predicted.label = r.label_id) + diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql new file mode 100644 index 00000000..23af17e4 --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql @@ -0,0 +1,68 @@ +WITH test_data as ( + SELECT + userid, collect_set(gender_age) as actual + FROM + rf_input + WHERE + gender_age is not null + AND rnd > ${train_rate} -- using 30% for testing + GROUP BY + userid +), +calibrated_prediction as ( + SELECT + userid, + array( -- calibration + probabilities[0] * ${f15_factor}, -- F15 + probabilities[1] * ${f20_factor}, -- F20 + probabilities[2] * ${f25_factor}, -- F25 + probabilities[3] * ${f35_factor}, -- F35 + probabilities[4] * ${f50_factor}, -- F50 + probabilities[5] * ${m15_factor}, -- M15 + probabilities[6] * ${m20_factor}, -- M20 + probabilities[7] * ${m25_factor}, -- M25 + probabilities[8] * ${m35_factor}, -- M35 + probabilities[9] * ${m50_factor} -- M50 + ) as probabilities + FROM + rf_predicted_cv +), +exploded as ( + select + l.userid, + r.pos, + r.prob + from + calibrated_prediction l + LATERAL VIEW posexplode(l.probabilities) r as pos, prob +), +predicted as ( + select + l.userid, + to_ordered_list( + r.label, -- value + l.prob, -- key + '-k ${topk_predict}' + ) as predicted, + to_ordered_list( + concat(r.label, ':', l.prob), -- value + l.prob, -- key + '-k ${topk_predict}' + ) as predicted_with_weight + from + exploded l + JOIN label_mapping r ON (l.pos = r.label_id) + where + l.prob > 0 + group by + 1 +) +-- DIGDAG_INSERT_LINE +SELECT + l.userid, + l.actual, + r.predicted, + r.predicted_with_weight +FROM + test_data l + JOIN predicted r ON (l.userid = r.userid) \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql index 36ddc11b..6b2670b7 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql @@ -57,9 +57,9 @@ ages as ( userid, (if(gender='male','M','F') || CASE - WHEN age >= 33 THEN '33~48' - WHEN age >= 23 THEN '23~27' - ELSE '13-17' + WHEN age >= 33 THEN '3x' + WHEN age >= 23 THEN '2x' + ELSE '1x' END ) as gender_age from diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql index a4a28e35..9b85cff9 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql @@ -2,23 +2,19 @@ WITH tmp as ( SELECT actual, - max(CASE WHEN predicted = 'F15' THEN cnt ELSE 0 END) AS F15, - max(CASE WHEN predicted = 'F20' THEN cnt ELSE 0 END) AS F20, - max(CASE WHEN predicted = 'F25' THEN cnt ELSE 0 END) AS F25, - max(CASE WHEN predicted = 'F35' THEN cnt ELSE 0 END) AS F35, - max(CASE WHEN predicted = 'F50' THEN cnt ELSE 0 END) AS F50, - max(CASE WHEN predicted = 'M15' THEN cnt ELSE 0 END) AS M15, - max(CASE WHEN predicted = 'M20' THEN cnt ELSE 0 END) AS M20, - max(CASE WHEN predicted = 'M25' THEN cnt ELSE 0 END) AS M25, - max(CASE WHEN predicted = 'M35' THEN cnt ELSE 0 END) AS M35, - max(CASE WHEN predicted = 'M50' THEN cnt ELSE 0 END) AS M50 + max(CASE WHEN predicted = 'F10x' THEN cnt ELSE 0 END) AS F10x, + max(CASE WHEN predicted = 'F20x' THEN cnt ELSE 0 END) AS F20x, + max(CASE WHEN predicted = 'F30x' THEN cnt ELSE 0 END) AS F30x, + max(CASE WHEN predicted = 'M10x' THEN cnt ELSE 0 END) AS M10x, + max(CASE WHEN predicted = 'M20x' THEN cnt ELSE 0 END) AS M20x, + max(CASE WHEN predicted = 'M30x' THEN cnt ELSE 0 END) AS M30x FROM rf_confusion_matrix GROUP BY actual ) select actual, - F15,F20,F25,F35,F50, - M15,M20,M25,M35,M50 + F10x,F20x,F30x, + M10x,M20x,M30x from tmp order by diff --git a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql index e2877092..5eaca03f 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql @@ -23,16 +23,12 @@ SELECT l.predicted.probability, l.predicted.probabilities as raw_probability, array( -- calibration - l.predicted.probabilities[0] * ${f15_factor}, -- F15 - l.predicted.probabilities[1] * ${f20_factor}, -- F20 - l.predicted.probabilities[2] * ${f25_factor}, -- F25~ - l.predicted.probabilities[3] * ${f35_factor}, -- F35~ - l.predicted.probabilities[4] * ${f50_factor}, -- F50~ - l.predicted.probabilities[5] * ${m15_factor}, -- M15 - l.predicted.probabilities[6] * ${m20_factor}, -- M20 - l.predicted.probabilities[7] * ${m25_factor}, -- M25~ - l.predicted.probabilities[8] * ${m35_factor}, -- M35~ - l.predicted.probabilities[9] * ${m50_factor} -- M50~ + l.predicted.probabilities[0] * ${f10x_factor}, -- F10x + l.predicted.probabilities[1] * ${f20x_factor}, -- F20x + l.predicted.probabilities[2] * ${f30x_factor}, -- F30x + l.predicted.probabilities[3] * ${m10x_factor}, -- M10x + l.predicted.probabilities[4] * ${m20x_factor}, -- M20x + l.predicted.probabilities[5] * ${m30x_factor} -- M30x ) as probabilities FROM t2 l diff --git a/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql index 23af17e4..1bf538b0 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql @@ -13,16 +13,12 @@ calibrated_prediction as ( SELECT userid, array( -- calibration - probabilities[0] * ${f15_factor}, -- F15 - probabilities[1] * ${f20_factor}, -- F20 - probabilities[2] * ${f25_factor}, -- F25 - probabilities[3] * ${f35_factor}, -- F35 - probabilities[4] * ${f50_factor}, -- F50 - probabilities[5] * ${m15_factor}, -- M15 - probabilities[6] * ${m20_factor}, -- M20 - probabilities[7] * ${m25_factor}, -- M25 - probabilities[8] * ${m35_factor}, -- M35 - probabilities[9] * ${m50_factor} -- M50 + probabilities[0] * ${f10x_factor}, -- F10x + probabilities[1] * ${f20x_factor}, -- F20x + probabilities[2] * ${f30x_factor}, -- F30x + probabilities[3] * ${m10x_factor}, -- M10x + probabilities[4] * ${m20x_factor}, -- M20x + probabilities[5] * ${m30x_factor} -- M30x ) as probabilities FROM rf_predicted_cv From 392676354b0e2d33a36cb95935e1aad39dcef801 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 15:48:02 +0900 Subject: [PATCH 15/23] Fixed blogpost workflow --- .../blogposts/rf/confusion_matrix_pivot.sql | 20 ++++++++----------- .../queries/blogposts/rf/predict.sql | 16 ++++++--------- .../queries/blogposts/rf/topk_predict.sql | 16 ++++++--------- 3 files changed, 20 insertions(+), 32 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql index a4a28e35..9b85cff9 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql @@ -2,23 +2,19 @@ WITH tmp as ( SELECT actual, - max(CASE WHEN predicted = 'F15' THEN cnt ELSE 0 END) AS F15, - max(CASE WHEN predicted = 'F20' THEN cnt ELSE 0 END) AS F20, - max(CASE WHEN predicted = 'F25' THEN cnt ELSE 0 END) AS F25, - max(CASE WHEN predicted = 'F35' THEN cnt ELSE 0 END) AS F35, - max(CASE WHEN predicted = 'F50' THEN cnt ELSE 0 END) AS F50, - max(CASE WHEN predicted = 'M15' THEN cnt ELSE 0 END) AS M15, - max(CASE WHEN predicted = 'M20' THEN cnt ELSE 0 END) AS M20, - max(CASE WHEN predicted = 'M25' THEN cnt ELSE 0 END) AS M25, - max(CASE WHEN predicted = 'M35' THEN cnt ELSE 0 END) AS M35, - max(CASE WHEN predicted = 'M50' THEN cnt ELSE 0 END) AS M50 + max(CASE WHEN predicted = 'F10x' THEN cnt ELSE 0 END) AS F10x, + max(CASE WHEN predicted = 'F20x' THEN cnt ELSE 0 END) AS F20x, + max(CASE WHEN predicted = 'F30x' THEN cnt ELSE 0 END) AS F30x, + max(CASE WHEN predicted = 'M10x' THEN cnt ELSE 0 END) AS M10x, + max(CASE WHEN predicted = 'M20x' THEN cnt ELSE 0 END) AS M20x, + max(CASE WHEN predicted = 'M30x' THEN cnt ELSE 0 END) AS M30x FROM rf_confusion_matrix GROUP BY actual ) select actual, - F15,F20,F25,F35,F50, - M15,M20,M25,M35,M50 + F10x,F20x,F30x, + M10x,M20x,M30x from tmp order by diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql index e2877092..5eaca03f 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql @@ -23,16 +23,12 @@ SELECT l.predicted.probability, l.predicted.probabilities as raw_probability, array( -- calibration - l.predicted.probabilities[0] * ${f15_factor}, -- F15 - l.predicted.probabilities[1] * ${f20_factor}, -- F20 - l.predicted.probabilities[2] * ${f25_factor}, -- F25~ - l.predicted.probabilities[3] * ${f35_factor}, -- F35~ - l.predicted.probabilities[4] * ${f50_factor}, -- F50~ - l.predicted.probabilities[5] * ${m15_factor}, -- M15 - l.predicted.probabilities[6] * ${m20_factor}, -- M20 - l.predicted.probabilities[7] * ${m25_factor}, -- M25~ - l.predicted.probabilities[8] * ${m35_factor}, -- M35~ - l.predicted.probabilities[9] * ${m50_factor} -- M50~ + l.predicted.probabilities[0] * ${f10x_factor}, -- F10x + l.predicted.probabilities[1] * ${f20x_factor}, -- F20x + l.predicted.probabilities[2] * ${f30x_factor}, -- F30x + l.predicted.probabilities[3] * ${m10x_factor}, -- M10x + l.predicted.probabilities[4] * ${m20x_factor}, -- M20x + l.predicted.probabilities[5] * ${m30x_factor} -- M30x ) as probabilities FROM t2 l diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql index 23af17e4..1bf538b0 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql @@ -13,16 +13,12 @@ calibrated_prediction as ( SELECT userid, array( -- calibration - probabilities[0] * ${f15_factor}, -- F15 - probabilities[1] * ${f20_factor}, -- F20 - probabilities[2] * ${f25_factor}, -- F25 - probabilities[3] * ${f35_factor}, -- F35 - probabilities[4] * ${f50_factor}, -- F50 - probabilities[5] * ${m15_factor}, -- M15 - probabilities[6] * ${m20_factor}, -- M20 - probabilities[7] * ${m25_factor}, -- M25 - probabilities[8] * ${m35_factor}, -- M35 - probabilities[9] * ${m50_factor} -- M50 + probabilities[0] * ${f10x_factor}, -- F10x + probabilities[1] * ${f20x_factor}, -- F20x + probabilities[2] * ${f30x_factor}, -- F30x + probabilities[3] * ${m10x_factor}, -- M10x + probabilities[4] * ${m20x_factor}, -- M20x + probabilities[5] * ${m30x_factor} -- M30x ) as probabilities FROM rf_predicted_cv From b8e136470954ffb832184355af4ba37cb66c8590 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 15:49:33 +0900 Subject: [PATCH 16/23] Reverted changes --- .../queries/rf/confusion_matrix_pivot.sql | 22 +++++++++++-------- .../queries/rf/predict.sql | 19 +++++++++------- .../queries/rf/topk_predict.sql | 16 +++++++++----- 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql index 9b85cff9..de8f4e9a 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/confusion_matrix_pivot.sql @@ -2,20 +2,24 @@ WITH tmp as ( SELECT actual, - max(CASE WHEN predicted = 'F10x' THEN cnt ELSE 0 END) AS F10x, - max(CASE WHEN predicted = 'F20x' THEN cnt ELSE 0 END) AS F20x, - max(CASE WHEN predicted = 'F30x' THEN cnt ELSE 0 END) AS F30x, - max(CASE WHEN predicted = 'M10x' THEN cnt ELSE 0 END) AS M10x, - max(CASE WHEN predicted = 'M20x' THEN cnt ELSE 0 END) AS M20x, - max(CASE WHEN predicted = 'M30x' THEN cnt ELSE 0 END) AS M30x + max(CASE WHEN predicted = 'F15' THEN cnt ELSE 0 END) AS F15, + max(CASE WHEN predicted = 'F20' THEN cnt ELSE 0 END) AS F20, + max(CASE WHEN predicted = 'F25' THEN cnt ELSE 0 END) AS F25, + max(CASE WHEN predicted = 'F35' THEN cnt ELSE 0 END) AS F35, + max(CASE WHEN predicted = 'F50' THEN cnt ELSE 0 END) AS F50, + max(CASE WHEN predicted = 'M15' THEN cnt ELSE 0 END) AS M15, + max(CASE WHEN predicted = 'M20' THEN cnt ELSE 0 END) AS M20, + max(CASE WHEN predicted = 'M25' THEN cnt ELSE 0 END) AS M25, + max(CASE WHEN predicted = 'M35' THEN cnt ELSE 0 END) AS M35, + max(CASE WHEN predicted = 'M50' THEN cnt ELSE 0 END) AS M50 FROM rf_confusion_matrix GROUP BY actual ) select actual, - F10x,F20x,F30x, - M10x,M20x,M30x + F15,F20,F25,F35,F50, + M15,M20,M25,M35,M50 from tmp order by - actual asc + actual asc \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql index 5eaca03f..fa159529 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/predict.sql @@ -23,14 +23,17 @@ SELECT l.predicted.probability, l.predicted.probabilities as raw_probability, array( -- calibration - l.predicted.probabilities[0] * ${f10x_factor}, -- F10x - l.predicted.probabilities[1] * ${f20x_factor}, -- F20x - l.predicted.probabilities[2] * ${f30x_factor}, -- F30x - l.predicted.probabilities[3] * ${m10x_factor}, -- M10x - l.predicted.probabilities[4] * ${m20x_factor}, -- M20x - l.predicted.probabilities[5] * ${m30x_factor} -- M30x + l.predicted.probabilities[0] * ${f15_factor}, -- F15 + l.predicted.probabilities[1] * ${f20_factor}, -- F20 + l.predicted.probabilities[2] * ${f25_factor}, -- F25~ + l.predicted.probabilities[3] * ${f35_factor}, -- F35~ + l.predicted.probabilities[4] * ${f50_factor}, -- F50~ + l.predicted.probabilities[5] * ${m15_factor}, -- M15 + l.predicted.probabilities[6] * ${m20_factor}, -- M20 + l.predicted.probabilities[7] * ${m25_factor}, -- M25~ + l.predicted.probabilities[8] * ${m35_factor}, -- M35~ + l.predicted.probabilities[9] * ${m50_factor}, -- M50~ ) as probabilities FROM t2 l - JOIN label_mapping r ON (l.predicted.label = r.label_id) - + JOIN label_mapping r ON (l.predicted.label = r.label_id) \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql index 1bf538b0..23af17e4 100644 --- a/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/rf/topk_predict.sql @@ -13,12 +13,16 @@ calibrated_prediction as ( SELECT userid, array( -- calibration - probabilities[0] * ${f10x_factor}, -- F10x - probabilities[1] * ${f20x_factor}, -- F20x - probabilities[2] * ${f30x_factor}, -- F30x - probabilities[3] * ${m10x_factor}, -- M10x - probabilities[4] * ${m20x_factor}, -- M20x - probabilities[5] * ${m30x_factor} -- M30x + probabilities[0] * ${f15_factor}, -- F15 + probabilities[1] * ${f20_factor}, -- F20 + probabilities[2] * ${f25_factor}, -- F25 + probabilities[3] * ${f35_factor}, -- F35 + probabilities[4] * ${f50_factor}, -- F50 + probabilities[5] * ${m15_factor}, -- M15 + probabilities[6] * ${m20_factor}, -- M20 + probabilities[7] * ${m25_factor}, -- M25 + probabilities[8] * ${m35_factor}, -- M35 + probabilities[9] * ${m50_factor} -- M50 ) as probabilities FROM rf_predicted_cv From 3a7f21cdd5a53e01abd447022cbb12f427bc3b36 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 16:09:38 +0900 Subject: [PATCH 17/23] Fixed workflow --- .../gender_age_prediction/config/params.yml | 12 ++++++------ .../queries/blogposts/gender_age.sql | 6 +++--- .../blogposts/rf/confusion_matrix_pivot.sql | 16 ++++++++-------- .../queries/blogposts/rf/predict.sql | 12 ++++++------ .../queries/blogposts/rf/topk_predict.sql | 12 ++++++------ 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml index 89433453..05908c09 100644 --- a/machine-learning-box/gender_age_prediction/config/params.yml +++ b/machine-learning-box/gender_age_prediction/config/params.yml @@ -44,12 +44,12 @@ m25_factor: 1.0 m35_factor: 1.0 m50_factor: 1.0 -f10x_factor: 1.0 -f20x_factor: 1.0 -f30x_factor: 1.0 -m10x_factor: 1.0 -m20x_factor: 1.0 -m30x_factor: 1.0 +f1x_factor: 1.0 +f2x_factor: 1.0 +f3x_factor: 1.0 +m1x_factor: 1.0 +m2x_factor: 1.0 +m3x_factor: 1.0 #f15_factor: 1.1 #f20_factor: 1.4 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql index 4c4612a9..ffee869c 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql @@ -2,9 +2,9 @@ select userid, (if(gender='male','M','F') || CASE - WHEN age >= 33 THEN '33~48' - WHEN age >= 23 THEN '23~27' - ELSE '13-17' + WHEN age >= 33 THEN '3x' + WHEN age >= 23 THEN '2x' + ELSE '1x' END ) as gender_age from diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql index 9b85cff9..ed1fa392 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/confusion_matrix_pivot.sql @@ -2,19 +2,19 @@ WITH tmp as ( SELECT actual, - max(CASE WHEN predicted = 'F10x' THEN cnt ELSE 0 END) AS F10x, - max(CASE WHEN predicted = 'F20x' THEN cnt ELSE 0 END) AS F20x, - max(CASE WHEN predicted = 'F30x' THEN cnt ELSE 0 END) AS F30x, - max(CASE WHEN predicted = 'M10x' THEN cnt ELSE 0 END) AS M10x, - max(CASE WHEN predicted = 'M20x' THEN cnt ELSE 0 END) AS M20x, - max(CASE WHEN predicted = 'M30x' THEN cnt ELSE 0 END) AS M30x + max(CASE WHEN predicted = 'F1x' THEN cnt ELSE 0 END) AS F1x, + max(CASE WHEN predicted = 'F2x' THEN cnt ELSE 0 END) AS F2x, + max(CASE WHEN predicted = 'F3x' THEN cnt ELSE 0 END) AS F3x, + max(CASE WHEN predicted = 'M1x' THEN cnt ELSE 0 END) AS M1x, + max(CASE WHEN predicted = 'M2x' THEN cnt ELSE 0 END) AS M2x, + max(CASE WHEN predicted = 'M3x' THEN cnt ELSE 0 END) AS M3x FROM rf_confusion_matrix GROUP BY actual ) select actual, - F10x,F20x,F30x, - M10x,M20x,M30x + F1x,F2x,F3x, + M1x,M2x,M3x from tmp order by diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql index 5eaca03f..d743a0cc 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/predict.sql @@ -23,12 +23,12 @@ SELECT l.predicted.probability, l.predicted.probabilities as raw_probability, array( -- calibration - l.predicted.probabilities[0] * ${f10x_factor}, -- F10x - l.predicted.probabilities[1] * ${f20x_factor}, -- F20x - l.predicted.probabilities[2] * ${f30x_factor}, -- F30x - l.predicted.probabilities[3] * ${m10x_factor}, -- M10x - l.predicted.probabilities[4] * ${m20x_factor}, -- M20x - l.predicted.probabilities[5] * ${m30x_factor} -- M30x + l.predicted.probabilities[0] * ${f1x_factor}, -- F1x + l.predicted.probabilities[1] * ${f2x_factor}, -- F2x + l.predicted.probabilities[2] * ${f3x_factor}, -- F3x + l.predicted.probabilities[3] * ${m1x_factor}, -- M1x + l.predicted.probabilities[4] * ${m2x_factor}, -- M2x + l.predicted.probabilities[5] * ${m3x_factor} -- M3x ) as probabilities FROM t2 l diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql index 1bf538b0..849067dc 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/rf/topk_predict.sql @@ -13,12 +13,12 @@ calibrated_prediction as ( SELECT userid, array( -- calibration - probabilities[0] * ${f10x_factor}, -- F10x - probabilities[1] * ${f20x_factor}, -- F20x - probabilities[2] * ${f30x_factor}, -- F30x - probabilities[3] * ${m10x_factor}, -- M10x - probabilities[4] * ${m20x_factor}, -- M20x - probabilities[5] * ${m30x_factor} -- M30x + probabilities[0] * ${f1x_factor}, -- F1x + probabilities[1] * ${f2x_factor}, -- F2x + probabilities[2] * ${f3x_factor}, -- F3x + probabilities[3] * ${m1x_factor}, -- M1x + probabilities[4] * ${m2x_factor}, -- M2x + probabilities[5] * ${m3x_factor} -- M3x ) as probabilities FROM rf_predicted_cv From c4349482c28f7a73bfdde9eb66943f1474701dc7 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 16:45:13 +0900 Subject: [PATCH 18/23] Changed feature vector scheme --- .../gender_age_prediction/config/params.yml | 2 +- .../queries/blogposts/gender_age.sql | 12 ------------ .../queries/blogposts/tokenize_and_ftvec.sql | 14 +++----------- 3 files changed, 4 insertions(+), 24 deletions(-) delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql diff --git a/machine-learning-box/gender_age_prediction/config/params.yml b/machine-learning-box/gender_age_prediction/config/params.yml index 05908c09..ad8152a4 100644 --- a/machine-learning-box/gender_age_prediction/config/params.yml +++ b/machine-learning-box/gender_age_prediction/config/params.yml @@ -30,7 +30,7 @@ min_samples_leaf: 1 # over/down sampling min_class_weight: 0.8 -max_class_weight: 4.0 +max_class_weight: 1.2 # heuristic calibration f15_factor: 1.0 diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql deleted file mode 100644 index ffee869c..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/gender_age.sql +++ /dev/null @@ -1,12 +0,0 @@ -select - userid, - (if(gender='male','M','F') || - CASE - WHEN age >= 33 THEN '3x' - WHEN age >= 23 THEN '2x' - ELSE '1x' - END - ) as gender_age -from - blogposts -; diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql index 6b2670b7..2112f616 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql @@ -27,7 +27,7 @@ wordcnt as ( select l.userid, l.word, - ln(1+count(1)) as cnt -- logscale count + count(1) as cnt -- logscale count from exploded l LEFT SEMI JOIN document_frequency r ON (l.word = r.word) @@ -35,20 +35,12 @@ wordcnt as ( l.userid, l.word ), -rescaled as ( - select - userid, - word, - rescale(cnt, min(cnt) over (partition by word), max(cnt) over (partition by word)) as value - from - wordcnt -), ftvec as ( select userid, - to_ordered_list(feature(word,value), value, '-k ${num_features}') as features + to_ordered_list(feature(word,cnt), value, '-k 100') as features from - rescaled + wordcnt group by userid ), From b98f38f6a3ece985a4bcee8ae1d50933f2781dc0 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 17:39:32 +0900 Subject: [PATCH 19/23] Fixed to use tf-idf --- .../gender_age_prediction/blogposts.dig | 6 +- .../queries/blogposts/ftvec.sql | 85 +++++++++++++++++++ .../queries/blogposts/tokenize_and_ftvec.sql | 72 ---------------- .../queries/blogposts/tokenize_en.sql | 11 +++ 4 files changed, 101 insertions(+), 73 deletions(-) create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql delete mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql create mode 100644 machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index 5f21e7d3..aa6809ca 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -6,7 +6,11 @@ _export: priority: ${job_priority} +vectorize: - td>: queries/blogposts/tokenize_and_ftvec.sql + td>: queries/blogposts/tokenize_en.sql + create_table: exploded + ++vectorize: + td>: queries/blogposts/ftvec.sql create_table: input engine_version: experimental diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql new file mode 100644 index 00000000..f75e8a1e --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql @@ -0,0 +1,85 @@ +-- @TD distribute_strategy: aggressive +WITH term_frequency as ( + select + t1.userid, + t2.word, + t2.freq + from ( + select + userid, + tf(word) as word2freq + from + exploded + group by + userid + ) t1 + LATERAL VIEW explode(word2freq) t2 as word, freq +), +document_frequency AS ( + select + word, + count(distinct userid) docs + from + exploded + group by + word +), +doc_len as ( + select + userid, + count(1) as dl, + avg(count(1)) over () as avgdl, + APPROX_COUNT_DISTINCT(userid) over () as total_docs + from + exploded + group by + userid +), +scores as ( + select + tf.userid, + tf.word, + bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25, + tfidf(tf.freq, df.docs, dl.total_docs) as tfidf + from + term_frequency tf + JOIN document_frequency df ON (tf.word = df.word) + JOIN doc_len dl ON (tf.userid = dl.userid) + where + df.docs >= 2 +), +ftvec as ( + select + userid, + to_ordered_list(feature(word,cnt), value, '-k 100') as features + from + wordcnt + group by + userid +), +ages as ( + select + userid, + (if(gender='male','M','F') || + CASE + WHEN age >= 33 THEN '3x' + WHEN age >= 23 THEN '2x' + ELSE '1x' + END + ) as gender_age + from + blogposts +) +-- DIGDAG_INSERT_LINE +SELECT + l.userid, + l.features, + r.gender_age, + -- random sampling + rand(42) as rnd, + -- stratified sampling + count(1) over (partition by r.gender_age) as per_label_count, + rank() over (partition by r.gender_age order by rand(41)) as rank_in_label +FROM + ftvec l + JOIN ages r ON (l.userid = r.userid) \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql deleted file mode 100644 index 2112f616..00000000 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_and_ftvec.sql +++ /dev/null @@ -1,72 +0,0 @@ --- @TD distribute_strategy: aggressive -WITH exploded as ( - SELECT - userid, - translate(r.word,':','\;') as word - FROM - blogposts l - LATERAL VIEW explode( - tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true) - ) r as word - WHERE - NOT is_stopword(r.word) AND - length(r.word) >= 2 AND cast(r.word AS double) IS NULL -), -document_frequency AS ( - select - word, - count(distinct userid) docs - from - exploded - group by - word - having - count(distinct userid) >= 2 -), -wordcnt as ( - select - l.userid, - l.word, - count(1) as cnt -- logscale count - from - exploded l - LEFT SEMI JOIN document_frequency r ON (l.word = r.word) - group by - l.userid, - l.word -), -ftvec as ( - select - userid, - to_ordered_list(feature(word,cnt), value, '-k 100') as features - from - wordcnt - group by - userid -), -ages as ( - select - userid, - (if(gender='male','M','F') || - CASE - WHEN age >= 33 THEN '3x' - WHEN age >= 23 THEN '2x' - ELSE '1x' - END - ) as gender_age - from - blogposts -) --- DIGDAG_INSERT_LINE -SELECT - l.userid, - l.features, - r.gender_age, - -- random sampling - rand(42) as rnd, - -- stratified sampling - count(1) over (partition by r.gender_age) as per_label_count, - rank() over (partition by r.gender_age order by rand(41)) as rank_in_label -FROM - ftvec l - JOIN ages r ON (l.userid = r.userid) \ No newline at end of file diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql new file mode 100644 index 00000000..2631a80c --- /dev/null +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -0,0 +1,11 @@ +SELECT + userid, + translate(r.word,':','\;') as word +FROM + blogposts l + LATERAL VIEW explode( + tokenize(normalize_unicode(translate(post,':','\;'),'NFKC'),true) + ) r as word +WHERE + NOT is_stopword(r.word) AND + length(r.word) >= 2 AND cast(r.word AS double) IS NULL \ No newline at end of file From 15b8cb13fcdc626ec88cdd2aba8ebea51200a4b8 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 17:41:03 +0900 Subject: [PATCH 20/23] Fixed dig file --- machine-learning-box/gender_age_prediction/blogposts.dig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine-learning-box/gender_age_prediction/blogposts.dig b/machine-learning-box/gender_age_prediction/blogposts.dig index aa6809ca..081d1796 100644 --- a/machine-learning-box/gender_age_prediction/blogposts.dig +++ b/machine-learning-box/gender_age_prediction/blogposts.dig @@ -5,7 +5,7 @@ _export: engine: hive priority: ${job_priority} -+vectorize: ++tokenize: td>: queries/blogposts/tokenize_en.sql create_table: exploded From 6cf3d0ff271180ff4afe0bba0c737cc92cde7701 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 17:55:09 +0900 Subject: [PATCH 21/23] Added query hint --- .../gender_age_prediction/queries/blogposts/tokenize_en.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql index 2631a80c..7a43de42 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -1,3 +1,4 @@ +-- @TD distribute_strategy: aggressive SELECT userid, translate(r.word,':','\;') as word From b7988d9da42b66b1a08fb27d220f792e9cd25c90 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 17:59:18 +0900 Subject: [PATCH 22/23] Fixed a query --- .../gender_age_prediction/queries/blogposts/tokenize_en.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql index 7a43de42..e87e56a1 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/tokenize_en.sql @@ -1,4 +1,5 @@ -- @TD distribute_strategy: aggressive +-- DIGDAG_INSERT_LINE SELECT userid, translate(r.word,':','\;') as word From 3ec24f432f67a5736d290923486bc2bbb8a52664 Mon Sep 17 00:00:00 2001 From: Makoto Yui Date: Mon, 6 Jan 2020 20:01:08 +0900 Subject: [PATCH 23/23] Fixed a bug in query --- .../gender_age_prediction/queries/blogposts/ftvec.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql index f75e8a1e..cec397d4 100644 --- a/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql +++ b/machine-learning-box/gender_age_prediction/queries/blogposts/ftvec.sql @@ -39,8 +39,8 @@ scores as ( select tf.userid, tf.word, - bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25, - tfidf(tf.freq, df.docs, dl.total_docs) as tfidf + bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25 + -- tfidf(tf.freq, df.docs, dl.total_docs) as tfidf from term_frequency tf JOIN document_frequency df ON (tf.word = df.word) @@ -51,16 +51,16 @@ scores as ( ftvec as ( select userid, - to_ordered_list(feature(word,cnt), value, '-k 100') as features + to_ordered_list(feature(word,bm25), bm25, '-k 100') as features from - wordcnt + scores group by userid ), ages as ( select userid, - (if(gender='male','M','F') || + concat(if(gender='male','M','F'), CASE WHEN age >= 33 THEN '3x' WHEN age >= 23 THEN '2x'