Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add blogpost gender age prediction example #240

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions machine-learning-box/gender_age_prediction/.ruby-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2.6.3
21 changes: 21 additions & 0 deletions machine-learning-box/gender_age_prediction/blogposts.dig
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Entry-point workflow for the blog-post gender/age example:
# tokenize the posts, build per-document feature vectors, then call the
# RandomForest sub-workflow.
_export:
# Shared parameters (target_db, job_priority, rf_* ...) come from config/params.yml.
!include : config/params.yml
td:
database: ${target_db}
engine: hive
priority: ${job_priority}

# Runs tokenize_en.sql and stores its result in table `exploded`
# (ftvec.sql reads `exploded`).
+tokenize:
td>: queries/blogposts/tokenize_en.sql
create_table: exploded

# Runs ftvec.sql and stores its result in table `input`.
# NOTE(review): engine_version is set to `experimental` only for this step — confirm it is still required.
+vectorize:
td>: queries/blogposts/ftvec.sql
create_table: input
engine_version: experimental

# Prediction stage; child tasks are flagged to run in parallel
# (only the RandomForest sub-workflow is listed at the moment).
+predict:
_parallel: true

# Delegates to the workflow defined in blogposts_rf_predict.dig.
+rf_predict:
call>: blogposts_rf_predict.dig
46 changes: 46 additions & 0 deletions machine-learning-box/gender_age_prediction/blogposts.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
This example uses the [blog posts labeled with age and gender](https://www.kaggle.com/tomlisankie/blog-posts-labeled-with-age-and-gender/download) dataset from Kaggle.

You need a Kaggle account to download it. Please set your Kaggle API credentials in `~/.kaggle/kaggle.json` following [these instructions](https://github.com/Kaggle/kaggle-api#api-credentials).

## Prepare data

Please download the dataset from Kaggle and run the following preprocessing to build `blogposts.tsv`.


```sh
# Install the Kaggle command-line client.
pip install kaggle

# Restrict permissions on the API credential file, then fetch the dataset archive.
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download tomlisankie/blog-posts-labeled-with-age-and-gender

unzip blog-posts-labeled-with-age-and-gender.zip

# jq is used below to convert the JSON files to TSV (Homebrew shown; use your platform's package manager otherwise).
brew install jq

# Write the TSV header, then append one row per post from train.json and test.json.
# awk prepends a sequential userid (the record number NR) to every row.
echo -e "userid\tpost\tage\tgender" > blogposts.tsv
jq -r -c '.[] | [.post,.age,.gender] | @tsv' train.json | awk '{print NR"\t"$0}' >> blogposts.tsv
# NOTE(review): the 526812 offset presumably equals the number of rows in train.json so that
# test ids continue after the train ids — confirm if the dataset changes.
jq -r -c '.[] | [.post,.age,.gender] | @tsv' test.json | awk '{print 526812+NR"\t"$0}' >> blogposts.tsv
```

## Import data to Treasure Data

Please import the prepared blog post data into Treasure Data as follows:

```sh
# create database
td db:create td_test

# load training data
td table:create td_test blogposts
td import:auto --auto-create td_test.blogposts --format tsv --column-header --time-value `date +%s` --column-types "int,string,int,string" ./blogposts.tsv
```

## Run gender-age prediction workflow

```sh
# Push workflows to Treasure workflow
$ td wf push td_test

# Run workflow from command line (also runnable from GUI)
$ td wf run blogposts.dig
```
112 changes: 112 additions & 0 deletions machine-learning-box/gender_age_prediction/blogposts_rf_predict.dig
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# RandomForest sub-workflow: prepare inputs and class weights, cross-validate,
# evaluate, then train the final model and predict.
_export:
# Shared parameters (target_db, job_priority, rf_seed, rf_num_train_parallel ...) come from config/params.yml.
!include : config/params.yml
td:
database: ${target_db}
engine: hive
# engine_version: experimental
priority: ${job_priority}

# --- Preparation: build the lookup/input tables used by training and prediction ---
+preparation:

# Stores the result of label_mapping.sql in table `label_mapping`
# (predict.sql joins it on label_id to recover the label).
+label_mapping:
td>: queries/rf/label_mapping.sql
create_table: label_mapping

# Stores the result of rf_input.sql in table `rf_input` (read by predict.sql).
+rf_input:
td>: queries/rf/rf_input.sql
create_table: rf_input

# Stores the result of compute_class_weight.sql in table `class_weight`; runs on Presto instead of the default Hive engine.
+compute_class_weight:
td>: queries/rf/compute_class_weight.sql
create_table: class_weight
engine: presto

# Resets both model tables so the `insert_into` training steps below start from empty tables.
+prepare_model_tables:
td_ddl>:
empty_tables: ["rf_model_cv", "rf_model"]

# Keeps the query result available as ${td.last_results}; its `weights` column is exported below.
+store_weights:
td>: queries/rf/store_weights.sql
store_last_results: true

# --- Cross-validation, evaluation, and final training/prediction ---
+validate_and_train:
_export:
# Class weights produced by +store_weights, made available to the queries below.
class_weights: ${td.last_results.weights}

+cross_validation:

# Runs train_cv.sql once per index in [from, to) in parallel; every run appends to rf_model_cv.
+parallel_train:
for_range>:
from: 0
to: ${rf_num_train_parallel}
step: 1
_parallel: true
_do:
+train:
td>: queries/rf/train_cv.sql
insert_into: rf_model_cv
# A distinct seed per parallel run, derived from rf_seed and the loop index.
seed: ${rf_seed + range.index * 100}

# Stores the result of predict_cv.sql in table `rf_predicted_cv`.
+predict:
td>: queries/rf/predict_cv.sql
create_table: rf_predicted_cv

# Disabled steps (top-k prediction and evaluation measures), kept for reference.
# +topk_predict:
# td>: queries/blogposts/rf/topk_predict.sql
# create_table: rf_topk_predict
#
# +evaluation_measures:
# td>: queries/rf/eval.sql

# Evaluation tasks; children are flagged to run in parallel.
+eval:
_parallel: true

+confusion_matrix:

# Stores the result of confusion_matrix.sql in table `rf_confusion_matrix` (Presto).
+confusion_matrix_table:
td>: queries/rf/confusion_matrix.sql
engine: presto
create_table: rf_confusion_matrix

# Pivots rf_confusion_matrix into one row per actual label; result is not stored in a table.
+confusion_matrix_pivot:
td>: queries/blogposts/rf/confusion_matrix_pivot.sql
engine: presto

# Runs actual_predict_diff.sql on Presto; result is not stored in a table.
+actual_predict_diff:
td>: queries/rf/actual_predict_diff.sql
engine: presto

# Disabled heuristic calibration steps, kept for reference.
# +heuristic_calibration:
#
# +calibration_prediction:
# td>: queries/rf/calibrate_prediction.sql
# engine: presto
# create_table: rf_predicted_cv_calibrated
#
# +eval_calibration:
# td>: queries/rf/confusion_matrix_calibrated.sql
# engine: presto
# create_table: rf_confusion_matrix_calibrated

# --- Final model: train, then predict ---
+train_predict:

# Same parallel loop as in cross-validation, but runs train.sql and appends to rf_model.
+parallel_train:
for_range>:
from: 0
to: ${rf_num_train_parallel}
step: 1
_parallel: true
_do:
+train:
td>: queries/rf/train.sql
insert_into: rf_model
# A distinct seed per parallel run, derived from rf_seed and the loop index.
seed: ${rf_seed + range.index * 100}

# Stores the result of the blog-post specific predict.sql in table `rf_predicted`.
+prediction:
td>: queries/blogposts/rf/predict.sql
create_table: rf_predicted

# Stores the result of complement_prediction.sql in table `rf_complemented` (Presto).
+complement_prediction:
td>: queries/rf/complement_prediction.sql
engine: presto
create_table: rf_complemented
18 changes: 13 additions & 5 deletions machine-learning-box/gender_age_prediction/config/params.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
target_db: td_test

job_priority: -1
job_priority: 0
# job_priority: -1

topk_predict: 5

Expand All @@ -21,15 +22,15 @@ max_age: 95

# RandomForest
rf_seed: 71
rf_trees: 15
rf_num_train_parallel: 4
rf_max_depth: 30
rf_trees: 10
rf_num_train_parallel: 5
rf_max_depth: 10
min_split: 3
min_samples_leaf: 1

# over/down sampling
min_class_weight: 0.8
max_class_weight: 4.0
max_class_weight: 1.2

# heuristic calibration
f15_factor: 1.0
Expand All @@ -43,6 +44,13 @@ m25_factor: 1.0
m35_factor: 1.0
m50_factor: 1.0

f1x_factor: 1.0
f2x_factor: 1.0
f3x_factor: 1.0
m1x_factor: 1.0
m2x_factor: 1.0
m3x_factor: 1.0

#f15_factor: 1.1
#f20_factor: 1.4
#f25_factor: 0.75
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
-- @TD distribute_strategy: aggressive
--
-- Builds one feature vector per document (userid) from the tokenized
-- `exploded` table (userid, word) and attaches the gender/age class label
-- derived from `blogposts`.
--
-- Output columns:
--   userid          document id
--   features        list of feature(word, bm25) entries selected by
--                   to_ordered_list(..., bm25, '-k 100')
--   gender_age      class label: 'M' or 'F' followed by '1x' / '2x' / '3x'
--   rnd             seeded random number, for random sampling
--   per_label_count number of documents carrying the same label
--   rank_in_label   random rank of the document within its label
--                   (for stratified sampling)
WITH term_frequency as (
  -- Per-document term frequencies: tf() aggregates the words of a document
  -- into word2freq, which is then exploded into (word, freq) rows.
  select
    t1.userid,
    t2.word,
    t2.freq
  from (
    select
      userid,
      tf(word) as word2freq
    from
      exploded
    group by
      userid
  ) t1
  LATERAL VIEW explode(word2freq) t2 as word, freq
),
document_frequency AS (
  -- Number of distinct documents each word occurs in.
  select
    word,
    count(distinct userid) docs
  from
    exploded
  group by
    word
),
doc_len as (
  -- Per-document token count plus two corpus-wide statistics computed over
  -- the grouped rows: average document length and total number of documents.
  select
    userid,
    count(1) as dl,
    avg(count(1)) over () as avgdl,
    -- The window runs after GROUP BY userid, so every input row is exactly
    -- one distinct document; a plain row count is therefore the exact
    -- document total (no approximate distinct count needed).
    count(1) over () as total_docs
  from
    exploded
  group by
    userid
),
scores as (
  -- BM25 score for every (document, word) pair.
  select
    tf.userid,
    tf.word,
    bm25(tf.freq, dl.dl, dl.avgdl, dl.total_docs, df.docs) as bm25
    -- tfidf(tf.freq, df.docs, dl.total_docs) as tfidf
  from
    term_frequency tf
    JOIN document_frequency df ON (tf.word = df.word)
    JOIN doc_len dl ON (tf.userid = dl.userid)
  where
    -- drop words that appear in only a single document
    df.docs >= 2
),
ftvec as (
  -- Collapse the scored words of each document into a single feature list,
  -- ordered by BM25 score and limited by the '-k 100' option.
  select
    userid,
    to_ordered_list(feature(word,bm25), bm25, '-k 100') as features
  from
    scores
  group by
    userid
),
ages as (
  -- Class label: gender prefix ('M' for gender = 'male', otherwise 'F')
  -- followed by an age bucket (>= 33 -> '3x', >= 23 -> '2x', else '1x').
  select
    userid,
    concat(if(gender='male','M','F'),
      CASE
        WHEN age >= 33 THEN '3x'
        WHEN age >= 23 THEN '2x'
        ELSE '1x'
      END
    ) as gender_age
  from
    blogposts
)
-- DIGDAG_INSERT_LINE
SELECT
  l.userid,
  l.features,
  r.gender_age,
  -- random sampling
  rand(42) as rnd,
  -- stratified sampling
  count(1) over (partition by r.gender_age) as per_label_count,
  rank() over (partition by r.gender_age order by rand(41)) as rank_in_label
FROM
  ftvec l
  JOIN ages r ON (l.userid = r.userid)
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- DIGDAG_INSERT_LINE
-- Pivots rf_confusion_matrix (actual, predicted, cnt) into one row per
-- actual label with one count column per predicted label; a missing
-- (actual, predicted) combination shows up as 0.
SELECT
  actual,
  max(CASE predicted WHEN 'F1x' THEN cnt ELSE 0 END) AS F1x,
  max(CASE predicted WHEN 'F2x' THEN cnt ELSE 0 END) AS F2x,
  max(CASE predicted WHEN 'F3x' THEN cnt ELSE 0 END) AS F3x,
  max(CASE predicted WHEN 'M1x' THEN cnt ELSE 0 END) AS M1x,
  max(CASE predicted WHEN 'M2x' THEN cnt ELSE 0 END) AS M2x,
  max(CASE predicted WHEN 'M3x' THEN cnt ELSE 0 END) AS M3x
FROM
  rf_confusion_matrix
GROUP BY
  actual
ORDER BY
  actual ASC
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- @TD enable_cartesian_product: true
-- @TD autoconvertjoin: true
--
-- Scores every row of rf_input with every tree stored in rf_model, merges the
-- per-tree votes into one prediction per userid, and emits both the raw class
-- probabilities and the probabilities multiplied by the per-class calibration
-- factors from the workflow parameters.
WITH per_tree as (
  -- One row per (tree, input row): the join has no ON clause on purpose,
  -- so each model row is paired with each input row.
  SELECT
    inp.userid,
    mdl.model_weight,
    tree_predict(mdl.model_id, mdl.model, inp.features, '-classification') as predicted
  FROM
    rf_model mdl
    LEFT OUTER JOIN rf_input inp
),
ensembled as (
  -- Combine the votes of all trees for each userid.
  SELECT
    userid,
    rf_ensemble(predicted.value, predicted.posteriori, model_weight) as predicted
  FROM
    per_tree
  GROUP BY
    userid
)
-- DIGDAG_INSERT_LINE
SELECT
  e.userid,
  m.label,
  e.predicted.probability,
  e.predicted.probabilities as raw_probability,
  array( -- calibration
    e.predicted.probabilities[0] * ${f1x_factor}, -- F1x
    e.predicted.probabilities[1] * ${f2x_factor}, -- F2x
    e.predicted.probabilities[2] * ${f3x_factor}, -- F3x
    e.predicted.probabilities[3] * ${m1x_factor}, -- M1x
    e.predicted.probabilities[4] * ${m2x_factor}, -- M2x
    e.predicted.probabilities[5] * ${m3x_factor}  -- M3x
  ) as probabilities
FROM
  ensembled e
  JOIN label_mapping m ON (e.predicted.label = m.label_id)

Loading