Merge pull request #1959 from microsoft/miguel/gpu_review
Update DKN notebook
wutaomsft committed Jul 17, 2023
2 parents 60e06dc + 493eb31 commit b5d0174
Showing 3 changed files with 74 additions and 77 deletions.
147 changes: 72 additions & 75 deletions examples/00_quick_start/dkn_MIND.ipynb
@@ -42,7 +42,7 @@
"\n",
"- **word embedding/entity embedding/ context embedding files**: These are `*.npy` files of pretrained embeddings. After loading, each file is a `[n+1,k]` two-dimensional matrix, n is the number of words(or entities) of their hash dictionary, k is dimension of the embedding, note that we keep embedding 0 for zero padding. \n",
"\n",
"In this experiment, we used GloVe\\[4\\] vectors to initialize the word embedding. We trained entity embedding using TransE\\[2\\] on knowledge graph and context embedding is the average of the entity's neighbors in the knowledge graph.<br>\n",
"In this experiment, we used GloVe \\[4\\] vectors to initialize the word embedding. We trained entity embedding using TransE \\[2\\] on knowledge graph and context embedding is the average of the entity's neighbors in the knowledge graph.<br>\n",
"\n",
"## MIND dataset\n",
"\n",
@@ -69,31 +69,27 @@
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda/envs/tf2/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
" from pyarrow import HadoopFileSystem\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n",
"[GCC 7.5.0]\n",
"Tensorflow version: 2.6.1\n"
"System version: 3.9.16 (main, May 15 2023, 23:46:34) \n",
"[GCC 11.2.0]\n",
"Tensorflow version: 2.7.4\n"
]
}
],
"source": [
"import sys\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import os\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"import scrapbook as sb\n",
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"tf.get_logger().setLevel(\"ERROR\") # only show error messages\n",
"tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n",
"\n",
"from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources, prepare_hparams\n",
"from recommenders.models.deeprec.models.dkn import DKN\n",
@@ -112,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {
"pycharm": {
"is_executing": false
@@ -123,25 +119,25 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 11.3k/11.3k [00:08<00:00, 1.40kKB/s]\n"
"100%|███████████████████████████████████████████████████████████████████████████████| 11.3k/11.3k [01:39<00:00, 113KB/s]\n"
]
}
],
"source": [
"tmpdir = TemporaryDirectory()\n",
"data_path = os.path.join(tmpdir.name, \"mind-demo-dkn\")\n",
"\n",
"yaml_file = os.path.join(data_path, r'dkn.yaml')\n",
"train_file = os.path.join(data_path, r'train_mind_demo.txt')\n",
"valid_file = os.path.join(data_path, r'valid_mind_demo.txt')\n",
"test_file = os.path.join(data_path, r'test_mind_demo.txt')\n",
"news_feature_file = os.path.join(data_path, r'doc_feature.txt')\n",
"user_history_file = os.path.join(data_path, r'user_history.txt')\n",
"wordEmb_file = os.path.join(data_path, r'word_embeddings_100.npy')\n",
"entityEmb_file = os.path.join(data_path, r'TransE_entity2vec_100.npy')\n",
"contextEmb_file = os.path.join(data_path, r'TransE_context2vec_100.npy')\n",
"yaml_file = os.path.join(data_path, \"dkn.yaml\")\n",
"train_file = os.path.join(data_path, \"train_mind_demo.txt\")\n",
"valid_file = os.path.join(data_path, \"valid_mind_demo.txt\")\n",
"test_file = os.path.join(data_path, \"test_mind_demo.txt\")\n",
"news_feature_file = os.path.join(data_path, \"doc_feature.txt\")\n",
"user_history_file = os.path.join(data_path, \"user_history.txt\")\n",
"wordEmb_file = os.path.join(data_path, \"word_embeddings_100.npy\")\n",
"entityEmb_file = os.path.join(data_path, \"TransE_entity2vec_100.npy\")\n",
"contextEmb_file = os.path.join(data_path, \"TransE_context2vec_100.npy\")\n",
"if not os.path.exists(yaml_file):\n",
" download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/deeprec/', tmpdir.name, 'mind-demo-dkn.zip')\n",
" download_deeprec_resources(\"https://recodatasets.z20.web.core.windows.net/deeprec/\", tmpdir.name, \"mind-demo-dkn.zip\")\n",
" "
]
},
@@ -154,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"pycharm": {
"is_executing": false
@@ -165,14 +161,14 @@
},
"outputs": [],
"source": [
"epochs = 10\n",
"history_size = 50\n",
"batch_size = 100"
"EPOCHS = 10\n",
"HISTORY_SIZE = 50\n",
"BATCH_SIZE = 500"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false
@@ -183,7 +179,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"kg_file=None,user_clicks=None,FEATURE_COUNT=None,FIELD_COUNT=None,data_format=dkn,PAIR_NUM=None,DNN_FIELD_NUM=None,n_user=None,n_item=None,n_user_attr=None,n_item_attr=None,iterator_type=None,SUMMARIES_DIR=None,MODEL_DIR=None,wordEmb_file=/tmp/tmpm_qyjfgp/mind-demo-dkn/word_embeddings_100.npy,entityEmb_file=/tmp/tmpm_qyjfgp/mind-demo-dkn/TransE_entity2vec_100.npy,contextEmb_file=/tmp/tmpm_qyjfgp/mind-demo-dkn/TransE_context2vec_100.npy,news_feature_file=/tmp/tmpm_qyjfgp/mind-demo-dkn/doc_feature.txt,user_history_file=/tmp/tmpm_qyjfgp/mind-demo-dkn/user_history.txt,use_entity=True,use_context=True,doc_size=10,history_size=50,word_size=12600,entity_size=3987,entity_dim=100,entity_embedding_method=None,transform=True,train_ratio=None,dim=100,layer_sizes=[300],cross_layer_sizes=None,cross_layers=None,activation=['sigmoid'],cross_activation=identity,user_dropout=False,dropout=[0.0],attention_layer_sizes=100,attention_activation=relu,attention_dropout=0.0,model_type=dkn,method=classification,load_saved_model=False,load_model_name=None,filter_sizes=[1, 2, 3],num_filters=100,mu=None,fast_CIN_d=0,use_Linear_part=False,use_FM_part=False,use_CIN_part=False,use_DNN_part=False,init_method=uniform,init_value=0.1,embed_l2=1e-06,embed_l1=0.0,layer_l2=1e-06,layer_l1=0.0,cross_l2=0.0,cross_l1=0.0,reg_kg=0.0,learning_rate=0.0005,lr_rs=1,lr_kg=0.5,kg_training_interval=5,max_grad_norm=2,is_clip_norm=0,dtype=32,loss=log_loss,optimizer=adam,epochs=10,batch_size=100,enable_BN=True,show_step=10000,save_model=False,save_epoch=2,metrics=['auc'],write_tfevents=False,item_embedding_dim=None,cate_embedding_dim=None,user_embedding_dim=None,train_num_ngs=4,need_sample=True,embedding_dropout=0.0,user_vocab=None,item_vocab=None,cate_vocab=None,pairwise_metrics=['group_auc', 'mean_mrr', 'ndcg@5;10'],EARLY_STOP=100,max_seq_length=None,hidden_size=None,L=None,T=None,n_v=None,n_h=None,min_seq_length=1,attention_size=None,att_fcn_layer_sizes=None,dilations=None,kernel_size=None,embed_size=None,n_layers=None,decay=None,eval_epoch=None,top_k=None\n"
"HParams object with values {'use_entity': True, 'use_context': True, 'cross_activation': 'identity', 'user_dropout': False, 'dropout': [0.0], 'attention_dropout': 0.0, 'load_saved_model': False, 'fast_CIN_d': 0, 'use_Linear_part': False, 'use_FM_part': False, 'use_CIN_part': False, 'use_DNN_part': False, 'init_method': 'uniform', 'init_value': 0.1, 'embed_l2': 1e-06, 'embed_l1': 0.0, 'layer_l2': 1e-06, 'layer_l1': 0.0, 'cross_l2': 0.0, 'cross_l1': 0.0, 'reg_kg': 0.0, 'learning_rate': 0.0005, 'lr_rs': 1, 'lr_kg': 0.5, 'kg_training_interval': 5, 'max_grad_norm': 2, 'is_clip_norm': 0, 'dtype': 32, 'optimizer': 'adam', 'epochs': 10, 'batch_size': 500, 'enable_BN': True, 'show_step': 10000, 'save_model': False, 'save_epoch': 2, 'write_tfevents': False, 'train_num_ngs': 4, 'need_sample': True, 'embedding_dropout': 0.0, 'EARLY_STOP': 100, 'min_seq_length': 1, 'slots': 5, 'cell': 'SUM', 'doc_size': 10, 'history_size': 50, 'word_size': 12600, 'entity_size': 3987, 'data_format': 'dkn', 'metrics': ['auc'], 'pairwise_metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'method': 'classification', 'activation': ['sigmoid'], 'attention_activation': 'relu', 'attention_layer_sizes': 100, 'dim': 100, 'entity_dim': 100, 'transform': True, 'filter_sizes': [1, 2, 3], 'layer_sizes': [300], 'model_type': 'dkn', 'num_filters': 100, 'loss': 'log_loss', 'news_feature_file': '/tmp/tmpgy77utho/mind-demo-dkn/doc_feature.txt', 'user_history_file': '/tmp/tmpgy77utho/mind-demo-dkn/user_history.txt', 'wordEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/word_embeddings_100.npy', 'entityEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_entity2vec_100.npy', 'contextEmb_file': '/tmp/tmpgy77utho/mind-demo-dkn/TransE_context2vec_100.npy'}\n"
]
}
],
@@ -194,9 +190,9 @@
" wordEmb_file=wordEmb_file,\n",
" entityEmb_file=entityEmb_file,\n",
" contextEmb_file=contextEmb_file,\n",
" epochs=epochs,\n",
" history_size=history_size,\n",
" batch_size=batch_size)\n",
" epochs=EPOCHS,\n",
" history_size=HISTORY_SIZE,\n",
" batch_size=BATCH_SIZE)\n",
"print(hparams)"
]
},
@@ -209,7 +205,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"pycharm": {
"is_executing": false
@@ -222,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"pycharm": {
"is_executing": false
@@ -233,7 +229,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'auc': 0.5437, 'group_auc': 0.5163, 'mean_mrr': 0.157, 'ndcg@5': 0.1527, 'ndcg@10': 0.2191}\n"
"{'auc': 0.5218, 'group_auc': 0.5071, 'mean_mrr': 0.1494, 'ndcg@5': 0.1539, 'ndcg@10': 0.2125}\n"
]
}
],
Expand All @@ -243,7 +239,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {
"pycharm": {
"is_executing": false
@@ -256,54 +252,54 @@
"output_type": "stream",
"text": [
"at epoch 1\n",
"train info: logloss loss:0.6685948745679047\n",
"eval info: auc:0.5426, group_auc:0.5296, mean_mrr:0.181, ndcg@10:0.2502, ndcg@5:0.1856\n",
"at epoch 1 , train time: 11.8 eval time: 4.8\n",
"train info: logloss loss:0.6945172200600306\n",
"eval info: auc:0.5929, group_auc:0.5633, mean_mrr:0.1834, ndcg@10:0.2511, ndcg@5:0.1939\n",
"at epoch 1 , train time: 39.8 eval time: 8.8\n",
"at epoch 2\n",
"train info: logloss loss:0.6204990633463455\n",
"eval info: auc:0.5473, group_auc:0.5139, mean_mrr:0.1706, ndcg@10:0.2333, ndcg@5:0.1829\n",
"at epoch 2 , train time: 11.1 eval time: 4.7\n",
"train info: logloss loss:0.6527644917368889\n",
"eval info: auc:0.5877, group_auc:0.5499, mean_mrr:0.1891, ndcg@10:0.2542, ndcg@5:0.2013\n",
"at epoch 2 , train time: 36.0 eval time: 9.0\n",
"at epoch 3\n",
"train info: logloss loss:0.5874787417508788\n",
"eval info: auc:0.5389, group_auc:0.493, mean_mrr:0.1639, ndcg@10:0.2214, ndcg@5:0.1691\n",
"at epoch 3 , train time: 11.0 eval time: 4.7\n",
"train info: logloss loss:0.6361906168361505\n",
"eval info: auc:0.6013, group_auc:0.5799, mean_mrr:0.1999, ndcg@10:0.2703, ndcg@5:0.2078\n",
"at epoch 3 , train time: 36.0 eval time: 9.0\n",
"at epoch 4\n",
"train info: logloss loss:0.5630691410121271\n",
"eval info: auc:0.5673, group_auc:0.5183, mean_mrr:0.176, ndcg@10:0.2357, ndcg@5:0.1854\n",
"at epoch 4 , train time: 11.0 eval time: 4.7\n",
"train info: logloss loss:0.6205979473888874\n",
"eval info: auc:0.611, group_auc:0.5862, mean_mrr:0.1851, ndcg@10:0.2624, ndcg@5:0.1853\n",
"at epoch 4 , train time: 36.1 eval time: 8.9\n",
"at epoch 5\n",
"train info: logloss loss:0.5432774212400792\n",
"eval info: auc:0.5959, group_auc:0.564, mean_mrr:0.1926, ndcg@10:0.2562, ndcg@5:0.1977\n",
"at epoch 5 , train time: 11.0 eval time: 4.8\n",
"train info: logloss loss:0.6062351117531458\n",
"eval info: auc:0.6148, group_auc:0.5931, mean_mrr:0.1947, ndcg@10:0.2715, ndcg@5:0.1951\n",
"at epoch 5 , train time: 36.2 eval time: 9.0\n",
"at epoch 6\n",
"train info: logloss loss:0.5274657431547924\n",
"eval info: auc:0.6, group_auc:0.5659, mean_mrr:0.1895, ndcg@10:0.2528, ndcg@5:0.1917\n",
"at epoch 6 , train time: 11.0 eval time: 4.7\n",
"train info: logloss loss:0.5931083386143049\n",
"eval info: auc:0.6153, group_auc:0.5942, mean_mrr:0.2015, ndcg@10:0.2737, ndcg@5:0.2084\n",
"at epoch 6 , train time: 36.3 eval time: 9.3\n",
"at epoch 7\n",
"train info: logloss loss:0.5117715953265206\n",
"eval info: auc:0.6041, group_auc:0.5679, mean_mrr:0.1757, ndcg@10:0.2447, ndcg@5:0.1774\n",
"at epoch 7 , train time: 11.0 eval time: 4.7\n",
"train info: logloss loss:0.582433108240366\n",
"eval info: auc:0.6268, group_auc:0.5981, mean_mrr:0.2011, ndcg@10:0.2765, ndcg@5:0.2085\n",
"at epoch 7 , train time: 36.4 eval time: 10.3\n",
"at epoch 8\n",
"train info: logloss loss:0.49649867617477805\n",
"eval info: auc:0.6061, group_auc:0.5749, mean_mrr:0.1834, ndcg@10:0.25, ndcg@5:0.1831\n",
"at epoch 8 , train time: 10.9 eval time: 4.7\n",
"train info: logloss loss:0.5735978713879982\n",
"eval info: auc:0.6263, group_auc:0.6052, mean_mrr:0.2034, ndcg@10:0.279, ndcg@5:0.217\n",
"at epoch 8 , train time: 36.8 eval time: 9.2\n",
"at epoch 9\n",
"train info: logloss loss:0.4817596108226453\n",
"eval info: auc:0.5967, group_auc:0.5703, mean_mrr:0.1759, ndcg@10:0.2417, ndcg@5:0.1769\n",
"at epoch 9 , train time: 11.0 eval time: 4.7\n",
"train info: logloss loss:0.5567030770083269\n",
"eval info: auc:0.62, group_auc:0.5958, mean_mrr:0.1942, ndcg@10:0.2688, ndcg@5:0.2019\n",
"at epoch 9 , train time: 39.3 eval time: 11.0\n",
"at epoch 10\n",
"train info: logloss loss:0.46655569288690213\n",
"eval info: auc:0.6043, group_auc:0.5742, mean_mrr:0.1796, ndcg@10:0.2474, ndcg@5:0.1822\n",
"at epoch 10 , train time: 11.1 eval time: 4.8\n"
"train info: logloss loss:0.5417348792155584\n",
"eval info: auc:0.6198, group_auc:0.6035, mean_mrr:0.1929, ndcg@10:0.2692, ndcg@5:0.201\n",
"at epoch 10 , train time: 46.3 eval time: 13.2\n"
]
},
{
"data": {
"text/plain": [
"<recommenders.models.deeprec.models.dkn.DKN at 0x7f6d41a62ba8>"
"<recommenders.models.deeprec.models.dkn.DKN at 0x7f2341f7bb50>"
]
},
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -323,7 +319,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {
"pycharm": {
"is_executing": false
@@ -334,7 +330,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'auc': 0.597, 'group_auc': 0.5887, 'mean_mrr': 0.1883, 'ndcg@5': 0.1904, 'ndcg@10': 0.2608}\n"
"{'auc': 0.6227, 'group_auc': 0.5963, 'mean_mrr': 0.2014, 'ndcg@5': 0.2066, 'ndcg@10': 0.28}\n"
]
}
],
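The cells that construct and train the model are collapsed in this diff; a rough sketch of the flow they implement is shown below (the DKNTextIterator import path and the method signatures are assumed from the recommenders package layout, not taken from this diff):

from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator
from recommenders.models.deeprec.models.dkn import DKN

# `hparams`, `train_file`, `valid_file` and `test_file` are defined in the cells shown above.
model = DKN(hparams, DKNTextIterator)  # build the DKN graph from the prepared hyperparameters
model.fit(train_file, valid_file)      # train for hparams.epochs, logging eval metrics each epoch
res = model.run_eval(test_file)        # final metrics on the held-out test split
print(res)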
@@ -371,7 +367,8 @@
"hash": "3a9a0c422ff9f08d62211b9648017c63b0a26d2c935edc37ebb8453675d13bb5"
},
"kernelspec": {
"display_name": "Python 3.7.11 64-bit ('tf2': conda)",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
@@ -384,7 +381,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
"version": "3.9.16"
},
"pycharm": {
"stem_cell": {
2 changes: 1 addition & 1 deletion tests/integration/examples/test_notebooks_gpu.py
@@ -617,7 +617,7 @@ def test_dkn_quickstart_integration(notebooks, output_notebook, kernel_name):
notebook_path,
output_notebook,
kernel_name=kernel_name,
parameters=dict(epochs=5, batch_size=500),
parameters=dict(EPOCHS=5, BATCH_SIZE=500),
)
results = sb.read_notebook(output_notebook).scraps.dataframe.set_index("name")[
"data"
2 changes: 1 addition & 1 deletion tests/unit/examples/test_notebooks_gpu.py
@@ -122,5 +122,5 @@ def test_dkn_quickstart(notebooks, output_notebook, kernel_name):
notebook_path,
output_notebook,
kernel_name=kernel_name,
parameters=dict(epochs=1, batch_size=500),
parameters=dict(EPOCHS=1, BATCH_SIZE=500),
)
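The two test changes above track the notebook's renamed parameters cell: papermill overrides the variables defined in that cell by name, so the tests must now pass EPOCHS and BATCH_SIZE instead of the old lowercase names. A hedged sketch of how such a test drives the notebook (the repository wraps this in a shared helper; the direct papermill and scrapbook usage here is illustrative only):

import papermill as pm
import scrapbook as sb

# Illustrative only: the actual tests go through a shared execute_notebook helper and fixtures.
pm.execute_notebook(
    "examples/00_quick_start/dkn_MIND.ipynb",
    "output.ipynb",
    kernel_name="python3",
    parameters=dict(EPOCHS=1, BATCH_SIZE=500),  # must match the names in the notebook's parameters cell
)
results = sb.read_notebook("output.ipynb").scraps.dataframe.set_index("name")["data"]
print(results)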
