hugochan
diff --git a/‎src/config/graphflow_static_graph_quac.yml ‎src/config/graphflow_dynamic_graph_doqa.yml
+31-31 b/‎src/config/graphflow_static_graph_quac.yml ‎src/config/graphflow_dynamic_graph_doqa.yml
+31-31
diff --git a/‎src/config/graphflow_static_graph_coqa.yml
-103 b/‎src/config/graphflow_static_graph_coqa.yml
-103
diff --git a/‎src/core/layers/graphs.py
+11-16 b/‎src/core/layers/graphs.py
+11-16
diff --git a/‎src/core/model.py
+18-10 b/‎src/core/model.py
+18-10
@@ -1,57 +1,58 @@
 # Data
-dataset_name: 'quac'
-trainset: '../data/quac/train.json'
-devset: '../data/quac/dev.json'
-testset: null
-embed_file: '../data/coqa/glove.840B.300d.txt'
-saved_vocab_file: '../data/quac/word_model_min_5'
+dataset_name: 'doqa'
+trainset: '../data/doqa/train.json'
+devset: '../data/doqa/dev.json'
+testset: '../data/doqa/test.json'
+embed_file: '/home/cheny39/glove-vectors/glove.840B.300d.txt'
+saved_vocab_file: '../data/doqa/word_model_min_5'
 pretrained: null
 
 
 # Output
-out_dir: '../out/quac/graphflow_static_graph'
+out_dir: '../out/doqa/graphflow_dynamic_graph'
 
 
 # Preprocessing
 min_freq: 5
 top_vocab: 200000
-n_history: 2
+n_history: 2 # 2!
 no_pre_question: False
 no_pre_answer: False
-max_turn_num: 20
+max_turn_num: 8
 
 
 
 # Model
 embed_type: 'glove'
 vocab_embed_size: 300
-fix_vocab_embed: True
+fix_vocab_embed: True # True!
 f_qem: True # Context exact match feature
 f_pos: True # Context POS feature
 f_ner: True # Context NER feature
 f_tf: False # Context TF feature
 ctx_exact_match_embed_dim: 3
 ctx_pos_embed_dim: 12
 ctx_ner_embed_dim: 8
-answer_marker_embed_dim: 10
+answer_marker_embed_dim: 10 # 10!
 use_ques_marker: True
-ques_marker_embed_dim: 3
-ques_turn_marker_embed_dim: 5
+ques_marker_embed_dim: 3 # 3!
+ques_turn_marker_embed_dim: 5 # 5!
 
-hidden_size: 300
-word_dropout: 0.3
-bert_dropout: 0.4
-rnn_dropout: 0.3
+hidden_size: 128 # 128!
+word_dropout: 0.4 # 0.4!
+bert_dropout: 0.2 # 0.2!
+rnn_dropout: 0.4 # 0.4!
 rnn_input_dropout: null
 
 # Graph neural networks
 use_gnn: True
 bignn: False
-static_graph: True
+static_graph: False
 temporal_gnn: True
-ctx_graph_hops: 3
-ctx_graph_topk: 10
-graph_learner_num_pers: 1
+ctx_graph_hops: 5 # 5!
+ctx_graph_topk: 10 # 10!
+graph_learner_num_pers: 1 # 1
+stacked_layer: False # False
 
 
 # Spatial kernels
@@ -63,36 +64,35 @@ position_emb_size: 50
 
 
 # Bert configure
-use_bert: True
-finetune_bert: False
+use_bert: True # True
+finetune_bert: False # False
 use_bert_weight: True
 use_bert_gamma: False
 bert_model: 'bert-large-uncased'
 bert_dim: 1024
 bert_max_seq_len: 500
-bert_doc_stride: 250
+bert_doc_stride: 250 #
 bert_layer_indexes:
   - 0
   - 24
 
 
 # Optimizer
 optimizer: 'adamax'
-learning_rate: 0.001
-grad_clipping: 10
+learning_rate: 0.0005 # 0.0005!
+grad_clipping: 5 # 5!
 
 
 # Training & testing
 random_seed: 1234
 shuffle: True # Whether to shuffle the examples during training
-batch_size: 1 # No. of dialogs per batch
+batch_size: 1 # No. of dialogs per batch, 1!
 grad_accumulated_steps: 1
-test_batch_size: 1
 max_epochs: 30
 patience: 10
 verbose: 1000 # Print every X batches
-unk_answer_threshold: 0.3
-max_answer_len: 35 # Set max answer length for decoding
+unk_answer_threshold: 0.2 # 0.2!
+max_answer_len: 30 # Set max answer length for decoding # 30! 35!
 predict_train: True # Whether to predict on training set
 out_predictions: True # Whether to output predictions
 predict_raw_text: True # Whether to use raw text and offsets for prediction
@@ -103,4 +103,4 @@ out_pred_in_folder: True # Turn it off for Codalab
 
 # Device
 no_cuda: False
-cuda_id: 0
+cuda_id: -1
@@ -74,16 +74,16 @@ def forward(self, context, ctx_mask):
         # attention = torch.mean(torch.matmul(context_fc, context_fc.transpose(-1, -2)), dim=2)
 
 
-        # # 3) Best attention mechanism
-        # context_fc = context.unsqueeze(2) * torch.relu(self.weight_tensor).unsqueeze(0).unsqueeze(0).unsqueeze(-2)
-        # attention = torch.mean(torch.matmul(context_fc, context.unsqueeze(2).transpose(-1, -2)), dim=2)
+        # 3) Best attention mechanism
+        context_fc = context.unsqueeze(2) * torch.relu(self.weight_tensor).unsqueeze(0).unsqueeze(0).unsqueeze(-2)
+        attention = torch.mean(torch.matmul(context_fc, context.unsqueeze(2).transpose(-1, -2)), dim=2)
 
 
-        # 4）weighted cosine
-        context_fc = context.unsqueeze(2) * self.weight_tensor.unsqueeze(0).unsqueeze(0).unsqueeze(-2)
-        context_norm = F.normalize(context_fc, p=2, dim=-1)
-        attention = torch.matmul(context_norm, context_norm.transpose(-1, -2)).mean(2)
-        markoff_value = 0
+        # # 4) weighted cosine
+        # context_fc = context.unsqueeze(2) * self.weight_tensor.unsqueeze(0).unsqueeze(0).unsqueeze(-2)
+        # context_norm = F.normalize(context_fc, p=2, dim=-1)
+        # attention = torch.matmul(context_norm, context_norm.transpose(-1, -2)).mean(2)
+        # markoff_value = 0
 
 
         if ctx_mask is not None:
@@ -205,12 +205,8 @@ def forward(self, node_state, weighted_adjacency_matrix):
         return node_state
 
     def bignn_update(self, node_state, weighted_adjacency_matrix):
-        # weighted_adjacency_matrix_in = torch.softmax(weighted_adjacency_matrix, dim=-1)
-        # weighted_adjacency_matrix_out = torch.softmax(weighted_adjacency_matrix.transpose(-1, -2), dim=-1)
-
-        weighted_adjacency_matrix_in = weighted_adjacency_matrix / torch.clamp(torch.sum(weighted_adjacency_matrix, dim=-1, keepdim=True), min=VERY_SMALL_NUMBER)
-        weighted_adjacency_matrix_out = weighted_adjacency_matrix.transpose(-1, -2) / torch.clamp(torch.sum(weighted_adjacency_matrix.transpose(-1, -2), dim=-1, keepdim=True), min=VERY_SMALL_NUMBER)
-
+        weighted_adjacency_matrix_in = torch.softmax(weighted_adjacency_matrix, dim=-1)
+        weighted_adjacency_matrix_out = torch.softmax(weighted_adjacency_matrix.transpose(-1, -2), dim=-1)
 
         for _ in range(self.graph_hops):
             agg_state_in = self.aggregate_avgpool(node_state, weighted_adjacency_matrix_in)
@@ -220,8 +216,7 @@ def bignn_update(self, node_state, weighted_adjacency_matrix):
         return node_state
 
     def gnn_update(self, node_state, weighted_adjacency_matrix):
-        # weighted_adjacency_matrix = torch.softmax(weighted_adjacency_matrix, dim=-1)
-        weighted_adjacency_matrix = weighted_adjacency_matrix / torch.clamp(torch.sum(weighted_adjacency_matrix, dim=-1, keepdim=True), min=VERY_SMALL_NUMBER)
+        weighted_adjacency_matrix = torch.softmax(weighted_adjacency_matrix, dim=-1)
 
 
         for _ in range(self.graph_hops):
 
@@ -11,7 +11,8 @@
 
 
 from .utils.coqa import compute_eval_metric
-from .utils.quac import eval_fn
+from .utils.quac import eval_fn as quac_eval_fn
+from .utils.doqa import eval_fn as doqa_eval_fn
 from .utils import constants as Constants
 from .word_model import WordModel
 from .models.graphflow import GraphFlow
@@ -331,7 +332,7 @@ class QuACModel(Model):
 
     def __init__(self, config, train_set=None):
         super(QuACModel, self).__init__(config, train_set)
-
+        self.eval_fn = quac_eval_fn if config['dataset_name'] == 'quac' else doqa_eval_fn
 
     def predict(self, ex, step, update=True, out_predictions=False):
         # Train/Eval mode
@@ -342,13 +343,12 @@ def predict(self, ex, step, update=True, out_predictions=False):
         score_s, score_e, unk_probs, score_yesno, score_followup = res['start_logits'], res['end_logits'], res['unk_probs'], res['score_yesno'], res['score_followup']
 
         output = {
-            'metrics': {'f1': 0.0, 'heq': 0.0, 'dheq': 0.0},
+            'metrics': None,
             'loss': 0.0,
             'total_qs': 0,
             'total_dials': 0
         }
 
-
         # Compute loss
         loss = self.compute_span_loss(score_s, score_e, ex['targets'], ex['span_mask'])
         loss = loss + self.compute_answer_type_loss(unk_probs, score_yesno, score_followup, ex['unk_answer_targets'], ex['yesno_targets'], ex['followup_targets'], res['turn_mask'])
@@ -375,7 +375,7 @@ def predict(self, ex, step, update=True, out_predictions=False):
 
         if (not update) or self.config['predict_train']:
             predictions, spans, yesnos, followups = self.extract_predictions(ex, score_s, score_e, unk_probs, score_yesno, score_followup, self.config['unk_answer_threshold'], res['turn_mask'])
-            output['metrics'], total_qs, total_dials = eval_fn(ex['answers'], predictions, ex['raw_evidence_text'])
+            output['metrics'], total_qs, total_dials = self.eval_fn(ex['answers'], predictions, ex['raw_evidence_text'])
             output['total_qs'] = total_qs
             output['total_dials'] = total_dials
 
@@ -434,12 +434,20 @@ def extract_predictions(self, ex, score_s, score_e, unk_probs, score_yesno, scor
                     yesno = Constants.QuAC_YESNO_OTHER
 
                 followup_type = np.argmax(_followup[j]).item()
-                if followup_type == Constants.QuAC_FOLLOWUP_YES_LABEL:
-                    followup = Constants.QuAC_FOLLOWUP_YES
-                elif followup_type == Constants.QuAC_FOLLOWUP_NO_LABEL:
-                    followup = Constants.QuAC_FOLLOWUP_NO
+
+                if self.config['dataset_name'] == 'quac':
+                    if followup_type == Constants.QuAC_FOLLOWUP_YES_LABEL:
+                        followup = Constants.QuAC_FOLLOWUP_YES
+                    elif followup_type == Constants.QuAC_FOLLOWUP_NO_LABEL:
+                        followup = Constants.QuAC_FOLLOWUP_NO
+                    else:
+                        followup = Constants.QuAC_FOLLOWUP_OTHER
+
                 else:
-                    followup = Constants.QuAC_FOLLOWUP_OTHER
+                    if followup_type == Constants.DoQA_FOLLOWUP_YES_LABEL:
+                        followup = Constants.DoQA_FOLLOWUP_YES
+                    else:
+                        followup = Constants.DoQA_FOLLOWUP_NO
 
                 para_pred.append(pred)
                 para_span.append(span)