Skip to content

Commit

Permalink
Project import generated by Copybara.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 485897516
  • Loading branch information
Language Team authored and sebastianruder committed Jan 4, 2023
1 parent 8ad18d8 commit 838c13b
Show file tree
Hide file tree
Showing 13 changed files with 284 additions and 194 deletions.
13 changes: 13 additions & 0 deletions conda-env.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,16 @@ https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.17.4-py37hc1035e2_0.tar.bz2
https://repo.anaconda.com/pkgs/main/linux-64/pandas-0.25.3-py37he6710b0_0.tar.bz2
https://conda.anaconda.org/pytorch/linux-64/pytorch-1.3.1-py3.7_cuda10.0.130_cudnn7.6.3_0.tar.bz2
https://conda.anaconda.org/pytorch/linux-64/torchvision-0.4.2-py37_cu100.tar.bz2
https://repo.anaconda.com/pkgs/main/linux-64/svn-1.10.2-h52f66ed_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda
https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda
https://repo.anaconda.com/pkgs/main/linux-64/libapr-1.7.0-hf178f73_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/libntlm-1.6-h7f8727e_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libutf8proc-2.1.1-h14c3975_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.8.1.2-h14c3975_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libapriconv-1.2.2-h7f8727e_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/libaprutil-1.6.1-hfefca11_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/apr-1.7.0-hf178f73_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.19.2-hac12032_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/cyrus-sasl-2.1.27-h9c0eb46_9.conda
https://repo.anaconda.com/pkgs/main/linux-64/serf-1.3.9-ha066f01_0.conda
4 changes: 3 additions & 1 deletion evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ def evaluate(prediction_folder, label_folder, xtreme_version, verbose=False):
score['avg_metric'] = score['avg_f1']
elif 'avg_accuracy' in score:
score['avg_metric'] = score['avg_accuracy']
detailed_scores[task] = score
elif 'avg_map@20' in score:
score['avg_metric'] = score['avg_map@20']
detailed_scores[task] = dict(score)
if verbose:
avg_result = ', '.join(['{}={:.1f}'.format(k, v)
for k, v in score.items()
Expand Down
6 changes: 6 additions & 0 deletions install_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ mkdir -p $LIB
# install conda env
conda create --name xtreme --file conda-env.txt
conda init bash

# If 'conda activate' fails below, try uncommenting the following lines,
# based on https://github.com/conda/conda/issues/7980.
# CONDA_PATH=$(conda info | grep -i 'base environment' | awk '{print $4}')
# source $CONDA_PATH/etc/profile.d/conda.sh

conda activate xtreme

# install latest transformer
Expand Down
2 changes: 2 additions & 0 deletions leakr_badwords.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
;; Changed 'mum' to Info severity
$RE:(^|[\W])((?i)mum(?-i))($|[\W]);0;Use regex as mum is a common letter combo
2 changes: 1 addition & 1 deletion multichecklist/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install absl-py
pip install checklist
```

3. Download the MultiCheckList tests and model predictions from [here](https://pantheon.corp.google.com/storage/browser/xtreme_translations/MultiCheckList) into this folder. [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) is
3. Download the MultiCheckList tests and model predictions from [here](https://console.cloud.google.com/storage/browser/xtreme_translations/MultiCheckList) into this folder. [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) is
the preferred way to download all files at once:

```
Expand Down
54 changes: 54 additions & 0 deletions scripts/download_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,58 @@ function download_siqa {
echo "Successfully downloaded data at $DIR/siqa" >> $DIR/download.log
}

function download_mewslix {
  # Download and preprocess the Mewsli-X entity-linking dataset.
  # Reads:  $DIR (download root), $REPO (repo root, for utils_preprocess.py).
  # Writes: $DIR/mewslix/ and appends a status line to $DIR/download.log.
  # Needs:  svn, conda, bash, python on PATH; network access.
  echo "download mewslix [5-10 mins]"
  OUTPATH=$DIR/mewslix
  OUTPATH_TMP=$OUTPATH/tmp
  # Fail fast: if scratch space can't be created/entered, the mv/rm below
  # would otherwise operate on whatever the current directory happens to be.
  mkdir -p "$OUTPATH_TMP" || return 1
  pushd "$OUTPATH_TMP" || return 1
  # Export just the relevant subtree of the google-research repository.
  svn export -q https://github.com/google-research/google-research/trunk/dense_representations_for_entity_retrieval
  pushd dense_representations_for_entity_retrieval/mel || return 1

  ( # Use a subshell to keep things separate from an already active conda env.
    # If 'conda activate' fails below, try uncommenting the following lines,
    # based on https://github.com/conda/conda/issues/7980.
    # CONDA_PATH=$(conda info | grep -i 'base environment' | awk '{print $4}')
    # source $CONDA_PATH/etc/profile.d/conda.sh

    # Create and activate a dedicated conda environment for Mewsli-X extraction.
    bash create-env.sh conda
    conda activate mewsli_env

    # Run the Mewsli-X downloader script.
    bash get-mewsli-x.sh
  )

  INTERIM_DIR=$PWD/mewsli_x/output/dataset
  echo
  echo "Move dataset to $OUTPATH/"
  mv "$INTERIM_DIR"/candidate_set_entities.jsonl "$OUTPATH"/
  mv "$INTERIM_DIR"/wikipedia_pairs-{train,dev}.jsonl "$OUTPATH"/
  mv "$INTERIM_DIR"/wikinews_mentions-{dev,test}.jsonl "$OUTPATH"/
  popd
  popd

  python "$REPO"/utils_preprocess.py --data_dir "$OUTPATH" --output_dir "$OUTPATH" --task mewslix
  # Drop scratch space plus the raw mention files already converted above.
  rm -rf "$OUTPATH_TMP" "$OUTPATH"/wikinews_mentions-{dev,test}.jsonl
  echo "Successfully downloaded data at $OUTPATH" >> "$DIR"/download.log
}

function download_lareqa {
  # Download the LAReQA answer-retrieval dataset from GitHub.
  # Reads:  $DIR (download root).
  # Writes: $DIR/lareqa/ and appends a status line to $DIR/download.log.
  echo "download lareqa"
  OUTPATH=$DIR/lareqa/
  mkdir -p "$OUTPATH" || return 1
  # Guard the cd: if it failed, the rm -rf / rm below would run in whatever
  # directory the caller happened to be in.
  cd "$OUTPATH" || return 1
  wget https://github.com/google-research-datasets/lareqa/archive/master.zip -q --show-progress
  unzip master.zip
  # Flatten the archive's top-level folder into $OUTPATH, then drop the
  # wrapper directory, the archive, and repo metadata we don't need.
  mv lareqa-master/* .
  rm -rf lareqa-master/
  rm -f master.zip LICENSE README.md
  echo "Successfully downloaded data at $OUTPATH" >> "$DIR"/download.log
}

download_xnli
download_pawsx
download_tatoeba
Expand All @@ -257,3 +309,5 @@ download_udpos
download_panx
download_xcopa
download_siqa
download_mewslix
download_lareqa
34 changes: 0 additions & 34 deletions scripts/download_lareqa_data.sh

This file was deleted.

55 changes: 0 additions & 55 deletions scripts/download_mewslix_data.sh

This file was deleted.

3 changes: 2 additions & 1 deletion scripts/run_eval_lareqa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ MODEL=${1:-bert-base-multilingual-cased}
GPU=${2:-0}
DATA_DIR=${3:-"$REPO/download/"}
OUT_DIR=${4:-"$REPO/outputs/"}
# Select a checkpoint based on validation performance.
# Select a checkpoint based on validation performance. Results in the XTREME-R
# paper used mBERT checkpoint-9000 and XLM-R checkpoint-10000.
CHECKPOINT=${5:-checkpoint-9000}

TASK='lareqa'
Expand Down
5 changes: 5 additions & 0 deletions scripts/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,10 @@ elif [ $TASK == 'bucc2018' ]; then
bash $REPO/scripts/run_bucc2018.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'tatoeba' ]; then
bash $REPO/scripts/run_tatoeba.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'mewslix' ]; then
bash $REPO/scripts/train_mewslix.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'lareqa' ]; then
bash $REPO/scripts/train_lareqa.sh $MODEL $GPU $DATA_DIR $OUT_DIR
bash $REPO/scripts/run_eval_lareqa.sh $MODEL $GPU $DATA_DIR $OUT_DIR
fi

18 changes: 9 additions & 9 deletions scripts/train_mewslix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,27 @@ OUT_DIR=${4:-"$REPO/outputs"}
TASK='mewslix'

# These settings should match those used in scripts/run_eval_mewslix.sh
# They are primarily aimed at a quick turnaround time (~1h using 1 GPU).
# They are primarily aimed at a quick training time (~1h using 1 GPU for mBERT).
MAX_SEQ_LEN=64
NUM_EPOCHS=2
GRAD_ACC_STEPS=4

# Learning rates were set based on best dev-set loss on the English
# 'wikipedia_pairs-dev' after 2 epochs, searching over {1e-5, 2e-5, 5e-5, 1e-4}.
if [ $MODEL == "bert-base-multilingual-cased" ]; then
MODEL_TYPE="bert-retrieval"
LR=2e-5 # best dev-set loss after 2 epochs among {1e-5, 2e-5, 5e-5, 1e-4}
LR=2e-5
DO_LOWER_CASE=""
PER_GPU_BATCH_SIZE=64 # largest power of two that fit 16GB GPU RAM
LOGGING_STEPS=50
SAVE_STEPS=100
# BEGIN GOOGLE-INTERNAL
# TODO(jabot,ruder): The XLM-R baseline requires some further hyperparam
# tuning and/or troubleshooting, as it scored near zero on the eval metric
# using the current settings.
elif [ $MODEL == "xlm-roberta-large" ]; then
MODEL_TYPE="xlmr-retrieval"
LR=1e-4
LR=1e-5
DO_LOWER_CASE="--do_lower_case"
PER_GPU_BATCH_SIZE=8 # largest power of two that fit 16GB GPU RAM
LOGGING_STEPS=500
SAVE_STEPS=2000
# END GOOGLE-INTERNAL
else
echo "$MODEL not configured."
fi
Expand Down Expand Up @@ -82,4 +79,7 @@ python third_party/run_retrieval_el.py \
--train_lang en \
--eval_lang en \
$DO_LOWER_CASE \
2>&1 | tee $MODEL_DIR/train.log
2>&1 | tee $MODEL_DIR/train.log

set +x
bash $REPO/scripts/run_eval_mewslix.sh $MODEL_DIR $GPU $DATA_DIR $OUT_DIR
Loading

0 comments on commit 838c13b

Please sign in to comment.