Skip to content

Commit

Permalink
Project import generated by Copybara.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 485897516
  • Loading branch information
Language Team authored and sebastianruder committed Jan 4, 2023
1 parent 8ad18d8 commit 838c13b
Show file tree
Hide file tree
Showing 13 changed files with 284 additions and 194 deletions.
13 changes: 13 additions & 0 deletions conda-env.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,16 @@ https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.17.4-py37hc1035e2_0.tar.bz2
https://repo.anaconda.com/pkgs/main/linux-64/pandas-0.25.3-py37he6710b0_0.tar.bz2
https://conda.anaconda.org/pytorch/linux-64/pytorch-1.3.1-py3.7_cuda10.0.130_cudnn7.6.3_0.tar.bz2
https://conda.anaconda.org/pytorch/linux-64/torchvision-0.4.2-py37_cu100.tar.bz2
https://repo.anaconda.com/pkgs/main/linux-64/svn-1.10.2-h52f66ed_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda
https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda
https://repo.anaconda.com/pkgs/main/linux-64/libapr-1.7.0-hf178f73_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/libntlm-1.6-h7f8727e_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libutf8proc-2.1.1-h14c3975_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.8.1.2-h14c3975_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/libapriconv-1.2.2-h7f8727e_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/libaprutil-1.6.1-hfefca11_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/apr-1.7.0-hf178f73_5.conda
https://repo.anaconda.com/pkgs/main/linux-64/krb5-1.19.2-hac12032_0.conda
https://repo.anaconda.com/pkgs/main/linux-64/cyrus-sasl-2.1.27-h9c0eb46_9.conda
https://repo.anaconda.com/pkgs/main/linux-64/serf-1.3.9-ha066f01_0.conda
4 changes: 3 additions & 1 deletion evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ def evaluate(prediction_folder, label_folder, xtreme_version, verbose=False):
score['avg_metric'] = score['avg_f1']
elif 'avg_accuracy' in score:
score['avg_metric'] = score['avg_accuracy']
detailed_scores[task] = score
elif 'avg_map@20' in score:
score['avg_metric'] = score['avg_map@20']
detailed_scores[task] = dict(score)
if verbose:
avg_result = ', '.join(['{}={:.1f}'.format(k, v)
for k, v in score.items()
Expand Down
6 changes: 6 additions & 0 deletions install_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ mkdir -p $LIB
# install conda env
conda create --name xtreme --file conda-env.txt
conda init bash

# If 'conda activate' fails below, try uncommenting the following lines,
# based on https://github.com/conda/conda/issues/7980.
# CONDA_PATH=$(conda info | grep -i 'base environment' | awk '{print $4}')
# source $CONDA_PATH/etc/profile.d/conda.sh

conda activate xtreme

# install latest transformer
Expand Down
2 changes: 2 additions & 0 deletions leakr_badwords.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
;; Changed 'mum' to Info severity
$RE:(^|[\W])((?i)mum(?-i))($|[\W]);0;Use regex as mum is a common letter combo
2 changes: 1 addition & 1 deletion multichecklist/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pip install absl-py
pip install checklist
```

3. Download the MultiCheckList tests and model predictions from [here](https://pantheon.corp.google.com/storage/browser/xtreme_translations/MultiCheckList) into this folder. [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) is
3. Download the MultiCheckList tests and model predictions from [here](https://console.cloud.google.com/storage/browser/xtreme_translations/MultiCheckList) into this folder. [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) is
the preferred way to download all files at once:

```
Expand Down
54 changes: 54 additions & 0 deletions scripts/download_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,58 @@ function download_siqa {
echo "Successfully downloaded data at $DIR/siqa" >> $DIR/download.log
}

function download_mewslix {
  # Download and preprocess the Mewsli-X entity-linking dataset.
  # Reads:  $DIR (download root), $REPO (repo root, for utils_preprocess.py).
  # Writes: $DIR/mewslix/ and appends a status line to $DIR/download.log.
  # Needs:  svn, conda, bash, python on PATH; network access.
  echo "download mewslix [5-10 mins]"
  OUTPATH=$DIR/mewslix
  OUTPATH_TMP=$OUTPATH/tmp
  # Fail fast: if scratch space can't be created/entered, the mv/rm below
  # would otherwise operate on whatever the current directory happens to be.
  mkdir -p "$OUTPATH_TMP" || return 1
  pushd "$OUTPATH_TMP" || return 1
  # Export just the relevant subtree of the google-research repository.
  svn export -q https://github.com/google-research/google-research/trunk/dense_representations_for_entity_retrieval
  pushd dense_representations_for_entity_retrieval/mel || return 1

  ( # Use a subshell to keep things separate from an already active conda env.
    # If 'conda activate' fails below, try uncommenting the following lines,
    # based on https://github.com/conda/conda/issues/7980.
    # CONDA_PATH=$(conda info | grep -i 'base environment' | awk '{print $4}')
    # source $CONDA_PATH/etc/profile.d/conda.sh

    # Create and activate a dedicated conda environment for Mewsli-X extraction.
    bash create-env.sh conda
    conda activate mewsli_env

    # Run the Mewsli-X downloader script.
    bash get-mewsli-x.sh
  )

  INTERIM_DIR=$PWD/mewsli_x/output/dataset
  echo
  echo "Move dataset to $OUTPATH/"
  mv "$INTERIM_DIR"/candidate_set_entities.jsonl "$OUTPATH"/
  mv "$INTERIM_DIR"/wikipedia_pairs-{train,dev}.jsonl "$OUTPATH"/
  mv "$INTERIM_DIR"/wikinews_mentions-{dev,test}.jsonl "$OUTPATH"/
  popd
  popd

  python "$REPO"/utils_preprocess.py --data_dir "$OUTPATH" --output_dir "$OUTPATH" --task mewslix
  # Drop scratch space plus the raw mention files already converted above.
  rm -rf "$OUTPATH_TMP" "$OUTPATH"/wikinews_mentions-{dev,test}.jsonl
  echo "Successfully downloaded data at $OUTPATH" >> "$DIR"/download.log
}

function download_lareqa {
  # Download the LAReQA answer-retrieval dataset from GitHub.
  # Reads:  $DIR (download root).
  # Writes: $DIR/lareqa/ and appends a status line to $DIR/download.log.
  echo "download lareqa"
  OUTPATH=$DIR/lareqa/
  mkdir -p "$OUTPATH" || return 1
  # Guard the cd: if it failed, the rm -rf / rm below would run in whatever
  # directory the caller happened to be in.
  cd "$OUTPATH" || return 1
  wget https://github.com/google-research-datasets/lareqa/archive/master.zip -q --show-progress
  unzip master.zip
  # Flatten the archive's top-level folder into $OUTPATH, then drop the
  # wrapper directory, the archive, and repo metadata we don't need.
  mv lareqa-master/* .
  rm -rf lareqa-master/
  rm -f master.zip LICENSE README.md
  echo "Successfully downloaded data at $OUTPATH" >> "$DIR"/download.log
}

download_xnli
download_pawsx
download_tatoeba
Expand All @@ -257,3 +309,5 @@ download_udpos
download_panx
download_xcopa
download_siqa
download_mewslix
download_lareqa
34 changes: 0 additions & 34 deletions scripts/download_lareqa_data.sh

This file was deleted.

55 changes: 0 additions & 55 deletions scripts/download_mewslix_data.sh

This file was deleted.

3 changes: 2 additions & 1 deletion scripts/run_eval_lareqa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ MODEL=${1:-bert-base-multilingual-cased}
GPU=${2:-0}
DATA_DIR=${3:-"$REPO/download/"}
OUT_DIR=${4:-"$REPO/outputs/"}
# Select a checkpoint based on validation performance.
# Select a checkpoint based on validation performance. Results in the XTREME-R
# paper used mBERT checkpoint-9000 and XLM-R checkpoint-10000.
CHECKPOINT=${5:-checkpoint-9000}

TASK='lareqa'
Expand Down
5 changes: 5 additions & 0 deletions scripts/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,10 @@ elif [ $TASK == 'bucc2018' ]; then
bash $REPO/scripts/run_bucc2018.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'tatoeba' ]; then
bash $REPO/scripts/run_tatoeba.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'mewslix' ]; then
bash $REPO/scripts/train_mewslix.sh $MODEL $GPU $DATA_DIR $OUT_DIR
elif [ $TASK == 'lareqa' ]; then
bash $REPO/scripts/train_lareqa.sh $MODEL $GPU $DATA_DIR $OUT_DIR
bash $REPO/scripts/run_eval_lareqa.sh $MODEL $GPU $DATA_DIR $OUT_DIR
fi

18 changes: 9 additions & 9 deletions scripts/train_mewslix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,27 @@ OUT_DIR=${4:-"$REPO/outputs"}
TASK='mewslix'

# These settings should match those used in scripts/run_eval_mewslix.sh
# They are primarily aimed at a quick turnaround time (~1h using 1 GPU).
# They are primarily aimed at a quick training time (~1h using 1 GPU for mBERT).
MAX_SEQ_LEN=64
NUM_EPOCHS=2
GRAD_ACC_STEPS=4

# Learning rates were set based on best dev-set loss on the English
# 'wikipedia_pairs-dev' after 2 epochs, searching over {1e-5, 2e-5, 5e-5, 1e-4}.
if [ $MODEL == "bert-base-multilingual-cased" ]; then
MODEL_TYPE="bert-retrieval"
LR=2e-5 # best dev-set loss after 2 epochs among {1e-5, 2e-5, 5e-5, 1e-4}
LR=2e-5
DO_LOWER_CASE=""
PER_GPU_BATCH_SIZE=64 # largest power of two that fit 16GB GPU RAM
LOGGING_STEPS=50
SAVE_STEPS=100
# BEGIN GOOGLE-INTERNAL
# TODO(jabot,ruder): The XLM-R baseline requires some further hyperparam
# tuning and/or troubleshooting, as it scored near zero on the eval metric
# using the current settings.
elif [ $MODEL == "xlm-roberta-large" ]; then
MODEL_TYPE="xlmr-retrieval"
LR=1e-4
LR=1e-5
DO_LOWER_CASE="--do_lower_case"
PER_GPU_BATCH_SIZE=8 # largest power of two that fit 16GB GPU RAM
LOGGING_STEPS=500
SAVE_STEPS=2000
# END GOOGLE-INTERNAL
else
echo "$MODEL not configured."
fi
Expand Down Expand Up @@ -82,4 +79,7 @@ python third_party/run_retrieval_el.py \
--train_lang en \
--eval_lang en \
$DO_LOWER_CASE \
2>&1 | tee $MODEL_DIR/train.log
2>&1 | tee $MODEL_DIR/train.log

set +x
bash $REPO/scripts/run_eval_mewslix.sh $MODEL_DIR $GPU $DATA_DIR $OUT_DIR
Loading

0 comments on commit 838c13b

Please sign in to comment.