DelVel · Jul 15, 2024
diff --git a/‎README.md
+99 b/‎README.md
+99
diff --git a/‎SentEval/LICENSE
+30 b/‎SentEval/LICENSE
+30
diff --git a/‎SentEval/README.md
+249 b/‎SentEval/README.md
+249
diff --git a/‎SentEval/data/downstream/download_dataset.sh
+2 b/‎SentEval/data/downstream/download_dataset.sh
+2
diff --git a/‎SentEval/examples/bow.py
+113 b/‎SentEval/examples/bow.py
+113
diff --git a/‎SentEval/examples/bow_word_piece.py
+158 b/‎SentEval/examples/bow_word_piece.py
+158
diff --git a/‎SentEval/examples/gensen.py
+74 b/‎SentEval/examples/gensen.py
+74
diff --git a/‎SentEval/examples/googleuse.py
+67 b/‎SentEval/examples/googleuse.py
+67
diff --git a/‎SentEval/examples/infersent.py
+76 b/‎SentEval/examples/infersent.py
+76
diff --git a/‎SentEval/examples/models.py
+265 b/‎SentEval/examples/models.py
+265
diff --git a/‎SentEval/examples/skipthought.py
+61 b/‎SentEval/examples/skipthought.py
+61
diff --git a/‎SentEval/senteval/__init__.py
+10 b/‎SentEval/senteval/__init__.py
+10
diff --git a/‎SentEval/senteval/binary.py
+92 b/‎SentEval/senteval/binary.py
+92
diff --git a/‎SentEval/senteval/engine.py
+131 b/‎SentEval/senteval/engine.py
+131
diff --git a/‎SentEval/senteval/mrpc.py
+104 b/‎SentEval/senteval/mrpc.py
+104
diff --git a/‎SentEval/senteval/probing.py
+171 b/‎SentEval/senteval/probing.py
+171
diff --git a/‎SentEval/senteval/rank.py
+108 b/‎SentEval/senteval/rank.py
+108
diff --git a/‎SentEval/senteval/sick.py
+216 b/‎SentEval/senteval/sick.py
+216
diff --git a/‎SentEval/senteval/snli.py
+113 b/‎SentEval/senteval/snli.py
+113
diff --git a/‎SentEval/senteval/sst.py
+96 b/‎SentEval/senteval/sst.py
+96
diff --git a/‎SentEval/senteval/sts.py
+265 b/‎SentEval/senteval/sts.py
+265
diff --git a/‎SentEval/senteval/tools/__init__.py b/‎SentEval/senteval/tools/__init__.py
diff --git a/‎SentEval/senteval/tools/classifier.py
+202 b/‎SentEval/senteval/tools/classifier.py
+202
@@ -0,0 +1,99 @@
+# E5-V: Universal Embeddings with Multimodal Large Language Models
+
+## Example
+``` python
+import torch
+import torch.nn.functional as F
+import requests
+from PIL import Image
+from transformers import AutoTokenizer
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+
+llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'
+
+processor = LlavaNextProcessor.from_pretrained('royokong/e5-v')
+model = LlavaNextForConditionalGeneration.from_pretrained('royokong/e5-v', torch_dtype=torch.float16).cuda()
+
+img_prompt = llama3_template.format('<image>\nSummary above image in one word: ')
+text_prompt = llama3_template.format('<sent>\nSummary above sentence in one word: ')
+
+urls = ['https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg',
+        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg']
+images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
+
+texts = ['A dog sitting in the grass.',
+         'A cat standing in the snow.']
+
+text_inputs = processor([text_prompt.replace('<sent>', text) for text in texts], return_tensors="pt", padding=True).to('cuda')
+img_inputs = processor([img_prompt]*len(images), images, return_tensors="pt", padding=True).to('cuda')
+
+with torch.no_grad():
+    text_embs = model(**text_inputs, output_hidden_states=True, return_dict=True).hidden_states[-1][:, -1, :]
+    img_embs = model(**img_inputs, output_hidden_states=True, return_dict=True).hidden_states[-1][:, -1, :]
+
+    text_embs = F.normalize(text_embs, dim=-1)
+    img_embs = F.normalize(img_embs, dim=-1)
+
+print(text_embs @ img_embs.t())
+```
+
+
+## Evaluate
+To reproduce the results reported in the paper, run the following:
+```sh
+# eval on coco, flickr30k, fashioniq and cirr
+accelerate launch --num_machines=1 --num_processes 8 --machine_rank 0 retrieval.py  --use_e5v 
+
+# eval on i2i-coco, i2i-flickr30k
+accelerate launch --num_machines=1 --num_processes 8 --machine_rank 0 retrieval.py  --use_e5v  --ocr_replace_text
+
+# eval on sts tasks
+cd SentEval/data/downstream/
+bash download_dataset.sh
+cd -
+accelerate launch --num_machines=1 --num_processes 8 --machine_rank 0 eval_sts.py --model_name_or_path royokong/e5-v
+```
+
+## Training
+1. Install Dependencies
+
+``` sh
+pip install -r requirements.txt
+```
+
+2. Download Data
+
+``` sh
+cd ./data
+bash download_nli.sh
+cd -
+```
+
+3. Convert the llava-llama-3-8b model to Hugging Face format
+
+``` sh
+mkdir -p models
+cd models
+for i in 1 2 3 4; do
+    wget https://huggingface.co/lmms-lab/llama3-llava-next-8b/resolve/main/model-0000$i-of-00004.safetensors
+done
+cd -
+python load_llama3_hf.py
+rm models/*.safetensors
+```
+
+4. Train
+``` sh
+bash run.sh
+```
+
+5. Test
+Use `--lora_path` flag to test the results.
+``` sh
+accelerate launch --num_machines=1 --num_processes 8 --machine_rank 0 retrieval.py \
+    --llava_llama3 --lora_path e5v-8b  --batch_size 1
+```
+
+
+## Acknowledgement
+Our code is based on SimCSE and alpaca-lora.
@@ -0,0 +1,30 @@
+BSD License
+
+For SentEval software
+
+Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name Facebook nor the names of its contributors may be used to
+   endorse or promote products derived from this software without specific
+   prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,249 @@
+Our modification to SentEval:
+
+1. Add the `all` setting to all STS tasks.
+2. Change STS-B and SICK-R to not use an additional regressor.
+
+# SentEval: evaluation toolkit for sentence embeddings
+
+SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations.
+
+
+**(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings**
+
+**(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)**
+
+## Dependencies
+
+This code is written in python. The dependencies are:
+
+* Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/)
+* [Pytorch](http://pytorch.org/)>=0.4
+* [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0
+
+## Transfer tasks
+
+### Downstream tasks
+SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks:
+
+| Task     	| Type                         	| #train 	| #test 	| needs_train 	| set_classifier |
+|----------	|------------------------------	|-----------:|----------:|:-----------:|:----------:|
+| [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       	| movie review                 	| 11k     	| 11k    	| 1 | 1 |
+| [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)       	| product review               	| 4k      	| 4k     	| 1 | 1 |
+| [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     	| subjectivity status          	| 10k     	| 10k    	| 1 | 1 |
+| [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm)     	| opinion-polarity  | 11k     	| 11k    	| 1 | 1 |
+| [SST](https://nlp.stanford.edu/sentiment/index.html)      	| binary sentiment analysis  	| 67k     	| 1.8k   	| 1 | 1 |
+| **[SST](https://nlp.stanford.edu/sentiment/index.html)**      	| **fine-grained sentiment analysis**  	| 8.5k     	| 2.2k   	| 1 | 1 |
+| [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/)     	| question-type classification 	| 6k      	| 0.5k    	| 1 | 1 |
+| [SICK-E](http://clic.cimec.unitn.it/composes/sick.html)   	| natural language inference 	| 4.5k    	| 4.9k   	| 1 | 1 |
+| [SNLI](https://nlp.stanford.edu/projects/snli/)     	| natural language inference   	| 550k    	| 9.8k   	| 1 | 1 |
+| [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection  | 4.1k | 1.7k | 1 | 1 |
+| [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) 	| semantic textual similarity  	| N/A     	| 3.1k   	| 0  | 0 |
+| [STS 2013](http://ixa2.si.ehu.es/sts/) 	| semantic textual similarity  	| N/A     	| 1.5k   	| 0  | 0 |
+| [STS 2014](http://alt.qcri.org/semeval2014/task10/) 	| semantic textual similarity  	| N/A     	| 3.7k   	| 0  | 0 |
+| [STS 2015](http://alt.qcri.org/semeval2015/task2/) 	| semantic textual similarity  	| N/A     	| 8.5k   	| 0  | 0 |
+| [STS 2016](http://alt.qcri.org/semeval2016/task1/) 	| semantic textual similarity  	| N/A     	| 9.2k   	| 0  | 0 |
+| [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results)    	| semantic textual similarity  	| 5.7k    	| 1.4k   	| 1 | 0 |
+| [SICK-R](http://clic.cimec.unitn.it/composes/sick.html)   	| semantic textual similarity | 4.5k    	| 4.9k   	| 1 | 0 |
+| [COCO](http://mscoco.org/)     	| image-caption retrieval      	| 567k    	| 5*1k   	| 1 | 0 |
+
+where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below).
+
+Note: COCO comes with ResNet-101 2048d image embeddings. [More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf)
+
+### Probing tasks
+SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings:
+
+| Task     	| Type                         	| #train 	| #test 	| needs_train 	| set_classifier |
+|----------	|------------------------------	|-----------:|----------:|:-----------:|:----------:|
+| [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Length prediction	| 100k     	| 10k    	| 1 | 1 |
+| [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Word Content analysis	| 100k     	| 10k    	| 1 | 1 |
+| [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Tree depth prediction	| 100k     	| 10k    	| 1 | 1 |
+| [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Top Constituents prediction	| 100k     	| 10k    	| 1 | 1 |
+| [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Word order analysis	| 100k     	| 10k    	| 1 | 1 |
+| [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Verb tense prediction	| 100k     	| 10k    	| 1 | 1 |
+| [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Subject number prediction	| 100k     	| 10k    	| 1 | 1 |
+| [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Object number prediction	| 100k     	| 10k    	| 1 | 1 |
+| [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Semantic odd man out	| 100k     	| 10k    	| 1 | 1 |
+| [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing)	| Coordination Inversion | 100k     	| 10k    	| 1 | 1 |
+
+## Download datasets
+To get all the transfer tasks datasets, run (in data/downstream/):
+```bash
+./get_transfer_data.bash
+```
+This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default.
+
+## How to use SentEval: examples
+
+### examples/bow.py
+
+In examples/bow.py, we evaluate the quality of the average of word embeddings.
+
+To download state-of-the-art GloVe and fastText embeddings:
+
+```bash
+curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
+curl -Lo crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
+```
+
+To reproduce the results for bag-of-vectors, run (in examples/):  
+```bash
+python bow.py
+```
+
+As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features.
+
+### examples/infersent.py
+
+To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/):
+```bash
+curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl
+curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl
+```
+
+### examples/skipthought.py - examples/gensen.py - examples/googleuse.py
+
+We also provide example scripts for three other encoders:
+
+* [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano
+* [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch
+* [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow
+
+Note that for SkipThought and GenSen, following the steps of the associated githubs is necessary.
+The Google encoder script should work as-is.
+
+## How to use SentEval
+
+To evaluate your sentence embeddings, SentEval requires that you implement two functions:
+
+1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc)
+2. **batcher** (transforms a batch of text sentences into sentence embeddings)
+
+
+### 1.) prepare(params, samples) (optional)
+
+*batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task.
+
+```
+prepare(params, samples)
+```
+* *params*: senteval parameters.
+* *samples*: list of all sentences from the transfer task.
+* *output*: No output. Arguments stored in "params" can further be used by *batcher*.
+
+*Example*: in bow.py, prepare is used to build the vocabulary of words and construct the *params.word_vec* dictionary of word vectors.
+
+
+### 2.) batcher(params, batch)
+```
+batcher(params, batch)
+```
+* *params*: senteval parameters.
+* *batch*: numpy array of text sentences (of size params.batch_size)
+* *output*: numpy array of sentence embeddings (of size params.batch_size)
+
+*Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences.
+
+### 3.) evaluation on transfer tasks
+
+After having implemented the *batcher* and *prepare* functions for your own sentence encoder,
+
+1) to perform the actual evaluation, first import senteval and set its parameters:
+```python
+import senteval
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
+```
+
+2) (optional) set the parameters of the classifier (when applicable):
+```python
+params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
+                                 'tenacity': 5, 'epoch_size': 4}
+```
+You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training.
+
+3) Create an instance of the class SE:
+```python
+se = senteval.engine.SE(params, batcher, prepare)
+```
+
+4) define the set of transfer tasks and run the evaluation:
+```python
+transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark']
+results = se.eval(transfer_tasks)
+```
+The current list of available tasks is:
+```python
+['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI',
+'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval',
+'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense',
+'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion']
+```
+
+## SentEval parameters
+Global parameters of SentEval:
+```bash
+# senteval parameters
+task_path                   # path to SentEval datasets (required)
+seed                        # seed
+usepytorch                  # use cuda-pytorch (else scikit-learn) where possible
+kfold                       # k-fold validation for MR/CR/SUBJ/MPQA.
+```
+
+Parameters of the classifier:
+```bash
+nhid:                       # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh
+optim:                      # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
+tenacity:                   # how many times dev acc does not increase before training stops
+epoch_size:                 # each epoch corresponds to epoch_size pass on the train set
+max_epoch:                  # max number of epochs
+dropout:                    # dropout for MLP
+```
+
+Note that to get a proxy of the results while **dramatically reducing computation time**,
+we suggest the **prototyping config**:
+```python
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+```
+which will result in a 5 times speedup for classification tasks.
+
+To produce results that are **comparable to the literature**, use the **default config**:
+```python
+params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
+params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
+                                 'tenacity': 5, 'epoch_size': 4}
+```
+which takes longer but will produce better and comparable results.
+
+For probing tasks, we used an MLP with a Sigmoid nonlinearity and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set.
+
+## References
+
+Please consider citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods.
+
+### SentEval: An Evaluation Toolkit for Universal Sentence Representations
+
+[1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449)
+
+```
+@article{conneau2018senteval,
+  title={SentEval: An Evaluation Toolkit for Universal Sentence Representations},
+  author={Conneau, Alexis and Kiela, Douwe},
+  journal={arXiv preprint arXiv:1803.05449},
+  year={2018}
+}
+```
+
+Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com)
+
+### Related work
+* [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726)
+* [S. Arora, Y. Liang, T. Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx)
+* [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207)
+* [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364)
+* [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079)
+* [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334)
+* [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175)
+* [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070)
@@ -0,0 +1,2 @@
+wget --no-check-certificate https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar
+tar xvf senteval.tar
@@ -0,0 +1,113 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+import io
+import numpy as np
+import logging
+
+
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+# PATH_TO_VEC = 'glove/glove.840B.300d.txt'
+PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec'
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+
+# Create dictionary
+def create_dictionary(sentences, threshold=0):
+    """Build a frequency-ordered vocabulary from tokenized sentences.
+
+    Args:
+        sentences: iterable of sentences, each an iterable of hashable tokens.
+        threshold: when > 0, tokens counted fewer than ``threshold`` times
+            are dropped before ids are assigned.
+
+    Returns:
+        A pair ``(id2word, word2id)``: the list of tokens ordered by
+        descending count, and a dict mapping each token to its index in
+        that list.
+    """
+    # Count how often each token occurs across all sentences.
+    words = {}
+    for s in sentences:
+        for word in s:
+            words[word] = words.get(word, 0) + 1
+
+    if threshold > 0:
+        newwords = {}
+        for word in words:
+            if words[word] >= threshold:
+                newwords[word] = words[word]
+        words = newwords
+    # The three marker tokens are given counts above 1e9 so that they sort
+    # ahead of every token whose real count is smaller than that.
+    words['<s>'] = 1e9 + 4
+    words['</s>'] = 1e9 + 3
+    words['<p>'] = 1e9 + 2
+
+    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
+    id2word = []
+    word2id = {}
+    for i, (w, _) in enumerate(sorted_words):
+        id2word.append(w)
+        word2id[w] = i
+
+    return id2word, word2id
+
+# Get word vectors from vocabulary (glove, word2vec, fasttext ..)
+def get_wordvec(path_to_vec, word2id):
+    word_vec = {}
+
+    with io.open(path_to_vec, 'r', encoding='utf-8') as f:
+        # if word2vec or fasttext file : skip first line "next(f)"
+        for line in f:
+            word, vec = line.split(' ', 1)
+            if word in word2id:
+                word_vec[word] = np.fromstring(vec, sep=' ')
+
+    logging.info('Found {0} words with word vectors, out of \
+        {1} words'.format(len(word_vec), len(word2id)))
+    return word_vec
+
+
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """SentEval ``prepare`` hook: build the vocabulary and load its vectors.
+
+    Stores ``word2id``, ``word_vec`` and ``wvec_dim`` on ``params`` and
+    returns nothing.
+
+    Args:
+        params: SentEval parameters object; written to via attributes.
+        samples: tokenized sentences passed on to ``create_dictionary``.
+    """
+    _, params.word2id = create_dictionary(samples)
+    params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
+    # Hard-coded; presumably the dimensionality of the vectors stored at
+    # PATH_TO_VEC -- confirm whenever that path is changed.
+    params.wvec_dim = 300
+    return
+
+def batcher(params, batch):
+    """SentEval ``batcher`` hook: embed each sentence as the mean of its word vectors.
+
+    Args:
+        params: SentEval parameters; ``word_vec`` and ``wvec_dim`` are read.
+        batch: list of tokenized sentences (lists of tokens).
+
+    Returns:
+        The per-sentence mean vectors stacked with ``np.vstack``, one row
+        per sentence.
+    """
+    # An empty sentence is replaced by the single token '.'.
+    batch = [sent if sent != [] else ['.'] for sent in batch]
+    embeddings = []
+
+    for sent in batch:
+        sentvec = []
+        for word in sent:
+            # Tokens without a loaded vector are ignored.
+            if word in params.word_vec:
+                sentvec.append(params.word_vec[word])
+        # No token of the sentence has a vector: use an all-zero vector.
+        if not sentvec:
+            vec = np.zeros(params.wvec_dim)
+            sentvec.append(vec)
+        sentvec = np.mean(sentvec, 0)
+        embeddings.append(sentvec)
+
+    embeddings = np.vstack(embeddings)
+    return embeddings
+
+
+# Set params for SentEval: data location, PyTorch classifier, 5-fold
+# cross-validation.
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+# Classifier settings; nhid=0 selects no hidden layer.
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+
+# Set up logger (runs at import time, DEBUG level, timestamped messages)
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # Full task list kept for reference; only STSBenchmark is run below.
+    #transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      #'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      #'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      #'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      #'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      #'OddManOut', 'CoordinationInversion']
+    transfer_tasks = ['STSBenchmark']
+    results = se.eval(transfer_tasks)
+    print(results)
@@ -0,0 +1,158 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+import io
+import numpy as np
+import logging
+
+from transformers import BertTokenizer
+
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+# PATH_TO_VEC = 'glove/glove.840B.300d.txt'
+PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec'
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+# WordPiece tokenizer, loaded at import time. In this file it is used by
+# get_bert_wordvec and by the commented-out tokenization variants.
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+# Token filter lists: a set of common tokens and a set of punctuation/symbol
+# tokens. Both are referenced only from commented-out code in this file.
+a_remove_set = {".", "a", "the", "in", ",", "is", "to", "of", "and", "'", "on", "man", "-", "s", "with", "for", "\"", "at", "##s", "woman", "are", "it", "two", "that", "you", "dog", "said", "playing", "i", "an", "as", "was", "from", ":", "by", "white"}
+remove_set = {'?', '*', '#', '´', '’', '=', '…', '|', '~', '/', '‚', '¿', '–', '»', '-', '€', '‘', '"', '(', '•', '`', '$', ':', '[', '”', '%', '£', '<', '[UNK]', ';', '“', '@', '_', '{', '^', ',', '.', '!', '™', '&', ']', '>', '\\', "'", ')', '+', '—'}
+
+# Create dictionary
+def create_dictionary(sentences, threshold=0):
+    """Build a frequency-ordered vocabulary from tokenized sentences.
+
+    Args:
+        sentences: iterable of sentences, each an iterable of hashable tokens.
+        threshold: when > 0, tokens counted fewer than ``threshold`` times
+            are dropped before ids are assigned.
+
+    Returns:
+        A pair ``(id2word, word2id)``: the list of tokens ordered by
+        descending count, and a dict mapping each token to its index in
+        that list.
+    """
+    # Count how often each token occurs across all sentences.
+    words = {}
+    for s in sentences:
+        for word in s:
+            words[word] = words.get(word, 0) + 1
+        # Disabled variant: count WordPiece tokens instead of the given tokens.
+        #for word in tokenizer.convert_ids_to_tokens(tokenizer.encode(' '.join(s), add_special_tokens=False)):
+            #if '##' in word and word in remove_set: continue
+            #words[word] = words.get(word, 0) + 1
+
+    if threshold > 0:
+        newwords = {}
+        for word in words:
+            if words[word] >= threshold:
+                newwords[word] = words[word]
+        words = newwords
+    # The three marker tokens are given counts above 1e9 so that they sort
+    # ahead of every token whose real count is smaller than that.
+    words['<s>'] = 1e9 + 4
+    words['</s>'] = 1e9 + 3
+    words['<p>'] = 1e9 + 2
+
+    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
+    id2word = []
+    word2id = {}
+    for i, (w, _) in enumerate(sorted_words):
+        id2word.append(w)
+        word2id[w] = i
+
+    return id2word, word2id
+
+# Get word vectors from vocabulary (glove, word2vec, fasttext ..)
+def get_wordvec(path_to_vec, word2id):
+    word_vec = {}
+
+    with io.open(path_to_vec, 'r', encoding='utf-8') as f:
+        # if word2vec or fasttext file : skip first line "next(f)"
+        for line in f:
+            word, vec = line.split(' ', 1)
+            if word in word2id:
+                word_vec[word] = np.fromstring(vec, sep=' ')
+
+    logging.info('Found {0} words with word vectors, out of \
+        {1} words'.format(len(word_vec), len(word2id)))
+    return word_vec
+
+def get_bert_wordvec(path_to_vec, word2id):
+    word_vec = {}
+    from transformers import BertModel
+    bert = BertModel.from_pretrained('bert-base-uncased')
+    vocab = tokenizer.get_vocab()
+    bert_word_vec = bert.embeddings.word_embeddings.weight.detach().numpy()
+
+    for word in word2id:
+        if word in ['<s>', '</s>', '<p>']:
+            word_vec[word] = np.zeros(768)
+        else:
+            word_vec[word] = bert_word_vec[vocab[word]]
+
+    logging.info('Found {0} words with word vectors, out of \
+        {1} words'.format(len(word_vec), len(word2id)))
+    return word_vec
+
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """SentEval ``prepare`` hook: build the vocabulary and load its vectors.
+
+    Stores ``word2id``, ``word_vec`` and ``wvec_dim`` on ``params`` and
+    returns nothing.
+
+    Args:
+        params: SentEval parameters object; written to via attributes.
+        samples: tokenized sentences passed on to ``create_dictionary``.
+    """
+    _, params.word2id = create_dictionary(samples)
+    params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
+    # Hard-coded; presumably the dimensionality of the vectors stored at
+    # PATH_TO_VEC -- confirm whenever that path is changed.
+    params.wvec_dim = 300
+    # Disabled alternative: BERT input embeddings (768 dimensions).
+    #params.word_vec = get_bert_wordvec(PATH_TO_VEC, params.word2id)
+    #params.wvec_dim = 768
+    return
+
+def batcher(params, batch):
+    """SentEval ``batcher`` hook: embed each sentence as the mean of its word vectors.
+
+    Args:
+        params: SentEval parameters; ``word_vec`` and ``wvec_dim`` are read.
+        batch: list of tokenized sentences (lists of tokens).
+
+    Returns:
+        The per-sentence mean vectors stacked with ``np.vstack``, one row
+        per sentence.
+    """
+    # An empty sentence is replaced by the single token '.'.
+    batch = [sent if sent != [] else ['.'] for sent in batch]
+    embeddings = []
+
+    for sent in batch:
+        sentvec = []
+        # Disabled variant: iterate over WordPiece tokens of the sentence.
+        # for word in tokenizer.convert_ids_to_tokens(tokenizer.encode(' '.join(sent), add_special_tokens=False)):
+        for word in sent:
+            # Tokens without a loaded vector are ignored.
+            if word in params.word_vec:# and word not in a_remove_set and word not in remove_set:
+                sentvec.append(params.word_vec[word])
+        # No token of the sentence has a vector: use an all-zero vector.
+        if not sentvec:
+            vec = np.zeros(params.wvec_dim)
+            sentvec.append(vec)
+        sentvec = np.mean(sentvec, 0)
+        embeddings.append(sentvec)
+
+    embeddings = np.vstack(embeddings)
+    return embeddings
+
+
+# Set params for SentEval: data location, PyTorch classifier, 5-fold
+# cross-validation.
+params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
+# Classifier settings; nhid=0 selects no hidden layer.
+params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
+                                 'tenacity': 3, 'epoch_size': 2}
+
+# Set up logger (runs at import time, DEBUG level, timestamped messages)
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    # Full task list kept for reference; only the seven STS-style tasks
+    # below are run.
+    #transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
+                      #'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                      #'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
+                      #'Length', 'WordContent', 'Depth', 'TopConstituents',
+                      #'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                      #'OddManOut', 'CoordinationInversion']
+    transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
+    results = se.eval(transfer_tasks)
+    print(results)
+    # Collect one Spearman score (x100, two decimals) per task for the table.
+    task_names = []
+    scores = []
+    for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']:
+        task_names.append(task)
+        if task in results:
+            if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
+                # STS12-16 results are read from the aggregated 'all' entry.
+                scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100))
+            else:
+                # STSBenchmark / SICKRelatedness are read from the test split.
+                scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100))
+        else:
+            # A task missing from the results is reported as 0.00 and still
+            # counts towards the average below.
+            scores.append("0.00")
+    task_names.append("Avg.")
+    # The average is taken over the already-rounded score strings.
+    scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
+
+    # Imported here, so prettytable is only needed when run as a script.
+    from prettytable import PrettyTable
+    tb = PrettyTable()
+    tb.field_names = task_names
+    tb.add_row(scores)
+    print(tb)
@@ -0,0 +1,74 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+"""
+Clone GenSen repo here: https://github.com/Maluuba/gensen.git
+And follow instructions for loading the model used in batcher
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+import logging
+# import GenSen package
+from gensen import GenSen, GenSenSingle
+
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """SentEval `prepare` hook; this example needs no per-task preparation."""
+    return None
+
+def batcher(params, batch):
+    """Encode a batch of tokenized sentences with the GenSen encoder.
+
+    params: SentEval parameters; params['gensen'] holds the encoder object.
+    batch: list of sentences, each a list of tokens.  An empty sentence is
+        replaced by '.' so that the encoder never receives an empty string.
+    Returns the second value produced by get_representation.
+    """
+    sentences = [' '.join(sent) if sent != [] else '.' for sent in batch]
+    # The encoder is read from params; the joined batch is what gets encoded.
+    _, reps_h_t = params['gensen'].get_representation(
+        sentences, pool='last', return_numpy=True, tokenize=True
+    )
+    return reps_h_t
+
+# Load GenSen model: two single encoders combined into one.
+gensen_1 = GenSenSingle(
+    model_folder='../data/models',
+    filename_prefix='nli_large_bothskip',
+    pretrained_emb='../data/embedding/glove.840B.300d.h5'
+)
+gensen_2 = GenSenSingle(
+    model_folder='../data/models',
+    filename_prefix='nli_large_bothskip_parse',
+    pretrained_emb='../data/embedding/glove.840B.300d.h5'
+)
+gensen_encoder = GenSen(gensen_1, gensen_2)
+# No representation is computed at load time: there are no sentences to
+# encode at module level, encoding only happens inside `batcher`.
+
+# SentEval configuration; the GenSen encoder travels with it under 'gensen'.
+params_senteval = {
+    'task_path': PATH_TO_DATA,
+    'usepytorch': True,
+    'kfold': 5,
+    'classifier': {
+        'nhid': 0,
+        'optim': 'rmsprop',
+        'batch_size': 128,
+        'tenacity': 3,
+        'epoch_size': 2,
+    },
+    'gensen': gensen_encoder,
+}
+
+# Timestamped log lines at DEBUG verbosity.
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    sts_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+    downstream_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
+                        'MRPC', 'SICKEntailment', 'SICKRelatedness',
+                        'STSBenchmark']
+    probing_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
+                     'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                     'OddManOut', 'CoordinationInversion']
+
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = sts_tasks + downstream_tasks + probing_tasks
+    results = se.eval(transfer_tasks)
+    print(results)
@@ -0,0 +1,67 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+from __future__ import absolute_import, division
+
+import os
+import sys
+import logging
+import tensorflow as tf
+import tensorflow_hub as hub
+tf.logging.set_verbosity(0)
+
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+
+# import SentEval
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+# tensorflow session
+# NOTE(review): make_embed_fn below opens its own MonitoredSession and binds
+# it to a local variable that is also called `session`; nothing in this file
+# appears to use this module-level one - confirm whether it is still needed.
+session = tf.Session()
+# NOTE(review): this is set after `import tensorflow` has already run -
+# confirm that it still has the intended effect on TensorFlow's logging.
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+
+# SentEval prepare and batcher
+def prepare(params, samples):
+    """SentEval `prepare` hook; this example needs no per-task preparation."""
+    return None
+
+def batcher(params, batch):
+    """Embed a batch of tokenized sentences with params['google_use'].
+
+    Each sentence is joined with single spaces; an empty sentence is replaced
+    by '.' before the embedding function is called.
+    """
+    texts = []
+    for sent in batch:
+        if sent == []:
+            texts.append('.')
+        else:
+            texts.append(' '.join(sent))
+    embed = params['google_use']
+    return embed(texts)
+
+def make_embed_fn(module):
+  """Build the embedding graph for `module` and return a function running it.
+
+  The returned function takes the value to feed to the string placeholder
+  and returns the result of running the module's output in the session that
+  was opened for this graph.
+  """
+  with tf.Graph().as_default():
+    text_input = tf.placeholder(tf.string)
+    hub_module = hub.Module(module)
+    embedded = hub_module(text_input)
+    # Opened while the new graph is the default one.
+    monitored_session = tf.train.MonitoredSession()
+
+  def embed_fn(x):
+    return monitored_session.run(embedded, {text_input: x})
+
+  return embed_fn
+
+# Start TF session and load Google Universal Sentence Encoder
+USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder-large/2"
+encoder = make_embed_fn(USE_MODULE_URL)
+
+# SentEval configuration; the embedding function travels under 'google_use'.
+params_senteval = {
+    'task_path': PATH_TO_DATA,
+    'usepytorch': True,
+    'kfold': 5,
+    'classifier': {
+        'nhid': 0,
+        'optim': 'rmsprop',
+        'batch_size': 128,
+        'tenacity': 3,
+        'epoch_size': 2,
+    },
+    'google_use': encoder,
+}
+
+# Timestamped log lines at DEBUG verbosity.
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    sts_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+    downstream_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
+                        'MRPC', 'SICKEntailment', 'SICKRelatedness',
+                        'STSBenchmark']
+    probing_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
+                     'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                     'OddManOut', 'CoordinationInversion']
+
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = sts_tasks + downstream_tasks + probing_tasks
+    results = se.eval(transfer_tasks)
+    print(results)
@@ -0,0 +1,76 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+"""
+InferSent models. See https://github.com/facebookresearch/InferSent.
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+import os
+import torch
+import logging
+
+# get models.py from InferSent repo
+from models import InferSent
+
+# Set PATHs
+PATH_SENTEVAL = '../'
+PATH_TO_DATA = '../data'
+PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt'  # or crawl-300d-2M.vec for V2
+MODEL_PATH = 'infersent1.pkl'
+V = 1 # version of InferSent
+
+assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \
+    'Set MODEL and GloVe PATHs'
+
+# import senteval
+sys.path.insert(0, PATH_SENTEVAL)
+import senteval
+
+
+def prepare(params, samples):
+    """Build the InferSent vocabulary from all sentences of the current task.
+
+    samples: list of tokenized sentences; each is joined with single spaces
+    and passed on with tokenize=False.
+    """
+    joined = []
+    for tokens in samples:
+        joined.append(' '.join(tokens))
+    params.infersent.build_vocab(joined, tokenize=False)
+
+
+def batcher(params, batch):
+    """Encode a batch of tokenized sentences with params.infersent.
+
+    Sentences are joined with single spaces and encoded with
+    bsize=params.batch_size and tokenize=False.
+    """
+    joined = list(map(' '.join, batch))
+    return params.infersent.encode(joined, bsize=params.batch_size,
+                                   tokenize=False)
+
+
+# Evaluation of trained model on Transfer Tasks (SentEval)
+
+# define senteval params
+params_senteval = {
+    'task_path': PATH_TO_DATA,
+    'usepytorch': True,
+    'kfold': 5,
+    'classifier': {
+        'nhid': 0,
+        'optim': 'rmsprop',
+        'batch_size': 128,
+        'tenacity': 3,
+        'epoch_size': 2,
+    },
+}
+# Timestamped log lines at DEBUG verbosity.
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    # Load InferSent model
+    infersent_config = {
+        'bsize': 64,
+        'word_emb_dim': 300,
+        'enc_lstm_dim': 2048,
+        'pool_type': 'max',
+        'dpout_model': 0.0,
+        'version': V,
+    }
+    model = InferSent(infersent_config)
+    state = torch.load(MODEL_PATH)
+    model.load_state_dict(state)
+    model.set_w2v_path(PATH_TO_W2V)
+
+    # The model is moved to the GPU before being handed to SentEval.
+    params_senteval['infersent'] = model.cuda()
+
+    sts_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+    downstream_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
+                        'MRPC', 'SICKEntailment', 'SICKRelatedness',
+                        'STSBenchmark']
+    probing_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
+                     'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                     'OddManOut', 'CoordinationInversion']
+
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = sts_tasks + downstream_tasks + probing_tasks
+    results = se.eval(transfer_tasks)
+    print(results)
@@ -0,0 +1,265 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+"""
+This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf
+"""
+
+import numpy as np
+import time
+
+import torch
+import torch.nn as nn
+
+
+class InferSent(nn.Module):
+    """Bidirectional LSTM sentence encoder with mean or max pooling over time.
+
+    `config` must provide 'bsize', 'word_emb_dim', 'enc_lstm_dim',
+    'pool_type' and 'dpout_model'.  'version' (1 or 2, default 1) selects the
+    sentence boundary tokens, how padding is treated by max pooling and which
+    tokenization is used.
+    """
+
+    def __init__(self, config):
+        super(InferSent, self).__init__()
+        self.bsize = config['bsize']
+        self.word_emb_dim = config['word_emb_dim']
+        self.enc_lstm_dim = config['enc_lstm_dim']
+        self.pool_type = config['pool_type']
+        self.dpout_model = config['dpout_model']
+        self.version = 1 if 'version' not in config else config['version']
+
+        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
+                                bidirectional=True, dropout=self.dpout_model)
+
+        assert self.version in [1, 2]
+        if self.version == 1:
+            self.bos = '<s>'
+            self.eos = '</s>'
+            self.max_pad = True
+            self.moses_tok = False
+        elif self.version == 2:
+            self.bos = '<p>'
+            self.eos = '</p>'
+            self.max_pad = False
+            self.moses_tok = True
+
+    def is_cuda(self):
+        """Return True when the LSTM weights live on a CUDA device."""
+        # either all weights are on cpu or they are on gpu
+        return self.enc_lstm.bias_hh_l0.data.is_cuda
+
+    def forward(self, sent_tuple):
+        """Encode a padded batch of sentences.
+
+        sent_tuple is (sent, sent_len): `sent` is a
+        (seqlen x bsize x worddim) tensor and `sent_len` a numpy array with
+        the length of every sentence.  Returns one pooled vector per
+        sentence, in the same order as the input.
+        """
+        sent, sent_len = sent_tuple
+        on_gpu = self.is_cuda()
+
+        # Sort by length (keep idx): the packed sequence is built from
+        # lengths in decreasing order.
+        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
+        sent_len_sorted = sent_len_sorted.copy()
+        idx_unsort = np.argsort(idx_sort)
+
+        idx_sort = torch.from_numpy(idx_sort)
+        if on_gpu:
+            idx_sort = idx_sort.cuda()
+        sent = sent.index_select(1, idx_sort)
+
+        # Handling padding in Recurrent Networks
+        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
+        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
+        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]
+
+        # Un-sort by length
+        idx_unsort = torch.from_numpy(idx_unsort)
+        if on_gpu:
+            idx_unsort = idx_unsort.cuda()
+        sent_output = sent_output.index_select(1, idx_unsort)
+
+        # Pooling
+        if self.pool_type == "mean":
+            # Lengths follow the device of the weights, so a CPU model works.
+            lengths = torch.from_numpy(
+                np.ascontiguousarray(sent_len)).float().unsqueeze(1)
+            if on_gpu:
+                lengths = lengths.cuda()
+            # The sum is already (bsize x 2*nhid); it is not squeezed, so a
+            # batch holding a single sentence keeps its batch dimension.
+            emb = torch.sum(sent_output, 0)
+            emb = emb / lengths.expand_as(emb)
+        elif self.pool_type == "max":
+            if not self.max_pad:
+                # exact zeros are pushed far down so they cannot win the max
+                sent_output[sent_output == 0] = -1e9
+            emb = torch.max(sent_output, 0)[0]
+            if emb.ndimension() == 3:
+                emb = emb.squeeze(0)
+                assert emb.ndimension() == 2
+
+        return emb
+
+    def set_w2v_path(self, w2v_path):
+        """Remember the path of the word-vector text file."""
+        self.w2v_path = w2v_path
+
+    def get_word_dict(self, sentences, tokenize=True):
+        """Return a dict keyed by every word of `sentences`, plus bos/eos."""
+        # create vocab of words
+        word_dict = {}
+        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
+        for sent in sentences:
+            for word in sent:
+                if word not in word_dict:
+                    word_dict[word] = ''
+        word_dict[self.bos] = ''
+        word_dict[self.eos] = ''
+        return word_dict
+
+    def get_w2v(self, word_dict):
+        """Load the vectors of the words in `word_dict` from the w2v file."""
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        # create word_vec with w2v vectors
+        word_vec = {}
+        with open(self.w2v_path, encoding='utf-8') as f:
+            for line in f:
+                word, vec = line.split(' ', 1)
+                if word in word_dict:
+                    word_vec[word] = np.fromstring(vec, sep=' ')
+        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
+        return word_vec
+
+    def get_w2v_k(self, K):
+        """Load the first entries of the w2v file, plus the bos/eos vectors.
+
+        Reading stops once more than K entries were loaded and both boundary
+        tokens have a vector.
+        """
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        # create word_vec with k first w2v vectors
+        k = 0
+        word_vec = {}
+        boundaries = (self.bos, self.eos)
+        with open(self.w2v_path, encoding='utf-8') as f:
+            for line in f:
+                word, vec = line.split(' ', 1)
+                if k <= K:
+                    word_vec[word] = np.fromstring(vec, sep=' ')
+                    k += 1
+                if k > K:
+                    if word in boundaries:
+                        word_vec[word] = np.fromstring(vec, sep=' ')
+                    if all(w in word_vec for w in boundaries):
+                        break
+        return word_vec
+
+    def build_vocab(self, sentences, tokenize=True):
+        """Set self.word_vec to the vectors of the words in `sentences`."""
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        word_dict = self.get_word_dict(sentences, tokenize)
+        self.word_vec = self.get_w2v(word_dict)
+        print('Vocab size : %s' % (len(self.word_vec)))
+
+    # build w2v vocab with k most frequent words
+    def build_vocab_k_words(self, K):
+        """Set self.word_vec from the first entries of the w2v file."""
+        assert hasattr(self, 'w2v_path'), 'w2v path not set'
+        self.word_vec = self.get_w2v_k(K)
+        # report what was actually loaded rather than the requested K
+        print('Vocab size : %s' % (len(self.word_vec)))
+
+    def update_vocab(self, sentences, tokenize=True):
+        """Add vectors for words of `sentences` missing from self.word_vec."""
+        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
+        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
+        word_dict = self.get_word_dict(sentences, tokenize)
+
+        # keep only new words
+        for word in self.word_vec:
+            if word in word_dict:
+                del word_dict[word]
+
+        # update vocabulary
+        if word_dict:
+            new_word_vec = self.get_w2v(word_dict)
+            self.word_vec.update(new_word_vec)
+        else:
+            new_word_vec = {}
+        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))
+
+    def get_batch(self, batch):
+        """Stack word vectors into a zero-padded float tensor.
+
+        `batch` holds sentences in decreasing order of length (the first one
+        sets the padded length).  The result is (max_len, bsize, word_dim).
+        """
+        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))
+
+        for i, sent in enumerate(batch):
+            for j, word in enumerate(sent):
+                embed[j, i, :] = self.word_vec[word]
+
+        return torch.FloatTensor(embed)
+
+    def tokenize(self, s):
+        """Split the string `s` into tokens with nltk's word_tokenize."""
+        from nltk.tokenize import word_tokenize
+        if self.moses_tok:
+            s = ' '.join(word_tokenize(s))
+            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
+            return s.split()
+        else:
+            return word_tokenize(s)
+
+    def prepare_samples(self, sentences, bsize, tokenize, verbose):
+        """Tokenize, filter and sort sentences before encoding.
+
+        Returns (sentences, lengths, idx_sort): the token lists ordered by
+        decreasing length (as a 1-D object array), their lengths, and the
+        permutation that was applied.
+        """
+        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
+                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
+        n_w = np.sum([len(x) for x in sentences])
+
+        # filters words without w2v vectors
+        for i in range(len(sentences)):
+            s_f = [word for word in sentences[i] if word in self.word_vec]
+            if not s_f:
+                import warnings
+                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. '
+                              'Replacing by "%s"..' % (sentences[i], i, self.eos))
+                s_f = [self.eos]
+            sentences[i] = s_f
+
+        lengths = np.array([len(s) for s in sentences])
+        n_wk = np.sum(lengths)
+        if verbose:
+            print('Nb words kept : %s/%s (%.1f%s)' % (
+                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))
+
+        # sort by decreasing length
+        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
+        # Token lists usually differ in length, so they are stored one by one
+        # in an object array instead of being converted with np.array().
+        ordered = np.empty(len(sentences), dtype=object)
+        for pos, src in enumerate(idx_sort):
+            ordered[pos] = sentences[src]
+
+        return ordered, lengths, idx_sort
+
+    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
+        """Return a numpy array with one embedding per input sentence.
+
+        Sentences are encoded in batches of `bsize` after sorting by length;
+        the rows of the result follow the order of `sentences`.
+        """
+        tic = time.time()
+        sentences, lengths, idx_sort = self.prepare_samples(
+                        sentences, bsize, tokenize, verbose)
+
+        embeddings = []
+        for stidx in range(0, len(sentences), bsize):
+            batch = self.get_batch(sentences[stidx:stidx + bsize])
+            if self.is_cuda():
+                batch = batch.cuda()
+            with torch.no_grad():
+                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
+            embeddings.append(batch)
+        embeddings = np.vstack(embeddings)
+
+        # unsort
+        idx_unsort = np.argsort(idx_sort)
+        embeddings = embeddings[idx_unsort]
+
+        if verbose:
+            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
+                    len(embeddings)/(time.time()-tic),
+                    'gpu' if self.is_cuda() else 'cpu', bsize))
+        return embeddings
+
+    def visualize(self, sent, tokenize=True):
+        """Plot how often each word of `sent` provides the max-pooled value.
+
+        Returns (output, idxs): the max over time of the LSTM output and the
+        positions at which each maximum was found.
+        """
+        sent = sent.split() if not tokenize else self.tokenize(sent)
+        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]
+
+        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
+            import warnings
+            warnings.warn('No words in "%s" have w2v vectors. Replacing '
+                          'by "%s %s"..' % (sent, self.bos, self.eos))
+        batch = self.get_batch(sent)
+
+        if self.is_cuda():
+            batch = batch.cuda()
+        output = self.enc_lstm(batch)[0]
+        output, idxs = torch.max(output, 0)
+        # output, idxs = output.squeeze(), idxs.squeeze()
+        idxs = idxs.data.cpu().numpy()
+        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]
+
+        # visualize model
+        import matplotlib.pyplot as plt
+        x = range(len(sent[0]))
+        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
+        plt.xticks(x, sent[0], rotation=45)
+        plt.bar(x, y)
+        plt.ylabel('%')
+        plt.title('Visualisation of words importance')
+        plt.show()
+
+        return output, idxs
@@ -0,0 +1,61 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+from __future__ import absolute_import, division, unicode_literals
+
+"""
+Example of file for SkipThought in SentEval
+"""
+import logging
+import sys
+
+# sys.setdefaultencoding does not exist on Python 3 (and is normally removed
+# at start-up on Python 2), so it is only called when the interpreter
+# actually exposes it; otherwise importing this script would fail here.
+if hasattr(sys, 'setdefaultencoding'):
+    sys.setdefaultencoding('utf8')
+
+
+# Set PATHs
+PATH_TO_SENTEVAL = '../'
+PATH_TO_DATA = '../data/senteval_data/'
+PATH_TO_SKIPTHOUGHT = ''
+
+assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH'
+
+# import skipthought and Senteval
+sys.path.insert(0, PATH_TO_SKIPTHOUGHT)
+import skipthoughts
+sys.path.insert(0, PATH_TO_SENTEVAL)
+import senteval
+
+
+def prepare(params, samples):
+    """SentEval `prepare` hook; this example needs no per-task preparation."""
+    return None
+
+def batcher(params, batch):
+    """Encode a batch of tokenized sentences with the SkipThought model.
+
+    params: SentEval parameters; params['encoder'] holds the loaded model.
+    batch: list of sentences, each a list of tokens.  An empty sentence is
+        replaced by '.'.
+    Returns whatever skipthoughts.encode produces for the joined sentences.
+    """
+    # The joined sentence is already text; passing it through
+    # str(..., errors="ignore") raises TypeError, so it is used directly.
+    sentences = [' '.join(sent) if sent != [] else '.' for sent in batch]
+    embeddings = skipthoughts.encode(params['encoder'], sentences,
+                                     verbose=False, use_eos=True)
+    return embeddings
+
+
+# SentEval configuration for SkipThought (10-fold, adam-trained classifier).
+params_senteval = {
+    'task_path': PATH_TO_DATA,
+    'usepytorch': True,
+    'kfold': 10,
+    'batch_size': 512,
+    'classifier': {
+        'nhid': 0,
+        'optim': 'adam',
+        'batch_size': 64,
+        'tenacity': 5,
+        'epoch_size': 4,
+    },
+}
+# Timestamped log lines at DEBUG verbosity.
+logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
+
+if __name__ == "__main__":
+    # Load SkipThought model
+    params_senteval['encoder'] = skipthoughts.load_model()
+
+    sts_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']
+    downstream_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC',
+                        'MRPC', 'SICKEntailment', 'SICKRelatedness',
+                        'STSBenchmark']
+    probing_tasks = ['Length', 'WordContent', 'Depth', 'TopConstituents',
+                     'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                     'OddManOut', 'CoordinationInversion']
+
+    se = senteval.engine.SE(params_senteval, batcher, prepare)
+    transfer_tasks = sts_tasks + downstream_tasks + probing_tasks
+    results = se.eval(transfer_tasks)
+    print(results)
@@ -0,0 +1,10 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+from __future__ import absolute_import
+
+from senteval.engine import SE
@@ -0,0 +1,92 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+import io
+import os
+import numpy as np
+import logging
+
+from senteval.tools.validation import InnerKFoldClassifier
+
+
+class BinaryClassifierEval(object):
+    """Shared evaluation logic for the two-class sentence tasks."""
+
+    def __init__(self, pos, neg, seed=1111):
+        self.seed = seed
+        # `pos` samples come first and are labelled 1, `neg` samples 0.
+        self.samples = pos + neg
+        self.labels = [1] * len(pos) + [0] * len(neg)
+        self.n_samples = len(self.samples)
+
+    def do_prepare(self, params, prepare):
+        """Call `prepare` with the whole text and return its result.
+
+        prepare puts everything it outputs in "params" : params.word2id etc.
+        Those outputs will be further used by "batcher".
+        """
+        return prepare(params, self.samples)
+
+    def loadFile(self, fpath):
+        """Read a latin-1 file and return one whitespace-split list per line."""
+        with io.open(fpath, 'r', encoding='latin-1') as handle:
+            content = handle.read()
+        return [line.split() for line in content.splitlines()]
+
+    def run(self, params, batcher):
+        """Embed every sample, then run the inner k-fold classifier.
+
+        Returns a dict with 'devacc', 'acc', 'ndev' and 'ntest'.
+        """
+        # Sort by (length, label) to reduce padding inside each batch.
+        ordered = sorted(zip(self.samples, self.labels),
+                         key=lambda pair: (len(pair[0]), pair[1]))
+        sorted_samples = [sample for sample, _ in ordered]
+        sorted_labels = [label for _, label in ordered]
+
+        logging.info('Generating sentence embeddings')
+        chunks = [
+            batcher(params, sorted_samples[start:start + params.batch_size])
+            for start in range(0, self.n_samples, params.batch_size)
+        ]
+        enc_input = np.vstack(chunks)
+        logging.info('Generated sentence embeddings')
+
+        config = {
+            'nclasses': 2,
+            'seed': self.seed,
+            'usepytorch': params.usepytorch,
+            'classifier': params.classifier,
+            'nhid': params.nhid,
+            'kfold': params.kfold,
+        }
+        clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)
+        devacc, testacc = clf.run()
+        logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc))
+        return {
+            'devacc': devacc,
+            'acc': testacc,
+            'ndev': self.n_samples,
+            'ntest': self.n_samples,
+        }
+
+
+class CREval(BinaryClassifierEval):
+    """CR task: positive/negative samples from custrev.pos / custrev.neg."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : CR *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # without end as soon as this class is itself subclassed.
+        super(CREval, self).__init__(pos, neg, seed)
+
+
+class MREval(BinaryClassifierEval):
+    """MR task: samples from rt-polarity.pos / rt-polarity.neg."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : MR *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # without end as soon as this class is itself subclassed.
+        super(MREval, self).__init__(pos, neg, seed)
+
+
+class SUBJEval(BinaryClassifierEval):
+    """SUBJ task: subj.objective samples get label 1, subj.subjective 0."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : SUBJ *****\n\n')
+        obj = self.loadFile(os.path.join(task_path, 'subj.objective'))
+        subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # without end as soon as this class is itself subclassed.
+        super(SUBJEval, self).__init__(obj, subj, seed)
+
+
+class MPQAEval(BinaryClassifierEval):
+    """MPQA task: positive/negative samples from mpqa.pos / mpqa.neg."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('***** Transfer task : MPQA *****\n\n')
+        pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))
+        neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # without end as soon as this class is itself subclassed.
+        super(MPQAEval, self).__init__(pos, neg, seed)
@@ -0,0 +1,131 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+
+Generic sentence evaluation scripts wrapper
+
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+from senteval import utils
+from senteval.binary import CREval, MREval, MPQAEval, SUBJEval
+from senteval.snli import SNLIEval
+from senteval.trec import TRECEval
+from senteval.sick import SICKEntailmentEval, SICKEval
+from senteval.mrpc import MRPCEval
+from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune, STSBenchmarkEvalDev
+from senteval.sst import SSTEval
+from senteval.rank import ImageCaptionRetrievalEval
+from senteval.probing import *
+
+class SE(object):
+    """Front end that builds and runs one evaluation task, or a list of them."""
+
+    # Task name -> (name of the evaluation class, data directory below
+    # task_path, extra keyword arguments).  The class is looked up by name
+    # among this module's globals when the task is evaluated.
+    _TASK_TABLE = {
+        # Original SentEval tasks
+        'CR': ('CREval', '/downstream/CR', {}),
+        'MR': ('MREval', '/downstream/MR', {}),
+        'MPQA': ('MPQAEval', '/downstream/MPQA', {}),
+        'SUBJ': ('SUBJEval', '/downstream/SUBJ', {}),
+        'SST2': ('SSTEval', '/downstream/SST/binary', {'nclasses': 2}),
+        'SST5': ('SSTEval', '/downstream/SST/fine', {'nclasses': 5}),
+        'TREC': ('TRECEval', '/downstream/TREC', {}),
+        'MRPC': ('MRPCEval', '/downstream/MRPC', {}),
+        'SICKRelatedness': ('SICKRelatednessEval', '/downstream/SICK', {}),
+        'STSBenchmark': ('STSBenchmarkEval', '/downstream/STS/STSBenchmark', {}),
+        'STSBenchmark-dev': ('STSBenchmarkEvalDev', '/downstream/STS/STSBenchmark', {}),
+        'STSBenchmark-fix': ('STSBenchmarkEval', '/downstream/STS/STSBenchmark-fix', {}),
+        'STSBenchmark-finetune': ('STSBenchmarkFinetune', '/downstream/STS/STSBenchmark', {}),
+        'SICKRelatedness-finetune': ('SICKEval', '/downstream/SICK', {}),
+        'SICKEntailment': ('SICKEntailmentEval', '/downstream/SICK', {}),
+        'SNLI': ('SNLIEval', '/downstream/SNLI', {}),
+        'STS12': ('STS12Eval', '/downstream/STS/STS12-en-test', {}),
+        'STS13': ('STS13Eval', '/downstream/STS/STS13-en-test', {}),
+        'STS14': ('STS14Eval', '/downstream/STS/STS14-en-test', {}),
+        'STS15': ('STS15Eval', '/downstream/STS/STS15-en-test', {}),
+        'STS16': ('STS16Eval', '/downstream/STS/STS16-en-test', {}),
+        'ImageCaptionRetrieval': ('ImageCaptionRetrievalEval', '/downstream/COCO', {}),
+        # Probing Tasks
+        'Length': ('LengthEval', '/probing', {}),
+        'WordContent': ('WordContentEval', '/probing', {}),
+        'Depth': ('DepthEval', '/probing', {}),
+        'TopConstituents': ('TopConstituentsEval', '/probing', {}),
+        'BigramShift': ('BigramShiftEval', '/probing', {}),
+        'Tense': ('TenseEval', '/probing', {}),
+        'SubjNumber': ('SubjNumberEval', '/probing', {}),
+        'ObjNumber': ('ObjNumberEval', '/probing', {}),
+        'OddManOut': ('OddManOutEval', '/probing', {}),
+        'CoordinationInversion': ('CoordinationInversionEval', '/probing', {}),
+    }
+
+    def __init__(self, params, batcher, prepare=None):
+        """Store the callbacks and fill in defaults for missing parameters.
+
+        params: mapping of settings; it is wrapped in utils.dotdict.
+        batcher: callable invoked as batcher(params, batch).
+        prepare: optional callable invoked as prepare(params, samples); a
+            no-op is used when it is not given.
+        """
+        # parameters
+        params = utils.dotdict(params)
+        defaults = (
+            ('usepytorch', True),
+            ('seed', 1111),
+            ('batch_size', 128),
+            ('nhid', 0),
+            ('kfold', 5),
+        )
+        for key, value in defaults:
+            if key not in params:
+                setattr(params, key, value)
+
+        # A missing or empty classifier config falls back to nhid=0.
+        if 'classifier' not in params or not params['classifier']:
+            params.classifier = {'nhid': 0}
+
+        assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'
+
+        self.params = params
+
+        # batcher and prepare
+        self.batcher = batcher
+        self.prepare = prepare or (lambda x, y: None)
+
+        self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
+                           'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
+                           'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',
+                           'STS14', 'STS15', 'STS16',
+                           'Length', 'WordContent', 'Depth', 'TopConstituents',
+                           'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
+                           'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix', 'STSBenchmark-dev']
+
+    def eval(self, name):
+        """Evaluate task `name`; a list of names yields a dict of results.
+
+        For a single name the task's evaluation object is created, prepared
+        with self.prepare and run with self.batcher; its result is stored in
+        self.results and returned.
+        """
+        # evaluate on evaluation [name], either takes string or list of strings
+        if isinstance(name, list):
+            self.results = {x: self.eval(x) for x in name}
+            return self.results
+
+        tpath = self.params.task_path
+        assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)
+
+        class_name, subdir, extra = self._TASK_TABLE[name]
+        evaluation_cls = globals()[class_name]
+        self.evaluation = evaluation_cls(tpath + subdir, seed=self.params.seed, **extra)
+
+        self.params.current_task = name
+        self.evaluation.do_prepare(self.params, self.prepare)
+
+        self.results = self.evaluation.run(self.params, self.batcher)
+
+        return self.results
@@ -0,0 +1,104 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+MRPC : Microsoft Research Paraphrase (detection) Corpus
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import logging
+import numpy as np
+import io
+
+from senteval.tools.validation import KFoldClassifier
+
+from sklearn.metrics import f1_score
+
+
+class MRPCEval(object):
+    """MRPC transfer task: embed sentence pairs, then fit a k-fold classifier."""
+
+    def __init__(self, task_path, seed=1111):
+        """Load the train and test TSV files located under ``task_path``."""
+        logging.info('***** Transfer task : MRPC *****\n\n')
+        self.seed = seed
+        train_file = os.path.join(task_path, 'msr_paraphrase_train.txt')
+        test_file = os.path.join(task_path, 'msr_paraphrase_test.txt')
+        train = self.loadFile(train_file)
+        test = self.loadFile(test_file)
+        self.mrpc_data = {'train': train, 'test': test}
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with every tokenized sentence of both splits."""
+        # TODO : Should we separate samples in "train, test"?
+        data = self.mrpc_data
+        samples = (data['train']['X_A'] + data['train']['X_B'] +
+                   data['test']['X_A'] + data['test']['X_B'])
+        return prepare(params, samples)
+
+    def loadFile(self, fpath):
+        """Parse one tab-separated file into ``{'X_A', 'X_B', 'y'}``.
+
+        Column 0 is the label, columns 3 and 4 are the two sentences.
+        The first row is parsed like any other and then discarded.
+        """
+        sents_a, sents_b, raw_labels = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                fields = line.strip().split('\t')
+                sents_a.append(fields[3].split())
+                sents_b.append(fields[4].split())
+                raw_labels.append(fields[0])
+
+        # Index 0 holds the header row.
+        return {'X_A': sents_a[1:],
+                'X_B': sents_b[1:],
+                'y': [int(s) for s in raw_labels[1:]]}
+
+    def run(self, params, batcher):
+        """Embed both splits with ``batcher`` and evaluate with KFoldClassifier.
+
+        Returns a dict with dev accuracy, test accuracy, test F1 and the
+        number of train ('ndev') and test ('ntest') pairs.
+        """
+        mrpc_embed = {'train': {}, 'test': {}}
+
+        def pair_features(emb_a, emb_b):
+            # |a - b| concatenated column-wise with a * b
+            return np.c_[np.abs(emb_a - emb_b), emb_a * emb_b]
+
+        for key in self.mrpc_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            split = self.mrpc_data[key]
+            # Sort to reduce padding
+            triples = sorted(zip(split['X_A'], split['X_B'], split['y']),
+                             key=lambda t: (len(t[0]), len(t[1]), t[2]))
+            ordered = {'A': [t[0] for t in triples],
+                       'B': [t[1] for t in triples]}
+            labels = [t[2] for t in triples]
+
+            for txt_type in ['A', 'B']:
+                chunks = []
+                start = 0
+                while start < len(labels):
+                    stop = start + params.batch_size
+                    chunks.append(batcher(params, ordered[txt_type][start:stop]))
+                    start = stop
+                mrpc_embed[key][txt_type] = np.vstack(chunks)
+            mrpc_embed[key]['y'] = np.array(labels)
+            logging.info('Computed {0} embeddings'.format(key))
+
+        # Train
+        trainA = mrpc_embed['train']['A']
+        trainF = pair_features(trainA, mrpc_embed['train']['B'])
+        trainY = mrpc_embed['train']['y']
+
+        # Test
+        testA = mrpc_embed['test']['A']
+        testF = pair_features(testA, mrpc_embed['test']['B'])
+        testY = mrpc_embed['test']['y']
+
+        config = {'nclasses': 2, 'seed': self.seed,
+                  'usepytorch': params.usepytorch,
+                  'classifier': params.classifier,
+                  'nhid': params.nhid, 'kfold': params.kfold}
+        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
+                              test={'X': testF, 'y': testY}, config=config)
+
+        devacc, testacc, yhat = clf.run()
+        testf1 = round(100*f1_score(testY, yhat), 2)
+        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
+                      .format(devacc, testacc, testf1))
+        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
+                'ndev': len(trainA), 'ntest': len(testA)}
@@ -0,0 +1,171 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+probing tasks
+'''
+
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import io
+import copy
+import logging
+import numpy as np
+
+from senteval.tools.validation import SplitClassifier
+
+
+class PROBINGEval(object):
+    """Base class for the probing classification tasks.
+
+    Reads one tab-separated data file containing all three splits, maps the
+    string labels to integer ids and evaluates sentence embeddings with a
+    SplitClassifier.
+    """
+
+    def __init__(self, task, task_path, seed=1111):
+        """Load ``task_path`` and log the size of each split.
+
+        task: task name used in log messages and for the WordContent
+            special case in ``run``.
+        task_path: path of the data file handed to ``loadFile``.
+        seed: stored and forwarded to the classifier config.
+        """
+        self.seed = seed
+        self.task = task
+        logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())
+        self.task_data = {'train': {'X': [], 'y': []},
+                          'dev': {'X': [], 'y': []},
+                          'test': {'X': [], 'y': []}}
+        self.loadFile(task_path)
+        logging.info('Loaded %s train - %s dev - %s test for %s' %
+                     (len(self.task_data['train']['y']), len(self.task_data['dev']['y']),
+                      len(self.task_data['test']['y']), self.task))
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with the tokenized sentences of all three splits."""
+        samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \
+                  self.task_data['test']['X']
+        return prepare(params, samples)
+
+    def loadFile(self, fpath):
+        """Fill ``self.task_data`` from ``fpath`` and build the label mapping.
+
+        Each line is tab-separated: the first field is the split token
+        ('tr', 'va' or 'te'), the second the label, the last the sentence.
+        Label ids are assigned from the sorted unique labels of the train
+        split; a label missing from train raises KeyError.
+        """
+        self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                fields = line.rstrip().split('\t')
+                split = self.tok2split[fields[0]]
+                self.task_data[split]['X'].append(fields[-1].split())
+                self.task_data[split]['y'].append(fields[1])
+
+        labels = sorted(np.unique(self.task_data['train']['y']))
+        self.tok2label = dict(zip(labels, range(len(labels))))
+        self.nclasses = len(self.tok2label)
+
+        # Replace string labels by their integer ids, in place.
+        for split in self.task_data:
+            for i, y in enumerate(self.task_data[split]['y']):
+                self.task_data[split]['y'][i] = self.tok2label[y]
+
+    def run(self, params, batcher):
+        """Embed every split with ``batcher`` and run the SplitClassifier.
+
+        Each split of ``self.task_data`` is re-ordered in place by sentence
+        length. Returns a dict with 'devacc', 'acc', 'ndev' and 'ntest'.
+        """
+        task_embed = {'train': {}, 'dev': {}, 'test': {}}
+        bsize = params.batch_size
+        logging.info('Computing embeddings for train/dev/test')
+        for key in self.task_data:
+            # Sort to reduce padding
+            sorted_data = sorted(zip(self.task_data[key]['X'],
+                                     self.task_data[key]['y']),
+                                 key=lambda z: (len(z[0]), z[1]))
+            self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))
+
+            task_embed[key]['X'] = []
+            for ii in range(0, len(self.task_data[key]['y']), bsize):
+                batch = self.task_data[key]['X'][ii:ii + bsize]
+                embeddings = batcher(params, batch)
+                task_embed[key]['X'].append(embeddings)
+            task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
+            task_embed[key]['y'] = np.array(self.task_data[key]['y'])
+        logging.info('Computed embeddings')
+
+        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
+                             'usepytorch': params.usepytorch,
+                             'classifier': params.classifier}
+
+        if self.task == "WordContent" and params.classifier['nhid'] > 0:
+            # Deep-copy first so that forcing nhid to 0 does not modify the
+            # caller's params.classifier.
+            config_classifier = copy.deepcopy(config_classifier)
+            config_classifier['classifier']['nhid'] = 0
+            # Previously a bare print(); report through logging instead of
+            # writing to stdout.
+            logging.debug('WordContent: classifier nhid %s overridden to 0',
+                          params.classifier['nhid'])
+
+        clf = SplitClassifier(X={'train': task_embed['train']['X'],
+                                 'valid': task_embed['dev']['X'],
+                                 'test': task_embed['test']['X']},
+                              y={'train': task_embed['train']['y'],
+                                 'valid': task_embed['dev']['y'],
+                                 'test': task_embed['test']['y']},
+                              config=config_classifier)
+
+        devacc, testacc = clf.run()
+        logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n',
+                      devacc, testacc, self.task.upper())
+
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(task_embed['dev']['X']),
+                'ntest': len(task_embed['test']['X'])}
+
+"""
+Surface Information
+"""
+class LengthEval(PROBINGEval):
+    """Probing task 'Length', read from ``sentence_length.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: bins
+        data_file = os.path.join(task_path, 'sentence_length.txt')
+        PROBINGEval.__init__(self, 'Length', data_file, seed)
+
+class WordContentEval(PROBINGEval):
+    """Probing task 'WordContent', read from ``word_content.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 200 target words
+        data_file = os.path.join(task_path, 'word_content.txt')
+        PROBINGEval.__init__(self, 'WordContent', data_file, seed)
+
+"""
+Latent Structural Information
+"""
+class DepthEval(PROBINGEval):
+    """Probing task 'Depth', read from ``tree_depth.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: bins
+        data_file = os.path.join(task_path, 'tree_depth.txt')
+        PROBINGEval.__init__(self, 'Depth', data_file, seed)
+
+class TopConstituentsEval(PROBINGEval):
+    """Probing task 'TopConstituents', read from ``top_constituents.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'PP_NP_VP_.' .. (20 classes)
+        data_file = os.path.join(task_path, 'top_constituents.txt')
+        PROBINGEval.__init__(self, 'TopConstituents', data_file, seed)
+
+class BigramShiftEval(PROBINGEval):
+    """Probing task 'BigramShift', read from ``bigram_shift.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 0 or 1
+        data_file = os.path.join(task_path, 'bigram_shift.txt')
+        PROBINGEval.__init__(self, 'BigramShift', data_file, seed)
+
+# TODO: Voice?
+
+"""
+Latent Semantic Information
+"""
+
+class TenseEval(PROBINGEval):
+    """Probing task 'Tense', read from ``past_present.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'PRES', 'PAST'
+        data_file = os.path.join(task_path, 'past_present.txt')
+        PROBINGEval.__init__(self, 'Tense', data_file, seed)
+
+class SubjNumberEval(PROBINGEval):
+    """Probing task 'SubjNumber', read from ``subj_number.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'NN', 'NNS'
+        data_file = os.path.join(task_path, 'subj_number.txt')
+        PROBINGEval.__init__(self, 'SubjNumber', data_file, seed)
+
+class ObjNumberEval(PROBINGEval):
+    """Probing task 'ObjNumber', read from ``obj_number.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'NN', 'NNS'
+        data_file = os.path.join(task_path, 'obj_number.txt')
+        PROBINGEval.__init__(self, 'ObjNumber', data_file, seed)
+
+class OddManOutEval(PROBINGEval):
+    """Probing task 'OddManOut', read from ``odd_man_out.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'O', 'C'
+        data_file = os.path.join(task_path, 'odd_man_out.txt')
+        PROBINGEval.__init__(self, 'OddManOut', data_file, seed)
+
+class CoordinationInversionEval(PROBINGEval):
+    """Probing task 'CoordinationInversion', read from ``coordination_inversion.txt``."""
+
+    def __init__(self, task_path, seed=1111):
+        # labels: 'O', 'I'
+        data_file = os.path.join(task_path, 'coordination_inversion.txt')
+        PROBINGEval.__init__(self, 'CoordinationInversion', data_file, seed)
@@ -0,0 +1,108 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+Image-Caption Retrieval with COCO dataset
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import sys
+import logging
+import numpy as np
+
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+from senteval.tools.ranking import ImageSentenceRankingPytorch
+
+
+class ImageCaptionRetrievalEval(object):
+    """Image-caption retrieval task over pickled train/valid/test splits.
+
+    Captions are embedded with the user-supplied batcher and, together with
+    the image features stored in the pickles, handed to
+    ImageSentenceRankingPytorch.
+    """
+
+    def __init__(self, task_path, seed=1111):
+        """Load the three ``<split>.pkl`` files found under ``task_path``."""
+        logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n')
+
+        # Get captions and image features
+        self.seed = seed
+        train, dev, test = self.loadFile(task_path)
+        self.coco_data = {'train': train, 'dev': dev, 'test': test}
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with every tokenized caption and return its result."""
+        samples = self.coco_data['train']['sent'] + \
+                  self.coco_data['dev']['sent'] + \
+                  self.coco_data['test']['sent']
+        # Return the result, as the other task classes in this package do.
+        return prepare(params, samples)
+
+    def loadFile(self, fpath):
+        """Read ``train.pkl``, ``valid.pkl`` and ``test.pkl`` from ``fpath``.
+
+        For each image the first five captions are kept; each caption gets
+        ' .' appended and is whitespace-tokenized. The image feature is
+        repeated once per kept caption so both lists stay aligned.
+
+        Returns the (train, valid, test) dicts, each with keys 'sent'
+        (list of token lists) and 'imgfeat' (float32 array).
+        Raises AssertionError if an image has fewer than five captions.
+        """
+        coco = {}
+        is_py2 = sys.version_info < (3, 0)
+
+        for split in ['train', 'valid', 'test']:
+            list_sent = []
+            list_img_feat = []
+            # NOTE(review): pickle.load can execute arbitrary code; only point
+            # this at trusted dataset files.
+            if is_py2:
+                with open(os.path.join(fpath, split + '.pkl')) as f:
+                    cocodata = pickle.load(f)
+            else:
+                with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:
+                    cocodata = pickle.load(f, encoding='latin1')
+
+            for imgkey in range(len(cocodata['features'])):
+                caption_ids = cocodata['image_to_caption_ids'][imgkey]
+                assert len(caption_ids) >= 5, caption_ids
+                for captkey in caption_ids[0:5]:
+                    sent = cocodata['captions'][captkey]['cleaned_caption']
+                    sent += ' .'  # add punctuation to end of sentence in COCO
+                    if is_py2:
+                        # Python 2 only: encode to a utf-8 byte string. On
+                        # Python 3 encoding would turn every token into
+                        # ``bytes``, so the text is split as-is there.
+                        sent = sent.encode('utf-8')
+                    list_sent.append(sent.split())
+                    list_img_feat.append(cocodata['features'][imgkey])
+            assert len(list_sent) == len(list_img_feat) and \
+                len(list_sent) % 5 == 0
+            list_img_feat = np.array(list_img_feat).astype('float32')
+            coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}
+        return coco['train'], coco['valid'], coco['test']
+
+    def run(self, params, batcher):
+        """Embed all captions and run the image/sentence ranking model.
+
+        Captions are batched in order of increasing token count and the
+        resulting embeddings are put back into the original caption order so
+        they stay aligned with 'imgfeat'. ``self.coco_data`` is not modified.
+
+        Returns a dict with 'devacc', 'acc' (image-to-text and text-to-image
+        score tuples), 'ndev' and 'ntest'.
+        """
+        coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},
+                      'dev': {'sentfeat': [], 'imgfeat': []},
+                      'test': {'sentfeat': [], 'imgfeat': []}}
+
+        for key in self.coco_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            # Sort to reduce padding. Sorting indices by token count avoids
+            # building a NumPy array out of ragged token lists, which raises
+            # on recent NumPy versions and never sorted by length anyway.
+            sents = list(self.coco_data[key]['sent'])
+            idx_sort = sorted(range(len(sents)), key=lambda i: len(sents[i]))
+            idx_unsort = np.argsort(np.asarray(idx_sort, dtype=int))
+            sorted_sents = [sents[i] for i in idx_sort]
+
+            # NOTE(review): 'X' is never filled here; kept because the dict is
+            # passed on to ImageSentenceRankingPytorch - confirm it is unused.
+            coco_embed[key]['X'] = []
+            nsent = len(sorted_sents)
+            for ii in range(0, nsent, params.batch_size):
+                batch = sorted_sents[ii:ii + params.batch_size]
+                embeddings = batcher(params, batch)
+                coco_embed[key]['sentfeat'].append(embeddings)
+            coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]
+            coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])
+            logging.info('Computed {0} embeddings'.format(key))
+
+        config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}
+        clf = ImageSentenceRankingPytorch(train=coco_embed['train'],
+                                          valid=coco_embed['dev'],
+                                          test=coco_embed['test'],
+                                          config=config)
+
+        bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \
+            r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()
+
+        logging.debug("\nTest scores | Image to text: \
+            {0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
+        logging.debug("Test scores | Text to image: \
+            {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
+
+        return {'devacc': bestdevscore,
+                'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),
+                        (r1_t2i, r5_t2i, r10_t2i, medr_t2i)],
+                'ndev': len(coco_embed['dev']['sentfeat']),
+                'ntest': len(coco_embed['test']['sentfeat'])}
@@ -0,0 +1,216 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+SICK Relatedness and Entailment
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import io
+import logging
+import numpy as np
+
+from sklearn.metrics import mean_squared_error
+from scipy.stats import pearsonr, spearmanr
+
+from senteval.tools.relatedness import RelatednessPytorch
+from senteval.tools.validation import SplitClassifier
+
+class SICKEval(object):
+    """SICK relatedness task: pair features are fed to RelatednessPytorch."""
+
+    def __init__(self, task_path, seed=1111):
+        """Load the train, trial (dev) and annotated test files."""
+        logging.debug('***** Transfer task : SICK-Relatedness*****\n\n')
+        self.seed = seed
+        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
+        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
+        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
+        self.sick_data = {'train': train, 'dev': dev, 'test': test}
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with both sentence columns of every split."""
+        samples = []
+        for split in ('train', 'dev', 'test'):
+            samples = samples + self.sick_data[split]['X_A']
+            samples = samples + self.sick_data[split]['X_B']
+        return prepare(params, samples)
+
+    def loadFile(self, fpath):
+        """Parse a tab-separated file, skipping its first line.
+
+        Columns 1 and 2 are the two sentences, column 3 the score, which is
+        converted to float once the whole file has been read.
+        """
+        sents_a, sents_b, raw_scores = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for lineno, line in enumerate(f):
+                if lineno == 0:
+                    # header row
+                    continue
+                fields = line.strip().split('\t')
+                sents_a.append(fields[1].split())
+                sents_b.append(fields[2].split())
+                raw_scores.append(fields[3])
+
+        return {'X_A': sents_a, 'X_B': sents_b,
+                'y': [float(s) for s in raw_scores]}
+
+    def run(self, params, batcher):
+        """Embed all splits, train the relatedness model and score the test set.
+
+        Every split of ``self.sick_data`` is re-ordered in place by sentence
+        length. Returns dev Spearman, test Pearson/Spearman/MSE, the
+        predictions and the dev/test sizes.
+        """
+        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
+        bsize = params.batch_size
+
+        for key in self.sick_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            split = self.sick_data[key]
+            # Sort to reduce padding
+            triples = sorted(zip(split['X_A'], split['X_B'], split['y']),
+                             key=lambda t: (len(t[0]), len(t[1]), t[2]))
+
+            split['X_A'] = [t[0] for t in triples]
+            split['X_B'] = [t[1] for t in triples]
+            split['y'] = [t[2] for t in triples]
+
+            for txt_type in ['X_A', 'X_B']:
+                chunks = []
+                for start in range(0, len(split['y']), bsize):
+                    chunks.append(batcher(params, split[txt_type][start:start + bsize]))
+                sick_embed[key][txt_type] = np.vstack(chunks)
+            sick_embed[key]['y'] = np.array(split['y'])
+            logging.info('Computed {0} embeddings'.format(key))
+
+        # Features are |A - B| next to A * B; targets are the encoded scores.
+        feats, targets = {}, {}
+        for key in ('train', 'dev', 'test'):
+            emb_a = sick_embed[key]['X_A']
+            emb_b = sick_embed[key]['X_B']
+            feats[key] = np.c_[np.abs(emb_a - emb_b), emb_a * emb_b]
+            targets[key] = self.encode_labels(self.sick_data[key]['y'])
+
+        config = {'seed': self.seed, 'nclasses': 5}
+        clf = RelatednessPytorch(train={'X': feats['train'], 'y': targets['train']},
+                                 valid={'X': feats['dev'], 'y': targets['dev']},
+                                 test={'X': feats['test'], 'y': targets['test']},
+                                 devscores=self.sick_data['dev']['y'],
+                                 config=config)
+
+        devspr, yhat = clf.run()
+
+        gold = self.sick_data['test']['y']
+        pr = pearsonr(yhat, gold)[0]
+        sr = spearmanr(yhat, gold)[0]
+        # A value that differs from itself is NaN; report 0 instead.
+        if pr != pr:
+            pr = 0
+        if sr != sr:
+            sr = 0
+        se = mean_squared_error(yhat, gold)
+        logging.debug('Dev : Spearman {0}'.format(devspr))
+        logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
+                       for SICK Relatedness\n'.format(pr, sr, se))
+
+        return {'devspearman': devspr, 'pearson': pr, 'spearman': sr, 'mse': se,
+                'yhat': yhat,
+                'ndev': len(sick_embed['dev']['X_A']),
+                'ntest': len(sick_embed['test']['X_A'])}
+
+    def encode_labels(self, labels, nclass=5):
+        """
+        Label encoding from Tree LSTM paper (Tai, Socher, Manning)
+
+        Returns a float32 array of shape (len(labels), nclass). For a score
+        y, column floor(y) - 1 receives floor(y) - y + 1 and column floor(y)
+        receives y - floor(y), whenever that column exists.
+        """
+        Y = np.zeros((len(labels), nclass)).astype('float32')
+        for row, score in enumerate(labels):
+            lower = np.floor(score)
+            for col in range(nclass):
+                cls = col + 1
+                if cls == lower + 1:
+                    Y[row, col] = score - lower
+                if cls == lower:
+                    Y[row, col] = lower - score + 1
+        return Y
+
+
+class SICKEntailmentEval(SICKEval):
+    """SICK entailment task: 3-way classification with SplitClassifier."""
+
+    def __init__(self, task_path, seed=1111):
+        """Load the train, trial (dev) and annotated test files."""
+        logging.debug('***** Transfer task : SICK-Entailment*****\n\n')
+        self.seed = seed
+        train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
+        dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
+        test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
+        self.sick_data = {'train': train, 'dev': dev, 'test': test}
+
+    def loadFile(self, fpath):
+        """Parse a tab-separated file, skipping its first line.
+
+        Columns 1 and 2 are the two sentences; column 4 is the entailment
+        label, mapped to an integer id once the whole file has been read.
+        """
+        label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
+        sents_a, sents_b, raw_labels = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for lineno, line in enumerate(f):
+                if lineno == 0:
+                    # header row
+                    continue
+                fields = line.strip().split('\t')
+                sents_a.append(fields[1].split())
+                sents_b.append(fields[2].split())
+                raw_labels.append(fields[4])
+        return {'X_A': sents_a, 'X_B': sents_b,
+                'y': [label2id[s] for s in raw_labels]}
+
+    def run(self, params, batcher):
+        """Embed all splits and evaluate with a SplitClassifier.
+
+        Every split of ``self.sick_data`` is re-ordered in place by sentence
+        length. Returns a dict with 'devacc', 'acc', 'ndev' and 'ntest'.
+        """
+        sick_embed = {'train': {}, 'dev': {}, 'test': {}}
+        bsize = params.batch_size
+
+        for key in self.sick_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            split = self.sick_data[key]
+            # Sort to reduce padding
+            triples = sorted(zip(split['X_A'], split['X_B'], split['y']),
+                             key=lambda t: (len(t[0]), len(t[1]), t[2]))
+
+            split['X_A'] = [t[0] for t in triples]
+            split['X_B'] = [t[1] for t in triples]
+            split['y'] = [t[2] for t in triples]
+
+            for txt_type in ['X_A', 'X_B']:
+                chunks = []
+                for start in range(0, len(split['y']), bsize):
+                    chunks.append(batcher(params, split[txt_type][start:start + bsize]))
+                sick_embed[key][txt_type] = np.vstack(chunks)
+            logging.info('Computed {0} embeddings'.format(key))
+
+        # Features are |A - B| next to A * B; labels are the integer ids.
+        feats, labels = {}, {}
+        for key in ('train', 'dev', 'test'):
+            emb_a = sick_embed[key]['X_A']
+            emb_b = sick_embed[key]['X_B']
+            feats[key] = np.c_[np.abs(emb_a - emb_b), emb_a * emb_b]
+            labels[key] = np.array(self.sick_data[key]['y'])
+
+        config = {'nclasses': 3, 'seed': self.seed,
+                  'usepytorch': params.usepytorch,
+                  'classifier': params.classifier,
+                  'nhid': params.nhid}
+        clf = SplitClassifier(X={'train': feats['train'],
+                                 'valid': feats['dev'],
+                                 'test': feats['test']},
+                              y={'train': labels['train'],
+                                 'valid': labels['dev'],
+                                 'test': labels['test']},
+                              config=config)
+
+        devacc, testacc = clf.run()
+        logging.debug('\nDev acc : {0} Test acc : {1} for \
+                       SICK entailment\n'.format(devacc, testacc))
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(sick_embed['dev']['X_A']),
+                'ntest': len(sick_embed['test']['X_A'])}
@@ -0,0 +1,113 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+SNLI - Entailment
+'''
+from __future__ import absolute_import, division, unicode_literals
+
+import codecs
+import os
+import io
+import copy
+import logging
+import numpy as np
+
+from senteval.tools.validation import SplitClassifier
+
+
+class SNLIEval(object):
+    """SNLI entailment task: 3-way classification over encoded sentence pairs."""
+
+    def __init__(self, taskpath, seed=1111):
+        """Load ``s1.*``, ``s2.*`` and ``labels.*`` for train, dev and test.
+
+        Each split is sorted by the token count of s2, then s1, then label.
+        ``self.data`` maps 'train'/'valid'/'test' to (s1, s2, labels) and
+        ``self.samples`` holds all sentences, used by ``do_prepare``.
+        """
+        logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
+        self.seed = seed
+        self.data = {}
+        self.samples = []
+
+        for split, suffix in (('train', 'train'), ('valid', 'dev'), ('test', 'test')):
+            s1 = self.loadFile(os.path.join(taskpath, 's1.' + suffix))
+            s2 = self.loadFile(os.path.join(taskpath, 's2.' + suffix))
+            labels = self._loadLabels(os.path.join(taskpath, 'labels.' + suffix))
+
+            # sort data (by s2 first) to reduce padding
+            ordered = sorted(zip(s2, s1, labels),
+                             key=lambda z: (len(z[0]), len(z[1]), z[2]))
+            s2, s1, labels = map(list, zip(*ordered))
+
+            self.data[split] = (s1, s2, labels)
+            self.samples += s1 + s2
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with every sentence of every split."""
+        return prepare(params, self.samples)
+
+    def loadFile(self, fpath):
+        """Return the latin-1 decoded lines of ``fpath`` as token lists."""
+        with codecs.open(fpath, 'rb', 'latin-1') as f:
+            return [line.split() for line in
+                    f.read().splitlines()]
+
+    def _loadLabels(self, fpath):
+        """Return the lines of a utf-8 label file, closing the file afterwards."""
+        with io.open(fpath, encoding='utf-8') as f:
+            return f.read().splitlines()
+
+    def run(self, params, batcher):
+        """Encode every sentence pair and evaluate with a SplitClassifier.
+
+        The feature row of a pair is [enc1, enc2, enc1 * enc2, |enc1 - enc2|].
+        Results are stored on ``self.X`` / ``self.y``. Returns a dict with
+        'devacc', 'acc', 'ndev' and 'ntest'.
+        """
+        self.X, self.y = {}, {}
+        dico_label = {'entailment': 0,  'neutral': 1, 'contradiction': 2}
+        for key in self.data:
+            input1, input2, mylabels = self.data[key]
+            enc_input = []
+            n_labels = len(mylabels)
+            for ii in range(0, n_labels, params.batch_size):
+                batch1 = input1[ii:ii + params.batch_size]
+                batch2 = input2[ii:ii + params.batch_size]
+
+                if len(batch1) == len(batch2) and len(batch1) > 0:
+                    enc1 = batcher(params, batch1)
+                    enc2 = batcher(params, batch2)
+                    enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
+                                                np.abs(enc1 - enc2))))
+                # Same test as the former (ii*bs) % (20000*bs) == 0.
+                if ii % 20000 == 0:
+                    logging.info("PROGRESS (encoding): %.2f%%" %
+                                 (100 * ii / n_labels))
+            self.X[key] = np.vstack(enc_input)
+            self.y[key] = [dico_label[y] for y in mylabels]
+
+        config = {'nclasses': 3, 'seed': self.seed,
+                  'usepytorch': params.usepytorch,
+                  'cudaEfficient': True,
+                  'nhid': params.nhid, 'noreg': True}
+
+        # Copy so the epoch overrides do not leak into params.classifier.
+        config_classifier = copy.deepcopy(params.classifier)
+        config_classifier['max_epoch'] = 15
+        config_classifier['epoch_size'] = 1
+        config['classifier'] = config_classifier
+
+        clf = SplitClassifier(self.X, self.y, config)
+        devacc, testacc = clf.run()
+        logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
+                      .format(devacc, testacc))
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(self.data['valid'][0]),
+                'ntest': len(self.data['test'][0])}
@@ -0,0 +1,96 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+SST - binary classification
+'''
+
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import io
+import logging
+import numpy as np
+
+from senteval.tools.validation import SplitClassifier
+
+
+class SSTEval(object):
+    """SST sentiment task, binary (2 classes) or fine-grained (5 classes)."""
+
+    def __init__(self, task_path, nclasses=2, seed=1111):
+        """Load ``sentiment-train``, ``sentiment-dev`` and ``sentiment-test``."""
+        self.seed = seed
+
+        # binary or fine-grained
+        assert nclasses in [2, 5]
+        self.nclasses = nclasses
+        if self.nclasses == 2:
+            self.task_name = 'Binary'
+        else:
+            self.task_name = 'Fine-Grained'
+        logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)
+
+        self.sst_data = {
+            'train': self.loadFile(os.path.join(task_path, 'sentiment-train')),
+            'dev': self.loadFile(os.path.join(task_path, 'sentiment-dev')),
+            'test': self.loadFile(os.path.join(task_path, 'sentiment-test')),
+        }
+
+    def do_prepare(self, params, prepare):
+        """Call ``prepare`` with the tokenized sentences of all three splits."""
+        data = self.sst_data
+        samples = data['train']['X'] + data['dev']['X'] + data['test']['X']
+        return prepare(params, samples)
+
+    def loadFile(self, fpath):
+        """Parse one split file into ``{'X': token lists, 'y': int labels}``.
+
+        With 2 classes a line is ``sentence<TAB>label``; with 5 classes it is
+        ``label sentence``. Asserts that the largest label is nclasses - 1.
+        """
+        sents, labels = [], []
+        is_binary = self.nclasses == 2
+        is_fine = self.nclasses == 5
+        with io.open(fpath, 'r', encoding='utf-8') as f:
+            for line in f:
+                stripped = line.strip()
+                if is_binary:
+                    parts = stripped.split('\t')
+                    labels.append(int(parts[1]))
+                    sents.append(parts[0].split())
+                elif is_fine:
+                    parts = stripped.split(' ', 1)
+                    labels.append(int(parts[0]))
+                    sents.append(parts[1].split())
+        assert max(labels) == self.nclasses - 1
+        return {'X': sents, 'y': labels}
+
+    def run(self, params, batcher):
+        """Embed every split with ``batcher`` and run the SplitClassifier.
+
+        Each split of ``self.sst_data`` is re-ordered in place by sentence
+        length. Returns a dict with 'devacc', 'acc', 'ndev' and 'ntest'.
+        """
+        sst_embed = {'train': {}, 'dev': {}, 'test': {}}
+        bsize = params.batch_size
+
+        for key in self.sst_data:
+            logging.info('Computing embedding for {0}'.format(key))
+            split = self.sst_data[key]
+            # Sort to reduce padding
+            pairs = sorted(zip(split['X'], split['y']),
+                           key=lambda p: (len(p[0]), p[1]))
+            split['X'], split['y'] = map(list, zip(*pairs))
+
+            chunks = []
+            for start in range(0, len(split['y']), bsize):
+                chunks.append(batcher(params, split['X'][start:start + bsize]))
+            sst_embed[key]['X'] = np.vstack(chunks)
+            sst_embed[key]['y'] = np.array(split['y'])
+            logging.info('Computed {0} embeddings'.format(key))
+
+        config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
+                             'usepytorch': params.usepytorch,
+                             'classifier': params.classifier}
+
+        # SplitClassifier names the dev split 'valid'.
+        features = {'train': sst_embed['train']['X'],
+                    'valid': sst_embed['dev']['X'],
+                    'test': sst_embed['test']['X']}
+        targets = {'train': sst_embed['train']['y'],
+                   'valid': sst_embed['dev']['y'],
+                   'test': sst_embed['test']['y']}
+        clf = SplitClassifier(X=features, y=targets, config=config_classifier)
+
+        devacc, testacc = clf.run()
+        logging.debug('\nDev acc : {0} Test acc : {1} for \
+            SST {2} classification\n'.format(devacc, testacc, self.task_name))
+
+        return {'devacc': devacc, 'acc': testacc,
+                'ndev': len(sst_embed['dev']['X']),
+                'ntest': len(sst_embed['test']['X'])}
@@ -0,0 +1,265 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+'''
+STS-{2012,2013,2014,2015,2016} (unsupervised) and
+STS-benchmark (supervised) tasks
+'''
+
+from __future__ import absolute_import, division, unicode_literals
+
+import os
+import io
+import numpy as np
+import logging
+
+from scipy.stats import spearmanr, pearsonr
+
+from senteval.utils import cosine
+from senteval.sick import SICKEval
+
+
+class STSEval(object):
+    """Base class for the unsupervised STS tasks.
+
+    Subclasses are expected to set ``self.datasets`` and populate
+    ``self.data`` / ``self.samples`` (normally through ``loadFile``).
+    ``run`` scores every sentence pair with ``self.similarity`` and reports
+    Pearson and Spearman correlations against the gold scores.
+    """
+
+    def loadFile(self, fpath):
+        """Load sentence pairs and gold scores for every dataset.
+
+        For each name in ``self.datasets`` this reads
+        ``<fpath>/STS.input.<name>.txt`` (two tab-separated sentences per
+        line) and ``<fpath>/STS.gs.<name>.txt`` (one gold score per line).
+        Pairs whose gold-score line is empty are dropped.
+
+        Sets ``self.data[name] = (sent1, sent2, gs_scores)`` with tokenised
+        sentences, and ``self.samples`` to all sentences of all datasets.
+        """
+        self.data = {}
+        self.samples = []
+
+        for dataset in self.datasets:
+            # Context managers make sure both files are closed; reading via
+            # a bare io.open(...).read() would leak the handles.
+            with io.open(fpath + '/STS.input.%s.txt' % dataset,
+                         encoding='utf8') as f:
+                sent1, sent2 = zip(*[l.split("\t") for l in
+                                     f.read().splitlines()])
+            with io.open(fpath + '/STS.gs.%s.txt' % dataset,
+                         encoding='utf8') as f:
+                raw_scores = np.array(f.read().splitlines())
+            # boolean mask selecting the pairs that have a gold score
+            not_empty_idx = raw_scores != ''
+
+            gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
+            sent1 = np.array([s.split() for s in sent1], dtype=object)[not_empty_idx]
+            sent2 = np.array([s.split() for s in sent2], dtype=object)[not_empty_idx]
+
+            # sort data by length to minimize padding in batcher
+            sorted_data = sorted(zip(sent1, sent2, gs_scores),
+                                 key=lambda z: (len(z[0]), len(z[1]), z[2]))
+            sent1, sent2, gs_scores = map(list, zip(*sorted_data))
+
+            self.data[dataset] = (sent1, sent2, gs_scores)
+            self.samples += sent1 + sent2
+
+    def do_prepare(self, params, prepare):
+        """Pick the similarity function and call ``prepare`` on all samples.
+
+        Uses ``params.similarity`` when ``params`` contains a
+        ``'similarity'`` entry; otherwise falls back to cosine similarity
+        with NaNs replaced through ``np.nan_to_num`` on inputs and output.
+
+        Returns whatever ``prepare(params, self.samples)`` returns.
+        """
+        if 'similarity' in params:
+            self.similarity = params.similarity
+        else:  # Default similarity is cosine
+            self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
+        return prepare(params, self.samples)
+
+    def run(self, params, batcher):
+        """Score every dataset and return the correlation results.
+
+        ``batcher(params, batch)`` is called on batches of
+        ``params.batch_size`` sentences and must return an array with one
+        row per sentence (its ``shape[0]`` and row indexing are used).
+
+        Returns a dict with one entry per dataset holding the ``pearsonr``
+        and ``spearmanr`` results plus ``nsamples``, and an ``'all'`` entry
+        with the correlation over the pooled scores (``'all'``), the plain
+        mean over datasets (``'mean'``) and the mean weighted by dataset
+        size (``'wmean'``).
+        """
+        results = {}
+        all_sys_scores = []
+        all_gs_scores = []
+        for dataset in self.datasets:
+            sys_scores = []
+            input1, input2, gs_scores = self.data[dataset]
+            for ii in range(0, len(gs_scores), params.batch_size):
+                batch1 = input1[ii:ii + params.batch_size]
+                batch2 = input2[ii:ii + params.batch_size]
+
+                # we assume get_batch already throws out the faulty ones
+                if len(batch1) == len(batch2) and len(batch1) > 0:
+                    enc1 = batcher(params, batch1)
+                    enc2 = batcher(params, batch2)
+
+                    for kk in range(enc2.shape[0]):
+                        sys_score = self.similarity(enc1[kk], enc2[kk])
+                        sys_scores.append(sys_score)
+            all_sys_scores.extend(sys_scores)
+            all_gs_scores.extend(gs_scores)
+            results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
+                                'spearman': spearmanr(sys_scores, gs_scores),
+                                'nsamples': len(sys_scores)}
+            logging.debug('%s : pearson = %.4f, spearman = %.4f' %
+                          (dataset, results[dataset]['pearson'][0],
+                           results[dataset]['spearman'][0]))
+
+        # 'all' is added to results only after these per-dataset summaries
+        weights = [results[dset]['nsamples'] for dset in results.keys()]
+        list_prs = np.array([results[dset]['pearson'][0] for
+                            dset in results.keys()])
+        list_spr = np.array([results[dset]['spearman'][0] for
+                            dset in results.keys()])
+
+        avg_pearson = np.average(list_prs)
+        avg_spearman = np.average(list_spr)
+        wavg_pearson = np.average(list_prs, weights=weights)
+        wavg_spearman = np.average(list_spr, weights=weights)
+        all_pearson = pearsonr(all_sys_scores, all_gs_scores)
+        all_spearman = spearmanr(all_sys_scores, all_gs_scores)
+        results['all'] = {'pearson': {'all': all_pearson[0],
+                                      'mean': avg_pearson,
+                                      'wmean': wavg_pearson},
+                          'spearman': {'all': all_spearman[0],
+                                       'mean': avg_spearman,
+                                       'wmean': wavg_spearman}}
+        logging.debug('ALL : Pearson = %.4f, \
+            Spearman = %.4f' % (all_pearson[0], all_spearman[0]))
+        logging.debug('ALL (weighted average) : Pearson = %.4f, \
+            Spearman = %.4f' % (wavg_pearson, wavg_spearman))
+        logging.debug('ALL (average) : Pearson = %.4f, \
+            Spearman = %.4f\n' % (avg_pearson, avg_spearman))
+
+        return results
+
+
+class STS12Eval(STSEval):
+    """STS12 task: fixes the dataset names and loads them from ``taskpath``."""
+
+    def __init__(self, taskpath, seed=1111):
+        logging.debug('***** Transfer task : STS12 *****\n\n')
+        self.datasets = [
+            'MSRpar',
+            'MSRvid',
+            'SMTeuroparl',
+            'surprise.OnWN',
+            'surprise.SMTnews',
+        ]
+        self.seed = seed
+        self.loadFile(taskpath)
+
+
+class STS13Eval(STSEval):
+    """STS13 task: fixes the dataset names and loads them from ``taskpath``.
+
+    The "SMT" subtask is not part of this list because of a LICENSE issue.
+    """
+
+    def __init__(self, taskpath, seed=1111):
+        logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n')
+        self.datasets = [
+            'FNWN',
+            'headlines',
+            'OnWN',
+        ]
+        self.seed = seed
+        self.loadFile(taskpath)
+
+
+class STS14Eval(STSEval):
+    """STS14 task: fixes the dataset names and loads them from ``taskpath``."""
+
+    def __init__(self, taskpath, seed=1111):
+        logging.debug('***** Transfer task : STS14 *****\n\n')
+        self.datasets = [
+            'deft-forum',
+            'deft-news',
+            'headlines',
+            'images',
+            'OnWN',
+            'tweet-news',
+        ]
+        self.seed = seed
+        self.loadFile(taskpath)
+
+
+class STS15Eval(STSEval):
+    """STS15 task: fixes the dataset names and loads them from ``taskpath``."""
+
+    def __init__(self, taskpath, seed=1111):
+        logging.debug('***** Transfer task : STS15 *****\n\n')
+        self.datasets = [
+            'answers-forums',
+            'answers-students',
+            'belief',
+            'headlines',
+            'images',
+        ]
+        self.seed = seed
+        self.loadFile(taskpath)
+
+
+class STS16Eval(STSEval):
+    """STS16 task: fixes the dataset names and loads them from ``taskpath``."""
+
+    def __init__(self, taskpath, seed=1111):
+        logging.debug('***** Transfer task : STS16 *****\n\n')
+        self.datasets = [
+            'answer-answer',
+            'headlines',
+            'plagiarism',
+            'postediting',
+            'question-question',
+        ]
+        self.seed = seed
+        self.loadFile(taskpath)
+
+
+class STSBenchmarkEval(STSEval):
+    """STS-Benchmark scored on the ``sts-test.csv`` split only."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
+        self.seed = seed
+        # loadFile appends to this list, so it has to exist beforehand
+        self.samples = []
+        # Only the test split is loaded; train and dev are not used here.
+        test_split = self.loadFile(os.path.join(task_path, 'sts-test.csv'))
+        self.datasets = ['test']
+        self.data = {'test': test_split}
+
+    def loadFile(self, fpath):
+        """Read one tab-separated STS-Benchmark file.
+
+        Columns 5 and 6 hold the two sentences (whitespace-tokenised here)
+        and column 4 the gold score. Both sentence lists are appended to
+        ``self.samples``.
+
+        Returns the tuple ``(sentences_a, sentences_b, scores)``.
+        """
+        first, second, gold = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as handle:
+            for row in handle:
+                fields = row.strip().split('\t')
+                first.append(fields[5].split())
+                second.append(fields[6].split())
+                gold.append(fields[4])
+
+        # scores are converted only after the whole file has been read
+        gold = [float(value) for value in gold]
+        self.samples += first + second
+        return (first, second, gold)
+
+
+class STSBenchmarkEvalDev(STSEval):
+    """STS-Benchmark scored on the ``sts-dev.csv`` split only."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
+        self.seed = seed
+        # loadFile appends to this list, so it has to exist beforehand
+        self.samples = []
+        # Only the dev split is loaded; train and test are not used here.
+        dev_split = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))
+        self.datasets = ['dev']
+        self.data = {'dev': dev_split}
+
+    def loadFile(self, fpath):
+        """Read one tab-separated STS-Benchmark file.
+
+        Columns 5 and 6 hold the two sentences (whitespace-tokenised here)
+        and column 4 the gold score. Both sentence lists are appended to
+        ``self.samples``.
+
+        Returns the tuple ``(sentences_a, sentences_b, scores)``.
+        """
+        first, second, gold = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as handle:
+            for row in handle:
+                fields = row.strip().split('\t')
+                first.append(fields[5].split())
+                second.append(fields[6].split())
+                gold.append(fields[4])
+
+        # scores are converted only after the whole file has been read
+        gold = [float(value) for value in gold]
+        self.samples += first + second
+        return (first, second, gold)
+
+
+class STSBenchmarkFinetune(SICKEval):
+    """STS-Benchmark with train, dev and test splits stored in ``sick_data``."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
+        self.seed = seed
+        train_split = self.loadFile(os.path.join(task_path, 'sts-train.csv'))
+        dev_split = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))
+        test_split = self.loadFile(os.path.join(task_path, 'sts-test.csv'))
+        self.sick_data = {
+            'train': train_split,
+            'dev': dev_split,
+            'test': test_split,
+        }
+
+    def loadFile(self, fpath):
+        """Read one tab-separated STS-Benchmark file.
+
+        Columns 5 and 6 hold the two sentences (whitespace-tokenised here)
+        and column 4 the gold score.
+
+        Returns a dict with keys ``'X_A'``, ``'X_B'`` and ``'y'``.
+        """
+        first, second, gold = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as handle:
+            for row in handle:
+                fields = row.strip().split('\t')
+                first.append(fields[5].split())
+                second.append(fields[6].split())
+                gold.append(fields[4])
+
+        # scores are converted only after the whole file has been read
+        return {'X_A': first, 'X_B': second,
+                'y': [float(value) for value in gold]}
+        
+class SICKRelatednessEval(STSEval):
+    """SICK relatedness scored on ``SICK_test_annotated.txt`` only."""
+
+    def __init__(self, task_path, seed=1111):
+        logging.debug('\n\n***** Transfer task : SICKRelatedness*****\n\n')
+        self.seed = seed
+        # loadFile appends to this list, so it has to exist beforehand
+        self.samples = []
+        # Only the annotated test split is loaded; train and trial are unused.
+        test_split = self.loadFile(
+            os.path.join(task_path, 'SICK_test_annotated.txt'))
+        self.datasets = ['test']
+        self.data = {'test': test_split}
+
+    def loadFile(self, fpath):
+        """Read one tab-separated SICK file, ignoring its first line.
+
+        Columns 1 and 2 hold the two sentences (whitespace-tokenised here)
+        and column 3 the gold score. Both sentence lists are appended to
+        ``self.samples``.
+
+        Returns the tuple ``(sentences_a, sentences_b, scores)``.
+        """
+        first, second, gold = [], [], []
+        with io.open(fpath, 'r', encoding='utf-8') as handle:
+            for lineno, row in enumerate(handle):
+                if lineno == 0:
+                    # the first line is skipped, never parsed
+                    continue
+                fields = row.strip().split('\t')
+                first.append(fields[1].split())
+                second.append(fields[2].split())
+                gold.append(fields[3])
+
+        # scores are converted only after the whole file has been read
+        gold = [float(value) for value in gold]
+        self.samples += first + second
+        return (first, second, gold)
@@ -0,0 +1,202 @@
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+"""
+Pytorch Classifier class in the style of scikit-learn
+Classifiers include Logistic Regression and MLP
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+import numpy as np
+import copy
+from senteval import utils
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+
+class PyTorchClassifier(object):
+    """Base class for PyTorch classifiers with a scikit-learn style API.
+
+    The methods below read ``self.model``, ``self.loss_fn``,
+    ``self.optimizer``, ``self.max_epoch``, ``self.epoch_size`` and
+    ``self.tenacity`` but never set them; a subclass has to define them.
+    """
+
+    def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,
+                 cudaEfficient=False):
+        """Seed the NumPy and torch RNGs and store the hyper-parameters.
+
+        When ``cudaEfficient`` is true the data tensors are kept on the CPU
+        and only the current batch is moved to the GPU.
+        """
+        # fix seed
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+
+        self.inputdim = inputdim
+        self.nclasses = nclasses
+        self.l2reg = l2reg
+        self.batch_size = batch_size
+        self.cudaEfficient = cudaEfficient
+
+    def prepare_split(self, X, y, validation_data=None, validation_split=None):
+        """Build the train/dev tensors used by ``fit``.
+
+        With ``validation_data`` (a ``(devX, devy)`` pair) all of ``X``/``y``
+        is used for training. Otherwise a random fraction
+        ``validation_split`` of ``X``/``y`` is held out as dev data.
+
+        Returns ``(trainX, trainy, devX, devy)`` as float32 / int64 tensors,
+        placed on the CPU when ``cudaEfficient`` is set and on CUDA otherwise.
+        """
+        # Preparing validation data
+        assert validation_split or validation_data
+        if validation_data is not None:
+            trainX, trainy = X, y
+            devX, devy = validation_data
+        else:
+            permutation = np.random.permutation(len(X))
+            trainidx = permutation[int(validation_split * len(X)):]
+            devidx = permutation[0:int(validation_split * len(X))]
+            trainX, trainy = X[trainidx], y[trainidx]
+            devX, devy = X[devidx], y[devidx]
+
+        device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')
+
+        trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)
+        trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)
+        devX = torch.from_numpy(devX).to(device, dtype=torch.float32)
+        devy = torch.from_numpy(devy).to(device, dtype=torch.int64)
+
+        return trainX, trainy, devX, devy
+
+    def fit(self, X, y, validation_data=None, validation_split=None,
+            early_stop=True):
+        """Train the model and keep the weights with the best dev accuracy.
+
+        Training runs in rounds of ``self.epoch_size`` epochs while
+        ``self.nepoch <= self.max_epoch``. With ``early_stop`` it stops on
+        the round where the number of earlier non-improving rounds has
+        already reached ``self.tenacity`` (the counter is never reset).
+
+        Returns the best dev accuracy; ``self.model`` is replaced by a deep
+        copy of the model taken at that point.
+        """
+        self.nepoch = 0
+        bestaccuracy = -1
+        stop_train = False
+        early_stop_count = 0
+        # Fall back to the current model so the assignment after the loop
+        # cannot hit an unbound name when the loop body never runs.
+        bestmodel = self.model
+
+        # Preparing validation data
+        trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,
+                                                        validation_split)
+
+        # Training
+        while not stop_train and self.nepoch <= self.max_epoch:
+            self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)
+            accuracy = self.score(devX, devy)
+            if accuracy > bestaccuracy:
+                bestaccuracy = accuracy
+                bestmodel = copy.deepcopy(self.model)
+            elif early_stop:
+                if early_stop_count >= self.tenacity:
+                    stop_train = True
+                early_stop_count += 1
+        self.model = bestmodel
+        return bestaccuracy
+
+    def trainepoch(self, X, y, epoch_size=1):
+        """Run ``epoch_size`` passes over ``X``/``y`` in shuffled mini-batches.
+
+        Increases ``self.nepoch`` by ``epoch_size`` when done.
+        """
+        self.model.train()
+        for _ in range(self.nepoch, self.nepoch + epoch_size):
+            permutation = np.random.permutation(len(X))
+            all_costs = []
+            for i in range(0, len(X), self.batch_size):
+                # forward
+                idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)
+
+                Xbatch = X[idx]
+                ybatch = y[idx]
+
+                # in cudaEfficient mode only the batch is moved to the GPU
+                if self.cudaEfficient:
+                    Xbatch = Xbatch.cuda()
+                    ybatch = ybatch.cuda()
+                output = self.model(Xbatch)
+                # loss
+                loss = self.loss_fn(output, ybatch)
+                all_costs.append(loss.data.item())
+                # backward
+                self.optimizer.zero_grad()
+                loss.backward()
+                # Update parameters
+                self.optimizer.step()
+        self.nepoch += epoch_size
+
+    def score(self, devX, devy):
+        """Return the accuracy of the model on ``devX`` / ``devy``."""
+        self.model.eval()
+        correct = 0
+        if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:
+            devX = torch.FloatTensor(devX).cuda()
+            devy = torch.LongTensor(devy).cuda()
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                ybatch = devy[i:i + self.batch_size]
+                if self.cudaEfficient:
+                    Xbatch = Xbatch.cuda()
+                    ybatch = ybatch.cuda()
+                output = self.model(Xbatch)
+                # index of the largest output along dim 1 is the prediction
+                pred = output.data.max(1)[1]
+                correct += pred.long().eq(ybatch.data.long()).sum().item()
+            accuracy = 1.0 * correct / len(devX)
+        return accuracy
+
+    def predict(self, devX):
+        """Return the predicted class indices as an ``(n, 1)`` array."""
+        self.model.eval()
+        if not isinstance(devX, torch.cuda.FloatTensor):
+            devX = torch.FloatTensor(devX).cuda()
+        yhat = np.array([])
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size]
+                output = self.model(Xbatch)
+                yhat = np.append(yhat,
+                                 output.data.max(1)[1].cpu().numpy())
+        # stacking the flat array of scalars yields one row per prediction
+        yhat = np.vstack(yhat)
+        return yhat
+
+    def predict_proba(self, devX):
+        """Return class probabilities for ``devX``, one row per input.
+
+        ``devX`` may be a tensor or anything ``torch.as_tensor`` accepts;
+        each batch is moved to the device of the model parameters. The
+        softmax is taken over dim 1, the same axis ``score`` and ``predict``
+        use for the class outputs.
+
+        Returns a NumPy array, or an empty list when ``devX`` is empty.
+        """
+        self.model.eval()
+        if not isinstance(devX, torch.Tensor):
+            devX = torch.as_tensor(devX, dtype=torch.float32)
+        first_param = next(self.model.parameters(), None)
+        device = first_param.device if first_param is not None else devX.device
+
+        probas = []
+        with torch.no_grad():
+            for i in range(0, len(devX), self.batch_size):
+                Xbatch = devX[i:i + self.batch_size].to(device)
+                # softmax needs the tensor; convert to NumPy only afterwards
+                vals = F.softmax(self.model(Xbatch), dim=1)
+                probas.append(vals.cpu().numpy())
+        if not probas:
+            return probas
+        # np.concatenate takes the arrays as a single sequence argument
+        return np.concatenate(probas, axis=0)
+
+
+"""
+MLP with Pytorch (nhid=0 --> Logistic Regression)
+"""
+
+class MLP(PyTorchClassifier):
+    """MLP classifier; with ``nhid == 0`` it is a single linear layer.
+
+    PARAMETERS (entries of ``params``, each optional):
+    -nhid:       number of hidden units (0: Logistic Regression)
+    -optim:      optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
+    -tenacity:   how many times dev acc does not increase before stopping
+    -epoch_size: each epoch corresponds to epoch_size pass on the train set
+    -max_epoch:  max number of epoches
+    -dropout:    dropout for MLP
+    -batch_size: mini-batch size (defaults to the ``batch_size`` argument)
+    """
+
+    def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
+                 seed=1111, cudaEfficient=False):
+        """Build the model, loss and optimizer on the GPU.
+
+        ``params`` only needs to support ``in`` and ``[]`` lookups.
+        """
+        # Name the class explicitly: super(self.__class__, self) recurses
+        # forever as soon as MLP itself is subclassed.
+        super(MLP, self).__init__(inputdim, nclasses, l2reg,
+                                  batch_size, seed, cudaEfficient)
+
+        self.nhid = 0 if "nhid" not in params else params["nhid"]
+        self.optim = "adam" if "optim" not in params else params["optim"]
+        self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
+        self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
+        self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
+        self.dropout = 0. if "dropout" not in params else params["dropout"]
+        # Fall back to the constructor argument (default 64) rather than a
+        # hard-coded 64, so an explicitly passed batch_size is honoured.
+        self.batch_size = batch_size if "batch_size" not in params else params["batch_size"]
+
+        # Use the resolved self.nhid so a missing "nhid" key means 0 instead
+        # of raising KeyError.
+        if self.nhid == 0:
+            self.model = nn.Sequential(
+                nn.Linear(self.inputdim, self.nclasses),
+            ).cuda()
+        else:
+            self.model = nn.Sequential(
+                nn.Linear(self.inputdim, self.nhid),
+                nn.Dropout(p=self.dropout),
+                nn.Sigmoid(),
+                nn.Linear(self.nhid, self.nclasses),
+            ).cuda()
+
+        self.loss_fn = nn.CrossEntropyLoss().cuda()
+        # NOTE(review): size_average is a legacy loss attribute; whether
+        # setting it still has any effect depends on the torch version.
+        self.loss_fn.size_average = False
+
+        optim_fn, optim_params = utils.get_optimizer(self.optim)
+        self.optimizer = optim_fn(self.model.parameters(), **optim_params)
+        # apply the L2 regularisation strength as the optimizer weight decay
+        self.optimizer.param_groups[0]['weight_decay'] = self.l2reg
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+wget --no-check-certificate https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar`
	`2`	`+tar xvf senteval.tar`