diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f8633b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,116 @@
+# LaTeX temporary files
+*.aux
+*.log
+*.toc
+
+# PDF output - usually a bad idea to keep this in Git
+*.pdf
+
+# Latexmk
+*.fdb_latexmk
+
+# SyncTeX
+*.synctex.gz
+
+# LaTeX Beamer
+*.snm
+*.vrb
+*.nav
+*.out
+
+# BibTeX
+*.bbl
+*.blg
+
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..33da967
--- /dev/null
+++ b/README.md
@@ -0,0 +1,195 @@
+# Learning to select data for transfer learning with Bayesian Optimization
+
+Sebastian Ruder, Barbara Plank (2017). Learning to select data for transfer 
+learning with Bayesian Optimization. _In Proceedings of the 2017 Conference 
+on Empirical Methods in Natural Language Processing_, Copenhagen, Denmark.
+
+## Requirements
+
+### RoBO
+
+The Robust Bayesian Optimization framework [RoBO](http://automl.github.io/RoBO/) needs to be installed.
+It can be installed using the following steps:
+
+1. First, install `libeigen3-dev` as a prerequisite:
+`sudo apt-get install libeigen3-dev` (*)
+2. Then, clone the RoBO repository: 
+`git clone https://github.com/automl/RoBO.git`
+3. Change into the directory: `cd RoBO/`
+4. Install RoBO's requirements:
+`for req in $(cat all_requirements.txt); do pip install $req; done`
+5. Finally, install RoBO:
+`python setup.py install`
+
+For the topic models, `gensim` needs to be installed:
+`pip install gensim`
+
+### DyNet
+
+We use the neural network library [DyNet](http://dynet.readthedocs.io/en/latest/index.html),
+which works well with networks that have dynamic structures. DyNet can be 
+installed by following the instructions [here](http://dynet.readthedocs.io/en/latest/python.html#manual-installation).
+
+## Repository structure
+
+- `bilstm_tagger`: The repository containing code for the Bi-LSTM tagger from 
+Plank et al. (2016).
+- `bist_parser`: The repository containing the code for the BIST parser from 
+Kiperwasser and Goldberg (2016).
+- `bayes_opt.py`: The main logic for running Bayesian Optimization.
+- `constants.py`: Constants that are shared across all files.
+- `data_utils.py`: Utility methods for data reading and processing.
+- `similarity.py`: Methods for measuring domain similarity.
+- `simpletagger.py`: Code for running the Structured Perceptron POS tagger.
+- `task_utils.py`: Utility methods for training and evaluation.
+
+## Instructions
+
+### Running Bayesian Optimization
+
+The main logic for running Bayesian Optimization can be found in `bayes_opt.py`.
+The features that are currently used are defined in `constants.py` as
+`FEATURE_SETS` and are split into diversity and similarity features.
+Bayesian Optimization minimizes the validation error on the specified dataset.
+
+### Example usage
+
+```
+python bayes_opt.py --dynet-autobatch 1 -d data/gweb_sancl -m models/model \
+                    -t emails newsgroups reviews weblogs wsj --task pos \
+                    -b random most-similar-examples \
+                    --parser-output-path parser_outputs \
+                    --perl-script-path bist_parser/bmstparser/src/util_scripts/eval.pl \
+                    -f similarity --z-norm --num-iterations 100 \
+                    --num-runs 1 --log-file logs/log
+```
+
+- `--dynet-autobatch 1`: use DyNet auto-batching
+- `-d data/gweb_sancl`: use the data from the SANCL 2012 shared task
+- `-m models/model`: specify the directory where the model should be saved
+- `-t emails newsgroups reviews weblogs wsj`: adapt to the specified target 
+domains in the order they were provided
+- `--task pos`: perform POS tagging with the Structured Perceptron model
+- `-b`: use the random and most-similar-examples baselines
+- `--parser-output-path`, `--perl-script-path`: only required when performing 
+parsing
+- `-f`: use only similarity features with Bayesian Optimization
+- `--z-norm`: perform z-normalisation (recommended)
+- `--num-iterations`: perform 100 iterations of Bayesian Optimization
+- `--num-runs`: perform one run of Bayesian Optimization per target domain
+- `--log-file`: log the results of the baselines and Bayesian Optimization to
+ this file
+
+### Adding a new task
+
+In order to add a new task, you need to do several things:
+- Add the new task to `TASKS`, `TASK2TRAIN_EXAMPLES`, and `TASK2DOMAINS` in 
+`constants.py`.
+- Add a method to read data for the task to `data_utils.py` and add the 
+mapping to `data_utils.task2read_data_func`.
+- Add a method to train and evaluate the task to `task_utils.py` and add the 
+mapping to `task_utils.task2train_and_evaluate_func`.
+- Add the function that should be minimized to `bayes_opt.py` and add the 
+mapping to `task2_objective_function`. The function should take
+as input the feature weights and output the error.
+
+### Adding new features
+
+New feature sets or features can be added by adding them to `constants.py`.
+Similarity features or new representations can be added to 
+`similarity.py`. Diversity features or any other features can be added to
+`features.py`. All new features must be added to 
+`get_feature_representations` and `get_feature_names` in `features.py`.
+
+
+
+## Data
+
+### Multi-Domain Sentiment Dataset
+
+The Amazon Reviews Multi-Domain Sentiment Dataset (Blitzer et al., 2007)
+used in the current Bayesian Optimization experiment can be downloaded
+using the following steps:
+1. Create a new `amazon-reviews` directory:
+`mkdir amazon-reviews`
+2. Change into the directory:
+`cd amazon-reviews`
+3. Download the dataset:
+`wget https://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz`
+4. Extract the dataset:
+`tar -xvf processed_acl.tar.gz`
+
+In `bayes_opt.py`, the `data-path` argument should now be pointed to
+the `amazon-reviews` directory.
+
+### Multi-domain POS and parsing data
+
+We use the data from the [SANCL 2012 shared task/English Web Treebank](https://catalog.ldc.upenn.edu/ldc2012t13).
+
+### Word embedding data
+
+Pre-trained word embeddings can be downloaded from [here](http://nlp.stanford.edu/projects/glove/).
+We are using GloVe embeddings in the paper, but other pre-trained embeddings are also possible.
+Smaller embedding files can be used for faster iteration.
+
+
+## Models
+
+### BIST parser
+
+We use the BIST parser from Kiperwasser and Goldberg (2016) for our experiments. The parser repo can be found
+[here](https://github.com/elikip/bist-parser) and was integrated using [`git submodule`](http://stackoverflow.com/questions/2140985/how-to-set-up-a-git-project-to-use-an-external-repo-submodule).
+
+For running the parser with Bayesian Optimization, two additional hyperparameters are necessary:
+- `--perl-script-path`: This is the location of the `perl` script that is used to evaluate the parser's predictions.
+                        The script is located in `bist_parser/bmstparser/src/util_scripts/eval.pl` by default.
+- `--parser-output-path`: This is the location of the folder where the parser's predictions and the output of the
+                          `perl` script will be written to.
+
+By default, Labeled Attachment Score on the held-out validation set is used to evaluate the parser's performance and
+evaluation results are saved to subfolders of `parser-output-path` that indicate the target domain and feature sets
+used. Another subsubfolder is created for the best weights configuration so that Labeled Attachment Score, Unlabeled
+Attachment Score and Accuracy as well as other statistics are available for the final test set evaluation.
+
+### Bi-LSTM tagger
+
+The Bi-LSTM tagger we are using is a simplified, single-task version of the
+hierarchical Multi-task Bi-LSTM tagger used by Plank et al. (2016). The source
+repository of the tagger can be found [here](https://github.com/bplank/bilstm-aux/).
+
+## (*) Installing Eigen without sudo rights
+
+In case you do not have sudo rights to run `sudo apt-get install
+libeigen3-dev` here is a workaround.
+
+Create a folder where you download the sources of libeigen3-dev:
+
+```
+mkdir -p tools/eigen3
+cd tools/eigen3
+apt-get source libeigen3-dev
+```
+
+Afterwards point the required packages for `RoBO` to the folder just created: `tools/eigen3/eigen3-3.2.0`
+
+For instance, to install the 'george' requirement of `RoBO`, add the `--global-option` parameters pointing to the eigen directory:
+
+```
+pip install git+https://github.com/sfalkner/george.git --global-option=build_ext --global-option=-I/path/to/tools/eigen3/eigen3-3.2.0
+```
+
+(see http://dan.iel.fm/george/current/user/quickstart/#installation -> if you have Eigen in a strange place)
+
+
+## Reference
+
+If you make use of the contents of this repository, we appreciate citing the following paper:
+```
+@inproceedings{ruder2017select,
+  title={{Learning to select data for transfer learning with Bayesian Optimization}},
+  author={Ruder, Sebastian and Plank, Barbara},
+  booktitle={Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
+  year={2017}
+}
+```
+
diff --git a/bayes_opt.py b/bayes_opt.py
new file mode 100644
index 0000000..5714a07
--- /dev/null
+++ b/bayes_opt.py
@@ -0,0 +1,459 @@
+"""
+Run Bayesian optimization to learn to select data for transfer learning.
+
+Uses Python 3.5.
+"""
+
+import os
+import argparse
+import logging
+import pickle
+import copy
+
+import numpy as np
+from scipy import stats
+from sklearn.cross_validation import train_test_split
+
+from robo.fmin import bayesian_optimization
+
+import task_utils
+import data_utils
+import similarity
+import features
+from constants import FEATURE_SETS, SENTIMENT, POS, POS_BILSTM, PARSING,\
+    TASK2TRAIN_EXAMPLES, TASK2DOMAINS, TASKS, POS_PARSING_TRG_DOMAINS,\
+    SENTIMENT_TRG_DOMAINS, BASELINES, BAYES_OPT, RANDOM, MOST_SIMILAR_DOMAIN,\
+    MOST_SIMILAR_EXAMPLES, ALL_SOURCE_DATA
+
+from bist_parser.bmstparser.src.utils import ConllEntry
+
+
+def task2_objective_function(task):
+    """Return the objective function that is minimized for ``task``."""
+    candidates = ((SENTIMENT, objective_function_sentiment),
+                  (POS, objective_function_pos),
+                  (POS_BILSTM, objective_function_pos_bilstm),
+                  (PARSING, objective_function_parsing))
+    for known_task, objective in candidates:
+        if task == known_task:
+            return objective
+    # no entry matched: there is no objective function for this task
+    raise ValueError('No objective function implemented for %s.' % task)
+
+
+def objective_function_sentiment(feature_weights):
+    """
+    Compute the sentiment validation error for one set of feature weights.
+
+    Relies on the module-level ``feature_values``, ``X_train``/``y_train``,
+    ``X_val``/``y_val`` and ``X_test``/``y_test`` set by the main script.
+    :param feature_weights: the candidate weights of the selection features
+    :return: 1 - validation accuracy; the quantity that is minimized
+    """
+    num_examples = TASK2TRAIN_EXAMPLES[SENTIMENT]
+    selected_docs, selected_labels = task_utils.get_data_subsets(
+        feature_values, feature_weights, X_train, y_train, SENTIMENT,
+        num_examples)
+    # the test split is passed along as well, but the second return value is
+    # discarded so that the validation accuracy alone drives the optimization
+    accuracies = task_utils.train_and_evaluate_sentiment(
+        selected_docs, selected_labels, X_val, y_val, X_test, y_test)
+    val_accuracy, _ = accuracies
+    return 1 - float(val_accuracy)
+
+
+def objective_function_pos(feature_weights):
+    """
+    Compute the POS tagging validation error for one set of feature weights.
+
+    Relies on the module-level ``feature_values``, ``X_train``/``y_train`` and
+    ``X_val``/``y_val`` that the main script sets for the target domain.
+    :param feature_weights: the candidate weights of the selection features
+    :return: 1 - validation accuracy; the quantity that is minimized
+    """
+    num_examples = TASK2TRAIN_EXAMPLES[POS]
+    selected_sents, selected_tags = task_utils.get_data_subsets(
+        feature_values, feature_weights, X_train, y_train, POS,
+        num_examples)
+    # only the validation split is passed here; the second return value is
+    # discarded so that the validation accuracy alone drives the optimization
+    accuracies = task_utils.train_and_evaluate_pos(
+        selected_sents, selected_tags, X_val, y_val)
+    val_accuracy, _ = accuracies
+    return 1 - float(val_accuracy)
+
+
+def objective_function_pos_bilstm(feature_weights):
+    """
+    Compute the Bi-LSTM tagger validation error for one set of feature weights.
+
+    Relies on the module-level ``feature_values``, ``X_train``/``y_train`` and
+    ``X_val``/``y_val`` that the main script sets for the target domain.
+    :param feature_weights: the candidate weights of the selection features
+    :return: 1 - validation accuracy; the quantity that is minimized
+    """
+    num_examples = TASK2TRAIN_EXAMPLES[POS_BILSTM]
+    selected_sents, selected_tags = task_utils.get_data_subsets(
+        feature_values, feature_weights, X_train, y_train, POS_BILSTM,
+        num_examples)
+    # only the validation split is passed here; the second return value is
+    # discarded so that the validation accuracy alone drives the optimization
+    accuracies = task_utils.train_and_evaluate_pos_bilstm(
+        selected_sents, selected_tags, X_val, y_val)
+    val_accuracy, _ = accuracies
+    return 1 - float(val_accuracy)
+
+
+def objective_function_parsing(feature_weights):
+    """
+    Compute the parsing validation error for one set of feature weights.
+    Reads the data splits, ``feature_values``, ``parser_output_path`` and
+    ``perl_script_path`` from module-level variables set by the main script.
+    :param feature_weights: the candidate weights of the selection features
+    :return: 100 - validation score (presumably LAS in percent; TODO confirm)
+    """
+    selected_sents, selected_labels = task_utils.get_data_subsets(
+        feature_values, feature_weights, X_train, y_train, PARSING,
+        TASK2TRAIN_EXAMPLES[PARSING])
+    val_score, _ = task_utils.train_and_evaluate_parsing(
+        selected_sents, selected_labels, X_val, y_val,
+        parser_output_path=parser_output_path,
+        perl_script_path=perl_script_path)
+    return 100 - float(val_score)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Learn to select data using Bayesian Optimization.',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    # dynet parameters (presumably read by DyNet itself from the command line)
+    parser.add_argument('--dynet-autobatch', type=int,
+                        help='use auto-batching (1) (should be first argument)')
+    parser.add_argument('--dynet-mem', default=5000, help='the memory used',
+                        type=int)  # Note: needs to be given to the script!
+    parser.add_argument('--dynet-seed', default=1512141834, type=int,
+                        help='the dynet seed')  # Note: needs to still be given!
+
+    # domain and data paths
+    parser.add_argument('-d', '--data-path', required=True,
+                        help='the path to the directory containing the '
+                             'processed_acl or gweb_sancl directory')
+    parser.add_argument('-m', '--model-dir', required=True,
+                        help='the directory where the model should be saved')
+    parser.add_argument('-t', '--trg-domains', nargs='+', required=True,
+                        choices=POS_PARSING_TRG_DOMAINS + SENTIMENT_TRG_DOMAINS,
+                        help='the domains to which to adapt')
+    parser.add_argument('--task', choices=TASKS, required=True,
+                        help='the task which to optimize')
+    parser.add_argument('-b', '--baselines', nargs='+', choices=BASELINES,
+                        default=[RANDOM],
+                        help='the baselines that should be compared against')
+    parser.add_argument('-o', '--parser-output-path',
+                        default='outputs', help='the output path of the parser')
+    parser.add_argument('-p', '--perl-script-path', help='perl script path',
+                        default='bist_parser/bmstparser/src/util_scripts/eval'
+                                '.pl')
+
+    # feature parameters
+    parser.add_argument('-f', '--feature-sets', nargs='+', default=['similarity'],
+                        choices=FEATURE_SETS,
+                        help='which feature sets (similarity, topic_similarity,'
+                             ' word_embedding_similarity, diversity) '
+                             'to use; default: similarity')
+    parser.add_argument('--z-norm', action='store_true',
+                        help='use z-normalisation')  # important to specify
+    parser.add_argument('--feature-weights-file',
+                        help='a file containing learned feature weights to be'
+                             ' used for cross-domain experiments')
+
+    # word embedding parameters
+    parser.add_argument('-wv', '--word2vec-path', help='the path to the word'
+                                                       ' vector file')
+    parser.add_argument('-vs', '--vector-size', type=int, default=300,
+                        help='the size of the word vectors')
+    parser.add_argument('--header', action='store_true',
+                        help='whether the word embeddings file contains header;'
+                        ' GloVe embeddings used in the paper have no header')
+
+    # processing parameters
+    parser.add_argument('-v', '--max-vocab-size', default=10000, type=int,
+                        help='the maximum size of the vocabulary')
+
+    # training parameters
+    parser.add_argument('--num-iterations', default=100, type=int)
+    parser.add_argument('--logging', action='store_true', help='perform logging')
+    parser.add_argument('--num-runs', type=int, default=1,
+                        help='the number of experiment runs for each domain')
+    parser.add_argument('--log-file', required=True,
+                        help='the path to which validation and test accuracies'
+                             ' should be logged')
+
+    args = parser.parse_args()
+
+    # enable INFO logging to see the output of LDA training and of BayesOpt
+    if args.logging:
+        logging.basicConfig(level=logging.INFO)
+
+    assert os.path.exists(args.data_path), ('Error: %s does not exist.' %
+                                            args.data_path)
+    assert not args.word2vec_path or os.path.exists(args.word2vec_path), \
+        'Error: %s does not exist.' % args.word2vec_path
+
+    # create the model directory if it does not exist
+    if not os.path.exists(args.model_dir):
+        print('Creating %s...' % args.model_dir)
+        os.makedirs(args.model_dir)
+
+    # perl script path and parser output path are only required for parsing
+    perl_script_path = None
+    if args.task == PARSING:
+        assert args.parser_output_path is not None
+        assert args.perl_script_path is not None
+        if not os.path.exists(args.parser_output_path):
+            print('Creating output path %s.' % args.parser_output_path)
+            os.makedirs(args.parser_output_path)
+        assert os.path.exists(args.perl_script_path)
+        perl_script_path = args.perl_script_path
+
+    # get the task-specific methods and hyper-parameters
+    num_train_examples = TASK2TRAIN_EXAMPLES[args.task]
+    task_trg_domains = TASK2DOMAINS[args.task]
+    read_data = data_utils.task2read_data_func(args.task)
+    train_and_evaluate = task_utils.task2train_and_evaluate_func(args.task)
+    objective_function = task2_objective_function(args.task)
+
+    # get the names of the individual features in the feature sets
+    assert args.word2vec_path or 'diversity' not in args.feature_sets,\
+        'Error: Word2vec path is required for quadratic entropy in ' \
+        'diversity-based features.'
+    feature_names = features.get_feature_names(args.feature_sets)
+
+    if args.feature_weights_file:
+        print('Training model with pre-learned feature weights rather than '
+              'learning new ones...')
+        assert os.path.exists(args.feature_weights_file),\
+            'Error: %s does not exist.' % args.feature_weights_file
+
+    # read the data and pickle it, or load the pickle of a previous run
+    preproc_data_path = os.path.join(args.model_dir,
+                                     'preproc_data_%s.pkl' % args.task)
+    if not os.path.exists(preproc_data_path):
+        domain2data = read_data(args.data_path)
+        print('Saving domain2data object to %s...' % preproc_data_path)
+        with open(preproc_data_path, 'wb') as f:
+            pickle.dump(domain2data, f)
+    else:
+        print('Loading domain2data object from %s...' % preproc_data_path)
+        with open(preproc_data_path, 'rb') as f:
+            domain2data = pickle.load(f)
+    assert set(task_trg_domains) == set(domain2data.keys())
+
+    # create the vocabulary or load it if it was already created
+    vocab_path = os.path.join(args.model_dir, 'vocab.txt')
+    vocab = data_utils.Vocab(args.max_vocab_size, vocab_path)
+    if not os.path.exists(vocab_path):
+        # retrieve all available tokenised sentences
+        tokenised_sentences = data_utils.get_all_docs(
+            domain2data.items(), unlabeled=True)[0]
+        if args.task == PARSING:
+            # get the word form from every ConllEntry
+            tokenised_sentences = [[token.form if isinstance(token, ConllEntry)
+                                    else token for token in tokens]
+                                   for tokens in tokenised_sentences]
+        vocab.create(tokenised_sentences)
+        del tokenised_sentences
+    else:
+        vocab.load()
+
+    # load word vectors if we are using them
+    word2vec = None
+    if args.word2vec_path:
+        vocab_word2vec_file = os.path.join(args.model_dir, 'vocab_word2vec.txt')
+        word2vec = similarity.load_word_vectors(
+            args.word2vec_path, vocab_word2vec_file, vocab.word2id,
+            vector_size=args.vector_size, header=args.header)
+
+    # perform the task-specific pre-processing; domain2train_data is what the
+    # models are trained on, domain2data is used to extract the features
+    if args.task == SENTIMENT:
+        print('Creating binary training data...')
+        domain2train_data = data_utils.get_tfidf_data(domain2data, vocab)
+    elif args.task in [POS, POS_BILSTM]:
+        print('Using words as training data for POS tagging...')
+        domain2train_data = domain2data
+    elif args.task == PARSING:
+        print('Using CoNLL entries as training data for parsing. Using word '
+              'forms to extract feature representations...')
+        domain2train_data = copy.deepcopy(domain2data)
+        for domain, domain_data in domain2data.items():
+            domain_data[0] = [[conll_entry.form for conll_entry in conll_entries]
+                              for conll_entries in domain_data[0]]
+    else:
+        raise ValueError('Data preproc for %s is not implemented.' % args.task)
+
+    print('Creating relative term frequency distributions for all domains...')
+    term_dist_path = os.path.join(args.model_dir, 'term_dist.txt')
+    domain2term_dist = similarity.get_domain_term_dists(
+        term_dist_path, domain2data, vocab)
+
+    # perform optimization for every target domain
+    for trg_domain in args.trg_domains:
+        print('Target domain:', trg_domain)
+
+        # set the domain and similarity-specific parser output path for parsing
+        parser_output_path, best_weights_parser_output_path = None, None
+        if args.task == PARSING:
+            parser_output_path = os.path.join(
+                args.parser_output_path, '%s-%s' % (trg_domain, '_'.join(
+                    args.feature_sets)))
+            if not os.path.exists(parser_output_path):
+                print('Creating %s...' % parser_output_path)
+                os.makedirs(parser_output_path)
+            # use a separate subfolder for the best weights
+            best_weights_parser_output_path = os.path.join(parser_output_path,
+                                                           'best-weights')
+            if not os.path.exists(best_weights_parser_output_path):
+                os.makedirs(best_weights_parser_output_path)
+
+        # get the training data of all source domains (not the target domain)
+        X_train, y_train, train_domains = data_utils.get_all_docs(
+            [(k, v) for (k, v) in sorted(domain2train_data.items())
+             if k != trg_domain], unlabeled=False)
+
+        # get the unprocessed examples for extracting the feature values
+        examples, y_train_check, train_domains_check = data_utils.get_all_docs(
+            [(k, v) for (k, v) in sorted(domain2data.items())
+             if k != trg_domain], unlabeled=False)
+
+        # some sanity checks just to make sure the processed and the
+        # unprocessed data still correspond to the same examples
+        assert np.array_equal(y_train, y_train_check)
+        assert len(train_domains) == len(train_domains_check),\
+            'Error: %d != %d.' % (len(train_domains), len(train_domains_check))
+        assert train_domains == train_domains_check, ('Error: %s != %s' % (
+            str(train_domains), str(train_domains_check)))
+        if args.task in [POS, POS_BILSTM, PARSING]:
+            # for sentiment, we are using a sparse matrix
+            X_train = np.array(X_train)
+        print('Training data shape:', X_train.shape, y_train.shape)
+
+        # train topic model if any of the features requires a topic distribution
+        topic_vectorizer, lda_model = None, None
+        if any(f_name.startswith('topic') for f_name in feature_names):
+            # train a topic model on labeled and unlabeled data of all domains
+            topic_vectorizer, lda_model = similarity.train_topic_model(
+                data_utils.get_all_docs(
+                    domain2data.items(), unlabeled=True)[0], vocab)
+
+        # get the feature representations of the training data
+        print('Creating the feature representations for the training data. '
+              'This may take some time...')
+        feature_values = features.get_feature_representations(
+            feature_names, examples, domain2data[trg_domain][0], vocab,
+            word2vec, topic_vectorizer, lda_model)
+
+        if args.z_norm:
+            # apply z-normalisation; this is important for good performance
+            print('Z-normalizing features...')
+            print('First five example features before normalisation:',
+                  feature_values[:5, :])
+            print('Standard deviation of features:', np.std(feature_values,
+                                                            axis=0))
+            print('Mean of features:', np.mean(feature_values, axis=0))
+            feature_values = stats.zscore(feature_values, axis=0)
+
+        # delete unnecessary variables to save space
+        del examples, y_train_check, train_domains_check
+
+        # run num_runs iterations of the optimization and baselines in order to
+        # compute statistics around mean/variance; things that vary between
+        # runs: validation/test split; train set of random baseline;
+        # final BayesOpt parameters; the feature values are constant for each
+        # run, which is why we generate them before to reduce the overhead
+        run_dict = {method: [] for method in BASELINES + [BAYES_OPT]}
+        for i in range(args.num_runs):
+            print('\nTarget domain %s. Run %d/%d.' % (trg_domain, i+1,
+                                                      args.num_runs))
+
+            # get the evaluation data from the target domain
+            X_test, y_test, _ = domain2train_data[trg_domain]
+            
+            # split off 100 validation examples from the evaluation data
+            X_test, X_val, y_test, y_val = train_test_split(
+                X_test, y_test, test_size=100, stratify=y_test
+                if args.task == SENTIMENT else None)
+            print('# of validation examples: %d. # of test examples: %d.'
+                  % (len(y_val), len(y_test)))
+
+            # train the model with pre-learned feature weights if specified
+            if args.feature_weights_file:
+                print('Training with pre-learned feature weights...')
+                task_utils.train_pretrained_weights(
+                    feature_values, X_train, y_train, train_domains,
+                    num_train_examples, X_val, y_val, X_test, y_test,
+                    trg_domain, args, feature_names, parser_output_path,
+                    perl_script_path)
+                continue  # skip the baselines and Bayesian Optimization
+
+            for baseline in args.baselines:
+
+                # select the training data dependent on the baseline
+                if baseline == RANDOM:
+                    print('Randomly selecting examples...')
+                    train_subset, _, labels_subset, _ = train_test_split(
+                        X_train, y_train, train_size=num_train_examples,
+                        stratify=y_train if args.task == SENTIMENT else None)
+                elif baseline == ALL_SOURCE_DATA:
+                    print('Selecting all source data examples...')
+                    train_subset, labels_subset = X_train, y_train
+                elif baseline == MOST_SIMILAR_DOMAIN:
+                    print('Selecting examples from the most similar domain...')
+                    most_similar_domain = similarity.get_most_similar_domain(
+                        trg_domain, domain2term_dist)
+                    train_subset, labels_subset, _ = domain2train_data[
+                        most_similar_domain]
+                    train_subset, _, labels_subset, _ = train_test_split(
+                        train_subset, labels_subset, train_size=num_train_examples,
+                        stratify=labels_subset if args.task == SENTIMENT else None)
+                elif baseline == MOST_SIMILAR_EXAMPLES:
+                    print('Selecting the most similar examples...')
+                    one_all_weights = np.ones(len(feature_names))
+                    one_all_weights[1:] = 0  # keep only the first feature
+                    train_subset, labels_subset = task_utils.get_data_subsets(
+                        feature_values, one_all_weights, X_train, y_train,
+                        args.task, num_train_examples)
+                else:
+                    raise ValueError('%s is not a baseline.' % baseline)
+
+                # train the baseline
+                val_accuracy, test_accuracy = train_and_evaluate(
+                    train_subset, labels_subset, X_val, y_val,
+                    X_test, y_test, parser_output_path=parser_output_path,
+                    perl_script_path=perl_script_path)
+                run_dict[baseline].append((val_accuracy, test_accuracy))
+
+            # define the lower and upper bounds of the input space [-1, 1]
+            lower = np.array(len(feature_names) * [-1])
+            upper = np.array(len(feature_names) * [1])
+            print('Lower limits shape:', lower.shape)
+            print('Upper limits shape:', upper.shape)
+
+            print('Running Bayesian Optimization...')
+            res = bayesian_optimization(objective_function, lower=lower,
+                                        upper=upper,
+                                        num_iterations=args.num_iterations)
+
+            best_feature_weights = res['x_opt']
+            print('Best feature weights', best_feature_weights)
+            train_subset, labels_subset = task_utils.get_data_subsets(
+                feature_values, best_feature_weights, X_train, y_train,
+                args.task, num_train_examples)
+            val_accuracy, test_accuracy = train_and_evaluate(
+                train_subset, labels_subset, X_val, y_val, X_test, y_test,
+                parser_output_path=best_weights_parser_output_path,
+                perl_script_path=perl_script_path)
+            run_dict[BAYES_OPT].append((val_accuracy, test_accuracy,
+                                          best_feature_weights))
+
+        # log the results of all methods to the log file
+        data_utils.log_to_file(args.log_file, run_dict, trg_domain, args)
diff --git a/bilstm_tagger/License b/bilstm_tagger/License
new file mode 100755
index 0000000..87c6de2
--- /dev/null
+++ b/bilstm_tagger/License
@@ -0,0 +1,13 @@
+Copyright 2016 The bilstm-aux authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/bilstm_tagger/README.md b/bilstm_tagger/README.md
new file mode 100755
index 0000000..95c4940
--- /dev/null
+++ b/bilstm_tagger/README.md
@@ -0,0 +1,108 @@
+## bi-LSTM tagger
+
+Bidirectional Long-Short Term Memory tagger 
+
+If you use this tagger please cite our paper:
+http://arxiv.org/abs/1604.05529
+
+### Requirements
+
+* python3 
+* [dynet](https://github.com/clab/dynet)
+
+## Installation
+
+Download and install dynet in a directory of your choice DYNETDIR: 
+
+```
+mkdir $DYNETDIR
+git clone https://github.com/clab/dynet
+```
+
+Follow the instructions in the Dynet documentation (use `-DPYTHON`,
+see http://dynet.readthedocs.io/en/latest/python.html). 
+
+And compile dynet:
+
+```
+cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python`
+```
+
+(if you have a GPU:
+
+```
+cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python` -DBACKEND=cuda
+```
+)
+
+After a successful installation, open python and import dynet to
+test whether the installation worked:
+
+```
+>>> import dynet
+[dynet] random seed: 2809331847
+[dynet] allocating memory: 512MB
+[dynet] memory allocation done.
+>>> dynet.__version__
+2.0
+```
+
+(You may need to set your PYTHONPATH to include Dynet's `build/python`)
+
+#### DyNet supports python 3
+
+The old bilstm-aux had a patch to work with python 3. This
+is no longer necessary, as DyNet supports python 3 as of
+https://github.com/clab/dynet/pull/130#issuecomment-259656695
+
+
+#### Example command
+
+Training the tagger:
+
+```
+python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1  > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2
+```
+
+#### Embeddings
+
+The poly embeddings [(Al-Rfou et al.,
+2013)](https://sites.google.com/site/rmyeid/projects/polyglot) can be
+downloaded from [here](http://www.let.rug.nl/bplank/bilty/embeds.tar.gz) (0.6GB)
+
+
+#### A couple of remarks
+
+The choice of 22 languages from UD1.2 (rather than 33) is described in
+our TACL parsing paper, Section 3.1. [(Agić et al.,
+2016)](https://transacl.org/ojs/index.php/tacl/article/view/869). Note,
+however, that the bi-LSTM tagger does not require large amounts of
+training data (as discussed in our paper). Therefore above are 
+results for all languages in UD1.3 (for the canonical language
+subparts, i.e., those with just the language prefix, no further
+suffix; e.g. 'nl' but not 'nl_lassy', and those languages which are
+distributed with word forms).
+
+The `bilty` code is a significantly refactored version of the code
+originally used in the paper. For example, `bilty` supports multi-task
+learning with output layers at different layers (`--pred_layer`), and
+it correctly supports stacked LSTMs (see e.g., Ballesteros et al.,
+2015, Dyer et al., 2015). The results on UD1.3 are obtained with
+`bilty` using no stacking (`--h_layers 1`). 
+
+#### Recommended setting for `bilty`:
+
+* 3 stacked LSTMs, predicting on outermost layer, otherwise default settings, i.e., `--h_layers 3 --pred_layer 3`
+
+#### Reference
+
+```
+@inproceedings{plank:ea:2016,
+  title={{Multilingual Part-of-Speech Tagging with Bidirectional Long Short-Term Memory Models and Auxiliary Loss}},
+  author={Plank, Barbara and S{\o}gaard, Anders and Goldberg, Yoav},
+  booktitle={ACL 2016, arXiv preprint arXiv:1604.05529},
+  url={http://arxiv.org/abs/1604.05529},
+  year={2016}
+}
+```
+
diff --git a/bilstm_tagger/langs/lang_canonic.txt b/bilstm_tagger/langs/lang_canonic.txt
new file mode 100755
index 0000000..9e15b31
--- /dev/null
+++ b/bilstm_tagger/langs/lang_canonic.txt
@@ -0,0 +1,39 @@
+ar
+bg
+ca
+cs
+cu
+da
+de
+el
+en
+es
+et
+eu
+fa
+fi
+fr
+ga
+gl
+got
+grc
+he
+hi
+hr
+hu
+id
+it
+kk
+la
+lv
+nl
+no
+pl
+pt
+ro
+ru
+sl
+sv
+ta
+tr
+zh
diff --git a/bilstm_tagger/langs/lang_with_embeds.txt b/bilstm_tagger/langs/lang_with_embeds.txt
new file mode 100755
index 0000000..a595b41
--- /dev/null
+++ b/bilstm_tagger/langs/lang_with_embeds.txt
@@ -0,0 +1,26 @@
+ar
+bg
+ca
+cs
+da
+de
+el
+en
+es
+et
+eu
+fa
+fi
+fr
+ga
+he
+hi
+hr
+id
+it
+nl
+no
+pl
+pt
+sl
+sv
diff --git a/bilstm_tagger/results-UD1.3-pycnn.md b/bilstm_tagger/results-UD1.3-pycnn.md
new file mode 100755
index 0000000..1a9a47f
--- /dev/null
+++ b/bilstm_tagger/results-UD1.3-pycnn.md
@@ -0,0 +1,64 @@
+
+#### Results on UD1.3
+
+NB. The results below are with the old version of Dynet (pycnn).
+
+The table below provides results on UD1.3 (iters=20, h_layers=1).
+
++poly is using pre-trained embeddings to initialize
+word embeddings.  Note that for some languages it slightly hurts performance.
+
+```
+python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1  > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2
+```
+
+| Lang | i20-h1  | +poly |
+| ---| -----:| -----:|
+| ar | 96.07 | 96.37 |
+| bg | 98.21 | 98.12 |
+| ca | 98.11 | 98.24 |
+| cs | 98.63 | 98.60 |
+| cu | 96.48 | -- |
+| da | 96.06 | 96.04 |
+| de | 92.91 | 93.64 |
+| el | 97.85 | 98.36 |
+| en | 94.60 | 95.04 |
+| es | 95.23 | 95.76 |
+| et | 95.75 | 96.57 |
+| eu | 93.86 | 95.40 |
+| fa | 96.82 | 97.38 |
+| fi | 94.32 | 95.35 |
+| fr | 96.34 | 96.45 |
+| ga | 90.50 | 91.29 |
+| gl | 96.89 | -- |
+| got | 95.97 | -- |
+| grc | 94.36 | -- |
+| he | 95.25 | 96.78 |
+| hi | 96.37 | 96.93 |
+| hr | 94.98 | 96.07 |
+| hu | 93.84 | -- |
+| id | 93.17 | 93.55 |
+| it | 97.40 | 97.82 |
+| kk | 77.68 | -- |
+| la | 90.17 | -- |
+| lv | 91.42 | -- |
+| nl | 90.02 | 89.87 |
+| no | 97.58 | 97.97 |
+| pl | 96.30 | 97.36 |
+| pt | 97.21 | 97.46 |
+| ro | 95.49 | -- |
+| ru | 95.69 | -- |
+| sl | 97.53 | 96.42 |
+| sv | 96.49 | 96.76 |
+| ta | 84.51 | -- |
+| tr | 93.81 | -- |
+| zh | 93.13 | -- |
+
+Using pre-trained embeddings often helps to improve accuracy; however, this
+does not strictly hold for all languages.
+
+For more information, predictions files and pre-trained models
+visit [http://www.let.rug.nl/bplank/bilty/](http://www.let.rug.nl/bplank/bilty/)
+
+
+
diff --git a/bilstm_tagger/scripts/submit-bilty-ud1.3.sh b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh
new file mode 100755
index 0000000..f5c661b
--- /dev/null
+++ b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# train models on UD 1.3
+#
+SUBMIT=0
+
+PARTITION=nodes
+mkdir -p runs
+
+CORPUSDIR=~/corpora/pos/ud1.3/orgtok/goldpos/
+EXPDIR=/data/p252438/experiments/bilty
+
+tagger=bilty
+mkdir -p $EXPDIR/models/$tagger
+mkdir -p $EXPDIR/nohup
+mkdir -p $EXPDIR/predictions/$tagger
+
+ITERS=20    
+#ITERS=30    
+SIGMA=0.2
+CDIM=100
+
+SEED=1512141834
+TRAINER=sgd
+INDIM=64
+HLAYERS=1
+#HLAYERS=3
+T0_OUT=$HLAYERS
+
+for lang in `cat langs/lang_with_embeds.txt`; # all for which we have poly embeds (26)
+do 
+    TRAIN=$lang-ud-train.conllu
+    JOBNAME=bilty-$lang-ud1.3-poly-i$ITERS-h$HLAYERS
+
+    echo "#!/bin/bash"  > $$tmp
+    echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp
+    echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp
+    echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp
+    echo "module load CMake" >> $$tmp
+    
+    echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --embeds embeds/poly_a/$lang.polyglot.txt --h_layers $HLAYERS --pred_layer $T0_OUT  > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp
+
+    if [ $SUBMIT -eq 1 ] ; then
+	echo "SUBMIT"
+        sbatch $$tmp
+    fi
+    cat $$tmp
+    rm $$tmp
+done
+
+for lang in `cat langs/lang_canonic.txt` ;  # all without embeddings (but only canical names)
+do 
+    TRAIN=$lang-ud-train.conllu
+    JOBNAME=bilty-$lang-ud1.3-i$ITERS-h$HLAYERS
+    
+    echo "#!/bin/bash"  > $$tmp
+    echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp
+    echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp
+    echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp
+    echo "module load CMake" >> $$tmp
+
+    echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --h_layers $HLAYERS --pred_layer $T0_OUT  > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp
+
+    if [ $SUBMIT -eq 1 ] ; then
+        echo "SUBMIT"
+        sbatch $$tmp
+    fi
+
+    cat $$tmp
+    rm $$tmp
+done
diff --git a/bilstm_tagger/src/bilty.py b/bilstm_tagger/src/bilty.py
new file mode 100755
index 0000000..e3d89d7
--- /dev/null
+++ b/bilstm_tagger/src/bilty.py
@@ -0,0 +1,580 @@
+#!/usr/bin/env python3
+# coding=utf-8
+"""
+A neural network based tagger  (bi-LSTM)
+:author: Barbara Plank
+"""
+import argparse
+import random
+import time
+import sys
+import numpy as np
+import os
+import pickle
+import dynet
+
+from lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor
+from lib.mio import read_conll_file, load_embeddings_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description="""Run the NN tagger""")
+    parser.add_argument("--train", nargs='*', help="train folder for each task") # allow multiple train files, each asociated with a task = position in the list
+    parser.add_argument("--pred_layer", nargs='*', help="layer of predictons for each task", required=True) # for each task the layer on which it is predicted (default 1)
+    parser.add_argument("--model", help="load model from file", required=False)
+    parser.add_argument("--iters", help="training iterations [default: 30]", required=False,type=int,default=30)
+    parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,type=int,default=64)
+    parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,type=int,default=100)
+    parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False,type=int,default=100)
+    parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,type=int,default=1)
+    parser.add_argument("--test", nargs='*', help="test file(s)", required=False) # should be in the same order/task as train
+    parser.add_argument("--dev", help="dev file(s)", required=False) 
+    parser.add_argument("--output", help="output predictions to file", required=False,default=None)
+    parser.add_argument("--lower", help="lowercase words (not used)", required=False,default=False,action="store_true")
+    parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,default=None)
+    parser.add_argument("--embeds", help="word embeddings file", required=False, default=None)
+    parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float)
+    parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh", type=MyNNTaggerArgumentOptions.acfunct)
+    parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd")
+    parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False, type=int)
+    parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int)
+    parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None)
+
+    args = parser.parse_args()
+
+    if args.train:
+        if not args.pred_layer:
+            print("--pred_layer required!")
+            exit()
+    
+    if args.dynet_seed:
+        print(">>> using seed: ", args.dynet_seed, file=sys.stderr)
+        np.random.seed(args.dynet_seed)
+        random.seed(args.dynet_seed)
+
+    if args.c_in_dim == 0:
+        print("no character embeddings", file=sys.stderr)
+
+    if args.save:
+        # check if folder exists
+        if os.path.isdir(args.save):
+            modeldir = os.path.dirname(args.save)
+            if not os.path.exists(modeldir):
+                os.makedirs(modeldir)
+    if args.output:
+        if os.path.isdir(args.output):
+            outdir = os.path.dirname(args.output)
+            if not os.path.exists(outdir):
+                os.makedirs(outdir)
+            
+
+    start = time.time()
+
+    if args.model:
+        print("loading model from file {}".format(args.model), file=sys.stderr)
+        tagger = load(args)
+    else:
+        tagger = NNTagger(args.in_dim,
+                              args.h_dim,
+                              args.c_in_dim,
+                              args.h_layers,
+                              args.pred_layer,
+                              embeds_file=args.embeds,
+                              activation=args.ac,
+                              lower=args.lower,
+                              noise_sigma=args.sigma)
+
+    if args.train and len( args.train ) != 0:
+        tagger.fit(args.train, args.iters, args.trainer, dev=args.dev)
+        if args.save:
+            save(tagger, args)
+
+    if args.test and len( args.test ) != 0:
+        stdout = sys.stdout
+        # One file per test ... 
+        for i, test in enumerate( args.test ):
+            if args.output != None:
+                file_pred = args.output+".task"+str(i)
+                sys.stdout = open(file_pred, 'w')
+
+            sys.stderr.write('\nTesting Task'+str(i)+'\n')
+            sys.stderr.write('*******\n')
+            test_X, test_Y, org_X, org_Y, task_labels = tagger.get_data_as_indices(test, "task"+str(i))
+            correct, total = tagger.evaluate(test_X, test_Y, org_X, org_Y, task_labels, output_predictions=args.output)
+
+            print("\ntask%s test accuracy on %s items: %.4f" % (i, i+1, correct/total), file=sys.stderr)
+            print(("Task"+str(i)+" Done. Took {0:.2f} seconds.".format(time.time()-start)),file=sys.stderr)
+            sys.stdout = stdout 
+
+
+    if args.ac:
+        activation=args.ac.__name__
+    else:
+        activation="None"
+    print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}"
+          "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}"
+          "\tembeds: {3}".format(args.in_dim,args.h_dim,args.h_layers,args.embeds,activation, args.sigma, args.lower, args.c_in_dim), file=sys.stderr)
+
+    if args.save_embeds:
+        tagger.save_embeds(args.save_embeds)
+
+def load(args):
+    """
+    load a model from file; specify the .model file, it assumes the *pickle file in the same location
+    """
+    myparams = pickle.load(open(args.model+".pickle", "rb"))
+    tagger = NNTagger(myparams["in_dim"],
+                      myparams["h_dim"],
+                      myparams["c_in_dim"],
+                      myparams["h_layers"],
+                      myparams["pred_layer"],
+                      activation=myparams["activation"], tasks_ids=myparams["tasks_ids"])
+    tagger.set_indices(myparams["w2i"],myparams["c2i"],myparams["task2tag2idx"])
+    tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \
+        tagger.build_computation_graph(myparams["num_words"],
+                                       myparams["num_chars"])
+    #tagger.model.load(str.encode(args.model))
+    tagger.model.load(args.model)
+    print("model loaded: {}".format(args.model), file=sys.stderr)
+    return tagger
+
+def save(nntagger, args):
+    """
+    save a model; dynet only saves the parameters, need to store the rest separately
+    """
+    outdir = args.save
+    modelname = outdir + ".model"
+    #nntagger.model.save(str.encode(modelname))  #python3 needs it as bytes - no longer!
+    nntagger.model.save(modelname)
+    import pickle
+    print(nntagger.task2tag2idx)
+    myparams = {"num_words": len(nntagger.w2i),
+                "num_chars": len(nntagger.c2i),
+                "tasks_ids": nntagger.tasks_ids,
+                "w2i": nntagger.w2i,
+                "c2i": nntagger.c2i,
+                "task2tag2idx": nntagger.task2tag2idx,
+                "activation": nntagger.activation,
+                "in_dim": nntagger.in_dim,
+                "h_dim": nntagger.h_dim,
+                "c_in_dim": nntagger.c_in_dim,
+                "h_layers": nntagger.h_layers,
+                "embeds_file": nntagger.embeds_file,
+                "pred_layer": nntagger.pred_layer
+                }
+    pickle.dump(myparams, open( modelname+".pickle", "wb" ) )
+    print("model stored: {}".format(modelname), file=sys.stderr)
+
+
+class NNTagger(object):
+
+    def __init__(self,in_dim,h_dim,c_in_dim,h_layers,pred_layer,embeds_file=None,activation=dynet.tanh, lower=False, noise_sigma=0.1, tasks_ids=[]):
+        self.w2i = {}  # word to index mapping
+        self.c2i = {}  # char to index mapping
+        self.tasks_ids = tasks_ids # list of names for each task
+        self.task2tag2idx = {} # need one dictionary per task
+        self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task
+        self.model = dynet.Model() #init model
+        self.in_dim = in_dim
+        self.h_dim = h_dim
+        self.c_in_dim = c_in_dim
+        self.activation = activation
+        self.lower = lower
+        self.noise_sigma = noise_sigma
+        self.h_layers = h_layers
+        self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors
+        self.wembeds = None # lookup: embeddings for words
+        self.cembeds = None # lookup: embeddings for characters
+        self.embeds_file = embeds_file
+        self.char_rnn = None # RNN for character input
+
+
+    def pick_neg_log(self, pred, gold):
+        return -dynet.log(dynet.pick(pred, gold))
+
+    def set_indices(self, w2i, c2i, task2t2i):
+        for task_id in task2t2i:
+            self.task2tag2idx[task_id] = task2t2i[task_id]
+        self.w2i = w2i
+        self.c2i = c2i
+
+    def fit(self, list_folders_name, num_iterations, train_algo, dev=None):
+        """
+        train the tagger
+        """
+        print("read training data",file=sys.stderr)
+
+        nb_tasks = len( list_folders_name )
+
+        train_X, train_Y, task_labels, w2i, c2i, task2t2i = self.get_train_data(list_folders_name)
+
+        ## after calling get_train_data we have self.tasks_ids
+        self.task2layer = {task_id: out_layer for task_id, out_layer in zip(self.tasks_ids, self.pred_layer)}
+        print("task2layer", self.task2layer, file=sys.stderr)
+
+        # store mappings of words and tags to indices
+        self.set_indices(w2i,c2i,task2t2i)
+
+        if dev:
+            dev_X, dev_Y, org_X, org_Y, task_labels = self.get_data_as_indices(dev, "task0")
+
+        # init lookup parameters and define graph
+        print("build graph",file=sys.stderr)
+        
+        num_words = len(self.w2i)
+        num_chars = len(self.c2i)
+        
+        assert(nb_tasks==len(self.pred_layer))
+        
+        self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)
+
+        if train_algo == "sgd":
+            trainer = dynet.SimpleSGDTrainer(self.model)
+        elif train_algo == "adam":
+            trainer = dynet.AdamTrainer(self.model)
+
+        train_data = list(zip(train_X,train_Y, task_labels))
+
+        for iter in range(num_iterations):
+            total_loss=0.0
+            total_tagged=0.0
+            random.shuffle(train_data)
+            for ((word_indices,char_indices),y, task_of_instance) in train_data:
+                # use same predict function for training and testing
+                output = self.predict(word_indices, char_indices, task_of_instance, train=True)
+
+                loss1 = dynet.esum([self.pick_neg_log(pred,gold) for pred, gold in zip(output, y)])
+                lv = loss1.value()
+                total_loss += lv
+                total_tagged += len(word_indices)
+
+                loss1.backward()
+                trainer.update()
+
+            print("iter {2} {0:>12}: {1:.2f}".format("total loss",total_loss/total_tagged,iter), file=sys.stderr)
+            
+            if dev:
+                # evaluate after every epoch
+                correct, total = self.evaluate(dev_X, dev_Y, org_X, org_Y, task_labels)
+                print("\ndev accuracy: %.4f" % (correct/total), file=sys.stderr)
+
+
+
+    def build_computation_graph(self, num_words, num_chars):
+        """
+        build graph and link to parameters
+        """
+         # initialize the word embeddings and the parameters
+        if self.embeds_file:
+            print("loading embeddings", file=sys.stderr)
+            embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
+            assert(emb_dim==self.in_dim)
+            num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
+            # init model parameters and initialize them
+            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+               
+            init=0
+            l = len(embeddings.keys())
+            for word in embeddings.keys():
+                # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers)
+                if word in self.w2i:
+                    wembeds.init_row(self.w2i[word], embeddings[word])
+                else:
+                    self.w2i[word]=len(self.w2i.keys()) # add new word
+                    wembeds.init_row(self.w2i[word], embeddings[word])
+                init+=1
+            print("initialized: {}".format(init), file=sys.stderr)
+
+        else:
+            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+               
+
+        #make it more flexible to add number of layers as specified by parameter
+        layers = [] # inner layers
+        output_layers_dict = {}   # from task_id to actual softmax predictor
+        task_expected_at = {} # map task_id => output_layer_#
+
+        # connect output layers to tasks
+        for output_layer, task_id in zip(self.pred_layer, self.tasks_ids):
+            if output_layer > self.h_layers:
+                raise ValueError("cannot have a task at a layer which is beyond the model, increase h_layers")
+            task_expected_at[task_id] = output_layer
+
+        print("task expected at", task_expected_at, file=sys.stderr)
+
+        nb_tasks = len( self.tasks_ids )
+
+        print("h_layers:", self.h_layers, file=sys.stderr)
+        for layer_num in range(0,self.h_layers):
+            print(">>>", layer_num, "layer_num") 
+
+            if layer_num == 0:
+                builder = dynet.LSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # in_dim: size of each layer
+                layers.append(BiRNNSequencePredictor(builder)) #returns forward and backward sequence
+            else:
+                # add inner layers (if h_layers >1)
+                builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
+                layers.append(BiRNNSequencePredictor(builder))
+
+       # store at which layer to predict task
+        for task_id in self.tasks_ids:
+            task_num_labels= len(self.task2tag2idx[task_id])
+            output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax))
+
+        sys.stderr.write('#\nOutput layers'+str(len(output_layers_dict))+'\n')
+
+        char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
+
+        predictors = {}
+        predictors["inner"] = layers
+        predictors["output_layers_dict"] = output_layers_dict
+        predictors["task_expected_at"] = task_expected_at
+
+        return predictors, char_rnn, wembeds, cembeds
+
+    def get_features(self, words):
+        """
+        from a list of words, return the word and word char indices
+        """
+        word_indices = []
+        word_char_indices = []
+        for word in words:
+            if word in self.w2i:
+                word_indices.append(self.w2i[word])
+            else:
+                word_indices.append(self.w2i["_UNK"])
+                
+            chars_of_word = [self.c2i["<w>"]]
+            for char in word:
+                if char in self.c2i:
+                    chars_of_word.append(self.c2i[char])
+                else:
+                    chars_of_word.append(self.c2i["_UNK"])
+            chars_of_word.append(self.c2i["</w>"])
+            word_char_indices.append(chars_of_word)
+        return word_indices, word_char_indices
+                                                                                                                                
+
+    def get_data_as_indices(self, folder_name, task):
+        """
+        X = list of (word_indices, word_char_indices)
+        Y = list of tag indices
+        """
+        X, Y = [],[]
+        org_X, org_Y = [], []
+        task_labels = []
+        for (words, tags) in read_conll_file(folder_name):
+            word_indices, word_char_indices = self.get_features(words)
+            tag_indices = [self.task2tag2idx[task].get(tag) for tag in tags]
+            X.append((word_indices,word_char_indices))
+            Y.append(tag_indices)
+            org_X.append(words)
+            org_Y.append(tags)
+            task_labels.append( task )
+        return X, Y, org_X, org_Y, task_labels
+
+
+    def predict(self, word_indices, char_indices, task_id, train=False):
+        """
+        predict tags for a sentence represented as char+word embeddings
+        """
+        dynet.renew_cg() # new graph
+
+        char_emb = []
+        rev_char_emb = []
+        # get representation for words
+        for chars_of_token in char_indices:
+            # use last state as word representation
+            last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1]
+            rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1]
+            char_emb.append(last_state)
+            rev_char_emb.append(rev_last_state)
+            
+        wfeatures = [self.wembeds[w] for w in word_indices]
+        features = [dynet.concatenate([w,c,rev_c]) for w,c,rev_c in zip(wfeatures,char_emb,reversed(rev_char_emb))]
+        
+        if train: # only do at training time
+            features = [dynet.noise(fe,self.noise_sigma) for fe in features]
+
+        output_expected_at_layer = self.predictors["task_expected_at"][task_id]
+        output_expected_at_layer -=1
+
+        # go through layers
+        # input is now combination of w + char emb
+        prev = features
+        num_layers = self.h_layers
+#        for i in range(0,num_layers-1):
+        for i in range(0,num_layers):
+            predictor = self.predictors["inner"][i]
+            forward_sequence, backward_sequence = predictor.predict_sequence(prev)        
+            if i > 0 and self.activation:
+                # activation between LSTM layers
+                forward_sequence = [self.activation(s) for s in forward_sequence]
+                backward_sequence = [self.activation(s) for s in backward_sequence]
+
+            if i == output_expected_at_layer:
+                output_predictor = self.predictors["output_layers_dict"][task_id] 
+                concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))]
+
+                if train and self.noise_sigma > 0.0:
+                    concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer]
+                output = output_predictor.predict_sequence(concat_layer)
+                return output
+
+            prev = forward_sequence
+            prev_rev = backward_sequence # not used
+
+        raise Exception("oops should not be here")
+        return None
+
+    def evaluate(self, test_X, test_Y, org_X, org_Y, task_labels, output_predictions=None, verbose=True):
+        """
+        compute accuracy on a test file
+        """
+        correct = 0
+        total = 0.0
+
+        if output_predictions != None:
+            i2w = {self.w2i[w] : w for w in self.w2i.keys()}
+            task_id = task_labels[0] #get first
+            print(task_id,"labels:", self.task2tag2idx[task_id], file=sys.stderr )
+            i2t = {self.task2tag2idx[task_id][t] : t for t in self.task2tag2idx[task_id].keys()}
+
+        for i, ((word_indices, word_char_indices), gold_tag_indices, task_of_instance) in enumerate(zip(test_X, test_Y, task_labels)):
+            if verbose:
+                if i%100==0:
+                    sys.stderr.write('%s'%i)
+                elif i%10==0:
+                    sys.stderr.write('.')
+                    
+            output = self.predict(word_indices, word_char_indices, task_of_instance)
+            predicted_tag_indices = [np.argmax(o.value()) for o in output]  
+            if output_predictions:
+                prediction = [i2t[idx] for idx in predicted_tag_indices]
+
+                words = org_X[i]
+                gold = org_Y[i]
+
+                for w,g,p in zip(words,gold,prediction):
+                    print(("{}\t{}\t{}".format(w,g,p)))
+                print("")
+            correct += sum([1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold])
+            total += len(gold_tag_indices)
+
+        return correct, total
+
+
+
+    # Get train data: need to read each train set (linked to a task) separately
+
+    def get_train_data(self, list_folders_name):
+        """
+
+        :param list_folders_name: list of folders names
+        :param lower: whether to lowercase tokens
+
+        transform training data to features (word indices)
+        map tags to integers
+        """
+        X = []
+        Y = []
+        task_labels = [] #keeps track of where instances come from "task1" or "task2"..
+        self.tasks_ids = [] #record the id of the tasks
+
+        #num_sentences=0
+        #num_tokens=0
+
+        # word 2 indices and tag 2 indices
+        w2i = {} # word to index
+        c2i = {} # char to index
+        task2tag2idx = {} # id of the task -> tag2idx
+
+        w2i["_UNK"] = 0  # unk word / OOV
+        c2i["_UNK"] = 0  # unk char
+        c2i["<w>"] = 1   # word start
+        c2i["</w>"] = 2  # word end index
+        
+        
+        for i, folder_name in enumerate( list_folders_name ):
+            num_sentences=0
+            num_tokens=0
+            task_id = 'task'+str(i)
+            self.tasks_ids.append( task_id )
+            if task_id not in task2tag2idx:
+                task2tag2idx[task_id] = {}
+            for instance_idx, (words, tags) in enumerate(read_conll_file(folder_name)):
+                num_sentences += 1
+                instance_word_indices = [] #sequence of word indices
+                instance_char_indices = [] #sequence of char indices 
+                instance_tags_indices = [] #sequence of tag indices
+
+                for i, (word, tag) in enumerate(zip(words, tags)):
+                    num_tokens += 1
+
+                    # map words and tags to indices
+                    if word not in w2i:
+                        w2i[word] = len(w2i)
+                    instance_word_indices.append(w2i[word])
+
+                    chars_of_word = [c2i["<w>"]]
+                    for char in word:
+                        if char not in c2i:
+                            c2i[char] = len(c2i)
+                        chars_of_word.append(c2i[char])
+                    chars_of_word.append(c2i["</w>"])
+                    instance_char_indices.append(chars_of_word)
+                            
+                    if tag not in task2tag2idx[task_id]:
+                        #tag2idx[tag]=len(tag2idx)
+                        task2tag2idx[task_id][tag]=len(task2tag2idx[task_id])
+
+                    instance_tags_indices.append(task2tag2idx[task_id].get(tag))
+
+                X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices
+                Y.append(instance_tags_indices)
+                task_labels.append(task_id)
+
+            #self.num_labels[task_id] = len( task2tag2idx[task_id] )
+
+            if num_sentences == 0 or num_tokens == 0:
+                sys.exit( "No data read from: "+folder_name )
+            print("TASK "+task_id+" "+folder_name, file=sys.stderr )
+            print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+            print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr)
+
+        assert(len(X)==len(Y))
+        return X, Y, task_labels, w2i, c2i, task2tag2idx  #sequence of features, sequence of labels, necessary mappings
+
+
+    def save_embeds(self, out_filename):
+        # construct reverse mapping
+        i2w = {self.w2i[w]: w for w in self.w2i.keys()}
+
+        OUT = open(out_filename+".w.emb","w")
+        for word_id in i2w.keys():
+            wembeds_expression = self.wembeds[word_id]
+            word = i2w[word_id]
+            OUT.write("{} {}\n".format(word," ".join([str(x) for x in wembeds_expression.npvalue()])))
+        OUT.close()
+
+
+class MyNNTaggerArgumentOptions(object):
+    def __init__(self):
+        pass
+    ### functions for checking arguments
+    def acfunct(arg):
+        """ check for allowed argument for --ac option """
+        try:
+            functions = [dynet.rectify, dynet.tanh]
+            functions = { function.__name__ : function for function in functions}
+            functions["None"] = None
+            return functions[str(arg)]
+        except:
+            raise argparse.ArgumentTypeError("String {} does not match required format".format(arg,))
+
+
+
+# call main() only when this file is executed as a script, not when it is imported
+if __name__=="__main__":
+    main()
diff --git a/bilstm_tagger/src/run_simple.py b/bilstm_tagger/src/run_simple.py
new file mode 100755
index 0000000..60eec71
--- /dev/null
+++ b/bilstm_tagger/src/run_simple.py
@@ -0,0 +1,23 @@
+#### Example of using bilty from within code
+##
+## to properly seed dyNet add parameter to your script:
+## python run_simple.py --dynet-seed 113
+
+from bilstm_tagger.src.simplebilty import SimpleBiltyTagger
+import random
+### Use --dynet-seed $SEED
+seed=113 # assume the same value is passed to the script via --dynet-seed
+# NOTE: hard-coded example paths; the dev split is used for training and the test split for evaluation
+train_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-dev.conllu"
+dev_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-test.conllu"
+in_dim=64      # input (word embedding) dimension
+h_dim=100      # hidden dimension
+c_in_dim=100   # input dimension for character embeddings
+h_layers=1     # number of stacked LSTMs (1 = no stacking)
+iters=2        # training iterations
+trainer="sgd"  # training algorithm: "sgd" or "adam"
+tagger = SimpleBiltyTagger(in_dim, h_dim,c_in_dim,h_layers,embeds_file=None)
+# reads the training file and stores the word/char/tag index mappings on the tagger
+train_X, train_Y = tagger.get_train_data(train_data)
+tagger.fit(train_X, train_Y, iters, trainer,seed=seed)
+# convert the evaluation file with the mappings built from the training data
+test_X, test_Y = tagger.get_data_as_indices(dev_data)
+correct, total = tagger.evaluate(test_X, test_Y)
+# number of correct tokens, total number of tokens, accuracy
+print(correct, total, correct/total)
diff --git a/bilstm_tagger/src/simplebilty.py b/bilstm_tagger/src/simplebilty.py
new file mode 100755
index 0000000..11f4972
--- /dev/null
+++ b/bilstm_tagger/src/simplebilty.py
@@ -0,0 +1,598 @@
+#!/usr/bin/env python3
+# coding=utf-8
+"""
+A neural network based tagger  (bi-LSTM) - version w/o MTL
+:author: Barbara Plank
+"""
+import argparse
+import random
+import time
+import sys
+import numpy as np
+import os
+import pickle
+import dynet
+
+from bilstm_tagger.src.lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor
+from bilstm_tagger.src.lib.mio import read_conll_file, load_embeddings_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description="""Run the NN tagger""")
+    parser.add_argument("--train",
+                        help="train data")  # allow multiple train files, each asociated with a task = position in the list
+    # parser.add_argument("--pred_layer", help="layer of predictons", default=1) # assume always h_layer here
+    parser.add_argument("--model", help="load model from file", required=False)
+    parser.add_argument("--iters", help="training iterations [default: 30]", required=False, type=int, default=30)
+    parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,
+                        type=int, default=64)
+    parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,
+                        type=int, default=100)
+    parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False, type=int, default=100)
+    parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,
+                        type=int, default=1)
+    parser.add_argument("--test", nargs='*', help="test file(s)",
+                        required=False)  # should be in the same order/task as train
+    parser.add_argument("--dev", help="dev file(s)", required=False)
+    parser.add_argument("--output", help="output predictions to file", required=False, default=None)
+    parser.add_argument("--lower", help="lowercase words (not used)", required=False, default=False,
+                        action="store_true")
+    parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,
+                        default=None)
+    parser.add_argument("--embeds", help="word embeddings file", required=False, default=None)
+    parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float)
+    parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh",
+                        type=MyNNTaggerArgumentOptions.acfunct)
+    parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd")
+    parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False,
+                        type=int)
+    parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int)
+    parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None)
+
+    args = parser.parse_args()
+
+    if args.save:
+        # check if folder exists
+        if os.path.isdir(args.save):
+            modeldir = os.path.dirname(args.save)
+            if not os.path.exists(modeldir):
+                os.makedirs(modeldir)
+    if args.output:
+        if os.path.isdir(args.output):
+            outdir = os.path.dirname(args.output)
+            if not os.path.exists(outdir):
+                os.makedirs(outdir)
+
+    start = time.time()
+
+    if args.model:
+        print("loading model from file {}".format(args.model), file=sys.stderr)
+        tagger = load(args.model)
+    else:
+        tagger = SimpleBiltyTagger(args.in_dim,
+                                   args.h_dim,
+                                   args.c_in_dim,
+                                   args.h_layers,
+                                   embeds_file=args.embeds,
+                                   activation=args.ac,
+                                   lower=args.lower,
+                                   noise_sigma=args.sigma)
+
+    if args.train:
+        ## read data
+        train_X, train_Y = tagger.get_train_data(args.train)
+
+        if dev:
+            dev_X, dev_Y, org_X, org_Y, task_labels = self.get_data_as_indices(dev, "task0")
+
+        tagger.fit(args.train, args.iters, args.trainer, seed=args.dynet_seed)
+        if args.save:
+            save(tagger, args.save)
+
+    if args.test and len(args.test) != 0:
+        stdout = sys.stdout
+        # One file per test ... 
+        for i, test in enumerate(args.test):
+            if args.output != None:
+                file_pred = args.output + ".task" + str(i)
+                sys.stdout = open(file_pred, 'w')
+
+            sys.stderr.write('\nTesting Task' + str(i) + '\n')
+            sys.stderr.write('*******\n')
+            test_X, test_Y, org_X, org_Y, task_labels = tagger.get_data_as_indices(test, "task" + str(i))
+            correct, total = tagger.evaluate(test_X, test_Y, org_X, org_Y, task_labels, output_predictions=args.output)
+
+            print("\ntask%s test accuracy on %s items: %.4f" % (i, i + 1, correct / total), file=sys.stderr)
+            print(("Task" + str(i) + " Done. Took {0:.2f} seconds.".format(time.time() - start)), file=sys.stderr)
+            sys.stdout = stdout
+
+    if args.ac:
+        activation = args.ac.__name__
+    else:
+        activation = "None"
+    print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}"
+          "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}"
+          "\tembeds: {3}".format(args.in_dim, args.h_dim, args.h_layers, args.embeds, activation, args.sigma,
+                                 args.lower, args.c_in_dim), file=sys.stderr)
+
+    if args.save_embeds:
+        tagger.save_embeds(args.save_embeds)
+
+
+def load(model_file):
+    """
+    load a model from file; specify the .model file, it assumes the *pickle file in the same location
+    """
+    myparams = pickle.load(open(model_file + ".pickle", "rb"))
+    tagger = SimpleBiltyTagger(myparams["in_dim"],
+                               myparams["h_dim"],
+                               myparams["c_in_dim"],
+                               myparams["h_layers"],
+                               activation=myparams["activation"])
+    tagger.set_indices(myparams["w2i"], myparams["c2i"], myparams["tag2idx"])
+    tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \
+        tagger.build_computation_graph(myparams["num_words"],
+                                       myparams["num_chars"])
+    tagger.model.load(model_file)
+    print("model loaded: {}".format(model_file), file=sys.stderr)
+    return tagger
+
+
+def save(nntagger, model_file_name):
+    """
+    save a model; dynet only saves the parameters, need to store the rest separately
+    """
+    nntagger.model.save(model_file_name)
+    import pickle
+    myparams = {"num_words": len(nntagger.w2i),
+                "num_chars": len(nntagger.c2i),
+                "w2i": nntagger.w2i,
+                "c2i": nntagger.c2i,
+                "tag2idx": nntagger.tag2idx,
+                "activation": nntagger.activation,
+                "in_dim": nntagger.in_dim,
+                "h_dim": nntagger.h_dim,
+                "c_in_dim": nntagger.c_in_dim,
+                "h_layers": nntagger.h_layers,
+                "embeds_file": nntagger.embeds_file,
+                "pred_layer": nntagger.pred_layer
+                }
+    pickle.dump(myparams, open(model_file_name + ".pickle", "wb"))
+    print("model stored: {}".format(model_file_name), file=sys.stderr)
+
+
+class SimpleBiltyTagger(object):
+    def __init__(self, in_dim, h_dim, c_in_dim, h_layers, embeds_file=None, activation=dynet.tanh, lower=False,
+                 noise_sigma=0.1, tasks_ids=[]):
+        self.w2i = {}  # word to index mapping
+        self.c2i = {}  # char to index mapping
+        self.tag2idx = {}  # tag to tag_id mapping
+        self.pred_layer = 1  # at which layer to predict
+        self.model = dynet.Model()  # init model
+        self.in_dim = in_dim
+        self.h_dim = h_dim
+        self.c_in_dim = c_in_dim
+        self.activation = activation
+        self.lower = lower
+        self.noise_sigma = noise_sigma
+        self.h_layers = h_layers
+        self.predictors = {"inner": [], "output_layers_dict": {},
+                           "task_expected_at": {}}  # the inner layers and predictors
+        self.wembeds = None  # lookup: embeddings for words
+        self.cembeds = None  # lookup: embeddings for characters
+        self.embeds_file = embeds_file
+        self.char_rnn = None  # RNN for character input
+
+    def pick_neg_log(self, pred, gold):
+        return -dynet.log(dynet.pick(pred, gold))
+
+    def set_indices(self, w2i, c2i, tag2idx):
+        self.tag2idx = tag2idx
+        self.w2i = w2i
+        self.c2i = c2i
+
+    def fit(self, train_X, train_Y, num_epochs, train_algo, val_X=None, val_Y=None, patience=2, model_path=None,
+            seed=None):
+        """
+        train the tagger
+        """
+        print("read training data", file=sys.stderr)
+
+        if seed:
+            print(">>> using seed: ", seed, file=sys.stderr)
+            random.seed(seed)  # setting random seed
+
+        # init lookup parameters and define graph
+        print("build graph", file=sys.stderr)
+
+        num_words = len(self.w2i)
+        num_chars = len(self.c2i)
+
+        self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)
+
+        if train_algo == "sgd":
+            trainer = dynet.SimpleSGDTrainer(self.model)
+        elif train_algo == "adam":
+            trainer = dynet.AdamTrainer(self.model)
+        else:
+            raise ValueError('%s is not a valid optimizer.' % train_algo)
+
+        assert (len(train_X) == len(train_Y))
+        train_data = list(zip(train_X, train_Y))
+
+        print('Starting training for %d epochs...' % num_epochs)
+        best_val_acc, epochs_no_improvement = 0., 0
+        if val_X is not None and val_Y is not None and model_path is not None:
+            print('Using early stopping with patience of %d...' % patience)
+        for cur_iter in range(num_epochs):
+            total_loss = 0.0
+            total_tagged = 0.0
+            random.shuffle(train_data)
+            for ((word_indices, char_indices), y) in train_data:
+                # use same predict function for training and testing
+                output = self.predict(word_indices, char_indices, train=True)
+
+                loss1 = dynet.esum([self.pick_neg_log(pred, gold) for pred, gold in zip(output, y)])
+                lv = loss1.value()
+                total_loss += lv
+                total_tagged += len(word_indices)
+
+                loss1.backward()
+                trainer.update()
+            total_loss = total_loss / total_tagged
+            print("epoch {2} {0:>12}: {1:.2f}".format("total loss", total_loss, cur_iter))
+
+            # get the best accuracy on the validation set
+            val_correct, val_total = self.evaluate(val_X, val_Y)
+            val_accuracy = val_correct / val_total
+
+            if val_X is not None and val_Y is not None and model_path is not None:
+                if val_accuracy > best_val_acc:
+                    print('Accuracy %.4f is better than best val accuracy %.4f.' % (val_accuracy, best_val_acc))
+                    best_val_acc = val_accuracy
+                    epochs_no_improvement = 0
+                    save(self, model_path)
+                else:
+                    print('Accuracy %.4f is worse than best val loss %.4f.' % (val_accuracy, best_val_acc))
+                    epochs_no_improvement += 1
+                if epochs_no_improvement == patience:
+                    print('No improvement for %d epochs. Early stopping...' % epochs_no_improvement)
+                    break
+
+    def build_computation_graph(self, num_words, num_chars):
+        """
+        build graph and link to parameters
+        """
+        # initialize the word embeddings and the parameters
+        if self.embeds_file:
+            print("loading embeddings", file=sys.stderr)
+            embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
+            assert (emb_dim == self.in_dim)
+            num_words = len(set(embeddings.keys()).union(set(self.w2i.keys())))  # initialize all with embeddings
+            # init model parameters and initialize them
+            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+            init = 0
+            l = len(embeddings.keys())
+            for word in embeddings.keys():
+                # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers)
+                if word in self.w2i:
+                    wembeds.init_row(self.w2i[word], embeddings[word])
+                else:
+                    self.w2i[word] = len(self.w2i.keys())  # add new word
+                    wembeds.init_row(self.w2i[word], embeddings[word])
+                init += 1
+            print("initialized: {}".format(init), file=sys.stderr)
+
+        else:
+            wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+            cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+        # make it more flexible to add number of layers as specified by parameter
+        layers = []  # inner layers
+        # print("h_layers:", self.h_layers, file=sys.stderr)
+        for layer_num in range(0, self.h_layers):
+            # print(">>>", layer_num, "layer_num")
+
+            if layer_num == 0:
+                builder = dynet.LSTMBuilder(1, self.in_dim + self.c_in_dim * 2, self.h_dim,
+                                            self.model)  # in_dim: size of each layer
+                layers.append(BiRNNSequencePredictor(builder))  # returns forward and backward sequence
+            else:
+                # add inner layers (if h_layers >1)
+                builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
+                layers.append(BiRNNSequencePredictor(builder))
+
+                # store at which layer to predict task
+
+        task_num_labels = len(self.tag2idx)
+        output_layer = FFSequencePredictor(Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))
+
+        char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
+
+        predictors = {}
+        predictors["inner"] = layers
+        predictors["output_layers_dict"] = output_layer
+        predictors["task_expected_at"] = self.h_layers
+
+        return predictors, char_rnn, wembeds, cembeds
+
+    def get_features(self, words):
+        """
+        from a list of words, return the word and word char indices
+        """
+        word_indices = []
+        word_char_indices = []
+        for word in words:
+            if word in self.w2i:
+                word_indices.append(self.w2i[word])
+            else:
+                word_indices.append(self.w2i["_UNK"])
+
+            chars_of_word = [self.c2i["<w>"]]
+            for char in word:
+                if char in self.c2i:
+                    chars_of_word.append(self.c2i[char])
+                else:
+                    chars_of_word.append(self.c2i["_UNK"])
+            chars_of_word.append(self.c2i["</w>"])
+            word_char_indices.append(chars_of_word)
+        return word_indices, word_char_indices
+
+    def get_data_as_indices(self, file_name):
+        """
+        X = list of (word_indices, word_char_indices)
+        Y = list of tag indices
+        """
+        X, Y = [], []
+        org_X, org_Y = [], []
+
+        for (words, tags) in read_conll_file(file_name):
+            word_indices, word_char_indices = self.get_features(words)
+            tag_indices = [self.tag2idx.get(tag) for tag in tags]
+            X.append((word_indices, word_char_indices))
+            Y.append(tag_indices)
+            org_X.append(words)
+            org_Y.append(tags)
+        return X, Y  # , org_X, org_Y - for now don't use
+
+    def get_data_as_indices_from_instances(self, dev_words, dev_tags):
+        """
+        Extension of get_data_as_indices. Use words and tags rather than a file as input.
+        X = list of (word_indices, word_char_indices)
+        Y = list of tag indices
+        """
+        X, Y = [], []
+        org_X, org_Y = [], []
+
+        for (words, tags) in zip(dev_words, dev_tags):
+            word_indices, word_char_indices = self.get_features(words)
+            tag_indices = [self.tag2idx.get(tag) for tag in tags]
+            X.append((word_indices, word_char_indices))
+            Y.append(tag_indices)
+            org_X.append(words)
+            org_Y.append(tags)
+        return X, Y  # , org_X, org_Y - for now don't use
+
+    def predict(self, word_indices, char_indices, train=False):
+        """
+        predict tags for a sentence represented as char+word embeddings
+        """
+        dynet.renew_cg()  # new graph
+
+        char_emb = []
+        rev_char_emb = []
+        # get representation for words
+        for chars_of_token in char_indices:
+            # use last state as word representation
+            last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1]
+            rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1]
+            char_emb.append(last_state)
+            rev_char_emb.append(rev_last_state)
+
+        wfeatures = [self.wembeds[w] for w in word_indices]
+        features = [dynet.concatenate([w, c, rev_c]) for w, c, rev_c in
+                    zip(wfeatures, char_emb, reversed(rev_char_emb))]
+
+        if train:  # only do at training time
+            features = [dynet.noise(fe, self.noise_sigma) for fe in features]
+
+        output_expected_at_layer = self.h_layers
+        output_expected_at_layer -= 1
+
+        # go through layers
+        # input is now combination of w + char emb
+        prev = features
+        num_layers = self.h_layers
+        for i in range(0, num_layers):
+            predictor = self.predictors["inner"][i]
+            forward_sequence, backward_sequence = predictor.predict_sequence(prev)
+            if i > 0 and self.activation:
+                # activation between LSTM layers
+                forward_sequence = [self.activation(s) for s in forward_sequence]
+                backward_sequence = [self.activation(s) for s in backward_sequence]
+
+            if i == output_expected_at_layer:
+                output_predictor = self.predictors["output_layers_dict"]
+                concat_layer = [dynet.concatenate([f, b]) for f, b in
+                                zip(forward_sequence, reversed(backward_sequence))]
+
+                if train and self.noise_sigma > 0.0:
+                    concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]
+                output = output_predictor.predict_sequence(concat_layer)
+                return output
+
+            prev = forward_sequence
+            prev_rev = backward_sequence  # not used
+
+        raise Exception("oops should not be here")
+        return None
+
+    def evaluate(self, test_X, test_Y):
+        """
+        compute accuracy on a test file
+        """
+        correct = 0
+        total = 0.0
+
+        for i, ((word_indices, word_char_indices), gold_tag_indices) in enumerate(zip(test_X, test_Y)):
+            output = self.predict(word_indices, word_char_indices)
+            predicted_tag_indices = [np.argmax(o.value()) for o in output]
+
+            correct += sum(
+                [1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold])
+            total += len(gold_tag_indices)
+
+        return correct, total
+
+    # Get train data: need to read each train set (linked to a task) separately
+
+    def get_train_data(self, train_data):
+        """
+        transform training data to features (word indices)
+        map tags to integers
+        """
+        X = []
+        Y = []
+
+        # word 2 indices and tag 2 indices
+        w2i = {}  # word to index
+        c2i = {}  # char to index
+        tag2idx = {}  # tag2idx
+
+        w2i["_UNK"] = 0  # unk word / OOV
+        c2i["_UNK"] = 0  # unk char
+        c2i["<w>"] = 1  # word start
+        c2i["</w>"] = 2  # word end index
+
+        num_sentences = 0
+        num_tokens = 0
+        for instance_idx, (words, tags) in enumerate(read_conll_file(train_data)):
+            instance_word_indices = []  # sequence of word indices
+            instance_char_indices = []  # sequence of char indices
+            instance_tags_indices = []  # sequence of tag indices
+
+            for i, (word, tag) in enumerate(zip(words, tags)):
+
+                # map words and tags to indices
+                if word not in w2i:
+                    w2i[word] = len(w2i)
+                instance_word_indices.append(w2i[word])
+
+                chars_of_word = [c2i["<w>"]]
+                for char in word:
+                    if char not in c2i:
+                        c2i[char] = len(c2i)
+                    chars_of_word.append(c2i[char])
+                chars_of_word.append(c2i["</w>"])
+                instance_char_indices.append(chars_of_word)
+
+                if tag not in tag2idx:
+                    tag2idx[tag] = len(tag2idx)
+
+                instance_tags_indices.append(tag2idx.get(tag))
+
+                num_tokens += 1
+
+            num_sentences += 1
+
+            X.append((instance_word_indices,
+                      instance_char_indices))  # list of word indices, for every word list of char indices
+            Y.append(instance_tags_indices)
+
+        print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+        print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
+
+        assert (len(X) == len(Y))
+
+        # store mappings of words and tags to indices
+        self.set_indices(w2i, c2i, tag2idx)
+
+        return X, Y
+
+    def get_train_data_from_instances(self, train_words, train_tags):
+        """
+        Extension of get_train_data method. Extracts training data from two arrays of word and label lists.
+        transform training data to features (word indices)
+        map tags to integers
+        :param train_words: a numpy array containing lists of words
+        :param train_tags: a numpy array containing lists of corresponding tags
+        """
+        X = []
+        Y = []
+
+        # word 2 indices and tag 2 indices
+        w2i = {}  # word to index
+        c2i = {}  # char to index
+        tag2idx = {}  # tag2idx
+
+        w2i["_UNK"] = 0  # unk word / OOV
+        c2i["_UNK"] = 0  # unk char
+        c2i["<w>"] = 1  # word start
+        c2i["</w>"] = 2  # word end index
+
+        num_sentences = 0
+        num_tokens = 0
+        for instance_idx, (words, tags) in enumerate(zip(train_words, train_tags)):
+            instance_word_indices = []  # sequence of word indices
+            instance_char_indices = []  # sequence of char indices
+            instance_tags_indices = []  # sequence of tag indices
+
+            for i, (word, tag) in enumerate(zip(words, tags)):
+
+                # map words and tags to indices
+                if word not in w2i:
+                    w2i[word] = len(w2i)
+                instance_word_indices.append(w2i[word])
+
+                chars_of_word = [c2i["<w>"]]
+                for char in word:
+                    if char not in c2i:
+                        c2i[char] = len(c2i)
+                    chars_of_word.append(c2i[char])
+                chars_of_word.append(c2i["</w>"])
+                instance_char_indices.append(chars_of_word)
+
+                if tag not in tag2idx:
+                    tag2idx[tag] = len(tag2idx)
+
+                instance_tags_indices.append(tag2idx.get(tag))
+
+                num_tokens += 1
+
+            num_sentences += 1
+
+            X.append((instance_word_indices,
+                      instance_char_indices))  # list of word indices, for every word list of char indices
+            Y.append(instance_tags_indices)
+
+        print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+        print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
+
+        assert (len(X) == len(Y))
+
+        # store mappings of words and tags to indices
+        self.set_indices(w2i, c2i, tag2idx)
+
+        return X, Y
+
+
+class MyNNTaggerArgumentOptions(object):
+    def __init__(self):
+        pass
+
+    ### functions for checking arguments
+    def acfunct(arg):
+        """ check for allowed argument for --ac option """
+        try:
+            functions = [dynet.rectify, dynet.tanh]
+            functions = {function.__name__: function for function in functions}
+            functions["None"] = None
+            return functions[str(arg)]
+        except:
+            raise argparse.ArgumentTypeError("String {} does not match required format".format(arg, ))
+
+
+# call main() only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()
diff --git a/bist_parser/LICENSE b/bist_parser/LICENSE
new file mode 100644
index 0000000..8dada3e
--- /dev/null
+++ b/bist_parser/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/bist_parser/README.md b/bist_parser/README.md
new file mode 100644
index 0000000..4147304
--- /dev/null
+++ b/bist_parser/README.md
@@ -0,0 +1,75 @@
+# BIST Parsers
+## Graph & Transition based dependency parsers using BiLSTM feature extractors.
+
+The techniques behind the parser are described in the paper [Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations](https://www.transacl.org/ojs/index.php/tacl/article/viewFile/885/198). Further materials can be found [here](http://elki.cc/#/article/Simple%20and%20Accurate%20Dependency%20Parsing%20Using%20Bidirectional%20LSTM%20Feature%20Representations).
+
+#### Required software
+
+ * Python 2.7 interpreter
+ * [DyNet library](https://github.com/clab/dynet/tree/master/python)
+
+#### Train a parsing model
+
+The software requires having a `training.conll` and `development.conll` files formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat).
+For the faster graph-based parser change directory to `bmstparser` (1200 words/sec), and for the more accurate transition-based parser change directory to `barchybrid` (800 words/sec). The benchmark was performed on a MacBook Pro with an i7 processor. The graph-based parser achieves an accuracy of 93.8 UAS and the transition-based parser an accuracy of 94.7 UAS on the standard Penn Treebank dataset (Stanford Dependencies). The transition-based parser requires no part-of-speech tagging and setting all the tags to NN will produce the expected accuracy. The model and param files achieving those scores are available for download ([Graph-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AADgBS9hb9vy0o-UBZW9AbbKa/bestfirstorder.tar.gz?dl=0), [Transition-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AACEPp3DLQeJnRA_QyPmll93a/bestarchybrid.tar.gz?dl=0)). The trained models include improvements beyond those described in the paper, to be published soon.
+
+To train a parsing model with either parsing architecture, type the following at the command prompt:
+
+    python src/parser.py --dynet-seed 123456789 [--dynet-mem XXXX] --outdir [results directory] --train training.conll --dev development.conll --epochs 30 --lstmdims 125 --lstmlayers 2 [--extrn extrn.vectors] --bibi-lstm
+
+We use the same external embedding used in [Transition-Based Dependency Parsing with Stack Long Short-Term Memory](http://arxiv.org/abs/1505.08075) which can be downloaded from the authors [github repository](https://github.com/clab/lstm-parser/) and [directly here](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing).
+
+If you are training a transition-based parser then for optimal results you should add the following to the command prompt `--k 3 --usehead --userl`. These switches will set the stack to 3 elements; use the BiLSTM of the head of trees on the stack as feature vectors; and add the BiLSTM of the right/leftmost children to the feature vectors.
+
+Note 1: You can run it without pos embeddings by setting the pos embedding dimensions to zero (--pembedding 0).
+
+Note 2: The reported test result is the one matching the highest development score.
+
+Note 3: The parser calculates (after each iteration) the accuracies excluding punctuation symbols by running the `eval.pl` script from the CoNLL-X Shared Task and stores the results in directory specified by the `--outdir`.
+
+Note 4: The external embeddings parameter is optional and is best left unused when training or predicting with a graph-based model.
+
+#### Parse data with your parsing model
+
+The command for parsing a `test.conll` file formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat) with a previously trained model is:
+
+    python src/parser.py --predict --outdir [results directory] --test test.conll [--extrn extrn.vectors] --model [trained model file] --params [param file generated during training]
+
+The parser will store the resulting conll file in the out directory (`--outdir`).
+
+Note 1: If you are using the arc-hybrid trained model we provided please use the `--extrn` flag and specify the location of the external embeddings file.
+
+Note 2: If you are using the first-order trained model we provided please do not use the `--extrn` flag.
+
+#### Citation
+
+If you make use of this software for research purposes, we would appreciate it if you cite the following:
+
+    @article{DBLP:journals/tacl/KiperwasserG16,
+        author    = {Eliyahu Kiperwasser and Yoav Goldberg},
+        title     = {Simple and Accurate Dependency Parsing Using Bidirectional {LSTM}
+               Feature Representations},
+        journal   = {{TACL}},
+        volume    = {4},
+        pages     = {313--327},
+        year      = {2016},
+        url       = {https://transacl.org/ojs/index.php/tacl/article/view/885},
+        timestamp = {Tue, 09 Aug 2016 14:51:09 +0200},
+        biburl    = {http://dblp.uni-trier.de/rec/bib/journals/tacl/KiperwasserG16},
+        bibsource = {dblp computer science bibliography, http://dblp.org}
+    }
+
+#### License
+
+This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+#### Contact
+
+For questions and usage issues, please contact elikip@gmail.com
+
+#### Credits
+
+[Eliyahu Kiperwasser](http://elki.cc)
+
+[Yoav Goldberg](https://www.cs.bgu.ac.il/~yoavg/uni/)
+
diff --git a/bist_parser/__init__.py b/bist_parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bist_parser/barchybrid/src/arc_hybrid.py b/bist_parser/barchybrid/src/arc_hybrid.py
new file mode 100644
index 0000000..2d74fe4
--- /dev/null
+++ b/bist_parser/barchybrid/src/arc_hybrid.py
@@ -0,0 +1,401 @@
+from dynet import *
+from utils import ParseForest, read_conll, write_conll
+from operator import itemgetter
+from itertools import chain
+import utils, time, random
+import numpy as np
+
+
+class ArcHybridLSTM:
+    def __init__(self, words, pos, rels, w2i, options):
+        self.model = Model()
+        self.trainer = AdamTrainer(self.model)
+        random.seed(1)
+
+        self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
+        self.activation = self.activations[options.activation]
+
+        self.oracle = options.oracle
+        self.ldims = options.lstm_dims * 2
+        self.wdims = options.wembedding_dims
+        self.pdims = options.pembedding_dims
+        self.rdims = options.rembedding_dims
+        self.layers = options.lstm_layers
+        self.wordsCount = words
+        self.vocab = {word: ind+3 for word, ind in w2i.iteritems()}
+        self.pos = {word: ind+3 for ind, word in enumerate(pos)}
+        self.rels = {word: ind for ind, word in enumerate(rels)}
+        self.irels = rels
+
+        self.headFlag = options.headFlag
+        self.rlMostFlag = options.rlMostFlag
+        self.rlFlag = options.rlFlag
+        self.k = options.window
+
+        self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)
+
+        self.external_embedding = None
+        if options.external_embedding is not None:
+            external_embedding_fp = open(options.external_embedding,'r')
+            external_embedding_fp.readline()
+            self.external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] for line in external_embedding_fp}
+            external_embedding_fp.close()
+
+            self.edim = len(self.external_embedding.values()[0])
+            self.noextrn = [0.0 for _ in xrange(self.edim)]
+            self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
+            self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
+            for word, i in self.extrnd.iteritems():
+                self.elookup.init_row(i, self.external_embedding[word])
+            self.extrnd['*PAD*'] = 1
+            self.extrnd['*INITIAL*'] = 2
+
+            print 'Load external embedding. Vector dimensions', self.edim
+
+        dims = self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0)
+        self.blstmFlag = options.blstmFlag
+        self.bibiFlag = options.bibiFlag
+
+        if self.bibiFlag:
+            self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model),
+                                    VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model)]
+            self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model),
+                                     VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)]
+        elif self.blstmFlag:
+            if self.layers > 0:
+                self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)]
+            else:
+                self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)]
+
+        self.hidden_units = options.hidden_units
+        self.hidden2_units = options.hidden2_units
+        self.vocab['*PAD*'] = 1
+        self.pos['*PAD*'] = 1
+
+        self.vocab['*INITIAL*'] = 2
+        self.pos['*INITIAL*'] = 2
+
+        self.wlookup = self.model.add_lookup_parameters((len(words) + 3, self.wdims))
+        self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
+        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))
+
+        self.word2lstm = self.model.add_parameters((self.ldims, self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0)))
+        self.word2lstmbias = self.model.add_parameters((self.ldims))
+        self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims))
+        self.lstm2lstmbias = self.model.add_parameters((self.ldims))
+
+        self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1)))
+        self.hidBias = self.model.add_parameters((self.hidden_units))
+
+        self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+        self.hid2Bias = self.model.add_parameters((self.hidden2_units))
+
+        self.outLayer = self.model.add_parameters((3, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+        self.outBias = self.model.add_parameters((3))
+
+        self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1)))
+        self.rhidBias = self.model.add_parameters((self.hidden_units))
+
+        self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+        self.rhid2Bias = self.model.add_parameters((self.hidden2_units))
+
+        self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+        self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1))
+
+
+    def __evaluate(self, stack, buf, train):
+        topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [self.empty] for i in xrange(self.k) ]
+        topBuffer = [ buf.roots[i].lstms if len(buf) > i else [self.empty] for i in xrange(1) ]
+
+        input = concatenate(list(chain(*(topStack + topBuffer))))
+
+        if self.hidden2_units > 0:
+            routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr())
+        else:
+            routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr())
+
+        if self.hidden2_units > 0:
+            output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr())
+        else:
+            output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr())
+
+        scrs, uscrs = routput.value(), output.value()
+
+        uscrs0 = uscrs[0]
+        uscrs1 = uscrs[1]
+        uscrs2 = uscrs[2]
+        if train:
+            output0 = output[0]
+            output1 = output[1]
+            output2 = output[2]
+            ret = [ [ (rel, 0, scrs[1 + j * 2] + uscrs1, routput[1 + j * 2 ] + output1) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [],
+                    [ (rel, 1, scrs[2 + j * 2] + uscrs2, routput[2 + j * 2 ] + output2) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [],
+                    [ (None, 2, scrs[0] + uscrs0, routput[0] + output0) ] if len(buf) > 0 else [] ]
+        else:
+            s1,r1 = max(zip(scrs[1::2],self.irels))
+            s2,r2 = max(zip(scrs[2::2],self.irels))
+            s1 += uscrs1
+            s2 += uscrs2
+            ret = [ [ (r1, 0, s1) ] if len(stack) > 0 and len(buf) > 0 else [],
+                    [ (r2, 1, s2) ] if len(stack) > 1 else [],
+                    [ (None, 2, scrs[0] + uscrs0) ] if len(buf) > 0 else [] ]
+        return ret
+        #return [ [ (rel, 0, scrs[1 + j * 2 + 0] + uscrs[1], routput[1 + j * 2 + 0] + output[1]) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [],
+        #         [ (rel, 1, scrs[1 + j * 2 + 1] + uscrs[2], routput[1 + j * 2 + 1] + output[2]) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [],
+        #         [ (None, 2, scrs[0] + uscrs[0], routput[0] + output[0]) ] if len(buf) > 0 else [] ]
+
+
+    def Save(self, filename):
+        self.model.save(filename)
+
+
+    def Load(self, filename):
+        self.model.load(filename)
+
+    def Init(self):
+        evec = self.elookup[1] if self.external_embedding is not None else None
+        paddingWordVec = self.wlookup[1]
+        paddingPosVec = self.plookup[1] if self.pdims > 0 else None
+
+        paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec])) + self.word2lstmbias.expr() )
+        self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)])
+
+
+    def getWordEmbeddings(self, sentence, train):
+        """Attach input and contextual vectors to every entry of ``sentence``.
+
+        For each entry this sets ``wordvec``, ``posvec``, ``evec`` and their
+        concatenation ``ivec``, then sets ``vec`` either from the
+        forward/backward builders (when ``self.blstmFlag`` is set, with a
+        second pass when ``self.bibiFlag`` is also set) or from a ``tanh``
+        projection through ``word2lstm``.
+
+        Args:
+            sentence: sequence of entries exposing ``norm``, ``form`` and ``pos``.
+            train: when true, a word's own embedding is randomly replaced by
+                row 0 with a probability that shrinks as its count grows.
+        """
+        for root in sentence:
+            c = float(self.wordsCount.get(root.norm, 0))
+            # Despite its name, dropFlag is True when the word's own embedding is
+            # KEPT: always outside training, otherwise with probability c/(0.25+c).
+            dropFlag =  not train or (random.random() < (c/(0.25+c)))
+            # Row 0 serves both dropped words and words missing from the vocabulary.
+            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0]
+            root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None
+
+            if self.external_embedding is not None:
+                #if not dropFlag and random.random() < 0.5:
+                #    root.evec = self.elookup[0]
+                # Prefer the surface form, then the normalized form, else row 0.
+                if root.form in self.external_embedding:
+                    root.evec = self.elookup[self.extrnd[root.form]]
+                elif root.norm in self.external_embedding:
+                    root.evec = self.elookup[self.extrnd[root.norm]]
+                else:
+                    root.evec = self.elookup[0]
+            else:
+                root.evec = None
+            # filter(None, ...) removes the components that were set to None above.
+            root.ivec = concatenate(filter(None, [root.wordvec, root.posvec, root.evec]))
+
+        if self.blstmFlag:
+            forward  = self.surfaceBuilders[0].initial_state()
+            backward = self.surfaceBuilders[1].initial_state()
+
+            # One loop drives both directions: the forward state consumes the
+            # sentence left-to-right while the backward state consumes it reversed.
+            for froot, rroot in zip(sentence, reversed(sentence)):
+                forward = forward.add_input( froot.ivec )
+                backward = backward.add_input( rroot.ivec )
+                froot.fvec = forward.output()
+                rroot.bvec = backward.output()
+            for root in sentence:
+                root.vec = concatenate( [root.fvec, root.bvec] )
+
+            if self.bibiFlag:
+                # Second pass: same scheme, fed with the first pass's vectors.
+                bforward  = self.bsurfaceBuilders[0].initial_state()
+                bbackward = self.bsurfaceBuilders[1].initial_state()
+
+                for froot, rroot in zip(sentence, reversed(sentence)):
+                    bforward = bforward.add_input( froot.vec )
+                    bbackward = bbackward.add_input( rroot.vec )
+                    froot.bfvec = bforward.output()
+                    rroot.bbvec = bbackward.output()
+                for root in sentence:
+                    root.vec = concatenate( [root.bfvec, root.bbvec] )
+
+        else:
+            # No recurrent builders: project the input vector and squash it.
+            for root in sentence:
+                root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr()
+                root.vec = tanh( root.ivec )
+
+
+    def Predict(self, conll_path):
+        """Parse every sentence in ``conll_path`` and yield it with predictions.
+
+        This is a generator. For each sentence read by ``read_conll`` it
+        repeatedly takes the highest-scoring transition from ``__evaluate``
+        until the buffer is empty and at most one item remains on the stack,
+        writing ``pred_parent_id`` and ``pred_relation`` on each attached entry.
+
+        Args:
+            conll_path: path of the file to read.
+
+        Yields:
+            The sentence as a list, restored to the order ``read_conll`` gave.
+        """
+        with open(conll_path, 'r') as conllFP:
+            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
+                self.Init()
+
+                # Move the first entry to the end for parsing; undone at the yield.
+                # NOTE(review): presumably the first entry is the artificial root -- confirm in read_conll.
+                sentence = sentence[1:] + [sentence[0]]
+                self.getWordEmbeddings(sentence, False)
+                stack = ParseForest([])
+                buf = ParseForest(sentence)
+
+                for root in sentence:
+                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]
+
+                # Slot 0 of lstms is the head vector when headFlag is set, so
+                # the child slots start one position later.
+                hoffset = 1 if self.headFlag else 0
+
+                while len(buf) > 0 or len(stack) > 1 :
+                    scores = self.__evaluate(stack, buf, False)
+                    # Candidates are (relation, transition code, score) tuples.
+                    best = max(chain(*scores), key = itemgetter(2) )
+
+                    if best[1] == 2:
+                        # Shift: move the first buffer item onto the stack.
+                        stack.roots.append(buf.roots[0])
+                        del buf.roots[0]
+
+                    elif best[1] == 0:
+                        # Left arc: the stack top becomes a child of the first buffer item.
+                        child = stack.roots.pop()
+                        parent = buf.roots[0]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 0
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+
+                    elif best[1] == 1:
+                        # Right arc: the stack top becomes a child of the item below it.
+                        child = stack.roots.pop()
+                        parent = stack.roots[-1]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 1
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+
+                # NOTE(review): renew_cg presumably starts a fresh dynet computation
+                # graph, which would be why Init() is re-run for every sentence -- confirm.
+                renew_cg()
+                yield [sentence[-1]] + sentence[:-1]
+
+
+    def Train(self, conll_path):
+        """Run one training pass over the CoNLL file at ``conll_path``.
+
+        Sentences are read with the projectivity filter on, shuffled and parsed
+        with transitions 0 (left-arc), 1 (right-arc) and 2 (shift).  A margin
+        loss between the best zero-cost and the best costly transition is
+        collected and back-propagated once more than 50 terms are pending.
+        Progress is printed every 100 sentences; the last line printed is the
+        summed margin loss divided by the number of training sentences.
+        """
+        mloss = 0.0
+        errors = 0
+        eloss = 0.0
+        eerrors = 0
+        lerrors = 0
+        etotal = 0
+        ninf = -float('inf')
+
+        hoffset = 1 if self.headFlag else 0
+
+        start = time.time()
+
+        with open(conll_path, 'r') as conllFP:
+            shuffledData = list(read_conll(conllFP, True))
+            random.shuffle(shuffledData)
+
+            errs = []
+
+            self.Init()
+
+            for iSentence, sentence in enumerate(shuffledData):
+                if iSentence % 100 == 0 and iSentence != 0:
+                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
+                    start = time.time()
+                    eerrors = 0
+                    eloss = 0.0
+                    etotal = 0
+                    lerrors = 0
+                # read_conll puts the root entry first; move it to the end of the sentence.
+                sentence = sentence[1:] + [sentence[0]]
+                self.getWordEmbeddings(sentence, True)
+                stack = ParseForest([])
+                buf = ParseForest(sentence)
+
+                for root in sentence:
+                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]
+
+                while len(buf) > 0 or len(stack) > 1 :
+                    scores = self.__evaluate(stack, buf, True)
+                    scores.append([(None, 3, ninf ,None)])  # sentinel with cost 1, so bestWrong always has a candidate
+                    # Split the configuration: s0/s1 are the top two stack items, b is the buffer front.
+                    alpha = stack.roots[:-2] if len(stack) > 2 else []
+                    s1 = [stack.roots[-2]] if len(stack) > 1 else []
+                    s0 = [stack.roots[-1]] if len(stack) > 0 else []
+                    b = [buf.roots[0]] if len(buf) > 0 else []
+                    beta = buf.roots[1:] if len(buf) > 1 else []
+                    # Each cost counts gold heads and gold dependents of s0 (or b) among the listed items; 1 if the move has no scores.
+                    left_cost  = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) +
+                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[0]) > 0 else 1
+                    right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) +
+                                   len([d for d in b + beta if d.parent_id == s0[0].id]) )  if len(scores[1]) > 0 else 1
+                    shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
+                                   len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) )  if len(scores[2]) > 0 else 1
+                    costs = (left_cost, right_cost, shift_cost, 1)
+                    # Best zero-cost candidate carrying the gold label vs. best candidate that is costly or mislabeled.
+                    bestValid = max(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or  s[0] == stack.roots[-1].relation ) ), key=itemgetter(2))
+                    bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or  ( s[1] != 2 and s[0] != stack.roots[-1].relation ) ), key=itemgetter(2))
+                    best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong
+
+                    if best[1] == 2:
+                        stack.roots.append(buf.roots[0])
+                        del buf.roots[0]
+
+                    elif best[1] == 0:
+                        child = stack.roots.pop()
+                        parent = buf.roots[0]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 0
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+
+                    elif best[1] == 1:
+                        child = stack.roots.pop()
+                        parent = stack.roots[-1]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 1
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+                    # Margin loss: add a term whenever the valid score does not beat the wrong one by at least 1.
+                    if bestValid[2] < bestWrong[2] + 1.0:
+                        loss = bestWrong[3] - bestValid[3]
+                        mloss += 1.0 + bestWrong[2] - bestValid[2]
+                        eloss += 1.0 + bestWrong[2] - bestValid[2]
+                        errs.append(loss)
+
+                    if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
+                        lerrors += 1
+                        if child.pred_parent_id != child.parent_id:
+                            errors += 1
+                            eerrors += 1
+
+                    etotal += 1
+
+                if len(errs) > 50: # or True:
+                    eerrs = esum(errs)
+                    eerrs.scalar_value()  # evaluate the summed loss before calling backward()
+                    eerrs.backward()
+                    self.trainer.update()
+                    errs = []
+
+                    renew_cg()
+                    self.Init()
+
+        if len(errs) > 0:
+            eerrs = esum(errs)
+            eerrs.scalar_value()
+            eerrs.backward()
+            self.trainer.update()
+
+            errs = []
+
+            renew_cg()
+
+        self.trainer.update_epoch()
+        print "Loss: ", mloss / max(len(shuffledData), 1)  # per-sentence average; safe for 0 or 1 sentences
diff --git a/bist_parser/barchybrid/src/parser.py b/bist_parser/barchybrid/src/parser.py
new file mode 100644
index 0000000..8ddbe95
--- /dev/null
+++ b/bist_parser/barchybrid/src/parser.py
@@ -0,0 +1,76 @@
+from optparse import OptionParser
+from arc_hybrid import ArcHybridLSTM
+import pickle, utils, os, time, sys, pipes
+
+if __name__ == '__main__':
+    # Command-line driver: train an ArcHybridLSTM parser, or with --predict load a saved one and parse the test file.
+    parser = OptionParser()
+    parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/PTB_SD_3_3_0/train.conll")
+    parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/PTB_SD_3_3_0/dev.conll")
+    parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/PTB_SD_3_3_0/test.conll")
+    parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
+    parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
+    parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="barchybrid.model")
+    parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
+    parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
+    parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
+    parser.add_option("--epochs", type="int", dest="epochs", default=30)
+    parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
+    parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
+    parser.add_option("--k", type="int", dest="window", default=3)
+    parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
+    parser.add_option("--outdir", type="string", dest="output", default="results")
+    parser.add_option("--activation", type="string", dest="activation", default="tanh")
+    parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
+    parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=200)
+    parser.add_option("--dynet-seed", type="int", dest="seed", default=7)
+    parser.add_option("--disableoracle", action="store_false", dest="oracle", default=True)
+    parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
+    parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
+    parser.add_option("--usehead", action="store_true", dest="headFlag", default=False)
+    # NOTE(review): --userlmost stores into rlFlag and --userl into rlMostFlag; the dest names look crossed - confirm before renaming.
+    parser.add_option("--userlmost", action="store_true", dest="rlFlag", default=False)
+    parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False)
+    parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
+    parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512)
+
+    (options, args) = parser.parse_args()
+    print 'Using external embedding:', options.external_embedding
+    if not os.path.isdir(options.output):  # both branches below write into --outdir
+        os.makedirs(options.output)
+
+    if not options.predictFlag:
+        if not (options.rlFlag or options.rlMostFlag or options.headFlag):
+            print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
+            sys.exit(1)
+
+        print 'Preparing vocab'
+        words, w2i, pos, rels = utils.vocab(options.conll_train)
+        with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
+            pickle.dump((words, w2i, pos, rels, options), paramsfp)
+        print 'Finished collecting vocab'
+        print 'Initializing blstm arc hybrid:'
+        parser = ArcHybridLSTM(words, pos, rels, w2i, options)
+        for epoch in xrange(options.epochs):
+            print 'Starting epoch', epoch
+            parser.Train(options.conll_train)
+            devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
+            utils.write_conll(devpath, parser.Predict(options.conll_dev))
+            os.system('perl src/utils/eval.pl -g ' + pipes.quote(options.conll_dev) + ' -s ' + pipes.quote(devpath) + ' > ' + pipes.quote(devpath + '.txt') + ' &')
+            print 'Finished predicting dev'
+            parser.Save(os.path.join(options.output, options.model + str(epoch+1)))
+    else:
+        # NOTE: unpickling runs arbitrary code; only load params files from a trusted source. The path is used as given (not joined with --outdir).
+        with open(options.params, 'rb') as paramsfp:
+            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
+        stored_opt.external_embedding = options.external_embedding
+        parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
+        parser.Load(options.model)
+        tespath = os.path.join(options.output, 'test_pred.conll')
+        ts = time.time()
+        pred = list(parser.Predict(options.conll_test))
+        te = time.time()
+        utils.write_conll(tespath, pred)
+        os.system('perl src/utils/eval.pl -g ' + pipes.quote(options.conll_test) + ' -s ' + pipes.quote(tespath) + ' > ' + pipes.quote(tespath + '.txt') + ' &')
+        print 'Finished predicting test',te-ts
+
diff --git a/bist_parser/barchybrid/src/utils.py b/bist_parser/barchybrid/src/utils.py
new file mode 100644
index 0000000..7b21851
--- /dev/null
+++ b/bist_parser/barchybrid/src/utils.py
@@ -0,0 +1,114 @@
+from collections import Counter
+import re
+
+
+class ConllEntry:
+    """A single CoNLL token: id, form, normalized form, upper-cased tags, parent id and relation."""
+    def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
+        self.id, self.form = id, form
+        # The normalized form comes from the module-level normalize().
+        self.norm = normalize(form)
+        self.cpos, self.pos = cpos.upper(), pos.upper()
+        # Parent id and relation label are optional and default to None.
+        self.parent_id, self.relation = parent_id, relation
+
+
+class ParseForest:
+    """List of entries (``roots``) whose per-parse attributes are reset on construction."""
+    def __init__(self, sentence):
+        # Keep a shallow copy of the sequence, then reset the bookkeeping fields of every entry.
+        self.roots = list(sentence)
+        for node in self.roots:
+            node.children = []
+            node.scores = node.parent = None
+            node.pred_parent_id = 0
+            node.pred_relation = 'rroot'
+            node.vecs = None
+            node.lstms = None
+
+    def __len__(self):
+        # Number of entries currently held.
+        return len(self.roots)
+
+    def Attach(self, parent_index, child_index):
+        """Set roots[child_index].pred_parent_id to the id of roots[parent_index], then remove the child."""
+        head = self.roots[parent_index]
+        dependent = self.roots[child_index]
+        dependent.pred_parent_id = head.id
+        self.roots.pop(child_index)
+
+
+def isProj(sentence):
+    """Return True if attaching adjacent entries (a child with no pending dependents to its neighbouring gold parent) leaves one root."""
+    forest = ParseForest(sentence)
+    pending = Counter(entry.parent_id for entry in sentence)  # per id: gold dependents not yet attached
+    for _ in xrange(len(sentence)):
+        for pos in xrange(len(forest.roots) - 1):
+            left, right = forest.roots[pos], forest.roots[pos + 1]
+            if left.parent_id == right.id and pending[left.id] == 0:
+                pending[right.id] -= 1
+                forest.Attach(pos + 1, pos)
+                break
+            if right.parent_id == left.id and pending[right.id] == 0:
+                pending[left.id] -= 1
+                forest.Attach(pos, pos + 1)
+                break
+    return len(forest.roots) == 1
+
+def vocab(conll_path):
+    """Return (word counts, word -> index dict, POS tags, relations) gathered from a CoNLL file."""
+    wordsCount, posCount, relCount = Counter(), Counter(), Counter()
+    with open(conll_path, 'r') as conllFP:
+        # proj=True asks read_conll to drop non-projective sentences.
+        for sentence in read_conll(conllFP, True):
+            for node in sentence:
+                wordsCount[node.norm] += 1
+                posCount[node.pos] += 1
+                relCount[node.relation] += 1
+    w2i = dict((word, index) for index, word in enumerate(wordsCount.keys()))
+    return (wordsCount, w2i, posCount.keys(), relCount.keys())
+
+def read_conll(fh, proj):
+    """Yield each sentence of an open CoNLL file as a list of ConllEntry, root entry first.
+
+    When proj is true, sentences failing isProj() are reported and skipped; a final sentence with no blank line after it is treated the same way.
+    """
+    from itertools import chain
+    dropped = read = 0
+    root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')  # the same object heads every sentence
+    tokens = [root]
+    for line in chain(fh, ['']):  # the extra '' acts as a closing blank line
+        tok = line.strip().split()
+        if tok:
+            tokens.append(ConllEntry(int(tok[0]), tok[1], tok[4], tok[3], int(tok[6]) if tok[6] != '_' else -1, tok[7]))
+            continue
+        if len(tokens) > 1:
+            if not proj or isProj(tokens):
+                yield tokens
+            else:
+                print 'Non-projective sentence dropped'
+                dropped += 1
+            read += 1
+        tokens = [root]
+    print dropped, 'dropped non-projective sentences.'
+    print read, 'sentences read.'
+
+
+def write_conll(fn, conll_gen):
+    """Write sentences to fn as tab-separated 10-column rows (first entry of each sentence skipped), one blank line per sentence."""
+    with open(fn, 'w') as fh:
+        for sentence in conll_gen:
+            for entry in sentence[1:]:
+                fh.write('\t'.join((str(entry.id), entry.form, '_', entry.cpos, entry.pos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_')) + '\n')
+            fh.write('\n')
+
+
+numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+")  # applied with .match(), so only the start of the token is anchored
+def normalize(word):
+    return word.lower() if numberRegex.match(word) is None else 'NUM'  # any token that starts with a digit becomes 'NUM'
+
+cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB", 
+             "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV", 
+             ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET", 
+             "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS", 
+             "-LRB-": ".", "-RRB-": "."}  # lookup from fine-grained tag to coarse tag; not referenced by any function in utils.py itself
diff --git a/bist_parser/barchybrid/src/utils/eval.pl b/bist_parser/barchybrid/src/utils/eval.pl
new file mode 100644
index 0000000..3db9837
--- /dev/null
+++ b/bist_parser/barchybrid/src/utils/eval.pl
@@ -0,0 +1,1826 @@
+#!/usr/bin/env perl
+
+# Author: Yuval Krymolowski
+# Addition of precision and recall 
+#   and of frame confusion list: Sabine Buchholz
+# Addition of DEPREL + ATTACHMENT:
+#   Prokopis Prokopidis (prokopis at ilsp dot gr)
+# Acknowledgements: 
+#   to Markus Kuhn for suggesting the use of 
+#   the Unicode category property
+
+if ($] < 5.008001)  # $] is the version of the running perl interpreter
+{
+  printf STDERR <<EOM
+
+ This script requires PERL 5.8.1 for running.
+ The new version is needed for proper handling
+ of Unicode characters.
+
+ Please obtain a new version or contact the shared task team
+ if you are unable to upgrade PERL.
+
+EOM
+;
+  exit(1) ;  # stop with a non-zero status before anything is evaluated
+}
+
+require Encode;
+
+use strict ;
+use warnings;
+use Getopt::Std ;
+
+my ($usage) = <<EOT
+
+  CoNLL-X evaluation script:
+
+   [perl] eval.pl [OPTIONS] -g <gold standard> -s <system output>
+
+  This script evaluates a system output with respect to a gold standard.
+  Both files should be in UTF-8 encoded CoNLL-X tabular format.
+
+  Punctuation tokens (those where all characters have the Unicode
+  category property "Punctuation") are ignored for scoring (unless the
+  -p flag is used).
+
+  The output breaks down the errors according to their type and context.
+
+  Optional parameters:
+     -o FILE : output: print output to FILE (default is standard output)
+     -q : quiet:       only print overall performance, without the details
+     -b : evalb:       produce output in a format similar to evalb 
+                       (http://nlp.cs.nyu.edu/evalb/); use together with -q
+     -p : punctuation: also score on punctuation (default is not to score on it)
+     -v : version:     show the version number
+     -h : help:        print this help text and exit
+
+EOT
+;  # $usage is the help text; the main program dies with it for -h or when neither -g nor -s is given
+# File-level state and constants shared by the subroutines below.
+my ($line_num) ;  # input line counter, incremented by read_sent for every pair of lines read
+my ($sep) = '0x01' ;  # NOTE(review): this is the 4-character string "0x01", not the byte \x01 - confirm intended
+
+my ($START) = '.S' ;  # returned by get_context for positions before the first token
+my ($END) = '.E' ;  # returned by get_context for positions after the last token
+
+my ($con_err_num) = 3 ;  # presumably a limit for the context-error listing - not used in the subs below; verify
+my ($freq_err_num) = 10 ;  # presumably a limit for the frequent-error listing - verify against the main program
+my ($spec_err_loc_con) = 8 ;  # presumably a limit for the local-context listing - verify against the main program
+
+################################################################################
+###                              subfunctions                                ###
+################################################################################
+
+# Whether a string consists entirely of characters with the Unicode
+# category property "Punctuation" (see "man perlunicode")
+sub is_uni_punct
+{
+  my ($token) = @_ ;
+  my $chars = Encode::decode_utf8($token) ;  # match on decoded characters, not on raw bytes
+  return scalar($chars =~ /^\p{Punctuation}+$/) ;
+}
+
+# The length of a unicode string, excluding non-spacing marks
+# (for example vowel marks in Arabic)
+
+sub uni_len
+{
+  # Number of characters in a UTF-8 encoded string once non-spacing
+  # marks are left out: the string is decoded, split into single
+  # characters, and the characters not matching \p{NonspacingMark}
+  # are counted.
+  my ($text) = @_ ;
+  my (@chars, $visible) ;
+
+  @chars = split(//, Encode::decode_utf8($text)) ;
+
+  # grep in scalar context returns how many elements passed the test
+  $visible = grep { $_ !~ /^\p{NonspacingMark}/ } @chars ;
+
+  return $visible ;
+}
+
+sub filter_context_counts
+{ # filter_context_counts
+  # Keep only the entries of %{$vec} whose count reaches the $num-th largest
+  # count, and raise ${$max_len} to the longest uni_len() among the kept keys.
+  my ($vec, $num, $max_len) = @_ ;
+  my ($con, $l, $thresh) ;
+  $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;
+  # fewer than $num entries: nothing is cut, and no comparison against undef
+  $thresh = 0 unless defined $thresh ;
+
+  foreach $con (keys %{$vec})
+  {
+    if (${$vec}{$con} < $thresh)
+    {
+      delete ${$vec}{$con} ;
+      next ;
+    }
+
+    $l = uni_len($con) ;
+    if ($l > ${$max_len})
+    {
+      ${$max_len} = $l ;
+    }
+  }
+} # filter_context_counts
+
+sub print_context
+{ # print_context
+  # Print two side-by-side tables to OUT: CPOS contexts on the left, word
+  # contexts on the right, each sorted by decreasing {tot} count.  The
+  # columns show {tot} ('any'), {err_head}, {err_dep} and, as 'both',
+  # err_dep + err_head - tot.
+  my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;
+  my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ;
+
+  printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;
+  printf OUT "  ||" ;
+  printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;
+  printf OUT "\n" ;
+  printf OUT "  %s-+------+------+------+-----", '-' x $max_con_pos_len;
+  printf OUT "--++" ;
+  printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+  printf OUT "\n" ;
+
+  @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ;
+  @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ;
+
+  $n = scalar @v_con ;
+  if (scalar @v_con_pos > $n)
+  {
+    $n = scalar @v_con_pos ;
+  }
+
+  foreach $i (0 .. $n-1)
+  {
+    if (defined $v_con_pos[$i])
+    {
+      $con_pos = $v_con_pos[$i] ;
+      printf OUT "  %-*s | %4d | %4d | %4d | %4d",
+	$max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos},
+	  ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos},
+	    ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ;
+    }
+    else
+    {
+      printf OUT "  %-*s | %4s | %4s | %4s | %4s",
+	$max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT "  ||" ;
+    # the word field is widened by length($con) - uni_len($con); presumably to keep columns aligned for multi-byte text - confirm
+    if (defined $v_con[$i])
+    {
+      $con = $v_con[$i] ;
+      printf OUT "  %-*s | %4d | %4d | %4d | %4d",
+	$max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con},
+	  ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con},
+	    ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ;
+    }
+    else
+    {
+      printf OUT "  %-*s | %4s | %4s | %4s | %4s",
+	$max_con_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+    printf OUT "\n" ;
+  }
+
+  printf OUT "  %s-+------+------+------+-----", '-' x $max_con_pos_len;
+  printf OUT "--++" ;
+  printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+  printf OUT "\n" ;
+  printf OUT "\n\n" ;
+} # print_context
+
+sub num_as_word
+{
+  # Turn a (possibly negative) distance into a phrase: the magnitudes
+  # 1 to 4 are spelled out ('one word' ... 'four words'); any other
+  # value keeps its digits and is followed by ' words'.
+
+  my ($num) = @_ ;
+  my ($dist, @spelled) ;
+
+  # index 0 is unused so that $spelled[$dist] lines up with $dist
+  @spelled = (undef,
+              'one word',
+              'two words',
+              'three words',
+              'four words') ;
+
+  $dist = abs($num) ;
+
+  # only the exact integers 1..4 have a spelled-out form
+  if (($dist >= 1) && ($dist <= 4) && ($dist == int($dist)))
+  {
+    return ($spelled[$dist]) ;
+  }
+
+  return ($dist.' words') ;
+
+}
+
+sub describe_err
+{ # describe_err
+  # Build the text describing one error type.  Expected arguments (as the
+  # main loop builds them - confirm at the call site): $head_err is '-' or
+  # the signed head offset; $head_aft_bef is two characters, gold position
+  # then system position, each '0', 'e' (the focus word), 'a' (after) or
+  # 'b' (before); $dep_err is '-' or "gold->system".
+  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
+  my ($dep_g, $dep_s, $desc) ;
+  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;
+
+  if ($head_err eq '-')
+  {
+    $desc = 'correct head' ;
+    if ($head_aft_bef_s eq '0')
+    {
+      $desc .= ' (0)' ;
+    }
+    elsif ($head_aft_bef_s eq 'e')
+    {
+      $desc .= ' (the focus word)' ;
+    }
+    elsif ($head_aft_bef_s eq 'a')
+    {
+      $desc .= ' (after the focus word)' ;
+    }
+    elsif ($head_aft_bef_s eq 'b')
+    {
+      $desc .= ' (before the focus word)' ;
+    }
+  }
+  elsif ($head_aft_bef_s eq '0')
+  {
+    $desc = 'head = 0 instead of ' ;
+    if ($head_aft_bef_g eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_g eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word' ;
+  }
+  elsif ($head_aft_bef_g eq '0')
+  {
+    $desc = 'head is ' ;
+    if ($head_aft_bef_s eq 'a')  # gold is '0' in this branch, so the system position is what gets described
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_s eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word instead of 0' ;
+  }
+  else
+  {
+    $desc = num_as_word($head_err) ;
+    if ($head_err < 0)
+    {
+      $desc .= ' before' ;
+    }
+    else
+    {
+      $desc .= ' after' ;
+    }
+    $desc = 'head '.$desc.' the correct head ' ;
+    if ($head_aft_bef_s eq '0')
+    {
+      $desc .= '(0' ;
+    }
+    elsif ($head_aft_bef_s eq 'e')
+    {
+      $desc .= '(the focus word' ;
+    }
+    elsif ($head_aft_bef_s eq 'a')
+    {
+      $desc .= '(after the focus word' ;
+    }
+    elsif ($head_aft_bef_s eq 'b')
+    {
+      $desc .= '(before the focus word' ;
+    }
+
+    if ($head_aft_bef_g ne $head_aft_bef_s)
+    {
+      $desc .= ' instead of ' ;  # what follows is the gold position, separated by a space
+      if ($head_aft_bef_g eq '0')
+      {
+        $desc .= '0' ;
+      }
+      elsif ($head_aft_bef_g eq 'e')
+      {
+        $desc .= 'the focus word' ;
+      }
+      elsif ($head_aft_bef_g eq 'a')
+      {
+        $desc .= 'after the focus word' ;
+      }
+      elsif ($head_aft_bef_g eq 'b')
+      {
+        $desc .= 'before the focus word' ;
+      }
+    }
+    $desc .= ')' ;
+  }
+
+  $desc .= ', ' ;
+
+  if ($dep_err eq '-')
+  {
+    $desc .= 'correct dependency' ;
+  }
+  else
+  {
+    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
+    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
+  }
+
+  return($desc) ;
+
+} # describe_err
+
+sub get_context
+{ # get_context
+
+  # Return the word forms, then the POS tags, found at offsets -2, -1,
+  # +1 and +2 from token $i_w of the sentence @{$sent}:
+  #
+  #   ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2)
+  #
+  # Each element of @{$sent} is a hash reference read through its
+  # 'word' and 'pos' keys.  A position before the first token yields
+  # $START and a position after the last token yields $END, for the
+  # word and the tag alike.
+
+  my ($sent, $i_w) = @_ ;
+  my (@words, @tags) ;
+  my ($offset, $j, $outside, $edge) ;
+
+  # index of the last token in the sentence
+  my $last = scalar(@{$sent}) - 1 ;
+
+  foreach $offset (-2, -1, 1, 2)
+  {
+    $j = $i_w + $offset ;
+
+    # left neighbours are only checked against the sentence start,
+    # right neighbours only against the sentence end
+    if ($offset < 0)
+    {
+      $outside = ($j < 0) ;
+      $edge = $START ;
+    }
+    else
+    {
+      $outside = ($j > $last) ;
+      $edge = $END ;
+    }
+
+    if ($outside)
+    {
+      push @words, $edge ;
+      push @tags, $edge ;
+    }
+    else
+    {
+      push @words, ${${$sent}[$j]}{word} ;
+      push @tags, ${${$sent}[$j]}{pos} ;
+    }
+  }
+
+  # words first, then tags, each in offset order; slices keep the
+  # scalar-context result equal to the last tag
+  return (@words[0 .. 3], @tags[0 .. 3]) ;
+
+} # get_context
+
+sub read_sent
+{ # read_sent
+  # Read one sentence from the GOLD and SYS filehandles in lockstep, storing a
+  # {word,pos,head,dep} hash per token in @{$sent_gold} and @{$sent_sys}.
+  # Returns 1 when both files are exhausted and 0 at a sentence boundary;
+  # exits with status 1 if the two files fall out of step.
+  my ($sent_gold, $sent_sys) = @_ ;
+  my ($line_g, $line_s, $new_sent) ;
+  my (%fields_g, %fields_s) ;
+
+  $new_sent = 1 ;  # set here and cleared below, but not read anywhere else in this sub
+  @{$sent_gold} = () ;
+  @{$sent_sys} = () ;
+
+  while (1)
+  { # main reading loop
+
+    $line_g = <GOLD> ;
+    $line_s = <SYS> ;
+
+    $line_num++ ;
+
+    # system output has fewer lines than gold standard
+    if ((defined $line_g) && (! defined $line_s))
+    {
+	printf STDERR "line mismatch, line %d:\n", $line_num ;
+	printf STDERR " gold: %s", $line_g ;
+	printf STDERR " sys : past end of file\n" ;
+	exit(1) ;
+    }
+
+    # system output has more lines than gold standard
+    if ((! defined $line_g) && (defined $line_s))
+    {
+	printf STDERR "line mismatch, line %d:\n", $line_num ;
+	printf STDERR " gold: past end of file\n" ;
+	printf STDERR " sys : %s", $line_s ;
+	exit(1) ;
+    }
+
+    # end of file reached for both
+    if ((! defined $line_g) && (! defined $line_s))
+    {
+	return (1) ;
+    }
+
+    # one contains end of sentence but other one does not
+    if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of sentence reached
+    if ($line_g =~ /^\s+$/)
+    {
+	return(0) ;
+    }
+
+    # now both lines contain information
+    if ($new_sent)
+    {
+      $new_sent = 0 ;
+    }
+
+    # CoNLL-X columns: id, form, lemma, cpostag, postag,
+    #                  feats, head, deprel, phead, pdeprel
+    # the slice [1, 3, 6, 7] picks form, cpostag, head and deprel
+
+    @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ;
+
+    push @{$sent_gold}, { %fields_g } ;
+
+    @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ;
+    # a word/pos mismatch is only reported on STDERR; reading continues
+    if (($fields_g{word} ne $fields_s{word})
+	||
+	($fields_g{pos} ne $fields_s{pos}))
+    {
+      printf STDERR "Word/pos mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      #exit(1) ;
+    }
+
+    push @{$sent_sys}, { %fields_s } ;
+
+  } # main reading loop
+} # read_sent
+
+################################################################################
+###                                  main                                    ###
+################################################################################
+
+our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ;
+
+my ($sent_num, $eof, $word_num, @err_sent) ;
+my (@sent_gold, @sent_sys, @starts) ;
+my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ;
+my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ;
+my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ;
+my ($loc_con, %loc_con_err_counts, %err_desc) ;
+my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ;
+my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ;
+my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ;
+my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ;
+my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ;
+my (%freq_err, $err) ;
+
+my ($i, $j, $i_w, $l, $n_args) ;
+my ($w_2, $w_1, $w1, $w2) ;
+my ($wp_2, $wp_1, $wp1, $wp2) ;
+my ($p_2, $p_1, $p1, $p2) ;
+
+my ($short_output) ;
+my ($score_on_punct) ;
+$counts{punct} = 0; # initialize
+
+getopts("g:o:s:qvhpb") ;
+
+if (defined $opt_v)
+{
+    my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $';
+    my @parts = split ' ',$id;
+    print "Version $parts[2]\n";
+    exit(0);
+}
+
+if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s)))
+{
+  die $usage ;
+}
+
+if (! defined $opt_g)
+{
+  die "Gold standard file (-g) missing\n" ;
+}
+
+if (! defined $opt_s)
+{
+  die "System output file (-s) missing\n" ;
+}
+
+if (! defined $opt_o)
+{
+  $opt_o = '-' ;
+}
+
+if (defined $opt_q)
+{
+    $short_output = 1 ;
+} else {
+    $short_output = 0 ;
+}
+
+if (defined $opt_p)
+{
+    $score_on_punct = 1 ;
+} else {
+    $score_on_punct = 0 ;
+}
+
+$line_num = 0 ;
+$sent_num = 0 ;
+$eof = 0 ;
+
+@err_sent = () ;
+@starts = () ;
+
+%{$err_sent[0]} = () ;
+
+$max_pos_len = length('CPOS') ;
+
+################################################################################
+###                              reading input                               ###
+################################################################################
+
+open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ;
+open (SYS,  "<$opt_s") || die "Could not open system output file $opt_s\n" ;
+open (OUT,  ">$opt_o") || die "Could not open output file $opt_o\n" ;
+
+
+if (defined $opt_b) {  # produce output similar to evalb
+    print OUT "     Sent.          Attachment      Correct        Scoring          \n";
+    print OUT "    ID Tokens  -   Unlab. Lab.   HEAD HEAD+DEPREL   tokens   - - - -\n";
+    print OUT "  ============================================================================\n";
+}
+
+
+while (! $eof)
+{ # main reading loop
+
+  $starts[$sent_num] = $line_num+1 ;
+  $eof = read_sent(\@sent_gold, \@sent_sys) ;
+
+  $sent_num++ ;
+
+  %{$err_sent[$sent_num]} = () ;
+  $word_num = scalar @sent_gold ;
+
+  # for accuracy per sentence
+  my %sent_counts = ( tot      => 0,
+		      err_any  => 0,
+		      err_head => 0
+		      ); 
+
+  # printf "$sent_num $word_num\n" ;
+
+  my @frames_g = ('** '); # the initial frame for the virtual root
+  my @frames_s = ('** '); # the initial frame for the virtual root
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+      push @frames_g, ''; # initialize
+      push @frames_s, ''; # initialize
+  }
+
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+
+    ($word, $pos, $head_g, $dep_g)
+      = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+    $wp = $word.' / '.$pos ;
+
+    # printf "%d: %s %s %s %s\n", $i_w,  $word, $pos, $head_g, $dep_g ;
+
+    if ((! $score_on_punct) && is_uni_punct($word))
+    {
+      $counts{punct}++ ;
+      # ignore punctuations
+      next ;
+    }
+
+    if (length($pos) > $max_pos_len)
+    {
+      $max_pos_len = length($pos) ;
+    }
+
+    ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+    $counts{tot}++ ;
+    $counts{word}{$wp}{tot}++ ;
+    $counts{pos}{$pos}{tot}++ ;
+    $counts{head}{$head_g-$i_w-1}{tot}++ ;
+
+    # for frame confusions
+    # add child to frame of parent
+    $frames_g[$head_g] .= "$dep_g ";
+    $frames_s[$head_s] .= "$dep_s ";
+    # add to frame of token itself
+    $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero
+    $frames_s[$i_w+1] .= "*$dep_g* ";
+
+    # for precision and recall of DEPREL
+    $counts{dep}{$dep_g}{tot}++ ;     # counts for gold standard deprels
+    $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions
+    $counts{dep_s}{$dep_s}{tot}++ ;   # counts for system deprels
+    $counts{all_dep}{$dep_g} = 1 ;    # list of all deprels that occur ...
+    $counts{all_dep}{$dep_s} = 1 ;    # ... in either gold or system output
+
+    # for precision and recall of HEAD direction
+    my $dir_g;
+    if ($head_g == 0) {
+	$dir_g = 'to_root';
+    } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero
+                                 # also below
+	$dir_g = 'left';
+    } elsif ($head_g > $i_w+1) {
+	$dir_g = 'right';
+    } else {
+        # token links to itself; should never happen in correct gold standard
+	$dir_g = 'self'; 
+    }
+    my $dir_s;
+    if ($head_s == 0) {
+	$dir_s = 'to_root';
+    } elsif ($head_s < $i_w+1) {
+	$dir_s = 'left';
+    } elsif ($head_s > $i_w+1) {
+	$dir_s = 'right';
+    } else {
+        # token links to itself; should not happen in good system 
+        # (but not forbidden in shared task)
+	$dir_s = 'self'; 
+    }
+    $counts{dir_g}{$dir_g}{tot}++ ;   # counts for gold standard head direction
+    $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions
+    $counts{dir_s}{$dir_s}{tot}++ ;   # counts for system head direction
+
+    # for precision and recall of HEAD distance
+    my $dist_g;
+    if ($head_g == 0) {
+	$dist_g = 'to_root';
+    } elsif ( abs($head_g - ($i_w+1)) <= 1 ) {
+	$dist_g = '1'; # includes the 'self' cases
+    } elsif ( abs($head_g - ($i_w+1)) <= 2 ) {
+	$dist_g = '2';
+    } elsif ( abs($head_g - ($i_w+1)) <= 6 ) {
+	$dist_g = '3-6';
+    } else {
+	$dist_g = '7-...';
+    }
+    my $dist_s;
+    if ($head_s == 0) {
+	$dist_s = 'to_root';
+    } elsif ( abs($head_s - ($i_w+1)) <= 1 ) {
+	$dist_s = '1'; # includes the 'self' cases
+    } elsif ( abs($head_s - ($i_w+1)) <= 2 ) {
+	$dist_s = '2';
+    } elsif ( abs($head_s - ($i_w+1)) <= 6 ) {
+	$dist_s = '3-6';
+    } else {
+	$dist_s = '7-...';
+    }
+    $counts{dist_g}{$dist_g}{tot}++ ;    # counts for gold standard head distance
+    $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions
+    $counts{dist_s}{$dist_s}{tot}++ ;    # counts for system head distance
+
+
+    $err_head = ($head_g ne $head_s) ; # error in head
+    $err_dep = ($dep_g ne $dep_s) ;    # error in deprel
+
+    $head_err = '-' ;
+    $dep_err = '-' ;
+
+    # for accuracy per sentence
+    $sent_counts{tot}++ ;
+    if ($err_dep || $err_head) {
+	$sent_counts{err_any}++ ;
+    }
+    if ($err_head) {
+	$sent_counts{err_head}++ ;
+    }
+
+    # total counts and counts for CPOS involved in errors
+
+    if ($head_g eq '0')
+    {
+      $head_aft_bef_g = '0' ;
+    }
+    elsif ($head_g eq $i_w+1)
+    {
+      $head_aft_bef_g = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    if ($head_s eq '0')
+    {
+      $head_aft_bef_s = '0' ;
+    }
+    elsif ($head_s eq $i_w+1)
+    {
+      $head_aft_bef_s = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+    if ($err_head)
+    {
+      if ($head_aft_bef_s eq '0')
+      {
+	$head_err = 0 ;
+      }
+      else
+      {
+	$head_err = $head_s-$head_g ;
+      }
+
+      $err_sent[$sent_num]{head}++ ;
+      $counts{err_head}{tot}++ ;
+      $counts{err_head}{$head_err}++ ;
+
+      $counts{word}{err_head}{$wp}++ ;
+      $counts{pos}{$pos}{err_head}{tot}++ ;
+      $counts{pos}{$pos}{err_head}{$head_err}++ ;
+    }
+
+    if ($err_dep)
+    {
+      $dep_err = $dep_g.'->'.$dep_s ;
+      $err_sent[$sent_num]{dep}++ ;
+      $counts{err_dep}{tot}++ ;
+      $counts{err_dep}{$dep_err}++ ;
+
+      $counts{word}{err_dep}{$wp}++ ;
+      $counts{pos}{$pos}{err_dep}{tot}++ ;
+      $counts{pos}{$pos}{err_dep}{$dep_err}++ ;
+
+      if ($err_head)
+      {
+	$counts{err_both}++ ;
+	$counts{pos}{$pos}{err_both}++ ;
+      }
+    }
+
+    ### DEPREL + ATTACHMENT
+    if ((!$err_dep) && ($err_head)) {
+	$counts{err_head_corr_dep}{tot}++ ;
+	$counts{err_head_corr_dep}{$dep_s}++ ;
+    }
+    ### DEPREL + ATTACHMENT
+
+    # counts for words involved in errors
+
+    if (! ($err_head || $err_dep))
+    {
+      next ;
+    }
+
+    $err_sent[$sent_num]{word}++ ;
+    $counts{err_any}++ ;
+    $counts{word}{err_any}{$wp}++ ;
+    $counts{pos}{$pos}{err_any}++ ;
+
+    ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+    if ($w_2 ne $START)
+    {
+      $wp_2 = $w_2.' / '.$p_2 ;
+    }
+    else
+    {
+      $wp_2 = $w_2 ;
+    }
+
+    if ($w_1 ne $START)
+    {
+      $wp_1 = $w_1.' / '.$p_1 ;
+    }
+    else
+    {
+      $wp_1 = $w_1 ;
+    }
+
+    if ($w1 ne $END)
+    {
+      $wp1 = $w1.' / '.$p1 ;
+    }
+    else
+    {
+      $wp1 = $w1 ;
+    }
+
+    if ($w2 ne $END)
+    {
+      $wp2 = $w2.' / '.$p2 ;
+    }
+    else
+    {
+      $wp2 = $w2 ;
+    }
+
+    $con_bef = $wp_1 ;
+    $con_bef_2 = $wp_2.' + '.$wp_1 ;
+    $con_aft = $wp1 ;
+    $con_aft_2 = $wp1.' + '.$wp2 ;
+
+    $con_pos_bef = $p_1 ;
+    $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+    $con_pos_aft = $p1 ;
+    $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+    if ($w_1 ne $START)
+    {
+      # do not count '.S' as a word context
+      $counts{con_bef_2}{tot}{$con_bef_2}++ ;
+      $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ;
+      $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ;
+      $counts{con_bef}{tot}{$con_bef}++ ;
+      $counts{con_bef}{err_head}{$con_bef} += $err_head ;
+      $counts{con_bef}{err_dep}{$con_bef} += $err_dep ;
+    }
+
+    if ($w1 ne $END)
+    {
+      # do not count '.E' as a word context
+      $counts{con_aft_2}{tot}{$con_aft_2}++ ;
+      $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ;
+      $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ;
+      $counts{con_aft}{tot}{$con_aft}++ ;
+      $counts{con_aft}{err_head}{$con_aft} += $err_head ;
+      $counts{con_aft}{err_dep}{$con_aft} += $err_dep ;
+    }
+
+    $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ;
+    $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ;
+    $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ;
+    $counts{con_pos_bef}{tot}{$con_pos_bef}++ ;
+    $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ;
+    $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ;
+
+    $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ;
+    $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ;
+    $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ;
+    $counts{con_pos_aft}{tot}{$con_pos_aft}++ ;
+    $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ;
+    $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ;
+
+    $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+    $freq_err{$err}++ ;
+
+  } # loop on words
+
+  foreach $i_w (0 .. $word_num) # including one for the virtual root
+  { # loop on words
+      if ($frames_g[$i_w] ne $frames_s[$i_w]) {
+	  $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ;
+      }
+  }
+
+  if (defined $opt_b) { # produce output similar to evalb
+      if ($word_num > 0) {
+	  my ($unlabeled,$labeled) = ('NaN', 'NaN');
+	  if ($sent_counts{tot} > 0) { # there are scoring tokens
+	      $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot};
+	      $labeled   = 100-$sent_counts{err_any} *100.0/$sent_counts{tot};
+	  }
+	  printf OUT "  %4d %4d    0  %6.2f %6.2f  %4d    %4d        %4d    0 0 0 0\n", 
+	  $sent_num, $word_num, 
+	  $unlabeled, $labeled, 
+	  $sent_counts{tot}-$sent_counts{err_head}, 
+	  $sent_counts{tot}-$sent_counts{err_any}, 
+	  $sent_counts{tot},;
+      }
+  }
+
+} # main reading loop
+
+################################################################################
+###                             printing output                              ###
+################################################################################
+
+if (defined $opt_b) {  # produce output similar to evalb
+    print OUT "\n\n";
+}
+
+# Overall scores, each printed as "correct / total * 100 = percentage".
+# If no token was scored, $counts{tot} is 0 (or undefined) and dividing by it
+# would abort the script with "Illegal division by zero".  In that case the
+# percentage is printed as NaN, in the same way as the evalb-style
+# per-sentence lines do for sentences without scoring tokens.
+my $n_scored = $counts{tot} || 0 ;
+
+foreach my $score (['Labeled   attachment score:', $counts{err_any}],
+                   ['Unlabeled attachment score:', $counts{err_head}{tot}],
+                   ['Label accuracy score:      ', $counts{err_dep}{tot}])
+{
+    my ($title, $n_err) = @$score ;
+    $n_err = 0 unless defined $n_err ;   # this kind of error never occurred
+
+    my $percent = ($n_scored > 0)
+        ? sprintf("%.2f", 100-$n_err*100.0/$n_scored)
+        : 'NaN' ;
+
+    printf OUT "  %s %d / %d * 100 = %s %%\n",
+        $title, $n_scored-$n_err, $n_scored, $percent ;
+}
+
+if ($short_output || $n_scored == 0)
+{
+    # Either only the summary was requested, or there is nothing to break
+    # down: the 'total' rows of the tables printed below divide by the number
+    # of scored tokens as well.
+    exit(0) ;
+}
+# ------------- header of the detailed report
+
+printf OUT "\n  %s\n\n", '=' x 80 ;
+printf OUT "  Evaluation of the results in %s\n  vs. gold standard %s:\n\n", $opt_s, $opt_g ;
+
+printf OUT "  Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ;
+
+printf OUT "  Number of non-scoring tokens: $counts{punct}\n\n";
+
+# ------------- accuracy table
+#
+# One 'total' row followed by one row per CPOSTAG.  Every row gives the number
+# of words and, for head / deprel / both, the number of words that are right
+# and the corresponding percentage (100 minus the error percentage).
+
+printf OUT "  The overall accuracy and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# two-line column header
+printf OUT "  %-10s | %-5s | %-5s |   %%  | %-5s |   %%  | %-5s |   %%\n",
+  'Accuracy', 'words', 'right', 'right', 'both' ;
+printf OUT "  %-10s | %-5s | %-5s |      | %-5s |      | %-5s |\n",
+  ' ', ' ', 'head', ' dep', 'right' ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# 'total' row; all percentages are relative to $counts{tot}
+printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+  'total', $counts{tot},
+  $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot},
+  $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot},
+  $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# one row per CPOSTAG, most frequent tag first
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+    # Error counters that were never incremented are still undefined; set them
+    # to 0 here.  The zeros are stored in %counts, so the error-rate table
+    # printed afterwards reads the same initialised entries.
+    if (! defined($counts{pos}{$pos}{err_head}{tot}))
+    {
+	$counts{pos}{$pos}{err_head}{tot} = 0 ;
+    }
+    if (! defined($counts{pos}{$pos}{err_dep}{tot}))
+    {
+	$counts{pos}{$pos}{err_dep}{tot} = 0 ;
+    }
+    if (! defined($counts{pos}{$pos}{err_any}))
+    {
+	$counts{pos}{$pos}{err_any} = 0 ;
+    }
+
+    # percentages are relative to the number of words carrying this CPOSTAG
+    printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+    $pos, $counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ;
+}
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT "\n\n" ;
+
+# ------------- error-rate table
+#
+# Same layout as the accuracy table, but every cell reports errors: the
+# number of words with a wrong head, a wrong deprel, and both wrong
+# (err_both), each with its percentage of the row's word count.
+
+printf OUT "  The overall error rate and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# two-line column header
+printf OUT "  %-10s | %-5s | %-5s |   %%  | %-5s |   %%  | %-5s |   %%\n",
+  'Error', 'words', 'head', ' dep', 'both' ;
+printf OUT "  %-10s | %-5s | %-5s |      | %-5s |      | %-5s |\n",
+
+  'Rate', ' ', 'err', ' err', 'wrong' ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# 'total' row; all percentages are relative to $counts{tot}
+printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+  'total', $counts{tot},
+  $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot},
+  $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot},
+  $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# one row per CPOSTAG, most frequent tag first
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+    # a tag for which head and deprel were never both wrong has no err_both entry yet
+    if (! defined($counts{pos}{$pos}{err_both}))
+    {
+	$counts{pos}{$pos}{err_both} = 0 ;
+    }
+
+    # percentages are relative to the number of words carrying this CPOSTAG
+    printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+    $pos, $counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ;
+    
+}
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+### added by Sabine Buchholz
+# Precision/recall table with one row for every deprel that occurs in the
+# gold standard or in the system output.
+printf OUT "\n\n";
+printf OUT "  Precision and recall of DEPREL\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  deprel          | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $label (sort keys %{$counts{all_dep}}) {
+    # value of the 'correct' column: gold and system both assigned $label
+    my $n_match = defined($counts{dep2}{$label}{$label})
+        ? $counts{dep2}{$label}{$label}
+        : 0;
+
+    # recall stays 'NaN' when the label has no gold count
+    my ($n_gold, $recall) = (0, 'NaN');
+    if (defined($counts{dep}{$label}{tot})) {
+        $n_gold = $counts{dep}{$label}{tot};
+        $recall = sprintf("%.2f", $n_match / $n_gold * 100);
+    }
+
+    # precision stays 'NaN' when the label has no system count
+    my ($n_sys, $precision) = (0, 'NaN');
+    if (defined($counts{dep_s}{$label}{tot})) {
+        $n_sys = $counts{dep_s}{$label}{tot};
+        $precision = sprintf("%.2f", $n_match / $n_sys * 100);
+    }
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+        $label, $n_gold, $n_match, $n_sys, $recall, $precision;
+}
+
+### DEPREL + ATTACHMENT:
+### Same table as the plain DEPREL one, except that a token only counts as
+### correct when its head is right as well.
+printf OUT "\n\n";
+printf OUT "  Precision and recall of DEPREL + ATTACHMENT\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  deprel          | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $label (sort keys %{$counts{all_dep}}) {
+    # tokens with the right deprel, minus those among them with a wrong head
+    my $n_match = 0;
+    if (defined($counts{dep2}{$label}{$label})) {
+        my $n_wrong_head = defined($counts{err_head_corr_dep}{$label})
+            ? $counts{err_head_corr_dep}{$label}
+            : 0;
+        $n_match = $counts{dep2}{$label}{$label} - $n_wrong_head;
+    }
+
+    # recall stays 'NaN' when the label has no gold count
+    my ($n_gold, $recall) = (0, 'NaN');
+    if (defined($counts{dep}{$label}{tot})) {
+        $n_gold = $counts{dep}{$label}{tot};
+        $recall = sprintf("%.2f", $n_match / $n_gold * 100);
+    }
+
+    # precision stays 'NaN' when the label has no system count
+    my ($n_sys, $precision) = (0, 'NaN');
+    if (defined($counts{dep_s}{$label}{tot})) {
+        $n_sys = $counts{dep_s}{$label}{tot};
+        $precision = sprintf("%.2f", $n_match / $n_sys * 100);
+    }
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+        $label, $n_gold, $n_match, $n_sys, $recall, $precision;
+}
+### DEPREL + ATTACHMENT
+
+# Precision/recall per head-direction bin, in a fixed row order.
+printf OUT "\n\n";
+printf OUT "  Precision and recall of binned HEAD direction\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  direction       | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $bin ('to_root', 'left', 'right', 'self') {
+    # value of the 'correct' column: gold and system direction are both $bin
+    my $n_match = defined($counts{dir2}{$bin}{$bin})
+        ? $counts{dir2}{$bin}{$bin}
+        : 0;
+
+    # recall stays 'NaN' when the bin never occurs in the gold standard
+    my ($n_gold, $recall) = (0, 'NaN');
+    if (defined($counts{dir_g}{$bin}{tot})) {
+        $n_gold = $counts{dir_g}{$bin}{tot};
+        $recall = sprintf("%.2f", $n_match / $n_gold * 100);
+    }
+
+    # precision stays 'NaN' when the bin never occurs in the system output
+    my ($n_sys, $precision) = (0, 'NaN');
+    if (defined($counts{dir_s}{$bin}{tot})) {
+        $n_sys = $counts{dir_s}{$bin}{tot};
+        $precision = sprintf("%.2f", $n_match / $n_sys * 100);
+    }
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+        $bin, $n_gold, $n_match, $n_sys, $recall, $precision;
+}
+
+# Precision/recall per head-distance bin, in a fixed row order.
+printf OUT "\n\n";
+printf OUT "  Precision and recall of binned HEAD distance\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  distance        | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $bin ('to_root', '1', '2', '3-6', '7-...') {
+    # value of the 'correct' column: gold and system distance are both in $bin
+    my $n_match = defined($counts{dist2}{$bin}{$bin})
+        ? $counts{dist2}{$bin}{$bin}
+        : 0;
+
+    # recall stays 'NaN' when the bin never occurs in the gold standard
+    my ($n_gold, $recall) = (0, 'NaN');
+    if (defined($counts{dist_g}{$bin}{tot})) {
+        $n_gold = $counts{dist_g}{$bin}{tot};
+        $recall = sprintf("%.2f", $n_match / $n_gold * 100);
+    }
+
+    # precision stays 'NaN' when the bin never occurs in the system output
+    my ($n_sys, $precision) = (0, 'NaN');
+    if (defined($counts{dist_s}{$bin}{tot})) {
+        $n_sys = $counts{dist_s}{$bin}{tot};
+        $precision = sprintf("%.2f", $n_match / $n_sys * 100);
+    }
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+        $bin, $n_gold, $n_match, $n_sys, $recall, $precision;
+}
+
+# List the gold/system frame pairs that differ, most frequent first.
+printf OUT "\n\n";
+printf OUT "  Frame confusions (gold versus system; *...* marks the head token)\n\n";
+foreach my $pair (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}})
+{
+    # pairs seen fewer than 5 times are not reported
+    # (make 5 a changeable threshold later)
+    next unless $counts{frame2}{$pair} >= 5;
+
+    printf OUT "  %3d  %s\n", $counts{frame2}{$pair}, $pair;
+}
+### end of: added by Sabine Buchholz
+
+
+#
+# Keep only the words most involved in errors: every word whose error count
+# is below the fifth-highest count is dropped (words tied with the fifth one
+# stay, so more than five may remain).
+#
+
+$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ;
+
+# the column must be at least as wide as its title
+$max_word_len = length('word') ;
+
+foreach $word (keys %{$counts{word}{err_any}})
+{
+  unless ($counts{word}{err_any}{$word} < $thresh)
+  {
+    # the word is kept: make sure the column is wide enough for it
+    $l = uni_len($word) ;
+    $max_word_len = $l if $l > $max_word_len ;
+    next ;
+  }
+
+  delete $counts{word}{err_any}{$word} ;
+}
+
+# filter a case when the difference between the error counts
+# for 2-word and 1-word contexts is small
+# (leave the 2-word context)
+#
+# A 2-word context key is built as "<token> + <token>" for word contexts
+# (blanks around the plus sign) and as "<CPOS>+<CPOS>" for CPOS contexts (no
+# blanks), while a 1-word key is the bare token or CPOS.  The word keys
+# therefore have to be split on ' + ': splitting on '+' alone leaves a blank
+# attached to each part, so the lookup in the 1-word table can never succeed
+# and the filter has no effect.
+
+foreach $con (keys %{$counts{con_aft_2}{tot}})
+{
+  # first part = the token directly following the focus word
+  ($w1) = split(/ \+ /, $con, 2) ;
+
+  if (defined $counts{con_aft}{tot}{$w1} &&
+      $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_aft}{tot}{$w1} ;
+  }
+}
+
+foreach $con (keys %{$counts{con_bef_2}{tot}})
+{
+  # second part = the token directly preceding the focus word
+  ($w_2, $w_1) = split(/ \+ /, $con, 2) ;
+
+  if (defined $counts{con_bef}{tot}{$w_1} &&
+      $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_bef}{tot}{$w_1} ;
+  }
+}
+
+# CPOS context keys are joined with a bare '+', so splitting on it is correct
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+  ($p1) = split(/\+/, $con_pos) ;
+
+  if (defined($counts{con_pos_aft}{tot}{$p1}) &&
+      $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_aft}{tot}{$p1} ;
+  }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+  ($p_2, $p_1) = split(/\+/, $con_pos) ;
+
+  if (defined($counts{con_pos_bef}{tot}{$p_1}) &&
+      $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_bef}{tot}{$p_1} ;
+  }
+}
+
+# Word contexts: filter_context_counts is applied to each of the four context
+# tables in turn; it receives the table, $con_err_num and a reference to
+# $max_con_len.
+
+$max_con_len = 0 ;
+
+foreach my $con_type ('con_bef_2', 'con_bef', 'con_aft', 'con_aft_2')
+{
+  filter_context_counts($counts{$con_type}{tot}, $con_err_num, \$max_con_len) ;
+}
+
+# CPOS contexts: in each of the four tables, drop every context whose count
+# is below the $con_err_num-th highest count (ties with that count are kept)
+# and record the length of the longest surviving key.
+
+$max_con_pos_len = 0 ;
+
+foreach my $con_pos_type ('con_pos_bef_2', 'con_pos_bef', 'con_pos_aft', 'con_pos_aft_2')
+{
+  $thresh = (sort {$b <=> $a} values %{$counts{$con_pos_type}{tot}})[$con_err_num-1] ;
+
+  foreach $con_pos (keys %{$counts{$con_pos_type}{tot}})
+  {
+    unless ($counts{$con_pos_type}{tot}{$con_pos} < $thresh)
+    {
+      # the context is kept: widen the column if necessary
+      $max_con_pos_len = length($con_pos) if length($con_pos) > $max_con_pos_len ;
+      next ;
+    }
+
+    delete $counts{$con_pos_type}{tot}{$con_pos} ;
+  }
+}
+
+# printing
+
+# ------------- focus words
+#
+# One row per word that survived the focus-word filtering, most error-prone
+# word first, with its number of tokens having any / head / deprel / both errors.
+
+printf OUT "\n\n" ;
+printf OUT "  %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ;
+
+printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ;
+printf OUT "  %s-+------+------+------+------\n", '-' x $max_word_len;
+
+foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}})
+{
+    # counters that were never incremented are undefined; print them as 0
+    if (!defined($counts{word}{err_head}{$word}))
+    {
+	$counts{word}{err_head}{$word} = 0 ;
+    }
+    if (! defined($counts{word}{err_dep}{$word}))
+    {
+	$counts{word}{err_dep}{$word} = 0 ;
+    }
+    if (! defined($counts{word}{err_any}{$word}))
+    {
+	$counts{word}{err_any}{$word} = 0;
+    }
+    # The field width is enlarged by length($word)-uni_len($word); presumably
+    # this compensates for multi-byte characters so that the columns line up
+    # (uni_len is defined elsewhere -- TODO confirm).
+    # 'both' is derived as dep + head - any.
+    printf OUT "  %-*s | %4d | %4d | %4d | %4d\n",
+    $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word},
+    $counts{word}{err_head}{$word},
+    $counts{word}{err_dep}{$word},
+    $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ;
+}
+
+printf OUT "  %s-+------+------+------+------\n", '-' x $max_word_len;
+
+# ------------- contexts
+#
+# For each of the four context types a title is printed and print_context
+# (defined elsewhere) is called with the word-context counts, the matching
+# CPOS-context counts and the lengths $max_con_len and $max_con_pos_len.
+
+printf OUT "\n\n" ;
+
+printf OUT "  one-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  two-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  one-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  two-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ;
+
+# ------------- Sentences
+#
+# Each of the three sections below sorts the sentence numbers 1 .. $sent_num
+# by decreasing error count (word, head or dependency errors respectively) and
+# reports the first one.  For a sentence without a count, defined() yields a
+# false value, which compares as 0.
+# $starts[$i-1] is printed as the line of the sentence; @starts is filled
+# elsewhere (presumably with the first line of each sentence -- TODO confirm).
+
+printf OUT "  Sentence with the highest number of word errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word})
+		 <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ;
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT "  Sentence with the highest number of head errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) 
+		 <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ;
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT "  Sentence with the highest number of dependency errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) 
+		 <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ;
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+#
+# Second pass, collect statistics of the frequent errors
+#
+
+$i = 0 ;
+
+# Keep only the most frequent error types: everything counted less often than
+# the $freq_err_num-th most frequent error is removed in one hash-slice delete.
+
+$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ;
+
+delete @freq_err{ grep { $freq_err{$_} < $thresh } keys %freq_err } ;
+
+# Errors tied with the threshold count are all kept, so the number of
+# retained error types is recomputed.
+
+$freq_err_num = scalar keys %freq_err ;
+
+# start with empty per-error context statistics and rewind both input files
+
+%err_counts = () ;
+$eof = 0 ;
+
+seek (GOLD, 0, 0) ;
+seek (SYS, 0, 0) ;
+
+# Second pass over the gold and system files.  For every scoring token whose
+# error type is one of the retained frequent errors, the token's local context
+# (CPOS and word before, focus word and CPOS, CPOS and word after) is counted
+# in %err_counts once for each of the 64 ways of replacing a subset of these
+# six fields with the wildcard '*'.
+while (! $eof)
+{ # second reading loop
+
+  $eof = read_sent(\@sent_gold, \@sent_sys) ;
+  # NOTE(review): $sent_num is not reset in the visible code before this pass,
+  # so it keeps counting on from the first pass; it is not read in this loop.
+  $sent_num++ ;
+
+  $word_num = scalar @sent_gold ;
+
+  # printf "$sent_num $word_num\n" ;
+  
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+    ($word, $pos, $head_g, $dep_g)
+      = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+
+    # printf "%d: %s %s %s %s\n", $i_w,  $word, $pos, $head_g, $dep_g ;
+
+    if ((! $score_on_punct) && is_uni_punct($word))
+    {
+      # ignore punctuations
+      next ;
+    }
+
+    ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+    $err_head = ($head_g ne $head_s) ;
+    $err_dep = ($dep_g ne $dep_s) ;
+
+    $head_err = '-' ;
+    $dep_err = '-' ;
+
+    # Position of the gold head relative to the token (token ids are $i_w+1):
+    # '0' = root, 'e' = the token itself, 'b' = before, 'a' = after.
+    if ($head_g eq '0')
+    {
+      $head_aft_bef_g = '0' ;
+    }
+    elsif ($head_g eq $i_w+1)
+    {
+      $head_aft_bef_g = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    # the same code for the system head
+    if ($head_s eq '0')
+    {
+      $head_aft_bef_s = '0' ;
+    }
+    elsif ($head_s eq $i_w+1)
+    {
+      $head_aft_bef_s = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+    if ($err_head)
+    {
+      # head error value: 0 if the system attached the token to the root,
+      # otherwise the signed difference between system and gold head
+      if ($head_aft_bef_s eq '0')
+      {
+	$head_err = 0 ;
+      }
+      else
+      {
+	$head_err = $head_s-$head_g ;
+      }
+    }
+
+    if ($err_dep)
+    {
+      $dep_err = $dep_g.'->'.$dep_s ;
+    }
+
+    if (! ($err_head || $err_dep))
+    {
+      next ;
+    }
+
+    # handle only the most frequent errors
+
+    # the error type is identified by head error, head position code and
+    # deprel confusion, joined with $sep
+    $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+
+    if (! exists $freq_err{$err})
+    {
+      next ;
+    }
+
+    ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+    $con_bef = $w_1 ;
+    $con_bef_2 = $w_2.' + '.$w_1 ;
+    $con_aft = $w1 ;
+    $con_aft_2 = $w1.' + '.$w2 ;
+
+    $con_pos_bef = $p_1 ;
+    $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+    $con_pos_aft = $p1 ;
+    $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+    # only the one-token contexts are part of the local context
+    @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ;
+
+    # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n",
+    #  $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ;
+    
+    # @bits is a 6-bit binary counter (least significant bit first); a field
+    # of @cur_err is kept where its bit is 1 and replaced by '*' where it is 0.
+    # $j becomes 1 when the counter wraps around to all zeros.
+    @bits = (0, 0, 0, 0, 0, 0) ;
+    $j = 0 ;
+
+    while ($j == 0)
+    {
+      # increment the counter by one
+      for ($i = 0; $i <= $#bits; $i++)
+      {
+	if ($bits[$i] == 0)
+	{
+	  $bits[$i] = 1 ;
+	  $j = 0 ;
+	  last ;
+	}
+	else
+	{
+	  $bits[$i] = 0 ;
+	  $j = 1 ;
+	}
+      }
+
+      @e_bits = @cur_err ;
+
+      # wildcard the fields whose bit is not set
+      for ($i = 0; $i <= $#bits; $i++)
+      {
+	if (! $bits[$i])
+	{
+	  $e_bits[$i] = '*' ;
+	}
+      }
+
+      # include also the last case which is the most general
+      # (wildcards for everything)
+      $err_counts{$err}{join($sep, @e_bits)}++ ;
+
+    }
+
+  } # loop on words
+} # second reading loop
+
+printf OUT "\n\n" ;
+printf OUT "  Specific errors, %d most frequent errors:", $freq_err_num ;
+printf OUT "\n  %s\n", '=' x 41 ;
+
+
+# deleting local contexts which are too general
+#
+# For every error type the local contexts are visited from most to least
+# frequent.  The list of contexts is computed once by sort/keys, so entries
+# deleted further down are still visited; the defined() tests below take care
+# of contexts that have already been removed.
+
+foreach $err (keys %err_counts)
+{
+  foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+		    keys %{$err_counts{$err}})
+  {
+    @cur_err = split(/\Q$sep\E/, $loc_con) ;
+
+    # In this loop, one or two elements of the local context are
+    # replaced with '*' to make it more general. If the entry for
+    # the general context has the same count it is removed.
+
+    foreach $i (0 .. $#cur_err)
+    {
+      # $w1 saves the original value of field $i so that it can be restored
+      $w1 = $cur_err[$i] ;
+      if ($cur_err[$i] eq '*')
+      {
+	next ;
+      }
+      $cur_err[$i] = '*' ;
+      $con1 = join($sep, @cur_err) ;
+      # generalisation with field $i wildcarded
+      if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+	   && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+      {
+	delete $err_counts{$err}{$con1} ;
+      }
+      # generalisations with field $i and one later field $j wildcarded
+      for ($j = $i+1; $j <=$#cur_err; $j++)
+      {
+	if ($cur_err[$j] eq '*')
+	{
+	  next ;
+	}
+	# $w2 saves the original value of field $j
+	$w2 = $cur_err[$j] ;
+	$cur_err[$j] = '*' ;
+	$con1 = join($sep, @cur_err) ;
+	if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+	     && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+	{
+	  delete $err_counts{$err}{$con1} ;
+	}
+	$cur_err[$j] = $w2 ;
+      }
+      $cur_err[$i] = $w1 ;
+    }
+  }
+}
+
+# Leaving only the topmost local contexts for each error
+#
+# The surviving contexts other than the all-wildcard one are also recorded in
+# %loc_con_err_counts, indexed by context and then by error type.
+
+foreach $err (keys %err_counts)
+{
+  # count of the $spec_err_loc_con-th most frequent context, or 0 if there
+  # are fewer contexts than that
+  $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ;
+
+  # if the threshold is too low, take the 2nd highest count
+  # (the highest may be the total which is the generic case
+  #   and not relevant for printing)
+
+  if ($thresh < 5)
+  {
+    # NOTE(review): with a single context left, element [1] is undefined; it
+    # then compares as 0 below and nothing is deleted.
+    $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ;
+  }
+
+  foreach $loc_con (keys %{$err_counts{$err}})
+  {
+    if ($err_counts{$err}{$loc_con} < $thresh)
+    {
+      delete $err_counts{$err}{$loc_con} ;
+    }
+    else
+    {
+      # the all-wildcard context is kept in %err_counts but not recorded here
+      if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*')))
+      {
+	$loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ;
+      }
+    }
+  }
+}
+
+# printing an error summary
+
+# Column widths for the error tables: every column starts out as wide as its
+# title and grows to the longest value found in any remaining local context.
+
+$max_word_spec_len= length('word') ;
+$max_con_aft_len = length('word') ;
+$max_con_bef_len = length('word') ;
+$max_con_pos_len = length('CPOS') ;
+
+foreach $err (keys %err_counts)
+{
+  foreach $loc_con (sort keys %{$err_counts{$err}})
+  {
+    ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $loc_con) ;
+
+    # word columns are measured with uni_len
+    $l = uni_len($word) ;
+    $max_word_spec_len = $l if $l > $max_word_spec_len ;
+
+    $l = uni_len($con_bef) ;
+    $max_con_bef_len = $l if $l > $max_con_bef_len ;
+
+    $l = uni_len($con_aft) ;
+    $max_con_aft_len = $l if $l > $max_con_aft_len ;
+
+    # both CPOS context columns share one width, measured with length
+    foreach my $con_cpos ($con_pos_aft, $con_pos_bef)
+    {
+      $max_con_pos_len = length($con_cpos) if length($con_cpos) > $max_con_pos_len ;
+    }
+  }
+}
+
+# One table per retained error type, most frequent error first.  Each table
+# lists the local contexts kept for that error together with their counts.
+# $max_pos_len is not set in this part of the script; it is expected to have
+# been computed earlier.
+
+$err_counter = 0 ;
+
+foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err)
+{
+
+  ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ;
+
+  # numbered description of the error; it is stored in %err_desc because the
+  # "several frequent errors" section printed afterwards reuses it
+  $err_counter++ ;
+  $err_desc{$err} = sprintf("%2d. ", $err_counter).
+    describe_err($head_err, $head_aft_bef, $dep_err) ;
+  
+  # printf OUT "  %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ;
+  printf OUT "\n" ;
+  printf OUT "  %s : %d times\n", $err_desc{$err}, $freq_err{$err} ;
+
+  # table header: rule, group titles (Before / Focus / After / Count),
+  # column titles, rule
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  printf OUT "  %-*s | %-*s | %-*s | %s\n",
+      $max_con_pos_len+$max_con_bef_len+3, '  Before',
+	$max_word_spec_len+$max_pos_len+3, '   Focus',
+	  $max_con_pos_len+$max_con_aft_len+3, '  After',
+	    'Count' ;
+
+  printf OUT "  %-*s   %-*s | %-*s   %-*s | %-*s   %-*s |\n",
+    $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+       $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+	$max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  # one row per local context, most frequent first
+  foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+		    keys %{$err_counts{$err}})
+  {
+    # the all-wildcard context carries no information and is not printed
+    if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*')))
+    {
+      next ;
+    }
+
+    # wildcards are shown as blanks (this replaces every '*' in the key)
+    $con1 = $loc_con ;
+    $con1 =~ s/\*/ /g ;
+
+    ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $con1) ;
+
+    # the widths of the word columns are enlarged by length(..)-uni_len(..)
+    printf OUT "  %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n",
+      $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+	  $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+	    $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft,
+	      $err_counts{$err}{$loc_con} ;
+  }
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+}
+
+# Local contexts that were kept for more than one frequent error: for each
+# such context a one-row table showing the context is printed, followed by
+# the errors it is involved in and how often.
+
+printf OUT "\n\n" ;
+printf OUT "  Local contexts involved in several frequent errors:" ;
+printf OUT "\n  %s\n", '=' x 51 ;
+printf OUT "\n\n" ;
+
+# contexts involved in the largest number of different errors come first
+foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=>
+			  scalar keys %{$loc_con_err_counts{$a}}}
+		  keys %loc_con_err_counts)
+{
+
+  # a context tied to a single error has already been shown in that error's table
+  if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1)
+  {
+    next ;
+  }
+  
+  # table header: rule, group titles, column titles, rule
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  printf OUT "  %-*s | %-*s | %-*s \n",
+      $max_con_pos_len+$max_con_bef_len+3, '  Before',
+	$max_word_spec_len+$max_pos_len+3, '   Focus',
+	  $max_con_pos_len+$max_con_aft_len+3, '  After' ;
+
+  printf OUT "  %-*s   %-*s | %-*s   %-*s | %-*s   %-*s \n",
+    $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+       $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+	$max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  # wildcards are shown as blanks (this replaces every '*' in the key)
+  $con1 = $loc_con ;
+  $con1 =~ s/\*/ /g ;
+
+  ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $con1) ;
+
+  # the single row describing the context itself
+  printf OUT "  %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n",
+    $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+      $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+	$max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;
+	  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  # the errors observed in this context, most frequent first, identified by
+  # the numbered descriptions stored in %err_desc
+  foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>
+			$loc_con_err_counts{$loc_con}{$a}}
+		keys %{$loc_con_err_counts{$loc_con}})
+  {
+    printf OUT "  %s : %d times\n", $err_desc{$err},
+      $loc_con_err_counts{$loc_con}{$err} ;
+  }
+
+  printf OUT "\n" ;
+}
+
+# all output has been written: close the two input files and the report
+close GOLD ;
+close SYS ;
+
+close OUT ;
diff --git a/bist_parser/bmstparser/src/decoder.py b/bist_parser/bmstparser/src/decoder.py
new file mode 100644
index 0000000..f93b74f
--- /dev/null
+++ b/bist_parser/bmstparser/src/decoder.py
@@ -0,0 +1,105 @@
+# This file contains routines from Lisbon Machine Learning summer school.
+# The code is freely distributed under a MIT license. https://github.com/LxMLS/lxmls-toolkit/
+
+import numpy as np
+import sys
+from collections import defaultdict, namedtuple
+from operator import itemgetter
+
+
+def parse_proj(scores, gold=None):
+    '''
+    Parse using Eisner's algorithm.
+    '''
+    nr, nc = np.shape(scores)
+    if nr != nc:
+        raise ValueError("scores must be a squared matrix with nw+1 rows")
+
+    N = nr - 1 # Number of words (excluding root).
+
+    # Initialize CKY table.
+    complete = np.zeros([N+1, N+1, 2]) # s, t, direction (right=1). 
+    incomplete = np.zeros([N+1, N+1, 2]) # s, t, direction (right=1). 
+    complete_backtrack = -np.ones([N+1, N+1, 2], dtype=int) # s, t, direction (right=1). 
+    incomplete_backtrack = -np.ones([N+1, N+1, 2], dtype=int) # s, t, direction (right=1).
+
+    incomplete[0, :, 0] -= np.inf
+
+    # Loop from smaller items to larger items.
+    for k in range(1,N+1):
+        for s in range(N-k+1):
+            t = s+k
+            
+            # First, create incomplete items.
+            # left tree
+            incomplete_vals0 = complete[s, s:t, 1] + complete[(s+1):(t+1), t, 0] + scores[t, s] + (0.0 if gold is not None and gold[s]==t else 1.0)
+            incomplete[s, t, 0] = np.max(incomplete_vals0)
+            incomplete_backtrack[s, t, 0] = s + np.argmax(incomplete_vals0)
+            # right tree
+            incomplete_vals1 = complete[s, s:t, 1] + complete[(s+1):(t+1), t, 0] + scores[s, t] + (0.0 if gold is not None and gold[t]==s else 1.0)
+            incomplete[s, t, 1] = np.max(incomplete_vals1)
+            incomplete_backtrack[s, t, 1] = s + np.argmax(incomplete_vals1)
+
+            # Second, create complete items.
+            # left tree
+            complete_vals0 = complete[s, s:t, 0] + incomplete[s:t, t, 0]
+            complete[s, t, 0] = np.max(complete_vals0)
+            complete_backtrack[s, t, 0] = s + np.argmax(complete_vals0)
+            # right tree
+            complete_vals1 = incomplete[s, (s+1):(t+1), 1] + complete[(s+1):(t+1), t, 1]
+            complete[s, t, 1] = np.max(complete_vals1)
+            complete_backtrack[s, t, 1] = s + 1 + np.argmax(complete_vals1)
+        
+    value = complete[0][N][1]
+    heads = [-1 for _ in range(N+1)] #-np.ones(N+1, dtype=int)
+    backtrack_eisner(incomplete_backtrack, complete_backtrack, 0, N, 1, 1, heads)
+
+    value_proj = 0.0
+    for m in range(1,N+1):
+        h = heads[m]
+        value_proj += scores[h,m]
+
+    return heads
+
+
+def backtrack_eisner(incomplete_backtrack, complete_backtrack, s, t, direction, complete, heads):
+    '''
+    Backtracking step in Eisner's algorithm.
+    - incomplete_backtrack is a (NW+1)-by-(NW+1) numpy array indexed by a start position,
+    an end position, and a direction flag (0 means left, 1 means right). This array contains
+    the arg-maxes of each step in the Eisner algorithm when building *incomplete* spans.
+    - complete_backtrack is a (NW+1)-by-(NW+1) numpy array indexed by a start position,
+    an end position, and a direction flag (0 means left, 1 means right). This array contains
+    the arg-maxes of each step in the Eisner algorithm when building *complete* spans.
+    - s is the current start of the span
+    - t is the current end of the span
+    - direction is 0 (left attachment) or 1 (right attachment)
+    - complete is 1 if the current span is complete, and 0 otherwise
+    - heads is a (NW+1)-sized numpy array of integers which is a placeholder for storing the 
+    head of each word.
+    '''
+    if s == t:
+        return
+    if complete:
+        r = complete_backtrack[s][t][direction]
+        if direction == 0:
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, s, r, 0, 1, heads)
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, r, t, 0, 0, heads)
+            return
+        else:
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, s, r, 1, 0, heads)
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, r, t, 1, 1, heads)
+            return
+    else:
+        r = incomplete_backtrack[s][t][direction]
+        if direction == 0:
+            heads[s] = t
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, s, r, 1, 1, heads)
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, r+1, t, 0, 1, heads)
+            return
+        else:
+            heads[t] = s
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, s, r, 1, 1, heads)
+            backtrack_eisner(incomplete_backtrack, complete_backtrack, r+1, t, 0, 1, heads)
+            return
+
diff --git a/bist_parser/bmstparser/src/mstlstm.py b/bist_parser/bmstparser/src/mstlstm.py
new file mode 100644
index 0000000..e403d59
--- /dev/null
+++ b/bist_parser/bmstparser/src/mstlstm.py
@@ -0,0 +1,496 @@
+from dynet import *
+from bist_parser.bmstparser.src.utils import read_conll, write_conll
+from bist_parser.bmstparser.src import utils, decoder
+from operator import itemgetter
+import time, random
+import numpy as np
+
+
+class MSTParserLSTM:
+    def __init__(self, vocab, pos, rels, w2i, options):
+        self.model = Model()
+        random.seed(1)
+        self.trainer = AdamTrainer(self.model)
+
+        self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
+        self.activation = self.activations[options.activation]
+
+        self.blstmFlag = options.blstmFlag
+        self.labelsFlag = options.labelsFlag
+        self.costaugFlag = options.costaugFlag
+        self.bibiFlag = options.bibiFlag
+
+        self.ldims = options.lstm_dims
+        self.wdims = options.wembedding_dims
+        self.pdims = options.pembedding_dims
+        self.rdims = options.rembedding_dims
+        self.layers = options.lstm_layers
+        self.wordsCount = vocab
+        self.vocab = {word: ind+3 for word, ind in iter(w2i.items())}
+        self.pos = {word: ind+3 for ind, word in enumerate(pos)}
+        self.rels = {word: ind for ind, word in enumerate(rels)}
+        self.irels = rels
+
+        self.external_embedding, self.edim = None, 0
+        if options.external_embedding is not None:
+            external_embedding_fp = open(options.external_embedding,'r')
+            external_embedding_fp.readline()
+            self.external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] for line in external_embedding_fp}
+            external_embedding_fp.close()
+
+            self.edim = len(self.external_embedding.values()[0])
+            self.noextrn = [0.0 for _ in range(self.edim)]
+            self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
+            self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
+            for word, i in iter(self.extrnd.items()):
+                self.elookup.init_row(i, self.external_embedding[word])
+            self.extrnd['*PAD*'] = 1
+            self.extrnd['*INITIAL*'] = 2
+
+            print('Load external embedding. Vector dimensions', self.edim)
+
+        if self.bibiFlag:
+            self.builders = [VanillaLSTMBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                             VanillaLSTMBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+            self.bbuilders = [VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
+                              VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
+        elif self.layers > 0:
+            self.builders = [VanillaLSTMBuilder(self.layers, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                             VanillaLSTMBuilder(self.layers, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+        else:
+            self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                             SimpleRNNBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+
+        self.hidden_units = options.hidden_units
+        self.hidden2_units = options.hidden2_units
+
+        self.vocab['*PAD*'] = 1
+        self.pos['*PAD*'] = 1
+
+        self.vocab['*INITIAL*'] = 2
+        self.pos['*INITIAL*'] = 2
+
+        self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
+        self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
+        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))
+
+        self.hidLayerFOH = self.model.add_parameters((self.hidden_units, self.ldims * 2))
+        self.hidLayerFOM = self.model.add_parameters((self.hidden_units, self.ldims * 2))
+        self.hidBias = self.model.add_parameters((self.hidden_units))
+
+        self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+        self.hid2Bias = self.model.add_parameters((self.hidden2_units))
+
+        self.outLayer = self.model.add_parameters((1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+
+        if self.labelsFlag:
+            self.rhidLayerFOH = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
+            self.rhidLayerFOM = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
+            self.rhidBias = self.model.add_parameters((self.hidden_units))
+
+            self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+            self.rhid2Bias = self.model.add_parameters((self.hidden2_units))
+
+            self.routLayer = self.model.add_parameters((len(self.irels), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+            self.routBias = self.model.add_parameters((len(self.irels)))
+
+
+    def  __getExpr(self, sentence, i, j, train):
+
+        if sentence[i].headfov is None:
+            sentence[i].headfov = self.hidLayerFOH.expr() * concatenate([sentence[i].lstms[0], sentence[i].lstms[1]])
+        if sentence[j].modfov is None:
+            sentence[j].modfov  = self.hidLayerFOM.expr() * concatenate([sentence[j].lstms[0], sentence[j].lstms[1]])
+
+        if self.hidden2_units > 0:
+            output = self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(sentence[i].headfov + sentence[j].modfov + self.hidBias.expr())) # + self.outBias
+        else:
+            output = self.outLayer.expr() * self.activation(sentence[i].headfov + sentence[j].modfov + self.hidBias.expr()) # + self.outBias
+
+        return output
+
+
+    def __evaluate(self, sentence, train):
+        exprs = [ [self.__getExpr(sentence, i, j, train) for j in range(len(sentence))] for i in range(len(sentence)) ]
+        scores = np.array([ [output.scalar_value() for output in exprsRow] for exprsRow in exprs ])
+
+        return scores, exprs
+
+
+    def __evaluateLabel(self, sentence, i, j):
+        if sentence[i].rheadfov is None:
+            sentence[i].rheadfov = self.rhidLayerFOH.expr() * concatenate([sentence[i].lstms[0], sentence[i].lstms[1]])
+        if sentence[j].rmodfov is None:
+            sentence[j].rmodfov  = self.rhidLayerFOM.expr() * concatenate([sentence[j].lstms[0], sentence[j].lstms[1]])
+
+        if self.hidden2_units > 0:
+            output = self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(sentence[i].rheadfov + sentence[j].rmodfov + self.rhidBias.expr())) + self.routBias.expr()
+        else:
+            output = self.routLayer.expr() * self.activation(sentence[i].rheadfov + sentence[j].rmodfov + self.rhidBias.expr()) + self.routBias.expr()
+
+        return output.value(), output
+
+
+    def Save(self, filename):
+        self.model.save(filename)
+
+
+    def Load(self, filename):
+        self.model.load(filename)
+
+
+    def Predict(self, conll_path):
+        with open(conll_path, 'r') as conllFP:
+            for iSentence, sentence in enumerate(read_conll(conllFP)):
+                for entry in sentence:
+                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
+                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                    evec = self.elookup[int(self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)))] if self.external_embedding is not None else None
+                    entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+                    entry.lstms = [entry.vec, entry.vec]
+                    entry.headfov = None
+                    entry.modfov = None
+
+                    entry.rheadfov = None
+                    entry.rmodfov = None
+
+                if self.blstmFlag:
+                    lstm_forward = self.builders[0].initial_state()
+                    lstm_backward = self.builders[1].initial_state()
+
+                    for entry, rentry in zip(sentence, reversed(sentence)):
+                        lstm_forward = lstm_forward.add_input(entry.vec)
+                        lstm_backward = lstm_backward.add_input(rentry.vec)
+
+                        entry.lstms[1] = lstm_forward.output()
+                        rentry.lstms[0] = lstm_backward.output()
+
+                    if self.bibiFlag:
+                        for entry in sentence:
+                            entry.vec = concatenate(entry.lstms)
+
+                        blstm_forward = self.bbuilders[0].initial_state()
+                        blstm_backward = self.bbuilders[1].initial_state()
+
+                        for entry, rentry in zip(sentence, reversed(sentence)):
+                            blstm_forward = blstm_forward.add_input(entry.vec)
+                            blstm_backward = blstm_backward.add_input(rentry.vec)
+
+                            entry.lstms[1] = blstm_forward.output()
+                            rentry.lstms[0] = blstm_backward.output()
+
+                scores, exprs = self.__evaluate(sentence, True)
+                heads = decoder.parse_proj(scores)
+
+                for entry, head in zip(sentence, heads):
+                    entry.pred_parent_id = head
+                    entry.pred_relation = '_'
+
+                dump = False
+
+                if self.labelsFlag:
+                    for modifier, head in enumerate(heads[1:]):
+                        scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
+                        sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
+
+                renew_cg()
+                if not dump:
+                    yield sentence
+
+    def PredictOnEntries(self, conll_entries):
+        for iSentence, sentence in enumerate(conll_entries):
+            for entry in sentence:
+                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
+                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                evec = self.elookup[int(self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)))] if self.external_embedding is not None else None
+                entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+                entry.lstms = [entry.vec, entry.vec]
+                entry.headfov = None
+                entry.modfov = None
+
+                entry.rheadfov = None
+                entry.rmodfov = None
+
+            if self.blstmFlag:
+                lstm_forward = self.builders[0].initial_state()
+                lstm_backward = self.builders[1].initial_state()
+
+                for entry, rentry in zip(sentence, reversed(sentence)):
+                    lstm_forward = lstm_forward.add_input(entry.vec)
+                    lstm_backward = lstm_backward.add_input(rentry.vec)
+
+                    entry.lstms[1] = lstm_forward.output()
+                    rentry.lstms[0] = lstm_backward.output()
+
+                if self.bibiFlag:
+                    for entry in sentence:
+                        entry.vec = concatenate(entry.lstms)
+
+                    blstm_forward = self.bbuilders[0].initial_state()
+                    blstm_backward = self.bbuilders[1].initial_state()
+
+                    for entry, rentry in zip(sentence, reversed(sentence)):
+                        blstm_forward = blstm_forward.add_input(entry.vec)
+                        blstm_backward = blstm_backward.add_input(rentry.vec)
+
+                        entry.lstms[1] = blstm_forward.output()
+                        rentry.lstms[0] = blstm_backward.output()
+
+            scores, exprs = self.__evaluate(sentence, True)
+            heads = decoder.parse_proj(scores)
+
+            for entry, head in zip(sentence, heads):
+                entry.pred_parent_id = head
+                entry.pred_relation = '_'
+
+            dump = False
+
+            if self.labelsFlag:
+                for modifier, head in enumerate(heads[1:]):
+                    scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
+                    sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
+
+            renew_cg()
+            if not dump:
+                yield sentence
+
+    def Train(self, conll_path):
+        errors = 0
+        batch = 0
+        eloss = 0.0
+        mloss = 0.0
+        eerrors = 0
+        etotal = 0
+        start = time.time()
+
+        with open(conll_path, 'r') as conllFP:
+            shuffledData = list(read_conll(conllFP))
+            random.shuffle(shuffledData)
+
+            errs = []
+            lerrs = []
+            eeloss = 0.0
+
+            for iSentence, sentence in enumerate(shuffledData):
+                if iSentence % 100 == 0 and iSentence != 0:
+                    print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start)
+                    start = time.time()
+                    eerrors = 0
+                    eloss = 0.0
+                    etotal = 0
+                    lerrors = 0
+                    ltotal = 0
+
+                for entry in sentence:
+                    c = float(self.wordsCount.get(entry.norm, 0))
+                    dropFlag = (random.random() < (c/(0.25+c)))
+                    wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
+                    posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                    evec = None
+
+                    if self.external_embedding is not None:
+                        evec = self.elookup[self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
+                    entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+                    entry.lstms = [entry.vec, entry.vec]
+                    entry.headfov = None
+                    entry.modfov = None
+
+                    entry.rheadfov = None
+                    entry.rmodfov = None
+
+                if self.blstmFlag:
+                    lstm_forward = self.builders[0].initial_state()
+                    lstm_backward = self.builders[1].initial_state()
+
+                    for entry, rentry in zip(sentence, reversed(sentence)):
+                        lstm_forward = lstm_forward.add_input(entry.vec)
+                        lstm_backward = lstm_backward.add_input(rentry.vec)
+
+                        entry.lstms[1] = lstm_forward.output()
+                        rentry.lstms[0] = lstm_backward.output()
+
+                    if self.bibiFlag:
+                        for entry in sentence:
+                            entry.vec = concatenate(entry.lstms)
+
+                        blstm_forward = self.bbuilders[0].initial_state()
+                        blstm_backward = self.bbuilders[1].initial_state()
+
+                        for entry, rentry in zip(sentence, reversed(sentence)):
+                            blstm_forward = blstm_forward.add_input(entry.vec)
+                            blstm_backward = blstm_backward.add_input(rentry.vec)
+
+                            entry.lstms[1] = blstm_forward.output()
+                            rentry.lstms[0] = blstm_backward.output()
+
+                scores, exprs = self.__evaluate(sentence, True)
+                gold = [entry.parent_id for entry in sentence]
+                heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
+
+                if self.labelsFlag:
+                    for modifier, head in enumerate(gold[1:]):
+                        rscores, rexprs = self.__evaluateLabel(sentence, head, modifier+1)
+                        goldLabelInd = self.rels[sentence[modifier+1].relation]
+                        wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
+                        if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
+                            lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
+
+                e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
+                eerrors += e
+                if e > 0:
+                    loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
+                    eloss += (e)
+                    mloss += (e)
+                    errs.extend(loss)
+
+                etotal += len(sentence)
+
+                if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
+                    eeloss = 0.0
+
+                    if len(errs) > 0 or len(lerrs) > 0:
+                        eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+                        eerrs.scalar_value()
+                        eerrs.backward()
+                        self.trainer.update()
+                        errs = []
+                        lerrs = []
+
+                    renew_cg()
+
+        if len(errs) > 0:
+            eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+            eerrs.scalar_value()
+            eerrs.backward()
+            self.trainer.update()
+
+            errs = []
+            lerrs = []
+            eeloss = 0.0
+
+            renew_cg()
+
+        self.trainer.update_epoch()
+        print("Loss: ", mloss/iSentence)
+
+    def TrainOnEntries(self, shuffledData):
+        errors = 0
+        batch = 0
+        eloss = 0.0
+        mloss = 0.0
+        eerrors = 0
+        etotal = 0
+        start = time.time()
+
+        random.shuffle(shuffledData)
+
+        errs = []
+        lerrs = []
+        eeloss = 0.0
+
+        for iSentence, sentence in enumerate(shuffledData):
+            if iSentence % 100 == 0 and iSentence != 0:
+                print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start)
+                start = time.time()
+                eerrors = 0
+                eloss = 0.0
+                etotal = 0
+                lerrors = 0
+                ltotal = 0
+
+            for entry in sentence:
+                c = float(self.wordsCount.get(entry.norm, 0))
+                dropFlag = (random.random() < (c/(0.25+c)))
+                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
+                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+                evec = None
+
+                if self.external_embedding is not None:
+                    evec = self.elookup[self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
+                entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+                entry.lstms = [entry.vec, entry.vec]
+                entry.headfov = None
+                entry.modfov = None
+
+                entry.rheadfov = None
+                entry.rmodfov = None
+
+            if self.blstmFlag:
+                lstm_forward = self.builders[0].initial_state()
+                lstm_backward = self.builders[1].initial_state()
+
+                for entry, rentry in zip(sentence, reversed(sentence)):
+                    lstm_forward = lstm_forward.add_input(entry.vec)
+                    lstm_backward = lstm_backward.add_input(rentry.vec)
+
+                    entry.lstms[1] = lstm_forward.output()
+                    rentry.lstms[0] = lstm_backward.output()
+
+                if self.bibiFlag:
+                    for entry in sentence:
+                        entry.vec = concatenate(entry.lstms)
+
+                    blstm_forward = self.bbuilders[0].initial_state()
+                    blstm_backward = self.bbuilders[1].initial_state()
+
+                    for entry, rentry in zip(sentence, reversed(sentence)):
+                        blstm_forward = blstm_forward.add_input(entry.vec)
+                        blstm_backward = blstm_backward.add_input(rentry.vec)
+
+                        entry.lstms[1] = blstm_forward.output()
+                        rentry.lstms[0] = blstm_backward.output()
+
+            scores, exprs = self.__evaluate(sentence, True)
+            gold = [entry.parent_id for entry in sentence]
+            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
+
+            if self.labelsFlag:
+                for modifier, head in enumerate(gold[1:]):
+                    rscores, rexprs = self.__evaluateLabel(sentence, head, modifier+1)
+                    goldLabelInd = self.rels[sentence[modifier+1].relation]
+                    wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
+                    if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
+                        lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
+
+            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
+            eerrors += e
+            if e > 0:
+                loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
+                eloss += (e)
+                mloss += (e)
+                errs.extend(loss)
+
+            etotal += len(sentence)
+
+            if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
+                eeloss = 0.0
+
+                if len(errs) > 0 or len(lerrs) > 0:
+                    eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+                    eerrs.scalar_value()
+                    eerrs.backward()
+                    self.trainer.update()
+                    errs = []
+                    lerrs = []
+
+                renew_cg()
+
+        if len(errs) > 0:
+            eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+            eerrs.scalar_value()
+            eerrs.backward()
+            self.trainer.update()
+
+            errs = []
+            lerrs = []
+            eeloss = 0.0
+
+            renew_cg()
+
+        self.trainer.update_epoch()
+        print("Loss: ", mloss/iSentence)
+
diff --git a/bist_parser/bmstparser/src/parser.py b/bist_parser/bmstparser/src/parser.py
new file mode 100644
index 0000000..19b2980
--- /dev/null
+++ b/bist_parser/bmstparser/src/parser.py
@@ -0,0 +1,75 @@
+from optparse import OptionParser
+from bist_parser.bmstparser.src import utils, mstlstm
+import pickle
+import os.path
+import time
+
+
+if __name__ == '__main__':
+    parser = OptionParser()
+    parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/en-universal-train.conll.ptb")
+    parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/en-universal-dev.conll.ptb")
+    parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/en-universal-test.conll.ptb")
+    parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
+    parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
+    parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="neuralfirstorder.model")
+    parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
+    parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
+    parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
+    parser.add_option("--epochs", type="int", dest="epochs", default=30)
+    parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
+    parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
+    parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
+    parser.add_option("--outdir", type="string", dest="output", default="results")
+    parser.add_option("--activation", type="string", dest="activation", default="tanh")
+    parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
+    parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=125)
+    parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
+    parser.add_option("--disablelabels", action="store_false", dest="labelsFlag", default=True)
+    parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
+    parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
+    parser.add_option("--disablecostaug", action="store_false", dest="costaugFlag", default=True)
+    parser.add_option("--dynet-seed", type="int", dest="seed", default=0)
+    parser.add_option("--dynet-mem", type="int", dest="mem", default=0)
+
+    (options, args) = parser.parse_args()
+
+    print('Using external embedding:', options.external_embedding)
+
+    if options.predictFlag:
+        with open(options.params, 'rb') as paramsfp:
+            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
+
+        stored_opt.external_embedding = options.external_embedding
+
+        print('Initializing lstm mstparser:')
+        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, stored_opt)
+
+        parser.Load(options.model)
+        tespath = os.path.join(options.output, 'test_pred.conll')
+
+        ts = time.time()
+        test_res = list(parser.Predict(options.conll_test))
+        te = time.time()
+        print('Finished predicting test.', te-ts, 'seconds.')
+        utils.write_conll(tespath, test_res)
+
+        os.system('perl src/util_scripts/eval.pl -g ' + options.conll_test + ' -s ' + tespath  + ' > ' + tespath + '.txt')
+    else:
+        print('Preparing vocab')
+        words, w2i, pos, rels = utils.vocab(options.conll_train)
+
+        with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
+            pickle.dump((words, w2i, pos, rels, options), paramsfp)
+        print('Finished collecting vocab')
+
+        print('Initializing lstm mstparser:')
+        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)
+
+        for epoch in range(options.epochs):
+            print('Starting epoch', epoch)
+            parser.Train(options.conll_train)
+            devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
+            utils.write_conll(devpath, parser.Predict(options.conll_dev))
+            parser.Save(os.path.join(options.output, os.path.basename(options.model) + str(epoch+1)))
+            os.system('perl src/util_scripts/eval.pl -g ' + options.conll_dev  + ' -s ' + devpath  + ' > ' + devpath + '.txt')
diff --git a/bist_parser/bmstparser/src/util_scripts/eval.pl b/bist_parser/bmstparser/src/util_scripts/eval.pl
new file mode 100644
index 0000000..3db9837
--- /dev/null
+++ b/bist_parser/bmstparser/src/util_scripts/eval.pl
@@ -0,0 +1,1826 @@
+#!/usr/bin/env perl
+
+# Author: Yuval Krymolowski
+# Addition of precision and recall 
+#   and of frame confusion list: Sabine Buchholz
+# Addition of DEPREL + ATTACHMENT:
+#   Prokopis Prokopidis (prokopis at ilsp dot gr)
+# Acknowledgements: 
+#   to Markus Kuhn for suggesting the use of 
+#   the Unicode category property
+
+if ($] < 5.008001)  # $] is the version of the running perl interpreter
+{
+  print STDERR <<EOM ;
+
+ This script requires PERL 5.8.1 for running.
+ The new version is needed for proper handling
+ of Unicode characters.
+
+ Please obtain a new version or contact the shared task team
+ if you are unable to upgrade PERL.
+
+EOM
+  # too old for the Unicode handling below: give up with a failure status
+  exit(1) ;
+}
+
+require Encode;
+
+use strict ;
+use warnings;
+use Getopt::Std ;
+
+my ($usage) = <<EOT   # help text; printed via "die $usage" for -h or when neither -g nor -s is given
+
+  CoNLL-X evaluation script:
+
+   [perl] eval.pl [OPTIONS] -g <gold standard> -s <system output>
+
+  This script evaluates a system output with respect to a gold standard.
+  Both files should be in UTF-8 encoded CoNLL-X tabular format.
+
+  Punctuation tokens (those where all characters have the Unicode
+  category property "Punctuation") are ignored for scoring (unless the
+  -p flag is used).
+
+  The output breaks down the errors according to their type and context.
+
+  Optional parameters:
+     -o FILE : output: print output to FILE (default is standard output)
+     -q : quiet:       only print overall performance, without the details
+     -b : evalb:       produce output in a format similar to evalb 
+                       (http://nlp.cs.nyu.edu/evalb/); use together with -q
+     -p : punctuation: also score on punctuation (default is not to score on it)
+     -v : version:     show the version number
+     -h : help:        print this help text and exit
+
+EOT
+;
+
+my ($line_num) ;      # number of the line pair last read from the two input files (advanced in read_sent)
+my ($sep) = '0x01' ;  # NOTE(review): this is the literal four-character string "0x01", not the control character "\x01"; it joins the parts of an error key
+
+my ($START) = '.S' ;  # pseudo token standing for "before the first word of the sentence"
+my ($END) = '.E' ;    # pseudo token standing for "after the last word of the sentence"
+
+my ($con_err_num) = 3 ;       # NOTE(review): not used in the part of the file visible here; presumably a limit on reported contexts -- confirm
+my ($freq_err_num) = 10 ;     # NOTE(review): not used in the part of the file visible here; presumably a limit on reported frequent errors -- confirm
+my ($spec_err_loc_con) = 8 ;  # NOTE(review): not used in the part of the file visible here -- confirm its meaning where it is read
+
+################################################################################
+###                              subfunctions                                ###
+################################################################################
+
+# True when the UTF-8 encoded argument is non-empty and each of its characters
+# has the Unicode category property "Punctuation" (see "man perlunicode").
+sub is_uni_punct
+{
+  my $word = shift ;
+  my $decoded = Encode::decode_utf8($word) ;
+  return scalar($decoded =~ /^\p{Punctuation}+$/) ;
+}
+
+# Number of characters in a UTF-8 encoded string, not counting non-spacing
+# marks (for example vowel marks in Arabic).  The table-printing code uses
+# this value when it computes column widths.
+sub uni_len
+{
+  my ($word) = @_ ;
+  my $width = 0 ;
+
+  # one list element per decoded character
+  my @chars = split(//, Encode::decode_utf8($word)) ;
+
+  foreach my $char (@chars)
+  {
+    # everything except a non-spacing mark counts
+    $width++ unless ($char =~ /^\p{NonspacingMark}/) ;
+  }
+
+  return $width ;
+}
+
+# Keep only the $num most frequent entries of the count hash referenced by
+# $vec (entries tied with the $num-th count are kept too); everything rarer
+# is deleted in place.  ${$max_len} is raised to the widest surviving key.
+sub filter_context_counts
+{ # filter_context_counts
+  my ($vec, $num, $max_len) = @_ ;
+  my ($con, $l, $thresh) ;
+
+  # the count of the $num-th most frequent entry is the cut-off
+  $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;
+  # fewer than $num entries: nothing to cut, and no comparison against undef
+  $thresh = 0 if (! defined $thresh) ;
+
+  foreach $con (keys %{$vec})
+  {
+    if (${$vec}{$con} < $thresh)
+    {
+      delete ${$vec}{$con} ;
+      next ;
+    }
+
+    $l = uni_len($con) ;   # length without non-spacing marks
+    ${$max_len} = $l if ($l > ${$max_len}) ;
+  }
+} # filter_context_counts
+
+sub print_context
+{ # print_context: write a CPOS-context table and a word-context table side by side to OUT
+  # $counts, $counts_pos: hash refs with {tot}, {err_head} and {err_dep} sub-hashes keyed by context string; the two $max_*_len arguments are the widths of the context columns
+  my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;
+  my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ;
+  # column titles and separator line of both tables
+  printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;
+  printf OUT "  ||" ;
+  printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;
+  printf OUT "\n" ;
+  printf OUT "  %s-+------+------+------+-----", '-' x $max_con_pos_len;
+  printf OUT "--++" ;
+  printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+  printf OUT "\n" ;
+  # contexts ordered by decreasing total count
+  @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ;
+  @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ;
+  # one output row per rank; the shorter list is padded with empty cells
+  $n = scalar @v_con ;
+  if (scalar @v_con_pos > $n)
+  {
+    $n = scalar @v_con_pos ;
+  }
+
+  foreach $i (0 .. $n-1)
+  {
+    if (defined $v_con_pos[$i])
+    {
+      $con_pos = $v_con_pos[$i] ;
+      printf OUT "  %-*s | %4d | %4d | %4d | %4d",
+	$max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos},
+	  ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos},
+	    ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ; # "both" = dep + head - tot; NOTE(review): holds when tot counts tokens with at least one error -- confirm against the caller
+    }
+    else
+    {
+      printf OUT "  %-*s | %4s | %4s | %4s | %4s",
+	$max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT "  ||" ;
+
+    if (defined $v_con[$i])
+    {
+      $con = $v_con[$i] ;
+      printf OUT "  %-*s | %4d | %4d | %4d | %4d",
+	$max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con}, # field width enlarged by length($con)-uni_len($con); presumably so that multi-byte words still line up
+	  ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con},
+	    ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ;
+    }
+    else
+    {
+      printf OUT "  %-*s | %4s | %4s | %4s | %4s",
+	$max_con_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT "\n" ;
+  }
+  # closing separator line, followed by two empty lines
+  printf OUT "  %s-+------+------+------+-----", '-' x $max_con_pos_len;
+  printf OUT "--++" ;
+  printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+  printf OUT "\n" ;
+
+  printf OUT "\n\n" ;
+
+} # print_context
+
+# Spell out a word distance: the absolute value of $num followed by
+# "word"/"words", using English number words for 1 to 4 and digits for
+# everything else (including 0).
+sub num_as_word
+{
+  my ($num) = @_ ;
+
+  # distances that are written out in full
+  my %spelled =
+    (
+     1 => 'one word',
+     2 => 'two words',
+     3 => 'three words',
+     4 => 'four words',
+    ) ;
+
+  # the sign is dropped: only the size of the distance is spelled out
+  my $dist = abs($num) ;
+
+  if (exists $spelled{$dist})
+  {
+    return ($spelled{$dist}) ;
+  }
+
+  # no special spelling: keep the digits
+  return ($dist.' words') ;
+}
+
+sub describe_err
+{ # describe_err: turn the three parts of an error key into an English description (returned as a string)
+  # $head_err: '-' (head correct), 0 (system head is 0) or system head minus gold head; $dep_err: '-' or "gold->system"
+  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
+  my ($dep_g, $dep_s, $desc) ;
+  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;
+  # $head_aft_bef: gold code then system code; '0' = root, 'e' = the focus word itself, 'a' = after it, 'b' = before it
+  if ($head_err eq '-')
+  {
+    $desc = 'correct head' ;
+
+    if ($head_aft_bef_s eq '0')
+    {
+      $desc .= ' (0)' ;
+    }
+    elsif ($head_aft_bef_s eq 'e')
+    {
+      $desc .= ' (the focus word)' ;
+    }
+    elsif ($head_aft_bef_s eq 'a')
+    {
+      $desc .= ' (after the focus word)' ;
+    }
+    elsif ($head_aft_bef_s eq 'b')
+    {
+      $desc .= ' (before the focus word)' ;
+    }
+  }
+  elsif ($head_aft_bef_s eq '0')
+  {
+    $desc = 'head = 0 instead of ' ;
+    if ($head_aft_bef_g eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_g eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word' ;
+  }
+  elsif ($head_aft_bef_g eq '0')
+  {
+    $desc = 'head is ' ;
+    if ($head_aft_bef_s eq 'a') # the gold side is '0' here, so the position that varies is the system one
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_s eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word instead of 0' ;
+  }
+  else
+  {
+    $desc = num_as_word($head_err) ;
+    if ($head_err < 0)
+    {
+      $desc .= ' before' ;
+    }
+    else
+    {
+      $desc .= ' after' ;
+    }
+
+    $desc = 'head '.$desc.' the correct head ' ;
+    # first the position of the system head ...
+    if ($head_aft_bef_s eq '0')
+    {
+      $desc .= '(0' ;
+    }
+    elsif ($head_aft_bef_s eq 'e')
+    {
+      $desc .= '(the focus word' ;
+    }
+    elsif ($head_aft_bef_s eq 'a')
+    {
+      $desc .= '(after the focus word' ;
+    }
+    elsif ($head_aft_bef_s eq 'b')
+    {
+      $desc .= '(before the focus word' ;
+    }
+    # ... then, when it differs, the position the gold head has
+    if ($head_aft_bef_g ne $head_aft_bef_s)
+    {
+      $desc .= ' instead of ' ;
+      if ($head_aft_bef_g eq '0')
+      {
+        $desc .= '0' ;
+      }
+      elsif ($head_aft_bef_g eq 'e')
+      {
+        $desc .= 'the focus word' ;
+      }
+      elsif ($head_aft_bef_g eq 'a')
+      {
+        $desc .= 'after the focus word' ;
+      }
+      elsif ($head_aft_bef_g eq 'b')
+      {
+        $desc .= 'before the focus word' ;
+      }
+    }
+
+    $desc .= ')' ;
+  }
+
+  $desc .= ', ' ;
+  # second half of the description: the dependency label
+  if ($dep_err eq '-')
+  {
+    $desc .= 'correct dependency' ;
+  }
+  else
+  {
+    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
+    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
+  }
+
+  return($desc) ;
+
+} # describe_err
+
+# Collect the two tokens before and the two tokens after position $i_w of
+# the sentence referenced by $sent (an array of hashes that are read
+# through their 'word' and 'pos' entries).  A neighbour that would lie
+# before the first token is reported as $START, one that would lie after
+# the last token as $END.
+#
+# Returns the list ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2): the four
+# neighbouring words followed by the four matching tags.
+sub get_context
+{ # get_context
+
+  my ($sent, $i_w) = @_ ;
+  # filled from left to right
+  my (@words, @tags) ;
+
+  # index of the final token of the sentence
+  my $last = scalar(@{$sent}) - 1 ;
+
+  # neighbours in output order: two back, one back, one ahead, two ahead
+  foreach my $offset (-2, -1, 1, 2)
+  {
+    my $i = $i_w + $offset ;
+    my ($inside, $edge) ;
+
+    if ($offset < 0)
+    {
+      # looking left: only the sentence start can be crossed
+      $inside = ($i >= 0) ;
+      $edge = $START ;
+    }
+    else
+    {
+      # looking right: only the sentence end can be crossed
+      $inside = ($i <= $last) ;
+      $edge = $END ;
+    }
+
+    if ($inside)
+    {
+      push @words, ${${$sent}[$i]}{word} ;
+      push @tags, ${${$sent}[$i]}{pos} ;
+    }
+    else
+    {
+      # off the sentence: the marker stands in for word and tag alike
+      push @words, $edge ;
+      push @tags, $edge ;
+    }
+  }
+
+  # words first, then tags, in the order listed above
+  return (@words, @tags) ;
+
+} # get_context
+
+sub read_sent
+{ # read_sent: read the next sentence from the GOLD and SYS handles in lock-step
+  # fills @{$sent_gold} and @{$sent_sys} with one hash per token (keys word, pos, head, dep); returns 1 when both files are exhausted, 0 at a whitespace-only separator line; exits with status 1 on a structural mismatch
+  my ($sent_gold, $sent_sys) = @_ ;
+  my ($line_g, $line_s, $new_sent) ;
+  my (%fields_g, %fields_s) ;
+
+  $new_sent = 1 ; # set here and cleared below, but never read inside this sub
+
+  @{$sent_gold} = () ;
+  @{$sent_sys} = () ;
+
+  while (1)
+  { # main reading loop
+
+    $line_g = <GOLD> ;
+    $line_s = <SYS> ;
+
+    $line_num++ ; # file-level counter shared with the caller and used in the error messages
+
+    # system output has fewer lines than gold standard
+    if ((defined $line_g) && (! defined $line_s))
+    {
+	printf STDERR "line mismatch, line %d:\n", $line_num ;
+	printf STDERR " gold: %s", $line_g ;
+	printf STDERR " sys : past end of file\n" ;
+	exit(1) ;
+    }
+
+    # system output has more lines than gold standard
+    if ((! defined $line_g) && (defined $line_s))
+    {
+	printf STDERR "line mismatch, line %d:\n", $line_num ;
+	printf STDERR " gold: past end of file\n" ;
+	printf STDERR " sys : %s", $line_s ;
+	exit(1) ;
+    }
+    
+    # end of file reached for both
+    if ((! defined $line_g) && (! defined $line_s))
+    {
+	return (1) ;
+    }
+
+    # one contains end of sentence but other one does not
+    if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of sentence reached
+    if ($line_g =~ /^\s+$/)
+    {
+	return(0) ;
+    }
+
+    # now both lines contain information
+
+    if ($new_sent)
+    {
+      $new_sent = 0 ;
+    }
+
+    # 'official' column names, in file order (0-based index):
+    #   'id','form','lemma','cpostag','postag',
+    #   'feats','head','deprel','phead','pdeprel'
+    # columns 1, 3, 6 and 7 are kept; fields are split on runs of whitespace
+    @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ;
+
+    push @{$sent_gold}, { %fields_g } ; # store a copy, since %fields_g is reused for the next line
+
+    @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ;
+
+    if (($fields_g{word} ne $fields_s{word})
+	||
+	($fields_g{pos} ne $fields_s{pos}))
+    {
+      printf STDERR "Word/pos mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      #exit(1) ;  (disabled: a word/pos mismatch is reported but evaluation continues)
+    }
+
+    push @{$sent_sys}, { %fields_s } ;
+
+  } # main reading loop
+  
+} # read_sent
+
+################################################################################
+###                                  main                                    ###
+################################################################################
+
+our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ; # package variables; getopts() stores the command line switches in them
+
+my ($sent_num, $eof, $word_num, @err_sent) ; # sentences read so far, end-of-input flag, tokens in the current sentence, per-sentence error counts
+my (@sent_gold, @sent_sys, @starts) ; # tokens of the current sentence (gold / system) and the first input line of every sentence
+my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ; # fields of the current token; $wp is "word / pos"
+my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ; # %counts collects every statistic that is printed later
+my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ;
+my ($loc_con, %loc_con_err_counts, %err_desc) ;
+my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ; # head position codes ('0', 'e', 'a', 'b') for gold, system, and both concatenated
+my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ;
+my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ;
+my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ; # column widths for the printed tables
+my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ;
+my (%freq_err, $err) ; # frequency of each error key, and the key of the current token
+
+my ($i, $j, $i_w, $l, $n_args) ;
+my ($w_2, $w_1, $w1, $w2) ; # neighbouring words as returned by get_context
+my ($wp_2, $wp_1, $wp1, $wp2) ; # the same neighbours as "word / pos" strings
+my ($p_2, $p_1, $p1, $p2) ; # tags of the neighbouring words
+
+my ($short_output) ; # 1 when -q was given
+my ($score_on_punct) ; # 1 when -p was given
+$counts{punct} = 0; # initialize
+
+# Command line: -g, -s and -o take a file name; q, v, h, p and b are plain
+# flags (see $usage for their meaning).
+getopts("g:o:s:qvhpb") ;
+
+# -v: print the revision number taken from the RCS $Id$ keyword and stop
+if (defined $opt_v)
+{
+    my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $';
+    my $revision = (split ' ', $id)[2];
+    print "Version $revision\n";
+    exit(0);
+}
+
+# -h, or neither of the two input files given: show the help text
+# (die writes it to standard error and ends the script with a failure status)
+unless ((! defined $opt_h) && ((defined $opt_g) || (defined $opt_s)))
+{
+  die $usage ;
+}
+
+# only one of the two input files given: name the missing one
+unless (defined $opt_g)
+{
+  die "Gold standard file (-g) missing\n" ;
+}
+unless (defined $opt_s)
+{
+  die "System output file (-s) missing\n" ;
+}
+
+# without -o the output name defaults to '-'
+unless (defined $opt_o)
+{
+  $opt_o = '-' ;
+}
+
+# -q: short output
+$short_output = (defined $opt_q) ? 1 : 0 ;
+
+# -p: punctuation tokens are scored too
+$score_on_punct = (defined $opt_p) ? 1 : 0 ;
+
+# line and sentence counters start at zero; end of input not reached yet
+($line_num, $sent_num, $eof) = (0, 0, 0) ;
+
+# per-sentence bookkeeping: error counts and first line of each sentence
+@err_sent = () ;
+@starts = () ;
+
+# slot 0 of @err_sent becomes an empty hash
+%{$err_sent[0]} = () ;
+
+# the POS column has to be at least as wide as its title
+$max_pos_len = length('CPOS') ;
+
+################################################################################
+###                              reading input                               ###
+################################################################################
+
+# two-argument open: the mode is part of the string, and "-" as the
+# output name (the default for -o) stands for standard output
+open (GOLD, "<$opt_g") or die "Could not open gold standard file $opt_g\n" ;
+open (SYS,  "<$opt_s") or die "Could not open system output file $opt_s\n" ;
+open (OUT,  ">$opt_o") or die "Could not open output file $opt_o\n" ;
+
+print OUT "     Sent.          Attachment      Correct        Scoring          \n",
+          "    ID Tokens  -   Unlab. Lab.   HEAD HEAD+DEPREL   tokens   - - - -\n",
+          "  ============================================================================\n"
+  if (defined $opt_b) ;  # produce output similar to evalb
+
+
+while (! $eof)
+{ # main reading loop
+  # one pass per sentence; read_sent returns 1 at end of input, which ends the loop after this pass
+  $starts[$sent_num] = $line_num+1 ; # input line on which this sentence starts
+  $eof = read_sent(\@sent_gold, \@sent_sys) ;
+
+  $sent_num++ ;
+
+  %{$err_sent[$sent_num]} = () ;
+  $word_num = scalar @sent_gold ;
+
+  # for accuracy per sentence
+  my %sent_counts = ( tot      => 0,
+		      err_any  => 0,
+		      err_head => 0
+		      ); 
+
+  # printf "$sent_num $word_num\n" ;
+
+  my @frames_g = ('** '); # the initial frame for the virtual root
+  my @frames_s = ('** '); # the initial frame for the virtual root
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+      push @frames_g, ''; # initialize
+      push @frames_s, ''; # initialize
+  }
+
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+
+    ($word, $pos, $head_g, $dep_g)
+      = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+    $wp = $word.' / '.$pos ;
+
+    # printf "%d: %s %s %s %s\n", $i_w,  $word, $pos, $head_g, $dep_g ;
+
+    if ((! $score_on_punct) && is_uni_punct($word))
+    {
+      $counts{punct}++ ;
+      # ignore punctuations
+      next ;
+    }
+
+    if (length($pos) > $max_pos_len)
+    {
+      $max_pos_len = length($pos) ;
+    }
+
+    ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+    $counts{tot}++ ;
+    $counts{word}{$wp}{tot}++ ;
+    $counts{pos}{$pos}{tot}++ ;
+    $counts{head}{$head_g-$i_w-1}{tot}++ ; # keyed by the gold head position relative to the token
+
+    # for frame confusions
+    # add child to frame of parent
+    $frames_g[$head_g] .= "$dep_g ";
+    $frames_s[$head_s] .= "$dep_s ";
+    # add to frame of token itself
+    $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero
+    $frames_s[$i_w+1] .= "*$dep_g* "; # NOTE(review): the system frame is marked with the GOLD label ($dep_g, not $dep_s) -- confirm that this is intended
+
+    # for precision and recall of DEPREL
+    $counts{dep}{$dep_g}{tot}++ ;     # counts for gold standard deprels
+    $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions
+    $counts{dep_s}{$dep_s}{tot}++ ;   # counts for system deprels
+    $counts{all_dep}{$dep_g} = 1 ;    # list of all deprels that occur ...
+    $counts{all_dep}{$dep_s} = 1 ;    # ... in either gold or system output
+
+    # for precision and recall of HEAD direction
+    my $dir_g;
+    if ($head_g == 0) {
+	$dir_g = 'to_root';
+    } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero
+                                 # also below
+	$dir_g = 'left';
+    } elsif ($head_g > $i_w+1) {
+	$dir_g = 'right';
+    } else {
+        # token links to itself; should never happen in correct gold standard
+	$dir_g = 'self'; 
+    }
+    my $dir_s;
+    if ($head_s == 0) {
+	$dir_s = 'to_root';
+    } elsif ($head_s < $i_w+1) {
+	$dir_s = 'left';
+    } elsif ($head_s > $i_w+1) {
+	$dir_s = 'right';
+    } else {
+        # token links to itself; should not happen in good system 
+        # (but not forbidden in shared task)
+	$dir_s = 'self'; 
+    }
+    $counts{dir_g}{$dir_g}{tot}++ ;   # counts for gold standard head direction
+    $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions
+    $counts{dir_s}{$dir_s}{tot}++ ;   # counts for system head direction
+
+    # for precision and recall of HEAD distance
+    my $dist_g;
+    if ($head_g == 0) {
+	$dist_g = 'to_root';
+    } elsif ( abs($head_g - ($i_w+1)) <= 1 ) {
+	$dist_g = '1'; # includes the 'self' cases
+    } elsif ( abs($head_g - ($i_w+1)) <= 2 ) {
+	$dist_g = '2';
+    } elsif ( abs($head_g - ($i_w+1)) <= 6 ) {
+	$dist_g = '3-6';
+    } else {
+	$dist_g = '7-...';
+    }
+    my $dist_s;
+    if ($head_s == 0) {
+	$dist_s = 'to_root';
+    } elsif ( abs($head_s - ($i_w+1)) <= 1 ) {
+	$dist_s = '1'; # includes the 'self' cases
+    } elsif ( abs($head_s - ($i_w+1)) <= 2 ) {
+	$dist_s = '2';
+    } elsif ( abs($head_s - ($i_w+1)) <= 6 ) {
+	$dist_s = '3-6';
+    } else {
+	$dist_s = '7-...';
+    }
+    $counts{dist_g}{$dist_g}{tot}++ ;    # counts for gold standard head distance
+    $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions
+    $counts{dist_s}{$dist_s}{tot}++ ;    # counts for system head distance
+
+
+    $err_head = ($head_g ne $head_s) ; # error in head
+    $err_dep = ($dep_g ne $dep_s) ;    # error in deprel
+
+    $head_err = '-' ; # '-' means "no error"; overwritten below when there is one
+    $dep_err = '-' ;
+
+    # for accuracy per sentence
+    $sent_counts{tot}++ ;
+    if ($err_dep || $err_head) {
+	$sent_counts{err_any}++ ;
+    }
+    if ($err_head) {
+	$sent_counts{err_head}++ ;
+    }
+
+    # total counts and counts for CPOS involved in errors
+    # position code of the head: '0' = root, 'e' = the token itself, 'b' = before the token, 'a' = after it
+    if ($head_g eq '0')
+    {
+      $head_aft_bef_g = '0' ;
+    }
+    elsif ($head_g eq $i_w+1)
+    {
+      $head_aft_bef_g = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    if ($head_s eq '0')
+    {
+      $head_aft_bef_s = '0' ;
+    }
+    elsif ($head_s eq $i_w+1)
+    {
+      $head_aft_bef_s = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; # gold code first, system code second
+
+    if ($err_head)
+    {
+      if ($head_aft_bef_s eq '0')
+      {
+	$head_err = 0 ; # system attached the token to the root
+      }
+      else
+      {
+	$head_err = $head_s-$head_g ; # signed offset of the system head from the gold head
+      }
+
+      $err_sent[$sent_num]{head}++ ;
+      $counts{err_head}{tot}++ ;
+      $counts{err_head}{$head_err}++ ;
+
+      $counts{word}{err_head}{$wp}++ ;
+      $counts{pos}{$pos}{err_head}{tot}++ ;
+      $counts{pos}{$pos}{err_head}{$head_err}++ ;
+    }
+
+    if ($err_dep)
+    {
+      $dep_err = $dep_g.'->'.$dep_s ;
+      $err_sent[$sent_num]{dep}++ ;
+      $counts{err_dep}{tot}++ ;
+      $counts{err_dep}{$dep_err}++ ;
+
+      $counts{word}{err_dep}{$wp}++ ;
+      $counts{pos}{$pos}{err_dep}{tot}++ ;
+      $counts{pos}{$pos}{err_dep}{$dep_err}++ ;
+
+      if ($err_head)
+      {
+	$counts{err_both}++ ;
+	$counts{pos}{$pos}{err_both}++ ;
+      }
+    }
+
+    ### DEPREL + ATTACHMENT
+    if ((!$err_dep) && ($err_head)) {
+	$counts{err_head_corr_dep}{tot}++ ;
+	$counts{err_head_corr_dep}{$dep_s}++ ;
+    }
+    ### DEPREL + ATTACHMENT
+
+    # counts for words involved in errors
+
+    if (! ($err_head || $err_dep))
+    {
+      next ;
+    }
+    # from here on only tokens with a head and/or deprel error are counted
+    $err_sent[$sent_num]{word}++ ;
+    $counts{err_any}++ ;
+    $counts{word}{err_any}{$wp}++ ;
+    $counts{pos}{$pos}{err_any}++ ;
+
+    ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+    if ($w_2 ne $START)
+    {
+      $wp_2 = $w_2.' / '.$p_2 ;
+    }
+    else
+    {
+      $wp_2 = $w_2 ;
+    }
+
+    if ($w_1 ne $START)
+    {
+      $wp_1 = $w_1.' / '.$p_1 ;
+    }
+    else
+    {
+      $wp_1 = $w_1 ;
+    }
+
+    if ($w1 ne $END)
+    {
+      $wp1 = $w1.' / '.$p1 ;
+    }
+    else
+    {
+      $wp1 = $w1 ;
+    }
+
+    if ($w2 ne $END)
+    {
+      $wp2 = $w2.' / '.$p2 ;
+    }
+    else
+    {
+      $wp2 = $w2 ;
+    }
+    # context keys: one or two neighbours before / after the token, as "word / pos" strings and as tags only
+    $con_bef = $wp_1 ;
+    $con_bef_2 = $wp_2.' + '.$wp_1 ;
+    $con_aft = $wp1 ;
+    $con_aft_2 = $wp1.' + '.$wp2 ;
+
+    $con_pos_bef = $p_1 ;
+    $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+    $con_pos_aft = $p1 ;
+    $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+    if ($w_1 ne $START)
+    {
+      # do not count '.S' as a word context
+      $counts{con_bef_2}{tot}{$con_bef_2}++ ;
+      $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ; # $err_head and $err_dep are truth values of "ne" comparisons
+      $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ;
+      $counts{con_bef}{tot}{$con_bef}++ ;
+      $counts{con_bef}{err_head}{$con_bef} += $err_head ;
+      $counts{con_bef}{err_dep}{$con_bef} += $err_dep ;
+    }
+
+    if ($w1 ne $END)
+    {
+      # do not count '.E' as a word context
+      $counts{con_aft_2}{tot}{$con_aft_2}++ ;
+      $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ;
+      $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ;
+      $counts{con_aft}{tot}{$con_aft}++ ;
+      $counts{con_aft}{err_head}{$con_aft} += $err_head ;
+      $counts{con_aft}{err_dep}{$con_aft} += $err_dep ;
+    }
+
+    $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ;
+    $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ;
+    $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ;
+    $counts{con_pos_bef}{tot}{$con_pos_bef}++ ;
+    $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ;
+    $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ;
+
+    $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ;
+    $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ;
+    $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ;
+    $counts{con_pos_aft}{tot}{$con_pos_aft}++ ;
+    $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ;
+    $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ;
+    # error key: head offset, gold+system head position codes and deprel confusion, joined by $sep
+    $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+    $freq_err{$err}++ ;
+
+  } # loop on words
+  # frame confusions: count every gold/system frame pair that differs
+  foreach $i_w (0 .. $word_num) # including one for the virtual root
+  { # loop on words
+      if ($frames_g[$i_w] ne $frames_s[$i_w]) {
+	  $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ;
+      }
+  }
+
+  if (defined $opt_b) { # produce output similar to evalb
+      if ($word_num > 0) {
+	  my ($unlabeled,$labeled) = ('NaN', 'NaN');
+	  if ($sent_counts{tot} > 0) { # there are scoring tokens
+	      $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot};
+	      $labeled   = 100-$sent_counts{err_any} *100.0/$sent_counts{tot};
+	  }
+	  printf OUT "  %4d %4d    0  %6.2f %6.2f  %4d    %4d        %4d    0 0 0 0\n", 
+	  $sent_num, $word_num, 
+	  $unlabeled, $labeled, 
+	  $sent_counts{tot}-$sent_counts{err_head}, 
+	  $sent_counts{tot}-$sent_counts{err_any}, 
+	  $sent_counts{tot},;
+      }
+  }
+
+} # main reading loop
+
+################################################################################
+###                             printing output                              ###
+################################################################################
+
+# evalb mode: two empty lines between the per-sentence rows and the summary
+print OUT "\n\n" if (defined $opt_b) ;
+
+# the three overall scores, each computed as (total - errors) / total
+foreach my $score (["  Labeled   attachment score: %d / %d * 100 = %.2f %%\n", $counts{err_any}],
+                   ["  Unlabeled attachment score: %d / %d * 100 = %.2f %%\n", $counts{err_head}{tot}],
+                   ["  Label accuracy score:       %d / %d * 100 = %.2f %%\n", $counts{err_dep}{tot}])
+{
+  my ($format, $errors) = @{$score} ;
+  printf OUT $format,
+    $counts{tot}-$errors, $counts{tot}, 100-$errors*100.0/$counts{tot} ;
+}
+
+exit(0) if ($short_output) ;   # short output: stop after the three scores
+printf OUT "\n  %s\n\n", '=' x 80 ; # start of the detailed report
+printf OUT "  Evaluation of the results in %s\n  vs. gold standard %s:\n\n", $opt_s, $opt_g ;
+
+printf OUT "  Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ;
+
+printf OUT "  Number of non-scoring tokens: $counts{punct}\n\n";
+
+printf OUT "  The overall accuracy and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# two title rows of the accuracy table
+printf OUT "  %-10s | %-5s | %-5s |   %%  | %-5s |   %%  | %-5s |   %%\n",
+  'Accuracy', 'words', 'right', 'right', 'both' ;
+printf OUT "  %-10s | %-5s | %-5s |      | %-5s |      | %-5s |\n",
+  ' ', ' ', 'head', ' dep', 'right' ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# totals row: tokens, then count and percentage with right head / right deprel / both right
+printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+  'total', $counts{tot},
+  $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot},
+  $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot},
+  $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# one row per CPOSTAG, most frequent tag first
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+    if (! defined($counts{pos}{$pos}{err_head}{tot})) # tags without errors get explicit zeros, stored in %counts itself
+    {
+	$counts{pos}{$pos}{err_head}{tot} = 0 ;
+    }
+    if (! defined($counts{pos}{$pos}{err_dep}{tot}))
+    {
+	$counts{pos}{$pos}{err_dep}{tot} = 0 ;
+    }
+    if (! defined($counts{pos}{$pos}{err_any}))
+    {
+	$counts{pos}{$pos}{err_any} = 0 ;
+    }
+
+    printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+    $pos, $counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ;
+}
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT "\n\n" ;
+
+printf OUT "  The overall error rate and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# two title rows of the error-rate table
+printf OUT "  %-10s | %-5s | %-5s |   %%  | %-5s |   %%  | %-5s |   %%\n",
+  'Error', 'words', 'head', ' dep', 'both' ;
+printf OUT "  %-10s | %-5s | %-5s |      | %-5s |      | %-5s |\n",
+
+  'Rate', ' ', 'err', ' err', 'wrong' ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# totals row: tokens, then count and percentage of head errors / deprel errors / tokens with both wrong
+printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+  'total', $counts{tot},
+  $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot},
+  $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot},
+  $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+# one row per CPOSTAG, most frequent tag first
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+    if (! defined($counts{pos}{$pos}{err_both})) # only err_both gets a default here; the other two counters are read as they are
+    {
+	$counts{pos}{$pos}{err_both} = 0 ;
+    }
+
+    printf OUT "  %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+    $pos, $counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+    $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ;
+    
+}
+
+printf OUT "%s\n", "  -----------+-------+-------+------+-------+------+-------+-------" ;
+
+### added by Sabine Buchholz
+# DEPREL table: gold count, system count and number of agreements per label
+printf OUT "\n\n";
+printf OUT "  Precision and recall of DEPREL\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  deprel          | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}})
+{
+    # tokens on which the gold label and the system label are both $dep
+    my $tot_corr = defined($counts{dep2}{$dep}{$dep}) ? $counts{dep2}{$dep}{$dep} : 0;
+    # occurrences of $dep in the gold standard / in the system output
+    my $tot_g = defined($counts{dep}{$dep}{tot}) ? $counts{dep}{$dep}{tot} : 0;
+    my $tot_s = defined($counts{dep_s}{$dep}{tot}) ? $counts{dep_s}{$dep}{tot} : 0;
+
+    # a label that one side never uses has no recall (or precision): 'NaN'
+    my $rec = defined($counts{dep}{$dep}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_g * 100) : 'NaN';
+    my $prec = defined($counts{dep_s}{$dep}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_s * 100) : 'NaN';
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+      $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+### DEPREL + ATTACHMENT:
+### Same as Sabine's DEPREL apart from $tot_corr calculation
+# (a token only counts as correct here when its head is right as well)
+printf OUT "\n\n";
+printf OUT "  Precision and recall of DEPREL + ATTACHMENT\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  deprel          | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}})
+{
+    my $tot_corr = 0;
+    if (defined($counts{dep2}{$dep}{$dep}))
+    {
+        # tokens with the right label, minus those of them with a wrong head
+        $tot_corr = $counts{dep2}{$dep}{$dep};
+        $tot_corr -= $counts{err_head_corr_dep}{$dep}
+          if (defined($counts{err_head_corr_dep}{$dep}));
+    }
+
+    my $tot_g = defined($counts{dep}{$dep}{tot}) ? $counts{dep}{$dep}{tot} : 0;
+    my $tot_s = defined($counts{dep_s}{$dep}{tot}) ? $counts{dep_s}{$dep}{tot} : 0;
+    my $rec = defined($counts{dep}{$dep}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_g * 100) : 'NaN';
+    my $prec = defined($counts{dep_s}{$dep}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_s * 100) : 'NaN';
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+      $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+### DEPREL + ATTACHMENT
+
+# HEAD direction table: one row per direction bin, in a fixed order
+printf OUT "\n\n";
+printf OUT "  Precision and recall of binned HEAD direction\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  direction       | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dir ('to_root', 'left', 'right', 'self')
+{
+    # tokens whose gold and system head both fall into bin $dir
+    my $tot_corr = defined($counts{dir2}{$dir}{$dir}) ? $counts{dir2}{$dir}{$dir} : 0;
+    # size of the bin in the gold standard / in the system output
+    my $tot_g = defined($counts{dir_g}{$dir}{tot}) ? $counts{dir_g}{$dir}{tot} : 0;
+    my $tot_s = defined($counts{dir_s}{$dir}{tot}) ? $counts{dir_s}{$dir}{tot} : 0;
+
+    # an empty bin has no recall (or precision): 'NaN'
+    my $rec = defined($counts{dir_g}{$dir}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_g * 100) : 'NaN';
+    my $prec = defined($counts{dir_s}{$dir}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_s * 100) : 'NaN';
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+      $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+# HEAD distance table: one row per distance bin, in a fixed order
+printf OUT "\n\n";
+printf OUT "  Precision and recall of binned HEAD distance\n\n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+printf OUT "  distance        | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT "  ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dist ('to_root', '1', '2', '3-6', '7-...')
+{
+    # tokens whose gold and system head both fall into bin $dist
+    my $tot_corr = defined($counts{dist2}{$dist}{$dist}) ? $counts{dist2}{$dist}{$dist} : 0;
+    # size of the bin in the gold standard / in the system output
+    my $tot_g = defined($counts{dist_g}{$dist}{tot}) ? $counts{dist_g}{$dist}{tot} : 0;
+    my $tot_s = defined($counts{dist_s}{$dist}{tot}) ? $counts{dist_s}{$dist}{tot} : 0;
+
+    # an empty bin has no recall (or precision): 'NaN'
+    my $rec = defined($counts{dist_g}{$dist}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_g * 100) : 'NaN';
+    my $prec = defined($counts{dist_s}{$dist}{tot})
+      ? sprintf("%.2f",$tot_corr / $tot_s * 100) : 'NaN';
+
+    printf OUT "  %-15s | %4d | %7d | %6d | %10s | %13s\n",
+      $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+printf OUT "\n\n";
+printf OUT "  Frame confusions (gold versus system; *...* marks the head token)\n\n";
+foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}}) # highest count first
+{
+    if ($counts{frame2}{$frame} >= 5) # only list entries counted at least 5 times (make 5 a changeable threshold later)
+    {
+	printf OUT "  %3d  %s\n", $counts{frame2}{$frame}, $frame;
+    }
+}
+### end of: added by Sabine Buchholz
+
+
+#
+# Leave only the 5 words mostly involved in errors
+#
+
+
+$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] || 0 ; # 5th-highest error count; falls back to 0 (keep every word) when fewer than 5 words have errors, instead of comparing against undef
+
+# ensure enough space for title
+$max_word_len = length('word') ;
+
+foreach $word (keys %{$counts{word}{err_any}}) # keys() is evaluated once up front, so deleting inside the loop is safe
+{
+  if ($counts{word}{err_any}{$word} < $thresh)
+  {
+    delete $counts{word}{err_any}{$word} ; # below the cut-off: not a focus word (ties at the cut-off are all kept)
+    next ;
+  }
+
+  $l = uni_len($word) ; # uni_len is defined elsewhere in this script (presumably length in characters rather than bytes -- confirm)
+  if ($l > $max_word_len)
+  {
+    $max_word_len = $l ; # widest surviving word determines the first column width of the table
+  }
+}
+
+# Drop a 1-token context when its error count exceeds the count of a
+# 2-token context that contains it by at most 1
+# (the 2-token context is left in place; same rule for words and CPOS, before and after)
+
+foreach $con (keys %{$counts{con_aft_2}{tot}})
+{
+  ($w1) = split(/\+/, $con) ; # NOTE(review): the second pass builds word pairs with ' + ' (spaces included); if the first pass does too, $w1 keeps a trailing space here and never matches a 1-token key -- confirm
+  
+  if (defined $counts{con_aft}{tot}{$w1} &&
+      $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_aft}{tot}{$w1} ;
+  }
+}
+
+foreach $con (keys %{$counts{con_bef_2}{tot}})
+{
+  ($w_2, $w_1) = split(/\+/, $con) ; # the second element is the token nearest to the focus word; same separator caveat as above
+
+  if (defined $counts{con_bef}{tot}{$w_1} &&
+      $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_bef}{tot}{$w_1} ;
+  }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+  ($p1) = split(/\+/, $con_pos) ; # first CPOS of the pair
+
+  if (defined($counts{con_pos_aft}{tot}{$p1}) &&
+      $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_aft}{tot}{$p1} ;
+  }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+  ($p_2, $p_1) = split(/\+/, $con_pos) ; # second CPOS of the pair is the one compared against the 1-token table
+
+  if (defined($counts{con_pos_bef}{tot}{$p_1}) &&
+      $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_bef}{tot}{$p_1} ;
+  }
+}
+
+# for each word context type, filter the table via filter_context_counts with $con_err_num (presumably keeps the top $con_err_num contexts -- the helper is defined elsewhere)
+
+$max_con_len = 0 ; # passed by reference below; presumably updated to the longest surviving context -- confirm in filter_context_counts
+
+filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ;
+
+# for each CPOS context type, keep the $con_err_num CPOS contexts most involved in errors (ties at the cut-off are kept)
+
+$max_con_pos_len = 0 ; # longest surviving CPOS context string, shared by all four tables
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] || 0 ; # 0 (keep all) when the table has fewer than $con_err_num entries
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+  if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh)
+  {
+    delete $counts{con_pos_bef_2}{tot}{$con_pos} ;
+    next ;
+  }
+  if (length($con_pos) > $max_con_pos_len)
+  {
+    $max_con_pos_len = length($con_pos) ;
+  }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] || 0 ; # same fallback as above
+
+foreach $con_pos (keys %{$counts{con_pos_bef}{tot}})
+{
+  if ($counts{con_pos_bef}{tot}{$con_pos} < $thresh)
+  {
+    delete $counts{con_pos_bef}{tot}{$con_pos} ;
+    next ;
+  }
+  if (length($con_pos) > $max_con_pos_len)
+  {
+    $max_con_pos_len = length($con_pos) ;
+  }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] || 0 ; # same fallback as above
+
+foreach $con_pos (keys %{$counts{con_pos_aft}{tot}})
+{
+  if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh)
+  {
+    delete $counts{con_pos_aft}{tot}{$con_pos} ;
+    next ;
+  }
+  if (length($con_pos) > $max_con_pos_len)
+  {
+    $max_con_pos_len = length($con_pos) ;
+  }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] || 0 ; # same fallback as above
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+  if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh)
+  {
+    delete $counts{con_pos_aft_2}{tot}{$con_pos} ;
+    next ;
+  }
+  if (length($con_pos) > $max_con_pos_len)
+  {
+    $max_con_pos_len = length($con_pos) ;
+  }
+}
+
+# printing
+
+# ------------- focus words (the words that survived the focus-word filter above)
+
+printf OUT "\n\n" ;
+printf OUT "  %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ;
+
+printf OUT "  %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ;
+printf OUT "  %s-+------+------+------+------\n", '-' x $max_word_len;
+
+foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}}) # highest error count first
+{
+    if (!defined($counts{word}{err_head}{$word})) # missing counters are zeroed so the %d fields below receive numbers
+    {
+	$counts{word}{err_head}{$word} = 0 ;
+    }
+    if (! defined($counts{word}{err_dep}{$word}))
+    {
+	$counts{word}{err_dep}{$word} = 0 ;
+    }
+    if (! defined($counts{word}{err_any}{$word}))
+    {
+	$counts{word}{err_any}{$word} = 0;
+    }
+    printf OUT "  %-*s | %4d | %4d | %4d | %4d\n",
+    $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word},
+    $counts{word}{err_head}{$word},
+    $counts{word}{err_dep}{$word},
+    $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ; # 'both' column = dep + head - any; the width is widened by length() - uni_len() of the word
+}
+
+printf OUT "  %s-+------+------+------+------\n", '-' x $max_word_len;
+
+# ------------- contexts: four tables, each printed by print_context (defined elsewhere) from a word table, a CPOS table and the two shared column widths
+
+printf OUT "\n\n" ;
+
+printf OUT "  one-token preceeding contexts where most of the errors occur:\n\n" ; # NOTE(review): "preceeding" is misspelled in the emitted text; it is program output and left as is
+
+print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  two-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  one-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT "  two-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ;
+
+# ------------- Sentences: for each error type, report the sentence index (1 .. $sent_num) that sorts first by descending error count
+
+printf OUT "  Sentence with the highest number of word errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word})
+		 <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ; # an undefined counter contributes the false value of defined() to the comparison
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ; # @starts is indexed from 0, sentence numbers from 1
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT "  Sentence with the highest number of head errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) 
+		 <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ; # same selection, keyed on head errors
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT "  Sentence with the highest number of dependency errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) 
+		 <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ; # same selection, keyed on dependency errors
+printf OUT "   Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+  $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+#
+# Second pass, collect statistics of the frequent errors
+#
+
+# filter the errors, leave the most frequent $freq_err_num errors (ties at the cut-off are kept)
+
+$i = 0 ;
+
+$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] || 0 ; # 0 (keep all) when fewer than $freq_err_num distinct errors exist, instead of comparing against undef
+
+foreach $err (keys %freq_err)
+{
+  if ($freq_err{$err} < $thresh)
+  {
+    delete $freq_err{$err} ;
+  }
+}
+
+# in case there are several errors with the threshold count
+
+$freq_err_num = scalar keys %freq_err ; # from here on: the number of errors actually kept
+
+%err_counts = () ; # error key -> local-context pattern -> count, filled by the second reading loop
+
+$eof = 0 ;
+
+seek (GOLD, 0, 0) ; # rewind both inputs so they can be read a second time
+seek (SYS, 0, 0) ;
+
+while (! $eof)
+{ # second reading loop: re-reads both files and, for each retained frequent error, counts every wildcard pattern of its local context
+
+  $eof = read_sent(\@sent_gold, \@sent_sys) ; # read_sent is defined elsewhere; presumably fills both arrays with the next sentence -- confirm
+  $sent_num++ ; # NOTE(review): the counter is not reset before this pass, so it keeps growing past the value used for the sentence statistics above
+
+  $word_num = scalar @sent_gold ;
+
+  # printf "$sent_num $word_num\n" ;
+  
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words ($i_w is 0-based; head values are compared against $i_w+1)
+    ($word, $pos, $head_g, $dep_g)
+      = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+
+    # printf "%d: %s %s %s %s\n", $i_w,  $word, $pos, $head_g, $dep_g ;
+
+    if ((! $score_on_punct) && is_uni_punct($word))
+    {
+      # ignore punctuations
+      next ;
+    }
+
+    ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+    $err_head = ($head_g ne $head_s) ; # string comparison of gold and system head
+    $err_dep = ($dep_g ne $dep_s) ; # string comparison of gold and system label
+
+    $head_err = '-' ; # '-' means "no error of this kind"
+    $dep_err = '-' ;
+
+    if ($head_g eq '0') # gold head position code: '0' = head value is 0, 'e' = head equals the token's own position, 'b' = smaller position, 'a' = larger position
+    {
+      $head_aft_bef_g = '0' ;
+    }
+    elsif ($head_g eq $i_w+1)
+    {
+      $head_aft_bef_g = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    if ($head_s eq '0') # same code, computed for the system head
+    {
+      $head_aft_bef_s = '0' ;
+    }
+    elsif ($head_s eq $i_w+1)
+    {
+      $head_aft_bef_s = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; # two-character code: gold first, system second
+
+    if ($err_head)
+    {
+      if ($head_aft_bef_s eq '0')
+      {
+	$head_err = 0 ; # system head value is 0: no offset is computed
+      }
+      else
+      {
+	$head_err = $head_s-$head_g ; # signed difference between system and gold head
+      }
+    }
+
+    if ($err_dep)
+    {
+      $dep_err = $dep_g.'->'.$dep_s ;
+    }
+
+    if (! ($err_head || $err_dep))
+    {
+      next ;
+    }
+
+    # handle only the most frequent errors
+
+    $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; # error key, looked up in the filtered %freq_err
+
+    if (! exists $freq_err{$err})
+    {
+      next ;
+    }
+
+    ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; # get_context is defined elsewhere; judging by the names, two words/CPOS before and two after -- confirm
+
+    $con_bef = $w_1 ;
+    $con_bef_2 = $w_2.' + '.$w_1 ; # word pairs are joined with ' + ' (spaces included)
+    $con_aft = $w1 ;
+    $con_aft_2 = $w1.' + '.$w2 ;
+
+    $con_pos_bef = $p_1 ;
+    $con_pos_bef_2 = $p_2.'+'.$p_1 ; # CPOS pairs are joined with '+' (no spaces)
+    $con_pos_aft = $p1 ;
+    $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+    @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ; # the six fields of a local context; only the one-token contexts are used here
+
+    # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n",
+    #  $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ;
+    
+    @bits = (0, 0, 0, 0, 0, 0) ; # treated as a 6-digit binary counter; a 0 digit turns the matching field into '*'
+    $j = 0 ; # becomes 1 once the counter has wrapped around to all zeros
+
+    while ($j == 0)
+    {
+      for ($i = 0; $i <= $#bits; $i++) # add 1 to the counter, carrying from index 0 upwards
+      {
+	if ($bits[$i] == 0)
+	{
+	  $bits[$i] = 1 ;
+	  $j = 0 ;
+	  last ;
+	}
+	else
+	{
+	  $bits[$i] = 0 ;
+	  $j = 1 ;
+	}
+      }
+
+      @e_bits = @cur_err ;
+
+      for ($i = 0; $i <= $#bits; $i++)
+      {
+	if (! $bits[$i])
+	{
+	  $e_bits[$i] = '*' ;
+	}
+      }
+
+      # include also the last case which is the most general
+      # (wildcards for everything)
+      $err_counts{$err}{join($sep, @e_bits)}++ ; # every one of the 64 patterns is counted once per erroneous token
+
+    }
+
+  } # loop on words
+} # second reading loop
+
+printf OUT "\n\n" ;
+printf OUT "  Specific errors, %d most frequent errors:", $freq_err_num ;
+printf OUT "\n  %s\n", '=' x 41 ;
+
+
+# deleting local contexts which are too general
+
+foreach $err (keys %err_counts)
+{
+  foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+		    keys %{$err_counts{$err}}) # the key list is fixed before the loop; entries deleted below may still be visited, hence the defined() checks
+  {
+    @cur_err = split(/\Q$sep\E/, $loc_con) ;
+
+    # In this loop, one or two elements of the local context are
+    # replaced with '*' to make it more general. If the entry for
+    # the general context has the same count it is removed.
+
+    foreach $i (0 .. $#cur_err)
+    {
+      $w1 = $cur_err[$i] ; # remember the field so it can be restored after the trial
+      if ($cur_err[$i] eq '*')
+      {
+	next ;
+      }
+      $cur_err[$i] = '*' ;
+      $con1 = join($sep, @cur_err) ; # pattern with one more wildcard
+      if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+	   && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+      {
+	delete $err_counts{$err}{$con1} ;
+      }
+      for ($j = $i+1; $j <=$#cur_err; $j++)
+      {
+	if ($cur_err[$j] eq '*')
+	{
+	  next ;
+	}
+	$w2 = $cur_err[$j] ;
+	$cur_err[$j] = '*' ;
+	$con1 = join($sep, @cur_err) ; # pattern with two more wildcards
+	if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+	     && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+	{
+	  delete $err_counts{$err}{$con1} ;
+	}
+	$cur_err[$j] = $w2 ;
+      }
+      $cur_err[$i] = $w1 ;
+    }
+  }
+}
+
+# Leaving only the topmost local contexts for each error (cut-off: the $spec_err_loc_con-th highest count)
+
+foreach $err (keys %err_counts)
+{
+  $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ;
+
+  # if the threshold is too low, take the 2nd highest count
+  # (the highest may be the total which is the generic case
+  #   and not relevant for printing)
+
+  if ($thresh < 5)
+  {
+    $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] || 0 ; # 0 when only one context is left, instead of an undefined threshold
+  }
+
+  foreach $loc_con (keys %{$err_counts{$err}})
+  {
+    if ($err_counts{$err}{$loc_con} < $thresh)
+    {
+      delete $err_counts{$err}{$loc_con} ;
+    }
+    else
+    {
+      if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*'))) # the all-wildcard context is not entered into the cross-error index
+      {
+	$loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ; # inverse index: context -> error -> count
+      }
+    }
+  }
+}
+
+# printing an error summary
+
+# calculating the context field length (each width starts at the length of its column title)
+
+$max_word_spec_len= length('word') ;
+$max_con_aft_len = length('word') ;
+$max_con_bef_len = length('word') ;
+$max_con_pos_len = length('CPOS') ; # reuses (and resets) the variable that held the CPOS context width of the earlier tables
+
+foreach $err (keys %err_counts)
+{
+  foreach $loc_con (sort keys %{$err_counts{$err}})
+  {
+    ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $loc_con) ; # same field order as @cur_err in the second reading loop
+
+    $l = uni_len($word) ; # word fields are measured with uni_len, CPOS fields with length
+    if ($l > $max_word_spec_len)
+    {
+      $max_word_spec_len = $l ;
+    }
+
+    $l = uni_len($con_bef) ;
+    if ($l > $max_con_bef_len)
+    {
+      $max_con_bef_len = $l ;
+    }
+
+    $l = uni_len($con_aft) ;
+    if ($l > $max_con_aft_len)
+    {
+      $max_con_aft_len = $l ;
+    }
+
+    if (length($con_pos_aft) > $max_con_pos_len) # one shared width for the CPOS column before and after the focus word
+    {
+      $max_con_pos_len = length($con_pos_aft) ;
+    }
+
+    if (length($con_pos_bef) > $max_con_pos_len)
+    {
+      $max_con_pos_len = length($con_pos_bef) ;
+    }
+  }
+}
+
+$err_counter = 0 ; # running number printed in front of each error description
+
+foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err) # most frequent error first
+{
+
+  ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ; # the three parts the error key was built from
+
+  $err_counter++ ;
+  $err_desc{$err} = sprintf("%2d. ", $err_counter).
+    describe_err($head_err, $head_aft_bef, $dep_err) ; # describe_err is defined elsewhere; the numbered text is stored for reuse in the last section
+  
+  # printf OUT "  %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ;
+  printf OUT "\n" ;
+  printf OUT "  %s : %d times\n", $err_desc{$err}, $freq_err{$err} ;
+
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ; # $max_pos_len is set outside this section
+
+  printf OUT "  %-*s | %-*s | %-*s | %s\n",
+      $max_con_pos_len+$max_con_bef_len+3, '  Before',
+	$max_word_spec_len+$max_pos_len+3, '   Focus',
+	  $max_con_pos_len+$max_con_aft_len+3, '  After',
+	    'Count' ; # each group header spans two columns plus the 3-character separator between them
+
+  printf OUT "  %-*s   %-*s | %-*s   %-*s | %-*s   %-*s |\n",
+    $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+       $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+	$max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+		    keys %{$err_counts{$err}}) # one row per retained local context, highest count first
+  {
+    if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*'))) # the all-wildcard context gets no row
+    {
+      next ;
+    }
+
+    $con1 = $loc_con ;
+    $con1 =~ s/\*/ /g ; # every '*' in the key is shown as a blank
+
+    ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $con1) ;
+
+    printf OUT "  %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n",
+      $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+	  $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+	    $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft,
+	      $err_counts{$err}{$loc_con} ; # word column widths are widened by length() - uni_len() of the value
+  }
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+}
+
+printf OUT "\n\n" ;
+printf OUT "  Local contexts involved in several frequent errors:" ;
+printf OUT "\n  %s\n", '=' x 51 ;
+printf OUT "\n\n" ;
+
+foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=>
+			  scalar keys %{$loc_con_err_counts{$a}}}
+		  keys %loc_con_err_counts) # contexts attached to the most distinct errors come first
+{
+
+  if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1) # a context tied to a single error is not "shared": skip it
+  {
+    next ;
+  }
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  printf OUT "  %-*s | %-*s | %-*s \n",
+      $max_con_pos_len+$max_con_bef_len+3, '  Before',
+	$max_word_spec_len+$max_pos_len+3, '   Focus',
+	  $max_con_pos_len+$max_con_aft_len+3, '  After' ;
+
+  printf OUT "  %-*s   %-*s | %-*s   %-*s | %-*s   %-*s \n",
+    $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+       $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+	$max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  $con1 = $loc_con ;
+  $con1 =~ s/\*/ /g ; # every '*' in the key is shown as a blank
+
+  ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+      split(/\Q$sep\E/, $con1) ;
+
+  printf OUT "  %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n",
+    $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+      $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+	$max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;
+	  
+  printf OUT "  %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+    '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+       '-' x $max_pos_len, '-' x $max_word_spec_len,
+	'-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+  foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>
+			$loc_con_err_counts{$loc_con}{$a}}
+		keys %{$loc_con_err_counts{$loc_con}}) # the errors seen in this context, most frequent first
+  {
+    printf OUT "  %s : %d times\n", $err_desc{$err},
+      $loc_con_err_counts{$loc_con}{$err} ; # reuses the numbered description stored while printing the specific errors
+  }
+
+  printf OUT "\n" ;
+}
+
+close GOLD ; # all three handles are closed once the report is complete
+close SYS ;
+
+close OUT ;
diff --git a/bist_parser/bmstparser/src/utils.py b/bist_parser/bmstparser/src/utils.py
new file mode 100644
index 0000000..901e3b5
--- /dev/null
+++ b/bist_parser/bmstparser/src/utils.py
@@ -0,0 +1,93 @@
+from collections import Counter
+import re
+
+
+class ConllEntry:
+    """One token of a CoNLL sentence: its fields, original head/relation and prediction slots."""
+    def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
+        self.id, self.form = id, form
+        self.norm = normalize(form)  # normalized form, as produced by normalize()
+        # both tag fields are upper-cased on construction
+        self.cpos, self.pos = cpos.upper(), pos.upper()
+        # head index and dependency relation as passed in by the caller
+        self.parent_id, self.relation = parent_id, relation
+        # prediction slots start out empty
+        self.pred_parent_id = None
+        self.pred_relation = None
+
+
+def vocab(conll_path):
+    """
+    Create the vocabulary from a CoNLL file.
+    :param conll_path: the path of the CoNLL file to read
+    :return: the words count, a word-to-id mapping, a list of pos count keys, a list of rel count keys
+    """
+
+    # The counting itself lives in vocab_conll so that the file-based and the
+    # in-memory entry points cannot drift apart. read_conll yields sentences
+    # lazily, so it has to be consumed while the file is still open, which is
+    # why the call sits inside the with-block.
+    with open(conll_path, 'r') as conllFP:
+        return vocab_conll(read_conll(conllFP))
+
+
+def vocab_conll(conll_entries):
+    """
+    Build the vocabulary from CoNLL sentences that are already in memory.
+    :param conll_entries: a list of sentences, each a list of CoNLL entries
+    :return: the word counter, a word-to-id dict, the distinct pos tags, the distinct relations
+    """
+    word_counts, pos_counts, rel_counts = Counter(), Counter(), Counter()
+
+    for entries in conll_entries:
+        # one pass per attribute over the same sentence
+        word_counts.update(entry.norm for entry in entries)
+        pos_counts.update(entry.pos for entry in entries)
+        rel_counts.update(entry.relation for entry in entries)
+
+    # ids are handed out in the word counter's iteration order
+    word2id = {word: idx for idx, word in enumerate(word_counts)}
+    return word_counts, word2id, list(pos_counts), list(rel_counts)
+
+
+def read_conll(fh):
+    """Yield each sentence of a CoNLL file handle as a list of ConllEntry objects led by a root entry."""
+    root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', -1, 'rroot')
+    sentence = [root]  # every sentence starts with this same root object
+    for row in fh:
+        cols = row.strip().split()
+        if cols:
+            sentence.append(ConllEntry(int(cols[0]), cols[1], cols[4], cols[3], -1 if cols[6] == '_' else int(cols[6]), cols[7]))
+            continue
+        if len(sentence) > 1: yield sentence  # a blank line closes the current sentence, if it has any token
+        sentence = [root]
+    if len(sentence) > 1: yield sentence  # last sentence when the input does not end with a blank line
+
+
+def write_conll(fn, conll_gen):
+    """Write each sentence's predicted head and relation to fn, one tab-separated row per token."""
+    with open(fn, 'w') as out:
+        for sent in conll_gen:
+            for tok in sent[1:]:  # the first entry of every sentence is not written
+                out.write('\t'.join((str(tok.id), tok.form, '_', tok.cpos, tok.pos, '_', str(tok.pred_parent_id), tok.pred_relation, '_', '_')) + '\n')
+            out.write('\n')
+
+
+def write_original_conll(fn, conll_original):
+    """
+    Dump sentences to a CoNLL file using their original (non-predicted) head and relation.
+    :param fn: the path of the output file
+    :param conll_original: the sentences to write; the first entry of each sentence is skipped
+    """
+    with open(fn, 'w') as out:
+        for sent in conll_original:
+            for tok in sent[1:]:
+                cols = (str(tok.id), tok.form, '_', tok.cpos, tok.pos, '_', str(tok.parent_id), tok.relation, '_', '_')
+                out.write('\t'.join(cols) + '\n')
+            out.write('\n')  # blank line between sentences
+
+
+numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+")  # applied with match(), i.e. anchored at the start of the word only
+def normalize(word):
+    return word.lower() if numberRegex.match(word) is None else 'NUM'
+
diff --git a/constants.py b/constants.py
new file mode 100644
index 0000000..62c3d85
--- /dev/null
+++ b/constants.py
@@ -0,0 +1,39 @@
+"""
+Constants that are shared across files.
+"""
+
+NEG_ID = 0  # the negative sentiment id
+POS_ID = 1  # the positive sentiment id
+NEU_ID = 2  # the neutral sentiment id
+
+# feature-related constants: names of the feature sets, similarity functions and diversity features
+FEATURE_SETS = ['similarity', 'topic_similarity', 'word_embedding_similarity',
+                'diversity']
+SIMILARITY_FUNCTIONS = ['jensen-shannon', 'renyi', 'cosine', 'euclidean',
+                        'variational', 'bhattacharyya']
+DIVERSITY_FEATURES = ['num_word_types', 'type_token_ratio', 'entropy',
+                      'simpsons_index', 'quadratic_entropy', 'renyi_entropy']
+
+# task-related constants: task names, the target domains per task family, and per-task lookups
+POS = 'pos'
+POS_BILSTM = 'pos_bilstm'
+SENTIMENT = 'sentiment'
+PARSING = 'parsing'
+TASKS = [POS, POS_BILSTM, SENTIMENT, PARSING]
+POS_PARSING_TRG_DOMAINS = ['answers', 'emails', 'newsgroups', 'reviews', 'weblogs', 'wsj']
+SENTIMENT_TRG_DOMAINS = ['books', 'dvd', 'electronics', 'kitchen']
+TASK2TRAIN_EXAMPLES = {  # one integer per task (presumably the number of training examples -- confirm at the call sites)
+    POS: 2000, POS_BILSTM: 2000, SENTIMENT: 1600, PARSING: 2000
+}
+TASK2DOMAINS = {  # each task mapped to one of the two target-domain lists above
+    POS: POS_PARSING_TRG_DOMAINS, POS_BILSTM: POS_PARSING_TRG_DOMAINS,
+    SENTIMENT: SENTIMENT_TRG_DOMAINS, PARSING: POS_PARSING_TRG_DOMAINS
+}
+
+# method-related constants: method names; BASELINES lists every method except BAYES_OPT
+BAYES_OPT = 'bayes-opt'
+RANDOM = 'random'
+MOST_SIMILAR_DOMAIN = 'most-similar-domain'
+MOST_SIMILAR_EXAMPLES = 'most-similar-examples'
+ALL_SOURCE_DATA = 'all-source-data'
+BASELINES = [RANDOM, MOST_SIMILAR_DOMAIN, MOST_SIMILAR_EXAMPLES, ALL_SOURCE_DATA]
diff --git a/data_utils.py b/data_utils.py
new file mode 100644
index 0000000..ad50b57
--- /dev/null
+++ b/data_utils.py
@@ -0,0 +1,470 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Utility methods for loading and processing data.
+"""
+
+import os
+import codecs
+from collections import Counter
+import itertools
+import operator
+
+import numpy as np
+import scipy.sparse
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from constants import NEG_ID, POS_ID
+from simpletagger import read_conll_file
+
+from constants import SENTIMENT, POS, POS_BILSTM, PARSING, \
+    SENTIMENT_TRG_DOMAINS, POS_PARSING_TRG_DOMAINS
+from bist_parser.bmstparser.src.utils import read_conll
+
+
+class Vocab:
+    """
+    The vocabulary class. Stores the word-to-id and id-to-word mappings.
+    """
+    def __init__(self, max_vocab_size, vocab_path):
+        self.max_vocab_size = max_vocab_size  # upper bound on the number of words kept
+        self.vocab_path = vocab_path  # file the vocabulary is read from / written to
+        self.size = 0  # number of words; stays 0 until load() or create() has run
+        self.word2id = {}
+        self.id2word = {}
+
+    def load(self):
+        """
+        Loads the vocabulary from the vocabulary path (one tab-separated "word, id" pair per line).
+        """
+        assert self.size == 0, 'Vocabulary has already been loaded or built.'
+        print('Reading vocabulary from %s...' % self.vocab_path)
+        with codecs.open(self.vocab_path, 'r', encoding='utf-8') as f:
+            for i, line in enumerate(f):
+                if i >= self.max_vocab_size:
+                    print('Vocab in file is larger than max vocab size. '
+                          'Only using top %d words.' % self.max_vocab_size)
+                    break
+                word, idx = line.split('\t')  # same format that create() writes
+                self.word2id[word] = int(idx.strip())
+        self.size = len(self.word2id)
+        self.id2word = {index: word for word, index in self.word2id.items()}
+        assert self.size <= self.max_vocab_size, \
+            'Loaded vocab is of size %d., max vocab size is %d.' % (
+                self.size, self.max_vocab_size)
+
+    def create(self, texts, lowercase=True):
+        """
+        Creates the vocabulary from the most common words and stores it at the vocabulary path.
+        :param texts: a list of lists of tokens
+        :param lowercase: lowercase the input texts
+        """
+        assert self.size == 0, 'Vocabulary has already been loaded or built.'
+        print('Building the vocabulary...')
+        if lowercase:
+            print('Lower-casing the input texts...')
+            texts = [[word.lower() for word in text] for text in texts]
+
+        word_counts = Counter(itertools.chain(*texts))
+
+        # get the n most common words (n = max_vocab_size)
+        most_common = word_counts.most_common(n=self.max_vocab_size)
+
+        # construct the word to index mapping (index = frequency rank, 0 = most common)
+        self.word2id = {word: index for index, (word, count)
+                        in enumerate(most_common)}
+        self.id2word = {index: word for word, index in self.word2id.items()}
+
+        print('Writing vocabulary to %s...' % self.vocab_path)
+        with codecs.open(self.vocab_path, 'w', encoding='utf-8') as f:
+            for word, index in sorted(self.word2id.items(),
+                                      key=operator.itemgetter(1)):
+                f.write('%s\t%d\n' % (word, index))
+        self.size = len(self.word2id)
+
+
+def get_all_docs(domain_data_pairs, unlabeled=True):
+    """
+    Return all labeled and unlabeled documents of multiple domains.
+    :param domain_data_pairs: a list of (domain, (labeled_reviews, labels,
+                              unlabeled_reviews)) tuples as obtained by
+                              domain2data.items()
+    :param unlabeled: whether unlabeled documents should be incorporated
+    :return: a list containing the documents from all domains, the corresponding
+             labels, and a list containing the domain of each example
+    """
+    docs, labels, domains = [], [], []
+    for domain, (labeled_docs, doc_labels, unlabeled_docs) in domain_data_pairs:
+        length_of_docs = 0
+        if not scipy.sparse.issparse(labeled_docs):
+            # if the labeled documents are a plain list (i.e. not a sparse
+            # tf-idf matrix), we can just flatten them into one list
+            docs += labeled_docs
+            length_of_docs += len(labeled_docs)
+            if unlabeled and unlabeled_docs is not None:
+                # if specified, we add the unlabeled documents
+                docs += unlabeled_docs
+                length_of_docs += len(unlabeled_docs)
+        else:
+            # if it is a sparse matrix, we just append the docs as a list and
+            # then stack the list in the end
+            docs.append(labeled_docs)
+            length_of_docs += labeled_docs.shape[0]
+            if unlabeled and unlabeled_docs is not None:
+                docs.append(unlabeled_docs)
+                length_of_docs += unlabeled_docs.shape[0]
+        labels.append(doc_labels)
+
+        # we just add the corresponding domain for each document so that we can
+        # later see where the docs came from; must stay aligned with docs
+        domains += [domain] * length_of_docs
+    if scipy.sparse.issparse(labeled_docs):
+        # finally, if the matrix was sparse, we can stack the documents together
+        docs = scipy.sparse.vstack(docs)
+    return docs, np.hstack(labels), domains
+
+
+def get_tfidf_data(domain2data, vocab):
+    """
+    Transform the tokenized documents of each domain into a tf-idf matrix.
+    :param domain2data: the mapping of domains to a (tokenized_reviews, labels,
+                        tokenized_unlabeled_reviews) tuple
+    :param vocab: the Vocabulary class
+    :return: a mapping of domains to a [labeled_tfidf_matrix, labels,
+             unlabeled_tfidf_matrix] list; the matrices are scipy.sparse.csr_matrix
+             of shape (num_examples, vocab_size); the unlabeled one is None if empty
+    """
+    domain2tfidf_data = {}
+    for domain, (labeled_examples, labels, unlabeled_examples) in domain2data.items():
+
+        # apply the vectorizer to the already tokenized and pre-processed input
+        vectorizer = TfidfVectorizer(vocabulary=vocab.word2id,
+                                     tokenizer=lambda x: x,
+                                     preprocessor=lambda x: x)
+
+        # fit the vectorizer to both labeled and unlabeled examples but keep
+        # the transformed examples separate
+        vectorizer.fit(labeled_examples + unlabeled_examples)
+        tfidf_labeled_examples = vectorizer.transform(labeled_examples)
+
+        # note: we cap unlabeled examples at 100k (only relevant for the books
+        # domain in the large-scale setting); the fit above still saw all of them
+        unlabeled_examples = unlabeled_examples[:100000]
+        tfidf_unlabeled_examples = vectorizer.transform(unlabeled_examples) \
+            if len(unlabeled_examples) != 0 else None
+        assert isinstance(tfidf_labeled_examples, scipy.sparse.csr_matrix),\
+            'The input is not a sparse matrix.'
+        assert isinstance(labels, np.ndarray), 'Labels are not a numpy array.'
+        domain2tfidf_data[domain] = [tfidf_labeled_examples, labels,
+                                     tfidf_unlabeled_examples]
+    return domain2tfidf_data
+
+  
+def log_to_file(log_file, run_dict, trg_domain, args):
+    """
+    Append the aggregated results of experiment runs to a log file.
+    :param log_file: the file used for logging
+    :param run_dict: a dictionary mapping a method name to a list of
+                     (val_accuracy, test_accuracy) tuples or a list
+                     of (val_accuracy, test_accuracy, best_feature_weight)
+                     tuples for the bayes-opt method
+    :param trg_domain: the target domain
+    :param args: the arguments used as input to the script
+    """
+    with open(log_file, 'a') as f:
+        for method, scores in run_dict.items():
+            best_feature_weights = ''  # stays empty for methods without learned weights
+            if len(scores) == 0:
+                continue  # nothing was run for this method
+            if method.startswith('bayes-opt'):
+                val_accuracies, test_accuracies, best_feature_weights = \
+                    zip(*scores)
+            else:
+                val_accuracies, test_accuracies = zip(*scores)
+            mean_val, std_val = np.mean(val_accuracies), np.std(val_accuracies)
+            mean_test, std_test = np.mean(test_accuracies),\
+                                  np.std(test_accuracies)
+            # one tab-separated line per method: target domain, method, feature sets, mean (+-std) val, mean (+-std) test, [val scores], [test scores], feature weights, all args
+            f.write('%s\t%s\t%s\t%.4f (+-%.4f)\t%.4f (+-%.4f)\t[%s]\t[%s]\t%s\t'
+                    '%s\n'
+                    % (trg_domain, method, ' '.join(args.feature_sets),
+                       mean_val, std_val, mean_test, std_test,
+                       ', '.join(['%.4f' % v for v in val_accuracies]),
+                       ', '.join(['%.4f' % t for t in test_accuracies]),
+                       str(list(best_feature_weights)),
+                       ' '.join(['%s=%s' % (arg, str(getattr(args, arg)))
+                                 for arg in vars(args)])))
+
+
+def read_feature_weights_file(feature_weights_path):
+    """
+    Reads a manually created file containing the learned feature weights for
+    some task, trg domain, and feature set and yields them; blank lines are skipped.
+    The file format is this (note that ~ is used as delimiter to avoid clash
+    with other delimiters in the feature sets):
+    books~similarity diversity~[0.0, -0.66, -0.66, 0.66, 0.66, -0.66, 0.66, 0.0, 0.0, -0.66, 0.66, 0.66]
+    ...
+    :param feature_weights_path: the path to the feature weights file
+    :return: a generator of tuples (feature_weights_domain, feature_set, feature_weights)
+    """
+    print('Reading feature weights from %s...' % feature_weights_path)
+    with open(feature_weights_path, 'r') as f:
+        for line in f:
+            if not line.strip():
+                continue  # tolerate blank (e.g. trailing) lines in the hand-written file
+            feature_weights_domain, feature_set, weights_str = line.split('~')
+            feature_weights = [float(weight) for weight
+                               in weights_str.strip('[]\n').split(', ')]
+            print('Feature weights domain: %s. Feature set: %s. '
+                  'Feature weights: %s' %
+                  (feature_weights_domain, feature_set, str(feature_weights)))
+            yield feature_weights_domain, feature_set, feature_weights
+
+
+def task2read_data_func(task):
+    """Maps a task name to the function that reads the data of that task."""
+    task2func = {SENTIMENT: read_processed, PARSING: read_parsing_data,
+                 POS: read_tagging_data, POS_BILSTM: read_tagging_data}
+    if task in task2func:
+        return task2func[task]
+    # both tagging tasks share one reader; any task without an entry above
+    # is rejected
+    raise ValueError(
+        'No data reading function available for task %s.' % task)
+
+
+# =============== sentiment data functions =======
+
+def read_processed(dir_path):
+    """
+    Reads the processed files in the processed_acl directory.
+    :param dir_path: the directory containing the processed_acl folder
+    :return: a dictionary that maps domains to a list of
+             [labeled_reviews, labels, unlabeled_reviews]; labeled_reviews is
+             a list of reviews where each review is a list of (unordered)
+             ngrams; labels is a numpy array of label ids of shape (num_labels);
+             unlabeled_reviews has the same format as labeled_reviews
+    """
+    domains_path = os.path.join(dir_path, 'processed_acl')
+    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
+                                          domains_path)
+    domains = os.listdir(domains_path)
+    assert set(domains) == set(SENTIMENT_TRG_DOMAINS)
+    domain2data = {domain: [[], [], None] for domain in domains}
+    for domain in domains:
+        print('Processing %s...' % domain)
+        # file names are positive.review, negative.review, and unlabeled.review
+        # positive and negative each contain 2k examples;
+        # unlabeled contains ~4k examples
+        splits = ['positive', 'negative', 'unlabeled']
+        for split in splits:
+            print('Processing %s/%s...' % (domain, split), end='')
+            file_path = os.path.join(domains_path, domain, '%s.review' % split)
+            assert os.path.exists(file_path), '%s does not exist.' % file_path
+            reviews = []
+            with open(file_path, encoding='utf-8') as f:
+                for line in f:
+                    # get the pre-processed features; these are a white-space
+                    # separated list of unigram/bigram occurrence counts in
+                    # the document, e.g. "must:1", "still_has:1"; the last field is dropped
+                    features = line.split(' ')[:-1]
+
+                    # convert the features to a sequence (note: order does not
+                    # matter here); we do this to be able to later use the
+                    # same post-processing as for data from other sources
+                    review = []
+                    for feature in features:
+                        ngram, count = feature.rsplit(':', 1)  # split on the last ':' so ngrams may contain ':'
+                        for _ in range(int(count)):
+                            review.append(ngram)
+
+                    # add the review to the reviews
+                    reviews.append(review)
+
+            # the domain2data dict maps a domain to a list of
+            # [reviews, labels, unlabeled_reviews]
+            if split == 'unlabeled':
+                # add the unlabeled reviews at the third position of the list
+                domain2data[domain][2] = reviews
+            else:
+                # add labels with the same polarity as the file
+                domain2data[domain][0] += reviews
+                domain2data[domain][1] += [sentiment2id(split)] * len(reviews)
+
+            print(' Processed %d reviews.' % len(reviews))
+        domain2data[domain][1] = np.array(domain2data[domain][1])
+    return domain2data
+
+
+def sentiment2id(sentiment):
+    """
+    Converts a sentiment name to its label id.
+    :param sentiment: the sentiment; one of [positive, pos, negative, neg]
+    :return: POS_ID for a positive and NEG_ID for a negative sentiment
+    """
+    if sentiment == 'positive' or sentiment == 'pos':
+        return POS_ID
+    if sentiment == 'negative' or sentiment == 'neg':
+        return NEG_ID
+    raise ValueError('%s is not a valid sentiment.' % sentiment)
+
+
+# =============== tagging data functions ======
+
+def read_tagging_data(dir_path, top_k_unlabeled=2000):
+    """
+    Reads the CoNLL tagging files in the gweb_sancl/pos directory. Outputs the
+    documents as list of lists with tokens and lists of corresponding tags.
+    The domains are answers, emails, newsgroups, reviews, weblogs, wsj and
+    the corresponding files are called gweb-{domain}-{dev|test}.conll in folder
+    gweb_sancl/pos/{domain}
+    :param dir_path: the path to the directory gweb_sancl
+    :param top_k_unlabeled: only use the top k unlabeled examples
+    :return: a dictionary that maps domains to a list of [labeled_examples,
+             labels, unlabeled_examples]; labeled_examples is a list of
+             sentences where each sentence is a list of tokens; labels is a
+             numpy object array with the tags of each sentence; unlabeled_examples
+             has the same format as labeled_examples
+    """
+    domains_path = os.path.join(dir_path, 'pos')
+    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
+                                         domains_path)
+    domains = [d for d in os.listdir(domains_path)]
+    print(domains)
+    assert set(domains) == set(POS_PARSING_TRG_DOMAINS)
+    domain2data = {domain: [[], [], None] for domain in domains}
+    for domain in domains:
+        print('Processing %s...' % domain)
+        # file names are pos/{domain}/gweb-{domain}-{dev|test}.conll
+        splits = ['dev', 'test', 'unlabeled']
+        for split in splits:
+            print('Processing %s/%s...' % (domain, split), end='')
+
+            if split == 'unlabeled':
+                file_path = os.path.join(dir_path, 'unlabeled',
+                                         'gweb-%s.unlabeled.txt' % (domain))
+                assert os.path.exists(file_path), ('%s does not exist.' %
+                                                   file_path)
+                unlabeled_data = []
+                print(file_path)
+                with open(file_path,'rb') as f:
+                    for line in f:
+                        line = line.decode('utf-8','ignore').strip().split()
+                        unlabeled_data.append(line)
+                # add the unlabeled sentences at the third position of the list
+                print('Read %s number of unlabeled sentences'
+                      % len(unlabeled_data))
+
+                unlabeled_data = unlabeled_data[:top_k_unlabeled]
+                print('Took top {} documents '.format(top_k_unlabeled))
+                domain2data[domain][2] = unlabeled_data
+            else:
+
+                file_path = os.path.join(domains_path, domain,
+                                         'gweb-%s-%s.conll' % (domain, split))
+                assert os.path.exists(file_path), ('%s does not exist.' %
+                                                   file_path)
+
+                data = list(read_conll_file(file_path))
+                words = [words for words, tags in data]
+                tags = [tags for words, tags in data]
+                domain2data[domain][0] += words
+                domain2data[domain][1] += tags
+            num_read = len(unlabeled_data) if split == 'unlabeled' else len(data)  # report the split just read, not stale data
+            print(' Processed %d sentences.' % num_read)
+        domain2data[domain][1] = np.array(domain2data[domain][1], dtype=object)  # tag lists are ragged; NumPy >= 1.24 rejects them without dtype=object
+    return domain2data
+
+
+# =============== parsing data functions ======
+
+def read_parsing_data(dir_path, top_k_unlabeled=2000):
+    """
+    Reads the CoNLL parsing files in the gweb_sancl/parse directory
+    :param dir_path: The gweb_sancl directory path.
+    :param top_k_unlabeled: only use the top k unlabeled examples
+    :return: a dictionary that maps domains to a list of [
+             labeled_conll_entries, pseudo_labels, unlabeled_conll_entries];
+             labeled_conll_entries is a list of CoNLLEntry containing the
+             word forms, annotations, and target labels to be used for
+             parsing; since each CoNLLEntry already contains the target label,
+             pseudo_labels only contains pseudo-labels; unlabeled_conll_entries
+             are used as unlabeled data
+    """
+    domains_path = os.path.join(dir_path, 'parse')
+    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
+                                          domains_path)
+    domains = [d for d in os.listdir(domains_path)]
+    print(domains)
+    assert set(domains) == set(POS_PARSING_TRG_DOMAINS)
+    domain2data = {domain: [[], [], None] for domain in domains}
+    for domain in domains:
+        print('Processing %s...' % domain)
+        # file names are parse/{domain}/gweb-{domain}-{dev|test}.conll
+        splits = ['dev', 'test', 'unlabeled']
+        for split in splits:
+            print('Processing %s/%s...' % (domain, split), end='')
+            if split == 'unlabeled':
+                file_path = os.path.join(dir_path, 'unlabeled',
+                                         'gweb-%s.unlabeled.txt' % (domain))
+                assert os.path.exists(file_path), ('%s does not exist.' %
+                                                   file_path)
+                unlabeled_data = []
+                with open(file_path,'rb') as f:
+                    for line in f:
+                        line = line.decode('utf-8','ignore').strip().split()
+                        unlabeled_data.append(line)
+
+                # add the unlabeled sentences at the third position of the list
+                print('Read %s number of unlabeled sentences' % len(unlabeled_data))
+
+                unlabeled_data = unlabeled_data[:top_k_unlabeled]
+                print('Took top {} documents '.format(top_k_unlabeled))
+                domain2data[domain][2] = unlabeled_data
+            else:
+                if domain == 'wsj' and split == 'test':
+                    # the wsj test file uses the ontonotes- prefix instead of gweb-
+                    file_path = os.path.join(domains_path, domain,
+                                             'ontonotes-%s-%s.conll'
+                                             % (domain, split))
+                else:
+                    file_path = os.path.join(domains_path, domain,
+                                             'gweb-%s-%s.conll' % (domain, split))
+                assert os.path.exists(file_path), ('%s does not exist.' %
+                                                   file_path)
+
+                with open(file_path, 'r', encoding='utf-8') as conll_file_path:  # explicit UTF-8, like the other readers here
+                    data = list(read_conll(conll_file_path))
+                domain2data[domain][0] += data
+
+                # add pseudo-labels since the model doesn't use explicit
+                # labels for training
+                domain2data[domain][1] += [0] * len(data)
+        domain2data[domain][1] = np.array(domain2data[domain][1])
+    return domain2data
+
+
+def read_parsing_evaluation(evaluation_file_path):
+    """
+    Read the labeled attachment score, unlabeled attachment score, and label
+    accuracy score from a file produced by the parsing evaluation perl
+    script. The beginning of the file looks like this:
+    Labeled   attachment score: 6995 / 9615 * 100 = 72.75 %
+    Unlabeled attachment score: 7472 / 9615 * 100 = 77.71 %
+    Label accuracy score:       8038 / 9615 * 100 = 83.60 %
+    ...
+    :param evaluation_file_path: the path of the evaluation file produced by the perl script
+    :return: the labeled attachment score, the unlabeled attachment score, and the label accuracy score (all 0.0 if the file is missing or malformed)
+    """
+    try:
+        with open(evaluation_file_path, 'r') as f:
+            lines = f.readlines()
+            las = float(lines[0].split('=')[1].strip('% \n'))
+            uas = float(lines[1].split('=')[1].strip('% \n'))
+            acc = float(lines[2].split('=')[1].strip('% \n'))
+    except (OSError, IndexError, ValueError):  # unreadable file, too few lines / no '=', or a non-numeric score
+        las = 0.0
+        uas = 0.0
+        acc = 0.0
+    return las, uas, acc
diff --git a/similarity.py b/similarity.py
new file mode 100644
index 0000000..fdf91a0
--- /dev/null
+++ b/similarity.py
@@ -0,0 +1,342 @@
+"""
+Methods for measuring domain similarity according to different metrics based on
+different representations.
+"""
+
+import os
+
+from sklearn.feature_extraction.text import CountVectorizer
+import gensim
+
+import numpy as np
+import scipy.stats
+import scipy.spatial.distance
+
+
+# SIMILARITY MEASURES
+
+def jensen_shannon_divergence(repr1, repr2):
+    """Calculates the Jensen-Shannon similarity, i.e. 1 - JS divergence (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence)."""
+    avg_repr = 0.5 * (repr1 + repr2)
+    sim = 1 - 0.5 * (scipy.stats.entropy(repr1, avg_repr) + scipy.stats.entropy(repr2, avg_repr))
+    if not np.isfinite(sim):
+        # the similarity is nan/inf if no term in the document is in the vocabulary (all-zero representation)
+        return 0
+    return sim
+
+
+def renyi_divergence(repr1, repr2, alpha=0.99):
+    """Returns the Renyi divergence of order alpha (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R.C3.A9nyi_divergence)."""
+    ratios = [np.power(p, alpha) / np.power(q, alpha-1) for (p, q) in zip(repr1, repr2)]
+    divergence = 1 / (alpha - 1) * np.log(np.sum(ratios))
+    # an infinite value is mapped to 0 (original note: the similarity is -inf
+    # if no term in the document is in the vocabulary)
+    return 0 if np.isinf(divergence) else divergence
+
+
+def cosine_similarity(repr1, repr2):
+    """Returns the cosine similarity of two vectors (https://en.wikipedia.org/wiki/Cosine_similarity)."""
+    if repr1 is None or repr2 is None:
+        return 0
+    assert np.isfinite(repr2).all()
+    assert np.isfinite(repr1).all()
+    cos_dist = scipy.spatial.distance.cosine(repr1, repr2)
+    sim = 1 - cos_dist
+    # the similarity is nan if no term in the document is in the vocabulary;
+    # 0 is returned in that case
+    return 0 if np.isnan(sim) else sim
+
+
+def euclidean_distance(repr1, repr2):
+    """Returns the Euclidean (L2) distance of two vectors (https://en.wikipedia.org/wiki/Euclidean_distance)."""
+    squared_diffs = [np.power(p-q, 2) for (p, q) in zip(repr1, repr2)]
+    return np.sqrt(np.sum(squared_diffs))
+
+
+def variational_distance(repr1, repr2):
+    """Returns the variational distance, also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry)."""
+    abs_diffs = [np.abs(p-q) for (p, q) in zip(repr1, repr2)]
+    return np.sum(abs_diffs)
+
+
+def kl_divergence(repr1, repr2):
+    """Returns the Kullback-Leibler divergence of repr1 from repr2 (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)."""
+    divergence = scipy.stats.entropy(repr1, repr2)
+    return divergence
+
+
+def bhattacharyya_distance(repr1, repr2):
+    """Returns the Bhattacharyya distance of two distributions (https://en.wikipedia.org/wiki/Bhattacharyya_distance)."""
+    coefficient = np.sum([np.sqrt(p*q) for (p, q) in zip(repr1, repr2)])
+    distance = - np.log(coefficient)
+    assert not np.isnan(distance), 'Error: Similarity is nan.'
+    # the distance is infinite if the summed coefficient is 0, e.g. if no
+    # term in the review is in the vocabulary; 0 is returned in that case
+    return 0 if np.isinf(distance) else distance
+
+
+def similarity_name2value(s_name, repr1, repr2):
+    """Looks up the similarity function with the given name and returns its value for the two representations."""
+    name2function = {
+        'jensen-shannon': jensen_shannon_divergence,
+        'renyi': renyi_divergence,
+        'cos': cosine_similarity,
+        'cosine': cosine_similarity,
+        'euclidean': euclidean_distance,
+        'variational': variational_distance,
+        'kl': kl_divergence,
+        'bhattacharyya': bhattacharyya_distance,
+    }
+    # 'cos' and 'cosine' are aliases for the same measure; any name without
+    # an entry above is rejected
+    if s_name not in name2function:
+        raise ValueError('%s is not a valid feature name.' % s_name)
+    return name2function[s_name](repr1, repr2)
+
+
+# TERM DISTRIBUTIONS
+
+def get_domain_term_dists(term_dist_path, domain2data, vocab, lowercase=True):
+    """
+    Retrieves relative term distributions from the provided domains.
+    :param term_dist_path: the path where the term distributions of the domains
+                           should be saved (they are loaded from it if it exists)
+    :param domain2data: the mapping of domains to (labeled_examples, labels,
+                        unlabeled_examples) tuples
+    :param vocab: the Vocabulary object
+    :param lowercase: lower-case the input data
+    :return: a mapping of domains to their term distributions,
+             i.e. a numpy array of shape (vocab_size,)
+    """
+    domain2term_dist = {}
+    if os.path.exists(term_dist_path):
+        print('Loading the term distributions from file...')
+        with open(term_dist_path, 'r') as f:
+            for line in f:
+                domain, term_dist = line.strip().split('\t')
+                term_dist = np.fromstring(term_dist, count=vocab.size, sep=' ')
+                assert len(term_dist) == vocab.size,\
+                    ('Length of term dist for %s should be %d, is %d.' %
+                     (domain, vocab.size, len(term_dist)))
+                assert np.round(np.sum(term_dist), 6) == 1,\
+                    ('Sum of term distribution is %.6f instead of 1. The '
+                     'vocabulary was likely created with a larger '
+                     'max_vocab_size.' % np.sum(term_dist))
+                domain2term_dist[domain] = term_dist
+        assert set(domain2term_dist.keys()) == set(domain2data.keys()),\
+            ('Term distributions are not saved for all domains: "%s" and "%s"'
+             ' are not equal.' % (' '.join(domain2term_dist.keys()),
+                                  ' '.join(domain2data.keys())))
+        return domain2term_dist
+
+    if lowercase:
+        print('Lower-casing the data for calculating the term distributions...')
+
+    # get the term domain counts for the term distributions
+    for domain, (examples, _, unlabeled_examples) in domain2data.items():
+        domain2term_dist[domain] = get_term_dist(
+            examples + unlabeled_examples, vocab, lowercase)
+
+    print('Writing relative frequency distributions to %s...' % term_dist_path)
+    with open(term_dist_path, 'w') as f:
+        for domain, term_dist in domain2term_dist.items():
+            f.write('%s\t%s\n' % (domain, ' '.join([str(c) for c in term_dist])))
+    return domain2term_dist
+
+
+def get_term_dist(docs, vocab, lowercase=True):
+    """
+    Calculates the term distribution of a list of documents.
+    :param docs: a list of tokenized docs; can also contain a single document
+    :param vocab: the Vocabulary object
+    :param lowercase: lower-case the input data
+    :return: the term distribution of the input documents,
+             i.e. a numpy array of shape (vocab_size,)
+    """
+    term_dist = np.zeros(vocab.size)
+    for doc in docs:
+        for word in doc:
+            if lowercase:
+                word = word.lower()
+            if word in vocab.word2id:
+                term_dist[vocab.word2id[word]] += 1
+
+    # normalize absolute freqs to obtain a relative frequency term distribution
+    total_count = np.sum(term_dist)
+    if total_count == 0:
+        # no word of the docs is in the vocabulary (or docs is empty): return
+        # the all-zero vector instead of dividing by zero
+        return term_dist
+    return term_dist / total_count
+
+
+def get_most_similar_domain(trg_domain, domain2term_dists,
+                            similarity_name='jensen-shannon'):
+    """
+    Given a target domain, retrieve the other domain with the highest score
+    under the chosen measure (default: Jensen-Shannon). NOTE: a higher score
+    is always treated as more similar, also when a distance measure is chosen.
+    :param trg_domain: the target domain
+    :param domain2term_dists: a mapping of domain names to their term distribution
+                              (a numpy array of shape (vocab_size,) )
+    :param similarity_name: a string indicating the name of the similarity
+                            measure used (default: 'jensen-shannon')
+    :return: the domain most similar to the target domain (None if there is no other domain)
+    """
+    highest_sim_score, most_similar_domain = float('-inf'), None  # -inf so that scores <= 0 can still win
+    trg_term_dist = domain2term_dists[trg_domain]
+    for domain, src_term_dist in domain2term_dists.items():
+        if domain == trg_domain:
+            continue
+        sim_score = similarity_name2value(similarity_name, src_term_dist, trg_term_dist)
+        if sim_score > highest_sim_score:
+            highest_sim_score, most_similar_domain = sim_score, domain
+    return most_similar_domain
+
+
+# TOPIC DISTRIBUTIONS
+
+def train_topic_model(examples, vocab, num_topics=50, num_iterations=2000,
+                      num_passes=10):
+    """
+    Trains an LDA topic model on the provided list of tokenised documents and
+    returns the vectorizer used for the transformation and the trained LDA
+    model.
+    :param examples: a list of tokenised documents of all domains
+    :param vocab: the Vocabulary object
+    :param num_topics: the number of topics that should be used
+    :param num_iterations: the number of iterations
+    :param num_passes: the number of passes over the corpus that should be
+                       performed
+    :return: the CountVectorizer used for transforming the corpus and the
+             trained LDA topic model
+    """
+    # the text is already tokenized and pre-processed; we only need to
+    # transform it to vectors (tokenizer and preprocessor are identity functions)
+    vectorizer = CountVectorizer(vocabulary=vocab.word2id,
+                                 tokenizer=lambda x: x,
+                                 preprocessor=lambda x: x)
+    lda_corpus = vectorizer.fit_transform(examples)
+
+    # the gensim LDA implementation requires a sparse corpus, so the count
+    # matrix is wrapped; we could also use sci-kit learn instead
+    lda_corpus = gensim.matutils.Sparse2Corpus(lda_corpus,
+                                               documents_columns=False)  # NOTE(review): presumably because documents are rows here -- confirm against gensim docs
+    print('Training LDA model on data of all domains with %d topics, '
+          '%d iterations, %d passes...' % (num_topics, num_iterations,
+                                           num_passes))
+    lda_model = gensim.models.LdaMulticore(
+        lda_corpus, num_topics=num_topics, id2word=vocab.id2word,
+        iterations=num_iterations, passes=num_passes)
+    return vectorizer, lda_model
+
+
+def get_topic_distributions(examples, vectorizer, lda_model):
+    """
+    Compute the topic distribution of every document in a collection.
+    :param examples: a list of tokenised documents
+    :param vectorizer: the CountVectorizer used for transforming the documents
+    :param lda_model: the trained LDA model
+    :return: a numpy array with one row of topic probabilities per example
+             (shape (num_examples, num_topics) according to the original
+             documentation)
+    """
+    bow_corpus = gensim.matutils.Sparse2Corpus(
+        vectorizer.transform(examples), documents_columns=False)
+
+    def topic_probabilities(doc):
+        # minimum_probability=0. is presumably there so that gensim does not
+        # filter out low-probability topics; only the probabilities are kept
+        topics = lda_model.get_document_topics(doc, minimum_probability=0.)
+        return [topic_prob for _, topic_prob in topics]
+
+    return np.array([topic_probabilities(doc) for doc in bow_corpus])
+
+
+# PRE-TRAINED WORD EMBEDDINGS METHODS
+
+def load_word_vectors(file, vocab_word_vec_file, word2id, vector_size=300,
+                      header=False):
+    """
+    Load the word vectors of all vocabulary words from a text file, e.g. the
+    one obtained from http://nlp.stanford.edu/projects/glove/.
+
+    If vocab_word_vec_file already exists, the vectors are read from it and
+    the (usually much larger) file is not touched. Otherwise the vectors of
+    the words in word2id are extracted from file and written to
+    vocab_word_vec_file for later calls.
+    :param file: the file the word vectors should be loaded from
+    :param vocab_word_vec_file: the file where the word embeddings in the
+                                vocabulary can be stored for faster retrieval
+    :param word2id: the mapping of words to their ids in the vocabulary
+    :param vector_size: the size of the word vectors
+    :param header: whether the word vectors text file contains a header;
+                   default is False
+    :return a dictionary mapping each word to its numpy word vector
+    """
+    def split_entry(entry):
+        # an entry looks like "<word> <v1> <v2> ... <vn>"; return the word
+        # and the still unparsed, space-separated vector values
+        word, *values = entry.split(' ')
+        return word, ' '.join(values).strip()
+
+    word2vector = {}
+    if not os.path.exists(vocab_word_vec_file):
+        print('Reading word vectors from %s...' % file)
+        with open(file, 'r', encoding='utf-8') as f:
+            for line_no, line in enumerate(f):
+                if header and line_no == 0:
+                    continue
+                if line_no > 0 and line_no % 100000 == 0:
+                    print('Processed %d vectors.' % line_no)
+                word, values = split_entry(line)
+                if word in word2id:
+                    # only vectors of vocabulary words are parsed and kept
+                    vector = np.fromstring(values, dtype=float, sep=' ')
+                    assert len(vector) == vector_size
+                    word2vector[word] = vector
+
+        print('Writing word vectors to %s...' % vocab_word_vec_file)
+        with open(vocab_word_vec_file, 'w', encoding='utf-8') as f:
+            for word, vector in word2vector.items():
+                f.write('%s %s\n' % (word, ' '.join(map(str, vector))))
+        return word2vector
+
+    print('Loading vocabulary word vectors from %s...' % vocab_word_vec_file)
+    with open(vocab_word_vec_file, 'r', encoding='utf-8') as f:
+        for line in f:
+            word, values = split_entry(line)
+            assert word in word2id, ('Error: %s in vocab word vec file is '
+                                     'not in vocab.' % word)
+            vector = np.fromstring(values, dtype=float, sep=' ')
+            assert len(vector) == vector_size,\
+                ('Error: %d != vector size %d for word %s.'
+                 % (len(vector), vector_size, word))
+            word2vector[word] = vector
+    return word2vector
+
+
+def weighted_sum_of_embeddings(docs, word2id, word2vector, term_dist):
+    """
+    Get a weighted sum of embeddings representation for a list of documents
+    belonging to one domain. The documents are represented as a list of
+    ngrams. Also works if the list only contains a single document.
+
+    Every word that has an embedding contributes its vector scaled by
+    sqrt(t / p(word)); the sum is divided by the number of such words.
+    Documents without any known word are represented by the zero vector.
+    :param docs: a list of documents
+    :param word2id: the mapping of words to their ids in the vocabulary
+    :param word2vector: the mapping of words to their vector representations
+    :param term_dist: the term distribution of the data the words belong to;
+                      indexed with the word ids from word2id
+    :return: the vector representation of the provided list of documents as
+             a numpy array with one row per document
+    """
+    # the factor with which the word probability is smoothed, we empirically
+    # set this to the value used in Mikolov et al. (2013)
+    # NOTE(review): 10e-5 equals 1e-4, whereas the subsampling threshold
+    # usually quoted from Mikolov et al. is 1e-5 -- confirm which is intended
+    t = 10e-5
+    vector_size = None
+    word_embed_representations = []
+    for doc in docs:
+        if vector_size is None:
+            # the embedding size is the same for every document, so it is
+            # looked up only once instead of copying all vectors into a list
+            # for each document
+            vector_size = len(list(word2vector.values())[0])
+        doc_vector = np.zeros(vector_size)
+        word_vector_count = 0
+        for word in doc:
+            if word in word2vector:
+                vector = word2vector[word]
+
+                # weight the vector with the smoothed inverse probability of
+                # the word
+                doc_vector += np.sqrt(t / (term_dist[word2id[word]])) * vector
+                word_vector_count += 1
+        if word_vector_count == 0:
+            # this might be because the review is in another language by
+            # accident; set count to 1 to avoid division by 0
+            word_vector_count = 1
+        doc_vector /= word_vector_count
+        assert not (np.isnan(doc_vector).any() or np.isinf(doc_vector).any())
+        word_embed_representations.append(doc_vector)
+    return np.array(word_embed_representations)
diff --git a/simpletagger.py b/simpletagger.py
new file mode 100644
index 0000000..5b39a05
--- /dev/null
+++ b/simpletagger.py
@@ -0,0 +1,359 @@
+#!/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Simple structured perceptron tagger (bplank, parts by andersjo) - Language Proc 2
+import argparse
+import codecs
+from collections import defaultdict, Counter
+import json
+import re
+import numpy as np
+import sys
+import random
+
+np.set_printoptions(precision=4)
+
+
+def read_conll_file(file_name):
+    """
+    read in a file with format:
+    word1    tag1
+    ...      ...
+    wordN    tagN
+
+    Sentences MUST be separated by newlines! Word and tag are separated by a
+    single tab. Leading and repeated blank lines are ignored, i.e. no empty
+    instances are produced.
+
+    :param file_name: file to read in
+    :return: generator of instances ((list of  words, list of tags) pairs)
+    :raises ValueError: if a non-empty line does not consist of exactly two
+                        tab-separated fields
+    """
+    current_words = []
+    current_tags = []
+
+    # the context manager closes the file once the generator is exhausted or
+    # closed instead of leaving the handle open
+    with codecs.open(file_name, encoding='utf-8') as conll_file:
+        for line in conll_file:
+            line = line.strip()
+
+            if line:
+                word, tag = line.split('\t')
+                current_words.append(word)
+                current_tags.append(tag)
+
+            elif current_words:
+                # a blank line ends the current sentence; blank lines without
+                # a pending sentence are skipped as an empty instance cannot
+                # be decoded
+                yield (current_words, current_tags)
+                current_words = []
+                current_tags = []
+
+    # if file does not end in newline (it should...), check whether there is an instance in the buffer
+    if current_words:
+        yield (current_words, current_tags)
+
+
+def memoize(f):
+    """
+    helper function to be used as decorator to memoize features
+
+    The cache key consists of all positional arguments except the first one
+    (``self`` when decorating a method), so results are shared between all
+    instances. The remaining arguments must be hashable. The cache is never
+    emptied.
+    :param f: the function (method) whose results should be cached
+    :return: the caching wrapper around f
+    """
+    # local import so that this helper does not depend on the module's
+    # import block
+    import functools
+
+    memo = {}
+
+    # keep name and docstring of the decorated function
+    @functools.wraps(f)
+    def helper(*args):
+        key = tuple(args[1:])
+        if key not in memo:
+            # computed outside of an exception handler so that an error
+            # raised by f is not chained to a cache-miss KeyError
+            memo[key] = f(*args)
+        return memo[key]
+    return helper
+
+
+class StructuredPerceptron(object):
+    """
+    implements a structured perceptron as described in Collins 2002
+
+    The model keeps one weight per feature in ``feature_weights`` and the set
+    of tags seen during training in ``tags``. The best tag sequence of a
+    sentence is found with a Viterbi search over tag bigrams (``decode``).
+    """
+
+    def __init__(self, seed=1512141834):
+        """
+        initialize model
+        :param seed: seed for the ``random`` and ``numpy.random`` generators;
+                     note that this re-seeds the module-level generators
+        :return:
+        """
+        self.feature_weights = defaultdict(float)
+        self.tags = set()
+
+        self.START = "__START__"
+        self.END = "__END__"
+        print("using seed: {}".format(seed))
+        random.seed(seed)
+        np.random.seed(seed)
+
+    def fit(self, train_data, iterations=5, learning_rate=0.2):
+        """
+        iterate over instances to train the weight vector with the perceptron
+        update; afterwards ``feature_weights`` holds the sum of the weight
+        vectors obtained at the end of each iteration
+        :param train_data: the training instances as (words, tags) pairs
+        :param iterations: number of passes over the training data
+        :param learning_rate: step size of every weight update
+        :return:
+        """
+        # work on a private copy: the data is shuffled after every iteration
+        # and the caller's list must not be reordered as a side effect
+        train_data = list(train_data)
+        averaged_weights = Counter()
+
+        for iteration in range(iterations):
+            correct = 0
+            total = 0.0
+            sys.stderr.write('iteration %s\n************\n' % (iteration+1))
+
+            for i, (words, tags) in enumerate(train_data):
+                if i%100==0:
+                    sys.stderr.write('%s'%i)
+                elif i%10==0:
+                    sys.stderr.write('.')
+
+                # tags have to be known before decoding the instance
+                self.tags.update(tags)
+
+                # get prediction
+                prediction = self.decode(words)
+
+                # derive global features
+                global_gold_features = self.get_global_features(words, tags)
+                global_prediction_features = self.get_global_features(words, prediction)
+
+                # update weight vector: reward the gold features, penalise
+                # the predicted ones (cancels out for a correct prediction)
+                for fid, count in global_gold_features.items():
+                    self.feature_weights[fid] += learning_rate * count
+                for fid, count in global_prediction_features.items():
+                    self.feature_weights[fid] -= learning_rate * count
+
+                # compute training accuracy for this iteration
+                correct += sum([1 for (predicted, gold) in zip(prediction, tags) if predicted == gold])
+                total += len(tags)
+
+            averaged_weights.update(self.feature_weights)
+            # guard against empty training data
+            accuracy = correct/total if total else 0.0
+            sys.stderr.write('\tTraining accuracy: %.4f\n\n' % accuracy)
+
+            random.shuffle(train_data)
+
+        self.feature_weights = averaged_weights
+
+    def get_global_features(self, words, tags):
+        """
+        count how often each feature fired for the whole sentence
+        :param words: the words of the sentence
+        :param tags: the tags of the sentence, one per word
+        :return: a Counter mapping each feature to its number of occurrences
+        """
+        feature_counts = Counter()
+
+        for i, (word, tag) in enumerate(zip(words, tags)):
+            previous_tag = self.START if i == 0 else tags[i-1]
+            feature_counts.update(self.get_features(word, tag, previous_tag))
+
+        return feature_counts
+
+    @memoize
+    def get_features(self, word, tag, previous_tag):
+        """
+        get all features that can be derived from the word and tags
+        :param word: the current word
+        :param tag: the tag of the current word
+        :param previous_tag: the tag of the previous word (or the start
+                             symbol for the first word)
+        :return: list of feature strings
+        """
+        word_lower = word.lower()
+        prefix = word_lower[:3]
+        suffix = word_lower[-3:]
+
+        features = [
+                    'TAG_%s' % (tag),                       # current tag
+                    'TAG_BIGRAM_%s_%s' % (previous_tag, tag),  # tag bigrams
+                    'WORD+TAG_%s_%s' % (word, tag),            # word-tag combination
+                    'WORD_LOWER+TAG_%s_%s' % (word_lower, tag),# word-tag combination (lowercase)
+                    'UPPER_%s_%s' % (word[0].isupper(), tag),  # word starts with uppercase letter
+                    'DASH_%s_%s' % ('-' in word, tag),         # word contains a dash
+                    'PREFIX+TAG_%s_%s' % (prefix, tag),        # prefix and tag
+                    'SUFFIX+TAG_%s_%s' % (suffix, tag),        # suffix and tag
+
+                    #########################
+                    # ADD MOAAAAR FEATURES! #
+                    #########################
+                    # a string like all other features: a tuple cannot be
+                    # used as a JSON object key, which made save() fail
+                    'WORDSHAPE_%s_%s' % (self.shape(word), tag),
+                    'WORD+TAG_BIGRAM_%s_%s_%s' % (word, tag, previous_tag),
+                    'SUFFIX+2TAGS_%s_%s_%s' % (suffix, previous_tag, tag),
+                    'PREFIX+2TAGS_%s_%s_%s' % (prefix, previous_tag, tag)
+        ]
+
+        return features
+
+    @memoize
+    def shape(self, x):
+        """
+        map a word to its shape: uppercase letters become 'X', lowercase
+        letters 'x', digits 'd'; all other characters are kept
+        :param x: the word
+        :return: the shape string with runs of 'x' collapsed to 'x*'
+        """
+        result = []
+        for c in x:
+            if c.isupper():
+                result.append('X')
+            elif c.islower():
+                result.append('x')
+            elif c in '0123456789':
+                result.append('d')
+            else:
+                # belongs to the if/elif chain; attached to the for loop it
+                # dropped these characters and re-appended the last one
+                result.append(c)
+
+        # replace runs of lowercase letters with 'x*' and return it
+        return re.sub(r"x+", "x*", ''.join(result))
+
+    def decode(self,words):
+        """
+        Find best sequence
+        :param words: the words of one sentence
+        :return: the list of predicted tags, one per word
+        :raises ValueError: if the model does not know any tags yet
+        """
+        N=len(words)
+        if N == 0:
+            # nothing to tag; the trellis below needs at least one word
+            return []
+
+        tags=list(self.tags)
+        M=len(tags) #number of tags
+        if M == 0:
+            raise ValueError('cannot decode without any known tags; '
+                             'train or load a model first')
+
+        # .get() instead of indexing: looking up an unseen feature must
+        # neither fail (plain dict) nor insert a zero entry (defaultdict)
+        weight_of = self.feature_weights.get
+
+        # create trellis of size M (number of tags) x N (sentence length)
+        Q = np.full((M, N), float('-Inf'))
+        backp = np.full((M, N), -1, dtype=np.int16) #backpointers
+
+        ### initialization step
+        for j in range(M):
+            # initialize scores for tags j at position 1 (first word)
+            features = self.get_features(words[0], tags[j], self.START)
+            Q[j,0] = sum(weight_of(x, 0.0) for x in features)
+
+        # iteration step
+        # filling the lattice, for every position and every tag find viterbi score Q
+        for i in range(1,N):
+            # for every tag
+            for j in range(M):
+                tag=tags[j]
+
+                best_score = float('-Inf')
+
+                # for every possible previous tag
+                for k in range(M):
+
+                    # k=previous tag
+                    previous_tag=tags[k]
+
+                    best_before=Q[k,i-1] # score until best step before
+
+                    features = self.get_features(words[i], tag, previous_tag)
+                    feature_weights = sum(weight_of(x, 0.0) for x in features)
+
+                    score = best_before + feature_weights
+
+                    if score > best_score:
+                        Q[j,i]=score
+                        best_score = score
+                        backp[j,i]=k #best tag
+
+        # final best
+        best_id=Q[:,-1].argmax()
+
+        ## collect best tags in reverse order by following the backpointers
+        predtags=[]
+        predtags.append(tags[best_id])
+
+        for i in range(N-1,0,-1):
+            idx=int(backp[best_id,i])
+            predtags.append(tags[idx])
+            best_id=idx
+
+        #return reversed predtags
+        return predtags[::-1]
+
+    def predict(self, test_data):
+        """
+        Get predictions for entire test set
+        :param test_data: a list of sentences (lists of words)
+        :return: a list with the predicted tag sequence of each sentence
+        """
+        return [self.decode(words) for words in test_data]
+
+    def predict_eval(self, test_data, output=False):
+        """
+        compute accuracy on test data and write it to stderr
+        :param test_data: the test instances as (words, tags) pairs
+        :param output: whether to print word, gold tag and predicted tag of
+                       every token to stdout
+        :return:
+        """
+        correct = 0
+        total = 0.0
+        # stays -1 if there is no test instance at all
+        i = -1
+        sys.stderr.write('\nTesting\n')
+        sys.stderr.write('*******\n')
+
+        for i, (words, tags) in enumerate(test_data):
+            if i%100==0:
+                sys.stderr.write('%s'%i)
+            elif i%10==0:
+                sys.stderr.write('.')
+
+            # get prediction
+            prediction = self.decode(words)
+
+            if output:
+                for word, gold, pred in zip(words, tags, prediction):
+                    print("{}\t{}\t{}".format(word, gold, pred))
+                print("")
+
+            correct += sum([1 for (predicted, gold) in zip(prediction, tags) if predicted == gold])
+            total += len(tags)
+        # guard against empty test data
+        accuracy = correct/total if total else 0.0
+        print("\nTest accuracy on %s items: %.4f" % (i+1, accuracy), file=sys.stderr)
+
+    def save(self, file_name):
+        """
+        save model as a single line of JSON containing the tags and the
+        feature weights
+        :param file_name: the file the model is written to
+        :return:
+        """
+        print("saving model...", end=' ', file=sys.stderr)
+        with codecs.open(file_name, "w", encoding='utf-8') as model:
+            model.write("%s\n" % json.dumps({'tags': list(self.tags), 'weights': dict(self.feature_weights)}))
+        print("done", file=sys.stderr)
+
+    def load(self, file_name):
+        """
+        load model from JSON file
+        :param file_name: the file written by save()
+        :return:
+        """
+        print("loading model...", end=' ', file=sys.stderr)
+        with codecs.open(file_name, 'r', encoding='utf-8') as model_file:
+            model_data = model_file.readline().strip()
+        model = json.loads(model_data)
+        self.tags = set(model['tags'])
+        # restore the default of 0 for unseen features so that a loaded
+        # model can be trained further
+        self.feature_weights = defaultdict(float, model['weights'])
+        print("done", file=sys.stderr)
+
+
+# if script is run from command line, automatically execute the following
+if __name__ == "__main__":
+
+    # command line interface: every option is optional, so the script can
+    # load, train, save and test in any combination
+    arg_parser = argparse.ArgumentParser(
+        description="""Run a structured perceptron""")
+    arg_parser.add_argument("--train",
+                            help="train model on a file (CoNLL format)")
+    arg_parser.add_argument("--test",
+                            help="test model on a file (CoNLL format)")
+    arg_parser.add_argument("--output", action="store_true",
+                            help="output predictions to stdout")
+    arg_parser.add_argument("--load", help="load model from JSON file")
+    arg_parser.add_argument("--save", help="save model as JSON file")
+    arg_parser.add_argument("--iterations", type=int, default=5,
+                            help="number of training iterations")
+    arg_parser.add_argument("--learning_rate", type=float, default=0.2,
+                            help="learning rate during training")
+    options = arg_parser.parse_args()
+
+    # the steps below run in a fixed order: load, train, save, test
+    tagger = StructuredPerceptron()
+
+    if options.load:
+        tagger.load(options.load)
+
+    if options.train:
+        training_instances = list(read_conll_file(options.train))
+        tagger.fit(training_instances,
+                   iterations=options.iterations,
+                   learning_rate=options.learning_rate)
+
+    if options.save:
+        tagger.save(options.save)
+
+    # evaluate on the test file (and optionally show the predictions)
+    if options.test:
+        test_instances = list(read_conll_file(options.test))
+        tagger.predict_eval(test_instances, output=options.output)
diff --git a/task_utils.py b/task_utils.py
new file mode 100644
index 0000000..99ab066
--- /dev/null
+++ b/task_utils.py
@@ -0,0 +1,409 @@
+"""
+Utility methods that are used for training and evaluation of the tasks.
+"""
+
+import os
+import operator
+import numpy as np
+import random
+from collections import namedtuple
+
+from sklearn import svm
+from sklearn.metrics import accuracy_score
+
+import data_utils
+from constants import POS_ID, NEG_ID, SENTIMENT, POS, POS_BILSTM, PARSING,\
+    BAYES_OPT
+from simpletagger import StructuredPerceptron
+
+from bist_parser.bmstparser.src import mstlstm
+from bist_parser.bmstparser.src.utils import vocab_conll, write_conll,\
+    write_original_conll
+
+from bilstm_tagger.src.simplebilty import SimpleBiltyTagger, load
+
+# maximum number of training epochs of the BiLSTM tagger and the parser
+NUM_EPOCHS = 50
+# early stopping patience: passed to the BiLSTM tagger and used by the parser
+# training loop as the number of epochs without dev LAS improvement
+PATIENCE = 2
+
+
+def get_data_subsets(feature_vals, feature_weights, train_data, train_labels,
+                     task, num_train_examples):
+    """
+    Select the training examples with the highest feature scores, where the
+    score of an example is the dot product of its feature values and the
+    feature weights. For sentiment, the selection is stratified.
+    :param feature_vals: a numpy array of shape (num_train_data, num_features)
+                         containing the feature values
+    :param feature_weights: a numpy array of shape (num_features, ) containing
+                            the weight for each feature
+    :param train_data: a sparse numpy array of shape (num_train_data, vocab_size)
+                       containing the training data, or a list of examples
+    :param train_labels: a numpy array of shape (num_train_data) containing the
+                         training labels
+    :param task: the task; this determines whether we use stratification
+    :param num_train_examples: the number of training examples for the
+                               respective task
+    :return: subsets of the training data and its labels as a tuple; the data
+             subset is a list if train_data is a list
+    :raises ValueError: if the selection is not implemented for the task
+    """
+    # score every example and rank the example indices from the highest to
+    # the lowest score
+    scores = feature_vals.dot(np.transpose(feature_weights))
+    ranked_pairs = sorted(enumerate(scores), key=operator.itemgetter(1),
+                          reverse=True)
+    ranked_indices, _ = zip(*ranked_pairs)
+
+    if task == SENTIMENT:
+        # keep the training set balanced: take the best-ranked n/2 positive
+        # examples followed by the best-ranked n/2 negative examples
+        selected = []
+        for label_id in (POS_ID, NEG_ID):
+            with_label = [idx for idx in ranked_indices
+                          if train_labels[idx] == label_id]
+            selected += with_label[:int(num_train_examples/2)]
+    elif task in [POS, POS_BILSTM, PARSING]:
+        # no stratification necessary for POS tagging and parsing
+        selected = list(ranked_indices[:num_train_examples])
+    else:
+        raise ValueError('Top index retrieval not implemented for %s.' % task)
+
+    if isinstance(train_data, list):
+        # a list cannot be indexed with a list of indices like a numpy array
+        data_subset = [train_data[idx] for idx in selected]
+    else:
+        data_subset = train_data[selected]
+    return data_subset, train_labels[selected]
+
+
+def task2train_and_evaluate_func(task):
+    """
+    Look up the function that trains and evaluates a model for a task.
+    :param task: the task identifier
+    :return: the train_and_evaluate function of the task
+    :raises ValueError: if no such function exists for the task
+    """
+    task_functions = (
+        (SENTIMENT, train_and_evaluate_sentiment),
+        (POS, train_and_evaluate_pos),
+        (POS_BILSTM, train_and_evaluate_pos_bilstm),
+        (PARSING, train_and_evaluate_parsing),
+    )
+    for known_task, train_and_evaluate in task_functions:
+        if task == known_task:
+            return train_and_evaluate
+    raise ValueError('Train_and_evaluate is not implemented for %s.' % task)
+
+
+def train_and_evaluate_sentiment(train_data, train_labels, val_data, val_labels,
+                                 test_data=None, test_labels=None,
+                                 parser_output_path=None, perl_script_path=None):
+    """
+    Fit an SVM on the training data and report its accuracy on the
+    validation set and, if both test data and test labels are given, on the
+    test set.
+    :param train_data: the training data; a sparse numpy matrix of shape
+                       (num_examples, max_vocab_size)
+    :param train_labels: the training labels; a numpy array of shape (num_labels)
+    :param val_data: the validation data; same format as the training data
+    :param val_labels: the validation labels
+    :param test_data: the test data
+    :param test_labels: the test labels
+    :param parser_output_path: only necessary for parsing; is ignored here
+    :param perl_script_path: only necessary for parsing; is ignored here
+    :return: a tuple (validation accuracy, test accuracy); the test accuracy
+             is None if no test set was provided
+    """
+    print('Training the SVM on %d examples...' % train_data.shape[0])
+    classifier = svm.SVC()
+    classifier.fit(train_data, train_labels)
+
+    # the validation set is always evaluated
+    val_accuracy = accuracy_score(val_labels, classifier.predict(val_data))
+    print('Val acc: %.5f' % val_accuracy)
+
+    if test_data is None or test_labels is None:
+        return val_accuracy, None
+
+    test_accuracy = accuracy_score(test_labels, classifier.predict(test_data))
+    print('Test acc: %.5f' % test_accuracy)
+    return val_accuracy, test_accuracy
+
+
+def train_and_evaluate_pos(train_data, train_labels, val_data, val_labels,
+                           test_data=None, test_labels=None,
+                           parser_output_path=None, perl_script_path=None):
+    """
+    Train the structured perceptron tagger on the training data and report
+    its accuracy on the validation set and, if both test data and test
+    labels are given, on the test set.
+    :param train_data: the training data; a list of lists of shape
+                       (num_examples, sequence_length)
+    :param train_labels: the training labels; a list of lists of tags
+    :param val_data: the validation data; same format as the training data
+    :param val_labels: the validation labels
+    :param test_data: the test data
+    :param test_labels: the test labels
+    :param parser_output_path: only necessary for parsing; is ignored here
+    :param perl_script_path: only necessary for parsing; is ignored here
+    :return: a tuple (validation accuracy, test accuracy); the test accuracy
+             is None if no test set was provided
+    """
+    print('Training the tagger on %d examples...' % len(train_data))
+    tagger = StructuredPerceptron()
+
+    # the tagger is trained on (words, tags) pairs
+    training_pairs = list(zip(train_data, train_labels))
+    tagger.fit(training_pairs, iterations=5, learning_rate=0.2)
+
+    # the validation set is always evaluated
+    val_accuracy = pos_accuracy_score(val_labels, tagger.predict(val_data))
+    print('Val acc: %.5f' % val_accuracy)
+
+    if test_data is None or test_labels is None:
+        return val_accuracy, None
+
+    test_accuracy = pos_accuracy_score(test_labels, tagger.predict(test_data))
+    print('Test acc: %.5f' % test_accuracy)
+    return val_accuracy, test_accuracy
+
+
+def train_and_evaluate_pos_bilstm(train_data, train_labels, val_data, val_labels,
+                                  test_data=None, test_labels=None,
+                                  parser_output_path=None, perl_script_path=None):
+    """
+    Train the BiLSTM tagger with early stopping on the training data and
+    report the accuracy of the best model on the validation set and, if both
+    test data and test labels are given, on the test set.
+    :param train_data: the training data; a list of lists of shape
+                       (num_examples, sequence_length)
+    :param train_labels: the training labels; a list of lists of tags
+    :param val_data: the validation data; same format as the training data
+    :param val_labels: the validation labels
+    :param test_data: the test data
+    :param test_labels: the test labels
+    :param parser_output_path: not used by this function
+    :param perl_script_path: not used by this function
+    :return: a tuple (validation accuracy, test accuracy); the test accuracy
+             is None if no test set was provided
+    """
+    print('Training the BiLSTM tagger on %d examples...' % len(train_data))
+
+    # sizes handed to the tagger constructor and the name of the trainer
+    in_dim, h_dim, c_in_dim, h_layers = 64, 100, 100, 1
+    trainer = "adam"
+
+    # the best model is stored in a temporary file during training; the
+    # random number in its name avoids clashes between parallel runs
+    model_path = '/tmp/bilstm_tagger_model_%d' % random.randint(0, 1000000)
+    untrained_tagger = SimpleBiltyTagger(in_dim, h_dim, c_in_dim, h_layers,
+                                         embeds_file=None)
+    train_X, train_Y = untrained_tagger.get_train_data_from_instances(
+        train_data, train_labels)
+    val_X, val_Y = untrained_tagger.get_data_as_indices_from_instances(
+        val_data, val_labels)
+
+    # early stopping is handled by the tagger via patience and model_path
+    untrained_tagger.fit(train_X, train_Y, NUM_EPOCHS, trainer, val_X=val_X,
+                         val_Y=val_Y, patience=PATIENCE, model_path=model_path)
+
+    # restore the best model, then delete the model file and the pickle
+    # file that holds the parameters
+    tagger = load(model_path)
+    for stale_path in (model_path, model_path + '.pickle'):
+        os.unlink(stale_path)
+
+    val_correct, val_total = tagger.evaluate(val_X, val_Y)
+    val_accuracy = val_correct / val_total
+    print('Val acc: %.5f' % val_accuracy)
+
+    if test_data is None or test_labels is None:
+        return val_accuracy, None
+
+    test_X, test_Y = tagger.get_data_as_indices_from_instances(test_data,
+                                                               test_labels)
+    test_correct, test_total = tagger.evaluate(test_X, test_Y)
+    test_accuracy = test_correct / test_total
+    print('Test acc: %.5f' % test_accuracy)
+    return val_accuracy, test_accuracy
+
+
+def train_and_evaluate_parsing(train_data, train_labels, val_data, val_labels,
+                               test_data=None, test_labels=None,
+                               parser_output_path=None, perl_script_path=None):
+    """
+    Trains the parser on the provided training data. Calculates LAS on the
+    validation set and (optionally) on the test set.
+    :param train_data: the training data; a list of CoNLL entries
+    :param train_labels: pseudo-labels; not used as labels as labels are
+                         contained in train_data
+    :param val_data: the validation data; same format as the training data
+    :param val_labels: pseud-labels; not used as contained in val_data
+    :param test_data: the test data
+    :param test_labels: pseudo-labels; not used as contained in test_data
+    :return: the validation accuracy and (optionally) the test data; else None
+    """
+    print('Training the parser on %d examples...' % len(train_data))
+    if test_data is not None:
+        # incorporate the test data as some POS tags (e.g. XX) might only
+        # appear in the target domain
+        words, w2i, pos, rels = vocab_conll(np.hstack([train_data, val_data, test_data]))
+    else:
+        words, w2i, pos, rels = vocab_conll(np.hstack([train_data, val_data]))
+
+    # set the variables used for initializing the parser and initialize the
+    # parser
+    ParserOptions = namedtuple('parser_options',
+                               'activation, blstmFlag, labelsFlag, costaugFlag,'
+                               ' bibiFlag, lstm_dims, wembedding_dims, '
+                               'pembedding_dims, rembedding_dims, lstm_layers, '
+                               'external_embedding, hidden_units, '
+                               'hidden2_units, epochs')
+    parser_options = ParserOptions(
+        epochs=NUM_EPOCHS,
+        activation='tanh',
+        blstmFlag=True,
+        labelsFlag=True,
+        costaugFlag=True,
+        bibiFlag=False,
+        lstm_dims=125,
+        wembedding_dims=100,
+        pembedding_dims=25,
+        rembedding_dims=25,
+        lstm_layers=2,
+        external_embedding=None,
+        hidden_units=100,
+        hidden2_units=0
+    )
+    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
+
+    # write the dev data to a file
+    dev_data_path = os.path.join(parser_output_path, 'dev.conll')
+    write_original_conll(dev_data_path, val_data)
+
+    # set the variables used for tracking training progress for early stopping
+    best_dev_las, epochs_no_improvement = 0., 0
+    best_model_path = os.path.join(parser_output_path, 'parser')
+    print('Training model for %d max epochs with early stopping with patience '
+          '%d...' % (NUM_EPOCHS, PATIENCE))
+    for epoch in range(parser_options.epochs):
+        print('Starting epoch', epoch)
+        parser.TrainOnEntries(train_data)
+
+        # write the predictions to a file
+        pred_path = os.path.join(parser_output_path,
+                                 'dev_pred_epoch_' + str(epoch + 1) + '.conll')
+        write_conll(pred_path, parser.PredictOnEntries(val_data))
+        eval_path = pred_path + '.eval'
+        perl_script_command = ('perl %s -g %s -s %s > %s' % (
+            perl_script_path,dev_data_path, pred_path, eval_path))
+        print('Evaluating with %s...' % perl_script_command)
+        os.system(perl_script_command)
+        las, uas, acc = data_utils.read_parsing_evaluation(eval_path)
+
+        # remove the predictions and the evaluation file
+        if os.path.exists(pred_path):
+            os.unlink(pred_path)
+        if os.path.exists(eval_path):
+            os.unlink(eval_path)
+        if las > best_dev_las:
+            print('LAS %.2f is better than best dev LAS %.2f.'
+                  % (las, best_dev_las))
+            best_dev_las = las
+            epochs_no_improvement = 0
+            parser.Save(best_model_path)
+        else:
+            print('LAS %.2f is worse than best dev LAS %.2f.'
+                  % (las, best_dev_las))
+            epochs_no_improvement += 1
+        if epochs_no_improvement == PATIENCE:
+            print('No improvement for %d epochs. Early stopping...'
+                  % epochs_no_improvement)
+            print('Best dev LAS:', best_dev_las)
+            break
+
+    test_las = None
+    if test_data is not None:
+        # load the best model
+        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
+        parser.Load(best_model_path)
+
+        # first write the test data to a file
+        test_data_path = os.path.join(parser_output_path, 'test.conll')
+        write_original_conll(test_data_path, test_data)
+
+        # then write the prediction to another file
+        pred_path = os.path.join(parser_output_path, 'test_pred.conll')
+        write_conll(pred_path, parser.PredictOnEntries(test_data))
+        eval_path = pred_path + '.eval'
+        perl_script_command = ('perl %s -g %s -s %s > %s' % (
+            perl_script_path, test_data_path, pred_path, eval_path))
+        print('Evaluating with %s...' % perl_script_command)
+        os.system(perl_script_command)
+        test_las, test_uas, test_acc = data_utils.read_parsing_evaluation(
+            eval_path)
+        print('Test LAS:', test_las, 'test UAS:', test_uas,
+              'test acc:', test_acc)
+
+    # remove the saved parser
+    if os.path.exists(best_model_path):
+        os.unlink(best_model_path)
+    return best_dev_las, test_las
+
+
+def train_pretrained_weights(feature_values, X_train, y_train, train_domains,
+                             num_train_examples, X_val, y_val, X_test, y_test,
+                             trg_domain, args, feature_names,
+                             parser_output_path, perl_script_path):
+    """
+    Train a model using pre-trained data selection weights (which could have
+    been trained on another model/domain/task) and log the resulting scores.
+    :param feature_values: a numpy array of shape (num_examples, num_features)
+    :param X_train: the training data
+    :param y_train: the training labels
+    :param train_domains: a list of training domains, only used for counting
+    :param num_train_examples: the number of examples used for training
+    :param X_val: the validation data
+    :param y_val: the validation labels
+    :param X_test: the test data
+    :param y_test: the test labels
+    :param trg_domain: the target domain
+    :param args: the arguments used for calling the script; used for logging
+    :param feature_names: a list of the feature names
+    :param parser_output_path: the output path of the parser
+    :param perl_script_path: the path to the perl script
+    :return: None; the scores are passed to data_utils.log_to_file
+    """
+    from collections import Counter  # function-local: only used for counting
+    for feat_weights_domain, feat_weights_feats, feature_weights in \
+            data_utils.read_feature_weights_file(args.feature_weights_file):
+        # every entry of the file is validated, not only the target domain's
+        assert len(feature_weights) == len(feature_names)
+        assert set(args.feature_sets) == set(feat_weights_feats.split(' '))
+
+        if trg_domain != feat_weights_domain:
+            continue
+
+        # count how many examples belong to each source domain
+        train_domain_subset, _ = get_data_subsets(
+            feature_values, feature_weights, train_domains, y_train, args.task,
+            num_train_examples)
+        for subset_domain, count in Counter(train_domain_subset).items():
+            print('# of %s in train data for trg domain %s: %d'
+                  % (subset_domain, trg_domain, count))
+
+        # get the train subset with the highest scores and train
+        train_subset, labels_subset = get_data_subsets(
+            feature_values, feature_weights, X_train, y_train, args.task,
+            num_train_examples)
+        val_accuracy, test_accuracy = task2train_and_evaluate_func(args.task)(
+            train_subset, labels_subset, X_val, y_val, X_test, y_test,
+            parser_output_path=parser_output_path,
+            perl_script_path=perl_script_path)
+        dict_key = ('%s-X-domain-%s-%s' % (BAYES_OPT, feat_weights_domain,
+                                           feat_weights_feats))
+
+        # log the result to the log file
+        data_utils.log_to_file(args.log_file, {dict_key: [(
+            val_accuracy, test_accuracy, feature_weights)]}, trg_domain, args)
+
+
+def pos_accuracy_score(gold, predicted):
+    """
+    Calculate the accuracy for POS; returns 0.0 if there are no gold tags.
+    :param gold: a list of lists of gold tags
+    :param predicted: a list of lists of predicted tags
+    :return the accuracy score as a float (gold tags w/o prediction are wrong)
+    """
+    tags_total = sum(len(gold_tags) for gold_tags in gold)
+    tags_correct = sum(g == p for gold_tags, pred_tags in zip(gold, predicted)
+                       for g, p in zip(gold_tags, pred_tags))
+    return tags_correct / float(tags_total) if tags_total else 0.0