diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f8633b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,116 @@
+# LaTeX temporary files
+*.aux
+*.log
+*.toc
+
+# PDF output - usually a bad idea to keep this in Git
+*.pdf
+
+# Latexmk
+*.fdb_latexmk
+
+# SyncTeX
+*.synctex.gz
+
+# LaTeX Beamer
+*.snm
+*.vrb
+*.nav
+*.out
+
+# BibTeX
+*.bbl
+*.blg
+
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..33da967
--- /dev/null
+++ b/README.md
@@ -0,0 +1,195 @@
+# Learning to select data for transfer learning with Bayesian Optimization
+
+Sebastian Ruder, Barbara Plank (2017). Learning to select data for transfer
+learning with Bayesian Optimization. _In Proceedings of the 2017 Conference
+on Empirical Methods in Natural Language Processing_, Copenhagen, Denmark.
+
+## Requirements
+
+### RoBO
+
+The Robust Bayesian Optimization framework [RoBO](http://automl.github.io/RoBO/) needs to be installed.
+It can be installed using the following steps:
+
+1. First, install `libeigen3-dev` as a prerequisite:
+`sudo apt-get install libeigen3-dev` (*)
+2. Then, clone the RoBO repository:
+`git clone https://github.com/automl/RoBO.git`
+3. Change into the directory: `cd RoBO/`
+4. Install RoBO's requirements:
+`for req in $(cat all_requirements.txt); do pip install $req; done`
+5. Finally, install RoBO:
+`python setup.py install`
+
+For the topic models, `gensim` needs to be installed:
+`pip install gensim`
+
+### DyNet
+
+We use the neural network library [DyNet](http://dynet.readthedocs.io/en/latest/index.html),
+which works well with networks that have dynamic structures. DyNet can be
+installed by following the instructions [here](http://dynet.readthedocs.io/en/latest/python.html#manual-installation).
+
+## Repository structure
+
+- `bilstm_tagger`: The repository containing code for the Bi-LSTM tagger from
+Plank et al. (2016).
+- `bist_parser`: The repository containing the code for the BIST parser from
+Kiperwasser and Goldberg (2016).
+- `bayes_opt.py`: The main logic for running Bayesian Optimization.
+- `constants.py`: Constants that are shared across all files.
+- `data_utils.py`: Utility methods for data reading and processing.
+- `similarity.py`: Methods for measuring domain similarity.
+- `simpletagger.py`: Code for running the Structured Perceptron POS tagger.
+- `task_utils.py`: Utility methods for training and evaluation.
+
+## Instructions
+
+### Running Bayesian Optimization
+
+The main logic for running Bayesian Optimization can be found in `bayes_opt.py`.
+The features that are currently used are defined in `constants.py` as
+`FEATURE_SETS` and are split into diversity and similarity features.
+Bayesian Optimization minimizes the validation error on the specified dataset.
+
+### Example usage
+
+```
+python bayes_opt.py --dynet-autobatch 1 -d data/gweb_sancl -m models/model \
+ -t emails newsgroups reviews weblogs wsj --task pos \
+ -b random most-similar-examples \
+ --parser-output-path parser_outputs \
+ --perl-script-path bist_parser/bmstparser/src/util_scripts/eval.pl \
+ -f similarity --z-norm --num-iterations 100 \
+ --num-runs 1 --log-file logs/log
+```
+
+- `--dynet-autobatch 1`: use DyNet auto-batching
+- `-d data/gweb_sancl`: use the data from the SANCL 2012 shared task
+- `-m models/model`: specify the directory where the model should be saved
+- `-t emails newsgroups reviews weblogs wsj`: adapt to the specified target
+domains in the order they were provided
+- `--task pos`: perform POS tagging with the Structured Perceptron model
+- `-b`: use the random and most-similar-examples baselines
+- `--parser-output-path`, `--perl-script-path`: only required when performing
+parsing
+- `-f`: use only similarity features with Bayesian Optimization
+- `--z-norm`: perform z-normalisation (recommended)
+- `--num-iterations`: perform 100 iterations of Bayesian Optimization
+- `--num-runs`: perform one run of Bayesian Optimization per target domain
+- `--log-file`: log the results of the baselines and Bayesian Optimization to
+ this file
+
+### Adding a new task
+
+In order to add a new task, you need to do several things:
+- Add the new task to `TASKS`, `TASK2TRAIN_EXAMPLES`, and `TASK2DOMAINS` in
+`constants.py`.
+- Add a method to read data for the task to `data_utils.py` and add the
+mapping to `data_utils.task2read_data_func`.
+- Add a method to train and evaluate the task to `task_utils.py` and add the
+mapping to `task_utils.task2train_and_evaluate_func`.
+- Add the function that should be minimized to `bayes_opt.py` and add the
+mapping to `task2_objective_function`. The function should take
+as input the feature weights and output the error.
+
+### Adding new features
+
+New feature sets or features can be added by adding them to `constants.py`.
+Similarity features or new representations can be added to
+`similarity.py`. Diversity features or any other features can be added to
+`features.py`. All new features must be added to
+`get_feature_representations` and `get_feature_names` in `features.py`.
+
+
+
+## Data
+
+### Multi-Domain Sentiment Dataset
+
+The Amazon Reviews Multi-Domain Sentiment Dataset (Blitzer et al., 2007)
+used in the current Bayesian Optimization experiment can be downloaded
+using the following steps:
+1. Create a new `amazon-reviews` directory:
+`mkdir amazon-reviews`
+2. Change into the directory:
+`cd amazon-reviews`
+3. Download the dataset:
+`wget https://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz`
+4. Extract the dataset:
+`tar -xvf processed_acl.tar.gz`
+
+In `bayes_opt.py`, the `data-path` argument should now be pointed to
+the `amazon-reviews` directory.
+
+### Multi-domain POS and parsing data
+
+We use the data from the [SANCL 2012 shared task/English Web Treebank](https://catalog.ldc.upenn.edu/ldc2012t13).
+
+### Word embedding data
+
+Pre-trained word embeddings can be downloaded from [here](http://nlp.stanford.edu/projects/glove/).
+We are using GloVe embeddings in the paper, but other pre-trained embeddings are also possible.
+Smaller embedding files can be used for faster iteration.
+
+
+## Models
+
+### BIST parser
+
+We use the BIST parser from Kiperwasser and Goldberg (2016) for our experiments. The parser repo can be found
+[here](https://github.com/elikip/bist-parser) and was integrated using [`git submodule`](http://stackoverflow.com/questions/2140985/how-to-set-up-a-git-project-to-use-an-external-repo-submodule).
+
+For running the parser with Bayesian Optimization, two additional hyperparameters are necessary:
+- `--perl-script-path`: This is the location of the `perl` script that is used to evaluate the parser's predictions.
+ The script is located in `bist_parser/bmstparser/src/util_scripts/eval.pl` per default.
+- `--parser-output-path`: This is the location of the folder where the parser's predictions and the output of the
+ `perl` script will be written to.
+
+Per default, Labeled Attachment Score on the held-out validation set is used to evaluate the parser's performance and
+evaluation results are saved to subfolders of `parser-output-path` that indicate the target domain and feature sets
+used. Another subsubfolder is created for the best weights configuration so that Labeled Attachment Score, Unlabeled
+Attachment Score and Accuracy as well as other statistics are available for the final test set evaluation.
+
+### Bi-LSTM tagger
+
+The Bi-LSTM tagger we are using is a simplified, single-task version of the
+hierarchical Multi-task Bi-LSTM tagger used by Plank et al. (2016). The source
+repository of the tagger can be found [here](https://github.com/bplank/bilstm-aux/).
+
+## (*) Installing Eigen without sudo rights
+
+In case you do not have sudo rights to run `sudo apt-get install
+libeigen3-dev` here is a workaround.
+
+Create a folder where you download the sources of libeigen3-dev:
+
+```
+mkdir -p tools/eigen3
+cd tools/eigen3
+apt-get source libeigen3-dev
+```
+
+Afterwards point the required packages for `RoBO` to the folder just created: `tools/eigen3/eigen3-3.2.0`
+
+For instance, to install the 'george' requirement of `RoBO`, add the `--global-option` parameters pointing to the eigen directory:
+
+```
+pip install git+https://github.com/sfalkner/george.git --global-option=build_ext --global-option=-I/path/to/tools/eigen3/eigen3-3.2.0
+```
+
+(see http://dan.iel.fm/george/current/user/quickstart/#installation -> if you have Eigen in a strange place)
+
+
+## Reference
+
+If you make use of the contents of this repository, we appreciate citing the following paper:
+```
+@inproceedings{ruder2017select,
+ title={{Learning to select data for transfer learning with Bayesian Optimization}},
+ author={Ruder, Sebastian and Plank, Barbara},
+ booktitle={Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
+ year={2017}
+}
+```
+
diff --git a/bayes_opt.py b/bayes_opt.py
new file mode 100644
index 0000000..5714a07
--- /dev/null
+++ b/bayes_opt.py
@@ -0,0 +1,459 @@
+"""
+Run Bayesian optimization to learn to select data for transfer learning.
+
+Uses Python 3.5.
+"""
+
+import os
+import argparse
+import logging
+import pickle
+import copy
+
+import numpy as np
+from scipy import stats
+from sklearn.cross_validation import train_test_split
+
+from robo.fmin import bayesian_optimization
+
+import task_utils
+import data_utils
+import similarity
+import features
+from constants import FEATURE_SETS, SENTIMENT, POS, POS_BILSTM, PARSING,\
+ TASK2TRAIN_EXAMPLES, TASK2DOMAINS, TASKS, POS_PARSING_TRG_DOMAINS,\
+ SENTIMENT_TRG_DOMAINS, BASELINES, BAYES_OPT, RANDOM, MOST_SIMILAR_DOMAIN,\
+ MOST_SIMILAR_EXAMPLES, ALL_SOURCE_DATA
+
+from bist_parser.bmstparser.src.utils import ConllEntry
+
+
+def task2_objective_function(task):
+ """Returns the objective function of a task."""
+ if task == SENTIMENT:
+ return objective_function_sentiment
+ if task == POS:
+ return objective_function_pos
+ if task == POS_BILSTM:
+ return objective_function_pos_bilstm
+ if task == PARSING:
+ return objective_function_parsing
+ raise ValueError('No objective function implemented for %s.' % task)
+
+
+def objective_function_sentiment(feature_weights):
+ """
+ The objective function to optimize for sentiment analysis.
+ :param feature_weights: a numpy array; these are the weights of the features
+ that we want to learn
+ :return: the error that should be minimized
+ """
+ train_subset, train_labels_subset = task_utils.get_data_subsets(
+ feature_values, feature_weights, X_train, y_train, SENTIMENT,
+ TASK2TRAIN_EXAMPLES[SENTIMENT])
+
+ # train and evaluate the SVM; we input the test documents here but only
+ # minimize the validation error
+ val_accuracy, _ = task_utils.train_and_evaluate_sentiment(
+ train_subset, train_labels_subset, X_val, y_val, X_test, y_test)
+
+ # we minimize the error; the lower the better
+ error = 1 - float(val_accuracy)
+ return error
+
+
+def objective_function_pos(feature_weights):
+ """
+ The objective function to optimize for POS tagging.
+ :param feature_weights: a numpy array; these are the weights of the features
+ that we want to learn
+ :return: the error that should be minimized
+ """
+ train_subset, train_labels_subset = task_utils.get_data_subsets(
+ feature_values, feature_weights, X_train, y_train, POS,
+ TASK2TRAIN_EXAMPLES[POS])
+
+ # train and evaluate the tagger; we input the test documents here but only
+ # minimize the validation error
+ val_accuracy, _ = task_utils.train_and_evaluate_pos(
+ train_subset, train_labels_subset, X_val, y_val)
+
+ # we minimize the error; the lower the better
+ error = 1 - float(val_accuracy)
+ return error
+
+
+def objective_function_pos_bilstm(feature_weights):
+ """
+ The objective function to optimize for POS tagging.
+ :param feature_weights: a numpy array; these are the weights of the features
+ that we want to learn
+ :return: the error that should be minimized
+ """
+ train_subset, train_labels_subset = task_utils.get_data_subsets(
+ feature_values, feature_weights, X_train, y_train, POS_BILSTM,
+ TASK2TRAIN_EXAMPLES[POS_BILSTM])
+
+ # train and evaluate the tagger; we input the test documents here but only
+ # minimize the validation error
+ val_accuracy, _ = task_utils.train_and_evaluate_pos_bilstm(
+ train_subset, train_labels_subset, X_val, y_val)
+
+ # we minimize the error; the lower the better
+ error = 1 - float(val_accuracy)
+ return error
+
+
+def objective_function_parsing(feature_weights):
+ """
+ The objective function to optimize for dependency parsing.
+ :param feature_weights: a numpy array; these are the weights of the features
+ that we want to learn
+ :return: the error that should be minimized
+ """
+ train_subset, train_labels_subset = task_utils.get_data_subsets(
+ feature_values, feature_weights, X_train, y_train, PARSING,
+ TASK2TRAIN_EXAMPLES[PARSING])
+ val_accuracy, _ = task_utils.train_and_evaluate_parsing(
+ train_subset, train_labels_subset, X_val, y_val,
+ parser_output_path=parser_output_path,
+ perl_script_path=perl_script_path)
+ error = 100 - float(val_accuracy)
+ return error
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='Learn to select data using Bayesian Optimization.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ # dynet parameters
+ parser.add_argument('--dynet-autobatch', type=int,
+ help='use auto-batching (1) (should be first argument)')
+ parser.add_argument('--dynet-mem', default=5000, help='the memory used',
+ type=int) # Note: needs to be given to the script!
+ parser.add_argument('--dynet-seed', default=1512141834, type=int,
+ help='the dynet seed') # Note: needs to still be given!
+
+ # domain and data paths
+ parser.add_argument('-d', '--data-path', required=True,
+ help='the path to the directory containing the '
+ 'processed_acl or gweb_sancl directory')
+ parser.add_argument('-m', '--model-dir', required=True,
+ help='the directory where the model should be saved')
+ parser.add_argument('-t', '--trg-domains', nargs='+', required=True,
+ choices=POS_PARSING_TRG_DOMAINS + SENTIMENT_TRG_DOMAINS,
+ help='the domains to which to adapt')
+ parser.add_argument('--task', choices=TASKS, required=True,
+ help='the task which to optimize')
+ parser.add_argument('-b', '--baselines', nargs='+', choices=BASELINES,
+ default=[RANDOM],
+ help='the baselines that should be compared against')
+ parser.add_argument('-o', '--parser-output-path',
+ default='outputs', help='the output path of the parser')
+ parser.add_argument('-p', '--perl-script-path', help='perl script path',
+ default='bist_parser/bmstparser/src/util_scripts/eval'
+ '.pl')
+
+ # feature parameters
+ parser.add_argument('-f', '--feature-sets', nargs='+', default=['similarity'],
+ choices=FEATURE_SETS,
+ help='which feature sets (similarity, topic_similarity,'
+ 'word_embedding_similarity, diversity) '
+ 'to use; default: similarity')
+ parser.add_argument('--z-norm', action='store_true',
+ help='use z-normalisation') # important to specify
+ parser.add_argument('--feature-weights-file',
+ help='a file containing learned feature weights to be'
+ 'used for cross-domain experiments')
+
+ # word embedding parameters
+ parser.add_argument('-wv', '--word2vec-path', help='the path to the word'
+ 'vector file')
+ parser.add_argument('-vs', '--vector-size', type=int, default=300,
+ help='the size of the word vectors')
+ parser.add_argument('--header', action='store_true',
+ help='whether the word embeddings file contains header;'
+ 'GloVe embeddings used in the paper have no header')
+
+ # processing parameters
+ parser.add_argument('-v', '--max-vocab-size', default=10000, type=int,
+ help='the maximum size of the vocabulary')
+
+ # training parameters
+ parser.add_argument('--num-iterations', default=100, type=int)
+ parser.add_argument('--logging', action='store_true', help='perform logging')
+ parser.add_argument('--num-runs', type=int, default=1,
+ help='the number of experiment runs for each domain')
+ parser.add_argument('--log-file', required=True,
+ help='the path to which validation and test accuracies'
+ 'should be logged')
+
+ args = parser.parse_args()
+
+ # switch on logging if specified to see the output of LDA training and of
+ # the Bayesian optimization
+ if args.logging:
+ logging.basicConfig(level=logging.INFO)
+
+ assert os.path.exists(args.data_path), ('Error: %s does not exist.' %
+ args.data_path)
+ assert not args.word2vec_path or os.path.exists(args.word2vec_path), \
+ 'Error: %s does not exist.' % args.word2vec_path
+
+ # create the model directory if it does not exist
+ if not os.path.exists(args.model_dir):
+ print('Creating %s...' % args.model_dir)
+ os.makedirs(args.model_dir)
+
+ # perl script path and parser output path are only required for parsing
+ perl_script_path = None
+ if args.task == PARSING:
+ assert args.parser_output_path is not None
+ assert args.perl_script_path is not None
+ if not os.path.exists(args.parser_output_path):
+ os.makedirs('Creating output path %s.' % args.parser_output_path)
+ assert os.path.exists(args.perl_script_path)
+ perl_script_path = args.perl_script_path
+
+ # get the task-specific methods and hyper-parameters
+ num_train_examples = TASK2TRAIN_EXAMPLES[args.task]
+ task_trg_domains = TASK2DOMAINS[args.task]
+ read_data = data_utils.task2read_data_func(args.task)
+ train_and_evaluate = task_utils.task2train_and_evaluate_func(args.task)
+ objective_function = task2_objective_function(args.task)
+
+ # get the names of the individual features in the feature sets
+ assert args.word2vec_path or 'diversity' not in args.feature_sets,\
+ 'Error: Word2vec path is required for quadratic entropy in ' \
+ 'diversity-based features.'
+ feature_names = features.get_feature_names(args.feature_sets)
+
+ if args.feature_weights_file:
+ print('Training model with pre-learned feature weights rather than '
+ 'learning new ones...')
+ assert os.path.exists(args.feature_weights_file),\
+ 'Error: %s does not exist.' % args.feature_weights_file
+
+ # read the data and pickle it or load it
+ preproc_data_path = os.path.join(args.model_dir,
+ 'preproc_data_%s.pkl' % args.task)
+ if not os.path.exists(preproc_data_path):
+ domain2data = read_data(args.data_path)
+ print('Saving domain2data object to %s...' % preproc_data_path)
+ with open(preproc_data_path, 'wb') as f:
+ pickle.dump(domain2data, f)
+ else:
+ print('Loading domain2data object from %s...' % preproc_data_path)
+ with open(preproc_data_path, 'rb') as f:
+ domain2data = pickle.load(f)
+ assert set(task_trg_domains) == set(domain2data.keys())
+
+ # create the vocabulary or load it if it was already created
+ vocab_path = os.path.join(args.model_dir, 'vocab.txt')
+ vocab = data_utils.Vocab(args.max_vocab_size, vocab_path)
+ if not os.path.exists(vocab_path):
+ # retrieve all available tokenised sentences
+ tokenised_sentences = data_utils.get_all_docs(
+ domain2data.items(), unlabeled=True)[0]
+ if args.task == PARSING:
+ # get the word form from every ConllEntry
+ tokenised_sentences = [[token.form if isinstance(token, ConllEntry)
+ else token for token in tokens]
+ for tokens in tokenised_sentences]
+ vocab.create(tokenised_sentences)
+ del tokenised_sentences
+ else:
+ vocab.load()
+
+ # load word vectors if we are using them
+ word2vec = None
+ if args.word2vec_path:
+ vocab_word2vec_file = os.path.join(args.model_dir, 'vocab_word2vec.txt')
+ word2vec = similarity.load_word_vectors(
+ args.word2vec_path, vocab_word2vec_file, vocab.word2id,
+ vector_size=args.vector_size, header=args.header)
+
+ # perform the task-specific pre-processing
+ if args.task == SENTIMENT:
+ print('Creating binary training data...')
+ domain2train_data = data_utils.get_tfidf_data(domain2data, vocab)
+ elif args.task in [POS, POS_BILSTM]:
+ print('Using words as training data for POS tagging...')
+ domain2train_data = domain2data
+ elif args.task == PARSING:
+ print('Using CoNLL entries as training data for parsing. Using word '
+ 'forms to extract feature representations...')
+ domain2train_data = copy.deepcopy(domain2data)
+ for domain, domain_data in domain2data.items():
+ domain_data[0] = [[conll_entry.form for conll_entry in conll_entries]
+ for conll_entries in domain_data[0]]
+ else:
+ raise ValueError('Data preproc for %s is not implemented.' % args.task)
+
+ print('Creating relative term frequency distributions for all domains...')
+ term_dist_path = os.path.join(args.model_dir, 'term_dist.txt')
+ domain2term_dist = similarity.get_domain_term_dists(
+ term_dist_path, domain2data, vocab)
+
+ # perform optimization for every target domain
+ for trg_domain in args.trg_domains:
+ print('Target domain:', trg_domain)
+
+ # set the domain and similarity-specific parser output path for parsing
+ parser_output_path, best_weights_parser_output_path = None, None
+ if args.task == PARSING:
+ parser_output_path = os.path.join(
+ args.parser_output_path, '%s-%s' % (trg_domain, '_'.join(
+ args.feature_sets)))
+ if not os.path.exists(parser_output_path):
+ print('Creating %s...' % parser_output_path)
+ os.makedirs(parser_output_path)
+ # use a separate subfolder for the best weights
+ best_weights_parser_output_path = os.path.join(parser_output_path,
+ 'best-weights')
+ if not os.path.exists(best_weights_parser_output_path):
+ os.makedirs(best_weights_parser_output_path)
+
+ # get the training data of all source domains (not the target domain)
+ X_train, y_train, train_domains = data_utils.get_all_docs(
+ [(k, v) for (k, v) in sorted(domain2train_data.items())
+ if k != trg_domain], unlabeled=False)
+
+ # get the unprocessed examples for extracting the feature values
+ examples, y_train_check, train_domains_check = data_utils.get_all_docs(
+ [(k, v) for (k, v) in sorted(domain2data.items())
+ if k != trg_domain], unlabeled=False)
+
+ # some sanity checks just to make sure the processed and the
+ # unprocessed data still correspond to the same examples
+ assert np.array_equal(y_train, y_train_check)
+ assert len(train_domains) == len(train_domains_check),\
+ 'Error: %d != %d.' % (len(train_domains), len(train_domains_check))
+ assert train_domains == train_domains_check, ('Error: %s != %s' % (
+ str(train_domains), str(train_domains_check)))
+ if args.task in [POS, POS_BILSTM, PARSING]:
+ # for sentiment, we are using a sparse matrix
+ X_train = np.array(X_train)
+ print('Training data shape:', X_train.shape, y_train.shape)
+
+ # train topic model if any of the features requires a topic distribution
+ topic_vectorizer, lda_model = None, None
+ if any(f_name.startswith('topic') for f_name in feature_names):
+ # train a topic model on labeled and unlabeled data of all domains
+ topic_vectorizer, lda_model = similarity.train_topic_model(
+ data_utils.get_all_docs(
+ domain2data.items(), unlabeled=True)[0], vocab)
+
+ # get the feature representations of the training data
+ print('Creating the feature representations for the training data. '
+ 'This may take some time...')
+ feature_values = features.get_feature_representations(
+ feature_names, examples, domain2data[trg_domain][0], vocab,
+ word2vec, topic_vectorizer, lda_model)
+
+ if args.z_norm:
+ # apply z-normalisation; this is important for good performance
+ print('Z-normalizing features...')
+ print('First five example features before normalisation:',
+ feature_values[:5, :])
+ print('Standard deviation of features:', np.std(feature_values,
+ axis=0))
+ print('Mean of features:', np.mean(feature_values, axis=0))
+ feature_values = stats.zscore(feature_values, axis=0)
+
+ # delete unnecessary variables to save space
+ del examples, y_train_check, train_domains_check
+
+ # run num_runs iterations of the optimization and baselines in order to
+ # compute statistics around mean/variance; things that vary between
+ # runs: validation/test split; train set of random baseline;
+ # final BayesOpt parameters; the feature values are constant for each
+ # run, which is why we generate them before to reduce the overhead
+ run_dict = {method: [] for method in BASELINES + [BAYES_OPT]}
+ for i in range(args.num_runs):
+ print('\nTarget domain %s. Run %d/%d.' % (trg_domain, i+1,
+ args.num_runs))
+
+ # get the evaluation data from the target domain
+ X_test, y_test, _ = domain2train_data[trg_domain]
+
+ # split off a validation set from the evaluation data
+ X_test, X_val, y_test, y_val = train_test_split(
+ X_test, y_test, test_size=100, stratify=y_test
+ if args.task == SENTIMENT else None)
+ print('# of validation examples: %d. # of test examples: %d.'
+ % (len(y_val), len(y_test)))
+
+ # train the model with pre-learned feature weights if specified
+ if args.feature_weights_file:
+ print('Training with pre-learned feature weights...')
+ task_utils.train_pretrained_weights(
+ feature_values, X_train, y_train, train_domains,
+ num_train_examples, X_val, y_val, X_test, y_test,
+ trg_domain, args, feature_names, parser_output_path,
+ perl_script_path)
+ continue
+
+ for baseline in args.baselines:
+
+ # select the training data dependent on the baseline
+ if baseline == RANDOM:
+ print('Randomly selecting examples...')
+ train_subset, _, labels_subset, _ = train_test_split(
+ X_train, y_train, train_size=num_train_examples,
+ stratify=y_train if args.task == SENTIMENT else None)
+ elif baseline == ALL_SOURCE_DATA:
+ print('Selecting all source data examples...')
+ train_subset, labels_subset = X_train, y_train
+ elif baseline == MOST_SIMILAR_DOMAIN:
+ print('Selecting examples from the most similar domain...')
+ most_similar_domain = similarity.get_most_similar_domain(
+ trg_domain, domain2term_dist)
+ train_subset, labels_subset, _ = domain2train_data[
+ most_similar_domain]
+ train_subset, _, labels_subset, _ = train_test_split(
+ train_subset, labels_subset, train_size=num_train_examples,
+ stratify=labels_subset if args.task == SENTIMENT else None)
+ elif baseline == MOST_SIMILAR_EXAMPLES:
+ print('Selecting the most similar examples...')
+ one_all_weights = np.ones(len(feature_names))
+ one_all_weights[1:] = 0
+ train_subset, labels_subset = task_utils.get_data_subsets(
+ feature_values, one_all_weights, X_train, y_train,
+ args.task, num_train_examples)
+ else:
+ raise ValueError('%s is not a baseline.' % baseline)
+
+ # train the baseline
+ val_accuracy, test_accuracy = train_and_evaluate(
+ train_subset, labels_subset, X_val, y_val,
+ X_test, y_test, parser_output_path=parser_output_path,
+ perl_script_path=perl_script_path)
+ run_dict[baseline].append((val_accuracy, test_accuracy))
+
+ # define the lower and upper bounds of the input space [-1, 1]
+ lower = np.array(len(feature_names) * [-1])
+ upper = np.array(len(feature_names) * [1])
+ print('Lower limits shape:', lower.shape)
+ print('Upper limits shape:', upper.shape)
+
+ print('Running Bayesian Optimization...')
+ res = bayesian_optimization(objective_function, lower=lower,
+ upper=upper,
+ num_iterations=args.num_iterations)
+
+ best_feature_weights = res['x_opt']
+ print('Best feature weights', best_feature_weights)
+ train_subset, labels_subset = task_utils.get_data_subsets(
+ feature_values, best_feature_weights, X_train, y_train,
+ args.task, num_train_examples)
+ val_accuracy, test_accuracy = train_and_evaluate(
+ train_subset, labels_subset, X_val, y_val, X_test, y_test,
+ parser_output_path=best_weights_parser_output_path,
+ perl_script_path=perl_script_path)
+ run_dict[BAYES_OPT].append((val_accuracy, test_accuracy,
+ best_feature_weights))
+
+ # log the results of all methods to the log file
+ data_utils.log_to_file(args.log_file, run_dict, trg_domain, args)
diff --git a/bilstm_tagger/License b/bilstm_tagger/License
new file mode 100755
index 0000000..87c6de2
--- /dev/null
+++ b/bilstm_tagger/License
@@ -0,0 +1,13 @@
+Copyright 2016 The bilstm-aux authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/bilstm_tagger/README.md b/bilstm_tagger/README.md
new file mode 100755
index 0000000..95c4940
--- /dev/null
+++ b/bilstm_tagger/README.md
@@ -0,0 +1,108 @@
+## bi-LSTM tagger
+
+Bidirectional Long-Short Term Memory tagger
+
+If you use this tagger please cite our paper:
+http://arxiv.org/abs/1604.05529
+
+### Requirements
+
+* python3
+* [dynet](https://github.com/clab/dynet)
+
+## Installation
+
+Download and install dynet in a directory of your choice DYNETDIR:
+
+```
+mkdir $DYNETDIR
+git clone https://github.com/clab/dynet
+```
+
+Follow the instructions in the Dynet documentation (use `-DPYTHON`,
+see http://dynet.readthedocs.io/en/latest/python.html).
+
+And compile dynet:
+
+```
+cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python`
+```
+
+(if you have a GPU:
+
+```
+cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python` -DBACKEND=cuda
+```
+)
+
+After successful installation open python and import dynet, you can
+test if the installation worked with:
+
+```
+>>> import dynet
+[dynet] random seed: 2809331847
+[dynet] allocating memory: 512MB
+[dynet] memory allocation done.
+>>> dynet.__version__
+2.0
+```
+
+(You may need to set your PYTHONPATH to include Dynet's `build/python`)
+
+#### DyNet supports python 3
+
+The old bilstm-aux had a patch to work with python 3. This
+is no longer necessary, as DyNet supports python 3 as of
+https://github.com/clab/dynet/pull/130#issuecomment-259656695
+
+
+#### Example command
+
+Training the tagger:
+
+```
+python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1 > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2
+```
+
+#### Embeddings
+
+The poly embeddings [(Al-Rfou et al.,
+2013)](https://sites.google.com/site/rmyeid/projects/polyglot) can be
+downloaded from [here](http://www.let.rug.nl/bplank/bilty/embeds.tar.gz) (0.6GB)
+
+
+#### A couple of remarks
+
+The choice of 22 languages from UD1.2 (rather than 33) is described in
+our TACL parsing paper, Section 3.1. [(Agić et al.,
+2016)](https://transacl.org/ojs/index.php/tacl/article/view/869). Note,
+however, that the bi-LSTM tagger does not require large amounts of
+training data (as discussed in our paper). Therefore above are
+results for all languages in UD1.3 (for the canonical language
+subparts, i.e., those with just the language prefix, no further
+suffix; e.g. 'nl' but not 'nl_lassy', and those languages which are
+distributed with word forms).
+
+The `bilty` code is a significantly refactored version of the code
+originally used in the paper. For example, `bilty` supports multi-task
+learning with output layers at different layers (`--pred_layer`), and
+it correctly supports stacked LSTMs (see e.g., Ballesteros et al.,
+2015, Dyer et al., 2015). The results on UD1.3 are obtained with
+`bilty` using no stacking (`--h_layers 1`).
+
+#### Recommended setting for `bilty`:
+
+* 3 stacked LSTMs, predicting on outermost layer, otherwise default settings, i.e., `--h_layers 3 --pred_layer 3`
+
+#### Reference
+
+```
+@inproceedings{plank:ea:2016,
+ title={{Multilingual Part-of-Speech Tagging with Bidirectional Long Short-Term Memory Models and Auxiliary Loss}},
+ author={Plank, Barbara and S{\o}gaard, Anders and Goldberg, Yoav},
+ booktitle={ACL 2016, arXiv preprint arXiv:1604.05529},
+ url={http://arxiv.org/abs/1604.05529},
+ year={2016}
+}
+```
+
diff --git a/bilstm_tagger/langs/lang_canonic.txt b/bilstm_tagger/langs/lang_canonic.txt
new file mode 100755
index 0000000..9e15b31
--- /dev/null
+++ b/bilstm_tagger/langs/lang_canonic.txt
@@ -0,0 +1,39 @@
+ar
+bg
+ca
+cs
+cu
+da
+de
+el
+en
+es
+et
+eu
+fa
+fi
+fr
+ga
+gl
+got
+grc
+he
+hi
+hr
+hu
+id
+it
+kk
+la
+lv
+nl
+no
+pl
+pt
+ro
+ru
+sl
+sv
+ta
+tr
+zh
diff --git a/bilstm_tagger/langs/lang_with_embeds.txt b/bilstm_tagger/langs/lang_with_embeds.txt
new file mode 100755
index 0000000..a595b41
--- /dev/null
+++ b/bilstm_tagger/langs/lang_with_embeds.txt
@@ -0,0 +1,26 @@
+ar
+bg
+ca
+cs
+da
+de
+el
+en
+es
+et
+eu
+fa
+fi
+fr
+ga
+he
+hi
+hr
+id
+it
+nl
+no
+pl
+pt
+sl
+sv
diff --git a/bilstm_tagger/results-UD1.3-pycnn.md b/bilstm_tagger/results-UD1.3-pycnn.md
new file mode 100755
index 0000000..1a9a47f
--- /dev/null
+++ b/bilstm_tagger/results-UD1.3-pycnn.md
@@ -0,0 +1,64 @@
+
+#### Results on UD1.3
+
+NB: The results below were obtained with the old version of DyNet (pycnn).
+
+The table below provides results on UD1.3 (iters=20, h_layers=1).
+
++poly uses pre-trained embeddings to initialize the
+word embeddings. Note that for some languages this slightly hurts performance.
+
+```
+python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1 > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2
+```
+
+| Lang | i20-h1 | +poly |
+| ---| -----:| -----:|
+| ar | 96.07 | 96.37 |
+| bg | 98.21 | 98.12 |
+| ca | 98.11 | 98.24 |
+| cs | 98.63 | 98.60 |
+| cu | 96.48 | -- |
+| da | 96.06 | 96.04 |
+| de | 92.91 | 93.64 |
+| el | 97.85 | 98.36 |
+| en | 94.60 | 95.04 |
+| es | 95.23 | 95.76 |
+| et | 95.75 | 96.57 |
+| eu | 93.86 | 95.40 |
+| fa | 96.82 | 97.38 |
+| fi | 94.32 | 95.35 |
+| fr | 96.34 | 96.45 |
+| ga | 90.50 | 91.29 |
+| gl | 96.89 | -- |
+| got | 95.97 | -- |
+| grc | 94.36 | -- |
+| he | 95.25 | 96.78 |
+| hi | 96.37 | 96.93 |
+| hr | 94.98 | 96.07 |
+| hu | 93.84 | -- |
+| id | 93.17 | 93.55 |
+| it | 97.40 | 97.82 |
+| kk | 77.68 | -- |
+| la | 90.17 | -- |
+| lv | 91.42 | -- |
+| nl | 90.02 | 89.87 |
+| no | 97.58 | 97.97 |
+| pl | 96.30 | 97.36 |
+| pt | 97.21 | 97.46 |
+| ro | 95.49 | -- |
+| ru | 95.69 | -- |
+| sl | 97.53 | 96.42 |
+| sv | 96.49 | 96.76 |
+| ta | 84.51 | -- |
+| tr | 93.81 | -- |
+| zh | 93.13 | -- |
+
+Using pre-trained embeddings often helps to improve accuracy; however, this does not
+strictly hold for all languages.
+
+For more information, prediction files and pre-trained models,
+visit [http://www.let.rug.nl/bplank/bilty/](http://www.let.rug.nl/bplank/bilty/)
+
+
+
diff --git a/bilstm_tagger/scripts/submit-bilty-ud1.3.sh b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh
new file mode 100755
index 0000000..f5c661b
--- /dev/null
+++ b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# train models on UD 1.3
+#
+SUBMIT=0
+
+PARTITION=nodes
+mkdir -p runs
+
+CORPUSDIR=~/corpora/pos/ud1.3/orgtok/goldpos/
+EXPDIR=/data/p252438/experiments/bilty
+
+tagger=bilty
+mkdir -p $EXPDIR/models/$tagger
+mkdir -p $EXPDIR/nohup
+mkdir -p $EXPDIR/predictions/$tagger
+
+ITERS=20
+#ITERS=30
+SIGMA=0.2
+CDIM=100
+
+SEED=1512141834
+TRAINER=sgd
+INDIM=64
+HLAYERS=1
+#HLAYERS=3
+T0_OUT=$HLAYERS
+
+for lang in `cat langs/lang_with_embeds.txt`; # all for which we have poly embeds (26)
+do
+ TRAIN=$lang-ud-train.conllu
+ JOBNAME=bilty-$lang-ud1.3-poly-i$ITERS-h$HLAYERS
+
+ echo "#!/bin/bash" > $$tmp
+ echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp
+ echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp
+ echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp
+ echo "module load CMake" >> $$tmp
+
+ echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --embeds embeds/poly_a/$lang.polyglot.txt --h_layers $HLAYERS --pred_layer $T0_OUT > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp
+
+ if [ $SUBMIT -eq 1 ] ; then
+ echo "SUBMIT"
+ sbatch $$tmp
+ fi
+ cat $$tmp
+ rm $$tmp
+done
+
+for lang in `cat langs/lang_canonic.txt` ; # all without embeddings (but only canical names)
+do
+ TRAIN=$lang-ud-train.conllu
+ JOBNAME=bilty-$lang-ud1.3-i$ITERS-h$HLAYERS
+
+ echo "#!/bin/bash" > $$tmp
+ echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp
+ echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp
+ echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp
+ echo "module load CMake" >> $$tmp
+
+ echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --h_layers $HLAYERS --pred_layer $T0_OUT > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp
+
+ if [ $SUBMIT -eq 1 ] ; then
+ echo "SUBMIT"
+ sbatch $$tmp
+ fi
+
+ cat $$tmp
+ rm $$tmp
+done
diff --git a/bilstm_tagger/src/bilty.py b/bilstm_tagger/src/bilty.py
new file mode 100755
index 0000000..e3d89d7
--- /dev/null
+++ b/bilstm_tagger/src/bilty.py
@@ -0,0 +1,580 @@
+#!/usr/bin/env python3
+# coding=utf-8
+"""
+A neural network based tagger (bi-LSTM)
+:author: Barbara Plank
+"""
+import argparse
+import random
+import time
+import sys
+import numpy as np
+import os
+import pickle
+import dynet
+
+from lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor
+from lib.mio import read_conll_file, load_embeddings_file
+
+
+def main():
+ parser = argparse.ArgumentParser(description="""Run the NN tagger""")
+ parser.add_argument("--train", nargs='*', help="train folder for each task") # allow multiple train files, each asociated with a task = position in the list
+ parser.add_argument("--pred_layer", nargs='*', help="layer of predictons for each task", required=True) # for each task the layer on which it is predicted (default 1)
+ parser.add_argument("--model", help="load model from file", required=False)
+ parser.add_argument("--iters", help="training iterations [default: 30]", required=False,type=int,default=30)
+ parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,type=int,default=64)
+ parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,type=int,default=100)
+ parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False,type=int,default=100)
+ parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,type=int,default=1)
+ parser.add_argument("--test", nargs='*', help="test file(s)", required=False) # should be in the same order/task as train
+ parser.add_argument("--dev", help="dev file(s)", required=False)
+ parser.add_argument("--output", help="output predictions to file", required=False,default=None)
+ parser.add_argument("--lower", help="lowercase words (not used)", required=False,default=False,action="store_true")
+ parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,default=None)
+ parser.add_argument("--embeds", help="word embeddings file", required=False, default=None)
+ parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float)
+ parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh", type=MyNNTaggerArgumentOptions.acfunct)
+ parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd")
+ parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False, type=int)
+ parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int)
+ parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None)
+
+ args = parser.parse_args()
+
+ if args.train:
+ if not args.pred_layer:
+ print("--pred_layer required!")
+ exit()
+
+ if args.dynet_seed:
+ print(">>> using seed: ", args.dynet_seed, file=sys.stderr)
+ np.random.seed(args.dynet_seed)
+ random.seed(args.dynet_seed)
+
+ if args.c_in_dim == 0:
+ print("no character embeddings", file=sys.stderr)
+
+ if args.save:
+ # check if folder exists
+ if os.path.isdir(args.save):
+ modeldir = os.path.dirname(args.save)
+ if not os.path.exists(modeldir):
+ os.makedirs(modeldir)
+ if args.output:
+ if os.path.isdir(args.output):
+ outdir = os.path.dirname(args.output)
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+
+ start = time.time()
+
+ if args.model:
+ print("loading model from file {}".format(args.model), file=sys.stderr)
+ tagger = load(args)
+ else:
+ tagger = NNTagger(args.in_dim,
+ args.h_dim,
+ args.c_in_dim,
+ args.h_layers,
+ args.pred_layer,
+ embeds_file=args.embeds,
+ activation=args.ac,
+ lower=args.lower,
+ noise_sigma=args.sigma)
+
+ if args.train and len( args.train ) != 0:
+ tagger.fit(args.train, args.iters, args.trainer, dev=args.dev)
+ if args.save:
+ save(tagger, args)
+
+ if args.test and len( args.test ) != 0:
+ stdout = sys.stdout
+ # One file per test ...
+ for i, test in enumerate( args.test ):
+ if args.output != None:
+ file_pred = args.output+".task"+str(i)
+ sys.stdout = open(file_pred, 'w')
+
+ sys.stderr.write('\nTesting Task'+str(i)+'\n')
+ sys.stderr.write('*******\n')
+ test_X, test_Y, org_X, org_Y, task_labels = tagger.get_data_as_indices(test, "task"+str(i))
+ correct, total = tagger.evaluate(test_X, test_Y, org_X, org_Y, task_labels, output_predictions=args.output)
+
+ print("\ntask%s test accuracy on %s items: %.4f" % (i, i+1, correct/total), file=sys.stderr)
+ print(("Task"+str(i)+" Done. Took {0:.2f} seconds.".format(time.time()-start)),file=sys.stderr)
+ sys.stdout = stdout
+
+
+ if args.ac:
+ activation=args.ac.__name__
+ else:
+ activation="None"
+ print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}"
+ "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}"
+ "\tembeds: {3}".format(args.in_dim,args.h_dim,args.h_layers,args.embeds,activation, args.sigma, args.lower, args.c_in_dim), file=sys.stderr)
+
+ if args.save_embeds:
+ tagger.save_embeds(args.save_embeds)
+
+def load(args):
+ """
+ load a model from file; specify the .model file, it assumes the *pickle file in the same location
+ """
+ myparams = pickle.load(open(args.model+".pickle", "rb"))
+ tagger = NNTagger(myparams["in_dim"],
+ myparams["h_dim"],
+ myparams["c_in_dim"],
+ myparams["h_layers"],
+ myparams["pred_layer"],
+ activation=myparams["activation"], tasks_ids=myparams["tasks_ids"])
+ tagger.set_indices(myparams["w2i"],myparams["c2i"],myparams["task2tag2idx"])
+ tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \
+ tagger.build_computation_graph(myparams["num_words"],
+ myparams["num_chars"])
+ #tagger.model.load(str.encode(args.model))
+ tagger.model.load(args.model)
+ print("model loaded: {}".format(args.model), file=sys.stderr)
+ return tagger
+
+def save(nntagger, args):
+ """
+ save a model; dynet only saves the parameters, need to store the rest separately
+ """
+ outdir = args.save
+ modelname = outdir + ".model"
+ #nntagger.model.save(str.encode(modelname)) #python3 needs it as bytes - no longer!
+ nntagger.model.save(modelname)
+ import pickle
+ print(nntagger.task2tag2idx)
+ myparams = {"num_words": len(nntagger.w2i),
+ "num_chars": len(nntagger.c2i),
+ "tasks_ids": nntagger.tasks_ids,
+ "w2i": nntagger.w2i,
+ "c2i": nntagger.c2i,
+ "task2tag2idx": nntagger.task2tag2idx,
+ "activation": nntagger.activation,
+ "in_dim": nntagger.in_dim,
+ "h_dim": nntagger.h_dim,
+ "c_in_dim": nntagger.c_in_dim,
+ "h_layers": nntagger.h_layers,
+ "embeds_file": nntagger.embeds_file,
+ "pred_layer": nntagger.pred_layer
+ }
+ pickle.dump(myparams, open( modelname+".pickle", "wb" ) )
+ print("model stored: {}".format(modelname), file=sys.stderr)
+
+
+class NNTagger(object):
+
+ def __init__(self,in_dim,h_dim,c_in_dim,h_layers,pred_layer,embeds_file=None,activation=dynet.tanh, lower=False, noise_sigma=0.1, tasks_ids=[]):
+ self.w2i = {} # word to index mapping
+ self.c2i = {} # char to index mapping
+ self.tasks_ids = tasks_ids # list of names for each task
+ self.task2tag2idx = {} # need one dictionary per task
+ self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task
+ self.model = dynet.Model() #init model
+ self.in_dim = in_dim
+ self.h_dim = h_dim
+ self.c_in_dim = c_in_dim
+ self.activation = activation
+ self.lower = lower
+ self.noise_sigma = noise_sigma
+ self.h_layers = h_layers
+ self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors
+ self.wembeds = None # lookup: embeddings for words
+ self.cembeds = None # lookup: embeddings for characters
+ self.embeds_file = embeds_file
+ self.char_rnn = None # RNN for character input
+
+
+ def pick_neg_log(self, pred, gold):
+ return -dynet.log(dynet.pick(pred, gold))
+
+ def set_indices(self, w2i, c2i, task2t2i):
+ for task_id in task2t2i:
+ self.task2tag2idx[task_id] = task2t2i[task_id]
+ self.w2i = w2i
+ self.c2i = c2i
+
+ def fit(self, list_folders_name, num_iterations, train_algo, dev=None):
+ """
+ train the tagger
+ """
+ print("read training data",file=sys.stderr)
+
+ nb_tasks = len( list_folders_name )
+
+ train_X, train_Y, task_labels, w2i, c2i, task2t2i = self.get_train_data(list_folders_name)
+
+ ## after calling get_train_data we have self.tasks_ids
+ self.task2layer = {task_id: out_layer for task_id, out_layer in zip(self.tasks_ids, self.pred_layer)}
+ print("task2layer", self.task2layer, file=sys.stderr)
+
+ # store mappings of words and tags to indices
+ self.set_indices(w2i,c2i,task2t2i)
+
+ if dev:
+ dev_X, dev_Y, org_X, org_Y, task_labels = self.get_data_as_indices(dev, "task0")
+
+ # init lookup parameters and define graph
+ print("build graph",file=sys.stderr)
+
+ num_words = len(self.w2i)
+ num_chars = len(self.c2i)
+
+ assert(nb_tasks==len(self.pred_layer))
+
+ self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)
+
+ if train_algo == "sgd":
+ trainer = dynet.SimpleSGDTrainer(self.model)
+ elif train_algo == "adam":
+ trainer = dynet.AdamTrainer(self.model)
+
+ train_data = list(zip(train_X,train_Y, task_labels))
+
+ for iter in range(num_iterations):
+ total_loss=0.0
+ total_tagged=0.0
+ random.shuffle(train_data)
+ for ((word_indices,char_indices),y, task_of_instance) in train_data:
+ # use same predict function for training and testing
+ output = self.predict(word_indices, char_indices, task_of_instance, train=True)
+
+ loss1 = dynet.esum([self.pick_neg_log(pred,gold) for pred, gold in zip(output, y)])
+ lv = loss1.value()
+ total_loss += lv
+ total_tagged += len(word_indices)
+
+ loss1.backward()
+ trainer.update()
+
+ print("iter {2} {0:>12}: {1:.2f}".format("total loss",total_loss/total_tagged,iter), file=sys.stderr)
+
+ if dev:
+ # evaluate after every epoch
+ correct, total = self.evaluate(dev_X, dev_Y, org_X, org_Y, task_labels)
+ print("\ndev accuracy: %.4f" % (correct/total), file=sys.stderr)
+
+
+
+ def build_computation_graph(self, num_words, num_chars):
+ """
+ build graph and link to parameters
+ """
+ # initialize the word embeddings and the parameters
+ if self.embeds_file:
+ print("loading embeddings", file=sys.stderr)
+ embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
+ assert(emb_dim==self.in_dim)
+ num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
+ # init model parameters and initialize them
+ wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+ cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+ init=0
+ l = len(embeddings.keys())
+ for word in embeddings.keys():
+ # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers)
+ if word in self.w2i:
+ wembeds.init_row(self.w2i[word], embeddings[word])
+ else:
+ self.w2i[word]=len(self.w2i.keys()) # add new word
+ wembeds.init_row(self.w2i[word], embeddings[word])
+ init+=1
+ print("initialized: {}".format(init), file=sys.stderr)
+
+ else:
+ wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+ cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+
+ #make it more flexible to add number of layers as specified by parameter
+ layers = [] # inner layers
+ output_layers_dict = {} # from task_id to actual softmax predictor
+ task_expected_at = {} # map task_id => output_layer_#
+
+ # connect output layers to tasks
+ for output_layer, task_id in zip(self.pred_layer, self.tasks_ids):
+ if output_layer > self.h_layers:
+ raise ValueError("cannot have a task at a layer which is beyond the model, increase h_layers")
+ task_expected_at[task_id] = output_layer
+
+ print("task expected at", task_expected_at, file=sys.stderr)
+
+ nb_tasks = len( self.tasks_ids )
+
+ print("h_layers:", self.h_layers, file=sys.stderr)
+ for layer_num in range(0,self.h_layers):
+ print(">>>", layer_num, "layer_num")
+
+ if layer_num == 0:
+ builder = dynet.LSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # in_dim: size of each layer
+ layers.append(BiRNNSequencePredictor(builder)) #returns forward and backward sequence
+ else:
+ # add inner layers (if h_layers >1)
+ builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
+ layers.append(BiRNNSequencePredictor(builder))
+
+ # store at which layer to predict task
+ for task_id in self.tasks_ids:
+ task_num_labels= len(self.task2tag2idx[task_id])
+ output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax))
+
+ sys.stderr.write('#\nOutput layers'+str(len(output_layers_dict))+'\n')
+
+ char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
+
+ predictors = {}
+ predictors["inner"] = layers
+ predictors["output_layers_dict"] = output_layers_dict
+ predictors["task_expected_at"] = task_expected_at
+
+ return predictors, char_rnn, wembeds, cembeds
+
+ def get_features(self, words):
+ """
+ from a list of words, return the word and word char indices
+ """
+ word_indices = []
+ word_char_indices = []
+ for word in words:
+ if word in self.w2i:
+ word_indices.append(self.w2i[word])
+ else:
+ word_indices.append(self.w2i["_UNK"])
+
+ chars_of_word = [self.c2i[""]]
+ for char in word:
+ if char in self.c2i:
+ chars_of_word.append(self.c2i[char])
+ else:
+ chars_of_word.append(self.c2i["_UNK"])
+ chars_of_word.append(self.c2i[""])
+ word_char_indices.append(chars_of_word)
+ return word_indices, word_char_indices
+
+
+ def get_data_as_indices(self, folder_name, task):
+ """
+ X = list of (word_indices, word_char_indices)
+ Y = list of tag indices
+ """
+ X, Y = [],[]
+ org_X, org_Y = [], []
+ task_labels = []
+ for (words, tags) in read_conll_file(folder_name):
+ word_indices, word_char_indices = self.get_features(words)
+ tag_indices = [self.task2tag2idx[task].get(tag) for tag in tags]
+ X.append((word_indices,word_char_indices))
+ Y.append(tag_indices)
+ org_X.append(words)
+ org_Y.append(tags)
+ task_labels.append( task )
+ return X, Y, org_X, org_Y, task_labels
+
+
+ def predict(self, word_indices, char_indices, task_id, train=False):
+ """
+ predict tags for a sentence represented as char+word embeddings
+ """
+ dynet.renew_cg() # new graph
+
+ char_emb = []
+ rev_char_emb = []
+ # get representation for words
+ for chars_of_token in char_indices:
+ # use last state as word representation
+ last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1]
+ rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1]
+ char_emb.append(last_state)
+ rev_char_emb.append(rev_last_state)
+
+ wfeatures = [self.wembeds[w] for w in word_indices]
+ features = [dynet.concatenate([w,c,rev_c]) for w,c,rev_c in zip(wfeatures,char_emb,reversed(rev_char_emb))]
+
+ if train: # only do at training time
+ features = [dynet.noise(fe,self.noise_sigma) for fe in features]
+
+ output_expected_at_layer = self.predictors["task_expected_at"][task_id]
+ output_expected_at_layer -=1
+
+ # go through layers
+ # input is now combination of w + char emb
+ prev = features
+ num_layers = self.h_layers
+# for i in range(0,num_layers-1):
+ for i in range(0,num_layers):
+ predictor = self.predictors["inner"][i]
+ forward_sequence, backward_sequence = predictor.predict_sequence(prev)
+ if i > 0 and self.activation:
+ # activation between LSTM layers
+ forward_sequence = [self.activation(s) for s in forward_sequence]
+ backward_sequence = [self.activation(s) for s in backward_sequence]
+
+ if i == output_expected_at_layer:
+ output_predictor = self.predictors["output_layers_dict"][task_id]
+ concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))]
+
+ if train and self.noise_sigma > 0.0:
+ concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer]
+ output = output_predictor.predict_sequence(concat_layer)
+ return output
+
+ prev = forward_sequence
+ prev_rev = backward_sequence # not used
+
+ raise Exception("oops should not be here")
+ return None
+
+ def evaluate(self, test_X, test_Y, org_X, org_Y, task_labels, output_predictions=None, verbose=True):
+ """
+ compute accuracy on a test file
+ """
+ correct = 0
+ total = 0.0
+
+ if output_predictions != None:
+ i2w = {self.w2i[w] : w for w in self.w2i.keys()}
+ task_id = task_labels[0] #get first
+ print(task_id,"labels:", self.task2tag2idx[task_id], file=sys.stderr )
+ i2t = {self.task2tag2idx[task_id][t] : t for t in self.task2tag2idx[task_id].keys()}
+
+ for i, ((word_indices, word_char_indices), gold_tag_indices, task_of_instance) in enumerate(zip(test_X, test_Y, task_labels)):
+ if verbose:
+ if i%100==0:
+ sys.stderr.write('%s'%i)
+ elif i%10==0:
+ sys.stderr.write('.')
+
+ output = self.predict(word_indices, word_char_indices, task_of_instance)
+ predicted_tag_indices = [np.argmax(o.value()) for o in output]
+ if output_predictions:
+ prediction = [i2t[idx] for idx in predicted_tag_indices]
+
+ words = org_X[i]
+ gold = org_Y[i]
+
+ for w,g,p in zip(words,gold,prediction):
+ print(("{}\t{}\t{}".format(w,g,p)))
+ print("")
+ correct += sum([1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold])
+ total += len(gold_tag_indices)
+
+ return correct, total
+
+
+
+ # Get train data: need to read each train set (linked to a task) separately
+
+ def get_train_data(self, list_folders_name):
+ """
+
+ :param list_folders_name: list of folders names
+ :param lower: whether to lowercase tokens
+
+ transform training data to features (word indices)
+ map tags to integers
+ """
+ X = []
+ Y = []
+ task_labels = [] #keeps track of where instances come from "task1" or "task2"..
+ self.tasks_ids = [] #record the id of the tasks
+
+ #num_sentences=0
+ #num_tokens=0
+
+ # word 2 indices and tag 2 indices
+ w2i = {} # word to index
+ c2i = {} # char to index
+ task2tag2idx = {} # id of the task -> tag2idx
+
+ w2i["_UNK"] = 0 # unk word / OOV
+ c2i["_UNK"] = 0 # unk char
+ c2i[""] = 1 # word start
+ c2i[""] = 2 # word end index
+
+
+ for i, folder_name in enumerate( list_folders_name ):
+ num_sentences=0
+ num_tokens=0
+ task_id = 'task'+str(i)
+ self.tasks_ids.append( task_id )
+ if task_id not in task2tag2idx:
+ task2tag2idx[task_id] = {}
+ for instance_idx, (words, tags) in enumerate(read_conll_file(folder_name)):
+ num_sentences += 1
+ instance_word_indices = [] #sequence of word indices
+ instance_char_indices = [] #sequence of char indices
+ instance_tags_indices = [] #sequence of tag indices
+
+ for i, (word, tag) in enumerate(zip(words, tags)):
+ num_tokens += 1
+
+ # map words and tags to indices
+ if word not in w2i:
+ w2i[word] = len(w2i)
+ instance_word_indices.append(w2i[word])
+
+ chars_of_word = [c2i[""]]
+ for char in word:
+ if char not in c2i:
+ c2i[char] = len(c2i)
+ chars_of_word.append(c2i[char])
+ chars_of_word.append(c2i[""])
+ instance_char_indices.append(chars_of_word)
+
+ if tag not in task2tag2idx[task_id]:
+ #tag2idx[tag]=len(tag2idx)
+ task2tag2idx[task_id][tag]=len(task2tag2idx[task_id])
+
+ instance_tags_indices.append(task2tag2idx[task_id].get(tag))
+
+ X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices
+ Y.append(instance_tags_indices)
+ task_labels.append(task_id)
+
+ #self.num_labels[task_id] = len( task2tag2idx[task_id] )
+
+ if num_sentences == 0 or num_tokens == 0:
+ sys.exit( "No data read from: "+folder_name )
+ print("TASK "+task_id+" "+folder_name, file=sys.stderr )
+ print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+ print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr)
+
+ assert(len(X)==len(Y))
+ return X, Y, task_labels, w2i, c2i, task2tag2idx #sequence of features, sequence of labels, necessary mappings
+
+
+ def save_embeds(self, out_filename):
+ # construct reverse mapping
+ i2w = {self.w2i[w]: w for w in self.w2i.keys()}
+
+ OUT = open(out_filename+".w.emb","w")
+ for word_id in i2w.keys():
+ wembeds_expression = self.wembeds[word_id]
+ word = i2w[word_id]
+ OUT.write("{} {}\n".format(word," ".join([str(x) for x in wembeds_expression.npvalue()])))
+ OUT.close()
+
+
+class MyNNTaggerArgumentOptions(object):
+ def __init__(self):
+ pass
+ ### functions for checking arguments
+ def acfunct(arg):
+ """ check for allowed argument for --ac option """
+ try:
+ functions = [dynet.rectify, dynet.tanh]
+ functions = { function.__name__ : function for function in functions}
+ functions["None"] = None
+ return functions[str(arg)]
+ except:
+ raise argparse.ArgumentTypeError("String {} does not match required format".format(arg,))
+
+
+
+if __name__=="__main__":
+ main()
diff --git a/bilstm_tagger/src/run_simple.py b/bilstm_tagger/src/run_simple.py
new file mode 100755
index 0000000..60eec71
--- /dev/null
+++ b/bilstm_tagger/src/run_simple.py
@@ -0,0 +1,23 @@
+#### Example of using bilty from within code
+##
+## to properly seed dyNet add parameter to your script:
+## python run_simply.py --dynet-seed 113
+
+from bilstm_tagger.src.simplebilty import SimpleBiltyTagger
+import random
+### Use --dynet-seed $SEED
+seed=113 # assume we pass this to script
+train_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-dev.conllu"
+dev_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-test.conllu"
+in_dim=64
+h_dim=100
+c_in_dim=100
+h_layers=1
+iters=2
+trainer="sgd"
+tagger = SimpleBiltyTagger(in_dim, h_dim,c_in_dim,h_layers,embeds_file=None)
+train_X, train_Y = tagger.get_train_data(train_data)
+tagger.fit(train_X, train_Y, iters, trainer,seed=seed)
+test_X, test_Y = tagger.get_data_as_indices(dev_data)
+correct, total = tagger.evaluate(test_X, test_Y)
+print(correct, total, correct/total)
diff --git a/bilstm_tagger/src/simplebilty.py b/bilstm_tagger/src/simplebilty.py
new file mode 100755
index 0000000..11f4972
--- /dev/null
+++ b/bilstm_tagger/src/simplebilty.py
@@ -0,0 +1,598 @@
+#!/usr/bin/env python3
+# coding=utf-8
+"""
+A neural network based tagger (bi-LSTM) - version w/o MTL
+:author: Barbara Plank
+"""
+import argparse
+import random
+import time
+import sys
+import numpy as np
+import os
+import pickle
+import dynet
+
+from bilstm_tagger.src.lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor
+from bilstm_tagger.src.lib.mio import read_conll_file, load_embeddings_file
+
+
+def main():
+ parser = argparse.ArgumentParser(description="""Run the NN tagger""")
+ parser.add_argument("--train",
+ help="train data") # allow multiple train files, each asociated with a task = position in the list
+ # parser.add_argument("--pred_layer", help="layer of predictons", default=1) # assume always h_layer here
+ parser.add_argument("--model", help="load model from file", required=False)
+ parser.add_argument("--iters", help="training iterations [default: 30]", required=False, type=int, default=30)
+ parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,
+ type=int, default=64)
+ parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,
+ type=int, default=100)
+ parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False, type=int, default=100)
+ parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,
+ type=int, default=1)
+ parser.add_argument("--test", nargs='*', help="test file(s)",
+ required=False) # should be in the same order/task as train
+ parser.add_argument("--dev", help="dev file(s)", required=False)
+ parser.add_argument("--output", help="output predictions to file", required=False, default=None)
+ parser.add_argument("--lower", help="lowercase words (not used)", required=False, default=False,
+ action="store_true")
+ parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,
+ default=None)
+ parser.add_argument("--embeds", help="word embeddings file", required=False, default=None)
+ parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float)
+ parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh",
+ type=MyNNTaggerArgumentOptions.acfunct)
+ parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd")
+ parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False,
+ type=int)
+ parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int)
+ parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None)
+
+ args = parser.parse_args()
+
+ if args.save:
+ # check if folder exists
+ if os.path.isdir(args.save):
+ modeldir = os.path.dirname(args.save)
+ if not os.path.exists(modeldir):
+ os.makedirs(modeldir)
+ if args.output:
+ if os.path.isdir(args.output):
+ outdir = os.path.dirname(args.output)
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ start = time.time()
+
+ if args.model:
+ print("loading model from file {}".format(args.model), file=sys.stderr)
+ tagger = load(args.model)
+ else:
+ tagger = SimpleBiltyTagger(args.in_dim,
+ args.h_dim,
+ args.c_in_dim,
+ args.h_layers,
+ embeds_file=args.embeds,
+ activation=args.ac,
+ lower=args.lower,
+ noise_sigma=args.sigma)
+
+ if args.train:
+ ## read data
+ train_X, train_Y = tagger.get_train_data(args.train)
+
+ if dev:
+ dev_X, dev_Y, org_X, org_Y, task_labels = self.get_data_as_indices(dev, "task0")
+
+ tagger.fit(args.train, args.iters, args.trainer, seed=args.dynet_seed)
+ if args.save:
+ save(tagger, args.save)
+
+ if args.test and len(args.test) != 0:
+ stdout = sys.stdout
+ # One file per test ...
+ for i, test in enumerate(args.test):
+ if args.output != None:
+ file_pred = args.output + ".task" + str(i)
+ sys.stdout = open(file_pred, 'w')
+
+ sys.stderr.write('\nTesting Task' + str(i) + '\n')
+ sys.stderr.write('*******\n')
+ test_X, test_Y, org_X, org_Y, task_labels = tagger.get_data_as_indices(test, "task" + str(i))
+ correct, total = tagger.evaluate(test_X, test_Y, org_X, org_Y, task_labels, output_predictions=args.output)
+
+ print("\ntask%s test accuracy on %s items: %.4f" % (i, i + 1, correct / total), file=sys.stderr)
+ print(("Task" + str(i) + " Done. Took {0:.2f} seconds.".format(time.time() - start)), file=sys.stderr)
+ sys.stdout = stdout
+
+ if args.ac:
+ activation = args.ac.__name__
+ else:
+ activation = "None"
+ print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}"
+ "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}"
+ "\tembeds: {3}".format(args.in_dim, args.h_dim, args.h_layers, args.embeds, activation, args.sigma,
+ args.lower, args.c_in_dim), file=sys.stderr)
+
+ if args.save_embeds:
+ tagger.save_embeds(args.save_embeds)
+
+
+def load(model_file):
+ """
+ load a model from file; specify the .model file, it assumes the *pickle file in the same location
+ """
+ myparams = pickle.load(open(model_file + ".pickle", "rb"))
+ tagger = SimpleBiltyTagger(myparams["in_dim"],
+ myparams["h_dim"],
+ myparams["c_in_dim"],
+ myparams["h_layers"],
+ activation=myparams["activation"])
+ tagger.set_indices(myparams["w2i"], myparams["c2i"], myparams["tag2idx"])
+ tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \
+ tagger.build_computation_graph(myparams["num_words"],
+ myparams["num_chars"])
+ tagger.model.load(model_file)
+ print("model loaded: {}".format(model_file), file=sys.stderr)
+ return tagger
+
+
+def save(nntagger, model_file_name):
+ """
+ save a model; dynet only saves the parameters, need to store the rest separately
+ """
+ nntagger.model.save(model_file_name)
+ import pickle
+ myparams = {"num_words": len(nntagger.w2i),
+ "num_chars": len(nntagger.c2i),
+ "w2i": nntagger.w2i,
+ "c2i": nntagger.c2i,
+ "tag2idx": nntagger.tag2idx,
+ "activation": nntagger.activation,
+ "in_dim": nntagger.in_dim,
+ "h_dim": nntagger.h_dim,
+ "c_in_dim": nntagger.c_in_dim,
+ "h_layers": nntagger.h_layers,
+ "embeds_file": nntagger.embeds_file,
+ "pred_layer": nntagger.pred_layer
+ }
+ pickle.dump(myparams, open(model_file_name + ".pickle", "wb"))
+ print("model stored: {}".format(model_file_name), file=sys.stderr)
+
+
+class SimpleBiltyTagger(object):
+ def __init__(self, in_dim, h_dim, c_in_dim, h_layers, embeds_file=None, activation=dynet.tanh, lower=False,
+ noise_sigma=0.1, tasks_ids=[]):
+ self.w2i = {} # word to index mapping
+ self.c2i = {} # char to index mapping
+ self.tag2idx = {} # tag to tag_id mapping
+ self.pred_layer = 1 # at which layer to predict
+ self.model = dynet.Model() # init model
+ self.in_dim = in_dim
+ self.h_dim = h_dim
+ self.c_in_dim = c_in_dim
+ self.activation = activation
+ self.lower = lower
+ self.noise_sigma = noise_sigma
+ self.h_layers = h_layers
+ self.predictors = {"inner": [], "output_layers_dict": {},
+ "task_expected_at": {}} # the inner layers and predictors
+ self.wembeds = None # lookup: embeddings for words
+ self.cembeds = None # lookup: embeddings for characters
+ self.embeds_file = embeds_file
+ self.char_rnn = None # RNN for character input
+
+ def pick_neg_log(self, pred, gold):
+ return -dynet.log(dynet.pick(pred, gold))
+
+ def set_indices(self, w2i, c2i, tag2idx):
+ self.tag2idx = tag2idx
+ self.w2i = w2i
+ self.c2i = c2i
+
+ def fit(self, train_X, train_Y, num_epochs, train_algo, val_X=None, val_Y=None, patience=2, model_path=None,
+ seed=None):
+ """
+ train the tagger
+ """
+ print("read training data", file=sys.stderr)
+
+ if seed:
+ print(">>> using seed: ", seed, file=sys.stderr)
+ random.seed(seed) # setting random seed
+
+ # init lookup parameters and define graph
+ print("build graph", file=sys.stderr)
+
+ num_words = len(self.w2i)
+ num_chars = len(self.c2i)
+
+ self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)
+
+ if train_algo == "sgd":
+ trainer = dynet.SimpleSGDTrainer(self.model)
+ elif train_algo == "adam":
+ trainer = dynet.AdamTrainer(self.model)
+ else:
+ raise ValueError('%s is not a valid optimizer.' % train_algo)
+
+ assert (len(train_X) == len(train_Y))
+ train_data = list(zip(train_X, train_Y))
+
+ print('Starting training for %d epochs...' % num_epochs)
+ best_val_acc, epochs_no_improvement = 0., 0
+ if val_X is not None and val_Y is not None and model_path is not None:
+ print('Using early stopping with patience of %d...' % patience)
+ for cur_iter in range(num_epochs):
+ total_loss = 0.0
+ total_tagged = 0.0
+ random.shuffle(train_data)
+ for ((word_indices, char_indices), y) in train_data:
+ # use same predict function for training and testing
+ output = self.predict(word_indices, char_indices, train=True)
+
+ loss1 = dynet.esum([self.pick_neg_log(pred, gold) for pred, gold in zip(output, y)])
+ lv = loss1.value()
+ total_loss += lv
+ total_tagged += len(word_indices)
+
+ loss1.backward()
+ trainer.update()
+ total_loss = total_loss / total_tagged
+ print("epoch {2} {0:>12}: {1:.2f}".format("total loss", total_loss, cur_iter))
+
+ # get the best accuracy on the validation set
+ val_correct, val_total = self.evaluate(val_X, val_Y)
+ val_accuracy = val_correct / val_total
+
+ if val_X is not None and val_Y is not None and model_path is not None:
+ if val_accuracy > best_val_acc:
+ print('Accuracy %.4f is better than best val accuracy %.4f.' % (val_accuracy, best_val_acc))
+ best_val_acc = val_accuracy
+ epochs_no_improvement = 0
+ save(self, model_path)
+ else:
+ print('Accuracy %.4f is worse than best val loss %.4f.' % (val_accuracy, best_val_acc))
+ epochs_no_improvement += 1
+ if epochs_no_improvement == patience:
+ print('No improvement for %d epochs. Early stopping...' % epochs_no_improvement)
+ break
+
+ def build_computation_graph(self, num_words, num_chars):
+ """
+ build graph and link to parameters
+ """
+ # initialize the word embeddings and the parameters
+ if self.embeds_file:
+ print("loading embeddings", file=sys.stderr)
+ embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
+ assert (emb_dim == self.in_dim)
+ num_words = len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings
+ # init model parameters and initialize them
+ wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+ cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+ init = 0
+ l = len(embeddings.keys())
+ for word in embeddings.keys():
+ # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers)
+ if word in self.w2i:
+ wembeds.init_row(self.w2i[word], embeddings[word])
+ else:
+ self.w2i[word] = len(self.w2i.keys()) # add new word
+ wembeds.init_row(self.w2i[word], embeddings[word])
+ init += 1
+ print("initialized: {}".format(init), file=sys.stderr)
+
+ else:
+ wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
+ cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))
+
+ # make it more flexible to add number of layers as specified by parameter
+ layers = [] # inner layers
+ # print("h_layers:", self.h_layers, file=sys.stderr)
+ for layer_num in range(0, self.h_layers):
+ # print(">>>", layer_num, "layer_num")
+
+ if layer_num == 0:
+ builder = dynet.LSTMBuilder(1, self.in_dim + self.c_in_dim * 2, self.h_dim,
+ self.model) # in_dim: size of each layer
+ layers.append(BiRNNSequencePredictor(builder)) # returns forward and backward sequence
+ else:
+ # add inner layers (if h_layers >1)
+ builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model)
+ layers.append(BiRNNSequencePredictor(builder))
+
+ # store at which layer to predict task
+
+ task_num_labels = len(self.tag2idx)
+ output_layer = FFSequencePredictor(Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))
+
+ char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))
+
+ predictors = {}
+ predictors["inner"] = layers
+ predictors["output_layers_dict"] = output_layer
+ predictors["task_expected_at"] = self.h_layers
+
+ return predictors, char_rnn, wembeds, cembeds
+
+ def get_features(self, words):
+ """
+ from a list of words, return the word and word char indices
+ """
+ word_indices = []
+ word_char_indices = []
+ for word in words:
+ if word in self.w2i:
+ word_indices.append(self.w2i[word])
+ else:
+ word_indices.append(self.w2i["_UNK"])
+
+ chars_of_word = [self.c2i[""]]
+ for char in word:
+ if char in self.c2i:
+ chars_of_word.append(self.c2i[char])
+ else:
+ chars_of_word.append(self.c2i["_UNK"])
+ chars_of_word.append(self.c2i[""])
+ word_char_indices.append(chars_of_word)
+ return word_indices, word_char_indices
+
+ def get_data_as_indices(self, file_name):
+ """
+ X = list of (word_indices, word_char_indices)
+ Y = list of tag indices
+ """
+ X, Y = [], []
+ org_X, org_Y = [], []
+
+ for (words, tags) in read_conll_file(file_name):
+ word_indices, word_char_indices = self.get_features(words)
+ tag_indices = [self.tag2idx.get(tag) for tag in tags]
+ X.append((word_indices, word_char_indices))
+ Y.append(tag_indices)
+ org_X.append(words)
+ org_Y.append(tags)
+ return X, Y # , org_X, org_Y - for now don't use
+
+ def get_data_as_indices_from_instances(self, dev_words, dev_tags):
+ """
+ Extension of get_data_as_indices. Use words and tags rather than a file as input.
+ X = list of (word_indices, word_char_indices)
+ Y = list of tag indices
+ """
+ X, Y = [], []
+ org_X, org_Y = [], []
+
+ for (words, tags) in zip(dev_words, dev_tags):
+ word_indices, word_char_indices = self.get_features(words)
+ tag_indices = [self.tag2idx.get(tag) for tag in tags]
+ X.append((word_indices, word_char_indices))
+ Y.append(tag_indices)
+ org_X.append(words)
+ org_Y.append(tags)
+ return X, Y # , org_X, org_Y - for now don't use
+
+ def predict(self, word_indices, char_indices, train=False):
+ """
+ predict tags for a sentence represented as char+word embeddings
+ """
+ dynet.renew_cg() # new graph
+
+ char_emb = []
+ rev_char_emb = []
+ # get representation for words
+ for chars_of_token in char_indices:
+ # use last state as word representation
+ last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1]
+ rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1]
+ char_emb.append(last_state)
+ rev_char_emb.append(rev_last_state)
+
+ wfeatures = [self.wembeds[w] for w in word_indices]
+ features = [dynet.concatenate([w, c, rev_c]) for w, c, rev_c in
+ zip(wfeatures, char_emb, reversed(rev_char_emb))]
+
+ if train: # only do at training time
+ features = [dynet.noise(fe, self.noise_sigma) for fe in features]
+
+ output_expected_at_layer = self.h_layers
+ output_expected_at_layer -= 1
+
+ # go through layers
+ # input is now combination of w + char emb
+ prev = features
+ num_layers = self.h_layers
+ for i in range(0, num_layers):
+ predictor = self.predictors["inner"][i]
+ forward_sequence, backward_sequence = predictor.predict_sequence(prev)
+ if i > 0 and self.activation:
+ # activation between LSTM layers
+ forward_sequence = [self.activation(s) for s in forward_sequence]
+ backward_sequence = [self.activation(s) for s in backward_sequence]
+
+ if i == output_expected_at_layer:
+ output_predictor = self.predictors["output_layers_dict"]
+ concat_layer = [dynet.concatenate([f, b]) for f, b in
+ zip(forward_sequence, reversed(backward_sequence))]
+
+ if train and self.noise_sigma > 0.0:
+ concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]
+ output = output_predictor.predict_sequence(concat_layer)
+ return output
+
+ prev = forward_sequence
+ prev_rev = backward_sequence # not used
+
+ raise Exception("oops should not be here")
+ return None
+
+ def evaluate(self, test_X, test_Y):
+ """
+ compute accuracy on a test file
+ """
+ correct = 0
+ total = 0.0
+
+ for i, ((word_indices, word_char_indices), gold_tag_indices) in enumerate(zip(test_X, test_Y)):
+ output = self.predict(word_indices, word_char_indices)
+ predicted_tag_indices = [np.argmax(o.value()) for o in output]
+
+ correct += sum(
+ [1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold])
+ total += len(gold_tag_indices)
+
+ return correct, total
+
+ # Get train data: need to read each train set (linked to a task) separately
+
+ def get_train_data(self, train_data):
+ """
+ transform training data to features (word indices)
+ map tags to integers
+ """
+ X = []
+ Y = []
+
+ # word 2 indices and tag 2 indices
+ w2i = {} # word to index
+ c2i = {} # char to index
+ tag2idx = {} # tag2idx
+
+ w2i["_UNK"] = 0 # unk word / OOV
+ c2i["_UNK"] = 0 # unk char
+ c2i[""] = 1 # word start
+ c2i[""] = 2 # word end index
+
+ num_sentences = 0
+ num_tokens = 0
+ for instance_idx, (words, tags) in enumerate(read_conll_file(train_data)):
+ instance_word_indices = [] # sequence of word indices
+ instance_char_indices = [] # sequence of char indices
+ instance_tags_indices = [] # sequence of tag indices
+
+ for i, (word, tag) in enumerate(zip(words, tags)):
+
+ # map words and tags to indices
+ if word not in w2i:
+ w2i[word] = len(w2i)
+ instance_word_indices.append(w2i[word])
+
+ chars_of_word = [c2i[""]]
+ for char in word:
+ if char not in c2i:
+ c2i[char] = len(c2i)
+ chars_of_word.append(c2i[char])
+ chars_of_word.append(c2i[""])
+ instance_char_indices.append(chars_of_word)
+
+ if tag not in tag2idx:
+ tag2idx[tag] = len(tag2idx)
+
+ instance_tags_indices.append(tag2idx.get(tag))
+
+ num_tokens += 1
+
+ num_sentences += 1
+
+ X.append((instance_word_indices,
+ instance_char_indices)) # list of word indices, for every word list of char indices
+ Y.append(instance_tags_indices)
+
+ print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+ print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
+
+ assert (len(X) == len(Y))
+
+ # store mappings of words and tags to indices
+ self.set_indices(w2i, c2i, tag2idx)
+
+ return X, Y
+
+ def get_train_data_from_instances(self, train_words, train_tags):
+ """
+ Extension of get_train_data method. Extracts training data from two arrays of word and label lists.
+ transform training data to features (word indices)
+ map tags to integers
+ :param train_words: a numpy array containing lists of words
+ :param train_tags: a numpy array containing lists of corresponding tags
+ """
+ X = []
+ Y = []
+
+ # word 2 indices and tag 2 indices
+ w2i = {} # word to index
+ c2i = {} # char to index
+ tag2idx = {} # tag2idx
+
+ w2i["_UNK"] = 0 # unk word / OOV
+ c2i["_UNK"] = 0 # unk char
+ c2i[""] = 1 # word start
+ c2i[""] = 2 # word end index
+
+ num_sentences = 0
+ num_tokens = 0
+ for instance_idx, (words, tags) in enumerate(zip(train_words, train_tags)):
+ instance_word_indices = [] # sequence of word indices
+ instance_char_indices = [] # sequence of char indices
+ instance_tags_indices = [] # sequence of tag indices
+
+ for i, (word, tag) in enumerate(zip(words, tags)):
+
+ # map words and tags to indices
+ if word not in w2i:
+ w2i[word] = len(w2i)
+ instance_word_indices.append(w2i[word])
+
+ chars_of_word = [c2i[""]]
+ for char in word:
+ if char not in c2i:
+ c2i[char] = len(c2i)
+ chars_of_word.append(c2i[char])
+ chars_of_word.append(c2i[""])
+ instance_char_indices.append(chars_of_word)
+
+ if tag not in tag2idx:
+ tag2idx[tag] = len(tag2idx)
+
+ instance_tags_indices.append(tag2idx.get(tag))
+
+ num_tokens += 1
+
+ num_sentences += 1
+
+ X.append((instance_word_indices,
+ instance_char_indices)) # list of word indices, for every word list of char indices
+ Y.append(instance_tags_indices)
+
+ print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
+ print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)
+
+ assert (len(X) == len(Y))
+
+ # store mappings of words and tags to indices
+ self.set_indices(w2i, c2i, tag2idx)
+
+ return X, Y
+
+
+class MyNNTaggerArgumentOptions(object):
+ def __init__(self):
+ pass
+
+ ### functions for checking arguments
+ def acfunct(arg):
+ """ check for allowed argument for --ac option """
+ try:
+ functions = [dynet.rectify, dynet.tanh]
+ functions = {function.__name__: function for function in functions}
+ functions["None"] = None
+ return functions[str(arg)]
+ except:
+ raise argparse.ArgumentTypeError("String {} does not match required format".format(arg, ))
+
+
+# run the command-line entry point only when executed as a script, not on import
+if __name__ == "__main__":
+ main()
diff --git a/bist_parser/LICENSE b/bist_parser/LICENSE
new file mode 100644
index 0000000..8dada3e
--- /dev/null
+++ b/bist_parser/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/bist_parser/README.md b/bist_parser/README.md
new file mode 100644
index 0000000..4147304
--- /dev/null
+++ b/bist_parser/README.md
@@ -0,0 +1,75 @@
+# BIST Parsers
+## Graph & Transition based dependency parsers using BiLSTM feature extractors.
+
+The techniques behind the parser are described in the paper [Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations](https://www.transacl.org/ojs/index.php/tacl/article/viewFile/885/198). Further materials can be found [here](http://elki.cc/#/article/Simple%20and%20Accurate%20Dependency%20Parsing%20Using%20Bidirectional%20LSTM%20Feature%20Representations).
+
+#### Required software
+
+ * Python 2.7 interpreter
+ * [DyNet library](https://github.com/clab/dynet/tree/master/python)
+
+#### Train a parsing model
+
+The software requires `training.conll` and `development.conll` files formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat).
+For the faster graph-based parser, change directory to `bmstparser` (1200 words/sec), and for the more accurate transition-based parser, change directory to `barchybrid` (800 words/sec). The benchmark was performed on a MacBook Pro with an i7 processor. The graph-based parser achieves an accuracy of 93.8 UAS and the transition-based parser an accuracy of 94.7 UAS on the standard Penn Treebank dataset (Stanford Dependencies). The transition-based parser requires no part-of-speech tagging, and setting all the tags to NN will produce the expected accuracy. The model and param files achieving those scores are available for download ([Graph-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AADgBS9hb9vy0o-UBZW9AbbKa/bestfirstorder.tar.gz?dl=0), [Transition-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AACEPp3DLQeJnRA_QyPmll93a/bestarchybrid.tar.gz?dl=0)). The trained models include improvements beyond those described in the paper, to be published soon.
+
+To train a parsing model with either parsing architecture, type the following at the command prompt:
+
+ python src/parser.py --dynet-seed 123456789 [--dynet-mem XXXX] --outdir [results directory] --train training.conll --dev development.conll --epochs 30 --lstmdims 125 --lstmlayers 2 [--extrn extrn.vectors] --bibi-lstm
+
+We use the same external embeddings used in [Transition-Based Dependency Parsing with Stack Long Short-Term Memory](http://arxiv.org/abs/1505.08075), which can be downloaded from the authors' [GitHub repository](https://github.com/clab/lstm-parser/) and [directly here](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing).
+
+If you are training a transition-based parser, then for optimal results you should add the following to the command line: `--k 3 --usehead --userl`. These switches will set the stack to 3 elements; use the BiLSTM of the head of trees on the stack as feature vectors; and add the BiLSTM of the right/leftmost children to the feature vectors.
+
+Note 1: You can run it without pos embeddings by setting the pos embedding dimensions to zero (--pembedding 0).
+
+Note 2: The reported test result is the one matching the highest development score.
+
+Note 3: The parser calculates (after each iteration) the accuracies excluding punctuation symbols by running the `eval.pl` script from the CoNLL-X Shared Task and stores the results in the directory specified by `--outdir`.
+
+Note 4: The external embeddings parameter is optional and is better left unused when training/predicting with a graph-based model.
+
+#### Parse data with your parsing model
+
+The command for parsing a `test.conll` file formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat) with a previously trained model is:
+
+    python src/parser.py --predict --outdir [results directory] --test test.conll [--extrn extrn.vectors] --model [trained model file] --params [param file generated during training]
+
+The parser will store the resulting conll file in the out directory (`--outdir`).
+
+Note 1: If you are using the arc-hybrid trained model we provided please use the `--extrn` flag and specify the location of the external embeddings file.
+
+Note 2: If you are using the first-order trained model we provided please do not use the `--extrn` flag.
+
+#### Citation
+
+If you make use of this software for research purposes, we would appreciate it if you cite the following:
+
+ @article{DBLP:journals/tacl/KiperwasserG16,
+ author = {Eliyahu Kiperwasser and Yoav Goldberg},
+ title = {Simple and Accurate Dependency Parsing Using Bidirectional {LSTM}
+ Feature Representations},
+ journal = {{TACL}},
+ volume = {4},
+ pages = {313--327},
+ year = {2016},
+ url = {https://transacl.org/ojs/index.php/tacl/article/view/885},
+ timestamp = {Tue, 09 Aug 2016 14:51:09 +0200},
+ biburl = {http://dblp.uni-trier.de/rec/bib/journals/tacl/KiperwasserG16},
+ bibsource = {dblp computer science bibliography, http://dblp.org}
+ }
+
+#### License
+
+This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+#### Contact
+
+For questions and usage issues, please contact elikip@gmail.com
+
+#### Credits
+
+[Eliyahu Kiperwasser](http://elki.cc)
+
+[Yoav Goldberg](https://www.cs.bgu.ac.il/~yoavg/uni/)
+
diff --git a/bist_parser/__init__.py b/bist_parser/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bist_parser/barchybrid/src/arc_hybrid.py b/bist_parser/barchybrid/src/arc_hybrid.py
new file mode 100644
index 0000000..2d74fe4
--- /dev/null
+++ b/bist_parser/barchybrid/src/arc_hybrid.py
@@ -0,0 +1,401 @@
+from dynet import *
+from utils import ParseForest, read_conll, write_conll
+from operator import itemgetter
+from itertools import chain
+import utils, time, random
+import numpy as np
+
+
+class ArcHybridLSTM:
+    def __init__(self, words, pos, rels, w2i, options):
+        """Create the DyNet model, the lookup tables and the two scoring MLPs.
+
+        words   -- stored as self.wordsCount and queried with .get(norm, 0)
+                   in getWordEmbeddings; len(words) sizes the word lookup.
+        pos     -- iterable of POS tags; enumerated into self.pos.
+        rels    -- sequence of relation labels; kept as self.irels.
+        w2i     -- mapping word -> index; every index is shifted by 3 below.
+        options -- parsed option object supplying the hyper-parameters read
+                   in this method.
+        """
+        self.model = Model()
+        self.trainer = AdamTrainer(self.model)
+        random.seed(1)
+
+        self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
+        self.activation = self.activations[options.activation]
+
+        self.oracle = options.oracle
+        # ldims is twice lstm_dims; each directional builder below gets
+        # ldims * 0.5 units.
+        self.ldims = options.lstm_dims * 2
+        self.wdims = options.wembedding_dims
+        self.pdims = options.pembedding_dims
+        self.rdims = options.rembedding_dims
+        self.layers = options.lstm_layers
+        self.wordsCount = words
+        # Indices start at 3: 1 is '*PAD*' and 2 is '*INITIAL*' (assigned
+        # further down); 0 is the fallback index used by getWordEmbeddings.
+        self.vocab = {word: ind+3 for word, ind in w2i.iteritems()}
+        self.pos = {word: ind+3 for ind, word in enumerate(pos)}
+        self.rels = {word: ind for ind, word in enumerate(rels)}
+        self.irels = rels
+
+        self.headFlag = options.headFlag
+        self.rlMostFlag = options.rlMostFlag
+        self.rlFlag = options.rlFlag
+        self.k = options.window
+
+        # Number of vectors kept per token in root.lstms: one for the head
+        # flag plus two when either rl flag is set.
+        self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0)
+
+        self.external_embedding = None
+        if options.external_embedding is not None:
+            external_embedding_fp = open(options.external_embedding,'r')
+            # The first line is read and discarded (presumably a header
+            # line -- confirm against the embedding file format).
+            external_embedding_fp.readline()
+            self.external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] for line in external_embedding_fp}
+            external_embedding_fp.close()
+
+            self.edim = len(self.external_embedding.values()[0])
+            self.noextrn = [0.0 for _ in xrange(self.edim)]
+            self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
+            self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
+            for word, i in self.extrnd.iteritems():
+                self.elookup.init_row(i, self.external_embedding[word])
+            self.extrnd['*PAD*'] = 1
+            self.extrnd['*INITIAL*'] = 2
+
+            print 'Load external embedding. Vector dimensions', self.edim
+
+        # Width of the per-token input vector fed to the first RNN layer.
+        dims = self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0)
+        self.blstmFlag = options.blstmFlag
+        self.bibiFlag = options.bibiFlag
+
+        # NOTE(review): surfaceBuilders is only created when bibiFlag or
+        # blstmFlag is set.
+        if self.bibiFlag:
+            self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model),
+                                    VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model)]
+            self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model),
+                                     VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)]
+        elif self.blstmFlag:
+            if self.layers > 0:
+                self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)]
+            else:
+                self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)]
+
+        self.hidden_units = options.hidden_units
+        self.hidden2_units = options.hidden2_units
+        self.vocab['*PAD*'] = 1
+        self.pos['*PAD*'] = 1
+
+        self.vocab['*INITIAL*'] = 2
+        self.pos['*INITIAL*'] = 2
+
+        self.wlookup = self.model.add_lookup_parameters((len(words) + 3, self.wdims))
+        self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
+        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))
+
+        self.word2lstm = self.model.add_parameters((self.ldims, self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0)))
+        self.word2lstmbias = self.model.add_parameters((self.ldims))
+        self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims))
+        self.lstm2lstmbias = self.model.add_parameters((self.ldims))
+
+        # MLP read through output[0..2] in __evaluate. Its input is the
+        # concatenation of k stack items and 1 buffer item, each contributing
+        # nnvecs vectors of size ldims.
+        self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1)))
+        self.hidBias = self.model.add_parameters((self.hidden_units))
+
+        self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+        self.hid2Bias = self.model.add_parameters((self.hidden2_units))
+
+        self.outLayer = self.model.add_parameters((3, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+        self.outBias = self.model.add_parameters((3))
+
+        # MLP read through routput in __evaluate: row 0, then two rows per
+        # relation (indexed there as 1 + 2*j and 2 + 2*j).
+        self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1)))
+        self.rhidBias = self.model.add_parameters((self.hidden_units))
+
+        self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+        self.rhid2Bias = self.model.add_parameters((self.hidden2_units))
+
+        self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+        self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1))
+
+
+    def __evaluate(self, stack, buf, train):
+        """Score the candidate transitions for the current stack and buffer.
+
+        Returns a list of three candidate lists, one per transition code:
+        code 0 (offered when both stack and buffer are non-empty), code 1
+        (offered when the stack holds more than one item) and code 2 (offered
+        when the buffer is non-empty). Each candidate is a tuple
+        (relation, code, score); when train is true a fourth element holds
+        the score expression so it can be used in the loss. When train is
+        false only the best-scoring relation per labelled code is returned.
+        """
+        # Missing stack/buffer positions are filled with the padding vector
+        # built by Init().
+        topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [self.empty] for i in xrange(self.k) ]
+        topBuffer = [ buf.roots[i].lstms if len(buf) > i else [self.empty] for i in xrange(1) ]
+
+        input = concatenate(list(chain(*(topStack + topBuffer))))
+
+        if self.hidden2_units > 0:
+            routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr())
+        else:
+            routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr())
+
+        if self.hidden2_units > 0:
+            output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr())
+        else:
+            output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr())
+
+        scrs, uscrs = routput.value(), output.value()
+
+        # Index 0 of both score vectors belongs to code 2; indices 1 and 2 of
+        # uscrs are added to the labelled scores of codes 0 and 1.
+        uscrs0 = uscrs[0]
+        uscrs1 = uscrs[1]
+        uscrs2 = uscrs[2]
+        if train:
+            output0 = output[0]
+            output1 = output[1]
+            output2 = output[2]
+            ret = [ [ (rel, 0, scrs[1 + j * 2] + uscrs1, routput[1 + j * 2 ] + output1) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [],
+                    [ (rel, 1, scrs[2 + j * 2] + uscrs2, routput[2 + j * 2 ] + output2) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [],
+                    [ (None, 2, scrs[0] + uscrs0, routput[0] + output0) ] if len(buf) > 0 else [] ]
+        else:
+            # Odd indices of scrs belong to code 0 and even indices (from 2)
+            # to code 1; keep only the highest-scoring relation for each.
+            s1,r1 = max(zip(scrs[1::2],self.irels))
+            s2,r2 = max(zip(scrs[2::2],self.irels))
+            s1 += uscrs1
+            s2 += uscrs2
+            ret = [ [ (r1, 0, s1) ] if len(stack) > 0 and len(buf) > 0 else [],
+                    [ (r2, 1, s2) ] if len(stack) > 1 else [],
+                    [ (None, 2, scrs[0] + uscrs0) ] if len(buf) > 0 else [] ]
+        return ret
+        #return [ [ (rel, 0, scrs[1 + j * 2 + 0] + uscrs[1], routput[1 + j * 2 + 0] + output[1]) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [],
+        # [ (rel, 1, scrs[1 + j * 2 + 1] + uscrs[2], routput[1 + j * 2 + 1] + output[2]) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [],
+        # [ (None, 2, scrs[0] + uscrs[0], routput[0] + output[0]) ] if len(buf) > 0 else [] ]
+
+
+    def Save(self, filename):
+        """Write the DyNet model to filename via self.model.save()."""
+        self.model.save(filename)
+
+
+    def Load(self, filename):
+        """Read the DyNet model from filename via self.model.load()."""
+        self.model.load(filename)
+
+    def Init(self):
+        """Build self.empty, the padding vector used by __evaluate.
+
+        Predict and Train call this again after renew_cg().
+        """
+        # Row 1 of every lookup table is the '*PAD*' entry.
+        evec = self.elookup[1] if self.external_embedding is not None else None
+        paddingWordVec = self.wlookup[1]
+        paddingPosVec = self.plookup[1] if self.pdims > 0 else None
+
+        # filter(None, ...) drops the components that are disabled (None).
+        paddingVec = tanh(self.word2lstm.expr() * concatenate(filter(None, [paddingWordVec, paddingPosVec, evec])) + self.word2lstmbias.expr() )
+        # A token contributes nnvecs vectors, so the padding is repeated to match.
+        self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)])
+
+
+    def getWordEmbeddings(self, sentence, train):
+        """Attach input and context vectors to every token of sentence.
+
+        Sets root.wordvec, root.posvec, root.evec and root.ivec on each token,
+        then root.vec: the (optionally two-level) bidirectional RNN output
+        when blstmFlag is set, otherwise tanh of a linear layer over ivec.
+        """
+        for root in sentence:
+            c = float(self.wordsCount.get(root.norm, 0))
+            # Despite its name, dropFlag is True when the real word index is
+            # KEPT. During training a word is kept with probability
+            # c / (0.25 + c), so words with count 0 always map to index 0.
+            dropFlag = not train or (random.random() < (c/(0.25+c)))
+            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0]
+            root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None
+
+            if self.external_embedding is not None:
+                #if not dropFlag and random.random() < 0.5:
+                # root.evec = self.elookup[0]
+                # Try the raw form first, then the normalised form, then
+                # fall back to row 0.
+                if root.form in self.external_embedding:
+                    root.evec = self.elookup[self.extrnd[root.form]]
+                elif root.norm in self.external_embedding:
+                    root.evec = self.elookup[self.extrnd[root.norm]]
+                else:
+                    root.evec = self.elookup[0]
+            else:
+                root.evec = None
+            root.ivec = concatenate(filter(None, [root.wordvec, root.posvec, root.evec]))
+
+        if self.blstmFlag:
+            forward = self.surfaceBuilders[0].initial_state()
+            backward = self.surfaceBuilders[1].initial_state()
+
+            # One pass feeds both directions: froot walks left-to-right while
+            # rroot walks right-to-left.
+            for froot, rroot in zip(sentence, reversed(sentence)):
+                forward = forward.add_input( froot.ivec )
+                backward = backward.add_input( rroot.ivec )
+                froot.fvec = forward.output()
+                rroot.bvec = backward.output()
+            for root in sentence:
+                root.vec = concatenate( [root.fvec, root.bvec] )
+
+            if self.bibiFlag:
+                # Second bidirectional layer, run over the first layer's outputs.
+                bforward = self.bsurfaceBuilders[0].initial_state()
+                bbackward = self.bsurfaceBuilders[1].initial_state()
+
+                for froot, rroot in zip(sentence, reversed(sentence)):
+                    bforward = bforward.add_input( froot.vec )
+                    bbackward = bbackward.add_input( rroot.vec )
+                    froot.bfvec = bforward.output()
+                    rroot.bbvec = bbackward.output()
+                for root in sentence:
+                    root.vec = concatenate( [root.bfvec, root.bbvec] )
+
+        else:
+            for root in sentence:
+                root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr()
+                root.vec = tanh( root.ivec )
+
+
+    def Predict(self, conll_path):
+        """Parse every sentence of the CoNLL file at conll_path.
+
+        Generator: yields each sentence (root entry first again) after
+        pred_parent_id / pred_relation have been set on its tokens.
+        Sentences are read with proj=False, i.e. without the projectivity
+        filter.
+        """
+        with open(conll_path, 'r') as conllFP:
+            for iSentence, sentence in enumerate(read_conll(conllFP, False)):
+                self.Init()
+
+                # Move the root entry from the front to the end of the buffer.
+                sentence = sentence[1:] + [sentence[0]]
+                self.getWordEmbeddings(sentence, False)
+                stack = ParseForest([])
+                buf = ParseForest(sentence)
+
+                for root in sentence:
+                    root.lstms = [root.vec for _ in xrange(self.nnvecs)]
+
+                # Slot 0 of root.lstms is the head vector when headFlag is
+                # set; the two child slots follow it.
+                hoffset = 1 if self.headFlag else 0
+
+                while len(buf) > 0 or len(stack) > 1 :
+                    scores = self.__evaluate(stack, buf, False)
+                    best = max(chain(*scores), key = itemgetter(2) )
+
+                    if best[1] == 2:
+                        # Code 2: move the buffer front onto the stack.
+                        stack.roots.append(buf.roots[0])
+                        del buf.roots[0]
+
+                    elif best[1] == 0:
+                        # Code 0: stack top becomes a child of the buffer front.
+                        child = stack.roots.pop()
+                        parent = buf.roots[0]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 0
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+
+                    elif best[1] == 1:
+                        # Code 1: stack top becomes a child of the item below it.
+                        child = stack.roots.pop()
+                        parent = stack.roots[-1]
+
+                        child.pred_parent_id = parent.id
+                        child.pred_relation = best[0]
+
+                        bestOp = 1
+                        if self.rlMostFlag:
+                            parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+                        if self.rlFlag:
+                            parent.lstms[bestOp + hoffset] = child.vec
+
+                renew_cg()
+                # Undo the rotation so the root entry is first again.
+                yield [sentence[-1]] + sentence[:-1]
+
+
+ def Train(self, conll_path):
+ mloss = 0.0
+ errors = 0
+ batch = 0
+ eloss = 0.0
+ eerrors = 0
+ lerrors = 0
+ etotal = 0
+ ltotal = 0
+ ninf = -float('inf')
+
+ hoffset = 1 if self.headFlag else 0
+
+ start = time.time()
+
+ with open(conll_path, 'r') as conllFP:
+ shuffledData = list(read_conll(conllFP, True))
+ random.shuffle(shuffledData)
+
+ errs = []
+ eeloss = 0.0
+
+ self.Init()
+
+ for iSentence, sentence in enumerate(shuffledData):
+ if iSentence % 100 == 0 and iSentence != 0:
+ print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
+ start = time.time()
+ eerrors = 0
+ eloss = 0.0
+ etotal = 0
+ lerrors = 0
+ ltotal = 0
+
+ sentence = sentence[1:] + [sentence[0]]
+ self.getWordEmbeddings(sentence, True)
+ stack = ParseForest([])
+ buf = ParseForest(sentence)
+
+ for root in sentence:
+ root.lstms = [root.vec for _ in xrange(self.nnvecs)]
+
+ hoffset = 1 if self.headFlag else 0
+
+ while len(buf) > 0 or len(stack) > 1 :
+ scores = self.__evaluate(stack, buf, True)
+ scores.append([(None, 3, ninf ,None)])
+
+ alpha = stack.roots[:-2] if len(stack) > 2 else []
+ s1 = [stack.roots[-2]] if len(stack) > 1 else []
+ s0 = [stack.roots[-1]] if len(stack) > 0 else []
+ b = [buf.roots[0]] if len(buf) > 0 else []
+ beta = buf.roots[1:] if len(buf) > 1 else []
+
+ left_cost = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) +
+ len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[0]) > 0 else 1
+ right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) +
+ len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[1]) > 0 else 1
+ shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) +
+ len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) ) if len(scores[2]) > 0 else 1
+ costs = (left_cost, right_cost, shift_cost, 1)
+
+ bestValid = max(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or s[0] == stack.roots[-1].relation ) ), key=itemgetter(2))
+ bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or ( s[1] != 2 and s[0] != stack.roots[-1].relation ) ), key=itemgetter(2))
+ best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong
+
+ if best[1] == 2:
+ stack.roots.append(buf.roots[0])
+ del buf.roots[0]
+
+ elif best[1] == 0:
+ child = stack.roots.pop()
+ parent = buf.roots[0]
+
+ child.pred_parent_id = parent.id
+ child.pred_relation = best[0]
+
+ bestOp = 0
+ if self.rlMostFlag:
+ parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+ if self.rlFlag:
+ parent.lstms[bestOp + hoffset] = child.vec
+
+ elif best[1] == 1:
+ child = stack.roots.pop()
+ parent = stack.roots[-1]
+
+ child.pred_parent_id = parent.id
+ child.pred_relation = best[0]
+
+ bestOp = 1
+ if self.rlMostFlag:
+ parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset]
+ if self.rlFlag:
+ parent.lstms[bestOp + hoffset] = child.vec
+
+ if bestValid[2] < bestWrong[2] + 1.0:
+ loss = bestWrong[3] - bestValid[3]
+ mloss += 1.0 + bestWrong[2] - bestValid[2]
+ eloss += 1.0 + bestWrong[2] - bestValid[2]
+ errs.append(loss)
+
+ if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
+ lerrors += 1
+ if child.pred_parent_id != child.parent_id:
+ errors += 1
+ eerrors += 1
+
+ etotal += 1
+
+ if len(errs) > 50: # or True:
+ #eerrs = ((esum(errs)) * (1.0/(float(len(errs)))))
+ eerrs = esum(errs)
+ scalar_loss = eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+ errs = []
+ lerrs = []
+
+ renew_cg()
+ self.Init()
+
+ if len(errs) > 0:
+ eerrs = (esum(errs)) # * (1.0/(float(len(errs))))
+ eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+
+ errs = []
+ lerrs = []
+
+ renew_cg()
+
+ self.trainer.update_epoch()
+ print "Loss: ", mloss/iSentence
diff --git a/bist_parser/barchybrid/src/parser.py b/bist_parser/barchybrid/src/parser.py
new file mode 100644
index 0000000..8ddbe95
--- /dev/null
+++ b/bist_parser/barchybrid/src/parser.py
@@ -0,0 +1,76 @@
+from optparse import OptionParser
+from arc_hybrid import ArcHybridLSTM
+import pickle, utils, os, time, sys
+
+if __name__ == '__main__':
+ parser = OptionParser()
+ parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/PTB_SD_3_3_0/train.conll")
+ parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/PTB_SD_3_3_0/dev.conll")
+ parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/PTB_SD_3_3_0/test.conll")
+ parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
+ parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
+ parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="barchybrid.model")
+ parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
+ parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
+ parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
+ parser.add_option("--epochs", type="int", dest="epochs", default=30)
+ parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
+ parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
+ parser.add_option("--k", type="int", dest="window", default=3)
+ parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
+ parser.add_option("--outdir", type="string", dest="output", default="results")
+ parser.add_option("--activation", type="string", dest="activation", default="tanh")
+ parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
+ parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=200)
+ parser.add_option("--dynet-seed", type="int", dest="seed", default=7)
+ parser.add_option("--disableoracle", action="store_false", dest="oracle", default=True)
+ parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
+ parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
+ parser.add_option("--usehead", action="store_true", dest="headFlag", default=False)
+ parser.add_option("--userlmost", action="store_true", dest="rlFlag", default=False)
+ parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False)
+ parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
+ parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512)
+
+ (options, args) = parser.parse_args()
+ print 'Using external embedding:', options.external_embedding
+
+ if not options.predictFlag:
+ if not (options.rlFlag or options.rlMostFlag or options.headFlag):
+ print 'You must use either --userlmost or --userl or --usehead (you can use multiple)'
+ sys.exit()
+
+ print 'Preparing vocab'
+ words, w2i, pos, rels = utils.vocab(options.conll_train)
+
+ with open(os.path.join(options.output, options.params), 'w') as paramsfp:
+ pickle.dump((words, w2i, pos, rels, options), paramsfp)
+ print 'Finished collecting vocab'
+
+ print 'Initializing blstm arc hybrid:'
+ parser = ArcHybridLSTM(words, pos, rels, w2i, options)
+
+ for epoch in xrange(options.epochs):
+ print 'Starting epoch', epoch
+ parser.Train(options.conll_train)
+ devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
+ utils.write_conll(devpath, parser.Predict(options.conll_dev))
+ os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt &')
+ print 'Finished predicting dev'
+ parser.Save(os.path.join(options.output, options.model + str(epoch+1)))
+ else:
+ with open(options.params, 'r') as paramsfp:
+ words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
+
+ stored_opt.external_embedding = options.external_embedding
+
+ parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt)
+ parser.Load(options.model)
+ tespath = os.path.join(options.output, 'test_pred.conll')
+ ts = time.time()
+ pred = list(parser.Predict(options.conll_test))
+ te = time.time()
+ utils.write_conll(tespath, pred)
+ os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt &')
+ print 'Finished predicting test',te-ts
+
diff --git a/bist_parser/barchybrid/src/utils.py b/bist_parser/barchybrid/src/utils.py
new file mode 100644
index 0000000..7b21851
--- /dev/null
+++ b/bist_parser/barchybrid/src/utils.py
@@ -0,0 +1,114 @@
+from collections import Counter
+import re
+
+
+class ConllEntry:
+ def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
+ self.id = id
+ self.form = form
+ self.norm = normalize(form)
+ self.cpos = cpos.upper()
+ self.pos = pos.upper()
+ self.parent_id = parent_id
+ self.relation = relation
+
+
+class ParseForest:
+ def __init__(self, sentence):
+ self.roots = list(sentence)
+
+ for root in self.roots:
+ root.children = []
+ root.scores = None
+ root.parent = None
+ root.pred_parent_id = 0 # None
+ root.pred_relation = 'rroot' # None
+ root.vecs = None
+ root.lstms = None
+
+ def __len__(self):
+ return len(self.roots)
+
+
+ def Attach(self, parent_index, child_index):
+ parent = self.roots[parent_index]
+ child = self.roots[child_index]
+
+ child.pred_parent_id = parent.id
+ del self.roots[child_index]
+
+
+def isProj(sentence):
+    """Return True when the gold tree of sentence reduces to a single root.
+
+    Adjacent roots are repeatedly attached to each other along gold arcs;
+    the caller (read_conll) treats a False result as "non-projective".
+    Note that building the ParseForest resets the pred_* fields of the entries.
+    """
+    forest = ParseForest(sentence)
+    # Number of gold children of each entry that are still unattached.
+    unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence}
+
+    # One attachment at most per outer pass, so len(sentence) passes suffice.
+    for _ in xrange(len(sentence)):
+        for i in xrange(len(forest.roots) - 1):
+            # Left neighbour is a child of the right one and has collected
+            # all of its own children: attach it.
+            if forest.roots[i].parent_id == forest.roots[i+1].id and unassigned[forest.roots[i].id] == 0:
+                unassigned[forest.roots[i+1].id]-=1
+                forest.Attach(i+1, i)
+                break
+            # Mirror case: right neighbour is a completed child of the left one.
+            if forest.roots[i+1].parent_id == forest.roots[i].id and unassigned[forest.roots[i+1].id] == 0:
+                unassigned[forest.roots[i].id]-=1
+                forest.Attach(i, i+1)
+                break
+
+    return len(forest.roots) == 1
+
+def vocab(conll_path):
+ wordsCount = Counter()
+ posCount = Counter()
+ relCount = Counter()
+
+ with open(conll_path, 'r') as conllFP:
+ for sentence in read_conll(conllFP, True):
+ wordsCount.update([node.norm for node in sentence])
+ posCount.update([node.pos for node in sentence])
+ relCount.update([node.relation for node in sentence])
+
+ return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, posCount.keys(), relCount.keys())
+
+def read_conll(fh, proj):
+ dropped = 0
+ read = 0
+ root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot')
+ tokens = [root]
+ for line in fh:
+ tok = line.strip().split()
+ if not tok:
+ if len(tokens)>1:
+ if not proj or isProj(tokens):
+ yield tokens
+ else:
+ print 'Non-projective sentence dropped'
+ dropped += 1
+ read += 1
+ tokens = [root]
+ id = 0
+ else:
+ tokens.append(ConllEntry(int(tok[0]), tok[1], tok[4], tok[3], int(tok[6]) if tok[6] != '_' else -1, tok[7]))
+ if len(tokens) > 1:
+ yield tokens
+
+ print dropped, 'dropped non-projective sentences.'
+ print read, 'sentences read.'
+
+
+def write_conll(fn, conll_gen):
+ with open(fn, 'w') as fh:
+ for sentence in conll_gen:
+ for entry in sentence[1:]:
+ fh.write('\t'.join([str(entry.id), entry.form, '_', entry.cpos, entry.pos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
+ fh.write('\n')
+ fh.write('\n')
+
+
+numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");
+def normalize(word):
+ return 'NUM' if numberRegex.match(word) else word.lower()
+
+# Lookup table from fine-grained tag strings to coarse tag strings.
+# Not referenced by the other code visible in this module.
+cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB",
+             "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV",
+             ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET",
+             "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS",
+             "-LRB-": ".", "-RRB-": "."}
diff --git a/bist_parser/barchybrid/src/utils/eval.pl b/bist_parser/barchybrid/src/utils/eval.pl
new file mode 100644
index 0000000..3db9837
--- /dev/null
+++ b/bist_parser/barchybrid/src/utils/eval.pl
@@ -0,0 +1,1826 @@
+#!/usr/bin/env perl
+
+# Author: Yuval Krymolowski
+# Addition of precision and recall
+# and of frame confusion list: Sabine Buchholz
+# Addition of DEPREL + ATTACHMENT:
+# Prokopis Prokopidis (prokopis at ilsp dot gr)
+# Acknowledgements:
+# to Markus Kuhn for suggesting the use of
+# the Unicode category property
+
+# NOTE(review): the text between the first here-document and the usage
+# message had been lost (everything between a "<" and the next ">" was
+# stripped). The here-document wording, the module loading lines and the
+# "-g <gold standard>" synopsis below are a reconstruction -- confirm against
+# the upstream CoNLL-X eval.pl.
+
+# The Unicode handling used below needs perl 5.8.1 or later.
+if ($] < 5.008001)
+{
+  print STDERR <<EOM
+
+ This script requires PERL 5.8.1 for running.
+ The new version is needed for proper handling
+ of Unicode characters.
+
+ Please obtain a new version or contact the shared task team
+ if you are unable to upgrade PERL.
+
+EOM
+;
+  exit(1) ;
+}
+
+require Encode;
+
+use strict ;
+use warnings;
+use Getopt::Std ;
+
+# Usage message; printed (via die) for -h or when neither -g nor -s is given.
+my ($usage) = <<EOT
+
+ CoNLL-X evaluation script:
+
+ [perl] eval.pl [OPTIONS] -g <gold standard> -s <system output>
+
+ This script evaluates a system output with respect to a gold standard.
+ Both files should be in UTF-8 encoded CoNLL-X tabular format.
+
+ Punctuation tokens (those where all characters have the Unicode
+ category property "Punctuation") are ignored for scoring (unless the
+ -p flag is used).
+
+ The output breaks down the errors according to their type and context.
+
+ Optional parameters:
+ -o FILE : output: print output to FILE (default is standard output)
+ -q : quiet: only print overall performance, without the details
+ -b : evalb: produce output in a format similar to evalb
+ (http://nlp.cs.nyu.edu/evalb/); use together with -q
+ -p : punctuation: also score on punctuation (default is not to score on it)
+ -v : version: show the version number
+ -h : help: print this help text and exit
+
+EOT
+;
+
+# number of input lines consumed so far (incremented by read_sent)
+my ($line_num) ;
+# separator between the fields of an error key; note that this is the
+# literal four-character string "0x01", not the control character
+my ($sep) = '0x01' ;
+
+# placeholders for positions before the start / after the end of a sentence
+my ($START) = '.S' ;
+my ($END) = '.E' ;
+
+# number of contexts kept per context type
+my ($con_err_num) = 3 ;
+# NOTE(review): the two limits below are not used in the visible part of
+# the script; presumably they size the "most frequent errors" report
+my ($freq_err_num) = 10 ;
+my ($spec_err_loc_con) = 8 ;
+
+################################################################################
+### subfunctions ###
+################################################################################
+
+# True when the UTF-8 decoded form of the argument is non-empty and made up
+# only of characters with the Unicode category property "Punctuation"
+# (see "man perlunicode"); false otherwise.
+sub is_uni_punct
+{
+  my ($word) = @_ ;
+  my $decoded = Encode::decode_utf8($word) ;
+
+  return scalar($decoded =~ /^\p{Punctuation}+$/) ;
+}
+
+# Display length of a UTF-8 encoded string: the number of characters after
+# decoding, not counting non-spacing marks (for example vowel marks in Arabic)
+
+sub uni_len
+{
+  my ($word) = @_ ;
+  my @chars = split(//, Encode::decode_utf8($word)) ;
+
+  # count every character that is not a non-spacing mark
+  return scalar(grep { $_ !~ /^\p{NonspacingMark}/ } @chars) ;
+}
+
+# Keep only the most frequent contexts of a count table.
+#   $vec     : reference to a hash  context => count  (modified in place)
+#   $num     : how many top counts to keep; entries tied with the $num-th
+#              largest count are kept as well
+#   $max_len : reference to a scalar that is raised to the longest
+#              uni_len() among the contexts that are kept
+sub filter_context_counts
+{ # filter_context_counts
+
+  my ($vec, $num, $max_len) = @_ ;
+  my ($con, $l, $thresh) ;
+
+  $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;
+
+  # With fewer than $num entries there is no $num-th count: keep everything.
+  # (Comparing against the undefined value had the same effect, but emitted
+  # an "uninitialized value" warning for every entry.)
+  $thresh = 0 if (! defined $thresh) ;
+
+  foreach $con (keys %{$vec})
+  {
+    if (${$vec}{$con} < $thresh)
+    {
+      delete ${$vec}{$con} ;
+      next ;
+    }
+
+    $l = uni_len($con) ;
+
+    if ($l > ${$max_len})
+    {
+      ${$max_len} = $l ;
+    }
+  }
+
+} # filter_context_counts
+
+# Print a two-panel context table to OUT: CPOS contexts in the left panel,
+# word contexts in the right one, each sorted by decreasing total count.
+#   $counts, $counts_pos : references to hashes with the sub-hashes
+#                          {tot}, {err_head} and {err_dep}, keyed by context
+#   $max_con_len, $max_con_pos_len : widths of the two context columns
+# The "both" column is  err_dep + err_head - tot.
+sub print_context
+{ # print_context
+
+  my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;
+
+  # horizontal rule, used below the header and below the last row
+  my $print_rule = sub {
+    printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len;
+    printf OUT "--++" ;
+    printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+    printf OUT "\n" ;
+  } ;
+
+  printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;
+  printf OUT " ||" ;
+  printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;
+  printf OUT "\n" ;
+  $print_rule->() ;
+
+  my @words = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ;
+  my @tags = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ;
+
+  # as many rows as the longer of the two lists
+  my $rows = (scalar @tags > scalar @words) ? scalar @tags : scalar @words ;
+
+  for (my $row = 0 ; $row < $rows ; $row++)
+  {
+    my $tag = $tags[$row] ;
+    my $word = $words[$row] ;
+
+    if (defined $tag)
+    {
+      printf OUT " %-*s | %4d | %4d | %4d | %4d",
+        $max_con_pos_len, $tag, ${$counts_pos}{tot}{$tag},
+        ${$counts_pos}{err_head}{$tag}, ${$counts_pos}{err_dep}{$tag},
+        ${$counts_pos}{err_dep}{$tag}+${$counts_pos}{err_head}{$tag}-${$counts_pos}{tot}{$tag} ;
+    }
+    else
+    {
+      # this panel has run out of entries: print an empty cell row
+      printf OUT " %-*s | %4s | %4s | %4s | %4s",
+        $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT " ||" ;
+
+    if (defined $word)
+    {
+      # widen the field by the difference between length() and uni_len()
+      # so that the column stays aligned
+      printf OUT " %-*s | %4d | %4d | %4d | %4d",
+        $max_con_len+length($word)-uni_len($word), $word, ${$counts}{tot}{$word},
+        ${$counts}{err_head}{$word}, ${$counts}{err_dep}{$word},
+        ${$counts}{err_dep}{$word}+${$counts}{err_head}{$word}-${$counts}{tot}{$word} ;
+    }
+    else
+    {
+      printf OUT " %-*s | %4s | %4s | %4s | %4s",
+        $max_con_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT "\n" ;
+  }
+
+  $print_rule->() ;
+
+  printf OUT "\n\n" ;
+
+} # print_context
+
+# Describe the absolute value of a distance as a number of words:
+# 1 to 4 are spelled out, anything else is printed as "<n> words".
+sub num_as_word
+{
+  my ($num) = @_ ;
+  my %spelled = (1 => 'one word',
+                 2 => 'two words',
+                 3 => 'three words',
+                 4 => 'four words') ;
+
+  $num = abs($num) ;
+
+  return ($spelled{$num}) if (exists $spelled{$num}) ;
+  return ($num.' words') ;
+}
+
+# Build a human-readable description of one error type.
+#   $head_err     : '-' when the head is correct, otherwise the signed
+#                   offset of the system head from the gold head
+#   $head_aft_bef : two characters, gold position code followed by system
+#                   position code; each is '0' (head is 0), 'e' (the focus
+#                   word itself), 'a' (after the focus word) or 'b' (before)
+#   $dep_err      : '-' when the deprel is correct, otherwise "gold->system"
+# Returns the description string.
+sub describe_err
+{ # describe_err
+
+  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
+  my ($dep_g, $dep_s, $desc) ;
+  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;
+
+  # wording for each position code
+  my %position = ('0' => '0',
+                  'e' => 'the focus word',
+                  'a' => 'after the focus word',
+                  'b' => 'before the focus word') ;
+
+  if ($head_err eq '-')
+  {
+    $desc = 'correct head' ;
+
+    if (exists $position{$head_aft_bef_s})
+    {
+      $desc .= ' ('.$position{$head_aft_bef_s}.')' ;
+    }
+  }
+  elsif ($head_aft_bef_s eq '0')
+  {
+    $desc = 'head = 0 instead of ' ;
+    if ($head_aft_bef_g eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_g eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word' ;
+  }
+  elsif ($head_aft_bef_g eq '0')
+  {
+    $desc = 'head is ' ;
+    # The gold head is 0 in this branch, so it is the system position that
+    # is described. (This used to test the gold code, which is always '0'
+    # here, so "after"/"before" was never printed.)
+    if ($head_aft_bef_s eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_s eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word instead of 0' ;
+  }
+  else
+  {
+    $desc = num_as_word($head_err) ;
+    if ($head_err < 0)
+    {
+      $desc .= ' before' ;
+    }
+    else
+    {
+      $desc .= ' after' ;
+    }
+
+    $desc = 'head '.$desc.' the correct head ' ;
+
+    # where the system put the head ...
+    if (exists $position{$head_aft_bef_s})
+    {
+      $desc .= '('.$position{$head_aft_bef_s} ;
+    }
+
+    # ... and, when it differs, where the gold standard has it. (This
+    # used to repeat the system position and lacked the separating space.)
+    if ($head_aft_bef_g ne $head_aft_bef_s)
+    {
+      $desc .= ' instead of ' ;
+      if (exists $position{$head_aft_bef_g})
+      {
+        $desc .= $position{$head_aft_bef_g} ;
+      }
+    }
+
+    $desc .= ')' ;
+  }
+
+  $desc .= ', ' ;
+
+  if ($dep_err eq '-')
+  {
+    $desc .= 'correct dependency' ;
+  }
+  else
+  {
+    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
+    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
+  }
+
+  return($desc) ;
+
+} # describe_err
+
+# Return the words and POS tags surrounding position $i_w of a sentence.
+#   $sent : reference to an array of token hashes with the keys word and pos
+#   $i_w  : zero-based index of the focus token
+# Returns ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2): the two preceding
+# and the two following words, then the corresponding POS tags. Positions
+# before the first token yield $START, positions after the last one $END.
+sub get_context
+{ # get_context
+
+  my ($sent, $i_w) = @_ ;
+  my $last = scalar(@{$sent}) - 1 ;
+
+  # (word, pos) of token $idx when $present holds, else the padding twice
+  my $lookup = sub {
+    my ($present, $idx, $pad) = @_ ;
+    return ($pad, $pad) if (! $present) ;
+    return (${${$sent}[$idx]}{word}, ${${$sent}[$idx]}{pos}) ;
+  } ;
+
+  my ($w_2, $p_2) = $lookup->($i_w >= 2, $i_w-2, $START) ;
+  my ($w_1, $p_1) = $lookup->($i_w >= 1, $i_w-1, $START) ;
+  my ($w1, $p1) = $lookup->($i_w <= $last-1, $i_w+1, $END) ;
+  my ($w2, $p2) = $lookup->($i_w <= $last-2, $i_w+2, $END) ;
+
+  return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ;
+
+} # get_context
+
+# Read the next sentence from the GOLD and SYS filehandles in parallel.
+#   $sent_gold, $sent_sys : references to arrays; both are emptied and then
+#                           filled with one hash per token holding the keys
+#                           word, pos, head and dep
+# Returns 1 when both files are exhausted and 0 at the end of a sentence.
+# Exits with status 1 when the two files differ in their number of lines or
+# in where sentences end. A word/pos mismatch is only reported on STDERR.
+sub read_sent
+{ # read_sent
+
+  my ($sent_gold, $sent_sys) = @_ ;
+  my ($line_g, $line_s) ;
+  my (%fields_g, %fields_s) ;
+
+  @{$sent_gold} = () ;
+  @{$sent_sys} = () ;
+
+  while (1)
+  { # main reading loop
+
+    # one line from each file (the readline operators had been lost here)
+    $line_g = <GOLD> ;
+    $line_s = <SYS> ;
+
+    $line_num++ ;
+
+    # system output has fewer lines than gold standard
+    if ((defined $line_g) && (! defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : past end of file\n" ;
+      exit(1) ;
+    }
+
+    # system output has more lines than gold standard
+    if ((! defined $line_g) && (defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: past end of file\n" ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of file reached for both
+    if ((! defined $line_g) && (! defined $line_s))
+    {
+      return (1) ;
+    }
+
+    # one contains end of sentence but other one does not
+    if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of sentence reached
+    if ($line_g =~ /^\s+$/)
+    {
+      return(0) ;
+    }
+
+    # now both lines contain information
+
+    # 'official' column names
+    # options.output = ['id','form','lemma','cpostag','postag',
+    # 'feats','head','deprel','phead','pdeprel']
+
+    @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ;
+
+    push @{$sent_gold}, { %fields_g } ;
+
+    @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ;
+
+    if (($fields_g{word} ne $fields_s{word})
+        ||
+        ($fields_g{pos} ne $fields_s{pos}))
+    {
+      printf STDERR "Word/pos mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      #exit(1) ;
+    }
+
+    push @{$sent_sys}, { %fields_s } ;
+
+  } # main reading loop
+
+} # read_sent
+
+################################################################################
+### main ###
+################################################################################
+
+# option variables filled in by getopts() below
+our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ;
+
+# state shared by the reading loop and the report
+my ($sent_num, $eof, $word_num, @err_sent) ;
+my (@sent_gold, @sent_sys, @starts) ;
+my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ;
+my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ;
+my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ;
+my ($loc_con, %loc_con_err_counts, %err_desc) ;
+my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ;
+my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ;
+my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ;
+my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ;
+my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ;
+my (%freq_err, $err) ;
+
+my ($i, $j, $i_w, $l, $n_args) ;
+my ($w_2, $w_1, $w1, $w2) ;
+my ($wp_2, $wp_1, $wp1, $wp2) ;
+my ($p_2, $p_1, $p1, $p2) ;
+
+my ($short_output) ;
+my ($score_on_punct) ;
+$counts{punct} = 0; # initialize
+
+getopts("g:o:s:qvhpb") ;
+
+# -v: print the third field of the RCS id string and stop
+if (defined $opt_v)
+{
+  my ($version) = (split ' ', '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $')[2] ;
+  print "Version $version\n";
+  exit(0);
+}
+
+# -h, or neither input file given: show the usage text
+die $usage if (defined($opt_h) || !(defined($opt_g) || defined($opt_s))) ;
+
+die "Gold standard file (-g) missing\n" unless (defined $opt_g) ;
+die "System output file (-s) missing\n" unless (defined $opt_s) ;
+
+# without -o the report is written to '-'
+$opt_o = '-' unless (defined $opt_o) ;
+
+$short_output = (defined $opt_q) ? 1 : 0 ;
+$score_on_punct = (defined $opt_p) ? 1 : 0 ;
+
+($line_num, $sent_num, $eof) = (0, 0, 0) ;
+
+@err_sent = () ;
+@starts = () ;
+
+$err_sent[0] = {} ;
+
+$max_pos_len = length('CPOS') ;
+
+################################################################################
+### reading input ###
+################################################################################
+
+# Open both inputs and the report output; any failure is fatal.
+open (GOLD, "<$opt_g") or die "Could not open gold standard file $opt_g\n" ;
+open (SYS, "<$opt_s") or die "Could not open system output file $opt_s\n" ;
+open (OUT, ">$opt_o") or die "Could not open output file $opt_o\n" ;
+
+
+# -b: table header for the per-sentence lines in evalb-like format
+if (defined $opt_b)
+{
+  print OUT " Sent. Attachment Correct Scoring \n",
+            " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n",
+            " ============================================================================\n";
+}
+
+
+# Read the two files sentence by sentence and accumulate all statistics in
+# %counts (overall, per word, per CPOS, per deprel, per head direction and
+# distance, per context), %freq_err (error types) and @err_sent (errors per
+# sentence). With -b, one evalb-like line is printed per sentence.
+while (! $eof)
+{ # main reading loop
+
+  $starts[$sent_num] = $line_num+1 ;
+  $eof = read_sent(\@sent_gold, \@sent_sys) ;
+
+  $sent_num++ ;
+
+  %{$err_sent[$sent_num]} = () ;
+  $word_num = scalar @sent_gold ;
+
+  # for accuracy per sentence
+  my %sent_counts = ( tot => 0,
+                      err_any => 0,
+                      err_head => 0
+                    );
+
+  # printf "$sent_num $word_num\n" ;
+
+  my @frames_g = ('** '); # the initial frame for the virtual root
+  my @frames_s = ('** '); # the initial frame for the virtual root
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+    push @frames_g, ''; # initialize
+    push @frames_s, ''; # initialize
+  }
+
+  foreach $i_w (0 .. $word_num-1)
+  { # loop on words
+
+    ($word, $pos, $head_g, $dep_g)
+      = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+    $wp = $word.' / '.$pos ;
+
+    # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ;
+
+    if ((! $score_on_punct) && is_uni_punct($word))
+    {
+      $counts{punct}++ ;
+      # ignore punctuations
+      next ;
+    }
+
+    if (length($pos) > $max_pos_len)
+    {
+      $max_pos_len = length($pos) ;
+    }
+
+    ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+    $counts{tot}++ ;
+    $counts{word}{$wp}{tot}++ ;
+    $counts{pos}{$pos}{tot}++ ;
+    $counts{head}{$head_g-$i_w-1}{tot}++ ;
+
+    # for frame confusions
+    # add child to frame of parent
+    $frames_g[$head_g] .= "$dep_g ";
+    $frames_s[$head_s] .= "$dep_s ";
+    # add to frame of token itself
+    $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero
+    # the system frame carries the system deprel of the token (this used
+    # to insert the gold deprel, so a wrong deprel on the head token
+    # itself never showed up as a frame confusion)
+    $frames_s[$i_w+1] .= "*$dep_s* ";
+
+    # for precision and recall of DEPREL
+    $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels
+    $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions
+    $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels
+    $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ...
+    $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output
+
+    # for precision and recall of HEAD direction
+    my $dir_g;
+    if ($head_g == 0) {
+      $dir_g = 'to_root';
+    } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero
+      # also below
+      $dir_g = 'left';
+    } elsif ($head_g > $i_w+1) {
+      $dir_g = 'right';
+    } else {
+      # token links to itself; should never happen in correct gold standard
+      $dir_g = 'self';
+    }
+    my $dir_s;
+    if ($head_s == 0) {
+      $dir_s = 'to_root';
+    } elsif ($head_s < $i_w+1) {
+      $dir_s = 'left';
+    } elsif ($head_s > $i_w+1) {
+      $dir_s = 'right';
+    } else {
+      # token links to itself; should not happen in good system
+      # (but not forbidden in shared task)
+      $dir_s = 'self';
+    }
+    $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction
+    $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions
+    $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction
+
+    # for precision and recall of HEAD distance
+    my $dist_g;
+    if ($head_g == 0) {
+      $dist_g = 'to_root';
+    } elsif ( abs($head_g - ($i_w+1)) <= 1 ) {
+      $dist_g = '1'; # includes the 'self' cases
+    } elsif ( abs($head_g - ($i_w+1)) <= 2 ) {
+      $dist_g = '2';
+    } elsif ( abs($head_g - ($i_w+1)) <= 6 ) {
+      $dist_g = '3-6';
+    } else {
+      $dist_g = '7-...';
+    }
+    my $dist_s;
+    if ($head_s == 0) {
+      $dist_s = 'to_root';
+    } elsif ( abs($head_s - ($i_w+1)) <= 1 ) {
+      $dist_s = '1'; # includes the 'self' cases
+    } elsif ( abs($head_s - ($i_w+1)) <= 2 ) {
+      $dist_s = '2';
+    } elsif ( abs($head_s - ($i_w+1)) <= 6 ) {
+      $dist_s = '3-6';
+    } else {
+      $dist_s = '7-...';
+    }
+    $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance
+    $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions
+    $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance
+
+
+    $err_head = ($head_g ne $head_s) ; # error in head
+    $err_dep = ($dep_g ne $dep_s) ; # error in deprel
+
+    $head_err = '-' ;
+    $dep_err = '-' ;
+
+    # for accuracy per sentence
+    $sent_counts{tot}++ ;
+    if ($err_dep || $err_head) {
+      $sent_counts{err_any}++ ;
+    }
+    if ($err_head) {
+      $sent_counts{err_head}++ ;
+    }
+
+    # total counts and counts for CPOS involved in errors
+
+    # position code of the gold head relative to the focus word:
+    # '0' head is 0, 'e' the word itself, 'b' before, 'a' after
+    if ($head_g eq '0')
+    {
+      $head_aft_bef_g = '0' ;
+    }
+    elsif ($head_g eq $i_w+1)
+    {
+      $head_aft_bef_g = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    # same for the system head
+    if ($head_s eq '0')
+    {
+      $head_aft_bef_s = '0' ;
+    }
+    elsif ($head_s eq $i_w+1)
+    {
+      $head_aft_bef_s = 'e' ;
+    }
+    else
+    {
+      $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+    }
+
+    $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+    if ($err_head)
+    {
+      if ($head_aft_bef_s eq '0')
+      {
+        $head_err = 0 ;
+      }
+      else
+      {
+        $head_err = $head_s-$head_g ;
+      }
+
+      $err_sent[$sent_num]{head}++ ;
+      $counts{err_head}{tot}++ ;
+      $counts{err_head}{$head_err}++ ;
+
+      $counts{word}{err_head}{$wp}++ ;
+      $counts{pos}{$pos}{err_head}{tot}++ ;
+      $counts{pos}{$pos}{err_head}{$head_err}++ ;
+    }
+
+    if ($err_dep)
+    {
+      $dep_err = $dep_g.'->'.$dep_s ;
+      $err_sent[$sent_num]{dep}++ ;
+      $counts{err_dep}{tot}++ ;
+      $counts{err_dep}{$dep_err}++ ;
+
+      $counts{word}{err_dep}{$wp}++ ;
+      $counts{pos}{$pos}{err_dep}{tot}++ ;
+      $counts{pos}{$pos}{err_dep}{$dep_err}++ ;
+
+      if ($err_head)
+      {
+        $counts{err_both}++ ;
+        $counts{pos}{$pos}{err_both}++ ;
+      }
+    }
+
+    ### DEPREL + ATTACHMENT
+    if ((!$err_dep) && ($err_head)) {
+      $counts{err_head_corr_dep}{tot}++ ;
+      $counts{err_head_corr_dep}{$dep_s}++ ;
+    }
+    ### DEPREL + ATTACHMENT
+
+    # counts for words involved in errors
+
+    if (! ($err_head || $err_dep))
+    {
+      next ;
+    }
+
+    $err_sent[$sent_num]{word}++ ;
+    $counts{err_any}++ ;
+    $counts{word}{err_any}{$wp}++ ;
+    $counts{pos}{$pos}{err_any}++ ;
+
+    ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+    # "word / CPOS" labels of the neighbours; the sentence boundary
+    # placeholders are used as they are
+    if ($w_2 ne $START)
+    {
+      $wp_2 = $w_2.' / '.$p_2 ;
+    }
+    else
+    {
+      $wp_2 = $w_2 ;
+    }
+
+    if ($w_1 ne $START)
+    {
+      $wp_1 = $w_1.' / '.$p_1 ;
+    }
+    else
+    {
+      $wp_1 = $w_1 ;
+    }
+
+    if ($w1 ne $END)
+    {
+      $wp1 = $w1.' / '.$p1 ;
+    }
+    else
+    {
+      $wp1 = $w1 ;
+    }
+
+    if ($w2 ne $END)
+    {
+      $wp2 = $w2.' / '.$p2 ;
+    }
+    else
+    {
+      $wp2 = $w2 ;
+    }
+
+    $con_bef = $wp_1 ;
+    $con_bef_2 = $wp_2.' + '.$wp_1 ;
+    $con_aft = $wp1 ;
+    $con_aft_2 = $wp1.' + '.$wp2 ;
+
+    $con_pos_bef = $p_1 ;
+    $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+    $con_pos_aft = $p1 ;
+    $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+    if ($w_1 ne $START)
+    {
+      # do not count '.S' as a word context
+      $counts{con_bef_2}{tot}{$con_bef_2}++ ;
+      $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ;
+      $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ;
+      $counts{con_bef}{tot}{$con_bef}++ ;
+      $counts{con_bef}{err_head}{$con_bef} += $err_head ;
+      $counts{con_bef}{err_dep}{$con_bef} += $err_dep ;
+    }
+
+    if ($w1 ne $END)
+    {
+      # do not count '.E' as a word context
+      $counts{con_aft_2}{tot}{$con_aft_2}++ ;
+      $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ;
+      $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ;
+      $counts{con_aft}{tot}{$con_aft}++ ;
+      $counts{con_aft}{err_head}{$con_aft} += $err_head ;
+      $counts{con_aft}{err_dep}{$con_aft} += $err_dep ;
+    }
+
+    $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ;
+    $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ;
+    $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ;
+    $counts{con_pos_bef}{tot}{$con_pos_bef}++ ;
+    $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ;
+    $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ;
+
+    $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ;
+    $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ;
+    $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ;
+    $counts{con_pos_aft}{tot}{$con_pos_aft}++ ;
+    $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ;
+    $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ;
+
+    # key identifying the error type: head offset, position codes, deprel error
+    $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+    $freq_err{$err}++ ;
+
+  } # loop on words
+
+  foreach $i_w (0 .. $word_num) # including one for the virtual root
+  { # loop on words
+    if ($frames_g[$i_w] ne $frames_s[$i_w]) {
+      $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ;
+    }
+  }
+
+  if (defined $opt_b) { # produce output similar to evalb
+    if ($word_num > 0) {
+      my ($unlabeled,$labeled) = ('NaN', 'NaN');
+      if ($sent_counts{tot} > 0) { # there are scoring tokens
+        $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot};
+        $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot};
+      }
+      printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n",
+        $sent_num, $word_num,
+        $unlabeled, $labeled,
+        $sent_counts{tot}-$sent_counts{err_head},
+        $sent_counts{tot}-$sent_counts{err_any},
+        $sent_counts{tot},;
+    }
+  }
+
+} # main reading loop
+
+################################################################################
+### printing output ###
+################################################################################
+
+if (defined $opt_b) { # produce output similar to evalb
+  print OUT "\n\n";
+}
+
+# Counters are created on first increment, so an error-free system output
+# leaves the error totals undefined; give them an explicit 0 so that the
+# arithmetic below does not work on undefined values.
+foreach my $key ('tot', 'err_any', 'err_both')
+{
+  $counts{$key} = 0 if (! defined $counts{$key}) ;
+}
+foreach my $key ('err_head', 'err_dep')
+{
+  $counts{$key}{tot} = 0 if (! defined $counts{$key}{tot}) ;
+}
+
+# Every score is a ratio over the number of scoring tokens.
+if ($counts{tot} == 0)
+{
+  die "No scoring tokens found in $opt_g / $opt_s: cannot compute scores\n" ;
+}
+
+printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n",
+  $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ;
+printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n",
+  $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ;
+printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n",
+  $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ;
+
+# -q: the three overall scores are all that is wanted
+if ($short_output)
+{
+  exit(0) ;
+}
+# Report header, followed by the accuracy table: one "total" row and one row
+# per CPOS tag (most frequent tag first) giving the number of scoring words
+# and the count and percentage with the right head, the right deprel, and
+# both right.
+printf OUT "\n %s\n\n", '=' x 80 ;
+printf OUT " Evaluation of the results in %s\n vs. gold standard %s:\n\n", $opt_s, $opt_g ;
+
+printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ;
+
+printf OUT " Number of non-scoring tokens: $counts{punct}\n\n";
+
+printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n",
+ 'Accuracy', 'words', 'right', 'right', 'both' ;
+printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n",
+ ' ', ' ', 'head', ' dep', 'right' ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+# "right" counts are the total minus the respective error count
+printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ 'total', $counts{tot},
+ $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot},
+ $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot},
+ $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+ # error counters of a tag that never occurred in an error are still
+ # undefined: set them to 0 before they are used in the row below
+ if (! defined($counts{pos}{$pos}{err_head}{tot}))
+ {
+ $counts{pos}{$pos}{err_head}{tot} = 0 ;
+ }
+ if (! defined($counts{pos}{$pos}{err_dep}{tot}))
+ {
+ $counts{pos}{$pos}{err_dep}{tot} = 0 ;
+ }
+ if (! defined($counts{pos}{$pos}{err_any}))
+ {
+ $counts{pos}{$pos}{err_any} = 0 ;
+ }
+
+ printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ $pos, $counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ;
+}
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT "\n\n" ;
+
+# Error rate table: one "total" row and one row per CPOS tag (most frequent
+# tag first) giving the number of scoring words and the count and percentage
+# of head errors, deprel errors, and words where both are wrong.
+printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n",
+ 'Error', 'words', 'head', ' dep', 'both' ;
+printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n",
+
+ 'Rate', ' ', 'err', ' err', 'wrong' ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ 'total', $counts{tot},
+ $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot},
+ $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot},
+ $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+ # a tag that never had both errors on one word has no err_both counter yet
+ if (! defined($counts{pos}{$pos}{err_both}))
+ {
+ $counts{pos}{$pos}{err_both} = 0 ;
+ }
+
+ printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ $pos, $counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ;
+
+}
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+### added by Sabine Buchholz
+# Precision and recall per dependency relation, over all deprels seen in
+# either file: recall = correct / gold count, precision = correct / system
+# count. A value stays 'NaN' when the deprel has no gold (resp. system) count.
+printf OUT "\n\n";
+printf OUT " Precision and recall of DEPREL\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}}) {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ # correct = gold and system deprel are both $dep
+ if (defined($counts{dep2}{$dep}{$dep})) {
+ $tot_corr = $counts{dep2}{$dep}{$dep};
+ }
+ if (defined($counts{dep}{$dep}{tot})) {
+ $tot_g = $counts{dep}{$dep}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dep_s}{$dep}{tot})) {
+ $tot_s = $counts{dep_s}{$dep}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+### DEPREL + ATTACHMENT:
+### Same as Sabine's DEPREL apart from $tot_corr calculation
+# Here a token only counts as correct when its head is right as well: the
+# tokens with the right deprel but a wrong head are subtracted.
+printf OUT "\n\n";
+printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}}) {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ if (defined($counts{dep2}{$dep}{$dep})) {
+ if (defined($counts{err_head_corr_dep}{$dep})) {
+ $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep};
+ } else {
+ $tot_corr = $counts{dep2}{$dep}{$dep};
+ }
+ }
+ if (defined($counts{dep}{$dep}{tot})) {
+ $tot_g = $counts{dep}{$dep}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dep_s}{$dep}{tot})) {
+ $tot_s = $counts{dep_s}{$dep}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+### DEPREL + ATTACHMENT
+
+# Precision and recall of the head direction, one row per direction class
+# (to_root, left, right, self); same layout as the DEPREL table.
+printf OUT "\n\n";
+printf OUT " Precision and recall of binned HEAD direction\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dir ('to_root', 'left', 'right', 'self') {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ # correct = gold and system direction are both $dir
+ if (defined($counts{dir2}{$dir}{$dir})) {
+ $tot_corr = $counts{dir2}{$dir}{$dir};
+ }
+ if (defined($counts{dir_g}{$dir}{tot})) {
+ $tot_g = $counts{dir_g}{$dir}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dir_s}{$dir}{tot})) {
+ $tot_s = $counts{dir_s}{$dir}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+# Precision and recall of the head distance, one row per distance bin
+# (to_root, 1, 2, 3-6, 7-...); same layout as the DEPREL table.
+printf OUT "\n\n";
+printf OUT " Precision and recall of binned HEAD distance\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dist ('to_root', '1', '2', '3-6', '7-...') {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ # correct = gold and system distance fall into the same bin $dist
+ if (defined($counts{dist2}{$dist}{$dist})) {
+ $tot_corr = $counts{dist2}{$dist}{$dist};
+ }
+ if (defined($counts{dist_g}{$dist}{tot})) {
+ $tot_g = $counts{dist_g}{$dist}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dist_s}{$dist}{tot})) {
+ $tot_s = $counts{dist_s}{$dist}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+# Frame confusions, most frequent first; each key of $counts{frame2} is a
+# "gold frame/ system frame" pair. Only pairs counted at least 5 times
+# are printed.
+printf OUT "\n\n";
+printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n";
+foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}})
+{
+ if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later)
+ {
+ printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame;
+ }
+}
+### end of: added by Sabine Buchholz
+
+
+#
+# Leave only the 5 words mostly involved in errors
+# (words tied with the fifth highest error count are kept as well)
+#
+
+
+$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ;
+
+# With fewer than 5 error words there is no fifth count: keep them all
+# instead of comparing every count against an undefined threshold.
+$thresh = 0 if (! defined $thresh) ;
+
+# ensure enough space for title
+$max_word_len = length('word') ;
+
+foreach $word (keys %{$counts{word}{err_any}})
+{
+  if ($counts{word}{err_any}{$word} < $thresh)
+  {
+    delete $counts{word}{err_any}{$word} ;
+    next ;
+  }
+
+  $l = uni_len($word) ;
+  if ($l > $max_word_len)
+  {
+    $max_word_len = $l ;
+  }
+}
+
+# filter a case when the difference between the error counts
+# for 2-word and 1-word contexts is small
+# (leave the 2-word context)
+
+# Word contexts were joined with ' + ' in the reading loop, so they have to
+# be split on that same separator. Splitting on a bare '+' left a stray
+# space on each part, so the one-word context was never found. The limit
+# of 2 keeps a '+' that belongs to the second part intact.
+
+foreach $con (keys %{$counts{con_aft_2}{tot}})
+{
+  ($w1) = split(/ \+ /, $con, 2) ;
+
+  if (defined $counts{con_aft}{tot}{$w1} &&
+      $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_aft}{tot}{$w1} ;
+  }
+}
+
+foreach $con (keys %{$counts{con_bef_2}{tot}})
+{
+  ($w_2, $w_1) = split(/ \+ /, $con, 2) ;
+
+  if (defined $counts{con_bef}{tot}{$w_1} &&
+      $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1)
+  {
+    delete $counts{con_bef}{tot}{$w_1} ;
+  }
+}
+
+# CPOS contexts were joined with a bare '+'
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+  ($p1) = split(/\+/, $con_pos) ;
+
+  if (defined($counts{con_pos_aft}{tot}{$p1}) &&
+      $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_aft}{tot}{$p1} ;
+  }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+  ($p_2, $p_1) = split(/\+/, $con_pos) ;
+
+  if (defined($counts{con_pos_bef}{tot}{$p_1}) &&
+      $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1)
+  {
+    delete $counts{con_pos_bef}{tot}{$p_1} ;
+  }
+}
+
+# for each word context type, keep the $con_err_num contexts most involved
+# in errors; $max_con_len ends up as the widest context that was kept
+
+$max_con_len = 0 ;
+
+foreach my $con_type ('con_bef_2', 'con_bef', 'con_aft', 'con_aft_2')
+{
+  filter_context_counts($counts{$con_type}{tot}, $con_err_num, \$max_con_len) ;
+}
+
+# for each CPOS context type, take the $con_err_num CPOS contexts most involved in errors
+
+$max_con_pos_len = 0 ;
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+ if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_bef_2}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_bef}{tot}})
+{
+ if ($counts{con_pos_bef}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_bef}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_aft}{tot}})
+{
+ if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_aft}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+ if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_aft_2}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+# printing
+
+# ------------- focus words
+
+printf OUT "\n\n" ;
+printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ;
+
+printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ;
+printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len;
+
+foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}})
+{
+ if (!defined($counts{word}{err_head}{$word}))
+ {
+ $counts{word}{err_head}{$word} = 0 ;
+ }
+ if (! defined($counts{word}{err_dep}{$word}))
+ {
+ $counts{word}{err_dep}{$word} = 0 ;
+ }
+ if (! defined($counts{word}{err_any}{$word}))
+ {
+ $counts{word}{err_any}{$word} = 0;
+ }
+ printf OUT " %-*s | %4d | %4d | %4d | %4d\n",
+ $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word},
+ $counts{word}{err_head}{$word},
+ $counts{word}{err_dep}{$word},
+ $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ;
+}
+
+printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len;
+
+# ------------- contexts
+
+printf OUT "\n\n" ;
+
+printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " one-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " two-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ;
+
+# ------------- Sentences
+
+printf OUT " Sentence with the highest number of word errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word})
+ <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT " Sentence with the highest number of head errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head})
+ <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT " Sentence with the highest number of dependency errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep})
+ <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+#
+# Second pass, collect statistics of the frequent errors
+#
+
+# filter the errors, leave the most frequent $freq_err_num errors
+
+$i = 0 ;
+
+$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ;
+
+foreach $err (keys %freq_err)
+{
+ if ($freq_err{$err} < $thresh)
+ {
+ delete $freq_err{$err} ;
+ }
+}
+
+# in case there are several errors with the threshold count
+
+$freq_err_num = scalar keys %freq_err ;
+
+%err_counts = () ;
+
+$eof = 0 ;
+
+seek (GOLD, 0, 0) ;
+seek (SYS, 0, 0) ;
+
+while (! $eof)
+{ # second reading loop
+
+ $eof = read_sent(\@sent_gold, \@sent_sys) ;
+ $sent_num++ ;
+
+ $word_num = scalar @sent_gold ;
+
+ # printf "$sent_num $word_num\n" ;
+
+ foreach $i_w (0 .. $word_num-1)
+ { # loop on words
+ ($word, $pos, $head_g, $dep_g)
+ = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+
+ # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ;
+
+ if ((! $score_on_punct) && is_uni_punct($word))
+ {
+ # ignore punctuations
+ next ;
+ }
+
+ ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+ $err_head = ($head_g ne $head_s) ;
+ $err_dep = ($dep_g ne $dep_s) ;
+
+ $head_err = '-' ;
+ $dep_err = '-' ;
+
+ if ($head_g eq '0')
+ {
+ $head_aft_bef_g = '0' ;
+ }
+ elsif ($head_g eq $i_w+1)
+ {
+ $head_aft_bef_g = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ if ($head_s eq '0')
+ {
+ $head_aft_bef_s = '0' ;
+ }
+ elsif ($head_s eq $i_w+1)
+ {
+ $head_aft_bef_s = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+ if ($err_head)
+ {
+ if ($head_aft_bef_s eq '0')
+ {
+ $head_err = 0 ;
+ }
+ else
+ {
+ $head_err = $head_s-$head_g ;
+ }
+ }
+
+ if ($err_dep)
+ {
+ $dep_err = $dep_g.'->'.$dep_s ;
+ }
+
+ if (! ($err_head || $err_dep))
+ {
+ next ;
+ }
+
+ # handle only the most frequent errors
+
+ $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+
+ if (! exists $freq_err{$err})
+ {
+ next ;
+ }
+
+ ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+ $con_bef = $w_1 ;
+ $con_bef_2 = $w_2.' + '.$w_1 ;
+ $con_aft = $w1 ;
+ $con_aft_2 = $w1.' + '.$w2 ;
+
+ $con_pos_bef = $p_1 ;
+ $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+ $con_pos_aft = $p1 ;
+ $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+ @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ;
+
+ # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n",
+ # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ;
+
+ @bits = (0, 0, 0, 0, 0, 0) ;
+ $j = 0 ;
+
+ while ($j == 0)
+ {
+ for ($i = 0; $i <= $#bits; $i++)
+ {
+ if ($bits[$i] == 0)
+ {
+ $bits[$i] = 1 ;
+ $j = 0 ;
+ last ;
+ }
+ else
+ {
+ $bits[$i] = 0 ;
+ $j = 1 ;
+ }
+ }
+
+ @e_bits = @cur_err ;
+
+ for ($i = 0; $i <= $#bits; $i++)
+ {
+ if (! $bits[$i])
+ {
+ $e_bits[$i] = '*' ;
+ }
+ }
+
+ # include also the last case which is the most general
+ # (wildcards for everything)
+ $err_counts{$err}{join($sep, @e_bits)}++ ;
+
+ }
+
+ } # loop on words
+} # second reading loop
+
+printf OUT "\n\n" ;
+printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ;
+printf OUT "\n %s\n", '=' x 41 ;
+
+
+# deleting local contexts which are too general
+
+foreach $err (keys %err_counts)
+{
+ foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+ keys %{$err_counts{$err}})
+ {
+ @cur_err = split(/\Q$sep\E/, $loc_con) ;
+
+ # In this loop, one or two elements of the local context are
+ # replaced with '*' to make it more general. If the entry for
+ # the general context has the same count it is removed.
+
+ foreach $i (0 .. $#cur_err)
+ {
+ $w1 = $cur_err[$i] ;
+ if ($cur_err[$i] eq '*')
+ {
+ next ;
+ }
+ $cur_err[$i] = '*' ;
+ $con1 = join($sep, @cur_err) ;
+ if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+ && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+ {
+ delete $err_counts{$err}{$con1} ;
+ }
+ for ($j = $i+1; $j <=$#cur_err; $j++)
+ {
+ if ($cur_err[$j] eq '*')
+ {
+ next ;
+ }
+ $w2 = $cur_err[$j] ;
+ $cur_err[$j] = '*' ;
+ $con1 = join($sep, @cur_err) ;
+ if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+ && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+ {
+ delete $err_counts{$err}{$con1} ;
+ }
+ $cur_err[$j] = $w2 ;
+ }
+ $cur_err[$i] = $w1 ;
+ }
+ }
+}
+
+# Leaving only the topmost local contexts for each error
+
+foreach $err (keys %err_counts)
+{
+ $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ;
+
+ # if the threshold is too low, take the 2nd highest count
+ # (the highest may be the total which is the generic case
+ # and not relevant for printing)
+
+ if ($thresh < 5)
+ {
+ $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ;
+ }
+
+ foreach $loc_con (keys %{$err_counts{$err}})
+ {
+ if ($err_counts{$err}{$loc_con} < $thresh)
+ {
+ delete $err_counts{$err}{$loc_con} ;
+ }
+ else
+ {
+ if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*')))
+ {
+ $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ;
+ }
+ }
+ }
+}
+
+# printing an error summary
+
+# calculating the context field length
+
+$max_word_spec_len= length('word') ;
+$max_con_aft_len = length('word') ;
+$max_con_bef_len = length('word') ;
+$max_con_pos_len = length('CPOS') ;
+
+foreach $err (keys %err_counts)
+{
+ foreach $loc_con (sort keys %{$err_counts{$err}})
+ {
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $loc_con) ;
+
+ $l = uni_len($word) ;
+ if ($l > $max_word_spec_len)
+ {
+ $max_word_spec_len = $l ;
+ }
+
+ $l = uni_len($con_bef) ;
+ if ($l > $max_con_bef_len)
+ {
+ $max_con_bef_len = $l ;
+ }
+
+ $l = uni_len($con_aft) ;
+ if ($l > $max_con_aft_len)
+ {
+ $max_con_aft_len = $l ;
+ }
+
+ if (length($con_pos_aft) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos_aft) ;
+ }
+
+ if (length($con_pos_bef) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos_bef) ;
+ }
+ }
+}
+
+$err_counter = 0 ;
+
+foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err)
+{
+
+ ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ;
+
+ $err_counter++ ;
+ $err_desc{$err} = sprintf("%2d. ", $err_counter).
+ describe_err($head_err, $head_aft_bef, $dep_err) ;
+
+ # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ;
+ printf OUT "\n" ;
+ printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ printf OUT " %-*s | %-*s | %-*s | %s\n",
+ $max_con_pos_len+$max_con_bef_len+3, ' Before',
+ $max_word_spec_len+$max_pos_len+3, ' Focus',
+ $max_con_pos_len+$max_con_aft_len+3, ' After',
+ 'Count' ;
+
+ printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n",
+ $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+ $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+ $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+ keys %{$err_counts{$err}})
+ {
+ if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*')))
+ {
+ next ;
+ }
+
+ $con1 = $loc_con ;
+ $con1 =~ s/\*/ /g ;
+
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $con1) ;
+
+ printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n",
+ $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+ $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+ $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft,
+ $err_counts{$err}{$loc_con} ;
+ }
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+}
+
+printf OUT "\n\n" ;
+printf OUT " Local contexts involved in several frequent errors:" ;
+printf OUT "\n %s\n", '=' x 51 ;
+printf OUT "\n\n" ;
+
+foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=>
+ scalar keys %{$loc_con_err_counts{$a}}}
+ keys %loc_con_err_counts)
+{
+
+ if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1)
+ {
+ next ;
+ }
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ printf OUT " %-*s | %-*s | %-*s \n",
+ $max_con_pos_len+$max_con_bef_len+3, ' Before',
+ $max_word_spec_len+$max_pos_len+3, ' Focus',
+ $max_con_pos_len+$max_con_aft_len+3, ' After' ;
+
+ printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n",
+ $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+ $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+ $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ $con1 = $loc_con ;
+ $con1 =~ s/\*/ /g ;
+
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $con1) ;
+
+ printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n",
+ $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+ $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+ $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>
+ $loc_con_err_counts{$loc_con}{$a}}
+ keys %{$loc_con_err_counts{$loc_con}})
+ {
+ printf OUT " %s : %d times\n", $err_desc{$err},
+ $loc_con_err_counts{$loc_con}{$err} ;
+ }
+
+ printf OUT "\n" ;
+}
+
+close GOLD ;
+close SYS ;
+
+close OUT ;
diff --git a/bist_parser/bmstparser/src/decoder.py b/bist_parser/bmstparser/src/decoder.py
new file mode 100644
index 0000000..f93b74f
--- /dev/null
+++ b/bist_parser/bmstparser/src/decoder.py
@@ -0,0 +1,105 @@
+# This file contains routines from Lisbon Machine Learning summer school.
+# The code is freely distributed under a MIT license. https://github.com/LxMLS/lxmls-toolkit/
+
+import numpy as np
+import sys
+from collections import defaultdict, namedtuple
+from operator import itemgetter
+
+
+def parse_proj(scores, gold=None):
+    '''
+    Parse using Eisner's algorithm.
+
+    scores is a square (N+1)-by-(N+1) matrix where scores[h, m] is the score
+    of the arc from head h to modifier m; index 0 is the root.
+    gold, when given, is indexed by modifier and holds that modifier's head:
+    every arc that disagrees with it gets a cost of 1.0 added to its score
+    (when gold is None every arc gets that cost).
+    Returns a list of N+1 head indices; entry 0 (the root) is left at -1.
+    Raises ValueError if scores is not square.
+    '''
+    nr, nc = np.shape(scores)
+    if nr != nc:
+        raise ValueError("scores must be a squared matrix with nw+1 rows")
+
+    N = nr - 1 # Number of words (excluding root).
+    # Initialize CKY table.
+    complete = np.zeros([N+1, N+1, 2]) # s, t, direction (right=1).
+    incomplete = np.zeros([N+1, N+1, 2]) # s, t, direction (right=1).
+    complete_backtrack = -np.ones([N+1, N+1, 2], dtype=int) # s, t, direction (right=1).
+    incomplete_backtrack = -np.ones([N+1, N+1, 2], dtype=int) # s, t, direction (right=1).
+    incomplete[0, :, 0] -= np.inf
+
+    # Loop from smaller items to larger items.
+    for k in range(1,N+1):
+        for s in range(N-k+1):
+            t = s+k
+
+            # First, create incomplete items (both directions join the same two complete spans).
+            halves = complete[s, s:t, 1] + complete[(s+1):(t+1), t, 0]
+            # left tree
+            incomplete_vals0 = halves + scores[t, s] + (0.0 if gold is not None and gold[s]==t else 1.0)
+            incomplete[s, t, 0] = np.max(incomplete_vals0)
+            incomplete_backtrack[s, t, 0] = s + np.argmax(incomplete_vals0)
+            # right tree
+            incomplete_vals1 = halves + scores[s, t] + (0.0 if gold is not None and gold[t]==s else 1.0)
+            incomplete[s, t, 1] = np.max(incomplete_vals1)
+            incomplete_backtrack[s, t, 1] = s + np.argmax(incomplete_vals1)
+
+            # Second, create complete items.
+            # left tree
+            complete_vals0 = complete[s, s:t, 0] + incomplete[s:t, t, 0]
+            complete[s, t, 0] = np.max(complete_vals0)
+            complete_backtrack[s, t, 0] = s + np.argmax(complete_vals0)
+            # right tree
+            complete_vals1 = incomplete[s, (s+1):(t+1), 1] + complete[(s+1):(t+1), t, 1]
+            complete[s, t, 1] = np.max(complete_vals1)
+            complete_backtrack[s, t, 1] = s + 1 + np.argmax(complete_vals1)
+
+    heads = [-1 for _ in range(N+1)]
+    backtrack_eisner(incomplete_backtrack, complete_backtrack, 0, N, 1, 1, heads)
+    return heads
+
+
+def backtrack_eisner(incomplete_backtrack, complete_backtrack, s, t, direction, complete, heads):
+    '''
+    Backtracking step of Eisner's algorithm: recover the arcs of span (s, t).
+
+    Works recursively and returns nothing; the result is written into heads.
+
+    - incomplete_backtrack: array indexed as [start][end][direction] holding,
+      for every *incomplete* span, the split point picked during parsing.
+    - complete_backtrack: array indexed the same way, holding the split point
+      picked for every *complete* span.
+    - s: start position of the current span.
+    - t: end position of the current span.
+    - direction: 0 for a left attachment, anything else (normally 1) for a
+      right attachment.
+    - complete: truthy when the current span is complete, falsy when it is
+      incomplete.
+    - heads: mutable sequence indexed by word position; heads[m] is set to
+      the head of word m for every arc found in the span.
+    '''
+    if s == t:
+        return
+
+    if complete:
+        r = complete_backtrack[s][t][direction]
+        if direction == 0:
+            subspans = ((s, r, 0, 1), (r, t, 0, 0))
+        else:
+            subspans = ((s, r, 1, 0), (r, t, 1, 1))
+    else:
+        r = incomplete_backtrack[s][t][direction]
+        # An incomplete span contributes exactly one arc between its two ends.
+        if direction == 0:
+            heads[s] = t
+        else:
+            heads[t] = s
+        subspans = ((s, r, 1, 1), (r + 1, t, 0, 1))
+
+    # Descend into the two sub-spans, each given as (start, end, direction, complete).
+    for lo, hi, side, is_complete in subspans:
+        backtrack_eisner(incomplete_backtrack, complete_backtrack, lo, hi, side, is_complete, heads)
+
diff --git a/bist_parser/bmstparser/src/mstlstm.py b/bist_parser/bmstparser/src/mstlstm.py
new file mode 100644
index 0000000..e403d59
--- /dev/null
+++ b/bist_parser/bmstparser/src/mstlstm.py
@@ -0,0 +1,496 @@
+from dynet import *
+from bist_parser.bmstparser.src.utils import read_conll, write_conll
+from bist_parser.bmstparser.src import utils, decoder
+from operator import itemgetter
+import time, random
+import numpy as np
+
+
+class MSTParserLSTM:
+ def __init__(self, vocab, pos, rels, w2i, options):
+     """Build the dynet model, its lookup tables and its parameters.
+
+     vocab is stored as self.wordsCount (presumably word -> count); pos is a
+     collection of POS tags; rels is the sequence of relation labels; w2i maps
+     words to integer indices; options supplies the hyper-parameters (dims,
+     flags, activation name, optional external embedding path).
+     Raises ValueError if the external embedding file holds no vectors.
+     """
+     self.model = Model()
+     random.seed(1)
+     self.trainer = AdamTrainer(self.model)
+     self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
+     self.activation = self.activations[options.activation]
+
+     self.blstmFlag = options.blstmFlag
+     self.labelsFlag = options.labelsFlag
+     self.costaugFlag = options.costaugFlag
+     self.bibiFlag = options.bibiFlag
+
+     self.ldims = options.lstm_dims
+     self.wdims = options.wembedding_dims
+     self.pdims = options.pembedding_dims
+     self.rdims = options.rembedding_dims
+     self.layers = options.lstm_layers
+     self.wordsCount = vocab
+     self.vocab = {word: ind+3 for word, ind in w2i.items()}
+     self.pos = {word: ind+3 for ind, word in enumerate(pos)}
+     self.rels = {word: ind for ind, word in enumerate(rels)}
+     self.irels = rels
+
+     self.external_embedding, self.edim = None, 0
+     if options.external_embedding is not None:
+         with open(options.external_embedding,'r') as external_embedding_fp:
+             external_embedding_fp.readline()  # the first line is read and discarded
+             self.external_embedding = {line.split(' ')[0] : [float(f) for f in line.strip().split(' ')[1:]] for line in external_embedding_fp}
+         if not self.external_embedding:
+             raise ValueError('no vectors found in external embedding file %s' % options.external_embedding)
+         self.edim = len(next(iter(self.external_embedding.values())))  # dict views cannot be indexed on Python 3
+         self.noextrn = [0.0 for _ in range(self.edim)]
+         self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
+         self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
+         for word, i in self.extrnd.items():
+             self.elookup.init_row(i, self.external_embedding[word])
+         self.extrnd['*PAD*'] = 1
+         self.extrnd['*INITIAL*'] = 2
+         print('Load external embedding. Vector dimensions', self.edim)
+
+     if self.bibiFlag:
+         self.builders = [VanillaLSTMBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                          VanillaLSTMBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+         self.bbuilders = [VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
+                           VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
+     elif self.layers > 0:
+         self.builders = [VanillaLSTMBuilder(self.layers, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                          VanillaLSTMBuilder(self.layers, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+     else:
+         self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model),
+                          SimpleRNNBuilder(1, self.wdims + self.pdims + self.edim, self.ldims, self.model)]
+
+     self.hidden_units = options.hidden_units
+     self.hidden2_units = options.hidden2_units
+     self.vocab['*PAD*'] = 1
+     self.pos['*PAD*'] = 1
+     self.vocab['*INITIAL*'] = 2
+     self.pos['*INITIAL*'] = 2
+
+     self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
+     self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
+     self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))
+
+     self.hidLayerFOH = self.model.add_parameters((self.hidden_units, self.ldims * 2))
+     self.hidLayerFOM = self.model.add_parameters((self.hidden_units, self.ldims * 2))
+     self.hidBias = self.model.add_parameters((self.hidden_units))
+     self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+     self.hid2Bias = self.model.add_parameters((self.hidden2_units))
+     self.outLayer = self.model.add_parameters((1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+
+     if self.labelsFlag:
+         self.rhidLayerFOH = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
+         self.rhidLayerFOM = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
+         self.rhidBias = self.model.add_parameters((self.hidden_units))
+         self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
+         self.rhid2Bias = self.model.add_parameters((self.hidden2_units))
+         self.routLayer = self.model.add_parameters((len(self.irels), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
+         self.routBias = self.model.add_parameters((len(self.irels)))
+
+
+ def __getExpr(self, sentence, i, j, train):
+     """Return the score expression of the arc from head i to modifier j (train is not used)."""
+     head, mod = sentence[i], sentence[j]
+     # The per-entry projections are computed once and cached on the entries.
+     if head.headfov is None:
+         head.headfov = self.hidLayerFOH.expr() * concatenate([head.lstms[0], head.lstms[1]])
+     if mod.modfov is None:
+         mod.modfov = self.hidLayerFOM.expr() * concatenate([mod.lstms[0], mod.lstms[1]])
+
+     hidden = self.activation(head.headfov + mod.modfov + self.hidBias.expr())
+     if self.hidden2_units > 0:
+         hidden = self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * hidden)
+     return self.outLayer.expr() * hidden
+
+
+ def __evaluate(self, sentence, train):
+     """Score every (head i, modifier j) pair; return (array of scalar scores, rows of expressions)."""
+     n = len(sentence)
+     exprs = [[self.__getExpr(sentence, h, m, train) for m in range(n)] for h in range(n)]
+     return np.array([[cell.scalar_value() for cell in row] for row in exprs]), exprs
+
+
+ def __evaluateLabel(self, sentence, i, j):
+     """Return (values, expression) of the relation-label scores for the arc from head i to modifier j."""
+     head, mod = sentence[i], sentence[j]
+     if head.rheadfov is None:
+         head.rheadfov = self.rhidLayerFOH.expr() * concatenate([head.lstms[0], head.lstms[1]])
+     if mod.rmodfov is None:
+         mod.rmodfov = self.rhidLayerFOM.expr() * concatenate([mod.lstms[0], mod.lstms[1]])
+     hidden = self.activation(head.rheadfov + mod.rmodfov + self.rhidBias.expr())
+     if self.hidden2_units > 0:
+         hidden = self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * hidden)
+     output = self.routLayer.expr() * hidden + self.routBias.expr()
+     return output.value(), output
+
+
+ def Save(self, filename):  # hand filename to self.model.save(); nothing is returned
+ self.model.save(filename)
+
+
+ def Load(self, filename):  # hand filename to self.model.load(); nothing is returned
+ self.model.load(filename)
+
+
+ def Predict(self, conll_path):  # generator: parses every sentence read from conll_path and yields it with pred_parent_id / pred_relation set
+ with open(conll_path, 'r') as conllFP:
+ for iSentence, sentence in enumerate(read_conll(conllFP)):
+ for entry in sentence:
+ wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
+ posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+ evec = self.elookup[int(self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)))] if self.external_embedding is not None else None  # NOTE(review): elookup rows were filled using self.extrnd ids in __init__, but the index here comes from self.vocab -- confirm
+ entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+ # both feature slots start as the raw input vector; the recurrent layers below overwrite them
+ entry.lstms = [entry.vec, entry.vec]
+ entry.headfov = None
+ entry.modfov = None
+
+ entry.rheadfov = None
+ entry.rmodfov = None
+ # first recurrent layer: lstms[1] receives the forward output, lstms[0] the backward output
+ if self.blstmFlag:
+ lstm_forward = self.builders[0].initial_state()
+ lstm_backward = self.builders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ lstm_forward = lstm_forward.add_input(entry.vec)
+ lstm_backward = lstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = lstm_forward.output()
+ rentry.lstms[0] = lstm_backward.output()
+ # when bibiFlag is set, entry.vec is replaced by concatenate(entry.lstms) and run through self.bbuilders in both directions
+ if self.bibiFlag:
+ for entry in sentence:
+ entry.vec = concatenate(entry.lstms)
+
+ blstm_forward = self.bbuilders[0].initial_state()
+ blstm_backward = self.bbuilders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ blstm_forward = blstm_forward.add_input(entry.vec)
+ blstm_backward = blstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = blstm_forward.output()
+ rentry.lstms[0] = blstm_backward.output()
+ # score all head/modifier pairs, then let decoder.parse_proj pick one head per entry
+ scores, exprs = self.__evaluate(sentence, True)
+ heads = decoder.parse_proj(scores)
+
+ for entry, head in zip(sentence, heads):
+ entry.pred_parent_id = head
+ entry.pred_relation = '_'
+
+ dump = False  # never set to True in this method, so every sentence is yielded
+ # label every predicted arc (first entry skipped) with its highest-scoring relation
+ if self.labelsFlag:
+ for modifier, head in enumerate(heads[1:]):
+ scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
+ sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
+
+ renew_cg()
+ if not dump:
+ yield sentence
+
+ def PredictOnEntries(self, conll_entries):  # generator: same steps as Predict, but iterates over already-loaded sentences instead of reading a file
+ for iSentence, sentence in enumerate(conll_entries):
+ for entry in sentence:
+ wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
+ posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+ evec = self.elookup[int(self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)))] if self.external_embedding is not None else None  # NOTE(review): elookup rows were filled using self.extrnd ids in __init__, but the index here comes from self.vocab -- confirm
+ entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+ # both feature slots start as the raw input vector; the recurrent layers below overwrite them
+ entry.lstms = [entry.vec, entry.vec]
+ entry.headfov = None
+ entry.modfov = None
+
+ entry.rheadfov = None
+ entry.rmodfov = None
+ # first recurrent layer: lstms[1] receives the forward output, lstms[0] the backward output
+ if self.blstmFlag:
+ lstm_forward = self.builders[0].initial_state()
+ lstm_backward = self.builders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ lstm_forward = lstm_forward.add_input(entry.vec)
+ lstm_backward = lstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = lstm_forward.output()
+ rentry.lstms[0] = lstm_backward.output()
+ # when bibiFlag is set, entry.vec is replaced by concatenate(entry.lstms) and run through self.bbuilders in both directions
+ if self.bibiFlag:
+ for entry in sentence:
+ entry.vec = concatenate(entry.lstms)
+
+ blstm_forward = self.bbuilders[0].initial_state()
+ blstm_backward = self.bbuilders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ blstm_forward = blstm_forward.add_input(entry.vec)
+ blstm_backward = blstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = blstm_forward.output()
+ rentry.lstms[0] = blstm_backward.output()
+ # score all head/modifier pairs, then let decoder.parse_proj pick one head per entry
+ scores, exprs = self.__evaluate(sentence, True)
+ heads = decoder.parse_proj(scores)
+
+ for entry, head in zip(sentence, heads):
+ entry.pred_parent_id = head
+ entry.pred_relation = '_'
+
+ dump = False  # never set to True in this method, so every sentence is yielded
+ # label every predicted arc (first entry skipped) with its highest-scoring relation
+ if self.labelsFlag:
+ for modifier, head in enumerate(heads[1:]):
+ scores, exprs = self.__evaluateLabel(sentence, head, modifier+1)
+ sentence[modifier+1].pred_relation = self.irels[max(enumerate(scores), key=itemgetter(1))[0]]
+
+ renew_cg()
+ if not dump:
+ yield sentence
+
+ def Train(self, conll_path):  # one pass over the shuffled sentences of the CoNLL file at conll_path, updating the model
+ errors = 0  # not read anywhere in this method
+ batch = 0  # not read anywhere in this method
+ eloss = 0.0
+ mloss = 0.0
+ eerrors = 0
+ etotal = 0
+ start = time.time()
+ # the whole file is read into memory and shuffled before training
+ with open(conll_path, 'r') as conllFP:
+ shuffledData = list(read_conll(conllFP))
+ random.shuffle(shuffledData)
+
+ errs = []
+ lerrs = []
+ eeloss = 0.0
+
+ for iSentence, sentence in enumerate(shuffledData):
+ if iSentence % 100 == 0 and iSentence != 0:
+ print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start)
+ start = time.time()
+ eerrors = 0
+ eloss = 0.0
+ etotal = 0
+ lerrors = 0
+ ltotal = 0
+ # word dropout: despite its name, dropFlag is True when the word is KEPT (probability c/(0.25+c)); otherwise row 0 is looked up
+ for entry in sentence:
+ c = float(self.wordsCount.get(entry.norm, 0))
+ dropFlag = (random.random() < (c/(0.25+c)))
+ wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
+ posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+ evec = None
+ # NOTE(review): elookup rows were filled using self.extrnd ids in __init__, but the index below comes from self.vocab -- confirm
+ if self.external_embedding is not None:
+ evec = self.elookup[self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
+ entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+ entry.lstms = [entry.vec, entry.vec]
+ entry.headfov = None
+ entry.modfov = None
+
+ entry.rheadfov = None
+ entry.rmodfov = None
+
+ if self.blstmFlag:
+ lstm_forward = self.builders[0].initial_state()
+ lstm_backward = self.builders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ lstm_forward = lstm_forward.add_input(entry.vec)
+ lstm_backward = lstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = lstm_forward.output()
+ rentry.lstms[0] = lstm_backward.output()
+
+ if self.bibiFlag:
+ for entry in sentence:
+ entry.vec = concatenate(entry.lstms)
+
+ blstm_forward = self.bbuilders[0].initial_state()
+ blstm_backward = self.bbuilders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ blstm_forward = blstm_forward.add_input(entry.vec)
+ blstm_backward = blstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = blstm_forward.output()
+ rentry.lstms[0] = blstm_backward.output()
+ # decode with the current scores; the gold heads are handed to the decoder only when costaugFlag is set
+ scores, exprs = self.__evaluate(sentence, True)
+ gold = [entry.parent_id for entry in sentence]
+ heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
+ # label loss: for each gold arc, penalise the best wrong label unless the gold label beats it by a margin of 1
+ if self.labelsFlag:
+ for modifier, head in enumerate(gold[1:]):
+ rscores, rexprs = self.__evaluateLabel(sentence, head, modifier+1)
+ goldLabelInd = self.rels[sentence[modifier+1].relation]
+ wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
+ if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
+ lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
+ # head loss: e counts the entries (first entry excluded) whose predicted head differs from the gold head
+ e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
+ eerrors += e
+ if e > 0:
+ loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
+ eloss += (e)
+ mloss += (e)
+ errs.extend(loss)
+
+ etotal += len(sentence)
+ # iSentence % 1 is always 0, so the condition below is always true
+ if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
+ eeloss = 0.0
+
+ if len(errs) > 0 or len(lerrs) > 0:
+ eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+ eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+ errs = []
+ lerrs = []
+
+ renew_cg()
+
+ if len(errs) > 0:
+ eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+ eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+
+ errs = []
+ lerrs = []
+ eeloss = 0.0
+
+ renew_cg()
+ # NOTE(review): iSentence is the last 0-based index, not the sentence count; the division fails with ZeroDivisionError for one sentence and NameError for none
+ self.trainer.update_epoch()
+ print("Loss: ", mloss/iSentence)
+
+ def TrainOnEntries(self, shuffledData):
+ # Runs one training pass over an in-memory list of sentences.
+ # shuffledData is shuffled in place; for each sentence the token vectors are
+ # built, the optional LSTM layers are applied, heads are decoded with
+ # decoder.parse_proj, and the collected loss expressions are summed,
+ # back-propagated and applied through self.trainer.
+ # NOTE(review): errors, batch, lerrors, ltotal and eeloss are assigned in this
+ # method but never read.
+ errors = 0
+ batch = 0
+ eloss = 0.0
+ mloss = 0.0
+ eerrors = 0
+ etotal = 0
+ start = time.time()
+
+ # Reorders the caller's list, not a copy.
+ random.shuffle(shuffledData)
+
+ # errs collects head-attachment loss expressions, lerrs label loss expressions.
+ errs = []
+ lerrs = []
+ eeloss = 0.0
+
+ for iSentence, sentence in enumerate(shuffledData):
+ # Progress line every 100 sentences; the running counters restart afterwards.
+ if iSentence % 100 == 0 and iSentence != 0:
+ print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start)
+ start = time.time()
+ eerrors = 0
+ eloss = 0.0
+ etotal = 0
+ lerrors = 0
+ ltotal = 0
+
+ for entry in sentence:
+ # Word dropout: the vocabulary index is used with probability c/(0.25+c),
+ # where c is the count stored in self.wordsCount; otherwise row 0 is used.
+ c = float(self.wordsCount.get(entry.norm, 0))
+ dropFlag = (random.random() < (c/(0.25+c)))
+ wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
+ posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
+ evec = None
+
+ # External embedding row: looked up by form, then norm, when dropFlag is set
+ # or a second draw is below 0.5; otherwise row 0.
+ if self.external_embedding is not None:
+ evec = self.elookup[self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0]
+ # Only the vectors that were actually built are concatenated.
+ entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec])))
+
+ # Both slots start as the plain input vector; the loops below overwrite them.
+ entry.lstms = [entry.vec, entry.vec]
+ entry.headfov = None
+ entry.modfov = None
+
+ entry.rheadfov = None
+ entry.rmodfov = None
+
+ if self.blstmFlag:
+ lstm_forward = self.builders[0].initial_state()
+ lstm_backward = self.builders[1].initial_state()
+
+ # Walk the sentence from both ends at once: slot 1 receives the forward
+ # output, slot 0 the backward output.
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ lstm_forward = lstm_forward.add_input(entry.vec)
+ lstm_backward = lstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = lstm_forward.output()
+ rentry.lstms[0] = lstm_backward.output()
+
+ # Second recurrent layer: its input is the concatenation of the two outputs
+ # stored in entry.lstms.
+ if self.bibiFlag:
+ for entry in sentence:
+ entry.vec = concatenate(entry.lstms)
+
+ blstm_forward = self.bbuilders[0].initial_state()
+ blstm_backward = self.bbuilders[1].initial_state()
+
+ for entry, rentry in zip(sentence, reversed(sentence)):
+ blstm_forward = blstm_forward.add_input(entry.vec)
+ blstm_backward = blstm_backward.add_input(rentry.vec)
+
+ entry.lstms[1] = blstm_forward.output()
+ rentry.lstms[0] = blstm_backward.output()
+
+ scores, exprs = self.__evaluate(sentence, True)
+ gold = [entry.parent_id for entry in sentence]
+ # The gold heads are handed to the decoder only when costaugFlag is set.
+ heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
+
+ if self.labelsFlag:
+ # Label loss: for every non-root token, compare the gold label with the
+ # best-scoring other label and record a violation when the gold score does
+ # not exceed it by at least 1.
+ for modifier, head in enumerate(gold[1:]):
+ rscores, rexprs = self.__evaluateLabel(sentence, head, modifier+1)
+ goldLabelInd = self.rels[sentence[modifier+1].relation]
+ wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
+ if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
+ lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
+
+ # Number of tokens (first slot excluded) whose predicted head differs from gold.
+ e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
+ eerrors += e
+ if e > 0:
+ # One expression per wrongly attached token: predicted-arc score minus
+ # gold-arc score.
+ loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g] # * (1.0/float(e))
+ eloss += (e)
+ mloss += (e)
+ errs.extend(loss)
+
+ etotal += len(sentence)
+
+ # iSentence % 1 is always 0, so this condition holds for every sentence.
+ if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
+ eeloss = 0.0
+
+ if len(errs) > 0 or len(lerrs) > 0:
+ eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+ # presumably forces the forward computation before backward() — confirm
+ # against the DyNet documentation.
+ eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+ errs = []
+ lerrs = []
+
+ renew_cg()
+
+ # NOTE(review): errs and lerrs are emptied after every update above, so this
+ # final flush looks unreachable — confirm against the original indentation.
+ if len(errs) > 0:
+ eerrs = (esum(errs + lerrs)) #* (1.0/(float(len(errs))))
+ eerrs.scalar_value()
+ eerrs.backward()
+ self.trainer.update()
+
+ errs = []
+ lerrs = []
+ eeloss = 0.0
+
+ renew_cg()
+
+ self.trainer.update_epoch()
+ # NOTE(review): iSentence is the last loop index (count - 1), not the number
+ # of sentences: it is 0 for a single sentence (ZeroDivisionError) and unbound
+ # for an empty list — confirm the intended divisor.
+ print("Loss: ", mloss/iSentence)
+
diff --git a/bist_parser/bmstparser/src/parser.py b/bist_parser/bmstparser/src/parser.py
new file mode 100644
index 0000000..19b2980
--- /dev/null
+++ b/bist_parser/bmstparser/src/parser.py
@@ -0,0 +1,75 @@
+from optparse import OptionParser
+from bist_parser.bmstparser.src import utils, mstlstm
+import pickle
+import os.path
+import time
+
+
+if __name__ == '__main__':
+ parser = OptionParser()
+ parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/en-universal-train.conll.ptb")
+ parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/en-universal-dev.conll.ptb")
+ parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/en-universal-test.conll.ptb")
+ parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
+ parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
+ parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="neuralfirstorder.model")
+ parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
+ parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
+ parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
+ parser.add_option("--epochs", type="int", dest="epochs", default=30)
+ parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
+ parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
+ parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
+ parser.add_option("--outdir", type="string", dest="output", default="results")
+ parser.add_option("--activation", type="string", dest="activation", default="tanh")
+ parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
+ parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=125)
+ parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
+ parser.add_option("--disablelabels", action="store_false", dest="labelsFlag", default=True)
+ parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
+ parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
+ parser.add_option("--disablecostaug", action="store_false", dest="costaugFlag", default=True)
+ parser.add_option("--dynet-seed", type="int", dest="seed", default=0)
+ parser.add_option("--dynet-mem", type="int", dest="mem", default=0)
+
+ (options, args) = parser.parse_args()
+
+ print('Using external embedding:', options.external_embedding)
+
+ if options.predictFlag:
+ with open(options.params, 'rb') as paramsfp:
+ words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)
+
+ stored_opt.external_embedding = options.external_embedding
+
+ print('Initializing lstm mstparser:')
+ parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, stored_opt)
+
+ parser.Load(options.model)
+ tespath = os.path.join(options.output, 'test_pred.conll')
+
+ ts = time.time()
+ test_res = list(parser.Predict(options.conll_test))
+ te = time.time()
+ print('Finished predicting test.', te-ts, 'seconds.')
+ utils.write_conll(tespath, test_res)
+
+ os.system('perl src/util_scripts/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt')
+ else:
+ print('Preparing vocab')
+ words, w2i, pos, rels = utils.vocab(options.conll_train)
+
+ with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
+ pickle.dump((words, w2i, pos, rels, options), paramsfp)
+ print('Finished collecting vocab')
+
+ print('Initializing lstm mstparser:')
+ parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)
+
+ for epoch in range(options.epochs):
+ print('Starting epoch', epoch)
+ parser.Train(options.conll_train)
+ devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
+ utils.write_conll(devpath, parser.Predict(options.conll_dev))
+ parser.Save(os.path.join(options.output, os.path.basename(options.model) + str(epoch+1)))
+ os.system('perl src/util_scripts/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt')
diff --git a/bist_parser/bmstparser/src/util_scripts/eval.pl b/bist_parser/bmstparser/src/util_scripts/eval.pl
new file mode 100644
index 0000000..3db9837
--- /dev/null
+++ b/bist_parser/bmstparser/src/util_scripts/eval.pl
@@ -0,0 +1,1826 @@
+#!/usr/bin/env perl
+
+# Author: Yuval Krymolowski
+# Addition of precision and recall
+# and of frame confusion list: Sabine Buchholz
+# Addition of DEPREL + ATTACHMENT:
+# Prokopis Prokopidis (prokopis at ilsp dot gr)
+# Acknowledgements:
+# to Markus Kuhn for suggesting the use of
+# the Unicode category property
+
+# Refuse to run on interpreters older than 5.8.1.
+if ($] < 5.008001)
+{
+  printf STDERR <<EOT
+
+This script requires PERL 5.8.1 for running.
+The new version is needed for proper handling
+of Unicode characters.
+
+Please obtain a new version or contact the shared task team
+if you are unable to upgrade PERL.
+
+EOT
+;
+  exit(1) ;
+}
+
+# Encode supplies decode_utf8 (used by is_uni_punct and uni_len);
+# Getopt::Std supplies getopts and the $opt_* variables read by the main part.
+require Encode;
+
+use strict ;
+use warnings ;
+use Getopt::Std ;
+
+# Help text printed (via die) for -h or when neither -g nor -s is given.
+my ($usage) = <<EOT
+
+  CoNLL-X evaluation script:
+
+   [perl] eval.pl [OPTIONS] -g <gold standard> -s <system output>
+
+  This script evaluates a system output with respect to a gold standard.
+  Both files should be in UTF-8 encoded CoNLL-X tabular format.
+
+  Punctuation tokens (those where all characters have the Unicode
+  category property "Punctuation") are ignored for scoring (unless the
+  -p flag is used).
+
+  The output breaks down the errors according to their type and context.
+
+  Optional parameters:
+     -o FILE : output: print output to FILE (default is standard output)
+     -q : quiet:       only print overall performance, without the details
+     -b : evalb:       produce output in a format similar to evalb
+                       (http://nlp.cs.nyu.edu/evalb/); use together with -q
+     -p : punctuation: also score on punctuation (default is not to score on it)
+     -v : version:     show the version number
+     -h : help:        print this help text and exit
+
+EOT
+;
+
+# Number of input lines consumed so far; incremented in read_sent and used in
+# its mismatch messages.
+my ($line_num) ;
+# Separator used when the parts of an error description are joined into one key.
+# NOTE(review): this is the four-character string "0x01", not the control
+# character \x01 — confirm that is intended.
+my ($sep) = '0x01' ;
+
+# Placeholders returned by get_context for positions before the first and after
+# the last token of a sentence.
+my ($START) = '.S' ;
+my ($END) = '.E' ;
+
+# Reporting limits; presumably how many contexts / frequent errors the detailed
+# report lists — their uses lie outside this section, confirm there.
+my ($con_err_num) = 3 ;
+my ($freq_err_num) = 10 ;
+my ($spec_err_loc_con) = 8 ;
+
+################################################################################
+### subfunctions ###
+################################################################################
+
+# True when every character of the UTF-8 encoded argument carries the Unicode
+# general category "Punctuation" (see "man perlunicode"); false otherwise,
+# including for the empty string.
+sub is_uni_punct
+{
+  my ($token) = @_ ;
+  my $decoded = Encode::decode_utf8($token) ;
+  my $only_punct = ($decoded =~ /^\p{Punctuation}+$/) ;
+
+  return scalar($only_punct) ;
+}
+
+# Display length of a UTF-8 encoded string: the number of characters left after
+# dropping non-spacing marks (for example vowel marks in Arabic).
+sub uni_len
+{
+  my ($text) = @_ ;
+  my @chars = split(//, Encode::decode_utf8($text)) ;
+  my @spacing = grep { $_ !~ /^\p{NonspacingMark}/ } @chars ;
+
+  return scalar(@spacing) ;
+}
+
+# Keeps only the $num most frequent contexts in the count hash referenced by
+# $vec (ties with the $num-th count are kept too) and raises the scalar
+# referenced by $max_len to the display length of the longest surviving key.
+sub filter_context_counts
+{ # filter_context_counts
+
+  my ($vec, $num, $max_len) = @_ ;
+  my ($con, $l, $thresh) ;
+
+  # The $num-th largest count is the cut-off.
+  $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;
+
+  # With fewer than $num contexts there is no such element; nothing has to be
+  # dropped then, so fall back to 0 instead of comparing against undef.
+  $thresh = 0 unless (defined $thresh) ;
+
+  foreach $con (keys %{$vec})
+  {
+    if (${$vec}{$con} < $thresh)
+    {
+      delete ${$vec}{$con} ;
+      next ;
+    }
+
+    $l = uni_len($con) ;
+
+    if ($l > ${$max_len})
+    {
+      ${$max_len} = $l ;
+    }
+  }
+
+} # filter_context_counts
+
+# Prints two side-by-side tables to the OUT handle: CPOS contexts on the left,
+# word contexts on the right, each sorted by descending total count.
+#   $counts, $counts_pos : hash refs with the sub-hashes tot, err_head, err_dep
+#   $max_con_len, $max_con_pos_len : column widths for the word / CPOS keys
+sub print_context
+{ # print_context
+
+ my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;
+ my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ;
+
+ # Header row and ruler for both tables.
+ printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;
+ printf OUT " ||" ;
+ printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;
+ printf OUT "\n" ;
+ printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len;
+ printf OUT "--++" ;
+ printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+ printf OUT "\n" ;
+
+ # Keys ordered by their total count, largest first.
+ @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ;
+ @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ;
+
+ # The number of rows is the length of the longer list.
+ $n = scalar @v_con ;
+ if (scalar @v_con_pos > $n)
+ {
+ $n = scalar @v_con_pos ;
+ }
+
+ foreach $i (0 .. $n-1)
+ {
+ if (defined $v_con_pos[$i])
+ {
+ $con_pos = $v_con_pos[$i] ;
+ # "both" is derived as err_dep + err_head - tot.
+ printf OUT " %-*s | %4d | %4d | %4d | %4d",
+ $max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos},
+ ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos},
+ ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ;
+ }
+ else
+ {
+ # The shorter list is padded with blank cells.
+ printf OUT " %-*s | %4s | %4s | %4s | %4s",
+ $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;
+ }
+
+ printf OUT " ||" ;
+
+ if (defined $v_con[$i])
+ {
+ $con = $v_con[$i] ;
+ # The field width is widened by length($con) - uni_len($con), i.e. by the
+ # difference between the raw length and the display length of the key.
+ printf OUT " %-*s | %4d | %4d | %4d | %4d",
+ $max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con},
+ ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con},
+ ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ;
+ }
+ else
+ {
+ printf OUT " %-*s | %4s | %4s | %4s | %4s",
+ $max_con_len, ' ', ' ', ' ', ' ', ' ' ;
+ }
+
+ printf OUT "\n" ;
+ }
+
+ # Closing ruler followed by two blank lines.
+ printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len;
+ printf OUT "--++" ;
+ printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len;
+ printf OUT "\n" ;
+
+ printf OUT "\n\n" ;
+
+} # print_context
+
+# Turns a (possibly negative) word distance into a phrase: the magnitudes
+# 1 to 4 are spelled out, any other magnitude is rendered as "<n> words".
+sub num_as_word
+{
+  my ($count) = @_ ;
+  my @spelled = ('one word', 'two words', 'three words', 'four words') ;
+
+  $count = abs($count) ;
+
+  foreach my $k (1 .. 4)
+  {
+    return ($spelled[$k-1]) if ($count == $k) ;
+  }
+
+  return ($count.' words') ;
+}
+
+# Builds a human-readable description of one error type.
+#   $head_err     : '-' when the head is correct, otherwise the signed offset
+#                   between system head and gold head
+#   $head_aft_bef : two characters, gold position then system position, each
+#                   one of '0' (root), 'e' (the token itself), 'a' (after the
+#                   token) or 'b' (before the token)
+#   $dep_err      : '-' when the label is correct, otherwise "gold->system"
+# Returns the description string.
+sub describe_err
+{ # describe_err
+
+  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
+  my ($dep_g, $dep_s, $desc) ;
+  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;
+
+  # Wording for each position code.
+  my %position = (
+    '0' => '0',
+    'e' => 'the focus word',
+    'a' => 'after the focus word',
+    'b' => 'before the focus word',
+  ) ;
+
+  if ($head_err eq '-')
+  {
+    $desc = 'correct head' ;
+
+    if (exists $position{$head_aft_bef_s})
+    {
+      $desc .= ' ('.$position{$head_aft_bef_s}.')' ;
+    }
+  }
+  elsif ($head_aft_bef_s eq '0')
+  {
+    # System attached to the root; say where the gold head is.
+    $desc = 'head = 0 instead of ' ;
+    if ($head_aft_bef_g eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_g eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word' ;
+  }
+  elsif ($head_aft_bef_g eq '0')
+  {
+    # Gold head is the root; say where the system head is. The gold code is '0'
+    # in this branch, so the direction has to come from the system code
+    # (testing the gold code here could never match).
+    $desc = 'head is ' ;
+    if ($head_aft_bef_s eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_s eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word instead of 0' ;
+  }
+  else
+  {
+    $desc = num_as_word($head_err) ;
+    $desc .= ($head_err < 0) ? ' before' : ' after' ;
+
+    $desc = 'head '.$desc.' the correct head ' ;
+
+    # Position of the system head ...
+    if (exists $position{$head_aft_bef_s})
+    {
+      $desc .= '('.$position{$head_aft_bef_s} ;
+    }
+
+    # ... contrasted with the gold position when the two differ. The gold
+    # wording is appended here (the system wording was already emitted above),
+    # separated from "instead of" by a space.
+    if ($head_aft_bef_g ne $head_aft_bef_s)
+    {
+      $desc .= ' instead of' ;
+      if (exists $position{$head_aft_bef_g})
+      {
+        $desc .= ' '.$position{$head_aft_bef_g} ;
+      }
+    }
+
+    $desc .= ')' ;
+  }
+
+  $desc .= ', ' ;
+
+  if ($dep_err eq '-')
+  {
+    $desc .= 'correct dependency' ;
+  }
+  else
+  {
+    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
+    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
+  }
+
+  return($desc) ;
+
+} # describe_err
+
+# Returns the neighbourhood of token $i_w in the sentence referenced by $sent:
+# the words at offsets -2, -1, +1, +2 followed by the POS tags at the same
+# offsets. Positions before the sentence yield $START, positions after it $END.
+sub get_context
+{ # get_context
+
+  my ($sent, $i_w) = @_ ;
+  my $last = scalar(@{$sent}) - 1 ;
+  my (@words, @tags) ;
+
+  foreach my $offset (-2, -1, 1, 2)
+  {
+    my $j = $i_w + $offset ;
+    my $pad ;
+
+    # Left neighbours can only fall off the start, right neighbours only off
+    # the end of the sentence.
+    if ($offset < 0)
+    {
+      $pad = $START if ($j < 0) ;
+    }
+    else
+    {
+      $pad = $END if ($j > $last) ;
+    }
+
+    if (($offset < 0 && $j < 0) || ($offset > 0 && $j > $last))
+    {
+      push @words, $pad ;
+      push @tags, $pad ;
+    }
+    else
+    {
+      push @words, ${${$sent}[$j]}{word} ;
+      push @tags, ${${$sent}[$j]}{pos} ;
+    }
+  }
+
+  return (@words, @tags) ;
+
+} # get_context
+
+# Reads the next sentence from the GOLD and SYS file handles in lock-step.
+#   $sent_gold, $sent_sys : array refs; emptied and then filled with one hash
+#                           per token (keys word, pos, head, dep)
+# Returns 1 when both files are exhausted, 0 at the end of a sentence.
+# Exits with status 1 when the two files disagree in length or sentence breaks.
+sub read_sent
+{ # read_sent
+
+  my ($sent_gold, $sent_sys) = @_ ;
+  my ($line_g, $line_s) ;
+  my (%fields_g, %fields_s) ;
+
+  @{$sent_gold} = () ;
+  @{$sent_sys} = () ;
+
+  while (1)
+  { # main reading loop
+
+    # One line from each file per iteration; the handles are opened by the
+    # main part of the script.
+    $line_g = <GOLD> ;
+    $line_s = <SYS> ;
+
+    $line_num++ ;
+
+    # system output has fewer lines than gold standard
+    if ((defined $line_g) && (! defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : past end of file\n" ;
+      exit(1) ;
+    }
+
+    # system output has more lines than gold standard
+    if ((! defined $line_g) && (defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: past end of file\n" ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of file reached for both
+    if ((! defined $line_g) && (! defined $line_s))
+    {
+      return (1) ;
+    }
+
+    # one contains end of sentence but other one does not
+    if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of sentence reached
+    if ($line_g =~ /^\s+$/)
+    {
+      return(0) ;
+    }
+
+    # now both lines contain information
+
+    # 'official' column names
+    # options.output = ['id','form','lemma','cpostag','postag',
+    # 'feats','head','deprel','phead','pdeprel']
+    # Columns 1, 3, 6 and 7 are form, cpostag, head and deprel.
+
+    @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ;
+
+    # { %fields_g } stores a copy, so reusing the hash on the next line is safe.
+    push @{$sent_gold}, { %fields_g } ;
+
+    @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ;
+
+    # A word/tag disagreement is reported but deliberately not fatal.
+    if (($fields_g{word} ne $fields_s{word})
+        ||
+        ($fields_g{pos} ne $fields_s{pos}))
+    {
+      printf STDERR "Word/pos mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      #exit(1) ;
+    }
+
+    push @{$sent_sys}, { %fields_s } ;
+
+  } # main reading loop
+
+} # read_sent
+
+################################################################################
+### main ###
+################################################################################
+
+our ($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ;
+
+my ($sent_num, $eof, $word_num, @err_sent) ;
+my (@sent_gold, @sent_sys, @starts) ;
+my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ;
+my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ;
+my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ;
+my ($loc_con, %loc_con_err_counts, %err_desc) ;
+my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ;
+my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ;
+my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ;
+my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ;
+my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ;
+my (%freq_err, $err) ;
+
+my ($i, $j, $i_w, $l, $n_args) ;
+my ($w_2, $w_1, $w1, $w2) ;
+my ($wp_2, $wp_1, $wp1, $wp2) ;
+my ($p_2, $p_1, $p1, $p2) ;
+
+my ($short_output) ;
+my ($score_on_punct) ;
+$counts{punct} = 0; # initialize
+
+getopts("g:o:s:qvhpb") ;
+
+if (defined $opt_v)
+{
+ my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $';
+ my @parts = split ' ',$id;
+ print "Version $parts[2]\n";
+ exit(0);
+}
+
+if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s)))
+{
+ die $usage ;
+}
+
+if (! defined $opt_g)
+{
+ die "Gold standard file (-g) missing\n" ;
+}
+
+if (! defined $opt_s)
+{
+ die "System output file (-s) missing\n" ;
+}
+
+if (! defined $opt_o)
+{
+ $opt_o = '-' ;
+}
+
+if (defined $opt_q)
+{
+ $short_output = 1 ;
+} else {
+ $short_output = 0 ;
+}
+
+if (defined $opt_p)
+{
+ $score_on_punct = 1 ;
+} else {
+ $score_on_punct = 0 ;
+}
+
+$line_num = 0 ;
+$sent_num = 0 ;
+$eof = 0 ;
+
+@err_sent = () ;
+@starts = () ;
+
+%{$err_sent[0]} = () ;
+
+$max_pos_len = length('CPOS') ;
+
+################################################################################
+### reading input ###
+################################################################################
+
+open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ;
+open (SYS, "<$opt_s") || die "Could not open system output file $opt_s\n" ;
+open (OUT, ">$opt_o") || die "Could not open output file $opt_o\n" ;
+
+
+if (defined $opt_b) { # produce output similar to evalb
+ print OUT " Sent. Attachment Correct Scoring \n";
+ print OUT " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n";
+ print OUT " ============================================================================\n";
+}
+
+
+while (! $eof)
+{ # main reading loop
+
+ $starts[$sent_num] = $line_num+1 ;
+ $eof = read_sent(\@sent_gold, \@sent_sys) ;
+
+ $sent_num++ ;
+
+ %{$err_sent[$sent_num]} = () ;
+ $word_num = scalar @sent_gold ;
+
+ # for accuracy per sentence
+ my %sent_counts = ( tot => 0,
+ err_any => 0,
+ err_head => 0
+ );
+
+ # printf "$sent_num $word_num\n" ;
+
+ my @frames_g = ('** '); # the initial frame for the virtual root
+ my @frames_s = ('** '); # the initial frame for the virtual root
+ foreach $i_w (0 .. $word_num-1)
+ { # loop on words
+ push @frames_g, ''; # initialize
+ push @frames_s, ''; # initialize
+ }
+
+ foreach $i_w (0 .. $word_num-1)
+ { # loop on words
+
+ ($word, $pos, $head_g, $dep_g)
+ = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+ $wp = $word.' / '.$pos ;
+
+ # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ;
+
+ if ((! $score_on_punct) && is_uni_punct($word))
+ {
+ $counts{punct}++ ;
+ # ignore punctuations
+ next ;
+ }
+
+ if (length($pos) > $max_pos_len)
+ {
+ $max_pos_len = length($pos) ;
+ }
+
+ ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+ $counts{tot}++ ;
+ $counts{word}{$wp}{tot}++ ;
+ $counts{pos}{$pos}{tot}++ ;
+ $counts{head}{$head_g-$i_w-1}{tot}++ ;
+
+ # for frame confusions
+ # add child to frame of parent
+ $frames_g[$head_g] .= "$dep_g ";
+ $frames_s[$head_s] .= "$dep_s ";
+ # add to frame of token itself
+ $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero
+ $frames_s[$i_w+1] .= "*$dep_g* ";
+
+ # for precision and recall of DEPREL
+ $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels
+ $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions
+ $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels
+ $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ...
+ $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output
+
+ # for precision and recall of HEAD direction
+ my $dir_g;
+ if ($head_g == 0) {
+ $dir_g = 'to_root';
+ } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero
+ # also below
+ $dir_g = 'left';
+ } elsif ($head_g > $i_w+1) {
+ $dir_g = 'right';
+ } else {
+ # token links to itself; should never happen in correct gold standard
+ $dir_g = 'self';
+ }
+ my $dir_s;
+ if ($head_s == 0) {
+ $dir_s = 'to_root';
+ } elsif ($head_s < $i_w+1) {
+ $dir_s = 'left';
+ } elsif ($head_s > $i_w+1) {
+ $dir_s = 'right';
+ } else {
+ # token links to itself; should not happen in good system
+ # (but not forbidden in shared task)
+ $dir_s = 'self';
+ }
+ $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction
+ $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions
+ $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction
+
+ # for precision and recall of HEAD distance
+ my $dist_g;
+ if ($head_g == 0) {
+ $dist_g = 'to_root';
+ } elsif ( abs($head_g - ($i_w+1)) <= 1 ) {
+ $dist_g = '1'; # includes the 'self' cases
+ } elsif ( abs($head_g - ($i_w+1)) <= 2 ) {
+ $dist_g = '2';
+ } elsif ( abs($head_g - ($i_w+1)) <= 6 ) {
+ $dist_g = '3-6';
+ } else {
+ $dist_g = '7-...';
+ }
+ my $dist_s;
+ if ($head_s == 0) {
+ $dist_s = 'to_root';
+ } elsif ( abs($head_s - ($i_w+1)) <= 1 ) {
+ $dist_s = '1'; # includes the 'self' cases
+ } elsif ( abs($head_s - ($i_w+1)) <= 2 ) {
+ $dist_s = '2';
+ } elsif ( abs($head_s - ($i_w+1)) <= 6 ) {
+ $dist_s = '3-6';
+ } else {
+ $dist_s = '7-...';
+ }
+ $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance
+ $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions
+ $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance
+
+
+ $err_head = ($head_g ne $head_s) ; # error in head
+ $err_dep = ($dep_g ne $dep_s) ; # error in deprel
+
+ $head_err = '-' ;
+ $dep_err = '-' ;
+
+ # for accuracy per sentence
+ $sent_counts{tot}++ ;
+ if ($err_dep || $err_head) {
+ $sent_counts{err_any}++ ;
+ }
+ if ($err_head) {
+ $sent_counts{err_head}++ ;
+ }
+
+ # total counts and counts for CPOS involved in errors
+
+ if ($head_g eq '0')
+ {
+ $head_aft_bef_g = '0' ;
+ }
+ elsif ($head_g eq $i_w+1)
+ {
+ $head_aft_bef_g = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ if ($head_s eq '0')
+ {
+ $head_aft_bef_s = '0' ;
+ }
+ elsif ($head_s eq $i_w+1)
+ {
+ $head_aft_bef_s = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+ if ($err_head)
+ {
+ if ($head_aft_bef_s eq '0')
+ {
+ $head_err = 0 ;
+ }
+ else
+ {
+ $head_err = $head_s-$head_g ;
+ }
+
+ $err_sent[$sent_num]{head}++ ;
+ $counts{err_head}{tot}++ ;
+ $counts{err_head}{$head_err}++ ;
+
+ $counts{word}{err_head}{$wp}++ ;
+ $counts{pos}{$pos}{err_head}{tot}++ ;
+ $counts{pos}{$pos}{err_head}{$head_err}++ ;
+ }
+
+ if ($err_dep)
+ {
+ $dep_err = $dep_g.'->'.$dep_s ;
+ $err_sent[$sent_num]{dep}++ ;
+ $counts{err_dep}{tot}++ ;
+ $counts{err_dep}{$dep_err}++ ;
+
+ $counts{word}{err_dep}{$wp}++ ;
+ $counts{pos}{$pos}{err_dep}{tot}++ ;
+ $counts{pos}{$pos}{err_dep}{$dep_err}++ ;
+
+ if ($err_head)
+ {
+ $counts{err_both}++ ;
+ $counts{pos}{$pos}{err_both}++ ;
+ }
+ }
+
+ ### DEPREL + ATTACHMENT
+ if ((!$err_dep) && ($err_head)) {
+ $counts{err_head_corr_dep}{tot}++ ;
+ $counts{err_head_corr_dep}{$dep_s}++ ;
+ }
+ ### DEPREL + ATTACHMENT
+
+ # counts for words involved in errors
+
+ if (! ($err_head || $err_dep))
+ {
+ next ;
+ }
+
+ $err_sent[$sent_num]{word}++ ;
+ $counts{err_any}++ ;
+ $counts{word}{err_any}{$wp}++ ;
+ $counts{pos}{$pos}{err_any}++ ;
+
+ ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+ if ($w_2 ne $START)
+ {
+ $wp_2 = $w_2.' / '.$p_2 ;
+ }
+ else
+ {
+ $wp_2 = $w_2 ;
+ }
+
+ if ($w_1 ne $START)
+ {
+ $wp_1 = $w_1.' / '.$p_1 ;
+ }
+ else
+ {
+ $wp_1 = $w_1 ;
+ }
+
+ if ($w1 ne $END)
+ {
+ $wp1 = $w1.' / '.$p1 ;
+ }
+ else
+ {
+ $wp1 = $w1 ;
+ }
+
+ if ($w2 ne $END)
+ {
+ $wp2 = $w2.' / '.$p2 ;
+ }
+ else
+ {
+ $wp2 = $w2 ;
+ }
+
+ $con_bef = $wp_1 ;
+ $con_bef_2 = $wp_2.' + '.$wp_1 ;
+ $con_aft = $wp1 ;
+ $con_aft_2 = $wp1.' + '.$wp2 ;
+
+ $con_pos_bef = $p_1 ;
+ $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+ $con_pos_aft = $p1 ;
+ $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+ if ($w_1 ne $START)
+ {
+ # do not count '.S' as a word context
+ $counts{con_bef_2}{tot}{$con_bef_2}++ ;
+ $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ;
+ $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ;
+ $counts{con_bef}{tot}{$con_bef}++ ;
+ $counts{con_bef}{err_head}{$con_bef} += $err_head ;
+ $counts{con_bef}{err_dep}{$con_bef} += $err_dep ;
+ }
+
+ if ($w1 ne $END)
+ {
+ # do not count '.E' as a word context
+ $counts{con_aft_2}{tot}{$con_aft_2}++ ;
+ $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ;
+ $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ;
+ $counts{con_aft}{tot}{$con_aft}++ ;
+ $counts{con_aft}{err_head}{$con_aft} += $err_head ;
+ $counts{con_aft}{err_dep}{$con_aft} += $err_dep ;
+ }
+
+ $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ;
+ $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ;
+ $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ;
+ $counts{con_pos_bef}{tot}{$con_pos_bef}++ ;
+ $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ;
+ $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ;
+
+ $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ;
+ $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ;
+ $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ;
+ $counts{con_pos_aft}{tot}{$con_pos_aft}++ ;
+ $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ;
+ $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ;
+
+ $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+ $freq_err{$err}++ ;
+
+ } # loop on words
+
+ foreach $i_w (0 .. $word_num) # including one for the virtual root
+ { # loop on words
+ if ($frames_g[$i_w] ne $frames_s[$i_w]) {
+ $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ;
+ }
+ }
+
+ if (defined $opt_b) { # produce output similar to evalb
+ if ($word_num > 0) {
+ my ($unlabeled,$labeled) = ('NaN', 'NaN');
+ if ($sent_counts{tot} > 0) { # there are scoring tokens
+ $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot};
+ $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot};
+ }
+ printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n",
+ $sent_num, $word_num,
+ $unlabeled, $labeled,
+ $sent_counts{tot}-$sent_counts{err_head},
+ $sent_counts{tot}-$sent_counts{err_any},
+ $sent_counts{tot},;
+ }
+ }
+
+} # main reading loop
+
+################################################################################
+### printing output ###
+################################################################################
+
+if (defined $opt_b) { # produce output similar to evalb
+ print OUT "\n\n";
+}
+printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n",
+ $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ;
+printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n",
+ $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ;
+printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n",
+ $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ;
+
+if ($short_output)
+{
+ exit(0) ;
+}
+printf OUT "\n %s\n\n", '=' x 80 ;
+printf OUT " Evaluation of the results in %s\n vs. gold standard %s:\n\n", $opt_s, $opt_g ;
+
+printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ;
+
+printf OUT " Number of non-scoring tokens: $counts{punct}\n\n";
+
+printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n",
+ 'Accuracy', 'words', 'right', 'right', 'both' ;
+printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n",
+ ' ', ' ', 'head', ' dep', 'right' ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ 'total', $counts{tot},
+ $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot},
+ $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot},
+ $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+ if (! defined($counts{pos}{$pos}{err_head}{tot}))
+ {
+ $counts{pos}{$pos}{err_head}{tot} = 0 ;
+ }
+ if (! defined($counts{pos}{$pos}{err_dep}{tot}))
+ {
+ $counts{pos}{$pos}{err_dep}{tot} = 0 ;
+ }
+ if (! defined($counts{pos}{$pos}{err_any}))
+ {
+ $counts{pos}{$pos}{err_any} = 0 ;
+ }
+
+ printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ $pos, $counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ;
+}
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT "\n\n" ;
+
+printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ;
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n",
+ 'Error', 'words', 'head', ' dep', 'both' ;
+printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n",
+
+ 'Rate', ' ', 'err', ' err', 'wrong' ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ 'total', $counts{tot},
+ $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot},
+ $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot},
+ $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ;
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}})
+{
+ if (! defined($counts{pos}{$pos}{err_both}))
+ {
+ $counts{pos}{$pos}{err_both} = 0 ;
+ }
+
+ printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n",
+ $pos, $counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot},
+ $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ;
+
+}
+
+printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ;
+
+### added by Sabine Buchholz
+printf OUT "\n\n";
+printf OUT " Precision and recall of DEPREL\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}}) {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ if (defined($counts{dep2}{$dep}{$dep})) {
+ $tot_corr = $counts{dep2}{$dep}{$dep};
+ }
+ if (defined($counts{dep}{$dep}{tot})) {
+ $tot_g = $counts{dep}{$dep}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dep_s}{$dep}{tot})) {
+ $tot_s = $counts{dep_s}{$dep}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+### DEPREL + ATTACHMENT:
+### Same as Sabine's DEPREL apart from $tot_corr calculation
+printf OUT "\n\n";
+printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dep (sort keys %{$counts{all_dep}}) {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ if (defined($counts{dep2}{$dep}{$dep})) {
+ if (defined($counts{err_head_corr_dep}{$dep})) {
+ $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep};
+ } else {
+ $tot_corr = $counts{dep2}{$dep}{$dep};
+ }
+ }
+ if (defined($counts{dep}{$dep}{tot})) {
+ $tot_g = $counts{dep}{$dep}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dep_s}{$dep}{tot})) {
+ $tot_s = $counts{dep_s}{$dep}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+### DEPREL + ATTACHMENT
+
+printf OUT "\n\n";
+printf OUT " Precision and recall of binned HEAD direction\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dir ('to_root', 'left', 'right', 'self') {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ if (defined($counts{dir2}{$dir}{$dir})) {
+ $tot_corr = $counts{dir2}{$dir}{$dir};
+ }
+ if (defined($counts{dir_g}{$dir}{tot})) {
+ $tot_g = $counts{dir_g}{$dir}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dir_s}{$dir}{tot})) {
+ $tot_s = $counts{dir_s}{$dir}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+printf OUT "\n\n";
+printf OUT " Precision and recall of binned HEAD distance\n\n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n";
+printf OUT " ----------------+------+---------+--------+------------+---------------\n";
+foreach my $dist ('to_root', '1', '2', '3-6', '7-...') {
+ # initialize
+ my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN');
+
+ if (defined($counts{dist2}{$dist}{$dist})) {
+ $tot_corr = $counts{dist2}{$dist}{$dist};
+ }
+ if (defined($counts{dist_g}{$dist}{tot})) {
+ $tot_g = $counts{dist_g}{$dist}{tot};
+ $rec = sprintf("%.2f",$tot_corr / $tot_g * 100);
+ }
+ if (defined($counts{dist_s}{$dist}{tot})) {
+ $tot_s = $counts{dist_s}{$dist}{tot};
+ $prec = sprintf("%.2f",$tot_corr / $tot_s * 100);
+ }
+ printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n",
+ $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec;
+}
+
+printf OUT "\n\n";
+printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n";
+foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}})
+{
+ if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later)
+ {
+ printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame;
+ }
+}
+### end of: added by Sabine Buchholz
+
+
+#
+# Leave only the 5 words mostly involved in errors
+#
+
+
+# $thresh is the 5th largest per-word error count (index 4 of the counts
+# sorted in descending order). With fewer than 5 words the slice yields undef.
+$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ;
+
+# ensure enough space for title
+$max_word_len = length('word') ;
+
+foreach $word (keys %{$counts{word}{err_any}})
+{
+ # the comparison is strict, so every word tied with the threshold count is
+ # kept and more than 5 words may remain
+ if ($counts{word}{err_any}{$word} < $thresh)
+ {
+ delete $counts{word}{err_any}{$word} ;
+ next ;
+ }
+
+ # track the widest remaining word for the column width of the table
+ # (uni_len is defined elsewhere; presumably the length in characters - confirm)
+ $l = uni_len($word) ;
+ if ($l > $max_word_len)
+ {
+ $max_word_len = $l ;
+ }
+}
+
+# filter a case when the difference between the error counts
+# for 2-word and 1-word contexts is small
+# (leave the 2-word context)
+
+foreach $con (keys %{$counts{con_aft_2}{tot}})
+{
+ ($w1) = split(/\+/, $con) ;
+
+ if (defined $counts{con_aft}{tot}{$w1} &&
+ $counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1)
+ {
+ delete $counts{con_aft}{tot}{$w1} ;
+ }
+}
+
+foreach $con (keys %{$counts{con_bef_2}{tot}})
+{
+ ($w_2, $w_1) = split(/\+/, $con) ;
+
+ if (defined $counts{con_bef}{tot}{$w_1} &&
+ $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1)
+ {
+ delete $counts{con_bef}{tot}{$w_1} ;
+ }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+ ($p1) = split(/\+/, $con_pos) ;
+
+ if (defined($counts{con_pos_aft}{tot}{$p1}) &&
+ $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1)
+ {
+ delete $counts{con_pos_aft}{tot}{$p1} ;
+ }
+}
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+ ($p_2, $p_1) = split(/\+/, $con_pos) ;
+
+ if (defined($counts{con_pos_bef}{tot}{$p_1}) &&
+ $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1)
+ {
+ delete $counts{con_pos_bef}{tot}{$p_1} ;
+ }
+}
+
+# for each context type, take the three contexts most involved in errors
+
+$max_con_len = 0 ;
+
+filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ;
+
+filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ;
+
+# for each CPOS context type, take the three CPOS contexts most involved in errors
+
+$max_con_pos_len = 0 ;
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}})
+{
+ if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_bef_2}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_bef}{tot}})
+{
+ if ($counts{con_pos_bef}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_bef}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_aft}{tot}})
+{
+ if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_aft}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ;
+
+foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}})
+{
+ if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh)
+ {
+ delete $counts{con_pos_aft_2}{tot}{$con_pos} ;
+ next ;
+ }
+ if (length($con_pos) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos) ;
+ }
+}
+
+# printing
+
+# ------------- focus words
+
+printf OUT "\n\n" ;
+printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ;
+
+printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ;
+printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len;
+
+foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}})
+{
+ if (!defined($counts{word}{err_head}{$word}))
+ {
+ $counts{word}{err_head}{$word} = 0 ;
+ }
+ if (! defined($counts{word}{err_dep}{$word}))
+ {
+ $counts{word}{err_dep}{$word} = 0 ;
+ }
+ if (! defined($counts{word}{err_any}{$word}))
+ {
+ $counts{word}{err_any}{$word} = 0;
+ }
+ printf OUT " %-*s | %4d | %4d | %4d | %4d\n",
+ $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word},
+ $counts{word}{err_head}{$word},
+ $counts{word}{err_dep}{$word},
+ $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ;
+}
+
+printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len;
+
+# ------------- contexts
+
+printf OUT "\n\n" ;
+
+printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " one-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ;
+
+printf OUT " two-token following contexts where most of the errors occur:\n\n" ;
+
+print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ;
+
+# ------------- Sentences
+
+printf OUT " Sentence with the highest number of word errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word})
+ <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT " Sentence with the highest number of head errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head})
+ <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+printf OUT "\n\n" ;
+
+printf OUT " Sentence with the highest number of dependency errors:\n" ;
+$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep})
+ <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ;
+printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ;
+printf OUT "%d head errors, %d dependency errors, %d word errors\n",
+ $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ;
+
+#
+# Second pass, collect statistics of the frequent errors
+#
+
+# filter the errors, leave the most frequent $freq_err_num errors
+
+$i = 0 ;
+
+# count of the $freq_err_num-th most frequent error (descending sort)
+$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ;
+
+# drop every error type that occurs strictly less often than the threshold
+foreach $err (keys %freq_err)
+{
+ if ($freq_err{$err} < $thresh)
+ {
+ delete $freq_err{$err} ;
+ }
+}
+
+# in case there are several errors with the threshold count
+
+# ties at the threshold were kept, so recompute how many error types remain
+$freq_err_num = scalar keys %freq_err ;
+
+# per-error local-context counters filled by the second reading loop
+%err_counts = () ;
+
+$eof = 0 ;
+
+# rewind both input files so that they can be read a second time
+seek (GOLD, 0, 0) ;
+seek (SYS, 0, 0) ;
+
+# Second pass over the gold and system files: for every token whose error
+# type is one of the retained frequent errors, count all generalisations of
+# its local context in %err_counts.
+# (read_sent, is_uni_punct and get_context are defined elsewhere in the script.)
+while (! $eof)
+{ # second reading loop
+
+ $eof = read_sent(\@sent_gold, \@sent_sys) ;
+ # NOTE(review): $sent_num is not reset in the visible code before this loop,
+ # so it appears to keep counting from the first pass - confirm.
+ $sent_num++ ;
+
+ $word_num = scalar @sent_gold ;
+
+ # printf "$sent_num $word_num\n" ;
+
+ foreach $i_w (0 .. $word_num-1)
+ { # loop on words
+ ($word, $pos, $head_g, $dep_g)
+ = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ;
+
+ # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ;
+
+ if ((! $score_on_punct) && is_uni_punct($word))
+ {
+ # ignore punctuations
+ next ;
+ }
+
+ ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ;
+
+ # heads and labels are compared as strings
+ $err_head = ($head_g ne $head_s) ;
+ $err_dep = ($dep_g ne $dep_s) ;
+
+ # '-' stands for "no error of this kind"
+ $head_err = '-' ;
+ $dep_err = '-' ;
+
+ # one-letter code for the gold head position relative to the token
+ # (the token's own 1-based position is $i_w+1):
+ # '0' = head is 0, 'e' = head equals the token itself,
+ # 'b' = head before the token, 'a' = head after the token
+ if ($head_g eq '0')
+ {
+ $head_aft_bef_g = '0' ;
+ }
+ elsif ($head_g eq $i_w+1)
+ {
+ $head_aft_bef_g = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ # the same code for the system head
+ if ($head_s eq '0')
+ {
+ $head_aft_bef_s = '0' ;
+ }
+ elsif ($head_s eq $i_w+1)
+ {
+ $head_aft_bef_s = 'e' ;
+ }
+ else
+ {
+ $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ;
+ }
+
+ # two-letter code: gold position followed by system position
+ $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ;
+
+ if ($err_head)
+ {
+ # a wrong head is described by the signed offset system-gold,
+ # except that a system head of 0 is always recorded as 0
+ if ($head_aft_bef_s eq '0')
+ {
+ $head_err = 0 ;
+ }
+ else
+ {
+ $head_err = $head_s-$head_g ;
+ }
+ }
+
+ if ($err_dep)
+ {
+ $dep_err = $dep_g.'->'.$dep_s ;
+ }
+
+ # correctly parsed tokens are of no interest here
+ if (! ($err_head || $err_dep))
+ {
+ next ;
+ }
+
+ # handle only the most frequent errors
+
+ # key identifying the error type; it has to match the keys of %freq_err
+ # (which is filled outside this section)
+ $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ;
+
+ if (! exists $freq_err{$err})
+ {
+ next ;
+ }
+
+ ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ;
+
+ # the two-token variants ($con_bef_2, $con_aft_2, $con_pos_bef_2,
+ # $con_pos_aft_2) are computed here but not used further in this loop
+ $con_bef = $w_1 ;
+ $con_bef_2 = $w_2.' + '.$w_1 ;
+ $con_aft = $w1 ;
+ $con_aft_2 = $w1.' + '.$w2 ;
+
+ $con_pos_bef = $p_1 ;
+ $con_pos_bef_2 = $p_2.'+'.$p_1 ;
+ $con_pos_aft = $p1 ;
+ $con_pos_aft_2 = $p1.'+'.$p2 ;
+
+ # the six fields of the local context, in the order in which they are
+ # joined into the context key
+ @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ;
+
+ # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n",
+ # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ;
+
+ # @bits is a 6-bit counter (lowest bit first); a set bit keeps the
+ # corresponding context field, a cleared bit replaces it with '*'.
+ # The loop below steps through all 64 bit patterns.
+ @bits = (0, 0, 0, 0, 0, 0) ;
+ $j = 0 ;
+
+ while ($j == 0)
+ {
+ # increment the counter: set the first cleared bit and clear all bits
+ # before it; $j stays 1 only when every bit was set, i.e. when the
+ # counter wraps around to all zeros
+ for ($i = 0; $i <= $#bits; $i++)
+ {
+ if ($bits[$i] == 0)
+ {
+ $bits[$i] = 1 ;
+ $j = 0 ;
+ last ;
+ }
+ else
+ {
+ $bits[$i] = 0 ;
+ $j = 1 ;
+ }
+ }
+
+ @e_bits = @cur_err ;
+
+ # wildcard every field whose bit is cleared
+ for ($i = 0; $i <= $#bits; $i++)
+ {
+ if (! $bits[$i])
+ {
+ $e_bits[$i] = '*' ;
+ }
+ }
+
+ # include also the last case which is the most general
+ # (wildcards for everything)
+ $err_counts{$err}{join($sep, @e_bits)}++ ;
+
+ }
+
+ } # loop on words
+} # second reading loop
+
+printf OUT "\n\n" ;
+printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ;
+printf OUT "\n %s\n", '=' x 41 ;
+
+
+# deleting local contexts which are too general
+
+# A generalised context that occurs exactly as often as a more specific one
+# carries no extra information, so only the more specific entry is kept.
+foreach $err (keys %err_counts)
+{
+ # the key list is built before the loop body runs, so entries deleted below
+ # can still be visited; the defined() checks take care of that
+ foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+ keys %{$err_counts{$err}})
+ {
+ @cur_err = split(/\Q$sep\E/, $loc_con) ;
+
+ # In this loop, one or two elements of the local context are
+ # replaced with '*' to make it more general. If the entry for
+ # the general context has the same count it is removed.
+
+ foreach $i (0 .. $#cur_err)
+ {
+ # remember field $i so that it can be restored afterwards
+ $w1 = $cur_err[$i] ;
+ if ($cur_err[$i] eq '*')
+ {
+ next ;
+ }
+ # generalisation by one field
+ $cur_err[$i] = '*' ;
+ $con1 = join($sep, @cur_err) ;
+ if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+ && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+ {
+ delete $err_counts{$err}{$con1} ;
+ }
+ # generalisation by a second field $j > $i on top of field $i
+ for ($j = $i+1; $j <=$#cur_err; $j++)
+ {
+ if ($cur_err[$j] eq '*')
+ {
+ next ;
+ }
+ $w2 = $cur_err[$j] ;
+ $cur_err[$j] = '*' ;
+ $con1 = join($sep, @cur_err) ;
+ if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con})
+ && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con}))
+ {
+ delete $err_counts{$err}{$con1} ;
+ }
+ # restore field $j
+ $cur_err[$j] = $w2 ;
+ }
+ # restore field $i
+ $cur_err[$i] = $w1 ;
+ }
+ }
+}
+
+# Leaving only the topmost local contexts for each error
+
+foreach $err (keys %err_counts)
+{
+ # count of the $spec_err_loc_con-th most frequent context of this error,
+ # or 0 when there are fewer contexts than that
+ $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ;
+
+ # if the threshold is too low, take the 2nd highest count
+ # (the highest may be the total which is the generic case
+ # and not relevant for printing)
+
+ if ($thresh < 5)
+ {
+ # undef when this error has only a single context left
+ $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ;
+ }
+
+ foreach $loc_con (keys %{$err_counts{$err}})
+ {
+ if ($err_counts{$err}{$loc_con} < $thresh)
+ {
+ delete $err_counts{$err}{$loc_con} ;
+ }
+ else
+ {
+ # every kept context except the all-wildcard one is also indexed by
+ # context, to find contexts shared by several errors later
+ if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*')))
+ {
+ $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ;
+ }
+ }
+ }
+}
+
+# printing an error summary
+
+# calculating the context field length
+
+$max_word_spec_len= length('word') ;
+$max_con_aft_len = length('word') ;
+$max_con_bef_len = length('word') ;
+$max_con_pos_len = length('CPOS') ;
+
+foreach $err (keys %err_counts)
+{
+ foreach $loc_con (sort keys %{$err_counts{$err}})
+ {
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $loc_con) ;
+
+ $l = uni_len($word) ;
+ if ($l > $max_word_spec_len)
+ {
+ $max_word_spec_len = $l ;
+ }
+
+ $l = uni_len($con_bef) ;
+ if ($l > $max_con_bef_len)
+ {
+ $max_con_bef_len = $l ;
+ }
+
+ $l = uni_len($con_aft) ;
+ if ($l > $max_con_aft_len)
+ {
+ $max_con_aft_len = $l ;
+ }
+
+ if (length($con_pos_aft) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos_aft) ;
+ }
+
+ if (length($con_pos_bef) > $max_con_pos_len)
+ {
+ $max_con_pos_len = length($con_pos_bef) ;
+ }
+ }
+}
+
+$err_counter = 0 ;
+
+foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err)
+{
+
+ ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ;
+
+ $err_counter++ ;
+ $err_desc{$err} = sprintf("%2d. ", $err_counter).
+ describe_err($head_err, $head_aft_bef, $dep_err) ;
+
+ # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ;
+ printf OUT "\n" ;
+ printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ printf OUT " %-*s | %-*s | %-*s | %s\n",
+ $max_con_pos_len+$max_con_bef_len+3, ' Before',
+ $max_word_spec_len+$max_pos_len+3, ' Focus',
+ $max_con_pos_len+$max_con_aft_len+3, ' After',
+ 'Count' ;
+
+ printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n",
+ $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+ $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+ $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}}
+ keys %{$err_counts{$err}})
+ {
+ if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*')))
+ {
+ next ;
+ }
+
+ $con1 = $loc_con ;
+ $con1 =~ s/\*/ /g ;
+
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $con1) ;
+
+ printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n",
+ $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+ $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+ $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft,
+ $err_counts{$err}{$loc_con} ;
+ }
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+}
+
+printf OUT "\n\n" ;
+printf OUT " Local contexts involved in several frequent errors:" ;
+printf OUT "\n %s\n", '=' x 51 ;
+printf OUT "\n\n" ;
+
+foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=>
+ scalar keys %{$loc_con_err_counts{$a}}}
+ keys %loc_con_err_counts)
+{
+
+ if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1)
+ {
+ next ;
+ }
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ printf OUT " %-*s | %-*s | %-*s \n",
+ $max_con_pos_len+$max_con_bef_len+3, ' Before',
+ $max_word_spec_len+$max_pos_len+3, ' Focus',
+ $max_con_pos_len+$max_con_aft_len+3, ' After' ;
+
+ printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n",
+ $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word',
+ $max_pos_len, 'CPOS', $max_word_spec_len, 'word',
+ $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ $con1 = $loc_con ;
+ $con1 =~ s/\*/ /g ;
+
+ ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) =
+ split(/\Q$sep\E/, $con1) ;
+
+ printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n",
+ $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef,
+ $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word,
+ $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ;
+
+ printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n",
+ '-' x $max_con_pos_len, '-' x $max_con_bef_len,
+ '-' x $max_pos_len, '-' x $max_word_spec_len,
+ '-' x $max_con_pos_len, '-' x $max_con_aft_len ;
+
+ foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=>
+ $loc_con_err_counts{$loc_con}{$a}}
+ keys %{$loc_con_err_counts{$loc_con}})
+ {
+ printf OUT " %s : %d times\n", $err_desc{$err},
+ $loc_con_err_counts{$loc_con}{$err} ;
+ }
+
+ printf OUT "\n" ;
+}
+
+close GOLD ;
+close SYS ;
+
+close OUT ;
diff --git a/bist_parser/bmstparser/src/utils.py b/bist_parser/bmstparser/src/utils.py
new file mode 100644
index 0000000..901e3b5
--- /dev/null
+++ b/bist_parser/bmstparser/src/utils.py
@@ -0,0 +1,93 @@
+from collections import Counter
+import re
+
+
+class ConllEntry:
+ """
+ A single token of a CoNLL sentence together with its gold and predicted
+ head and relation.
+ """
+ def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
+ """
+ Stores the fields of the token.
+ :param id: the id of the token
+ :param form: the word form; a normalized copy is kept in self.norm
+ :param pos: the POS tag (stored upper-cased)
+ :param cpos: the coarse POS tag (stored upper-cased)
+ :param parent_id: the id of the gold head
+ :param relation: the gold dependency relation
+ """
+ self.id = id
+ self.form = form
+ self.norm = normalize(form)
+ self.cpos = cpos.upper()
+ self.pos = pos.upper()
+ self.parent_id = parent_id
+ self.relation = relation
+
+ # predictions start out empty; presumably filled in by the parser
+ self.pred_parent_id = None
+ self.pred_relation = None
+
+
+def vocab(conll_path):
+ """
+ Create the vocabulary from a CoNLL file.
+ :param conll_path: the path of the CoNLL file, which is read with read_conll
+ :return: the words count, a word-to-id mapping, a list of pos count keys, a list of rel count keys
+ """
+ wordsCount = Counter()
+ posCount = Counter()
+ relCount = Counter()
+
+ with open(conll_path, 'r') as conllFP:
+ for sentence in read_conll(conllFP):
+ # every entry of the sentence is counted, including the root entry
+ # that read_conll puts at the start of each sentence
+ wordsCount.update([node.norm for node in sentence])
+ posCount.update([node.pos for node in sentence])
+ relCount.update([node.relation for node in sentence])
+
+ # word ids are assigned in the iteration order of the counter's keys
+ return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())},
+ [k for k in posCount.keys()], [k for k in relCount.keys()])
+
+
+def vocab_conll(conll_entries):
+ """
+ Create the vocabulary directly from CoNLL entries.
+ :param conll_entries: a list of lists of CoNLL entries
+ :return: the words count, a word-to-id mapping, a list of pos count keys, a list of rel count keys
+ """
+ wordsCount = Counter()
+ posCount = Counter()
+ relCount = Counter()
+
+ for sentence in conll_entries:
+ # words are counted by their normalized form (node.norm)
+ wordsCount.update([node.norm for node in sentence])
+ posCount.update([node.pos for node in sentence])
+ relCount.update([node.relation for node in sentence])
+
+ # word ids are assigned in the iteration order of the counter's keys
+ return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())},
+ [k for k in posCount.keys()], [k for k in relCount.keys()])
+
+
+def read_conll(fh):
+ """
+ Read sentences from a CoNLL file handle.
+ :param fh: an iterable of lines; sentences are separated by blank lines
+ :return: a generator of sentences, each a list of ConllEntry objects that
+ starts with an artificial root entry
+ """
+ # NOTE: the root entry is created once, so the same object is the first
+ # element of every sentence that is yielded
+ root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', -1, 'rroot')
+ tokens = [root]
+ for line in fh:
+ tok = line.strip().split()
+ if not tok:
+ # a blank line ends the sentence; a sentence holding only the root is not yielded
+ if len(tokens)>1: yield tokens
+ tokens = [root]
+ else:
+ # columns used: 0 = id, 1 = form, 4 = pos, 3 = cpos, 6 = head ('_' becomes -1), 7 = relation
+ tokens.append(ConllEntry(int(tok[0]), tok[1], tok[4], tok[3], int(tok[6]) if tok[6] != '_' else -1, tok[7]))
+ # yield the last sentence if the input does not end with a blank line
+ if len(tokens) > 1:
+ yield tokens
+
+
+def write_conll(fn, conll_gen):
+ """
+ Write CoNLL entries with their predicted heads and relations to file.
+ :param fn: the path of the file to which the CoNLL entries should be written
+ :param conll_gen: an iterable of sentences (lists of CoNLL entries)
+ """
+ with open(fn, 'w') as fh:
+ for sentence in conll_gen:
+ # the first entry of every sentence is skipped (presumably the artificial root)
+ for entry in sentence[1:]:
+ # ten tab-separated columns; unused columns are written as '_'.
+ # pred_relation has to be a string here, otherwise join raises a TypeError
+ fh.write('\t'.join([str(entry.id), entry.form, '_', entry.cpos, entry.pos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_']))
+ fh.write('\n')
+ # a blank line separates the sentences
+ fh.write('\n')
+
+
+def write_original_conll(fn, conll_original):
+ """
+ Write original CoNLL entries to file (in contrast to predicted/generated CoNLL entries).
+ :param fn: the path of the file to which the CoNLL entries should be written
+ :param conll_original: the original CoNLL entries that should be written to the file
+ """
+ with open(fn, 'w') as fh:
+ for sentence in conll_original:
+ # the first entry of every sentence is skipped (presumably the artificial root)
+ for entry in sentence[1:]:
+ # ten tab-separated columns using the gold parent_id and relation;
+ # unused columns are written as '_'
+ fh.write('\t'.join([str(entry.id), entry.form, '_', entry.cpos, entry.pos, '_', str(entry.parent_id), entry.relation, '_', '_']))
+ fh.write('\n')
+ # a blank line separates the sentences
+ fh.write('\n')
+
+
+# pattern for number-like tokens; it is applied with match(), which only
+# anchors at the start, so any word that begins with a digit is matched
+numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+");
+def normalize(word):
+ """
+ Normalize a word form.
+ :param word: the word form
+ :return: 'NUM' if the word starts with a digit, otherwise the lower-cased word
+ """
+ return 'NUM' if numberRegex.match(word) else word.lower()
+
diff --git a/constants.py b/constants.py
new file mode 100644
index 0000000..62c3d85
--- /dev/null
+++ b/constants.py
@@ -0,0 +1,39 @@
+"""
+Constants that are shared across files.
+"""
+
+NEG_ID = 0 # the negative sentiment id
+POS_ID = 1 # the positive sentiment id
+NEU_ID = 2 # the neutral sentiment id
+
+# feature-related constants
+FEATURE_SETS = ['similarity', 'topic_similarity', 'word_embedding_similarity',
+ 'diversity']
+SIMILARITY_FUNCTIONS = ['jensen-shannon', 'renyi', 'cosine', 'euclidean',
+ 'variational', 'bhattacharyya']
+DIVERSITY_FEATURES = ['num_word_types', 'type_token_ratio', 'entropy',
+ 'simpsons_index', 'quadratic_entropy', 'renyi_entropy']
+
+# task-related constants
+POS = 'pos'
+POS_BILSTM = 'pos_bilstm'
+SENTIMENT = 'sentiment'
+PARSING = 'parsing'
+TASKS = [POS, POS_BILSTM, SENTIMENT, PARSING]
+POS_PARSING_TRG_DOMAINS = ['answers', 'emails', 'newsgroups', 'reviews', 'weblogs', 'wsj']
+SENTIMENT_TRG_DOMAINS = ['books', 'dvd', 'electronics', 'kitchen']
+# maps each task to an example count (presumably the number of training
+# examples used for the task - confirm against the callers)
+TASK2TRAIN_EXAMPLES = {
+ POS: 2000, POS_BILSTM: 2000, SENTIMENT: 1600, PARSING: 2000
+}
+# maps each task to its list of domains; POS, POS_BILSTM and PARSING share
+# the same list
+TASK2DOMAINS = {
+ POS: POS_PARSING_TRG_DOMAINS, POS_BILSTM: POS_PARSING_TRG_DOMAINS,
+ SENTIMENT: SENTIMENT_TRG_DOMAINS, PARSING: POS_PARSING_TRG_DOMAINS
+}
+
+# method-related constants
+BAYES_OPT = 'bayes-opt'
+RANDOM = 'random'
+MOST_SIMILAR_DOMAIN = 'most-similar-domain'
+MOST_SIMILAR_EXAMPLES = 'most-similar-examples'
+ALL_SOURCE_DATA = 'all-source-data'
+# the baseline methods; BAYES_OPT is not part of this list
+BASELINES = [RANDOM, MOST_SIMILAR_DOMAIN, MOST_SIMILAR_EXAMPLES, ALL_SOURCE_DATA]
diff --git a/data_utils.py b/data_utils.py
new file mode 100644
index 0000000..ad50b57
--- /dev/null
+++ b/data_utils.py
@@ -0,0 +1,470 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Utility methods for loading and processing data.
+"""
+
+import os
+import codecs
+from collections import Counter
+import itertools
+import operator
+
+import numpy as np
+import scipy.sparse
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from constants import NEG_ID, POS_ID
+from simpletagger import read_conll_file
+
+from constants import SENTIMENT, POS, POS_BILSTM, PARSING, \
+ SENTIMENT_TRG_DOMAINS, POS_PARSING_TRG_DOMAINS
+from bist_parser.bmstparser.src.utils import read_conll
+
+
class Vocab:
    """
    Word-to-id vocabulary. It is either built from tokenized texts (and
    written to ``vocab_path``) or read back from that file.
    """
    def __init__(self, max_vocab_size, vocab_path):
        # upper bound on the number of words kept in the vocabulary
        self.max_vocab_size = max_vocab_size
        # file the vocabulary is written to / read from
        self.vocab_path = vocab_path
        # number of words currently in the vocabulary; 0 until load/create
        self.size = 0
        self.word2id = {}
        self.id2word = {}

    def load(self):
        """
        Reads the tab-separated ``word<TAB>id`` entries stored at the
        vocabulary path; at most ``max_vocab_size`` lines are read.
        """
        assert self.size == 0, 'Vocabulary has already been loaded or built.'
        print('Reading vocabulary from %s...' % self.vocab_path)
        with codecs.open(self.vocab_path, 'r', encoding='utf-8') as vocab_file:
            for line_no, entry in enumerate(vocab_file):
                if line_no >= self.max_vocab_size:
                    print('Vocab in file is larger than max vocab size. '
                          'Only using top %d words.' % self.max_vocab_size)
                    break
                token, token_id = entry.split('\t')
                self.word2id[token] = int(token_id.strip())
        self.size = len(self.word2id)
        self.id2word = dict(
            (token_id, token) for token, token_id in self.word2id.items())
        assert self.size <= self.max_vocab_size, \
            'Loaded vocab is of size %d., max vocab size is %d.' % (
                self.size, self.max_vocab_size)

    def create(self, texts, lowercase=True):
        """
        Builds the vocabulary from the most frequent words of the texts and
        writes it to the vocabulary path.
        :param texts: a list of lists of tokens
        :param lowercase: lowercase the input texts
        """
        assert self.size == 0, 'Vocabulary has already been loaded or built.'
        print('Building the vocabulary...')
        if lowercase:
            print('Lower-casing the input texts...')
            texts = [[token.lower() for token in text] for text in texts]

        # words ordered from most to least frequent, capped at the maximum
        # vocabulary size; the rank of a word becomes its id
        frequencies = Counter(itertools.chain.from_iterable(texts))
        ranked_words = [token for token, _ in
                        frequencies.most_common(self.max_vocab_size)]

        self.word2id = {token: rank for rank, token in enumerate(ranked_words)}
        self.id2word = dict(enumerate(ranked_words))

        print('Writing vocabulary to %s...' % self.vocab_path)
        with codecs.open(self.vocab_path, 'w', encoding='utf-8') as vocab_file:
            for rank, token in enumerate(ranked_words):
                vocab_file.write('%s\t%d\n' % (token, rank))
        self.size = len(self.word2id)
+
+
def get_all_docs(domain_data_pairs, unlabeled=True):
    """
    Return all labeled and unlabeled documents of multiple domains.
    :param domain_data_pairs: a list of (domain, (labeled_reviews, labels,
                              unlabeled_reviews)) tuples as obtained by
                              domain2data.items()
    :param unlabeled: whether unlabeled documents should be incorporated
    :return: the documents from all domains (a list, or a stacked sparse
             matrix if the inputs were sparse matrices), the labels of the
             labeled documents, and a list containing the domain of each
             returned document (labeled and unlabeled)
    """
    docs, labels, domains = [], [], []
    sparse_input = False
    for domain, (labeled_docs, doc_labels, unlabeled_docs) in domain_data_pairs:
        # a domain may come without unlabeled documents (None)
        add_unlabeled = unlabeled and unlabeled_docs is not None
        if scipy.sparse.issparse(labeled_docs):
            # sparse (e.g. tf-idf) matrices are collected as a list and
            # stacked into a single matrix at the end
            sparse_input = True
            docs.append(labeled_docs)
            length_of_docs = labeled_docs.shape[0]
            if add_unlabeled:
                docs.append(unlabeled_docs)
                length_of_docs += unlabeled_docs.shape[0]
        else:
            # plain lists of documents are flattened into one list
            docs += labeled_docs
            length_of_docs = len(labeled_docs)
            if add_unlabeled:
                docs += unlabeled_docs
                # count the unlabeled documents themselves so that the
                # domain list stays aligned with the document list
                length_of_docs += len(unlabeled_docs)
        labels.append(doc_labels)

        # we just add the corresponding domain for each document so that we
        # can later see where the docs came from
        domains += [domain] * length_of_docs
    if sparse_input:
        docs = scipy.sparse.vstack(docs)
    return docs, np.hstack(labels), domains
+
+
def get_tfidf_data(domain2data, vocab):
    """
    Transform the tokenized documents of each domain into a tf-idf matrix.
    :param domain2data: the mapping of domains to a (tokenized_reviews, labels,
                        tokenized_unlabeled_reviews) tuple
    :param vocab: the Vocabulary class
    :return: a mapping of domains to a [labeled_tfidf_matrix, labels,
             unlabeled_tfidf_matrix] list where both tfidf matrices are
             scipy.sparse.csr_matrix with shape (num_examples, vocab_size);
             the unlabeled matrix is None if a domain has no unlabeled
             examples
    """
    domain2tfidf_data = {}
    for domain, (labeled_examples, labels, unlabeled_examples) in domain2data.items():

        # apply the vectorizer to the already tokenized and pre-processed
        # input; tokenizer and preprocessor are therefore identity functions
        vectorizer = TfidfVectorizer(vocabulary=vocab.word2id,
                                     tokenizer=lambda x: x,
                                     preprocessor=lambda x: x)

        # fit the vectorizer to both labeled and unlabeled examples but keep
        # the transformed examples separate
        vectorizer.fit(labeled_examples + unlabeled_examples)
        tfidf_labeled_examples = vectorizer.transform(labeled_examples)

        # note: we cap unlabeled examples at 100k (only relevant for the books
        # domain in the large-scale setting)
        unlabeled_examples = unlabeled_examples[:100000]
        tfidf_unlabeled_examples = vectorizer.transform(unlabeled_examples) \
            if len(unlabeled_examples) != 0 else None

        # use the public scipy.sparse.csr_matrix name; the scipy.sparse.csr
        # submodule is a deprecated private namespace
        assert isinstance(tfidf_labeled_examples, scipy.sparse.csr_matrix),\
            'The input is not a sparse matrix.'
        assert isinstance(labels, np.ndarray), 'Labels are not a numpy array.'
        domain2tfidf_data[domain] = [tfidf_labeled_examples, labels,
                                     tfidf_unlabeled_examples]
    return domain2tfidf_data
+
+
def log_to_file(log_file, run_dict, trg_domain, args):
    """
    Append one tab-separated result line per method to the log file. Methods
    without any scores are skipped.
    :param log_file: the file used for logging
    :param run_dict: a dictionary mapping a method name to a list of
                     (val_accuracy, test_accuracy) tuples or a list
                     of (val_accuracy, test_accuracy, best_feature_weight)
                     tuples for the bayes-opt method
    :param trg_domain: the target domain
    :param args: the arguments used as input to the script
    """
    with open(log_file, 'a') as log:
        for method, scores in run_dict.items():
            if len(scores) == 0:
                continue

            # transpose the per-run tuples into one sequence per column
            columns = list(zip(*scores))
            if method.startswith('bayes-opt'):
                val_accuracies, test_accuracies, best_feature_weights = columns
            else:
                val_accuracies, test_accuracies = columns
                best_feature_weights = ''

            # target domain, method, feature sets, mean (+-std) of the
            # validation and test scores, the individual scores, the learned
            # feature weights, and finally all script arguments
            fields = [
                trg_domain,
                method,
                ' '.join(args.feature_sets),
                '%.4f (+-%.4f)' % (np.mean(val_accuracies),
                                   np.std(val_accuracies)),
                '%.4f (+-%.4f)' % (np.mean(test_accuracies),
                                   np.std(test_accuracies)),
                '[%s]' % ', '.join('%.4f' % v for v in val_accuracies),
                '[%s]' % ', '.join('%.4f' % t for t in test_accuracies),
                str(list(best_feature_weights)),
                ' '.join('%s=%s' % (name, str(getattr(args, name)))
                         for name in vars(args)),
            ]
            log.write('\t'.join(str(field) for field in fields) + '\n')
+
+
def read_feature_weights_file(feature_weights_path):
    """
    Lazily reads a manually created file that stores learned feature weights
    for some task, trg domain, and feature set.
    Each line has three fields separated by ~ (chosen so that it does not
    clash with the delimiters used inside the feature sets), e.g.:
    books~similarity diversity~[0.0, -0.66, -0.66, 0.66, 0.66, -0.66, 0.66, 0.0, 0.0, -0.66, 0.66, 0.66]
    ...
    :param feature_weights_path: the path to the feature weights file
    :return: a generator of tuples (feature_weights_domain, feature_set,
             feature_weights) where feature_weights is a list of floats
    """
    print('Reading feature weights from %s...' % feature_weights_path)
    with open(feature_weights_path, 'r') as weights_file:
        for row in weights_file:
            feature_weights_domain, feature_set, raw_weights = row.split('~')

            # drop the surrounding brackets and the line break, then parse
            # the comma-separated numbers
            weight_strings = raw_weights.strip('[]\n').split(', ')
            feature_weights = list(map(float, weight_strings))
            print('Feature weights domain: %s. Feature set: %s. '
                  'Feature weights: %s' %
                  (feature_weights_domain, feature_set, str(feature_weights)))
            yield feature_weights_domain, feature_set, feature_weights
+
+
def task2read_data_func(task):
    """
    Selects the function that reads the data of a task.
    :param task: the task; sentiment, one of the two tagging tasks, or parsing
    :return: the data reading function of the task
    :raises ValueError: if no reading function exists for the task
    """
    if task == SENTIMENT:
        reader = read_processed
    elif task == POS or task == POS_BILSTM:
        reader = read_tagging_data
    elif task == PARSING:
        reader = read_parsing_data
    else:
        raise ValueError(
            'No data reading function available for task %s.' % task)
    return reader
+
+
+# =============== sentiment data functions =======
+
def read_processed(dir_path):
    """
    Reads the pre-processed review files of every sentiment domain found in
    the processed_acl directory.
    :param dir_path: the directory containing the processed_acl folder
    :return: a dictionary that maps domains to a list of
             [labeled_reviews, labels, unlabeled_reviews]; labeled_reviews is
             a list of reviews where each review is a list of (unordered)
             ngrams; labels is a numpy array with one label id per labeled
             review; unlabeled_reviews has the same format as labeled_reviews
    """
    domains_path = os.path.join(dir_path, 'processed_acl')
    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
                                          domains_path)
    domains = os.listdir(domains_path)
    assert set(domains) == set(SENTIMENT_TRG_DOMAINS)

    # every domain starts with empty labeled reviews, empty labels, and no
    # unlabeled reviews
    domain2data = {domain: [[], [], None] for domain in domains}
    for domain in domains:
        print('Processing %s...' % domain)
        domain_data = domain2data[domain]

        # each domain folder holds positive.review, negative.review, and
        # unlabeled.review
        for split in ['positive', 'negative', 'unlabeled']:
            print('Processing %s/%s...' % (domain, split), end='')
            file_path = os.path.join(domains_path, domain, '%s.review' % split)
            assert os.path.exists(file_path), '%s does not exist.' % file_path

            reviews = []
            with open(file_path, encoding='utf-8') as review_file:
                for line in review_file:
                    # a line is a space-separated list of "ngram:count"
                    # features, e.g. "must:1", "still_has:1"; the last field
                    # of the line is not used as a feature
                    review = []
                    for feature in line.split(' ')[:-1]:
                        ngram, count = feature.split(':')

                        # repeat the ngram according to its count so that the
                        # review looks like a (unordered) token sequence and
                        # can be post-processed like data from other sources
                        review.extend([ngram] * int(count))
                    reviews.append(review)

            if split == 'unlabeled':
                # unlabeled reviews are stored at the third position
                domain_data[2] = reviews
            else:
                # labeled reviews get the polarity of the file they are in
                domain_data[0].extend(reviews)
                domain_data[1].extend([sentiment2id(split)] * len(reviews))

            print(' Processed %d reviews.' % len(reviews))
        domain_data[1] = np.array(domain_data[1])
    return domain2data
+
+
def sentiment2id(sentiment):
    """
    Converts a sentiment name to its label id.
    :param sentiment: the sentiment; one of [positive, pos, negative, neg]
    :return: the id of the specified sentiment
    :raises ValueError: if the sentiment is none of the accepted names
    """
    positive_names = ('positive', 'pos')
    negative_names = ('negative', 'neg')
    if sentiment not in positive_names and sentiment not in negative_names:
        raise ValueError('%s is not a valid sentiment.' % sentiment)
    if sentiment in positive_names:
        return POS_ID
    return NEG_ID
+
+
+# =============== tagging data functions ======
+
def read_tagging_data(dir_path, top_k_unlabeled=2000):
    """
    Reads the CoNLL tagging files in the gweb_sancl/pos directory. Outputs the
    documents as list of lists with tokens and lists of corresponding tags.
    The domains are answers, emails, newsgroups, reviews, weblogs, wsj and
    the corresponding files are called gweb-{domain}-{dev|test}.conll in folder
    gweb_sancl/pos/{domain}; the unlabeled sentences are read from
    gweb_sancl/unlabeled/gweb-{domain}.unlabeled.txt
    :param dir_path: the path to the directory gweb_sancl
    :param top_k_unlabeled: only use the top k unlabeled examples
    :return: a dictionary that maps domains to a list of [labeled_examples,
             labels, unlabeled_examples]; labeled_examples is a list of
             sentences where each sentence is a list of tokens; labels
             is a 1-D numpy object array holding the list of tags of each
             sentence; unlabeled_examples has the same format as
             labeled_examples
    """
    domains_path = os.path.join(dir_path, 'pos')
    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
                                          domains_path)
    domains = os.listdir(domains_path)
    print(domains)
    assert set(domains) == set(POS_PARSING_TRG_DOMAINS)
    domain2data = {domain: [[], [], None] for domain in domains}
    for domain in domains:
        print('Processing %s...' % domain)
        # file names are pos/{domain}/gweb-{domain}-{dev|test}.conll
        splits = ['dev', 'test', 'unlabeled']
        for split in splits:
            print('Processing %s/%s...' % (domain, split), end='')

            if split == 'unlabeled':
                file_path = os.path.join(dir_path, 'unlabeled',
                                         'gweb-%s.unlabeled.txt' % (domain))
                assert os.path.exists(file_path), ('%s does not exist.' %
                                                   file_path)
                unlabeled_data = []
                print(file_path)
                # read as bytes and drop undecodable bytes instead of failing
                with open(file_path, 'rb') as f:
                    for line in f:
                        line = line.decode('utf-8', 'ignore').strip().split()
                        unlabeled_data.append(line)
                print('Read %s number of unlabeled sentences'
                      % len(unlabeled_data))

                # add the unlabeled sentences at the third position
                unlabeled_data = unlabeled_data[:top_k_unlabeled]
                print('Took top {} documents '.format(top_k_unlabeled))
                domain2data[domain][2] = unlabeled_data
            else:
                file_path = os.path.join(domains_path, domain,
                                         'gweb-%s-%s.conll' % (domain, split))
                assert os.path.exists(file_path), ('%s does not exist.' %
                                                   file_path)

                data = list(read_conll_file(file_path))
                words = [words for words, tags in data]
                tags = [tags for words, tags in data]
                domain2data[domain][0] += words
                domain2data[domain][1] += tags

                print(' Processed %d sentences.' % len(data))

        # the tag sequences have different lengths; np.array() on such a
        # ragged list raises on recent NumPy versions, so fill a 1-D object
        # array with one tag list per sentence explicitly
        tag_sequences = domain2data[domain][1]
        labels = np.empty(len(tag_sequences), dtype=object)
        for i, tag_sequence in enumerate(tag_sequences):
            labels[i] = tag_sequence
        domain2data[domain][1] = labels
    return domain2data
+
+
+# =============== parsing data functions ======
+
def read_parsing_data(dir_path, top_k_unlabeled=2000):
    """
    Reads the CoNLL parsing files in the 'parse' sub-directory and the
    unlabeled sentences in the 'unlabeled' sub-directory of dir_path.
    :param dir_path: The gweb_sancl directory path.
    :param top_k_unlabeled: only use the top k unlabeled examples
    :return: a dictionary that maps domains to a list of [
             labeled_conll_entries, pseudo_labels, unlabeled_sentences];
             labeled_conll_entries holds whatever read_conll yields for the
             dev and test files; since the parsing model does not use
             explicit labels, pseudo_labels is a numpy array with a 0 for
             each of these entries; unlabeled_sentences is a list of
             whitespace-tokenized sentences used as unlabeled data
    """
    domains_path = os.path.join(dir_path, 'parse')
    assert os.path.exists(domains_path), ('Error: %s does not exist.' %
                                          domains_path)
    domains = list(os.listdir(domains_path))
    print(domains)
    assert set(domains) == set(POS_PARSING_TRG_DOMAINS)
    domain2data = {domain: [[], [], None] for domain in domains}
    for domain in domains:
        print('Processing %s...' % domain)
        domain_data = domain2data[domain]
        for split in ['dev', 'test', 'unlabeled']:
            print('Processing %s/%s...' % (domain, split), end='')

            if split != 'unlabeled':
                # labeled files are parse/{domain}/gweb-{domain}-{split}.conll
                # except for the wsj test file, which has another prefix
                if domain == 'wsj' and split == 'test':
                    name_pattern = 'ontonotes-%s-%s.conll'
                else:
                    name_pattern = 'gweb-%s-%s.conll'
                file_path = os.path.join(domains_path, domain,
                                         name_pattern % (domain, split))
                assert os.path.exists(file_path), ('%s does not exist.' %
                                                   file_path)

                with open(file_path, 'r') as conll_file:
                    data = list(read_conll(conll_file))
                domain_data[0] += data

                # add pseudo-labels since the model doesn't use explicit
                # labels for training
                domain_data[1] += [0] * len(data)
                continue

            file_path = os.path.join(dir_path, 'unlabeled',
                                     'gweb-%s.unlabeled.txt' % (domain))
            assert os.path.exists(file_path), ('%s does not exist.' %
                                               file_path)

            # read as bytes and drop undecodable bytes instead of failing
            with open(file_path, 'rb') as unlabeled_file:
                unlabeled_data = [
                    raw_line.decode('utf-8', 'ignore').strip().split()
                    for raw_line in unlabeled_file]
            print('Read %s number of unlabeled sentences' % len(unlabeled_data))

            # keep only the first sentences and store them at the third
            # position
            unlabeled_data = unlabeled_data[:top_k_unlabeled]
            print('Took top {} documents '.format(top_k_unlabeled))
            domain_data[2] = unlabeled_data
        domain_data[1] = np.array(domain_data[1])
    return domain2data
+
+
def read_parsing_evaluation(evaluation_file_path):
    """
    Extract the three scores reported at the top of a file produced by the
    parsing evaluation perl script. The first three lines of the file are
    expected to look like this:
    Labeled   attachment score: 6995 / 9615 * 100 = 72.75 %
    Unlabeled attachment score: 7472 / 9615 * 100 = 77.71 %
    Label accuracy score:       8038 / 9615 * 100 = 83.60 %
    ...
    If the file cannot be read or parsed, all three scores are 0.0.
    :param evaluation_file_path: the path of the evaluation file produced by the perl script
    :return: the labeled attachment score, the unlabeled attachment score, and the label accuracy score
    """
    try:
        with open(evaluation_file_path, 'r') as report:
            rows = report.readlines()

        # the score is the number between the '=' and the trailing ' %'
        scores = []
        for row in (rows[0], rows[1], rows[2]):
            scores.append(float(row.split('=')[1].strip('% \n')))
        las, uas, acc = scores
    except Exception:
        # best effort: a missing or malformed file yields zero scores
        las, uas, acc = 0.0, 0.0, 0.0
    return las, uas, acc
diff --git a/similarity.py b/similarity.py
new file mode 100644
index 0000000..fdf91a0
--- /dev/null
+++ b/similarity.py
@@ -0,0 +1,342 @@
+"""
+Methods for measuring domain similarity according to different metrics based on
+different representations.
+"""
+
+import os
+
+from sklearn.feature_extraction.text import CountVectorizer
+import gensim
+
+import numpy as np
+import scipy.stats
+import scipy.spatial.distance
+
+
+# SIMILARITY MEASURES
+
def jensen_shannon_divergence(repr1, repr2):
    """
    Calculates a similarity based on the Jensen-Shannon divergence
    (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence), i.e.
    1 - JSD(repr1, repr2).
    :param repr1: the first distribution as a numpy array
    :param repr2: the second distribution as a numpy array
    :return: 1 minus the Jensen-Shannon divergence of the two distributions,
             or 0 if that value is not finite
    """
    avg_repr = 0.5 * (repr1 + repr2)

    # the divergence is the mean of the KL divergences of BOTH
    # representations from their average
    divergence = 0.5 * (scipy.stats.entropy(repr1, avg_repr) +
                        scipy.stats.entropy(repr2, avg_repr))
    sim = 1 - divergence
    if not np.isfinite(sim):
        # the value is inf or nan if no term in the document is in the
        # vocabulary, i.e. if a representation is all zeros
        return 0
    return sim
+
+
def renyi_divergence(repr1, repr2, alpha=0.99):
    """
    Calculates Renyi divergence (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R.C3.A9nyi_divergence).
    :param repr1: the first distribution
    :param repr2: the second distribution
    :param alpha: the order of the divergence
    :return: the Renyi divergence of order alpha, or 0 if it is infinite
    """
    # per-term contributions p^alpha / q^(alpha-1)
    terms = []
    for p, q in zip(repr1, repr2):
        terms.append(np.power(p, alpha) / np.power(q, alpha-1))
    log_sum = np.sum(terms)
    sim = 1 / (alpha - 1) * np.log(log_sum)

    # an infinite value (e.g. when no term in the document is in the
    # vocabulary) is mapped to 0
    return 0 if np.isinf(sim) else sim
+
+
def cosine_similarity(repr1, repr2):
    """
    Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity).
    :param repr1: the first vector; may be None
    :param repr2: the second vector; may be None
    :return: the cosine similarity of the two vectors, or 0 if one of them
             is None or if the similarity is nan
    """
    if repr1 is None or repr2 is None:
        return 0

    # the inputs themselves must not contain nan or inf values
    assert not (np.isnan(repr2).any() or np.isinf(repr2).any())
    assert not (np.isnan(repr1).any() or np.isinf(repr1).any())

    cosine_distance = scipy.spatial.distance.cosine(repr1, repr2)
    sim = 1 - cosine_distance

    # the similarity is nan if no term in the document is in the vocabulary
    return 0 if np.isnan(sim) else sim
+
+
def euclidean_distance(repr1, repr2):
    """
    Calculates Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance).
    :param repr1: the first vector
    :param repr2: the second vector
    :return: the square root of the summed squared element-wise differences
    """
    squared_diffs = []
    for p, q in zip(repr1, repr2):
        squared_diffs.append(np.power(p-q, 2))
    return np.sqrt(np.sum(squared_diffs))
+
+
def variational_distance(repr1, repr2):
    """
    Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry).
    :param repr1: the first vector
    :param repr2: the second vector
    :return: the sum of the absolute element-wise differences
    """
    abs_diffs = []
    for p, q in zip(repr1, repr2):
        abs_diffs.append(np.abs(p-q))
    return np.sum(abs_diffs)
+
+
def kl_divergence(repr1, repr2):
    """
    Calculates Kullback-Leibler divergence (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).
    :param repr1: the first distribution
    :param repr2: the second distribution
    :return: the value of scipy.stats.entropy(repr1, repr2)
    """
    return scipy.stats.entropy(repr1, repr2)
+
+
def bhattacharyya_distance(repr1, repr2):
    """
    Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance).
    :param repr1: the first distribution
    :param repr2: the second distribution
    :return: the negative log of the summed square roots of the element-wise
             products, or 0 if that value is infinite
    """
    overlap_terms = []
    for p, q in zip(repr1, repr2):
        overlap_terms.append(np.sqrt(p*q))
    sim = - np.log(np.sum(overlap_terms))
    assert not np.isnan(sim), 'Error: Similarity is nan.'

    # the value is infinite if the distributions do not overlap at all, e.g.
    # if no term in the review is in the vocabulary
    return 0 if np.isinf(sim) else sim
+
+
def similarity_name2value(s_name, repr1, repr2):
    """
    Given a similarity function name, return the corresponding similarity function value.
    :param s_name: the name of the similarity function
    :param repr1: the first representation
    :param repr2: the second representation
    :return: the value of the named function for the two representations
    :raises ValueError: if the name does not belong to a known function
    """
    if s_name == 'jensen-shannon':
        measure = jensen_shannon_divergence
    elif s_name == 'renyi':
        measure = renyi_divergence
    elif s_name in ('cos', 'cosine'):
        measure = cosine_similarity
    elif s_name == 'euclidean':
        measure = euclidean_distance
    elif s_name == 'variational':
        measure = variational_distance
    elif s_name == 'kl':
        measure = kl_divergence
    elif s_name == 'bhattacharyya':
        measure = bhattacharyya_distance
    else:
        raise ValueError('%s is not a valid feature name.' % s_name)
    return measure(repr1, repr2)
+
+
+# TERM DISTRIBUTIONS
+
def get_domain_term_dists(term_dist_path, domain2data, vocab, lowercase=True):
    """
    Retrieves the relative term distribution of every domain. The
    distributions are loaded from term_dist_path if that file exists;
    otherwise they are computed from the data and written to that file.
    :param term_dist_path: the path where the term distributions of the domains
                           should be saved
    :param domain2data: the mapping of domains to (labeled_examples, labels,
                        unlabeled_examples) tuples
    :param vocab: the Vocabulary object
    :param lowercase: lower-case the input data
    :return: a mapping of domains to their term distributions,
             i.e. a numpy array of shape (vocab_size,)
    """
    domain2term_dist = {}
    if not os.path.exists(term_dist_path):
        if lowercase:
            print('Lower-casing the data for calculating the term distributions...')

        # the distribution of a domain covers its labeled and unlabeled data
        for domain, (examples, _, unlabeled_examples) in domain2data.items():
            all_examples = examples + unlabeled_examples
            domain2term_dist[domain] = get_term_dist(
                all_examples, vocab, lowercase)

        print('Writing relative frequency distributions to %s...' % term_dist_path)
        with open(term_dist_path, 'w') as out_file:
            for domain, term_dist in domain2term_dist.items():
                values = ' '.join([str(c) for c in term_dist])
                out_file.write('%s\t%s\n' % (domain, values))
        return domain2term_dist

    print('Loading the term distributions from file...')
    with open(term_dist_path, 'r') as in_file:
        for row in in_file:
            # each line is "domain<TAB>space-separated frequencies"
            domain, values = row.strip().split('\t')
            term_dist = np.fromstring(values, count=vocab.size, sep=' ')
            assert len(term_dist) == vocab.size,\
                ('Length of term dist for %s should be %d, is %d.' %
                 (domain, vocab.size, len(term_dist)))
            assert np.round(np.sum(term_dist), 6) == 1,\
                ('Sum of term distribution is %.6f instead of 1. The '
                 'vocabulary was likely created with a larger '
                 'max_vocab_size.' % np.sum(term_dist))
            domain2term_dist[domain] = term_dist

    # the stored file has to cover exactly the requested domains
    assert set(domain2term_dist.keys()) == set(domain2data.keys()),\
        ('Term distributions are not saved for all domains: "%s" and "%s"'
         'are not equal.' % (' '.join(domain2term_dist.keys()),
                             ' '.join(domain2data.keys())))
    return domain2term_dist
+
+
def get_term_dist(docs, vocab, lowercase=True):
    """
    Calculates the relative frequency of every vocabulary word in a list of
    documents. Words that are not in the vocabulary are ignored.
    :param docs: a list of tokenized docs; can also contain a single document
    :param vocab: the Vocabulary object
    :param lowercase: lower-case the input data
    :return: the term distribution of the input documents,
             i.e. a numpy array of shape (vocab_size,)
    """
    term_dist = np.zeros(vocab.size)
    for doc in docs:
        for token in doc:
            key = token.lower() if lowercase else token
            token_id = vocab.word2id.get(key)
            if token_id is not None:
                term_dist[token_id] += 1

    # normalize absolute freqs to obtain a relative frequency term distribution
    term_dist /= np.sum(term_dist)

    # the sum is nan if none of the words is in the vocabulary (division of
    # zeros by zero), e.g. for a single out-of-vocabulary document; an
    # all-zero distribution is returned in this case
    if np.isnan(np.sum(term_dist)):
        return np.zeros(vocab.size)
    return term_dist
+
+
def get_most_similar_domain(trg_domain, domain2term_dists,
                            similarity_name='jensen-shannon'):
    """
    Given a target domain, retrieve the other domain with the highest score
    according to some domain similarity measure (default: Jensen-Shannon
    divergence).
    NOTE(review): only scores strictly greater than 0 are considered, so None
    is returned if no other domain scores above 0.
    NOTE(review): the highest score wins for every measure; for measures
    that are distances this presumably selects the least similar domain --
    confirm before using a measure other than the default.
    :param trg_domain: the target domain
    :param domain2term_dists: a mapping of domain names to their term distribution
                              (a numpy array of shape (vocab_size,) )
    :param similarity_name: a string indicating the name of the similarity
                            measure used (default: 'jensen-shannon')
    :return: the domain most similar to the target domain
    """
    trg_term_dist = domain2term_dists[trg_domain]
    best_domain, best_score = None, 0
    for candidate in domain2term_dists:
        if candidate != trg_domain:
            score = similarity_name2value(
                similarity_name, domain2term_dists[candidate], trg_term_dist)
            if score > best_score:
                best_domain, best_score = candidate, score
    return best_domain
+
+
+# TOPIC DISTRIBUTIONS
+
def train_topic_model(examples, vocab, num_topics=50, num_iterations=2000,
                      num_passes=10):
    """
    Vectorizes the provided list of tokenised documents with a
    CountVectorizer restricted to the vocabulary and trains a gensim LDA
    topic model on the result.
    :param examples: a list of tokenised documents of all domains
    :param vocab: the Vocabulary object
    :param num_topics: the number of topics that should be used
    :param num_iterations: the number of iterations
    :param num_passes: the number of passes over the corpus that should be
                       performed
    :return: the CountVectorizer used for transforming the corpus and the
             trained LDA topic model
    """
    # the text is already tokenized and pre-processed, so tokenizer and
    # preprocessor are identity functions
    def identity(x):
        return x

    vectorizer = CountVectorizer(vocabulary=vocab.word2id,
                                 tokenizer=identity,
                                 preprocessor=identity)
    count_matrix = vectorizer.fit_transform(examples)

    # wrap the sparse count matrix (one document per row) as a gensim corpus
    lda_corpus = gensim.matutils.Sparse2Corpus(count_matrix,
                                               documents_columns=False)
    print('Training LDA model on data of all domains with %d topics, '
          '%d iterations, %d passes...' % (num_topics, num_iterations,
                                           num_passes))
    lda_model = gensim.models.LdaMulticore(
        lda_corpus, num_topics=num_topics, id2word=vocab.id2word,
        iterations=num_iterations, passes=num_passes)
    return vectorizer, lda_model
+
+
def get_topic_distributions(examples, vectorizer, lda_model):
    """
    Computes the topic probabilities of every document in a collection.
    :param examples: a list of tokenised documents
    :param vectorizer: the CountVectorizer used for transforming the documents
    :param lda_model: the trained LDA model
    :return: an array of shape (num_examples, num_topics) containing the topic
             distribution of each example
    """
    count_matrix = vectorizer.transform(examples)
    gensim_corpus = gensim.matutils.Sparse2Corpus(count_matrix,
                                                  documents_columns=False)

    # a minimum probability of 0 requests the probability of every topic;
    # only the probabilities (not the topic ids) are kept
    return np.array([
        [topic_prob for _, topic_prob in
         lda_model.get_document_topics(doc, minimum_probability=0.)]
        for doc in gensim_corpus])
+
+
+# PRE-TRAINED WORD EMBEDDINGS METHODS
+
def load_word_vectors(file, vocab_word_vec_file, word2id, vector_size=300,
                      header=False):
    """
    Loads the vectors of all vocabulary words from a text file with one
    space-separated "word value value ..." entry per line, e.g. the one
    obtained from http://nlp.stanford.edu/projects/glove/. The vectors of the
    vocabulary words are cached in a second file; if that file already
    exists, the vectors are read from it instead.
    :param file: the file the word vectors should be loaded from
    :param vocab_word_vec_file: the file where the word embeddings in the
                                vocabulary can be stored for faster retrieval
    :param word2id: the mapping of words to their ids in the vocabulary
    :param vector_size: the size of the word vectors
    :param header: whether the word vectors text file contains a header;
                   default is False
    :return a dictionary mapping each word to its numpy word vector
    """
    word2vector = {}

    # fast path: the vocabulary vectors have been cached before
    if os.path.exists(vocab_word_vec_file):
        print('Loading vocabulary word vectors from %s...' % vocab_word_vec_file)
        with open(vocab_word_vec_file, 'r', encoding='utf-8') as cache_file:
            for entry in cache_file:
                # the word is everything before the first space, the values
                # are everything after it
                token, _, values = entry.partition(' ')
                assert token in word2id, ('Error: %s in vocab word vec file is '
                                          'not in vocab.' % token)
                vector = np.fromstring(values.strip(), dtype=float, sep=' ')
                assert len(vector) == vector_size,\
                    ('Error: %d != vector size %d for word %s.'
                     % (len(vector), vector_size, token))
                word2vector[token] = vector
        return word2vector

    print('Reading word vectors from %s...' % file)
    with open(file, 'r', encoding='utf-8') as vector_file:
        for line_no, entry in enumerate(vector_file):
            if header and line_no == 0:
                continue
            if line_no > 0 and line_no % 100000 == 0:
                print('Processed %d vectors.' % line_no)
            token, _, values = entry.partition(' ')

            # only the vectors of vocabulary words are kept
            if token in word2id:
                vector = np.fromstring(values.strip(), dtype=float, sep=' ')
                assert len(vector) == vector_size
                word2vector[token] = vector

    print('Writing word vectors to %s...' % vocab_word_vec_file)
    with open(vocab_word_vec_file, 'w', encoding='utf-8') as cache_file:
        for token, vector in word2vector.items():
            values = ' '.join([str(c) for c in vector])
            cache_file.write('%s %s\n' % (token, values))
    return word2vector
+
+
def weighted_sum_of_embeddings(docs, word2id, word2vector, term_dist):
    """
    Represent each document of a list of documents belonging to one domain
    as the weighted average of the embeddings of its words. The documents are
    represented as a list of ngrams. Also works if the list only contains a
    single document.
    :param docs: a list of documents
    :param word2id: the mapping of words to their ids in the vocabulary
    :param word2vector: the mapping of words to their vector representations
    :param term_dist: the term distribution of the data the words belong to
    :return: an array with one vector representation per document
    """
    # the factor with which the word probability is smoothed, we empirically
    # set this to the value used in Mikolov et al. (2013)
    t = 10e-5
    doc_vectors = []
    for doc in docs:
        embedding_size = len(list(word2vector.values())[0])
        weighted_sum = np.zeros(embedding_size)

        # only words that have an embedding contribute to the representation
        embedded_words = [word for word in doc if word in word2vector]
        for word in embedded_words:
            # weight the vector with the smoothed inverse probability of
            # the word
            weight = np.sqrt(t / (term_dist[word2id[word]]))
            weighted_sum += weight * word2vector[word]

        # a document without any embedded word (this might be because the
        # review is in another language by accident) is divided by 1 to
        # avoid division by 0
        weighted_sum /= max(len(embedded_words), 1)
        assert not (np.isnan(weighted_sum).any() or np.isinf(weighted_sum).any())
        doc_vectors.append(weighted_sum)
    return np.array(doc_vectors)
diff --git a/simpletagger.py b/simpletagger.py
new file mode 100644
index 0000000..5b39a05
--- /dev/null
+++ b/simpletagger.py
@@ -0,0 +1,359 @@
+#!/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Simple structured perceptron tagger (bplank, parts by andersjo) - Language Proc 2
+import argparse
+import codecs
+from collections import defaultdict, Counter
+import json
+import re
+import numpy as np
+import sys
+import random
+
+np.set_printoptions(precision=4)
+
+
def read_conll_file(file_name):
    """
    read in a file with format:
    word1    tag1
    ...      ...
    wordN    tagN

    Word and tag are separated by a single tab. Sentences MUST be separated
    by blank lines; several consecutive blank lines are treated as one
    separator, so no empty sentences are produced.

    :param file_name: file to read in (decoded as UTF-8)
    :return: generator of instances ((list of words, list of tags) pairs)
    """
    current_words = []
    current_tags = []

    # the context manager makes sure the handle is closed once the generator
    # is exhausted or discarded
    with codecs.open(file_name, encoding='utf-8') as conll_file:
        for line in conll_file:
            line = line.strip()

            if line:
                # raises ValueError if the line does not have exactly two
                # tab-separated columns
                word, tag = line.split('\t')
                current_words.append(word)
                current_tags.append(tag)
            elif current_words:
                # a blank line ends the sentence collected so far
                yield (current_words, current_tags)
                current_words = []
                current_tags = []

    # if file does not end in a blank line (it should...), emit the instance
    # that is still in the buffer
    if current_words:
        yield (current_words, current_tags)
+
+
def memoize(f):
    """
    helper function to be used as decorator to memoize features

    The cache key is built from all positional arguments except the first
    one, so when a method is decorated ``self`` is ignored and the cache is
    shared by all instances. The remaining arguments must be hashable.
    :param f: the function (or method) whose results should be cached
    :return: a wrapper around f that computes each result only once
    """
    # imported locally so that the decorator does not rely on a file-level
    # import of functools
    import functools

    memo = {}

    @functools.wraps(f)  # keep the name and docstring of the wrapped function
    def helper(*args):
        key = args[1:]
        try:
            return memo[key]
        except KeyError:
            result = memo[key] = f(*args)
            return result
    return helper
+
+
class StructuredPerceptron(object):
    """
    implements a structured perceptron as described in Collins 2002

    The score of a tag sequence is the sum of the weights of the features
    produced by get_features for every (word, tag, previous tag) triple;
    decode finds the best sequence with a first-order Viterbi search.
    All features are strings, so the weights can be stored as JSON.
    """

    def __init__(self, seed=1512141834):
        """
        initialize model
        :param seed: value used to seed both the random module and numpy's
                     global random number generator
        :return:
        """
        self.feature_weights = defaultdict(float)
        self.tags = set()

        self.START = "__START__"
        self.END = "__END__"
        print("using seed: {}".format(seed))
        random.seed(seed)
        np.random.seed(seed)

    def fit(self, train_data, iterations=5, learning_rate=0.2):
        """
        iterate over the training instances and update the weight vector with
        the perceptron rule; afterwards the model uses the sum of the weight
        vectors obtained at the end of every iteration
        :param train_data: an iterable of (words, tags) pairs
        :param iterations: the number of passes over the training data
        :param learning_rate: the step size of every weight update
        :return:
        """
        # work on a private copy so that shuffling between the iterations
        # does not reorder the list owned by the caller
        train_data = list(train_data)
        averaged_weights = Counter()

        for iteration in range(iterations):
            correct = 0
            total = 0.0
            sys.stderr.write('iteration %s\n************\n' % (iteration+1))

            for i, (words, tags) in enumerate(train_data):
                # progress indicator
                if i%100==0:
                    sys.stderr.write('%s'%i)
                elif i%10==0:
                    sys.stderr.write('.')

                # the tag set grows while reading the training data
                self.tags.update(tags)

                # get prediction
                prediction = self.decode(words)

                # derive global features
                global_gold_features = self.get_global_features(words, tags)
                global_prediction_features = self.get_global_features(words, prediction)

                # update weight vector: reward the gold features, penalise
                # the predicted ones (both cancel out for correct parts)
                for fid, count in global_gold_features.items():
                    self.feature_weights[fid] += learning_rate * count
                for fid, count in global_prediction_features.items():
                    self.feature_weights[fid] -= learning_rate * count

                # compute training accuracy for this iteration
                correct += sum(1 for (predicted, gold) in zip(prediction, tags) if predicted == gold)
                total += len(tags)

            averaged_weights.update(self.feature_weights)
            # guard against empty training data
            accuracy = correct/total if total else 0.0
            sys.stderr.write('\tTraining accuracy: %.4f\n\n' % accuracy)

            random.shuffle(train_data)

        self.feature_weights = averaged_weights

    def get_global_features(self, words, tags):
        """
        count how often each feature fired for the whole sentence
        :param words: the words of the sentence
        :param tags: the tag sequence assigned to the words
        :return: a Counter mapping each feature to its number of occurrences
        """
        feature_counts = Counter()

        for i, (word, tag) in enumerate(zip(words, tags)):
            previous_tag = self.START if i == 0 else tags[i-1]
            feature_counts.update(self.get_features(word, tag, previous_tag))

        return feature_counts

    @memoize
    def get_features(self, word, tag, previous_tag):
        """
        get all features that can be derived from the word and tags
        :param word: the current word
        :param tag: the tag of the current word
        :param previous_tag: the tag of the preceding word (self.START for
                             the first word)
        :return: a list of feature strings
        """
        word_lower = word.lower()
        prefix = word_lower[:3]
        suffix = word_lower[-3:]

        features = [
            'TAG_%s' % (tag),                           # current tag
            'TAG_BIGRAM_%s_%s' % (previous_tag, tag),   # tag bigrams
            'WORD+TAG_%s_%s' % (word, tag),             # word-tag combination
            'WORD_LOWER+TAG_%s_%s' % (word_lower, tag), # word-tag combination (lowercase)
            # word starts with uppercase letter; the slice (instead of
            # word[0]) also works for an empty word
            'UPPER_%s_%s' % (word[:1].isupper(), tag),
            'DASH_%s_%s' % ('-' in word, tag),          # word contains a dash
            'PREFIX+TAG_%s_%s' % (prefix, tag),         # prefix and tag
            'SUFFIX+TAG_%s_%s' % (suffix, tag),         # suffix and tag

            # word shape and tag; this has to be a string like all other
            # features because json cannot serialise tuple keys in save()
            'WORDSHAPE_%s_%s' % (self.shape(word), tag),
            'WORD+TAG_BIGRAM_%s_%s_%s' % (word, tag, previous_tag),
            'SUFFIX+2TAGS_%s_%s_%s' % (suffix, previous_tag, tag),
            'PREFIX+2TAGS_%s_%s_%s' % (prefix, previous_tag, tag)
        ]

        return features

    @memoize
    def shape(self, x):
        """
        map a word to its shape: uppercase characters become 'X', lowercase
        characters 'x', the digits 0-9 'd'; all other characters are kept
        :param x: the word
        :return: the shape string in which every run of 'x' is collapsed
                 into 'x*'
        """
        result = []
        for c in x:
            if c.isupper():
                result.append('X')
            elif c.islower():
                result.append('x')
            elif c in '0123456789':
                result.append('d')
            else:
                result.append(c)

        # collapse every run of lowercase markers (also a single one) into 'x*'
        return re.sub(r"x+", "x*", ''.join(result))

    def decode(self,words):
        """
        Find best sequence
        :param words: the words of the sentence
        :return: the list of predicted tags, one per word; an empty list for
                 an empty sentence
        :raises ValueError: if the model does not know any tag yet
        """
        N=len(words)
        if N == 0:
            # nothing to tag
            return []

        # iterate over the tags in sorted order: the iteration order of a set
        # of strings changes between interpreter runs, which would change how
        # ties between equally scored tags are broken
        tags=sorted(self.tags)
        M=len(tags) #number of tags
        if M == 0:
            raise ValueError('cannot decode: the model has no tags (train or load a model first)')

        # look weights up with .get so that unseen features count as 0
        # without being inserted into the weight dictionary
        weights = self.feature_weights

        # create trellis of size M (number of tags) x N (sentence length)
        Q = np.ones((M, N)) * float('-Inf')
        backp = np.ones((M, N), dtype=np.int16) * -1 #backpointers

        ### initialization step
        for j in range(M):
            # initialize scores for tags j at position 1 (first word)
            features = self.get_features(words[0], tags[j], self.START)
            Q[j,0] = sum(weights.get(x, 0.0) for x in features)

        # iteration step
        # filling the lattice, for every position and every tag find viterbi score Q
        for i in range(1,N):
            # for every tag
            for j in range(M):
                tag=tags[j]
                best_score = float('-Inf')

                # for every possible previous tag
                for k in range(M):
                    features = self.get_features(words[i], tag, tags[k])
                    # score until best step before + local score
                    score = Q[k,i-1] + sum(weights.get(x, 0.0) for x in features)

                    if score > best_score:
                        Q[j,i]=score
                        best_score = score
                        backp[j,i]=k #best previous tag

        # final best
        best_id=Q[:,-1].argmax()

        # follow the backpointers; this collects the tags in reverse order
        predtags=[tags[best_id]]
        for i in range(N-1,0,-1):
            best_id=int(backp[best_id,i])
            predtags.append(tags[best_id])

        return predtags[::-1]

    def predict(self, test_data):
        """
        Get predictions for entire test set
        :param test_data: an iterable of sentences (lists of words)
        :return: a list with one list of predicted tags per sentence
        """
        return [self.decode(words) for words in test_data]

    def predict_eval(self, test_data, output=False):
        """
        compute accuracy on test data and report it on stderr
        :param test_data: an iterable of (words, tags) pairs
        :param output: if True, print word, gold tag and predicted tag
                       (tab-separated) to stdout
        :return:
        """
        correct = 0
        total = 0.0
        num_items = 0
        sys.stderr.write('\nTesting\n')
        sys.stderr.write('*******\n')

        for i, (words, tags) in enumerate(test_data):
            if i%100==0:
                sys.stderr.write('%s'%i)
            elif i%10==0:
                sys.stderr.write('.')

            # get prediction
            prediction = self.decode(words)

            if output:
                for word, gold, pred in zip(words, tags, prediction):
                    print("{}\t{}\t{}".format(word, gold, pred))
                print("")

            correct += sum(1 for (predicted, gold) in zip(prediction, tags) if predicted == gold)
            total += len(tags)
            num_items = i+1

        # guard against empty test data
        accuracy = correct/total if total else 0.0
        print("\nTest accuracy on %s items: %.4f" % (num_items, accuracy), file=sys.stderr)

    def save(self, file_name):
        """
        save model as a single line of JSON holding the tags and the weights
        :param file_name: the file to write to
        :return:
        """
        print("saving model...", end=' ', file=sys.stderr)
        with codecs.open(file_name, "w", encoding='utf-8') as model:
            model.write("%s\n" % json.dumps({'tags': list(self.tags), 'weights': dict(self.feature_weights)}))
        print("done", file=sys.stderr)

    def load(self, file_name):
        """
        load model from JSON file (the first line of the file is read)
        :param file_name: the file written by save
        :return:
        """
        print("loading model...", end=' ', file=sys.stderr)
        with codecs.open(file_name, 'r', encoding='utf-8') as model_file:
            model_data = model_file.readline().strip()
        model = json.loads(model_data)
        self.tags = set(model['tags'])
        # keep the default of 0 for unseen features so that a loaded model
        # can be used for decoding and further training
        self.feature_weights = defaultdict(float, model['weights'])
        print("done", file=sys.stderr)
+
+
# command line entry point; the code below only runs when the file is executed
# as a script
if __name__=="__main__":

    # declare and read the command line options
    cli = argparse.ArgumentParser(description="""Run a structured perceptron""")
    cli.add_argument("--train", required=False,
                     help="train model on a file (CoNLL format)")
    cli.add_argument("--test", required=False,
                     help="test model on a file (CoNLL format)")
    cli.add_argument("--output", required=False, action="store_true",
                     help="output predictions to stdout")
    cli.add_argument("--load", required=False,
                     help="load model from JSON file")
    cli.add_argument("--save", required=False,
                     help="save model as JSON file")
    cli.add_argument("--iterations", required=False, default=5, type=int,
                     help="number of training iterations")
    cli.add_argument("--learning_rate", required=False, default=0.2, type=float,
                     help="learning rate during training")
    options = cli.parse_args()

    # start from a fresh model; loading, training, saving and testing are all
    # optional and happen in this order
    tagger = StructuredPerceptron()

    if options.load:
        tagger.load(options.load)

    if options.train:
        training_instances = list(read_conll_file(options.train))
        tagger.fit(training_instances,
                   iterations=options.iterations,
                   learning_rate=options.learning_rate)

    if options.save:
        tagger.save(options.save)

    # evaluate (and optionally print the predictions) if a test file is given
    if options.test:
        test_instances = list(read_conll_file(options.test))
        tagger.predict_eval(test_instances, output=options.output)
diff --git a/task_utils.py b/task_utils.py
new file mode 100644
index 0000000..99ab066
--- /dev/null
+++ b/task_utils.py
@@ -0,0 +1,409 @@
+"""
+Utility methods that are used for training and evaluation of the tasks.
+"""
+
+import os
+import operator
+import numpy as np
+import random
+from collections import namedtuple
+
+from sklearn import svm
+from sklearn.metrics import accuracy_score
+
+import data_utils
+from constants import POS_ID, NEG_ID, SENTIMENT, POS, POS_BILSTM, PARSING,\
+ BAYES_OPT
+from simpletagger import StructuredPerceptron
+
+from bist_parser.bmstparser.src import mstlstm
+from bist_parser.bmstparser.src.utils import vocab_conll, write_conll,\
+ write_original_conll
+
+from bilstm_tagger.src.simplebilty import SimpleBiltyTagger, load
+
+NUM_EPOCHS = 50
+PATIENCE = 2
+
+
def get_data_subsets(feature_vals, feature_weights, train_data, train_labels,
                     task, num_train_examples):
    """
    Select the training examples with the highest feature scores, where the
    score of an example is the dot product of its feature values and the
    feature weights. For sentiment the selection is stratified.
    :param feature_vals: a numpy array of shape (num_train_data, num_features)
                         containing the feature values
    :param feature_weights: a numpy array of shape (num_features, ) containing
                            the weight for each feature
    :param train_data: a sparse numpy array of shape (num_train_data, vocab_size)
                       containing the training data (a plain list also works)
    :param train_labels: a numpy array of shape (num_train_data) containing the
                         training labels
    :param task: the task; this determines whether we use stratification
    :param num_train_examples: the number of training examples for the
                               respective task
    :return: subsets of the training data and its labels as a tuple
    :raises ValueError: if the task is not supported
    """
    # one score per training example
    scores = feature_vals.dot(np.transpose(feature_weights))

    # rank the example indices from the highest to the lowest score
    ranked_pairs = sorted(enumerate(scores), key=operator.itemgetter(1),
                          reverse=True)
    ranked_indices, _ = zip(*ranked_pairs)

    if task == SENTIMENT:
        # keep the training set stratified: take the best n/2 positive and
        # the best n/2 negative examples instead of the best n overall
        per_class = int(num_train_examples/2)
        positives = [idx for idx in ranked_indices
                     if train_labels[idx] == POS_ID]
        negatives = [idx for idx in ranked_indices
                     if train_labels[idx] == NEG_ID]
        top_indices = positives[:per_class] + negatives[:per_class]
    elif task in [POS, POS_BILSTM, PARSING]:
        # no stratification needed for POS tagging and parsing
        top_indices = list(ranked_indices[:num_train_examples])
    else:
        raise ValueError('Top index retrieval not implemented for %s.' % task)

    if isinstance(train_data, list):
        # a list cannot be indexed with a list of indices like a numpy array
        picked_data = [train_data[idx] for idx in top_indices]
        return picked_data, train_labels[top_indices]

    # index data and labels with the selected indices
    return train_data[top_indices], train_labels[top_indices]
+
+
def task2train_and_evaluate_func(task):
    """
    Look up the train_and_evaluate function that belongs to a task.
    :param task: the task identifier
    :return: the function that trains and evaluates a model for the task
    :raises ValueError: if no function is implemented for the task
    """
    # checked in order; the first matching task wins
    task_functions = (
        (SENTIMENT, train_and_evaluate_sentiment),
        (POS, train_and_evaluate_pos),
        (POS_BILSTM, train_and_evaluate_pos_bilstm),
        (PARSING, train_and_evaluate_parsing),
    )
    for known_task, train_and_evaluate in task_functions:
        if task == known_task:
            return train_and_evaluate
    raise ValueError('Train_and_evaluate is not implemented for %s.' % task)
+
+
def train_and_evaluate_sentiment(train_data, train_labels, val_data, val_labels,
                                 test_data=None, test_labels=None,
                                 parser_output_path=None, perl_script_path=None):
    """
    Fit an SVM (svm.SVC with its default settings) on the training data and
    report its accuracy on the validation set and, if both test data and test
    labels are given, on the test set.
    :param train_data: the training data; a sparse numpy matrix of shape
                       (num_examples, max_vocab_size)
    :param train_labels: the training labels; a numpy array of shape (num_labels)
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: a tuple of the validation accuracy and the test accuracy; the
             latter is None if no test set was provided
    """
    print('Training the SVM on %d examples...' % train_data.shape[0])
    classifier = svm.SVC()
    classifier.fit(train_data, train_labels)

    # score the classifier on the validation set
    val_accuracy = accuracy_score(val_labels, classifier.predict(val_data))
    print('Val acc: %.5f' % val_accuracy)

    # without a complete test set there is nothing more to do
    if test_data is None or test_labels is None:
        return val_accuracy, None

    test_accuracy = accuracy_score(test_labels, classifier.predict(test_data))
    print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy
+
+
def train_and_evaluate_pos(train_data, train_labels, val_data, val_labels,
                           test_data=None, test_labels=None,
                           parser_output_path=None, perl_script_path=None):
    """
    Fit a StructuredPerceptron tagger (5 iterations, learning rate 0.2) on the
    training data and report its tag accuracy on the validation set and, if
    both test data and test labels are given, on the test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: a tuple of the validation accuracy and the test accuracy; the
             latter is None if no test set was provided
    """
    print('Training the tagger on %d examples...' % len(train_data))
    tagger = StructuredPerceptron()

    # the tagger expects one (words, tags) pair per sentence
    instances = list(zip(train_data, train_labels))
    tagger.fit(instances, iterations=5, learning_rate=0.2)

    # score the tagger on the validation set
    val_accuracy = pos_accuracy_score(val_labels, tagger.predict(val_data))
    print('Val acc: %.5f' % val_accuracy)

    # without a complete test set there is nothing more to do
    if test_data is None or test_labels is None:
        return val_accuracy, None

    test_accuracy = pos_accuracy_score(test_labels, tagger.predict(test_data))
    print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy
+
+
def train_and_evaluate_pos_bilstm(train_data, train_labels, val_data, val_labels,
                                  test_data=None, test_labels=None,
                                  parser_output_path=None, perl_script_path=None):
    """
    Fit a SimpleBiltyTagger on the training data for at most NUM_EPOCHS
    epochs (the validation data and PATIENCE are handed to its fit method for
    early stopping) and report the accuracy of the restored model on the
    validation set and, if both test data and test labels are given, on the
    test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: a tuple of the validation accuracy and the test accuracy; the
             latter is None if no test set was provided
    """
    print('Training the BiLSTM tagger on %d examples...' % len(train_data))

    # hyper-parameters handed to the tagger
    input_dim, hidden_dim, c_input_dim, num_hidden_layers = 64, 100, 100, 1
    optimizer = "adam"

    # the model is written to a temporary file from which it is restored
    # after training; the random suffix is meant to avoid name clashes
    # between parallel runs
    checkpoint_path = '/tmp/bilstm_tagger_model_%d' % random.randint(0, 1000000)

    tagger = SimpleBiltyTagger(input_dim, hidden_dim, c_input_dim,
                               num_hidden_layers, embeds_file=None)
    train_X, train_Y = tagger.get_train_data_from_instances(
        train_data, train_labels)
    val_X, val_Y = tagger.get_data_as_indices_from_instances(
        val_data, val_labels)

    tagger.fit(train_X, train_Y, NUM_EPOCHS, optimizer,
               val_X=val_X, val_Y=val_Y, patience=PATIENCE,
               model_path=checkpoint_path)

    # restore the saved model, then delete the model file and the
    # accompanying file with the parameters
    tagger = load(checkpoint_path)
    for obsolete_file in (checkpoint_path, checkpoint_path + '.pickle'):
        os.unlink(obsolete_file)

    num_correct, num_total = tagger.evaluate(val_X, val_Y)
    val_accuracy = num_correct / num_total
    print('Val acc: %.5f' % val_accuracy)

    # without a complete test set there is nothing more to do
    if test_data is None or test_labels is None:
        return val_accuracy, None

    test_X, test_Y = tagger.get_data_as_indices_from_instances(
        test_data, test_labels)
    num_correct, num_total = tagger.evaluate(test_X, test_Y)
    test_accuracy = num_correct / num_total
    print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy
+
+
def train_and_evaluate_parsing(train_data, train_labels, val_data, val_labels,
                               test_data=None, test_labels=None,
                               parser_output_path=None, perl_script_path=None):
    """
    Train an MSTParserLSTM on the training data for at most NUM_EPOCHS epochs.
    After every epoch the LAS on the validation set is computed with the perl
    evaluation script; the model with the best LAS is saved and training
    stops after PATIENCE epochs without improvement. If test data is given,
    the saved model is evaluated on it.
    :param train_data: the training data; a list of CoNLL entries
    :param train_labels: pseudo-labels; not used as labels as labels are
                         contained in train_data
    :param val_data: the validation data; same format as the training data
    :param val_labels: pseudo-labels; not used as contained in val_data
    :param test_data: the test data
    :param test_labels: pseudo-labels; not used as contained in test_data
    :param parser_output_path: the directory in which the gold files, the
                               predictions and the model are written
    :param perl_script_path: the path to the perl evaluation script
    :return: a tuple of the best validation LAS and the test LAS; the latter
             is None if no test data was provided
    """
    print('Training the parser on %d examples...' % len(train_data))

    # build the vocabulary; the test data is included if available as some
    # POS tags (e.g. XX) might only appear in the target domain
    vocab_sources = [train_data, val_data]
    if test_data is not None:
        vocab_sources.append(test_data)
    words, w2i, pos, rels = vocab_conll(np.hstack(vocab_sources))

    # the options object with which the parser is configured
    ParserOptions = namedtuple('parser_options', [
        'activation', 'blstmFlag', 'labelsFlag', 'costaugFlag', 'bibiFlag',
        'lstm_dims', 'wembedding_dims', 'pembedding_dims', 'rembedding_dims',
        'lstm_layers', 'external_embedding', 'hidden_units', 'hidden2_units',
        'epochs'])
    parser_options = ParserOptions(
        activation='tanh',
        blstmFlag=True,
        labelsFlag=True,
        costaugFlag=True,
        bibiFlag=False,
        lstm_dims=125,
        wembedding_dims=100,
        pembedding_dims=25,
        rembedding_dims=25,
        lstm_layers=2,
        external_embedding=None,
        hidden_units=100,
        hidden2_units=0,
        epochs=NUM_EPOCHS,
    )
    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)

    def run_evaluation_script(gold_path, pred_path):
        """Run the perl script on a gold/prediction file pair; return the
        path of the evaluation file and the scores read from it."""
        eval_path = pred_path + '.eval'
        perl_script_command = ('perl %s -g %s -s %s > %s' % (
            perl_script_path, gold_path, pred_path, eval_path))
        print('Evaluating with %s...' % perl_script_command)
        os.system(perl_script_command)
        return eval_path, data_utils.read_parsing_evaluation(eval_path)

    # the gold dev data has to be available as a file for the script
    dev_data_path = os.path.join(parser_output_path, 'dev.conll')
    write_original_conll(dev_data_path, val_data)

    # bookkeeping for early stopping
    best_dev_las = 0.
    epochs_no_improvement = 0
    best_model_path = os.path.join(parser_output_path, 'parser')
    print('Training model for %d max epochs with early stopping with patience '
          '%d...' % (NUM_EPOCHS, PATIENCE))
    for epoch in range(parser_options.epochs):
        print('Starting epoch', epoch)
        parser.TrainOnEntries(train_data)

        # predict the dev data, write the predictions to a file and score them
        pred_path = os.path.join(
            parser_output_path, 'dev_pred_epoch_' + str(epoch + 1) + '.conll')
        write_conll(pred_path, parser.PredictOnEntries(val_data))
        eval_path, (las, uas, acc) = run_evaluation_script(
            dev_data_path, pred_path)

        # the per-epoch prediction and evaluation files are not kept
        for temporary_path in (pred_path, eval_path):
            if os.path.exists(temporary_path):
                os.unlink(temporary_path)

        if las > best_dev_las:
            print('LAS %.2f is better than best dev LAS %.2f.'
                  % (las, best_dev_las))
            best_dev_las = las
            epochs_no_improvement = 0
            parser.Save(best_model_path)
            continue

        print('LAS %.2f is worse than best dev LAS %.2f.'
              % (las, best_dev_las))
        epochs_no_improvement += 1
        if epochs_no_improvement == PATIENCE:
            print('No improvement for %d epochs. Early stopping...'
                  % epochs_no_improvement)
            print('Best dev LAS:', best_dev_las)
            break

    test_las = None
    if test_data is not None:
        # evaluate the saved (best) model rather than the last one
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
        parser.Load(best_model_path)

        # write the gold test data and the predictions to files and score them
        test_data_path = os.path.join(parser_output_path, 'test.conll')
        write_original_conll(test_data_path, test_data)
        pred_path = os.path.join(parser_output_path, 'test_pred.conll')
        write_conll(pred_path, parser.PredictOnEntries(test_data))
        _, (test_las, test_uas, test_acc) = run_evaluation_script(
            test_data_path, pred_path)
        print('Test LAS:', test_las, 'test UAS:', test_uas,
              'test acc:', test_acc)

    # remove the saved parser
    if os.path.exists(best_model_path):
        os.unlink(best_model_path)
    return best_dev_las, test_las
+
+
def train_pretrained_weights(feature_values, X_train, y_train, train_domains,
                             num_train_examples, X_val, y_val, X_test, y_test,
                             trg_domain, args, feature_names,
                             parser_output_path, perl_script_path):
    """
    Train a model using pre-trained data selection weights (which could have
    been trained on an other model/domain/task).

    For every entry of args.feature_weights_file that belongs to the target
    domain, the number of selected examples per source domain is printed, a
    model is trained on the selected subset and the result is logged to
    args.log_file.
    :param feature_values: a numpy array of shape (num_examples, num_features)
    :param X_train: the training data
    :param y_train: the training labels
    :param train_domains: a list of training domains, only used for counting
    :param num_train_examples: the number of examples used for training
    :param X_val: the validation data
    :param y_val: the validation labels
    :param X_test: the test data
    :param y_test: the test labels
    :param trg_domain: the target domain
    :param args: the arguments used for calling the script; used for logging
    :param feature_names: a list of the feature names
    :param parser_output_path: the output path of the parser
    :param perl_script_path: the path to the perl script
    :return:
    """
    for feat_weights_domain, feat_weights_feats, feature_weights in \
            data_utils.read_feature_weights_file(args.feature_weights_file):
        # the weights must have been learned with the same feature sets
        assert len(feature_weights) == len(feature_names)
        assert set(args.feature_sets) == set(feat_weights_feats.split(' '))

        # only use the weights that were learned for the target domain
        if trg_domain != feat_weights_domain:
            continue

        # count how many examples belong to each source domain
        train_domain_subset, _ = get_data_subsets(
            feature_values, feature_weights, train_domains, y_train, args.task,
            num_train_examples)
        for subset_domain in set(train_domain_subset):
            print('# of %s in train data for trg domain %s: %d'
                  % (subset_domain, trg_domain,
                     train_domain_subset.count(subset_domain)))
        # NOTE: a stray `continue` used to follow the counting loop; at loop
        # level it skipped the training below for every entry, so it has been
        # removed to make the function train as documented

        # get the train subset with the highest scores and train
        train_subset, labels_subset = get_data_subsets(
            feature_values, feature_weights, X_train, y_train, args.task,
            num_train_examples)
        train_and_evaluate = task2train_and_evaluate_func(args.task)
        val_accuracy, test_accuracy = train_and_evaluate(
            train_subset, labels_subset, X_val, y_val, X_test, y_test,
            parser_output_path=parser_output_path,
            perl_script_path=perl_script_path)
        dict_key = ('%s-X-domain-%s-%s' % (BAYES_OPT, feat_weights_domain,
                                           feat_weights_feats))

        # log the result to the log file
        data_utils.log_to_file(args.log_file, {dict_key: [(
            val_accuracy, test_accuracy, feature_weights)]}, trg_domain, args)
+
+
def pos_accuracy_score(gold, predicted):
    """
    Compute the token-level accuracy of predicted POS tags.
    :param gold: a list of lists of gold tags
    :param predicted: a list of lists of predicted tags
    :return the fraction of gold tags that were predicted correctly
    """
    # one entry per position at which gold and predicted tag agree
    hits = []
    for gold_sequence, predicted_sequence in zip(gold, predicted):
        for gold_tag, predicted_tag in zip(gold_sequence, predicted_sequence):
            if gold_tag == predicted_tag:
                hits.append(1)

    # the denominator is the number of gold tags over all sentences
    num_gold_tags = sum(1 for gold_sequence in gold for _ in gold_sequence)
    return np.sum(hits)/float(num_gold_tags)