diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2f8633b --- /dev/null +++ b/.gitignore @@ -0,0 +1,116 @@ +# LaTeX temporary files +*.aux +*.log +*.toc + +# PDF output - usually a bad idea to keep this in Git +*.pdf + +# Latexmk +*.fdb_latexmk + +# SyncTeX +*.synctex.gz + +# LaTeX Beamer +*.snm +*.vrb +*.nav +*.out + +# BibTeX +*.bbl +*.blg + +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + diff --git a/README.md b/README.md new file mode 100644 index 0000000..33da967 --- /dev/null +++ b/README.md @@ -0,0 +1,195 @@ +# Learning to select data for transfer learning with Bayesian Optimization + +Sebastian Ruder, Barbara Plank (2017). Learning to select data for transfer +learning with Bayesian Optimization. 
_In Proceedings of the 2017 Conference +on Empirical Methods in Natural Language Processing_, Copenhagen, Denmark. + +## Requirements + +### RoBO + +The Robust Bayesian Optimization framework [RoBO](http://automl.github.io/RoBO/) needs to be installed. +It can be installed using the following steps: + +1. First, install `libeigen3-dev` as a prerequisite: +`sudo apt-get install libeigen3-dev` (*) +2. Then, clone the RoBO repository: +`git clone https://github.com/automl/RoBO.git` +3. Change into the directory: `cd RoBO/` +4. Install RoBOs requirements: +`for req in $(cat all_requirements.txt); do pip install $req; done` +5. Finally, install RoBO: +`python setup.py install` + +For the topic models, `gensim` needs to be installed: +`pip install gensim` + +### DyNet + +We use the neural network library [DyNet](http://dynet.readthedocs.io/en/latest/index.html), +which works well with networks that have dynamic structures. DyNet can be +installed by following the instructions [here](http://dynet.readthedocs.io/en/latest/python.html#manual-installation). + +## Repository structure + +- `bilstm_tagger`: The repository containing code for the Bi-LSTM tagger from +Plank et al. (2016). +- `bist_parser`: The repository containing the code for the BIST parser from +Kiperwasser and Goldberg (2016). +- `bayes_opt.py`: The main logic for running Bayesian Optimization. +- `constants.py`: Constants that are shared across all files. +- `data_utils.py`: Utility methods for data reading and processing. +- `similarity.py`: Methods for measuring domain similarity. +- `simpletagger.py`: Code for running the Structured Perceptron POS tagger. +- `task_utils.py`: Utility methods for training and evaluation. + +## Instructions + +### Running Bayesian Optimization + +The main logic for running Bayesian Optimization can be found in `bayes_opt.py`. 
+The features that are currently used are defined in `constants.py` as +`FEATURE_SETS` and are split into diversity and similarity features. +Bayesian Optimization minimizes the validation error on the specified dataset. + +### Example usage + +``` +python bayes_opt.py --dynet-autobatch 1 -d data/gweb_sancl -m models/model \ + -t emails newsgroups reviews weblogs wsj --task pos \ + -b random most-similar-examples \ + --parser-output-path parser_outputs \ + --perl-script-path bist_parser/bmstparser/src/util_scripts/eval.pl \ + -f similarity --z-norm --num-iterations 100 \ + --num-runs 1 --log-file logs/log +``` + +- `--dynet-autobatch 1`: use DyNet auto-batching +- `-d data/gweb_sancl`: use the data from the SANCL 2012 shared task +- `-m models/model`: specify the directory where the model should be saved +- `-t emails newsgroups reviews weblogs wsj`: adapt to the specified target +domains in the order they were provided +- `--task pos`: perform POS tagging with the Structured Perceptron model +- `-b`: use the random and most-similar-examples baselines +- `--parser-output-path`, `--perl-script-path`: only required when performing +parsing +- `-f`: use only similarity features with Bayesian Optimization +- `--z-norm`: perform z-normalisation (recommended) +- `--num-iterations`: perform 100 iterations of Bayesian Optimization +- `--num-runs`: perform one run of Bayesian Optimization per target domain +- `--log-file`: log the results of the baselines and Bayesian Optimization to + this file + +### Adding a new task + +In order to add a new task, you need to do several things: +- Add the new task to `TASKS`, `TASK2TRAIN_EXAMPLES`, and `TASK2DOMAINS` in +`constants.py`. +- Add a method to read data for the task to `data_utils.py` and add the +mapping to `data_utils.task2read_data_func`. +- Add a method to train and evaluate the task to `task_utils.py` and add the +mapping to `task_utils.task2train_and_evaluate_func`. 
+- Add the function that should be minimized to `bayes_opt.py` and add the +mapping to `task2_objective_function`. The function should take +as input the feature weights and output the error. + +### Adding new features + +New feature sets or features can be added by adding them to `constants.py`. +Similarity features or new representations can be added to +`similarity.py`. Diversity features or any other features can be added to +`features.py`. All new features must be added to +`get_feature_representations` and `get_feature_names` in `features.py`. + + + +## Data + +### Multi-Domain Sentiment Dataset + +The Amazon Reviews Multi-Domain Sentiment Dataset (Blitzer et al., 2007) +used in the current Bayesian Optimization experiment can be downloaded +using the following steps: +1. Create a new `amazon-reviews` directory: +`mkdir amazon-reviews` +2. Change into the directory: +`cd amazon-reviews` +3. Download the dataset: +`wget https://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz` +4. Extract the dataset: +`tar -xvf processed_acl.tar.gz` + +In `bayes_opt.py`, the `--data-path` argument should now be pointed to +the `amazon-reviews` directory. + +### Multi-domain POS and parsing data + +We use the data from the [SANCL 2012 shared task/English Web Treebank](https://catalog.ldc.upenn.edu/ldc2012t13). + +### Word embedding data + +Pre-trained word embeddings can be downloaded from [here](http://nlp.stanford.edu/projects/glove/). +We are using GloVe embeddings in the paper, but other pre-trained embeddings are also possible. +Smaller embedding files can be used for faster iteration. + + +## Models + +### BIST parser + +We use the BIST parser from Kiperwasser and Goldberg (2016) for our experiments. The parser repo can be found +[here](https://github.com/elikip/bist-parser) and was integrated using [`git submodule`](http://stackoverflow.com/questions/2140985/how-to-set-up-a-git-project-to-use-an-external-repo-submodule). 
+ +For running the parser with Bayesian Optimization, two additional hyperparameters are necessary: +- `--perl-script-path`: This is the location of the `perl` script that is used to evaluate the parser's predictions. + The script is located in `bist_parser/bmstparser/src/util_scripts/eval.pl` by default. +- `--parser-output-path`: This is the location of the folder where the parser's predictions and the output of the + `perl` script will be written to. + +By default, Labeled Attachment Score on the held-out validation set is used to evaluate the parser's performance and +evaluation results are saved to a subfolder of `parser-output-path` named after the target domain and feature sets +used. Another subsubfolder is created for the best weights configuration so that Labeled Attachment Score, Unlabeled +Attachment Score and Accuracy as well as other statistics are available for the final test set evaluation. + +### Bi-LSTM tagger + +The Bi-LSTM tagger we are using is a simplified, single-task version of the +hierarchical Multi-task Bi-LSTM tagger used by Plank et al. (2016). The source +repository of the tagger can be found [here](https://github.com/bplank/bilstm-aux/). + +## (*) Installing Eigen without sudo rights + +In case you do not have sudo rights to run `sudo apt-get install +libeigen3-dev`, here is a workaround. 
+ +Create a folder where you download the sources of libeigen3-dev: + +``` +mkdir -p tools/eigen3 +cd tools/eigen3 +apt-get source libeigen3-dev +``` + +Afterwards, point the required packages for `RoBO` to the folder just created: `tools/eigen3/eigen3-3.2.0` + +For instance, to install the 'george' requirement of `RoBO`, add the `--global-option` parameters pointing to the eigen directory: + +``` +pip install git+https://github.com/sfalkner/george.git --global-option=build_ext --global-option=-I/path/to/tools/eigen3/eigen3-3.2.0 +``` + +(see http://dan.iel.fm/george/current/user/quickstart/#installation -> if you have Eigen in a strange place) + + +## Reference + +If you make use of the contents of this repository, please cite the following paper: +``` +@inproceedings{ruder2017select, + title={{Learning to select data for transfer learning with Bayesian Optimization}}, + author={Ruder, Sebastian and Plank, Barbara}, + booktitle={Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing}, + year={2017} +} +``` + diff --git a/bayes_opt.py b/bayes_opt.py new file mode 100644 index 0000000..5714a07 --- /dev/null +++ b/bayes_opt.py @@ -0,0 +1,459 @@ +""" +Run Bayesian optimization to learn to select data for transfer learning. + +Uses Python 3.5. 
+""" + +import os +import argparse +import logging +import pickle +import copy + +import numpy as np +from scipy import stats +from sklearn.cross_validation import train_test_split + +from robo.fmin import bayesian_optimization + +import task_utils +import data_utils +import similarity +import features +from constants import FEATURE_SETS, SENTIMENT, POS, POS_BILSTM, PARSING,\ + TASK2TRAIN_EXAMPLES, TASK2DOMAINS, TASKS, POS_PARSING_TRG_DOMAINS,\ + SENTIMENT_TRG_DOMAINS, BASELINES, BAYES_OPT, RANDOM, MOST_SIMILAR_DOMAIN,\ + MOST_SIMILAR_EXAMPLES, ALL_SOURCE_DATA + +from bist_parser.bmstparser.src.utils import ConllEntry + + +def task2_objective_function(task): + """Returns the objective function of a task.""" + if task == SENTIMENT: + return objective_function_sentiment + if task == POS: + return objective_function_pos + if task == POS_BILSTM: + return objective_function_pos_bilstm + if task == PARSING: + return objective_function_parsing + raise ValueError('No objective function implemented for %s.' % task) + + +def objective_function_sentiment(feature_weights): + """ + The objective function to optimize for sentiment analysis. + :param feature_weights: a numpy array; these are the weights of the features + that we want to learn + :return: the error that should be minimized + """ + train_subset, train_labels_subset = task_utils.get_data_subsets( + feature_values, feature_weights, X_train, y_train, SENTIMENT, + TASK2TRAIN_EXAMPLES[SENTIMENT]) + + # train and evaluate the SVM; we input the test documents here but only + # minimize the validation error + val_accuracy, _ = task_utils.train_and_evaluate_sentiment( + train_subset, train_labels_subset, X_val, y_val, X_test, y_test) + + # we minimize the error; the lower the better + error = 1 - float(val_accuracy) + return error + + +def objective_function_pos(feature_weights): + """ + The objective function to optimize for POS tagging. 
+ :param feature_weights: a numpy array; these are the weights of the features + that we want to learn + :return: the error that should be minimized + """ + train_subset, train_labels_subset = task_utils.get_data_subsets( + feature_values, feature_weights, X_train, y_train, POS, + TASK2TRAIN_EXAMPLES[POS]) + + # train and evaluate the tagger; we input the test documents here but only + # minimize the validation error + val_accuracy, _ = task_utils.train_and_evaluate_pos( + train_subset, train_labels_subset, X_val, y_val) + + # we minimize the error; the lower the better + error = 1 - float(val_accuracy) + return error + + +def objective_function_pos_bilstm(feature_weights): + """ + The objective function to optimize for POS tagging. + :param feature_weights: a numpy array; these are the weights of the features + that we want to learn + :return: the error that should be minimized + """ + train_subset, train_labels_subset = task_utils.get_data_subsets( + feature_values, feature_weights, X_train, y_train, POS_BILSTM, + TASK2TRAIN_EXAMPLES[POS_BILSTM]) + + # train and evaluate the tagger; we input the test documents here but only + # minimize the validation error + val_accuracy, _ = task_utils.train_and_evaluate_pos_bilstm( + train_subset, train_labels_subset, X_val, y_val) + + # we minimize the error; the lower the better + error = 1 - float(val_accuracy) + return error + + +def objective_function_parsing(feature_weights): + """ + The objective function to optimize for dependency parsing. 
+ :param feature_weights: a numpy array; these are the weights of the features + that we want to learn + :return: the error that should be minimized + """ + train_subset, train_labels_subset = task_utils.get_data_subsets( + feature_values, feature_weights, X_train, y_train, PARSING, + TASK2TRAIN_EXAMPLES[PARSING]) + val_accuracy, _ = task_utils.train_and_evaluate_parsing( + train_subset, train_labels_subset, X_val, y_val, + parser_output_path=parser_output_path, + perl_script_path=perl_script_path) + error = 100 - float(val_accuracy) + return error + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Learn to select data using Bayesian Optimization.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # dynet parameters + parser.add_argument('--dynet-autobatch', type=int, + help='use auto-batching (1) (should be first argument)') + parser.add_argument('--dynet-mem', default=5000, help='the memory used', + type=int) # Note: needs to be given to the script! + parser.add_argument('--dynet-seed', default=1512141834, type=int, + help='the dynet seed') # Note: needs to still be given! 
+ + # domain and data paths + parser.add_argument('-d', '--data-path', required=True, + help='the path to the directory containing the ' + 'processed_acl or gweb_sancl directory') + parser.add_argument('-m', '--model-dir', required=True, + help='the directory where the model should be saved') + parser.add_argument('-t', '--trg-domains', nargs='+', required=True, + choices=POS_PARSING_TRG_DOMAINS + SENTIMENT_TRG_DOMAINS, + help='the domains to which to adapt') + parser.add_argument('--task', choices=TASKS, required=True, + help='the task which to optimize') + parser.add_argument('-b', '--baselines', nargs='+', choices=BASELINES, + default=[RANDOM], + help='the baselines that should be compared against') + parser.add_argument('-o', '--parser-output-path', + default='outputs', help='the output path of the parser') + parser.add_argument('-p', '--perl-script-path', help='perl script path', + default='bist_parser/bmstparser/src/util_scripts/eval' + '.pl') + + # feature parameters + parser.add_argument('-f', '--feature-sets', nargs='+', default=['similarity'], + choices=FEATURE_SETS, + help='which feature sets (similarity, topic_similarity, ' + 'word_embedding_similarity, diversity) ' + 'to use; default: similarity') + parser.add_argument('--z-norm', action='store_true', + help='use z-normalisation') # important to specify + parser.add_argument('--feature-weights-file', + help='a file containing learned feature weights to be ' + 'used for cross-domain experiments') + + # word embedding parameters + parser.add_argument('-wv', '--word2vec-path', help='the path to the word ' + 'vector file') + parser.add_argument('-vs', '--vector-size', type=int, default=300, + help='the size of the word vectors') + parser.add_argument('--header', action='store_true', + help='whether the word embeddings file contains header; ' + 'GloVe embeddings used in the paper have no header') + + # processing parameters + parser.add_argument('-v', '--max-vocab-size', default=10000, type=int, + help='the 
maximum size of the vocabulary') + + # training parameters + parser.add_argument('--num-iterations', default=100, type=int) + parser.add_argument('--logging', action='store_true', help='perform logging') + parser.add_argument('--num-runs', type=int, default=1, + help='the number of experiment runs for each domain') + parser.add_argument('--log-file', required=True, + help='the path to which validation and test accuracies ' + 'should be logged') + + args = parser.parse_args() + + # switch on logging if specified to see the output of LDA training and of + # the Bayesian optimization + if args.logging: + logging.basicConfig(level=logging.INFO) + + assert os.path.exists(args.data_path), ('Error: %s does not exist.' % + args.data_path) + assert not args.word2vec_path or os.path.exists(args.word2vec_path), \ + 'Error: %s does not exist.' % args.word2vec_path + + # create the model directory if it does not exist + if not os.path.exists(args.model_dir): + print('Creating %s...' % args.model_dir) + os.makedirs(args.model_dir) + + # perl script path and parser output path are only required for parsing + perl_script_path = None + if args.task == PARSING: + assert args.parser_output_path is not None + assert args.perl_script_path is not None + if not os.path.exists(args.parser_output_path): + os.makedirs(args.parser_output_path) 
+ assert os.path.exists(args.perl_script_path) + perl_script_path = args.perl_script_path + + # get the task-specific methods and hyper-parameters + num_train_examples = TASK2TRAIN_EXAMPLES[args.task] + task_trg_domains = TASK2DOMAINS[args.task] + read_data = data_utils.task2read_data_func(args.task) + train_and_evaluate = task_utils.task2train_and_evaluate_func(args.task) + objective_function = task2_objective_function(args.task) + + # get the names of the individual features in the feature sets + assert args.word2vec_path or 'diversity' not in args.feature_sets,\ + 'Error: Word2vec path is required for quadratic entropy in ' \ + 'diversity-based features.' + feature_names = features.get_feature_names(args.feature_sets) + + if args.feature_weights_file: + print('Training model with pre-learned feature weights rather than ' + 'learning new ones...') + assert os.path.exists(args.feature_weights_file),\ + 'Error: %s does not exist.' % args.feature_weights_file + + # read the data and pickle it or load it + preproc_data_path = os.path.join(args.model_dir, + 'preproc_data_%s.pkl' % args.task) + if not os.path.exists(preproc_data_path): + domain2data = read_data(args.data_path) + print('Saving domain2data object to %s...' % preproc_data_path) + with open(preproc_data_path, 'wb') as f: + pickle.dump(domain2data, f) + else: + print('Loading domain2data object from %s...' 
% preproc_data_path) + with open(preproc_data_path, 'rb') as f: + domain2data = pickle.load(f) + assert set(task_trg_domains) == set(domain2data.keys()) + + # create the vocabulary or load it if it was already created + vocab_path = os.path.join(args.model_dir, 'vocab.txt') + vocab = data_utils.Vocab(args.max_vocab_size, vocab_path) + if not os.path.exists(vocab_path): + # retrieve all available tokenised sentences + tokenised_sentences = data_utils.get_all_docs( + domain2data.items(), unlabeled=True)[0] + if args.task == PARSING: + # get the word form from every ConllEntry + tokenised_sentences = [[token.form if isinstance(token, ConllEntry) + else token for token in tokens] + for tokens in tokenised_sentences] + vocab.create(tokenised_sentences) + del tokenised_sentences + else: + vocab.load() + + # load word vectors if we are using them + word2vec = None + if args.word2vec_path: + vocab_word2vec_file = os.path.join(args.model_dir, 'vocab_word2vec.txt') + word2vec = similarity.load_word_vectors( + args.word2vec_path, vocab_word2vec_file, vocab.word2id, + vector_size=args.vector_size, header=args.header) + + # perform the task-specific pre-processing + if args.task == SENTIMENT: + print('Creating binary training data...') + domain2train_data = data_utils.get_tfidf_data(domain2data, vocab) + elif args.task in [POS, POS_BILSTM]: + print('Using words as training data for POS tagging...') + domain2train_data = domain2data + elif args.task == PARSING: + print('Using CoNLL entries as training data for parsing. Using word ' + 'forms to extract feature representations...') + domain2train_data = copy.deepcopy(domain2data) + for domain, domain_data in domain2data.items(): + domain_data[0] = [[conll_entry.form for conll_entry in conll_entries] + for conll_entries in domain_data[0]] + else: + raise ValueError('Data preproc for %s is not implemented.' 
% args.task) + + print('Creating relative term frequency distributions for all domains...') + term_dist_path = os.path.join(args.model_dir, 'term_dist.txt') + domain2term_dist = similarity.get_domain_term_dists( + term_dist_path, domain2data, vocab) + + # perform optimization for every target domain + for trg_domain in args.trg_domains: + print('Target domain:', trg_domain) + + # set the domain and similarity-specific parser output path for parsing + parser_output_path, best_weights_parser_output_path = None, None + if args.task == PARSING: + parser_output_path = os.path.join( + args.parser_output_path, '%s-%s' % (trg_domain, '_'.join( + args.feature_sets))) + if not os.path.exists(parser_output_path): + print('Creating %s...' % parser_output_path) + os.makedirs(parser_output_path) + # use a separate subfolder for the best weights + best_weights_parser_output_path = os.path.join(parser_output_path, + 'best-weights') + if not os.path.exists(best_weights_parser_output_path): + os.makedirs(best_weights_parser_output_path) + + # get the training data of all source domains (not the target domain) + X_train, y_train, train_domains = data_utils.get_all_docs( + [(k, v) for (k, v) in sorted(domain2train_data.items()) + if k != trg_domain], unlabeled=False) + + # get the unprocessed examples for extracting the feature values + examples, y_train_check, train_domains_check = data_utils.get_all_docs( + [(k, v) for (k, v) in sorted(domain2data.items()) + if k != trg_domain], unlabeled=False) + + # some sanity checks just to make sure the processed and the + # unprocessed data still correspond to the same examples + assert np.array_equal(y_train, y_train_check) + assert len(train_domains) == len(train_domains_check),\ + 'Error: %d != %d.' 
% (len(train_domains), len(train_domains_check)) + assert train_domains == train_domains_check, ('Error: %s != %s' % ( + str(train_domains), str(train_domains_check))) + if args.task in [POS, POS_BILSTM, PARSING]: + # for sentiment, we are using a sparse matrix + X_train = np.array(X_train) + print('Training data shape:', X_train.shape, y_train.shape) + + # train topic model if any of the features requires a topic distribution + topic_vectorizer, lda_model = None, None + if any(f_name.startswith('topic') for f_name in feature_names): + # train a topic model on labeled and unlabeled data of all domains + topic_vectorizer, lda_model = similarity.train_topic_model( + data_utils.get_all_docs( + domain2data.items(), unlabeled=True)[0], vocab) + + # get the feature representations of the training data + print('Creating the feature representations for the training data. ' + 'This may take some time...') + feature_values = features.get_feature_representations( + feature_names, examples, domain2data[trg_domain][0], vocab, + word2vec, topic_vectorizer, lda_model) + + if args.z_norm: + # apply z-normalisation; this is important for good performance + print('Z-normalizing features...') + print('First five example features before normalisation:', + feature_values[:5, :]) + print('Standard deviation of features:', np.std(feature_values, + axis=0)) + print('Mean of features:', np.mean(feature_values, axis=0)) + feature_values = stats.zscore(feature_values, axis=0) + + # delete unnecessary variables to save space + del examples, y_train_check, train_domains_check + + # run num_runs iterations of the optimization and baselines in order to + # compute statistics around mean/variance; things that vary between + # runs: validation/test split; train set of random baseline; + # final BayesOpt parameters; the feature values are constant for each + # run, which is why we generate them before to reduce the overhead + run_dict = {method: [] for method in BASELINES + [BAYES_OPT]} + for i in 
range(args.num_runs): + print('\nTarget domain %s. Run %d/%d.' % (trg_domain, i+1, + args.num_runs)) + + # get the evaluation data from the target domain + X_test, y_test, _ = domain2train_data[trg_domain] + + # split off a validation set from the evaluation data + X_test, X_val, y_test, y_val = train_test_split( + X_test, y_test, test_size=100, stratify=y_test + if args.task == SENTIMENT else None) + print('# of validation examples: %d. # of test examples: %d.' + % (len(y_val), len(y_test))) + + # train the model with pre-learned feature weights if specified + if args.feature_weights_file: + print('Training with pre-learned feature weights...') + task_utils.train_pretrained_weights( + feature_values, X_train, y_train, train_domains, + num_train_examples, X_val, y_val, X_test, y_test, + trg_domain, args, feature_names, parser_output_path, + perl_script_path) + continue + + for baseline in args.baselines: + + # select the training data dependent on the baseline + if baseline == RANDOM: + print('Randomly selecting examples...') + train_subset, _, labels_subset, _ = train_test_split( + X_train, y_train, train_size=num_train_examples, + stratify=y_train if args.task == SENTIMENT else None) + elif baseline == ALL_SOURCE_DATA: + print('Selecting all source data examples...') + train_subset, labels_subset = X_train, y_train + elif baseline == MOST_SIMILAR_DOMAIN: + print('Selecting examples from the most similar domain...') + most_similar_domain = similarity.get_most_similar_domain( + trg_domain, domain2term_dist) + train_subset, labels_subset, _ = domain2train_data[ + most_similar_domain] + train_subset, _, labels_subset, _ = train_test_split( + train_subset, labels_subset, train_size=num_train_examples, + stratify=labels_subset if args.task == SENTIMENT else None) + elif baseline == MOST_SIMILAR_EXAMPLES: + print('Selecting the most similar examples...') + one_all_weights = np.ones(len(feature_names)) + one_all_weights[1:] = 0 + train_subset, labels_subset = 
task_utils.get_data_subsets( + feature_values, one_all_weights, X_train, y_train, + args.task, num_train_examples) + else: + raise ValueError('%s is not a baseline.' % baseline) + + # train the baseline + val_accuracy, test_accuracy = train_and_evaluate( + train_subset, labels_subset, X_val, y_val, + X_test, y_test, parser_output_path=parser_output_path, + perl_script_path=perl_script_path) + run_dict[baseline].append((val_accuracy, test_accuracy)) + + # define the lower and upper bounds of the input space [-1, 1] + lower = np.array(len(feature_names) * [-1]) + upper = np.array(len(feature_names) * [1]) + print('Lower limits shape:', lower.shape) + print('Upper limits shape:', upper.shape) + + print('Running Bayesian Optimization...') + res = bayesian_optimization(objective_function, lower=lower, + upper=upper, + num_iterations=args.num_iterations) + + best_feature_weights = res['x_opt'] + print('Best feature weights', best_feature_weights) + train_subset, labels_subset = task_utils.get_data_subsets( + feature_values, best_feature_weights, X_train, y_train, + args.task, num_train_examples) + val_accuracy, test_accuracy = train_and_evaluate( + train_subset, labels_subset, X_val, y_val, X_test, y_test, + parser_output_path=best_weights_parser_output_path, + perl_script_path=perl_script_path) + run_dict[BAYES_OPT].append((val_accuracy, test_accuracy, + best_feature_weights)) + + # log the results of all methods to the log file + data_utils.log_to_file(args.log_file, run_dict, trg_domain, args) diff --git a/bilstm_tagger/License b/bilstm_tagger/License new file mode 100755 index 0000000..87c6de2 --- /dev/null +++ b/bilstm_tagger/License @@ -0,0 +1,13 @@ +Copyright 2016 The bilstm-aux authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/bilstm_tagger/README.md b/bilstm_tagger/README.md new file mode 100755 index 0000000..95c4940 --- /dev/null +++ b/bilstm_tagger/README.md @@ -0,0 +1,108 @@ +## bi-LSTM tagger + +Bidirectional Long-Short Term Memory tagger + +If you use this tagger please cite our paper: +http://arxiv.org/abs/1604.05529 + +### Requirements + +* python3 +* [dynet](https://github.com/clab/dynet) + +## Installation + +Download and install dynet in a directory of your choice DYNETDIR: + +``` +mkdir $DYNETDIR +git clone https://github.com/clab/dynet +``` + +Follow the instructions in the Dynet documentation (use `-DPYTHON`, +see http://dynet.readthedocs.io/en/latest/python.html). + +And compile dynet: + +``` +cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python` +``` + +(if you have a GPU: + +``` +cmake .. -DEIGEN3_INCLUDE_DIR=$HOME/tools/eigen/ -DPYTHON=`which python` -DBACKEND=cuda +``` +) + +After successful installation open python and import dynet, you can +test if the installation worked with: + +``` +>>> import dynet +[dynet] random seed: 2809331847 +[dynet] allocating memory: 512MB +[dynet] memory allocation done. +>>> dynet.__version__ +2.0 +``` + +(You may need to set your PYTHONPATH to include Dynet's `build/python`) + +#### DyNet supports python 3 + +The old bilstm-aux had a patch to work with python 3. 
This +is no longer necessary, as DyNet supports python 3 as of +https://github.com/clab/dynet/pull/130#issuecomment-259656695 + + +#### Example command + +Training the tagger: + +``` +python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1 > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2 +``` + +#### Embeddings + +The poly embeddings [(Al-Rfou et al., +2013)](https://sites.google.com/site/rmyeid/projects/polyglot) can be +downloaded from [here](http://www.let.rug.nl/bplank/bilty/embeds.tar.gz) (0.6GB) + + +#### A couple of remarks + +The choice of 22 languages from UD1.2 (rather than 33) is described in +our TACL parsing paper, Section 3.1. [(Agić et al., +2016)](https://transacl.org/ojs/index.php/tacl/article/view/869). Note, +however, that the bi-LSTM tagger does not require large amounts of +training data (as discussed in our paper). Therefore above are +results for all languages in UD1.3 (for the canonical language +subparts, i.e., those with just the language prefix, no further +suffix; e.g. 'nl' but not 'nl_lassy', and those languages which are +distributed with word forms). + +The `bilty` code is a significantly refactored version of the code +originally used in the paper. 
For example, `bilty` supports multi-task +learning with output layers at different layers (`--pred_layer`), and +it correctly supports stacked LSTMs (see e.g., Ballesteros et al., +2015, Dyer et al., 2015). The results on UD1.3 are obtained with +`bilty` using no stacking (`--h_layers 1`). + +#### Recommended setting for `bilty`: + +* 3 stacked LSTMs, predicting on outermost layer, otherwise default settings, i.e., `--h_layers 3 --pred_layer 3` + +#### Reference + +``` +@inproceedings{plank:ea:2016, + title={{Multilingual Part-of-Speech Tagging with Bidirectional Long Short-Term Memory Models and Auxiliary Loss}}, + author={Plank, Barbara and S{\o}gaard, Anders and Goldberg, Yoav}, + booktitle={ACL 2016, arXiv preprint arXiv:1604.05529}, + url={http://arxiv.org/abs/1604.05529}, + year={2016} +} +``` + diff --git a/bilstm_tagger/langs/lang_canonic.txt b/bilstm_tagger/langs/lang_canonic.txt new file mode 100755 index 0000000..9e15b31 --- /dev/null +++ b/bilstm_tagger/langs/lang_canonic.txt @@ -0,0 +1,39 @@ +ar +bg +ca +cs +cu +da +de +el +en +es +et +eu +fa +fi +fr +ga +gl +got +grc +he +hi +hr +hu +id +it +kk +la +lv +nl +no +pl +pt +ro +ru +sl +sv +ta +tr +zh diff --git a/bilstm_tagger/langs/lang_with_embeds.txt b/bilstm_tagger/langs/lang_with_embeds.txt new file mode 100755 index 0000000..a595b41 --- /dev/null +++ b/bilstm_tagger/langs/lang_with_embeds.txt @@ -0,0 +1,26 @@ +ar +bg +ca +cs +da +de +el +en +es +et +eu +fa +fi +fr +ga +he +hi +hr +id +it +nl +no +pl +pt +sl +sv diff --git a/bilstm_tagger/results-UD1.3-pycnn.md b/bilstm_tagger/results-UD1.3-pycnn.md new file mode 100755 index 0000000..1a9a47f --- /dev/null +++ b/bilstm_tagger/results-UD1.3-pycnn.md @@ -0,0 +1,64 @@ + +#### Results on UD1.3 + +NB. The results below are with the old version of Dynet (pycnn). + +The table below provides results on UD1.3 (iters=20, h_layers=1). + ++poly is using pre-trained embeddings to initialize +word embeddings. 
Note that for some languages it slightly hurts performance. + +``` +python src/bilty.py --dynet-seed 1512141834 --dynet-mem 1500 --train /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-train.conllu --test /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-test.conllu --dev /home/$user/corpora/pos/ud1.3/orgtok/goldpos//en-ud-dev.conllu --output /data/$user/experiments/bilty/predictions/bilty/en-ud-test.conllu.bilty-en-ud1.3-poly-i20-h1 --in_dim 64 --c_in_dim 100 --trainer sgd --iters 20 --sigma 0.2 --save /data/$user/experiments/bilty/models/bilty/bilty-en-ud1.3-poly-i20-h1.model --embeds embeds/poly_a/en.polyglot.txt --h_layers 1 --pred_layer 1 > /data/$user/experiments/bilty/nohup/bilty-en-ud1.3-poly-i20-h1.out 2> /data/$user/experiments/bilty/nohup/bilty.bilty-en-ud1.3-poly-i20-h1.out2 +``` + +| Lang | i20-h1 | +poly | +| ---| -----:| -----:| +| ar | 96.07 | 96.37 | +| bg | 98.21 | 98.12 | +| ca | 98.11 | 98.24 | +| cs | 98.63 | 98.60 | +| cu | 96.48 | -- | +| da | 96.06 | 96.04 | +| de | 92.91 | 93.64 | +| el | 97.85 | 98.36 | +| en | 94.60 | 95.04 | +| es | 95.23 | 95.76 | +| et | 95.75 | 96.57 | +| eu | 93.86 | 95.40 | +| fa | 96.82 | 97.38 | +| fi | 94.32 | 95.35 | +| fr | 96.34 | 96.45 | +| ga | 90.50 | 91.29 | +| gl | 96.89 | -- | +| got | 95.97 | -- | +| grc | 94.36 | -- | +| he | 95.25 | 96.78 | +| hi | 96.37 | 96.93 | +| hr | 94.98 | 96.07 | +| hu | 93.84 | -- | +| id | 93.17 | 93.55 | +| it | 97.40 | 97.82 | +| kk | 77.68 | -- | +| la | 90.17 | -- | +| lv | 91.42 | -- | +| nl | 90.02 | 89.87 | +| no | 97.58 | 97.97 | +| pl | 96.30 | 97.36 | +| pt | 97.21 | 97.46 | +| ro | 95.49 | -- | +| ru | 95.69 | -- | +| sl | 97.53 | 96.42 | +| sv | 96.49 | 96.76 | +| ta | 84.51 | -- | +| tr | 93.81 | -- | +| zh | 93.13 | -- | + +Using pre-trained embeddings often helps to improve accuracy; however, this does not +strictly hold for all languages. 
+ +For more information, predictions files and pre-trained models +visit [http://www.let.rug.nl/bplank/bilty/](http://www.let.rug.nl/bplank/bilty/) + + + diff --git a/bilstm_tagger/scripts/submit-bilty-ud1.3.sh b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh new file mode 100755 index 0000000..f5c661b --- /dev/null +++ b/bilstm_tagger/scripts/submit-bilty-ud1.3.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# train models on UD 1.3 +# +SUBMIT=0 + +PARTITION=nodes +mkdir -p runs + +CORPUSDIR=~/corpora/pos/ud1.3/orgtok/goldpos/ +EXPDIR=/data/p252438/experiments/bilty + +tagger=bilty +mkdir -p $EXPDIR/models/$tagger +mkdir -p $EXPDIR/nohup +mkdir -p $EXPDIR/predictions/$tagger + +ITERS=20 +#ITERS=30 +SIGMA=0.2 +CDIM=100 + +SEED=1512141834 +TRAINER=sgd +INDIM=64 +HLAYERS=1 +#HLAYERS=3 +T0_OUT=$HLAYERS + +for lang in `cat langs/lang_with_embeds.txt`; # all for which we have poly embeds (26) +do + TRAIN=$lang-ud-train.conllu + JOBNAME=bilty-$lang-ud1.3-poly-i$ITERS-h$HLAYERS + + echo "#!/bin/bash" > $$tmp + echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp + echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp + echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp + echo "module load CMake" >> $$tmp + + echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --embeds embeds/poly_a/$lang.polyglot.txt --h_layers $HLAYERS --pred_layer $T0_OUT > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp + + if [ $SUBMIT -eq 1 ] ; then + echo "SUBMIT" + sbatch $$tmp + fi + cat $$tmp + rm $$tmp +done + +for lang in `cat langs/lang_canonic.txt` ; # all without embeddings (but only canical names) +do + 
TRAIN=$lang-ud-train.conllu + JOBNAME=bilty-$lang-ud1.3-i$ITERS-h$HLAYERS + + echo "#!/bin/bash" > $$tmp + echo "#SBATCH --ntasks=1 --cpus-per-task 12 --time=24:00:00 --job-name=$JOBNAME --partition=$PARTITION --mem=64GB" >> $$tmp + echo "#SBATCH --output=runs/${JOBNAME}.out" >> $$tmp + echo "#SBATCH --error=runs/${JOBNAME}.out2" >> $$tmp + echo "module load CMake" >> $$tmp + + echo "python src/$tagger.py --dynet-seed $SEED --dynet-mem 1500 --train $CORPUSDIR/$TRAIN --test $CORPUSDIR/$lang-ud-test.conllu --dev $CORPUSDIR/$lang-ud-dev.conllu --output $EXPDIR/predictions/$tagger/$lang-ud-test.conllu.$JOBNAME --in_dim 64 --c_in_dim $CDIM --trainer $TRAINER --iters $ITERS --sigma $SIGMA --save $EXPDIR/models/$tagger/$JOBNAME.model --h_layers $HLAYERS --pred_layer $T0_OUT > $EXPDIR/nohup/$JOBNAME.out 2> $EXPDIR/nohup/$tagger.$JOBNAME.out2" >> $$tmp + + if [ $SUBMIT -eq 1 ] ; then + echo "SUBMIT" + sbatch $$tmp + fi + + cat $$tmp + rm $$tmp +done diff --git a/bilstm_tagger/src/bilty.py b/bilstm_tagger/src/bilty.py new file mode 100755 index 0000000..e3d89d7 --- /dev/null +++ b/bilstm_tagger/src/bilty.py @@ -0,0 +1,580 @@ +#!/usr/bin/env python3 +# coding=utf-8 +""" +A neural network based tagger (bi-LSTM) +:author: Barbara Plank +""" +import argparse +import random +import time +import sys +import numpy as np +import os +import pickle +import dynet + +from lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor +from lib.mio import read_conll_file, load_embeddings_file + + +def main(): + parser = argparse.ArgumentParser(description="""Run the NN tagger""") + parser.add_argument("--train", nargs='*', help="train folder for each task") # allow multiple train files, each asociated with a task = position in the list + parser.add_argument("--pred_layer", nargs='*', help="layer of predictons for each task", required=True) # for each task the layer on which it is predicted (default 1) + parser.add_argument("--model", help="load model from file", 
required=False) + parser.add_argument("--iters", help="training iterations [default: 30]", required=False,type=int,default=30) + parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,type=int,default=64) + parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,type=int,default=100) + parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False,type=int,default=100) + parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,type=int,default=1) + parser.add_argument("--test", nargs='*', help="test file(s)", required=False) # should be in the same order/task as train + parser.add_argument("--dev", help="dev file(s)", required=False) + parser.add_argument("--output", help="output predictions to file", required=False,default=None) + parser.add_argument("--lower", help="lowercase words (not used)", required=False,default=False,action="store_true") + parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,default=None) + parser.add_argument("--embeds", help="word embeddings file", required=False, default=None) + parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float) + parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh", type=MyNNTaggerArgumentOptions.acfunct) + parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd") + parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False, type=int) + parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int) + parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None) + + args = parser.parse_args() + + if args.train: + if not 
args.pred_layer: + print("--pred_layer required!") + exit() + + if args.dynet_seed: + print(">>> using seed: ", args.dynet_seed, file=sys.stderr) + np.random.seed(args.dynet_seed) + random.seed(args.dynet_seed) + + if args.c_in_dim == 0: + print("no character embeddings", file=sys.stderr) + + if args.save: + # check if folder exists + if os.path.isdir(args.save): + modeldir = os.path.dirname(args.save) + if not os.path.exists(modeldir): + os.makedirs(modeldir) + if args.output: + if os.path.isdir(args.output): + outdir = os.path.dirname(args.output) + if not os.path.exists(outdir): + os.makedirs(outdir) + + + start = time.time() + + if args.model: + print("loading model from file {}".format(args.model), file=sys.stderr) + tagger = load(args) + else: + tagger = NNTagger(args.in_dim, + args.h_dim, + args.c_in_dim, + args.h_layers, + args.pred_layer, + embeds_file=args.embeds, + activation=args.ac, + lower=args.lower, + noise_sigma=args.sigma) + + if args.train and len( args.train ) != 0: + tagger.fit(args.train, args.iters, args.trainer, dev=args.dev) + if args.save: + save(tagger, args) + + if args.test and len( args.test ) != 0: + stdout = sys.stdout + # One file per test ... + for i, test in enumerate( args.test ): + if args.output != None: + file_pred = args.output+".task"+str(i) + sys.stdout = open(file_pred, 'w') + + sys.stderr.write('\nTesting Task'+str(i)+'\n') + sys.stderr.write('*******\n') + test_X, test_Y, org_X, org_Y, task_labels = tagger.get_data_as_indices(test, "task"+str(i)) + correct, total = tagger.evaluate(test_X, test_Y, org_X, org_Y, task_labels, output_predictions=args.output) + + print("\ntask%s test accuracy on %s items: %.4f" % (i, i+1, correct/total), file=sys.stderr) + print(("Task"+str(i)+" Done. 
Took {0:.2f} seconds.".format(time.time()-start)),file=sys.stderr) + sys.stdout = stdout + + + if args.ac: + activation=args.ac.__name__ + else: + activation="None" + print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}" + "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}" + "\tembeds: {3}".format(args.in_dim,args.h_dim,args.h_layers,args.embeds,activation, args.sigma, args.lower, args.c_in_dim), file=sys.stderr) + + if args.save_embeds: + tagger.save_embeds(args.save_embeds) + +def load(args): + """ + load a model from file; specify the .model file, it assumes the *pickle file in the same location + """ + myparams = pickle.load(open(args.model+".pickle", "rb")) + tagger = NNTagger(myparams["in_dim"], + myparams["h_dim"], + myparams["c_in_dim"], + myparams["h_layers"], + myparams["pred_layer"], + activation=myparams["activation"], tasks_ids=myparams["tasks_ids"]) + tagger.set_indices(myparams["w2i"],myparams["c2i"],myparams["task2tag2idx"]) + tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \ + tagger.build_computation_graph(myparams["num_words"], + myparams["num_chars"]) + #tagger.model.load(str.encode(args.model)) + tagger.model.load(args.model) + print("model loaded: {}".format(args.model), file=sys.stderr) + return tagger + +def save(nntagger, args): + """ + save a model; dynet only saves the parameters, need to store the rest separately + """ + outdir = args.save + modelname = outdir + ".model" + #nntagger.model.save(str.encode(modelname)) #python3 needs it as bytes - no longer! 
+ nntagger.model.save(modelname) + import pickle + print(nntagger.task2tag2idx) + myparams = {"num_words": len(nntagger.w2i), + "num_chars": len(nntagger.c2i), + "tasks_ids": nntagger.tasks_ids, + "w2i": nntagger.w2i, + "c2i": nntagger.c2i, + "task2tag2idx": nntagger.task2tag2idx, + "activation": nntagger.activation, + "in_dim": nntagger.in_dim, + "h_dim": nntagger.h_dim, + "c_in_dim": nntagger.c_in_dim, + "h_layers": nntagger.h_layers, + "embeds_file": nntagger.embeds_file, + "pred_layer": nntagger.pred_layer + } + pickle.dump(myparams, open( modelname+".pickle", "wb" ) ) + print("model stored: {}".format(modelname), file=sys.stderr) + + +class NNTagger(object): + + def __init__(self,in_dim,h_dim,c_in_dim,h_layers,pred_layer,embeds_file=None,activation=dynet.tanh, lower=False, noise_sigma=0.1, tasks_ids=[]): + self.w2i = {} # word to index mapping + self.c2i = {} # char to index mapping + self.tasks_ids = tasks_ids # list of names for each task + self.task2tag2idx = {} # need one dictionary per task + self.pred_layer = [int(layer) for layer in pred_layer] # at which layer to predict each task + self.model = dynet.Model() #init model + self.in_dim = in_dim + self.h_dim = h_dim + self.c_in_dim = c_in_dim + self.activation = activation + self.lower = lower + self.noise_sigma = noise_sigma + self.h_layers = h_layers + self.predictors = {"inner": [], "output_layers_dict": {}, "task_expected_at": {} } # the inner layers and predictors + self.wembeds = None # lookup: embeddings for words + self.cembeds = None # lookup: embeddings for characters + self.embeds_file = embeds_file + self.char_rnn = None # RNN for character input + + + def pick_neg_log(self, pred, gold): + return -dynet.log(dynet.pick(pred, gold)) + + def set_indices(self, w2i, c2i, task2t2i): + for task_id in task2t2i: + self.task2tag2idx[task_id] = task2t2i[task_id] + self.w2i = w2i + self.c2i = c2i + + def fit(self, list_folders_name, num_iterations, train_algo, dev=None): + """ + train the tagger + """ + 
print("read training data",file=sys.stderr) + + nb_tasks = len( list_folders_name ) + + train_X, train_Y, task_labels, w2i, c2i, task2t2i = self.get_train_data(list_folders_name) + + ## after calling get_train_data we have self.tasks_ids + self.task2layer = {task_id: out_layer for task_id, out_layer in zip(self.tasks_ids, self.pred_layer)} + print("task2layer", self.task2layer, file=sys.stderr) + + # store mappings of words and tags to indices + self.set_indices(w2i,c2i,task2t2i) + + if dev: + dev_X, dev_Y, org_X, org_Y, task_labels = self.get_data_as_indices(dev, "task0") + + # init lookup parameters and define graph + print("build graph",file=sys.stderr) + + num_words = len(self.w2i) + num_chars = len(self.c2i) + + assert(nb_tasks==len(self.pred_layer)) + + self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars) + + if train_algo == "sgd": + trainer = dynet.SimpleSGDTrainer(self.model) + elif train_algo == "adam": + trainer = dynet.AdamTrainer(self.model) + + train_data = list(zip(train_X,train_Y, task_labels)) + + for iter in range(num_iterations): + total_loss=0.0 + total_tagged=0.0 + random.shuffle(train_data) + for ((word_indices,char_indices),y, task_of_instance) in train_data: + # use same predict function for training and testing + output = self.predict(word_indices, char_indices, task_of_instance, train=True) + + loss1 = dynet.esum([self.pick_neg_log(pred,gold) for pred, gold in zip(output, y)]) + lv = loss1.value() + total_loss += lv + total_tagged += len(word_indices) + + loss1.backward() + trainer.update() + + print("iter {2} {0:>12}: {1:.2f}".format("total loss",total_loss/total_tagged,iter), file=sys.stderr) + + if dev: + # evaluate after every epoch + correct, total = self.evaluate(dev_X, dev_Y, org_X, org_Y, task_labels) + print("\ndev accuracy: %.4f" % (correct/total), file=sys.stderr) + + + + def build_computation_graph(self, num_words, num_chars): + """ + build graph and link to parameters 
+ """ + # initialize the word embeddings and the parameters + if self.embeds_file: + print("loading embeddings", file=sys.stderr) + embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower) + assert(emb_dim==self.in_dim) + num_words=len(set(embeddings.keys()).union(set(self.w2i.keys()))) # initialize all with embeddings + # init model parameters and initialize them + wembeds = self.model.add_lookup_parameters((num_words, self.in_dim)) + cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim)) + + init=0 + l = len(embeddings.keys()) + for word in embeddings.keys(): + # for those words we have already in w2i, update vector, otherwise add to w2i (since we keep data as integers) + if word in self.w2i: + wembeds.init_row(self.w2i[word], embeddings[word]) + else: + self.w2i[word]=len(self.w2i.keys()) # add new word + wembeds.init_row(self.w2i[word], embeddings[word]) + init+=1 + print("initialized: {}".format(init), file=sys.stderr) + + else: + wembeds = self.model.add_lookup_parameters((num_words, self.in_dim)) + cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim)) + + + #make it more flexible to add number of layers as specified by parameter + layers = [] # inner layers + output_layers_dict = {} # from task_id to actual softmax predictor + task_expected_at = {} # map task_id => output_layer_# + + # connect output layers to tasks + for output_layer, task_id in zip(self.pred_layer, self.tasks_ids): + if output_layer > self.h_layers: + raise ValueError("cannot have a task at a layer which is beyond the model, increase h_layers") + task_expected_at[task_id] = output_layer + + print("task expected at", task_expected_at, file=sys.stderr) + + nb_tasks = len( self.tasks_ids ) + + print("h_layers:", self.h_layers, file=sys.stderr) + for layer_num in range(0,self.h_layers): + print(">>>", layer_num, "layer_num") + + if layer_num == 0: + builder = dynet.LSTMBuilder(1, self.in_dim+self.c_in_dim*2, self.h_dim, self.model) # 
in_dim: size of each layer + layers.append(BiRNNSequencePredictor(builder)) #returns forward and backward sequence + else: + # add inner layers (if h_layers >1) + builder = dynet.LSTMBuilder(1, self.h_dim, self.h_dim, self.model) + layers.append(BiRNNSequencePredictor(builder)) + + # store at which layer to predict task + for task_id in self.tasks_ids: + task_num_labels= len(self.task2tag2idx[task_id]) + output_layers_dict[task_id] = FFSequencePredictor(Layer(self.model, self.h_dim*2, task_num_labels, dynet.softmax)) + + sys.stderr.write('#\nOutput layers'+str(len(output_layers_dict))+'\n') + + char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model)) + + predictors = {} + predictors["inner"] = layers + predictors["output_layers_dict"] = output_layers_dict + predictors["task_expected_at"] = task_expected_at + + return predictors, char_rnn, wembeds, cembeds + + def get_features(self, words): + """ + from a list of words, return the word and word char indices + """ + word_indices = [] + word_char_indices = [] + for word in words: + if word in self.w2i: + word_indices.append(self.w2i[word]) + else: + word_indices.append(self.w2i["_UNK"]) + + chars_of_word = [self.c2i[""]] + for char in word: + if char in self.c2i: + chars_of_word.append(self.c2i[char]) + else: + chars_of_word.append(self.c2i["_UNK"]) + chars_of_word.append(self.c2i[""]) + word_char_indices.append(chars_of_word) + return word_indices, word_char_indices + + + def get_data_as_indices(self, folder_name, task): + """ + X = list of (word_indices, word_char_indices) + Y = list of tag indices + """ + X, Y = [],[] + org_X, org_Y = [], [] + task_labels = [] + for (words, tags) in read_conll_file(folder_name): + word_indices, word_char_indices = self.get_features(words) + tag_indices = [self.task2tag2idx[task].get(tag) for tag in tags] + X.append((word_indices,word_char_indices)) + Y.append(tag_indices) + org_X.append(words) + org_Y.append(tags) + task_labels.append( task ) 
+ return X, Y, org_X, org_Y, task_labels + + + def predict(self, word_indices, char_indices, task_id, train=False): + """ + predict tags for a sentence represented as char+word embeddings + """ + dynet.renew_cg() # new graph + + char_emb = [] + rev_char_emb = [] + # get representation for words + for chars_of_token in char_indices: + # use last state as word representation + last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1] + rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1] + char_emb.append(last_state) + rev_char_emb.append(rev_last_state) + + wfeatures = [self.wembeds[w] for w in word_indices] + features = [dynet.concatenate([w,c,rev_c]) for w,c,rev_c in zip(wfeatures,char_emb,reversed(rev_char_emb))] + + if train: # only do at training time + features = [dynet.noise(fe,self.noise_sigma) for fe in features] + + output_expected_at_layer = self.predictors["task_expected_at"][task_id] + output_expected_at_layer -=1 + + # go through layers + # input is now combination of w + char emb + prev = features + num_layers = self.h_layers +# for i in range(0,num_layers-1): + for i in range(0,num_layers): + predictor = self.predictors["inner"][i] + forward_sequence, backward_sequence = predictor.predict_sequence(prev) + if i > 0 and self.activation: + # activation between LSTM layers + forward_sequence = [self.activation(s) for s in forward_sequence] + backward_sequence = [self.activation(s) for s in backward_sequence] + + if i == output_expected_at_layer: + output_predictor = self.predictors["output_layers_dict"][task_id] + concat_layer = [dynet.concatenate([f, b]) for f, b in zip(forward_sequence,reversed(backward_sequence))] + + if train and self.noise_sigma > 0.0: + concat_layer = [dynet.noise(fe,self.noise_sigma) for fe in concat_layer] + output = output_predictor.predict_sequence(concat_layer) + return output + + prev = forward_sequence + prev_rev = backward_sequence # not 
used + + raise Exception("oops should not be here") + return None + + def evaluate(self, test_X, test_Y, org_X, org_Y, task_labels, output_predictions=None, verbose=True): + """ + compute accuracy on a test file + """ + correct = 0 + total = 0.0 + + if output_predictions != None: + i2w = {self.w2i[w] : w for w in self.w2i.keys()} + task_id = task_labels[0] #get first + print(task_id,"labels:", self.task2tag2idx[task_id], file=sys.stderr ) + i2t = {self.task2tag2idx[task_id][t] : t for t in self.task2tag2idx[task_id].keys()} + + for i, ((word_indices, word_char_indices), gold_tag_indices, task_of_instance) in enumerate(zip(test_X, test_Y, task_labels)): + if verbose: + if i%100==0: + sys.stderr.write('%s'%i) + elif i%10==0: + sys.stderr.write('.') + + output = self.predict(word_indices, word_char_indices, task_of_instance) + predicted_tag_indices = [np.argmax(o.value()) for o in output] + if output_predictions: + prediction = [i2t[idx] for idx in predicted_tag_indices] + + words = org_X[i] + gold = org_Y[i] + + for w,g,p in zip(words,gold,prediction): + print(("{}\t{}\t{}".format(w,g,p))) + print("") + correct += sum([1 for (predicted, gold) in zip(predicted_tag_indices, gold_tag_indices) if predicted == gold]) + total += len(gold_tag_indices) + + return correct, total + + + + # Get train data: need to read each train set (linked to a task) separately + + def get_train_data(self, list_folders_name): + """ + + :param list_folders_name: list of folders names + :param lower: whether to lowercase tokens + + transform training data to features (word indices) + map tags to integers + """ + X = [] + Y = [] + task_labels = [] #keeps track of where instances come from "task1" or "task2".. 
+ self.tasks_ids = [] #record the id of the tasks + + #num_sentences=0 + #num_tokens=0 + + # word 2 indices and tag 2 indices + w2i = {} # word to index + c2i = {} # char to index + task2tag2idx = {} # id of the task -> tag2idx + + w2i["_UNK"] = 0 # unk word / OOV + c2i["_UNK"] = 0 # unk char + c2i[""] = 1 # word start + c2i[""] = 2 # word end index + + + for i, folder_name in enumerate( list_folders_name ): + num_sentences=0 + num_tokens=0 + task_id = 'task'+str(i) + self.tasks_ids.append( task_id ) + if task_id not in task2tag2idx: + task2tag2idx[task_id] = {} + for instance_idx, (words, tags) in enumerate(read_conll_file(folder_name)): + num_sentences += 1 + instance_word_indices = [] #sequence of word indices + instance_char_indices = [] #sequence of char indices + instance_tags_indices = [] #sequence of tag indices + + for i, (word, tag) in enumerate(zip(words, tags)): + num_tokens += 1 + + # map words and tags to indices + if word not in w2i: + w2i[word] = len(w2i) + instance_word_indices.append(w2i[word]) + + chars_of_word = [c2i[""]] + for char in word: + if char not in c2i: + c2i[char] = len(c2i) + chars_of_word.append(c2i[char]) + chars_of_word.append(c2i[""]) + instance_char_indices.append(chars_of_word) + + if tag not in task2tag2idx[task_id]: + #tag2idx[tag]=len(tag2idx) + task2tag2idx[task_id][tag]=len(task2tag2idx[task_id]) + + instance_tags_indices.append(task2tag2idx[task_id].get(tag)) + + X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices + Y.append(instance_tags_indices) + task_labels.append(task_id) + + #self.num_labels[task_id] = len( task2tag2idx[task_id] ) + + if num_sentences == 0 or num_tokens == 0: + sys.exit( "No data read from: "+folder_name ) + print("TASK "+task_id+" "+folder_name, file=sys.stderr ) + print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr) + print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr) + + 
assert(len(X)==len(Y)) + return X, Y, task_labels, w2i, c2i, task2tag2idx #sequence of features, sequence of labels, necessary mappings + + + def save_embeds(self, out_filename): + # construct reverse mapping + i2w = {self.w2i[w]: w for w in self.w2i.keys()} + + OUT = open(out_filename+".w.emb","w") + for word_id in i2w.keys(): + wembeds_expression = self.wembeds[word_id] + word = i2w[word_id] + OUT.write("{} {}\n".format(word," ".join([str(x) for x in wembeds_expression.npvalue()]))) + OUT.close() + + +class MyNNTaggerArgumentOptions(object): + def __init__(self): + pass + ### functions for checking arguments + def acfunct(arg): + """ check for allowed argument for --ac option """ + try: + functions = [dynet.rectify, dynet.tanh] + functions = { function.__name__ : function for function in functions} + functions["None"] = None + return functions[str(arg)] + except: + raise argparse.ArgumentTypeError("String {} does not match required format".format(arg,)) + + + +if __name__=="__main__": + main() diff --git a/bilstm_tagger/src/run_simple.py b/bilstm_tagger/src/run_simple.py new file mode 100755 index 0000000..60eec71 --- /dev/null +++ b/bilstm_tagger/src/run_simple.py @@ -0,0 +1,23 @@ +#### Example of using bilty from within code +## +## to properly seed dyNet add parameter to your script: +## python run_simply.py --dynet-seed 113 + +from bilstm_tagger.src.simplebilty import SimpleBiltyTagger +import random +### Use --dynet-seed $SEED +seed=113 # assume we pass this to script +train_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-dev.conllu" +dev_data = "/Users/bplank/corpora/pos/ud1.3/orgtok/goldpos/da-ud-test.conllu" +in_dim=64 +h_dim=100 +c_in_dim=100 +h_layers=1 +iters=2 +trainer="sgd" +tagger = SimpleBiltyTagger(in_dim, h_dim,c_in_dim,h_layers,embeds_file=None) +train_X, train_Y = tagger.get_train_data(train_data) +tagger.fit(train_X, train_Y, iters, trainer,seed=seed) +test_X, test_Y = tagger.get_data_as_indices(dev_data) +correct, total = 
#!/usr/bin/env python3
# coding=utf-8
"""
A neural network based tagger (bi-LSTM) - version w/o MTL
:author: Barbara Plank
"""
import argparse
import random
import time
import sys
import numpy as np
import os
import pickle
import dynet

from bilstm_tagger.src.lib.mnnl import FFSequencePredictor, Layer, RNNSequencePredictor, BiRNNSequencePredictor
from bilstm_tagger.src.lib.mio import read_conll_file, load_embeddings_file

# Reserved vocabulary entries. The word-boundary markers must be two distinct,
# non-empty keys: with a single shared key the start marker is overwritten and
# the first real character is assigned the same index as the end marker.
UNK = "_UNK"  # unknown word / unknown character
WORD_START = "<w>"  # prepended to the character sequence of every word
WORD_END = "</w>"  # appended to the character sequence of every word


def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it does not exist yet."""
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)


def main():
    """
    Command-line entry point: build or load a tagger, optionally train it,
    evaluate it on the given test files and report the configuration.
    """
    parser = argparse.ArgumentParser(description="""Run the NN tagger""")
    parser.add_argument("--train",
                        help="train data")  # allow multiple train files, each associated with a task = position in the list
    # parser.add_argument("--pred_layer", help="layer of predictions", default=1) # assume always h_layer here
    parser.add_argument("--model", help="load model from file", required=False)
    parser.add_argument("--iters", help="training iterations [default: 30]", required=False, type=int, default=30)
    parser.add_argument("--in_dim", help="input dimension [default: 64] (like Polyglot embeds)", required=False,
                        type=int, default=64)
    parser.add_argument("--c_in_dim", help="input dimension for character embeddings [default: 100]", required=False,
                        type=int, default=100)
    parser.add_argument("--h_dim", help="hidden dimension [default: 100]", required=False, type=int, default=100)
    parser.add_argument("--h_layers", help="number of stacked LSTMs [default: 1 = no stacking]", required=False,
                        type=int, default=1)
    parser.add_argument("--test", nargs='*', help="test file(s)",
                        required=False)  # should be in the same order/task as train
    parser.add_argument("--dev", help="dev file(s)", required=False)
    parser.add_argument("--output", help="output predictions to file", required=False, default=None)
    parser.add_argument("--lower", help="lowercase words (not used)", required=False, default=False,
                        action="store_true")
    parser.add_argument("--save", help="save model to file (appends .model as well as .pickle)", required=False,
                        default=None)
    parser.add_argument("--embeds", help="word embeddings file", required=False, default=None)
    parser.add_argument("--sigma", help="noise sigma", required=False, default=0.2, type=float)
    # argparse also runs string defaults through ``type``, so args.ac is a callable (or None)
    parser.add_argument("--ac", help="activation function [rectify, tanh, ...]", default="tanh",
                        type=MyNNTaggerArgumentOptions.acfunct)
    parser.add_argument("--trainer", help="trainer [sgd, adam] default: sgd", required=False, default="sgd")
    parser.add_argument("--dynet-seed", help="random seed for dynet (needs to be first argument!)", required=False,
                        type=int)
    parser.add_argument("--dynet-mem", help="memory for dynet (needs to be first argument!)", required=False, type=int)
    parser.add_argument("--save-embeds", help="save word embeddings file", required=False, default=None)

    args = parser.parse_args()

    # make sure the model can be written before spending time on training
    if args.save:
        _ensure_parent_dir(args.save)

    start = time.time()

    if args.model:
        print("loading model from file {}".format(args.model), file=sys.stderr)
        tagger = load(args.model)
    else:
        tagger = SimpleBiltyTagger(args.in_dim,
                                   args.h_dim,
                                   args.c_in_dim,
                                   args.h_layers,
                                   embeds_file=args.embeds,
                                   activation=args.ac,
                                   lower=args.lower,
                                   noise_sigma=args.sigma)

    if args.train:
        ## read data
        train_X, train_Y = tagger.get_train_data(args.train)

        dev_X, dev_Y = None, None
        if args.dev:
            dev_X, dev_Y = tagger.get_data_as_indices(args.dev)

        # fit() checkpoints the best model itself when it has both dev data and a model path
        early_stopping = dev_X is not None and args.save is not None
        tagger.fit(train_X, train_Y, args.iters, args.trainer, val_X=dev_X, val_Y=dev_Y,
                   model_path=args.save, seed=args.dynet_seed)
        if args.save and not early_stopping:
            save(tagger, args.save)

    if args.test:
        if args.output is not None:
            # evaluate() only returns counts, so there is nothing to write per token
            print("warning: --output is ignored, this tagger only reports accuracy", file=sys.stderr)
        # One file per test ...
        for i, test in enumerate(args.test):
            sys.stderr.write('\nTesting Task' + str(i) + '\n')
            sys.stderr.write('*******\n')
            test_X, test_Y = tagger.get_data_as_indices(test)
            correct, total = tagger.evaluate(test_X, test_Y)
            accuracy = correct / total if total else 0.0

            print("\ntask%s test accuracy on %s items: %.4f" % (i, int(total), accuracy), file=sys.stderr)
            print(("Task" + str(i) + " Done. Took {0:.2f} seconds.".format(time.time() - start)), file=sys.stderr)

    if args.ac:
        activation = args.ac.__name__
    else:
        activation = "None"
    print("Info: biLSTM\n\tin_dim: {0}\n\tc_in_dim: {7}\n\th_dim: {1}"
          "\n\th_layers: {2}\n\tactivation: {4}\n\tsigma: {5}\n\tlower: {6}"
          "\tembeds: {3}".format(args.in_dim, args.h_dim, args.h_layers, args.embeds, activation, args.sigma,
                                 args.lower, args.c_in_dim), file=sys.stderr)

    if args.save_embeds:
        tagger.save_embeds(args.save_embeds)


def load(model_file):
    """
    load a model from file; specify the .model file, it assumes the *pickle file in the same location

    :param model_file: path passed to the dynet model loader; ``model_file + ".pickle"`` must hold
        the dictionary written by :func:`save`
    :return: a SimpleBiltyTagger with indices, graph and parameters restored
    """
    # NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
    with open(model_file + ".pickle", "rb") as params_file:
        myparams = pickle.load(params_file)
    tagger = SimpleBiltyTagger(myparams["in_dim"],
                               myparams["h_dim"],
                               myparams["c_in_dim"],
                               myparams["h_layers"],
                               activation=myparams["activation"])
    tagger.set_indices(myparams["w2i"], myparams["c2i"], myparams["tag2idx"])
    # the graph has to be rebuilt with the stored sizes before the parameters can be read back
    tagger.predictors, tagger.char_rnn, tagger.wembeds, tagger.cembeds = \
        tagger.build_computation_graph(myparams["num_words"],
                                       myparams["num_chars"])
    tagger.model.load(model_file)
    print("model loaded: {}".format(model_file), file=sys.stderr)
    return tagger


def save(nntagger, model_file_name):
    """
    save a model; dynet only saves the parameters, need to store the rest separately

    :param nntagger: the SimpleBiltyTagger to store
    :param model_file_name: path for the dynet parameters; the remaining settings and index
        mappings are pickled to ``model_file_name + ".pickle"``
    """
    nntagger.model.save(model_file_name)
    myparams = {"num_words": len(nntagger.w2i),
                "num_chars": len(nntagger.c2i),
                "w2i": nntagger.w2i,
                "c2i": nntagger.c2i,
                "tag2idx": nntagger.tag2idx,
                "activation": nntagger.activation,
                "in_dim": nntagger.in_dim,
                "h_dim": nntagger.h_dim,
                "c_in_dim": nntagger.c_in_dim,
                "h_layers": nntagger.h_layers,
                "embeds_file": nntagger.embeds_file,
                "pred_layer": nntagger.pred_layer
                }
    with open(model_file_name + ".pickle", "wb") as params_file:
        pickle.dump(myparams, params_file)
    print("model stored: {}".format(model_file_name), file=sys.stderr)


class SimpleBiltyTagger(object):
    """
    Single-task bi-LSTM sequence tagger over concatenated word and character representations.
    """

    def __init__(self, in_dim, h_dim, c_in_dim, h_layers, embeds_file=None, activation=dynet.tanh, lower=False,
                 noise_sigma=0.1, tasks_ids=None):
        """
        :param in_dim: word embedding dimension
        :param h_dim: hidden dimension of the LSTMs
        :param c_in_dim: character embedding dimension (also the char LSTM hidden size)
        :param h_layers: number of stacked bi-LSTM layers
        :param embeds_file: optional file with pre-trained word embeddings
        :param activation: function applied between stacked LSTM layers, or None
        :param lower: forwarded to the embeddings loader
        :param noise_sigma: sigma of the noise added to the features during training
        :param tasks_ids: accepted for interface compatibility; not used by this class
        """
        self.w2i = {}  # word to index mapping
        self.c2i = {}  # char to index mapping
        self.tag2idx = {}  # tag to tag_id mapping
        self.pred_layer = 1  # at which layer to predict
        self.model = dynet.Model()  # init model
        self.in_dim = in_dim
        self.h_dim = h_dim
        self.c_in_dim = c_in_dim
        self.activation = activation
        self.lower = lower
        self.noise_sigma = noise_sigma
        self.h_layers = h_layers
        self.predictors = {"inner": [], "output_layers_dict": {},
                           "task_expected_at": {}}  # the inner layers and predictors
        self.wembeds = None  # lookup: embeddings for words
        self.cembeds = None  # lookup: embeddings for characters
        self.embeds_file = embeds_file
        self.char_rnn = None  # RNN for character input

    def pick_neg_log(self, pred, gold):
        """Return the negative log of the entry of ``pred`` selected by the index ``gold``."""
        return -dynet.log(dynet.pick(pred, gold))

    def set_indices(self, w2i, c2i, tag2idx):
        """Store the word, character and tag index mappings on the tagger."""
        self.tag2idx = tag2idx
        self.w2i = w2i
        self.c2i = c2i

    def fit(self, train_X, train_Y, num_epochs, train_algo, val_X=None, val_Y=None, patience=2, model_path=None,
            seed=None):
        """
        train the tagger

        :param train_X: list of (word_indices, word_char_indices) per sentence
        :param train_Y: list of tag index lists, parallel to train_X
        :param num_epochs: maximum number of passes over the training data
        :param train_algo: "sgd" or "adam"
        :param val_X: optional validation inputs, same format as train_X
        :param val_Y: optional validation tags, same format as train_Y
        :param patience: number of epochs without improvement before stopping early
        :param model_path: where to store the best model; early stopping is only
            active when val_X, val_Y and model_path are all given
        :param seed: optional seed for Python's random module (shuffling)
        :raises ValueError: if train_algo is not a known optimizer
        """
        print("read training data", file=sys.stderr)

        if seed is not None:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  # setting random seed

        # init lookup parameters and define graph
        print("build graph", file=sys.stderr)

        num_words = len(self.w2i)
        num_chars = len(self.c2i)

        self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)

        if train_algo == "sgd":
            trainer = dynet.SimpleSGDTrainer(self.model)
        elif train_algo == "adam":
            trainer = dynet.AdamTrainer(self.model)
        else:
            raise ValueError('%s is not a valid optimizer.' % train_algo)

        assert (len(train_X) == len(train_Y))
        train_data = list(zip(train_X, train_Y))

        print('Starting training for %d epochs...' % num_epochs)
        best_val_acc, epochs_no_improvement = 0., 0
        early_stopping = val_X is not None and val_Y is not None and model_path is not None
        if early_stopping:
            print('Using early stopping with patience of %d...' % patience)
        for cur_iter in range(num_epochs):
            total_loss = 0.0
            total_tagged = 0.0
            random.shuffle(train_data)
            for ((word_indices, char_indices), y) in train_data:
                # use same predict function for training and testing
                output = self.predict(word_indices, char_indices, train=True)

                loss1 = dynet.esum([self.pick_neg_log(pred, gold) for pred, gold in zip(output, y)])
                total_loss += loss1.value()
                total_tagged += len(word_indices)

                loss1.backward()
                trainer.update()
            if total_tagged:
                total_loss = total_loss / total_tagged
            print("epoch {2} {0:>12}: {1:.2f}".format("total loss", total_loss, cur_iter))

            # without validation data there is nothing to evaluate on
            if not early_stopping:
                continue

            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total if val_total else 0.0

            if val_accuracy > best_val_acc:
                print('Accuracy %.4f is better than best val accuracy %.4f.' % (val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save(self, model_path)
            else:
                print('Accuracy %.4f is not better than best val accuracy %.4f.' % (val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' % epochs_no_improvement)
                break

    def build_computation_graph(self, num_words, num_chars):
        """
        build graph and link to parameters

        :param num_words: number of rows of the word lookup table (recomputed when
            pre-trained embeddings are loaded, since they extend the vocabulary)
        :param num_chars: number of rows of the character lookup table
        :return: (predictors dict, character RNN, word lookup, character lookup)
        """
        # initialize the word embeddings and the parameters
        embeddings = None
        if self.embeds_file:
            print("loading embeddings", file=sys.stderr)
            embeddings, emb_dim = load_embeddings_file(self.embeds_file, lower=self.lower)
            assert (emb_dim == self.in_dim)
            num_words = len(set(embeddings.keys()).union(set(self.w2i.keys())))  # initialize all with embeddings

        # init model parameters and initialize them
        wembeds = self.model.add_lookup_parameters((num_words, self.in_dim))
        cembeds = self.model.add_lookup_parameters((num_chars, self.c_in_dim))

        if embeddings is not None:
            init = 0
            for word in embeddings:
                # words missing from w2i get a fresh index (since we keep data as integers)
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)
                wembeds.init_row(self.w2i[word], embeddings[word])
                init += 1
            print("initialized: {}".format(init), file=sys.stderr)

        # make it more flexible to add number of layers as specified by parameter
        layers = []  # inner layers
        for layer_num in range(0, self.h_layers):
            # the first layer reads word + forward/backward char representations,
            # every further layer reads the hidden states of the layer below
            input_dim = self.in_dim + self.c_in_dim * 2 if layer_num == 0 else self.h_dim
            builder = dynet.LSTMBuilder(1, input_dim, self.h_dim, self.model)
            layers.append(BiRNNSequencePredictor(builder))  # returns forward and backward sequence

        # store at which layer to predict task
        task_num_labels = len(self.tag2idx)
        output_layer = FFSequencePredictor(Layer(self.model, self.h_dim * 2, task_num_labels, dynet.softmax))

        char_rnn = RNNSequencePredictor(dynet.LSTMBuilder(1, self.c_in_dim, self.c_in_dim, self.model))

        predictors = {"inner": layers,
                      "output_layers_dict": output_layer,
                      "task_expected_at": self.h_layers}

        return predictors, char_rnn, wembeds, cembeds

    def get_features(self, words):
        """
        from a list of words, return the word and word char indices

        Unknown words and characters map to the ``_UNK`` entries; every character
        sequence is wrapped in the word start / word end markers.
        """
        word_indices = []
        word_char_indices = []
        unk_word = self.w2i[UNK]
        unk_char = self.c2i[UNK]
        for word in words:
            word_indices.append(self.w2i.get(word, unk_word))

            chars_of_word = [self.c2i[WORD_START]]
            chars_of_word.extend(self.c2i.get(char, unk_char) for char in word)
            chars_of_word.append(self.c2i[WORD_END])
            word_char_indices.append(chars_of_word)
        return word_indices, word_char_indices

    def _instances_to_indices(self, instances):
        """Map an iterable of (words, tags) pairs to (X, Y) using the stored index mappings."""
        X, Y = [], []
        for words, tags in instances:
            X.append(self.get_features(words))
            # tags never seen in training map to None and therefore never count as correct
            Y.append([self.tag2idx.get(tag) for tag in tags])
        return X, Y

    def get_data_as_indices(self, file_name):
        """
        Read a CoNLL file and convert it with the stored index mappings.
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        return self._instances_to_indices(read_conll_file(file_name))

    def get_data_as_indices_from_instances(self, dev_words, dev_tags):
        """
        Extension of get_data_as_indices. Use words and tags rather than a file as input.
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        return self._instances_to_indices(zip(dev_words, dev_tags))

    def predict(self, word_indices, char_indices, train=False):
        """
        predict tags for a sentence represented as char+word embeddings

        :param word_indices: word index per token
        :param char_indices: list of character indices per token
        :param train: if True, noise is added to the features
        :return: the output of the output layer for every token
        :raises Exception: if the tagger has no LSTM layer to predict from
        """
        dynet.renew_cg()  # new graph

        char_emb = []
        rev_char_emb = []
        # get representation for words
        for chars_of_token in char_indices:
            # use last state as word representation
            last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in chars_of_token])[-1]
            rev_last_state = self.char_rnn.predict_sequence([self.cembeds[c] for c in reversed(chars_of_token)])[-1]
            char_emb.append(last_state)
            rev_char_emb.append(rev_last_state)

        wfeatures = [self.wembeds[w] for w in word_indices]
        # NOTE(review): reversed() pairs token i with the reversed-char state of token n-1-i;
        # kept as is because changing it would invalidate already trained models - confirm intent.
        features = [dynet.concatenate([w, c, rev_c]) for w, c, rev_c in
                    zip(wfeatures, char_emb, reversed(rev_char_emb))]

        if train:  # only do at training time
            features = [dynet.noise(fe, self.noise_sigma) for fe in features]

        # the output layer sits on top of the last LSTM layer
        output_expected_at_layer = self.h_layers - 1

        # go through layers
        # input is now combination of w + char emb
        prev = features
        for i in range(0, self.h_layers):
            predictor = self.predictors["inner"][i]
            forward_sequence, backward_sequence = predictor.predict_sequence(prev)
            if i > 0 and self.activation:
                # activation between LSTM layers
                forward_sequence = [self.activation(s) for s in forward_sequence]
                backward_sequence = [self.activation(s) for s in backward_sequence]

            if i == output_expected_at_layer:
                output_predictor = self.predictors["output_layers_dict"]
                concat_layer = [dynet.concatenate([f, b]) for f, b in
                                zip(forward_sequence, reversed(backward_sequence))]

                if train and self.noise_sigma > 0.0:
                    concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]
                return output_predictor.predict_sequence(concat_layer)

            prev = forward_sequence

        # only reachable when there is no LSTM layer at all (h_layers < 1)
        raise Exception("oops should not be here")

    def evaluate(self, test_X, test_Y):
        """
        compute accuracy on already indexed data

        :param test_X: list of (word_indices, word_char_indices) per sentence
        :param test_Y: list of gold tag index lists, parallel to test_X
        :return: (number of correctly tagged tokens, total number of tokens as float)
        """
        correct = 0
        total = 0.0

        for (word_indices, word_char_indices), gold_tag_indices in zip(test_X, test_Y):
            output = self.predict(word_indices, word_char_indices)
            predicted_tag_indices = [np.argmax(o.value()) for o in output]

            correct += sum(1 for predicted, gold in zip(predicted_tag_indices, gold_tag_indices)
                           if predicted == gold)
            total += len(gold_tag_indices)

        return correct, total

    def save_embeds(self, out_filename):
        """
        Write the word embeddings to ``out_filename + ".w.emb"``, one line per word:
        the word followed by its space-separated vector values.
        """
        dynet.renew_cg()  # lookups create expressions, which need a graph
        with open(out_filename + ".w.emb", "w") as out_file:
            for word, word_id in self.w2i.items():
                vector = self.wembeds[word_id].npvalue()
                out_file.write("{} {}\n".format(word, " ".join(str(x) for x in vector)))

    def _index_training_instances(self, instances):
        """
        Build the word, character and tag vocabularies from an iterable of (words, tags)
        pairs, store them on the tagger and return the indexed data as (X, Y).
        """
        X = []
        Y = []

        # word 2 indices and tag 2 indices
        w2i = {UNK: 0}  # word to index; 0 = unk word / OOV
        c2i = {UNK: 0, WORD_START: 1, WORD_END: 2}  # char to index
        tag2idx = {}  # tag2idx

        num_tokens = 0
        for words, tags in instances:
            instance_word_indices = []  # sequence of word indices
            instance_char_indices = []  # sequence of char indices
            instance_tags_indices = []  # sequence of tag indices

            for word, tag in zip(words, tags):
                # map words, chars and tags to indices; unseen items get the next free index
                instance_word_indices.append(w2i.setdefault(word, len(w2i)))

                chars_of_word = [c2i[WORD_START]]
                for char in word:
                    chars_of_word.append(c2i.setdefault(char, len(c2i)))
                chars_of_word.append(c2i[WORD_END])
                instance_char_indices.append(chars_of_word)

                instance_tags_indices.append(tag2idx.setdefault(tag, len(tag2idx)))
                num_tokens += 1

            X.append((instance_word_indices,
                      instance_char_indices))  # list of word indices, for every word list of char indices
            Y.append(instance_tags_indices)

        print("%s sentences %s tokens" % (len(X), num_tokens), file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i), len(c2i)), file=sys.stderr)

        # store mappings of words and tags to indices
        self.set_indices(w2i, c2i, tag2idx)

        return X, Y

    # Get train data: need to read each train set (linked to a task) separately

    def get_train_data(self, train_data):
        """
        transform training data to features (word indices)
        map tags to integers
        :param train_data: path of the CoNLL training file
        """
        return self._index_training_instances(read_conll_file(train_data))

    def get_train_data_from_instances(self, train_words, train_tags):
        """
        Extension of get_train_data method. Extracts training data from two arrays of word and label lists.
        transform training data to features (word indices)
        map tags to integers
        :param train_words: a numpy array containing lists of words
        :param train_tags: a numpy array containing lists of corresponding tags
        """
        return self._index_training_instances(zip(train_words, train_tags))


class MyNNTaggerArgumentOptions(object):
    """Namespace for argparse ``type=`` converters used by the command-line interface."""

    def __init__(self):
        pass

    ### functions for checking arguments
    @staticmethod
    def acfunct(arg):
        """ check for allowed argument for --ac option """
        functions = {function.__name__: function for function in (dynet.rectify, dynet.tanh)}
        functions["None"] = None
        try:
            return functions[str(arg)]
        except KeyError:
            raise argparse.ArgumentTypeError("String {} does not match required format".format(arg, ))


if __name__ == "__main__":
    main()
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/bist_parser/README.md b/bist_parser/README.md new file mode 100644 index 0000000..4147304 --- /dev/null +++ b/bist_parser/README.md @@ -0,0 +1,75 @@ +# BIST Parsers +## Graph & Transition based dependency parsers using BiLSTM feature extractors. + +The techniques behind the parser are described in the paper [Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations](https://www.transacl.org/ojs/index.php/tacl/article/viewFile/885/198). Further materials can be found [here](http://elki.cc/#/article/Simple%20and%20Accurate%20Dependency%20Parsing%20Using%20Bidirectional%20LSTM%20Feature%20Representations). + +#### Required software + + * Python 2.7 interpreter + * [DyNet library](https://github.com/clab/dynet/tree/master/python) + +#### Train a parsing model + +The software requires `training.conll` and `development.conll` files formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat). +For the faster graph-based parser change directory to `bmstparser` (1200 words/sec), and for the more accurate transition-based parser change directory to `barchybrid` (800 words/sec). The benchmark was performed on a MacBook Pro with an i7 processor. The graph-based parser achieves an accuracy of 93.8 UAS and the transition-based parser an accuracy of 94.7 UAS on the standard Penn Treebank dataset (Stanford Dependencies).
The transition-based parser requires no part-of-speech tagging and setting all the tags to NN will produce the expected accuracy. The model and param files achieving those scores are available for download ([Graph-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AADgBS9hb9vy0o-UBZW9AbbKa/bestfirstorder.tar.gz?dl=0), [Transition-based model](https://www.dropbox.com/sh/v9cbshnmb36km6v/AACEPp3DLQeJnRA_QyPmll93a/bestarchybrid.tar.gz?dl=0)). The trained models include improvements beyond those described in the paper, to be published soon. + +To train a parsing model for either parsing architecture, type the following at the command prompt: + + python src/parser.py --dynet-seed 123456789 [--dynet-mem XXXX] --outdir [results directory] --train training.conll --dev development.conll --epochs 30 --lstmdims 125 --lstmlayers 2 [--extrn extrn.vectors] --bibi-lstm + +We use the same external embedding used in [Transition-Based Dependency Parsing with Stack Long Short-Term Memory](http://arxiv.org/abs/1505.08075) which can be downloaded from the authors' [GitHub repository](https://github.com/clab/lstm-parser/) and [directly here](https://drive.google.com/file/d/0B8nESzOdPhLsdWF2S1Ayb1RkTXc/view?usp=sharing). + +If you are training a transition-based parser then for optimal results you should add the following to the command prompt `--k 3 --usehead --userl`. These switches will set the stack to 3 elements; use the BiLSTM of the head of trees on the stack as feature vectors; and add the BiLSTM of the right/leftmost children to the feature vectors. + +Note 1: You can run it without pos embeddings by setting the pos embedding dimensions to zero (--pembedding 0). + +Note 2: The reported test result is the one matching the highest development score. + +Note 3: The parser calculates (after each iteration) the accuracies excluding punctuation symbols by running the `eval.pl` script from the CoNLL-X Shared Task and stores the results in the directory specified by `--outdir`.
+ +Note 4: The external embeddings parameter is optional and is best left unused when training or predicting with a graph-based model. + +#### Parse data with your parsing model + +The command for parsing a `test.conll` file formatted according to the [CoNLL data format](http://ilk.uvt.nl/conll/#dataformat) with a previously trained model is: + + python src/parser.py --predict --outdir [results directory] --test test.conll [--extrn extrn.vectors] --model [trained model file] --params [param file generated during training] + +The parser will store the resulting conll file in the out directory (`--outdir`). + +Note 1: If you are using the arc-hybrid trained model we provided please use the `--extrn` flag and specify the location of the external embeddings file. + +Note 2: If you are using the first-order trained model we provided please do not use the `--extrn` flag. + +#### Citation + +If you make use of this software for research purposes, we would appreciate a citation of the following: + + @article{DBLP:journals/tacl/KiperwasserG16, + author = {Eliyahu Kiperwasser and Yoav Goldberg}, + title = {Simple and Accurate Dependency Parsing Using Bidirectional {LSTM} + Feature Representations}, + journal = {{TACL}}, + volume = {4}, + pages = {313--327}, + year = {2016}, + url = {https://transacl.org/ojs/index.php/tacl/article/view/885}, + timestamp = {Tue, 09 Aug 2016 14:51:09 +0200}, + biburl = {http://dblp.uni-trier.de/rec/bib/journals/tacl/KiperwasserG16}, + bibsource = {dblp computer science bibliography, http://dblp.org} + } + +#### License + +This software is released under the terms of the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+ +#### Contact + +For questions and usage issues, please contact elikip@gmail.com + +#### Credits + +[Eliyahu Kiperwasser](http://elki.cc) + +[Yoav Goldberg](https://www.cs.bgu.ac.il/~yoavg/uni/) + diff --git a/bist_parser/__init__.py b/bist_parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bist_parser/barchybrid/src/arc_hybrid.py b/bist_parser/barchybrid/src/arc_hybrid.py new file mode 100644 index 0000000..2d74fe4 --- /dev/null +++ b/bist_parser/barchybrid/src/arc_hybrid.py @@ -0,0 +1,401 @@ +from dynet import * +from utils import ParseForest, read_conll, write_conll +from operator import itemgetter +from itertools import chain +import utils, time, random +import numpy as np + + +class ArcHybridLSTM: + def __init__(self, words, pos, rels, w2i, options): + self.model = Model() + self.trainer = AdamTrainer(self.model) + random.seed(1) + + self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))} + self.activation = self.activations[options.activation] + + self.oracle = options.oracle + self.ldims = options.lstm_dims * 2 + self.wdims = options.wembedding_dims + self.pdims = options.pembedding_dims + self.rdims = options.rembedding_dims + self.layers = options.lstm_layers + self.wordsCount = words + self.vocab = {word: ind+3 for word, ind in w2i.iteritems()} + self.pos = {word: ind+3 for ind, word in enumerate(pos)} + self.rels = {word: ind for ind, word in enumerate(rels)} + self.irels = rels + + self.headFlag = options.headFlag + self.rlMostFlag = options.rlMostFlag + self.rlFlag = options.rlFlag + self.k = options.window + + self.nnvecs = (1 if self.headFlag else 0) + (2 if self.rlFlag or self.rlMostFlag else 0) + + self.external_embedding = None + if options.external_embedding is not None: + external_embedding_fp = open(options.external_embedding,'r') + external_embedding_fp.readline() + self.external_embedding = {line.split(' ')[0] : [float(f) for f in 
line.strip().split(' ')[1:]] for line in external_embedding_fp} + external_embedding_fp.close() + + self.edim = len(self.external_embedding.values()[0]) + self.noextrn = [0.0 for _ in xrange(self.edim)] + self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)} + self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim)) + for word, i in self.extrnd.iteritems(): + self.elookup.init_row(i, self.external_embedding[word]) + self.extrnd['*PAD*'] = 1 + self.extrnd['*INITIAL*'] = 2 + + print 'Load external embedding. Vector dimensions', self.edim + + dims = self.wdims + self.pdims + (self.edim if self.external_embedding is not None else 0) + self.blstmFlag = options.blstmFlag + self.bibiFlag = options.bibiFlag + + if self.bibiFlag: + self.surfaceBuilders = [VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, dims, self.ldims * 0.5, self.model)] + self.bsurfaceBuilders = [VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model), + VanillaLSTMBuilder(1, self.ldims, self.ldims * 0.5, self.model)] + elif self.blstmFlag: + if self.layers > 0: + self.surfaceBuilders = [VanillaLSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model), LSTMBuilder(self.layers, dims, self.ldims * 0.5, self.model)] + else: + self.surfaceBuilders = [SimpleRNNBuilder(1, dims, self.ldims * 0.5, self.model), LSTMBuilder(1, dims, self.ldims * 0.5, self.model)] + + self.hidden_units = options.hidden_units + self.hidden2_units = options.hidden2_units + self.vocab['*PAD*'] = 1 + self.pos['*PAD*'] = 1 + + self.vocab['*INITIAL*'] = 2 + self.pos['*INITIAL*'] = 2 + + self.wlookup = self.model.add_lookup_parameters((len(words) + 3, self.wdims)) + self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims)) + self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims)) + + self.word2lstm = self.model.add_parameters((self.ldims, self.wdims + self.pdims + (self.edim if 
self.external_embedding is not None else 0))) + self.word2lstmbias = self.model.add_parameters((self.ldims)) + self.lstm2lstm = self.model.add_parameters((self.ldims, self.ldims * self.nnvecs + self.rdims)) + self.lstm2lstmbias = self.model.add_parameters((self.ldims)) + + self.hidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1))) + self.hidBias = self.model.add_parameters((self.hidden_units)) + + self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.hid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.outLayer = self.model.add_parameters((3, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.outBias = self.model.add_parameters((3)) + + self.rhidLayer = self.model.add_parameters((self.hidden_units, self.ldims * self.nnvecs * (self.k + 1))) + self.rhidBias = self.model.add_parameters((self.hidden_units)) + + self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units)) + self.rhid2Bias = self.model.add_parameters((self.hidden2_units)) + + self.routLayer = self.model.add_parameters((2 * (len(self.irels) + 0) + 1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units)) + self.routBias = self.model.add_parameters((2 * (len(self.irels) + 0) + 1)) + + + def __evaluate(self, stack, buf, train): + topStack = [ stack.roots[-i-1].lstms if len(stack) > i else [self.empty] for i in xrange(self.k) ] + topBuffer = [ buf.roots[i].lstms if len(buf) > i else [self.empty] for i in xrange(1) ] + + input = concatenate(list(chain(*(topStack + topBuffer)))) + + if self.hidden2_units > 0: + routput = (self.routLayer.expr() * self.activation(self.rhid2Bias.expr() + self.rhid2Layer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr())) + self.routBias.expr()) + else: + routput = (self.routLayer.expr() * self.activation(self.rhidLayer.expr() * input + self.rhidBias.expr()) + self.routBias.expr()) + + if 
self.hidden2_units > 0: + output = (self.outLayer.expr() * self.activation(self.hid2Bias.expr() + self.hid2Layer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr())) + self.outBias.expr()) + else: + output = (self.outLayer.expr() * self.activation(self.hidLayer.expr() * input + self.hidBias.expr()) + self.outBias.expr()) + + scrs, uscrs = routput.value(), output.value() + + uscrs0 = uscrs[0] + uscrs1 = uscrs[1] + uscrs2 = uscrs[2] + if train: + output0 = output[0] + output1 = output[1] + output2 = output[2] + ret = [ [ (rel, 0, scrs[1 + j * 2] + uscrs1, routput[1 + j * 2 ] + output1) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [], + [ (rel, 1, scrs[2 + j * 2] + uscrs2, routput[2 + j * 2 ] + output2) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [], + [ (None, 2, scrs[0] + uscrs0, routput[0] + output0) ] if len(buf) > 0 else [] ] + else: + s1,r1 = max(zip(scrs[1::2],self.irels)) + s2,r2 = max(zip(scrs[2::2],self.irels)) + s1 += uscrs1 + s2 += uscrs2 + ret = [ [ (r1, 0, s1) ] if len(stack) > 0 and len(buf) > 0 else [], + [ (r2, 1, s2) ] if len(stack) > 1 else [], + [ (None, 2, scrs[0] + uscrs0) ] if len(buf) > 0 else [] ] + return ret + #return [ [ (rel, 0, scrs[1 + j * 2 + 0] + uscrs[1], routput[1 + j * 2 + 0] + output[1]) for j, rel in enumerate(self.irels) ] if len(stack) > 0 and len(buf) > 0 else [], + # [ (rel, 1, scrs[1 + j * 2 + 1] + uscrs[2], routput[1 + j * 2 + 1] + output[2]) for j, rel in enumerate(self.irels) ] if len(stack) > 1 else [], + # [ (None, 2, scrs[0] + uscrs[0], routput[0] + output[0]) ] if len(buf) > 0 else [] ] + + + def Save(self, filename): + self.model.save(filename) + + + def Load(self, filename): + self.model.load(filename) + + def Init(self): + evec = self.elookup[1] if self.external_embedding is not None else None + paddingWordVec = self.wlookup[1] + paddingPosVec = self.plookup[1] if self.pdims > 0 else None + + paddingVec = tanh(self.word2lstm.expr() * 
concatenate(filter(None, [paddingWordVec, paddingPosVec, evec])) + self.word2lstmbias.expr() ) + self.empty = paddingVec if self.nnvecs == 1 else concatenate([paddingVec for _ in xrange(self.nnvecs)]) + + + def getWordEmbeddings(self, sentence, train): + for root in sentence: + c = float(self.wordsCount.get(root.norm, 0)) + dropFlag = not train or (random.random() < (c/(0.25+c))) + root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if dropFlag else 0] + root.posvec = self.plookup[int(self.pos[root.pos])] if self.pdims > 0 else None + + if self.external_embedding is not None: + #if not dropFlag and random.random() < 0.5: + # root.evec = self.elookup[0] + if root.form in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.form]] + elif root.norm in self.external_embedding: + root.evec = self.elookup[self.extrnd[root.norm]] + else: + root.evec = self.elookup[0] + else: + root.evec = None + root.ivec = concatenate(filter(None, [root.wordvec, root.posvec, root.evec])) + + if self.blstmFlag: + forward = self.surfaceBuilders[0].initial_state() + backward = self.surfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + forward = forward.add_input( froot.ivec ) + backward = backward.add_input( rroot.ivec ) + froot.fvec = forward.output() + rroot.bvec = backward.output() + for root in sentence: + root.vec = concatenate( [root.fvec, root.bvec] ) + + if self.bibiFlag: + bforward = self.bsurfaceBuilders[0].initial_state() + bbackward = self.bsurfaceBuilders[1].initial_state() + + for froot, rroot in zip(sentence, reversed(sentence)): + bforward = bforward.add_input( froot.vec ) + bbackward = bbackward.add_input( rroot.vec ) + froot.bfvec = bforward.output() + rroot.bbvec = bbackward.output() + for root in sentence: + root.vec = concatenate( [root.bfvec, root.bbvec] ) + + else: + for root in sentence: + root.ivec = (self.word2lstm.expr() * root.ivec) + self.word2lstmbias.expr() + root.vec = tanh( root.ivec ) + + + 
def Predict(self, conll_path): + with open(conll_path, 'r') as conllFP: + for iSentence, sentence in enumerate(read_conll(conllFP, False)): + self.Init() + + sentence = sentence[1:] + [sentence[0]] + self.getWordEmbeddings(sentence, False) + stack = ParseForest([]) + buf = ParseForest(sentence) + + for root in sentence: + root.lstms = [root.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + while len(buf) > 0 or len(stack) > 1 : + scores = self.__evaluate(stack, buf, False) + best = max(chain(*scores), key = itemgetter(2) ) + + if best[1] == 2: + stack.roots.append(buf.roots[0]) + del buf.roots[0] + + elif best[1] == 0: + child = stack.roots.pop() + parent = buf.roots[0] + + child.pred_parent_id = parent.id + child.pred_relation = best[0] + + bestOp = 0 + if self.rlMostFlag: + parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] + if self.rlFlag: + parent.lstms[bestOp + hoffset] = child.vec + + elif best[1] == 1: + child = stack.roots.pop() + parent = stack.roots[-1] + + child.pred_parent_id = parent.id + child.pred_relation = best[0] + + bestOp = 1 + if self.rlMostFlag: + parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] + if self.rlFlag: + parent.lstms[bestOp + hoffset] = child.vec + + renew_cg() + yield [sentence[-1]] + sentence[:-1] + + + def Train(self, conll_path): + mloss = 0.0 + errors = 0 + batch = 0 + eloss = 0.0 + eerrors = 0 + lerrors = 0 + etotal = 0 + ltotal = 0 + ninf = -float('inf') + + hoffset = 1 if self.headFlag else 0 + + start = time.time() + + with open(conll_path, 'r') as conllFP: + shuffledData = list(read_conll(conllFP, True)) + random.shuffle(shuffledData) + + errs = [] + eeloss = 0.0 + + self.Init() + + for iSentence, sentence in enumerate(shuffledData): + if iSentence % 100 == 0 and iSentence != 0: + print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start + start 
= time.time() + eerrors = 0 + eloss = 0.0 + etotal = 0 + lerrors = 0 + ltotal = 0 + + sentence = sentence[1:] + [sentence[0]] + self.getWordEmbeddings(sentence, True) + stack = ParseForest([]) + buf = ParseForest(sentence) + + for root in sentence: + root.lstms = [root.vec for _ in xrange(self.nnvecs)] + + hoffset = 1 if self.headFlag else 0 + + while len(buf) > 0 or len(stack) > 1 : + scores = self.__evaluate(stack, buf, True) + scores.append([(None, 3, ninf ,None)]) + + alpha = stack.roots[:-2] if len(stack) > 2 else [] + s1 = [stack.roots[-2]] if len(stack) > 1 else [] + s0 = [stack.roots[-1]] if len(stack) > 0 else [] + b = [buf.roots[0]] if len(buf) > 0 else [] + beta = buf.roots[1:] if len(buf) > 1 else [] + + left_cost = ( len([h for h in s1 + beta if h.id == s0[0].parent_id]) + + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[0]) > 0 else 1 + right_cost = ( len([h for h in b + beta if h.id == s0[0].parent_id]) + + len([d for d in b + beta if d.parent_id == s0[0].id]) ) if len(scores[1]) > 0 else 1 + shift_cost = ( len([h for h in s1 + alpha if h.id == b[0].parent_id]) + + len([d for d in s0 + s1 + alpha if d.parent_id == b[0].id]) ) if len(scores[2]) > 0 else 1 + costs = (left_cost, right_cost, shift_cost, 1) + + bestValid = max(( s for s in chain(*scores) if costs[s[1]] == 0 and ( s[1] == 2 or s[0] == stack.roots[-1].relation ) ), key=itemgetter(2)) + bestWrong = max(( s for s in chain(*scores) if costs[s[1]] != 0 or ( s[1] != 2 and s[0] != stack.roots[-1].relation ) ), key=itemgetter(2)) + best = bestValid if ( (not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or (bestValid[2] > bestWrong[2] and random.random() > 0.1) ) else bestWrong + + if best[1] == 2: + stack.roots.append(buf.roots[0]) + del buf.roots[0] + + elif best[1] == 0: + child = stack.roots.pop() + parent = buf.roots[0] + + child.pred_parent_id = parent.id + child.pred_relation = best[0] + + bestOp = 0 + if self.rlMostFlag: + parent.lstms[bestOp + hoffset] = 
child.lstms[bestOp + hoffset] + if self.rlFlag: + parent.lstms[bestOp + hoffset] = child.vec + + elif best[1] == 1: + child = stack.roots.pop() + parent = stack.roots[-1] + + child.pred_parent_id = parent.id + child.pred_relation = best[0] + + bestOp = 1 + if self.rlMostFlag: + parent.lstms[bestOp + hoffset] = child.lstms[bestOp + hoffset] + if self.rlFlag: + parent.lstms[bestOp + hoffset] = child.vec + + if bestValid[2] < bestWrong[2] + 1.0: + loss = bestWrong[3] - bestValid[3] + mloss += 1.0 + bestWrong[2] - bestValid[2] + eloss += 1.0 + bestWrong[2] - bestValid[2] + errs.append(loss) + + if best[1] != 2 and (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation): + lerrors += 1 + if child.pred_parent_id != child.parent_id: + errors += 1 + eerrors += 1 + + etotal += 1 + + if len(errs) > 50: # or True: + #eerrs = ((esum(errs)) * (1.0/(float(len(errs))))) + eerrs = esum(errs) + scalar_loss = eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + errs = [] + lerrs = [] + + renew_cg() + self.Init() + + if len(errs) > 0: + eerrs = (esum(errs)) # * (1.0/(float(len(errs)))) + eerrs.scalar_value() + eerrs.backward() + self.trainer.update() + + errs = [] + lerrs = [] + + renew_cg() + + self.trainer.update_epoch() + print "Loss: ", mloss/iSentence diff --git a/bist_parser/barchybrid/src/parser.py b/bist_parser/barchybrid/src/parser.py new file mode 100644 index 0000000..8ddbe95 --- /dev/null +++ b/bist_parser/barchybrid/src/parser.py @@ -0,0 +1,76 @@ +from optparse import OptionParser +from arc_hybrid import ArcHybridLSTM +import pickle, utils, os, time, sys + +if __name__ == '__main__': + parser = OptionParser() + parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/PTB_SD_3_3_0/train.conll") + parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/PTB_SD_3_3_0/dev.conll") + parser.add_option("--test", 
dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/PTB_SD_3_3_0/test.conll") + parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle") + parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE") + parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="barchybrid.model") + parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100) + parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25) + parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25) + parser.add_option("--epochs", type="int", dest="epochs", default=30) + parser.add_option("--hidden", type="int", dest="hidden_units", default=100) + parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0) + parser.add_option("--k", type="int", dest="window", default=3) + parser.add_option("--lr", type="float", dest="learning_rate", default=0.1) + parser.add_option("--outdir", type="string", dest="output", default="results") + parser.add_option("--activation", type="string", dest="activation", default="tanh") + parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2) + parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=200) + parser.add_option("--dynet-seed", type="int", dest="seed", default=7) + parser.add_option("--disableoracle", action="store_false", dest="oracle", default=True) + parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True) + parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False) + parser.add_option("--usehead", action="store_true", dest="headFlag", default=False) + parser.add_option("--userlmost", action="store_true", dest="rlFlag", default=False) + parser.add_option("--userl", action="store_true", dest="rlMostFlag", default=False) + 
parser.add_option("--predict", action="store_true", dest="predictFlag", default=False) + parser.add_option("--dynet-mem", type="int", dest="cnn_mem", default=512) + + (options, args) = parser.parse_args() + print 'Using external embedding:', options.external_embedding + + if not options.predictFlag: + if not (options.rlFlag or options.rlMostFlag or options.headFlag): + print 'You must use either --userlmost or --userl or --usehead (you can use multiple)' + sys.exit() + + print 'Preparing vocab' + words, w2i, pos, rels = utils.vocab(options.conll_train) + + with open(os.path.join(options.output, options.params), 'w') as paramsfp: + pickle.dump((words, w2i, pos, rels, options), paramsfp) + print 'Finished collecting vocab' + + print 'Initializing blstm arc hybrid:' + parser = ArcHybridLSTM(words, pos, rels, w2i, options) + + for epoch in xrange(options.epochs): + print 'Starting epoch', epoch + parser.Train(options.conll_train) + devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll') + utils.write_conll(devpath, parser.Predict(options.conll_dev)) + os.system('perl src/utils/eval.pl -g ' + options.conll_dev + ' -s ' + devpath + ' > ' + devpath + '.txt &') + print 'Finished predicting dev' + parser.Save(os.path.join(options.output, options.model + str(epoch+1))) + else: + with open(options.params, 'r') as paramsfp: + words, w2i, pos, rels, stored_opt = pickle.load(paramsfp) + + stored_opt.external_embedding = options.external_embedding + + parser = ArcHybridLSTM(words, pos, rels, w2i, stored_opt) + parser.Load(options.model) + tespath = os.path.join(options.output, 'test_pred.conll') + ts = time.time() + pred = list(parser.Predict(options.conll_test)) + te = time.time() + utils.write_conll(tespath, pred) + os.system('perl src/utils/eval.pl -g ' + options.conll_test + ' -s ' + tespath + ' > ' + tespath + '.txt &') + print 'Finished predicting test',te-ts + diff --git a/bist_parser/barchybrid/src/utils.py b/bist_parser/barchybrid/src/utils.py new 
file mode 100644 index 0000000..7b21851 --- /dev/null +++ b/bist_parser/barchybrid/src/utils.py @@ -0,0 +1,114 @@ +from collections import Counter +import re + + +class ConllEntry: + def __init__(self, id, form, pos, cpos, parent_id=None, relation=None): + self.id = id + self.form = form + self.norm = normalize(form) + self.cpos = cpos.upper() + self.pos = pos.upper() + self.parent_id = parent_id + self.relation = relation + + +class ParseForest: + def __init__(self, sentence): + self.roots = list(sentence) + + for root in self.roots: + root.children = [] + root.scores = None + root.parent = None + root.pred_parent_id = 0 # None + root.pred_relation = 'rroot' # None + root.vecs = None + root.lstms = None + + def __len__(self): + return len(self.roots) + + + def Attach(self, parent_index, child_index): + parent = self.roots[parent_index] + child = self.roots[child_index] + + child.pred_parent_id = parent.id + del self.roots[child_index] + + +def isProj(sentence): + forest = ParseForest(sentence) + unassigned = {entry.id: sum([1 for pentry in sentence if pentry.parent_id == entry.id]) for entry in sentence} + + for _ in xrange(len(sentence)): + for i in xrange(len(forest.roots) - 1): + if forest.roots[i].parent_id == forest.roots[i+1].id and unassigned[forest.roots[i].id] == 0: + unassigned[forest.roots[i+1].id]-=1 + forest.Attach(i+1, i) + break + if forest.roots[i+1].parent_id == forest.roots[i].id and unassigned[forest.roots[i+1].id] == 0: + unassigned[forest.roots[i].id]-=1 + forest.Attach(i, i+1) + break + + return len(forest.roots) == 1 + +def vocab(conll_path): + wordsCount = Counter() + posCount = Counter() + relCount = Counter() + + with open(conll_path, 'r') as conllFP: + for sentence in read_conll(conllFP, True): + wordsCount.update([node.norm for node in sentence]) + posCount.update([node.pos for node in sentence]) + relCount.update([node.relation for node in sentence]) + + return (wordsCount, {w: i for i, w in enumerate(wordsCount.keys())}, 
posCount.keys(), relCount.keys()) + +def read_conll(fh, proj): + dropped = 0 + read = 0 + root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', 0, 'rroot') + tokens = [root] + for line in fh: + tok = line.strip().split() + if not tok: + if len(tokens)>1: + if not proj or isProj(tokens): + yield tokens + else: + print 'Non-projective sentence dropped' + dropped += 1 + read += 1 + tokens = [root] + id = 0 + else: + tokens.append(ConllEntry(int(tok[0]), tok[1], tok[4], tok[3], int(tok[6]) if tok[6] != '_' else -1, tok[7])) + if len(tokens) > 1: + yield tokens + + print dropped, 'dropped non-projective sentences.' + print read, 'sentences read.' + + +def write_conll(fn, conll_gen): + with open(fn, 'w') as fh: + for sentence in conll_gen: + for entry in sentence[1:]: + fh.write('\t'.join([str(entry.id), entry.form, '_', entry.cpos, entry.pos, '_', str(entry.pred_parent_id), entry.pred_relation, '_', '_'])) + fh.write('\n') + fh.write('\n') + + +numberRegex = re.compile("[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+"); +def normalize(word): + return 'NUM' if numberRegex.match(word) else word.lower() + +cposTable = {"PRP$": "PRON", "VBG": "VERB", "VBD": "VERB", "VBN": "VERB", ",": ".", "''": ".", "VBP": "VERB", "WDT": "DET", "JJ": "ADJ", "WP": "PRON", "VBZ": "VERB", + "DT": "DET", "#": ".", "RP": "PRT", "$": ".", "NN": "NOUN", ")": ".", "(": ".", "FW": "X", "POS": "PRT", ".": ".", "TO": "PRT", "PRP": "PRON", "RB": "ADV", + ":": ".", "NNS": "NOUN", "NNP": "NOUN", "``": ".", "WRB": "ADV", "CC": "CONJ", "LS": "X", "PDT": "DET", "RBS": "ADV", "RBR": "ADV", "CD": "NUM", "EX": "DET", + "IN": "ADP", "WP$": "PRON", "MD": "VERB", "NNPS": "NOUN", "JJS": "ADJ", "JJR": "ADJ", "SYM": "X", "VB": "VERB", "UH": "X", "ROOT-POS": "ROOT-CPOS", + "-LRB-": ".", "-RRB-": "."} diff --git a/bist_parser/barchybrid/src/utils/eval.pl b/bist_parser/barchybrid/src/utils/eval.pl new file mode 100644 index 0000000..3db9837 --- /dev/null +++ b/bist_parser/barchybrid/src/utils/eval.pl @@ -0,0 +1,1826 @@ 
+#!/usr/bin/env perl + +# Author: Yuval Krymolowski +# Addition of precision and recall +# and of frame confusion list: Sabine Buchholz +# Addition of DEPREL + ATTACHMENT: +# Prokopis Prokopidis (prokopis at ilsp dot gr) +# Acknowledgements: +# to Markus Kuhn for suggesting the use of +# the Unicode category property + +if ($] < 5.008001) +{ + printf STDERR < -s + + This script evaluates a system output with respect to a gold standard. + Both files should be in UTF-8 encoded CoNLL-X tabular format. + + Punctuation tokens (those where all characters have the Unicode + category property "Punctuation") are ignored for scoring (unless the + -p flag is used). + + The output breaks down the errors according to their type and context. + + Optional parameters: + -o FILE : output: print output to FILE (default is standard output) + -q : quiet: only print overall performance, without the details + -b : evalb: produce output in a format similar to evalb + (http://nlp.cs.nyu.edu/evalb/); use together with -q + -p : punctuation: also score on punctuation (default is not to score on it) + -v : version: show the version number + -h : help: print this help text and exit + +EOT +; + +my ($line_num) ; +my ($sep) = '0x01' ; + +my ($START) = '.S' ; +my ($END) = '.E' ; + +my ($con_err_num) = 3 ; +my ($freq_err_num) = 10 ; +my ($spec_err_loc_con) = 8 ; + +################################################################################ +### subfunctions ### +################################################################################ + +# Whether a string consists entirely of characters with the Unicode +# category property "Punctuation" (see "man perlunicode") +sub is_uni_punct +{ + my ($word) = @_ ; + + return scalar(Encode::decode_utf8($word)=~ /^\p{Punctuation}+$/) ; +} + +# The length of a unicode string, excluding non-spacing marks +# (for example vowel marks in Arabic) + +sub uni_len +{ + my ($word) = @_ ; + my ($ch, $l) ; + + $l = 0 ; + foreach $ch (split(//, 
Encode::decode_utf8($word))) + { + if ($ch !~ /^\p{NonspacingMark}/) + { + $l++ ; + } + } + + return $l ; +} + +sub filter_context_counts +{ # filter_context_counts + + my ($vec, $num, $max_len) = @_ ; + my ($con, $l, $thresh) ; + + $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ; + + foreach $con (keys %{$vec}) + { + if (${$vec}{$con} < $thresh) + { + delete ${$vec}{$con} ; + next ; + } + + $l = uni_len($con) ; + + if ($l > ${$max_len}) + { + ${$max_len} = $l ; + } + } + +} # filter_context_counts + +sub print_context +{ # print_context + + my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ; + my (@v_con, @v_con_pos, $con, $con_pos, $i, $n) ; + + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ; + printf OUT " ||" ; + printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ; + printf OUT "\n" ; + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + @v_con = sort {${$counts}{tot}{$b} <=> ${$counts}{tot}{$a}} keys %{${$counts}{tot}} ; + @v_con_pos = sort {${$counts_pos}{tot}{$b} <=> ${$counts_pos}{tot}{$a}} keys %{${$counts_pos}{tot}} ; + + $n = scalar @v_con ; + if (scalar @v_con_pos > $n) + { + $n = scalar @v_con_pos ; + } + + foreach $i (0 .. 
$n-1) + { + if (defined $v_con_pos[$i]) + { + $con_pos = $v_con_pos[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_pos_len, $con_pos, ${$counts_pos}{tot}{$con_pos}, + ${$counts_pos}{err_head}{$con_pos}, ${$counts_pos}{err_dep}{$con_pos}, + ${$counts_pos}{err_dep}{$con_pos}+${$counts_pos}{err_head}{$con_pos}-${$counts_pos}{tot}{$con_pos} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT " ||" ; + + if (defined $v_con[$i]) + { + $con = $v_con[$i] ; + printf OUT " %-*s | %4d | %4d | %4d | %4d", + $max_con_len+length($con)-uni_len($con), $con, ${$counts}{tot}{$con}, + ${$counts}{err_head}{$con}, ${$counts}{err_dep}{$con}, + ${$counts}{err_dep}{$con}+${$counts}{err_head}{$con}-${$counts}{tot}{$con} ; + } + else + { + printf OUT " %-*s | %4s | %4s | %4s | %4s", + $max_con_len, ' ', ' ', ' ', ' ', ' ' ; + } + + printf OUT "\n" ; + } + + printf OUT " %s-+------+------+------+-----", '-' x $max_con_pos_len; + printf OUT "--++" ; + printf OUT "--%s-+------+------+------+-----", '-' x $max_con_len; + printf OUT "\n" ; + + printf OUT "\n\n" ; + +} # print_context + +sub num_as_word +{ + my ($num) = @_ ; + + $num = abs($num) ; + + if ($num == 1) + { + return ('one word') ; + } + elsif ($num == 2) + { + return ('two words') ; + } + elsif ($num == 3) + { + return ('three words') ; + } + elsif ($num == 4) + { + return ('four words') ; + } + else + { + return ($num.' 
words') ; + } +} + +sub describe_err +{ # describe_err + + my ($head_err, $head_aft_bef, $dep_err) = @_ ; + my ($dep_g, $dep_s, $desc) ; + my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ; + + if ($head_err eq '-') + { + $desc = 'correct head' ; + + if ($head_aft_bef_s eq '0') + { + $desc .= ' (0)' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= ' (the focus word)' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= ' (after the focus word)' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= ' (before the focus word)' ; + } + } + elsif ($head_aft_bef_s eq '0') + { + $desc = 'head = 0 instead of ' ; + if ($head_aft_bef_g eq 'a') + { + $desc.= 'after ' ; + } + if ($head_aft_bef_g eq 'b') + { + $desc.= 'before ' ; + } + $desc .= 'the focus word' ; + } + elsif ($head_aft_bef_g eq '0') + { + $desc = 'head is ' ; + if ($head_aft_bef_g eq 'a') + { + $desc.= 'after ' ; + } + if ($head_aft_bef_g eq 'b') + { + $desc.= 'before ' ; + } + $desc .= 'the focus word instead of 0' ; + } + else + { + $desc = num_as_word($head_err) ; + if ($head_err < 0) + { + $desc .= ' before' ; + } + else + { + $desc .= ' after' ; + } + + $desc = 'head '.$desc.' 
the correct head ' ; + + if ($head_aft_bef_s eq '0') + { + $desc .= '(0' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= '(the focus word' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= '(after the focus word' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= '(before the focus word' ; + } + + if ($head_aft_bef_g ne $head_aft_bef_s) + { + $desc .= ' instead of' ; + if ($head_aft_bef_s eq '0') + { + $desc .= '0' ; + } + elsif ($head_aft_bef_s eq 'e') + { + $desc .= 'the focus word' ; + } + elsif ($head_aft_bef_s eq 'a') + { + $desc .= 'after the focus word' ; + } + elsif ($head_aft_bef_s eq 'b') + { + $desc .= 'before the focus word' ; + } + } + + $desc .= ')' ; + } + + $desc .= ', ' ; + + if ($dep_err eq '-') + { + $desc .= 'correct dependency' ; + } + else + { + ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ; + $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ; + } + + return($desc) ; + +} # describe_err + +sub get_context +{ # get_context + + my ($sent, $i_w) = @_ ; + my ($w_2, $w_1, $w1, $w2) ; + my ($p_2, $p_1, $p1, $p2) ; + + if ($i_w >= 2) + { + $w_2 = ${${$sent}[$i_w-2]}{word} ; + $p_2 = ${${$sent}[$i_w-2]}{pos} ; + } + else + { + $w_2 = $START ; + $p_2 = $START ; + } + + if ($i_w >= 1) + { + $w_1 = ${${$sent}[$i_w-1]}{word} ; + $p_1 = ${${$sent}[$i_w-1]}{pos} ; + } + else + { + $w_1 = $START ; + $p_1 = $START ; + } + + if ($i_w <= scalar @{$sent}-2) + { + $w1 = ${${$sent}[$i_w+1]}{word} ; + $p1 = ${${$sent}[$i_w+1]}{pos} ; + } + else + { + $w1 = $END ; + $p1 = $END ; + } + + if ($i_w <= scalar @{$sent}-3) + { + $w2 = ${${$sent}[$i_w+2]}{word} ; + $p2 = ${${$sent}[$i_w+2]}{pos} ; + } + else + { + $w2 = $END ; + $p2 = $END ; + } + + return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ; + +} # get_context + +sub read_sent +{ # read_sent + + my ($sent_gold, $sent_sys) = @_ ; + my ($line_g, $line_s, $new_sent) ; + my (%fields_g, %fields_s) ; + + $new_sent = 1 ; + + @{$sent_gold} = () ; + @{$sent_sys} = () ; + + while (1) + { 
# main reading loop + + $line_g = ; + $line_s = ; + + $line_num++ ; + + # system output has fewer lines than gold standard + if ((defined $line_g) && (! defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : past end of file\n" ; + exit(1) ; + } + + # system output has more lines than gold standard + if ((! defined $line_g) && (defined $line_s)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: past end of file\n" ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of file reached for both + if ((! defined $line_g) && (! defined $line_s)) + { + return (1) ; + } + + # one contains end of sentence but other one does not + if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/)) + { + printf STDERR "line mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + exit(1) ; + } + + # end of sentence reached + if ($line_g =~ /^\s+$/) + { + return(0) ; + } + + # now both lines contain information + + if ($new_sent) + { + $new_sent = 0 ; + } + + # 'official' column names + # options.output = ['id','form','lemma','cpostag','postag', + # 'feats','head','deprel','phead','pdeprel'] + + @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ; + + push @{$sent_gold}, { %fields_g } ; + + @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ; + + if (($fields_g{word} ne $fields_s{word}) + || + ($fields_g{pos} ne $fields_s{pos})) + { + printf STDERR "Word/pos mismatch, line %d:\n", $line_num ; + printf STDERR " gold: %s", $line_g ; + printf STDERR " sys : %s", $line_s ; + #exit(1) ; + } + + push @{$sent_sys}, { %fields_s } ; + + } # main reading loop + +} # read_sent + +################################################################################ +### main ### +################################################################################ + +our 
($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ; + +my ($sent_num, $eof, $word_num, @err_sent) ; +my (@sent_gold, @sent_sys, @starts) ; +my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ; +my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ; +my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ; +my ($loc_con, %loc_con_err_counts, %err_desc) ; +my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ; +my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ; +my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ; +my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ; +my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ; +my (%freq_err, $err) ; + +my ($i, $j, $i_w, $l, $n_args) ; +my ($w_2, $w_1, $w1, $w2) ; +my ($wp_2, $wp_1, $wp1, $wp2) ; +my ($p_2, $p_1, $p1, $p2) ; + +my ($short_output) ; +my ($score_on_punct) ; +$counts{punct} = 0; # initialize + +getopts("g:o:s:qvhpb") ; + +if (defined $opt_v) +{ + my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $'; + my @parts = split ' ',$id; + print "Version $parts[2]\n"; + exit(0); +} + +if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s))) +{ + die $usage ; +} + +if (! defined $opt_g) +{ + die "Gold standard file (-g) missing\n" ; +} + +if (! defined $opt_s) +{ + die "System output file (-s) missing\n" ; +} + +if (! 
defined $opt_o) +{ + $opt_o = '-' ; +} + +if (defined $opt_q) +{ + $short_output = 1 ; +} else { + $short_output = 0 ; +} + +if (defined $opt_p) +{ + $score_on_punct = 1 ; +} else { + $score_on_punct = 0 ; +} + +$line_num = 0 ; +$sent_num = 0 ; +$eof = 0 ; + +@err_sent = () ; +@starts = () ; + +%{$err_sent[0]} = () ; + +$max_pos_len = length('CPOS') ; + +################################################################################ +### reading input ### +################################################################################ + +open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ; +open (SYS, "<$opt_s") || die "Could not open system output file $opt_s\n" ; +open (OUT, ">$opt_o") || die "Could not open output file $opt_o\n" ; + + +if (defined $opt_b) { # produce output similar to evalb + print OUT " Sent. Attachment Correct Scoring \n"; + print OUT " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n"; + print OUT " ============================================================================\n"; +} + + +while (! $eof) +{ # main reading loop + + $starts[$sent_num] = $line_num+1 ; + $eof = read_sent(\@sent_gold, \@sent_sys) ; + + $sent_num++ ; + + %{$err_sent[$sent_num]} = () ; + $word_num = scalar @sent_gold ; + + # for accuracy per sentence + my %sent_counts = ( tot => 0, + err_any => 0, + err_head => 0 + ); + + # printf "$sent_num $word_num\n" ; + + my @frames_g = ('** '); # the initial frame for the virtual root + my @frames_s = ('** '); # the initial frame for the virtual root + foreach $i_w (0 .. $word_num-1) + { # loop on words + push @frames_g, ''; # initialize + push @frames_s, ''; # initialize + } + + foreach $i_w (0 .. $word_num-1) + { # loop on words + + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + $wp = $word.' / '.$pos ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! 
$score_on_punct) && is_uni_punct($word)) + { + $counts{punct}++ ; + # ignore punctuations + next ; + } + + if (length($pos) > $max_pos_len) + { + $max_pos_len = length($pos) ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $counts{tot}++ ; + $counts{word}{$wp}{tot}++ ; + $counts{pos}{$pos}{tot}++ ; + $counts{head}{$head_g-$i_w-1}{tot}++ ; + + # for frame confusions + # add child to frame of parent + $frames_g[$head_g] .= "$dep_g "; + $frames_s[$head_s] .= "$dep_s "; + # add to frame of token itself + $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero + $frames_s[$i_w+1] .= "*$dep_g* "; + + # for precision and recall of DEPREL + $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels + $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions + $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels + $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ... + $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output + + # for precision and recall of HEAD direction + my $dir_g; + if ($head_g == 0) { + $dir_g = 'to_root'; + } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero + # also below + $dir_g = 'left'; + } elsif ($head_g > $i_w+1) { + $dir_g = 'right'; + } else { + # token links to itself; should never happen in correct gold standard + $dir_g = 'self'; + } + my $dir_s; + if ($head_s == 0) { + $dir_s = 'to_root'; + } elsif ($head_s < $i_w+1) { + $dir_s = 'left'; + } elsif ($head_s > $i_w+1) { + $dir_s = 'right'; + } else { + # token links to itself; should not happen in good system + # (but not forbidden in shared task) + $dir_s = 'self'; + } + $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction + $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions + $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction + + # for precision and recall of HEAD distance + my $dist_g; + if ($head_g == 0) { + $dist_g = 'to_root'; + } 
elsif ( abs($head_g - ($i_w+1)) <= 1 ) { + $dist_g = '1'; # includes the 'self' cases + } elsif ( abs($head_g - ($i_w+1)) <= 2 ) { + $dist_g = '2'; + } elsif ( abs($head_g - ($i_w+1)) <= 6 ) { + $dist_g = '3-6'; + } else { + $dist_g = '7-...'; + } + my $dist_s; + if ($head_s == 0) { + $dist_s = 'to_root'; + } elsif ( abs($head_s - ($i_w+1)) <= 1 ) { + $dist_s = '1'; # includes the 'self' cases + } elsif ( abs($head_s - ($i_w+1)) <= 2 ) { + $dist_s = '2'; + } elsif ( abs($head_s - ($i_w+1)) <= 6 ) { + $dist_s = '3-6'; + } else { + $dist_s = '7-...'; + } + $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance + $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions + $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance + + + $err_head = ($head_g ne $head_s) ; # error in head + $err_dep = ($dep_g ne $dep_s) ; # error in deprel + + $head_err = '-' ; + $dep_err = '-' ; + + # for accuracy per sentence + $sent_counts{tot}++ ; + if ($err_dep || $err_head) { + $sent_counts{err_any}++ ; + } + if ($err_head) { + $sent_counts{err_head}++ ; + } + + # total counts and counts for CPOS involved in errors + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 
'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + + $err_sent[$sent_num]{head}++ ; + $counts{err_head}{tot}++ ; + $counts{err_head}{$head_err}++ ; + + $counts{word}{err_head}{$wp}++ ; + $counts{pos}{$pos}{err_head}{tot}++ ; + $counts{pos}{$pos}{err_head}{$head_err}++ ; + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + $err_sent[$sent_num]{dep}++ ; + $counts{err_dep}{tot}++ ; + $counts{err_dep}{$dep_err}++ ; + + $counts{word}{err_dep}{$wp}++ ; + $counts{pos}{$pos}{err_dep}{tot}++ ; + $counts{pos}{$pos}{err_dep}{$dep_err}++ ; + + if ($err_head) + { + $counts{err_both}++ ; + $counts{pos}{$pos}{err_both}++ ; + } + } + + ### DEPREL + ATTACHMENT + if ((!$err_dep) && ($err_head)) { + $counts{err_head_corr_dep}{tot}++ ; + $counts{err_head_corr_dep}{$dep_s}++ ; + } + ### DEPREL + ATTACHMENT + + # counts for words involved in errors + + if (! ($err_head || $err_dep)) + { + next ; + } + + $err_sent[$sent_num]{word}++ ; + $counts{err_any}++ ; + $counts{word}{err_any}{$wp}++ ; + $counts{pos}{$pos}{err_any}++ ; + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + if ($w_2 ne $START) + { + $wp_2 = $w_2.' / '.$p_2 ; + } + else + { + $wp_2 = $w_2 ; + } + + if ($w_1 ne $START) + { + $wp_1 = $w_1.' / '.$p_1 ; + } + else + { + $wp_1 = $w_1 ; + } + + if ($w1 ne $END) + { + $wp1 = $w1.' / '.$p1 ; + } + else + { + $wp1 = $w1 ; + } + + if ($w2 ne $END) + { + $wp2 = $w2.' / '.$p2 ; + } + else + { + $wp2 = $w2 ; + } + + $con_bef = $wp_1 ; + $con_bef_2 = $wp_2.' + '.$wp_1 ; + $con_aft = $wp1 ; + $con_aft_2 = $wp1.' 
+ '.$wp2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + if ($w_1 ne $START) + { + # do not count '.S' as a word context + $counts{con_bef_2}{tot}{$con_bef_2}++ ; + $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ; + $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ; + $counts{con_bef}{tot}{$con_bef}++ ; + $counts{con_bef}{err_head}{$con_bef} += $err_head ; + $counts{con_bef}{err_dep}{$con_bef} += $err_dep ; + } + + if ($w1 ne $END) + { + # do not count '.E' as a word context + $counts{con_aft_2}{tot}{$con_aft_2}++ ; + $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ; + $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ; + $counts{con_aft}{tot}{$con_aft}++ ; + $counts{con_aft}{err_head}{$con_aft} += $err_head ; + $counts{con_aft}{err_dep}{$con_aft} += $err_dep ; + } + + $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ; + $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ; + $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ; + $counts{con_pos_bef}{tot}{$con_pos_bef}++ ; + $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ; + $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ; + + $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ; + $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ; + $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ; + $counts{con_pos_aft}{tot}{$con_pos_aft}++ ; + $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ; + $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ; + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + $freq_err{$err}++ ; + + } # loop on words + + foreach $i_w (0 .. 
$word_num) # including one for the virtual root + { # loop on words + if ($frames_g[$i_w] ne $frames_s[$i_w]) { + $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ; + } + } + + if (defined $opt_b) { # produce output similar to evalb + if ($word_num > 0) { + my ($unlabeled,$labeled) = ('NaN', 'NaN'); + if ($sent_counts{tot} > 0) { # there are scoring tokens + $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot}; + $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot}; + } + printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n", + $sent_num, $word_num, + $unlabeled, $labeled, + $sent_counts{tot}-$sent_counts{err_head}, + $sent_counts{tot}-$sent_counts{err_any}, + $sent_counts{tot},; + } + } + +} # main reading loop + +################################################################################ +### printing output ### +################################################################################ + +if (defined $opt_b) { # produce output similar to evalb + print OUT "\n\n"; +} +printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ; +printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ; +printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ; + +if ($short_output) +{ + exit(0) ; +} +printf OUT "\n %s\n\n", '=' x 80 ; +printf OUT " Evaluation of the results in %s\n vs. 
gold standard %s:\n\n", $opt_s, $opt_g ; + +printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ; + +printf OUT " Number of non-scoring tokens: $counts{punct}\n\n"; + +printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Accuracy', 'words', 'right', 'right', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + ' ', ' ', 'head', ' dep', 'right' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! defined($counts{pos}{$pos}{err_head}{tot})) + { + $counts{pos}{$pos}{err_head}{tot} = 0 ; + } + if (! defined($counts{pos}{$pos}{err_dep}{tot})) + { + $counts{pos}{$pos}{err_dep}{tot} = 0 ; + } + if (! 
defined($counts{pos}{$pos}{err_any})) + { + $counts{pos}{$pos}{err_any} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ; +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT "\n\n" ; + +printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Error', 'words', 'head', ' dep', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + + 'Rate', ' ', 'err', ' err', 'wrong' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot}, + $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! 
defined($counts{pos}{$pos}{err_both})) + { + $counts{pos}{$pos}{err_both} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ; + +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +### added by Sabine Buchholz +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +### DEPREL + ATTACHMENT: +### Same as Sabine's DEPREL apart from $tot_corr calculation +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + if (defined($counts{err_head_corr_dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep}; + } else { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} +### DEPREL + ATTACHMENT + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD direction\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dir ('to_root', 'left', 'right', 'self') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dir2}{$dir}{$dir})) { + $tot_corr = $counts{dir2}{$dir}{$dir}; + } + if (defined($counts{dir_g}{$dir}{tot})) { + $tot_g = $counts{dir_g}{$dir}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dir_s}{$dir}{tot})) { + $tot_s = $counts{dir_s}{$dir}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD distance\n\n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dist ('to_root', '1', '2', '3-6', '7-...') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dist2}{$dist}{$dist})) { + $tot_corr = $counts{dist2}{$dist}{$dist}; + } + if (defined($counts{dist_g}{$dist}{tot})) { + $tot_g = $counts{dist_g}{$dist}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dist_s}{$dist}{tot})) { + $tot_s = $counts{dist_s}{$dist}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n"; +foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}}) +{ + if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later) + { + printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame; + } +} +### end of: added by Sabine Buchholz + + +# +# Leave only the 5 words mostly involved in errors +# + + +$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ; + +# ensure enough space for title +$max_word_len = length('word') ; + +foreach $word (keys %{$counts{word}{err_any}}) +{ + if ($counts{word}{err_any}{$word} < $thresh) + { + delete $counts{word}{err_any}{$word} ; + next ; + } + + $l = uni_len($word) ; + if ($l > $max_word_len) + { + $max_word_len = $l ; + } +} + +# filter a case when the difference between the error counts +# for 2-word and 1-word contexts is small +# (leave the 2-word context) + +foreach $con (keys %{$counts{con_aft_2}{tot}}) +{ + ($w1) = split(/\+/, $con) ; + + if (defined $counts{con_aft}{tot}{$w1} && + 
$counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1) + { + delete $counts{con_aft}{tot}{$w1} ; + } +} + +foreach $con (keys %{$counts{con_bef_2}{tot}}) +{ + ($w_2, $w_1) = split(/\+/, $con) ; + + if (defined $counts{con_bef}{tot}{$w_1} && + $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1) + { + delete $counts{con_bef}{tot}{$w_1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + ($p1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_aft}{tot}{$p1}) && + $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_aft}{tot}{$p1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + ($p_2, $p_1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_bef}{tot}{$p_1}) && + $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_bef}{tot}{$p_1} ; + } +} + +# for each context type, take the three contexts most involved in errors + +$max_con_len = 0 ; + +filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ; + +# for each CPOS context type, take the three CPOS contexts most involved in errors + +$max_con_pos_len = 0 ; + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef}{tot}}) +{ + if 
($counts{con_pos_bef}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft}{tot}}) +{ + if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +# printing + +# ------------- focus words + +printf OUT "\n\n" ; +printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ; + +printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ; +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}}) +{ + if (!defined($counts{word}{err_head}{$word})) + { + $counts{word}{err_head}{$word} = 0 ; + } + if (! defined($counts{word}{err_dep}{$word})) + { + $counts{word}{err_dep}{$word} = 0 ; + } + if (! 
defined($counts{word}{err_any}{$word})) + { + $counts{word}{err_any}{$word} = 0; + } + printf OUT " %-*s | %4d | %4d | %4d | %4d\n", + $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word}, + $counts{word}{err_head}{$word}, + $counts{word}{err_dep}{$word}, + $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ; +} + +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +# ------------- contexts + +printf OUT "\n\n" ; + +printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ; + +printf OUT " one-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ; + +# ------------- Sentences + +printf OUT " Sentence with the highest number of word errors:\n" ; +$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word}) + <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of head errors:\n" ; +$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) + <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. 
$sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of dependency errors:\n" ; +$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) + <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +# +# Second pass, collect statistics of the frequent errors +# + +# filter the errors, leave the most frequent $freq_err_num errors + +$i = 0 ; + +$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ; + +foreach $err (keys %freq_err) +{ + if ($freq_err{$err} < $thresh) + { + delete $freq_err{$err} ; + } +} + +# in case there are several errors with the threshold count + +$freq_err_num = scalar keys %freq_err ; + +%err_counts = () ; + +$eof = 0 ; + +seek (GOLD, 0, 0) ; +seek (SYS, 0, 0) ; + +while (! $eof) +{ # second reading loop + + $eof = read_sent(\@sent_gold, \@sent_sys) ; + $sent_num++ ; + + $word_num = scalar @sent_gold ; + + # printf "$sent_num $word_num\n" ; + + foreach $i_w (0 .. $word_num-1) + { # loop on words + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! $score_on_punct) && is_uni_punct($word)) + { + # ignore punctuations + next ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $err_head = ($head_g ne $head_s) ; + $err_dep = ($dep_g ne $dep_s) ; + + $head_err = '-' ; + $dep_err = '-' ; + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 
'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + } + + if (! ($err_head || $err_dep)) + { + next ; + } + + # handle only the most frequent errors + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + + if (! exists $freq_err{$err}) + { + next ; + } + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + $con_bef = $w_1 ; + $con_bef_2 = $w_2.' + '.$w_1 ; + $con_aft = $w1 ; + $con_aft_2 = $w1.' + '.$w2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ; + + # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n", + # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ; + + @bits = (0, 0, 0, 0, 0, 0) ; + $j = 0 ; + + while ($j == 0) + { + for ($i = 0; $i <= $#bits; $i++) + { + if ($bits[$i] == 0) + { + $bits[$i] = 1 ; + $j = 0 ; + last ; + } + else + { + $bits[$i] = 0 ; + $j = 1 ; + } + } + + @e_bits = @cur_err ; + + for ($i = 0; $i <= $#bits; $i++) + { + if (! 
$bits[$i]) + { + $e_bits[$i] = '*' ; + } + } + + # include also the last case which is the most general + # (wildcards for everything) + $err_counts{$err}{join($sep, @e_bits)}++ ; + + } + + } # loop on words +} # second reading loop + +printf OUT "\n\n" ; +printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ; +printf OUT "\n %s\n", '=' x 41 ; + + +# deleting local contexts which are too general + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + @cur_err = split(/\Q$sep\E/, $loc_con) ; + + # In this loop, one or two elements of the local context are + # replaced with '*' to make it more general. If the entry for + # the general context has the same count it is removed. + + foreach $i (0 .. $#cur_err) + { + $w1 = $cur_err[$i] ; + if ($cur_err[$i] eq '*') + { + next ; + } + $cur_err[$i] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + for ($j = $i+1; $j <=$#cur_err; $j++) + { + if ($cur_err[$j] eq '*') + { + next ; + } + $w2 = $cur_err[$j] ; + $cur_err[$j] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + $cur_err[$j] = $w2 ; + } + $cur_err[$i] = $w1 ; + } + } +} + +# Leaving only the topmost local contexts for each error + +foreach $err (keys %err_counts) +{ + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ; + + # of the threshold is too low, take the 2nd highest count + # (the highest may be the total which is the generic case + # and not relevant for printing) + + if ($thresh < 5) + { + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ; + } + + 
foreach $loc_con (keys %{$err_counts{$err}}) + { + if ($err_counts{$err}{$loc_con} < $thresh) + { + delete $err_counts{$err}{$loc_con} ; + } + else + { + if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*'))) + { + $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ; + } + } + } +} + +# printing an error summary + +# calculating the context field length + +$max_word_spec_len= length('word') ; +$max_con_aft_len = length('word') ; +$max_con_bef_len = length('word') ; +$max_con_pos_len = length('CPOS') ; + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort keys %{$err_counts{$err}}) + { + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $loc_con) ; + + $l = uni_len($word) ; + if ($l > $max_word_spec_len) + { + $max_word_spec_len = $l ; + } + + $l = uni_len($con_bef) ; + if ($l > $max_con_bef_len) + { + $max_con_bef_len = $l ; + } + + $l = uni_len($con_aft) ; + if ($l > $max_con_aft_len) + { + $max_con_aft_len = $l ; + } + + if (length($con_pos_aft) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_aft) ; + } + + if (length($con_pos_bef) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_bef) ; + } + } +} + +$err_counter = 0 ; + +foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err) +{ + + ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ; + + $err_counter++ ; + $err_desc{$err} = sprintf("%2d. ", $err_counter). 
+ describe_err($head_err, $head_aft_bef, $dep_err) ; + + # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ; + printf OUT "\n" ; + printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s | %s\n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After', + 'Count' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*'))) + { + next ; + } + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft, + $err_counts{$err}{$loc_con} ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + +} + +printf OUT "\n\n" ; +printf 
OUT " Local contexts involved in several frequent errors:" ; +printf OUT "\n %s\n", '=' x 51 ; +printf OUT "\n\n" ; + +foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=> + scalar keys %{$loc_con_err_counts{$a}}} + keys %loc_con_err_counts) +{ + + if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1) + { + next ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s \n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=> + $loc_con_err_counts{$loc_con}{$a}} + keys %{$loc_con_err_counts{$loc_con}}) + { + printf OUT " %s : %d times\n", 
def parse_proj(scores, gold=None):
    '''
    Parse using Eisner's algorithm.

    Finds the highest-scoring projective dependency tree rooted at index 0.

    - scores is a square (N+1)-by-(N+1) array (or nested sequence) where
      scores[h, m] is the score of the arc from head h to modifier m and
      index 0 is the root.
    - gold, if given, is a sequence with gold[m] being the gold head of
      word m. It is used for cost augmentation: every arc that is not in
      the gold tree gets +1.0 added to its score. With gold=None the +1.0
      is added to every arc.

    Returns a list of N+1 heads where heads[m] is the predicted head of
    word m and heads[0] is -1.

    Raises ValueError if scores is not a non-empty square matrix.
    '''
    scores = np.asarray(scores)
    if scores.ndim != 2 or scores.shape[0] != scores.shape[1] or scores.shape[0] == 0:
        raise ValueError("scores must be a squared matrix with nw+1 rows")

    N = scores.shape[0] - 1  # Number of words (excluding root).

    # Initialize CKY table; every table is indexed by s, t, direction (right=1).
    complete = np.zeros([N+1, N+1, 2])
    incomplete = np.zeros([N+1, N+1, 2])
    complete_backtrack = -np.ones([N+1, N+1, 2], dtype=int)
    incomplete_backtrack = -np.ones([N+1, N+1, 2], dtype=int)

    incomplete[0, :, 0] -= np.inf

    # Loop from smaller items to larger items.
    for k in range(1, N+1):
        for s in range(N-k+1):
            t = s + k

            # First, create incomplete items. Both directions join the same
            # pair of complete sub-spans, so that sum is computed once.
            halves = complete[s, s:t, 1] + complete[(s+1):(t+1), t, 0]

            # left tree (arc t -> s)
            left_cost = 0.0 if gold is not None and gold[s] == t else 1.0
            incomplete_vals0 = halves + scores[t, s] + left_cost
            incomplete[s, t, 0] = np.max(incomplete_vals0)
            incomplete_backtrack[s, t, 0] = s + np.argmax(incomplete_vals0)
            # right tree (arc s -> t)
            right_cost = 0.0 if gold is not None and gold[t] == s else 1.0
            incomplete_vals1 = halves + scores[s, t] + right_cost
            incomplete[s, t, 1] = np.max(incomplete_vals1)
            incomplete_backtrack[s, t, 1] = s + np.argmax(incomplete_vals1)

            # Second, create complete items.
            # left tree
            complete_vals0 = complete[s, s:t, 0] + incomplete[s:t, t, 0]
            complete[s, t, 0] = np.max(complete_vals0)
            complete_backtrack[s, t, 0] = s + np.argmax(complete_vals0)
            # right tree
            complete_vals1 = incomplete[s, (s+1):(t+1), 1] + complete[(s+1):(t+1), t, 1]
            complete[s, t, 1] = np.max(complete_vals1)
            complete_backtrack[s, t, 1] = s + 1 + np.argmax(complete_vals1)

    heads = [-1] * (N + 1)

    # Backtrack from the complete right-facing span covering the whole
    # sentence. An explicit stack is used instead of recursion so that long
    # sentences cannot exceed Python's recursion limit. Each head is written
    # exactly once, so the visiting order does not matter.
    pending = [(0, N, 1, True)]
    while pending:
        s, t, direction, is_complete = pending.pop()
        if s == t:
            continue
        if is_complete:
            r = int(complete_backtrack[s, t, direction])
            if direction == 0:
                pending.append((s, r, 0, True))
                pending.append((r, t, 0, False))
            else:
                pending.append((s, r, 1, False))
                pending.append((r, t, 1, True))
        else:
            r = int(incomplete_backtrack[s, t, direction])
            if direction == 0:
                heads[s] = t
            else:
                heads[t] = s
            pending.append((s, r, 1, True))
            pending.append((r + 1, t, 0, True))

    return heads
def __init__(self, vocab, pos, rels, w2i, options):
    """Create the DyNet model, lookup tables and scoring layers.

    - vocab is stored as self.wordsCount and is also used (via len) to
      size the word lookup table.
    - pos and rels are sequences of POS tags and relation labels.
    - w2i maps words to indices; the indices are shifted by 3 so that
      '*PAD*' (1) and '*INITIAL*' (2) can be reserved.
    - options carries the hyper-parameters read below (dimensions, flags,
      activation name and the optional external embedding path).

    Raises ValueError if an external embedding file is given but holds
    no vectors.
    """
    self.model = Model()
    random.seed(1)
    self.trainer = AdamTrainer(self.model)

    self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify, 'tanh3': (lambda x: tanh(cwise_multiply(cwise_multiply(x, x), x)))}
    self.activation = self.activations[options.activation]

    self.blstmFlag = options.blstmFlag
    self.labelsFlag = options.labelsFlag
    self.costaugFlag = options.costaugFlag
    self.bibiFlag = options.bibiFlag

    self.ldims = options.lstm_dims
    self.wdims = options.wembedding_dims
    self.pdims = options.pembedding_dims
    self.rdims = options.rembedding_dims
    self.layers = options.lstm_layers
    self.wordsCount = vocab
    self.vocab = {word: ind+3 for word, ind in w2i.items()}
    self.pos = {word: ind+3 for ind, word in enumerate(pos)}
    self.rels = {word: ind for ind, word in enumerate(rels)}
    self.irels = rels

    self.external_embedding, self.edim = None, 0
    if options.external_embedding is not None:
        # The context manager closes the file even if a line fails to parse.
        with open(options.external_embedding, 'r') as external_embedding_fp:
            external_embedding_fp.readline()  # first line is skipped, not parsed
            self.external_embedding = {
                line.split(' ')[0]: [float(f) for f in line.strip().split(' ')[1:]]
                for line in external_embedding_fp
                if line.strip()  # a blank line would create a bogus empty vector
            }

        if not self.external_embedding:
            raise ValueError('no vectors found in external embedding file: %s' % options.external_embedding)

        # dict.values() is a view on Python 3 and cannot be indexed.
        self.edim = len(next(iter(self.external_embedding.values())))
        self.noextrn = [0.0 for _ in range(self.edim)]
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.items():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print('Load external embedding. Vector dimensions', self.edim)

    # Input to the first recurrent layer: word + POS + external embedding.
    input_dims = self.wdims + self.pdims + self.edim
    if self.bibiFlag:
        self.builders = [VanillaLSTMBuilder(1, input_dims, self.ldims, self.model),
                         VanillaLSTMBuilder(1, input_dims, self.ldims, self.model)]
        self.bbuilders = [VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                          VanillaLSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
    elif self.layers > 0:
        self.builders = [VanillaLSTMBuilder(self.layers, input_dims, self.ldims, self.model),
                         VanillaLSTMBuilder(self.layers, input_dims, self.ldims, self.model)]
    else:
        self.builders = [SimpleRNNBuilder(1, input_dims, self.ldims, self.model),
                         SimpleRNNBuilder(1, input_dims, self.ldims, self.model)]

    self.hidden_units = options.hidden_units
    self.hidden2_units = options.hidden2_units

    self.vocab['*PAD*'] = 1
    self.pos['*PAD*'] = 1

    self.vocab['*INITIAL*'] = 2
    self.pos['*INITIAL*'] = 2

    self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
    self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
    self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

    # Arc scorer: head and modifier projections, optional second hidden layer.
    self.hidLayerFOH = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hidLayerFOM = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hidBias = self.model.add_parameters((self.hidden_units))

    self.hid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
    self.hid2Bias = self.model.add_parameters((self.hidden2_units))

    self.outLayer = self.model.add_parameters((1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))

    if self.labelsFlag:
        # Label scorer: same layout as the arc scorer, one output per relation.
        self.rhidLayerFOH = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhidLayerFOM = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhidBias = self.model.add_parameters((self.hidden_units))

        self.rhid2Layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.rhid2Bias = self.model.add_parameters((self.hidden2_units))

        self.routLayer = self.model.add_parameters((len(self.irels), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
        self.routBias = self.model.add_parameters((len(self.irels)))
def Predict(self, conll_path):
    """Parse every sentence of a CoNLL file and yield the annotated sentences.

    For each sentence read from conll_path the entries are embedded, run
    through the recurrent layers (when self.blstmFlag is set), scored and
    decoded with decoder.parse_proj. Each entry receives pred_parent_id
    and pred_relation ('_' unless self.labelsFlag is set). The computation
    graph is renewed before the sentence is yielded.
    """
    with open(conll_path, 'r') as conllFP:
        for sentence in read_conll(conllFP):
            for entry in sentence:
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0))] if self.wdims > 0 else None
                posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
                evec = None
                if self.external_embedding is not None:
                    # Rows of elookup are indexed by self.extrnd (see
                    # __init__), not by the word vocabulary.
                    ext_id = self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0))
                    evec = self.elookup[int(ext_id)]
                entry.vec = concatenate([v for v in (wordvec, posvec, evec) if v is not None])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None

                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstmFlag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()

                # Walk the sentence from both ends at once.
                for entry, rentry in zip(sentence, reversed(sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()

                if self.bibiFlag:
                    # Second recurrent layer fed with the first layer's outputs.
                    for entry in sentence:
                        entry.vec = concatenate(entry.lstms)

                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()

                    for entry, rentry in zip(sentence, reversed(sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

            scores, exprs = self.__evaluate(sentence, True)
            heads = decoder.parse_proj(scores)

            for entry, head in zip(sentence, heads):
                entry.pred_parent_id = head
                entry.pred_relation = '_'

            if self.labelsFlag:
                # Index 0 is skipped, so modifier positions start at 1.
                for modifier, head in enumerate(heads[1:], 1):
                    label_scores, _ = self.__evaluateLabel(sentence, head, modifier)
                    best_label = max(enumerate(label_scores), key=itemgetter(1))[0]
                    sentence[modifier].pred_relation = self.irels[best_label]

            renew_cg()
            yield sentence
def Train(self, conll_path):
    """Run one training epoch over the sentences of a CoNLL file.

    Sentences are read from conll_path, shuffled, and processed one at a
    time. Each wrongly attached token and each label whose gold score does
    not beat the best wrong label by a margin of 1 contributes a loss
    term; the parameters are updated after every sentence that produced
    loss terms. Progress is printed every 100 sentences and the
    per-sentence attachment error count is printed at the end.
    """
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()

    with open(conll_path, 'r') as conllFP:
        shuffledData = list(read_conll(conllFP))
    random.shuffle(shuffledData)

    for iSentence, sentence in enumerate(shuffledData):
        if iSentence % 100 == 0 and iSentence != 0:
            print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start)
            start = time.time()
            eerrors = 0
            eloss = 0.0
            etotal = 0

        for entry in sentence:
            # Word dropout: a word with count c is kept with probability
            # c / (0.25 + c), otherwise index 0 is used.
            c = float(self.wordsCount.get(entry.norm, 0))
            dropFlag = (random.random() < (c/(0.25+c)))
            wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None
            posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
            evec = None

            if self.external_embedding is not None:
                # Rows of elookup are indexed by self.extrnd (see
                # __init__), not by the word vocabulary.
                keep_external = dropFlag or (random.random() < 0.5)
                ext_id = self.extrnd.get(entry.form, self.extrnd.get(entry.norm, 0)) if keep_external else 0
                evec = self.elookup[int(ext_id)]
            entry.vec = concatenate([v for v in (wordvec, posvec, evec) if v is not None])

            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None

            entry.rheadfov = None
            entry.rmodfov = None

        if self.blstmFlag:
            lstm_forward = self.builders[0].initial_state()
            lstm_backward = self.builders[1].initial_state()

            # Walk the sentence from both ends at once.
            for entry, rentry in zip(sentence, reversed(sentence)):
                lstm_forward = lstm_forward.add_input(entry.vec)
                lstm_backward = lstm_backward.add_input(rentry.vec)

                entry.lstms[1] = lstm_forward.output()
                rentry.lstms[0] = lstm_backward.output()

            if self.bibiFlag:
                # Second recurrent layer fed with the first layer's outputs.
                for entry in sentence:
                    entry.vec = concatenate(entry.lstms)

                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()

                for entry, rentry in zip(sentence, reversed(sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)

                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()

        scores, exprs = self.__evaluate(sentence, True)
        gold = [entry.parent_id for entry in sentence]
        heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)

        errs = []
        lerrs = []

        if self.labelsFlag:
            # Labels are trained on the gold arcs; index 0 is skipped.
            for modifier, head in enumerate(gold[1:], 1):
                rscores, rexprs = self.__evaluateLabel(sentence, head, modifier)
                goldLabelInd = self.rels[sentence[modifier].relation]
                wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                    lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])

        e = sum(1 for h, g in zip(heads[1:], gold[1:]) if h != g)
        eerrors += e
        if e > 0:
            errs.extend((exprs[h][i] - exprs[g][i]) for i, (h, g) in enumerate(zip(heads, gold)) if h != g)
            eloss += e
            mloss += e

        etotal += len(sentence)

        # One update per sentence; nothing is carried over to the next one.
        if errs or lerrs:
            eerrs = esum(errs + lerrs)
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

        renew_cg()

    self.trainer.update_epoch()
    # Average over the number of sentences; guards against an empty file.
    print("Loss: ", mloss / max(len(shuffledData), 1))
+ + errs = [] + lerrs = [] + eeloss = 0.0 + + for iSentence, sentence in enumerate(shuffledData): + if iSentence % 100 == 0 and iSentence != 0: + print('Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Time', time.time()-start) + start = time.time() + eerrors = 0 + eloss = 0.0 + etotal = 0 + lerrors = 0 + ltotal = 0 + + for entry in sentence: + c = float(self.wordsCount.get(entry.norm, 0)) + dropFlag = (random.random() < (c/(0.25+c))) + wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if dropFlag else 0] if self.wdims > 0 else None + posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None + evec = None + + if self.external_embedding is not None: + evec = self.elookup[self.vocab.get(entry.form, self.vocab.get(entry.norm, 0)) if (dropFlag or (random.random() < 0.5)) else 0] + entry.vec = concatenate(list(filter(None, [wordvec, posvec, evec]))) + + entry.lstms = [entry.vec, entry.vec] + entry.headfov = None + entry.modfov = None + + entry.rheadfov = None + entry.rmodfov = None + + if self.blstmFlag: + lstm_forward = self.builders[0].initial_state() + lstm_backward = self.builders[1].initial_state() + + for entry, rentry in zip(sentence, reversed(sentence)): + lstm_forward = lstm_forward.add_input(entry.vec) + lstm_backward = lstm_backward.add_input(rentry.vec) + + entry.lstms[1] = lstm_forward.output() + rentry.lstms[0] = lstm_backward.output() + + if self.bibiFlag: + for entry in sentence: + entry.vec = concatenate(entry.lstms) + + blstm_forward = self.bbuilders[0].initial_state() + blstm_backward = self.bbuilders[1].initial_state() + + for entry, rentry in zip(sentence, reversed(sentence)): + blstm_forward = blstm_forward.add_input(entry.vec) + blstm_backward = blstm_backward.add_input(rentry.vec) + + entry.lstms[1] = blstm_forward.output() + rentry.lstms[0] = blstm_backward.output() + + scores, exprs = self.__evaluate(sentence, True) + gold = [entry.parent_id for entry in sentence] 
if __name__ == '__main__':
    # Imported here because the evaluation helper below is only needed
    # when the file is run as a script.
    import subprocess

    def run_eval(gold_path, system_path):
        """Run eval.pl on a prediction file and write its report to <system_path>.txt.

        The paths are passed as an argument list, so spaces or shell
        metacharacters in them are harmless. Failures are reported but do
        not abort the run.
        """
        command = ['perl', 'src/util_scripts/eval.pl', '-g', gold_path, '-s', system_path]
        try:
            with open(system_path + '.txt', 'w') as report:
                status = subprocess.call(command, stdout=report)
        except OSError as err:
            print('Could not run eval.pl:', err)
            return
        if status != 0:
            print('eval.pl exited with status', status)

    option_parser = OptionParser()
    option_parser.add_option("--train", dest="conll_train", help="Annotated CONLL train file", metavar="FILE", default="../data/en-universal-train.conll.ptb")
    option_parser.add_option("--dev", dest="conll_dev", help="Annotated CONLL dev file", metavar="FILE", default="../data/en-universal-dev.conll.ptb")
    option_parser.add_option("--test", dest="conll_test", help="Annotated CONLL test file", metavar="FILE", default="../data/en-universal-test.conll.ptb")
    option_parser.add_option("--extrn", dest="external_embedding", help="External embeddings", metavar="FILE")
    option_parser.add_option("--params", dest="params", help="Parameters file", metavar="FILE", default="params.pickle")
    option_parser.add_option("--model", dest="model", help="Load/Save model file", metavar="FILE", default="neuralfirstorder.model")
    option_parser.add_option("--wembedding", type="int", dest="wembedding_dims", default=100)
    option_parser.add_option("--pembedding", type="int", dest="pembedding_dims", default=25)
    option_parser.add_option("--rembedding", type="int", dest="rembedding_dims", default=25)
    option_parser.add_option("--epochs", type="int", dest="epochs", default=30)
    option_parser.add_option("--hidden", type="int", dest="hidden_units", default=100)
    option_parser.add_option("--hidden2", type="int", dest="hidden2_units", default=0)
    option_parser.add_option("--lr", type="float", dest="learning_rate", default=0.1)
    option_parser.add_option("--outdir", type="string", dest="output", default="results")
    option_parser.add_option("--activation", type="string", dest="activation", default="tanh")
    option_parser.add_option("--lstmlayers", type="int", dest="lstm_layers", default=2)
    option_parser.add_option("--lstmdims", type="int", dest="lstm_dims", default=125)
    option_parser.add_option("--disableblstm", action="store_false", dest="blstmFlag", default=True)
    option_parser.add_option("--disablelabels", action="store_false", dest="labelsFlag", default=True)
    option_parser.add_option("--predict", action="store_true", dest="predictFlag", default=False)
    option_parser.add_option("--bibi-lstm", action="store_true", dest="bibiFlag", default=False)
    option_parser.add_option("--disablecostaug", action="store_false", dest="costaugFlag", default=True)
    option_parser.add_option("--dynet-seed", type="int", dest="seed", default=0)
    option_parser.add_option("--dynet-mem", type="int", dest="mem", default=0)

    (options, args) = option_parser.parse_args()

    print('Using external embedding:', options.external_embedding)

    # Every output file below is written into this directory.
    if not os.path.isdir(options.output):
        os.makedirs(options.output)

    if options.predictFlag:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load parameter files produced by a trusted training run.
        # NOTE(review): training writes the parameters to
        # <outdir>/<params>, while this reads <params> as given.
        with open(options.params, 'rb') as paramsfp:
            words, w2i, pos, rels, stored_opt = pickle.load(paramsfp)

        stored_opt.external_embedding = options.external_embedding

        print('Initializing lstm mstparser:')
        mst_parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, stored_opt)

        mst_parser.Load(options.model)
        tespath = os.path.join(options.output, 'test_pred.conll')

        ts = time.time()
        test_res = list(mst_parser.Predict(options.conll_test))
        te = time.time()
        print('Finished predicting test.', te-ts, 'seconds.')
        utils.write_conll(tespath, test_res)

        run_eval(options.conll_test, tespath)
    else:
        print('Preparing vocab')
        words, w2i, pos, rels = utils.vocab(options.conll_train)

        with open(os.path.join(options.output, options.params), 'wb') as paramsfp:
            pickle.dump((words, w2i, pos, rels, options), paramsfp)
        print('Finished collecting vocab')

        print('Initializing lstm mstparser:')
        mst_parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, options)

        for epoch in range(options.epochs):
            print('Starting epoch', epoch)
            mst_parser.Train(options.conll_train)
            devpath = os.path.join(options.output, 'dev_epoch_' + str(epoch+1) + '.conll')
            utils.write_conll(devpath, mst_parser.Predict(options.conll_dev))
            mst_parser.Save(os.path.join(options.output, os.path.basename(options.model) + str(epoch+1)))
            run_eval(options.conll_dev, devpath)
# Whether a string consists entirely of characters with the Unicode
# category property "Punctuation" (see "man perlunicode").
# The argument is a UTF-8 encoded string; it is decoded before matching.
sub is_uni_punct
{
  my $bytes = shift ;
  my $decoded = Encode::decode_utf8($bytes) ;

  return scalar($decoded =~ /^\p{Punctuation}+$/) ;
}

# The length of a unicode string, excluding non-spacing marks
# (for example vowel marks in Arabic).
# The argument is a UTF-8 encoded string; it is decoded before counting.

sub uni_len
{
  my $bytes = shift ;
  my $decoded = Encode::decode_utf8($bytes) ;

  # assigning the match list to () in scalar context yields the number
  # of characters that are not non-spacing marks
  my $visible = () = $decoded =~ /\P{NonspacingMark}/g ;

  return $visible ;
}
+# Trim a context-count table down to its most frequent entries.
+#
+#   $vec     : reference to a hash of context string => count; every entry
+#              whose count is below that of the $num-th most frequent entry
+#              is deleted in place (entries tied with the cut-off are kept)
+#   $num     : rank that defines the cut-off count
+#   $max_len : reference to a scalar; raised to the uni_len() of the longest
+#              surviving context when that exceeds its current value
+sub filter_context_counts
+{ # filter_context_counts
+
+  my ($vec, $num, $max_len) = @_ ;
+  my ($con, $l, $thresh) ;
+
+  $thresh = (sort {$b <=> $a} values %{$vec})[$num-1] ;
+
+  # With fewer than $num entries the slice yields undef.  Nothing has to be
+  # dropped in that case, so use 0 instead of comparing counts with undef.
+  if (! defined $thresh)
+  {
+    $thresh = 0 ;
+  }
+
+  foreach $con (keys %{$vec})
+  {
+    if (${$vec}{$con} < $thresh)
+    {
+      delete ${$vec}{$con} ;
+      next ;
+    }
+
+    $l = uni_len($con) ;
+
+    if ($l > ${$max_len})
+    {
+      ${$max_len} = $l ;
+    }
+  }
+
+} # filter_context_counts
+
+# Print one two-column error table to OUT: CPOS contexts on the left,
+# word contexts on the right, each sorted by descending {tot} count.
+#
+#   $counts          : hash ref with sub-hashes {tot}, {err_head}, {err_dep},
+#                      each keyed by word context
+#   $counts_pos      : same layout, keyed by CPOS context
+#   $max_con_len     : column width for word contexts
+#   $max_con_pos_len : column width for CPOS contexts
+#
+# The last column is err_dep + err_head - tot (inclusion-exclusion; this is
+# the number of tokens with both errors provided {tot} only counts tokens
+# that have at least one error -- verify against the caller).
+sub print_context
+{ # print_context
+
+  my ($counts, $counts_pos, $max_con_len, $max_con_pos_len) = @_ ;
+  my (@v_con, @v_con_pos, $con, $con_pos, $i, $n, $rule) ;
+
+  # horizontal rule, printed below the header and below the last row
+  $rule = sprintf(" %s-+------+------+------+-----", '-' x $max_con_pos_len)
+        . "--++"
+        . sprintf("--%s-+------+------+------+-----", '-' x $max_con_len)
+        . "\n" ;
+
+  printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_pos_len, 'CPOS', 'any', 'head', 'dep', 'both' ;
+  printf OUT " ||" ;
+  printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s", $max_con_len, 'word', 'any', 'head', 'dep', 'both' ;
+  printf OUT "\n" ;
+  printf OUT "%s", $rule ;
+
+  @v_con = sort {$counts->{tot}{$b} <=> $counts->{tot}{$a}} keys %{$counts->{tot}} ;
+  @v_con_pos = sort {$counts_pos->{tot}{$b} <=> $counts_pos->{tot}{$a}} keys %{$counts_pos->{tot}} ;
+
+  # the table has as many rows as the longer of the two lists
+  $n = scalar @v_con ;
+  if (scalar @v_con_pos > $n)
+  {
+    $n = scalar @v_con_pos ;
+  }
+
+  foreach $i (0 .. $n-1)
+  {
+    if (defined $v_con_pos[$i])
+    {
+      $con_pos = $v_con_pos[$i] ;
+      printf OUT " %-*s | %4d | %4d | %4d | %4d",
+        $max_con_pos_len, $con_pos, $counts_pos->{tot}{$con_pos},
+        $counts_pos->{err_head}{$con_pos}, $counts_pos->{err_dep}{$con_pos},
+        $counts_pos->{err_dep}{$con_pos}+$counts_pos->{err_head}{$con_pos}-$counts_pos->{tot}{$con_pos} ;
+    }
+    else
+    {
+      # this side has run out of entries: pad with blanks
+      printf OUT " %-*s | %4s | %4s | %4s | %4s",
+        $max_con_pos_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT " ||" ;
+
+    if (defined $v_con[$i])
+    {
+      $con = $v_con[$i] ;
+      # printf pads by length(); widen the field by the difference between
+      # length() and uni_len() so that the columns still line up
+      printf OUT " %-*s | %4d | %4d | %4d | %4d",
+        $max_con_len+length($con)-uni_len($con), $con, $counts->{tot}{$con},
+        $counts->{err_head}{$con}, $counts->{err_dep}{$con},
+        $counts->{err_dep}{$con}+$counts->{err_head}{$con}-$counts->{tot}{$con} ;
+    }
+    else
+    {
+      printf OUT " %-*s | %4s | %4s | %4s | %4s",
+        $max_con_len, ' ', ' ', ' ', ' ', ' ' ;
+    }
+
+    printf OUT "\n" ;
+  }
+
+  printf OUT "%s", $rule ;
+
+  printf OUT "\n\n" ;
+
+} # print_context
+
+# Spell out a distance in words: 'one word' .. 'four words' for an
+# absolute value of 1 .. 4, otherwise the absolute value followed by
+# ' words'.
+sub num_as_word
+{
+  my ($num) = @_ ;
+  my (@spelled) = ('one word', 'two words', 'three words', 'four words') ;
+  my ($i) ;
+
+  $num = abs($num) ;
+
+  foreach $i (1 .. 4)
+  {
+    if ($num == $i)
+    {
+      return ($spelled[$i-1]) ;
+    }
+  }
+
+  return ($num.' words') ;
+}
+
+# Build the human-readable description of one error type.
+#
+#   $head_err     : '-' when the head is correct, otherwise the signed
+#                   offset of the system head from the gold head
+#   $head_aft_bef : two characters, gold code followed by system code;
+#                   '0' = head is 0, 'e' = the focus word itself,
+#                   'a' = after the focus word, 'b' = before the focus word
+#   $dep_err      : '-' when the label is correct, otherwise 'gold->system'
+#
+# Returns the description string.
+sub describe_err
+{ # describe_err
+
+  my ($head_err, $head_aft_bef, $dep_err) = @_ ;
+  my ($dep_g, $dep_s, $desc) ;
+  my ($head_aft_bef_g, $head_aft_bef_s) = split(//, $head_aft_bef) ;
+
+  # wording for every known position code
+  my (%where) = ('0' => '0',
+                 'e' => 'the focus word',
+                 'a' => 'after the focus word',
+                 'b' => 'before the focus word') ;
+
+  if ($head_err eq '-')
+  {
+    $desc = 'correct head' ;
+
+    if (exists $where{$head_aft_bef_s})
+    {
+      $desc .= ' ('.$where{$head_aft_bef_s}.')' ;
+    }
+  }
+  elsif ($head_aft_bef_s eq '0')
+  {
+    $desc = 'head = 0 instead of ' ;
+    if ($head_aft_bef_g eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_g eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word' ;
+  }
+  elsif ($head_aft_bef_g eq '0')
+  {
+    # The gold head is 0 here, so the direction to report is the one of
+    # the system head.  (Testing the gold code again, as was done before,
+    # can never match 'a' or 'b' in this branch.)
+    $desc = 'head is ' ;
+    if ($head_aft_bef_s eq 'a')
+    {
+      $desc.= 'after ' ;
+    }
+    if ($head_aft_bef_s eq 'b')
+    {
+      $desc.= 'before ' ;
+    }
+    $desc .= 'the focus word instead of 0' ;
+  }
+  else
+  {
+    $desc = num_as_word($head_err) ;
+    if ($head_err < 0)
+    {
+      $desc .= ' before' ;
+    }
+    else
+    {
+      $desc .= ' after' ;
+    }
+
+    $desc = 'head '.$desc.' the correct head ' ;
+
+    # where the system put the head ...
+    if (exists $where{$head_aft_bef_s})
+    {
+      $desc .= '('.$where{$head_aft_bef_s} ;
+    }
+
+    # ... and, when that differs, where the gold standard has it
+    # (this used to repeat the system position, glued to 'instead of')
+    if ($head_aft_bef_g ne $head_aft_bef_s)
+    {
+      $desc .= ' instead of' ;
+      if (exists $where{$head_aft_bef_g})
+      {
+        $desc .= ' '.$where{$head_aft_bef_g} ;
+      }
+    }
+
+    $desc .= ')' ;
+  }
+
+  $desc .= ', ' ;
+
+  if ($dep_err eq '-')
+  {
+    $desc .= 'correct dependency' ;
+  }
+  else
+  {
+    ($dep_g, $dep_s) = ($dep_err =~ /^(.*)->(.*)$/) ;
+    $desc .= sprintf('dependency "%s" instead of "%s"', $dep_s, $dep_g) ;
+  }
+
+  return($desc) ;
+
+} # describe_err
+
+# Return the words and CPOS tags of the two tokens before and the two
+# tokens after position $i_w (0-based) in the sentence @{$sent}.
+# Positions before the first token yield $START, positions past the last
+# token yield $END.
+#
+# Returns ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2).
+sub get_context
+{ # get_context
+
+  my ($sent, $i_w) = @_ ;
+  my ($w_2, $w_1, $w1, $w2) ;
+  my ($p_2, $p_1, $p1, $p2) ;
+  my ($last) = scalar @{$sent} - 1 ;
+
+  ($w_2, $p_2) = ($i_w >= 2)
+               ? @{${$sent}[$i_w-2]}{'word', 'pos'} : ($START, $START) ;
+
+  ($w_1, $p_1) = ($i_w >= 1)
+               ? @{${$sent}[$i_w-1]}{'word', 'pos'} : ($START, $START) ;
+
+  ($w1, $p1) = ($i_w <= $last-1)
+             ? @{${$sent}[$i_w+1]}{'word', 'pos'} : ($END, $END) ;
+
+  ($w2, $p2) = ($i_w <= $last-2)
+             ? @{${$sent}[$i_w+2]}{'word', 'pos'} : ($END, $END) ;
+
+  return ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) ;
+
+} # get_context
+
+# Read the next sentence from the GOLD and SYS file handles in parallel.
+#
+#   $sent_gold, $sent_sys : array refs; emptied, then filled with one hash
+#                           per token holding the keys word, pos, head, dep
+#
+# Returns 0 when a whitespace-only line ended the sentence and 1 when both
+# files are exhausted.  Increments the file-level $line_num for every pair
+# of lines read.  Exits with status 1 if one file ends before the other or
+# if only one of them has a sentence boundary on the current line.
+sub read_sent
+{ # read_sent
+
+  my ($sent_gold, $sent_sys) = @_ ;
+  my ($line_g, $line_s) ;
+  my (%fields_g, %fields_s) ;
+
+  @{$sent_gold} = () ;
+  @{$sent_sys} = () ;
+
+  while (1)
+  { # main reading loop
+
+    $line_g = <GOLD> ;
+    $line_s = <SYS> ;
+
+    $line_num++ ;
+
+    # system output has fewer lines than gold standard
+    if ((defined $line_g) && (! defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : past end of file\n" ;
+      exit(1) ;
+    }
+
+    # system output has more lines than gold standard
+    if ((! defined $line_g) && (defined $line_s))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: past end of file\n" ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of file reached for both
+    if ((! defined $line_g) && (! defined $line_s))
+    {
+      return (1) ;
+    }
+
+    # one contains end of sentence but other one does not
+    if (($line_g =~ /^\s+$/) != ($line_s =~ /^\s+$/))
+    {
+      printf STDERR "line mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+      exit(1) ;
+    }
+
+    # end of sentence reached
+    if ($line_g =~ /^\s+$/)
+    {
+      return(0) ;
+    }
+
+    # now both lines contain information
+
+    # 'official' column names
+    # options.output = ['id','form','lemma','cpostag','postag',
+    # 'feats','head','deprel','phead','pdeprel']
+
+    @fields_g{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_g))[1, 3, 6, 7] ;
+
+    push @{$sent_gold}, { %fields_g } ;
+
+    @fields_s{'word', 'pos', 'head', 'dep'} = (split (/\s+/, $line_s))[1, 3, 6, 7] ;
+
+    # a differing word or CPOS tag is only reported, it does not stop
+    # the evaluation
+    if (($fields_g{word} ne $fields_s{word})
+        ||
+        ($fields_g{pos} ne $fields_s{pos}))
+    {
+      printf STDERR "Word/pos mismatch, line %d:\n", $line_num ;
+      printf STDERR " gold: %s", $line_g ;
+      printf STDERR " sys : %s", $line_s ;
+    }
+
+    push @{$sent_sys}, { %fields_s } ;
+
+  } # main reading loop
+
+} # read_sent
+
+################################################################################
+### main ###
+################################################################################
+
+our
($opt_g, $opt_s, $opt_o, $opt_h, $opt_v, $opt_q, $opt_p, $opt_b) ; + +my ($sent_num, $eof, $word_num, @err_sent) ; +my (@sent_gold, @sent_sys, @starts) ; +my ($word, $pos, $wp, $head_g, $dep_g, $head_s, $dep_s) ; +my (%counts, $err_head, $err_dep, $con, $con1, $con_pos, $con_pos1, $thresh) ; +my ($head_err, $dep_err, @cur_err, %err_counts, $err_counter, $err_desc) ; +my ($loc_con, %loc_con_err_counts, %err_desc) ; +my ($head_aft_bef_g, $head_aft_bef_s, $head_aft_bef) ; +my ($con_bef, $con_aft, $con_bef_2, $con_aft_2, @bits, @e_bits, @v_con, @v_con_pos) ; +my ($con_pos_bef, $con_pos_aft, $con_pos_bef_2, $con_pos_aft_2) ; +my ($max_word_len, $max_pos_len, $max_con_len, $max_con_pos_len) ; +my ($max_word_spec_len, $max_con_bef_len, $max_con_aft_len) ; +my (%freq_err, $err) ; + +my ($i, $j, $i_w, $l, $n_args) ; +my ($w_2, $w_1, $w1, $w2) ; +my ($wp_2, $wp_1, $wp1, $wp2) ; +my ($p_2, $p_1, $p1, $p2) ; + +my ($short_output) ; +my ($score_on_punct) ; +$counts{punct} = 0; # initialize + +getopts("g:o:s:qvhpb") ; + +if (defined $opt_v) +{ + my $id = '$Id: eval.pl,v 1.9 2006/05/09 20:30:01 yuval Exp $'; + my @parts = split ' ',$id; + print "Version $parts[2]\n"; + exit(0); +} + +if ((defined $opt_h) || ((! defined $opt_g) && (! defined $opt_s))) +{ + die $usage ; +} + +if (! defined $opt_g) +{ + die "Gold standard file (-g) missing\n" ; +} + +if (! defined $opt_s) +{ + die "System output file (-s) missing\n" ; +} + +if (! 
defined $opt_o) +{ + $opt_o = '-' ; +} + +if (defined $opt_q) +{ + $short_output = 1 ; +} else { + $short_output = 0 ; +} + +if (defined $opt_p) +{ + $score_on_punct = 1 ; +} else { + $score_on_punct = 0 ; +} + +$line_num = 0 ; +$sent_num = 0 ; +$eof = 0 ; + +@err_sent = () ; +@starts = () ; + +%{$err_sent[0]} = () ; + +$max_pos_len = length('CPOS') ; + +################################################################################ +### reading input ### +################################################################################ + +open (GOLD, "<$opt_g") || die "Could not open gold standard file $opt_g\n" ; +open (SYS, "<$opt_s") || die "Could not open system output file $opt_s\n" ; +open (OUT, ">$opt_o") || die "Could not open output file $opt_o\n" ; + + +if (defined $opt_b) { # produce output similar to evalb + print OUT " Sent. Attachment Correct Scoring \n"; + print OUT " ID Tokens - Unlab. Lab. HEAD HEAD+DEPREL tokens - - - -\n"; + print OUT " ============================================================================\n"; +} + + +while (! $eof) +{ # main reading loop + + $starts[$sent_num] = $line_num+1 ; + $eof = read_sent(\@sent_gold, \@sent_sys) ; + + $sent_num++ ; + + %{$err_sent[$sent_num]} = () ; + $word_num = scalar @sent_gold ; + + # for accuracy per sentence + my %sent_counts = ( tot => 0, + err_any => 0, + err_head => 0 + ); + + # printf "$sent_num $word_num\n" ; + + my @frames_g = ('** '); # the initial frame for the virtual root + my @frames_s = ('** '); # the initial frame for the virtual root + foreach $i_w (0 .. $word_num-1) + { # loop on words + push @frames_g, ''; # initialize + push @frames_s, ''; # initialize + } + + foreach $i_w (0 .. $word_num-1) + { # loop on words + + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + $wp = $word.' / '.$pos ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! 
$score_on_punct) && is_uni_punct($word)) + { + $counts{punct}++ ; + # ignore punctuations + next ; + } + + if (length($pos) > $max_pos_len) + { + $max_pos_len = length($pos) ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $counts{tot}++ ; + $counts{word}{$wp}{tot}++ ; + $counts{pos}{$pos}{tot}++ ; + $counts{head}{$head_g-$i_w-1}{tot}++ ; + + # for frame confusions + # add child to frame of parent + $frames_g[$head_g] .= "$dep_g "; + $frames_s[$head_s] .= "$dep_s "; + # add to frame of token itself + $frames_g[$i_w+1] .= "*$dep_g* "; # $i_w+1 because $i_w starts counting at zero + $frames_s[$i_w+1] .= "*$dep_g* "; + + # for precision and recall of DEPREL + $counts{dep}{$dep_g}{tot}++ ; # counts for gold standard deprels + $counts{dep2}{$dep_g}{$dep_s}++ ; # counts for confusions + $counts{dep_s}{$dep_s}{tot}++ ; # counts for system deprels + $counts{all_dep}{$dep_g} = 1 ; # list of all deprels that occur ... + $counts{all_dep}{$dep_s} = 1 ; # ... in either gold or system output + + # for precision and recall of HEAD direction + my $dir_g; + if ($head_g == 0) { + $dir_g = 'to_root'; + } elsif ($head_g < $i_w+1) { # $i_w+1 because $i_w starts counting at zero + # also below + $dir_g = 'left'; + } elsif ($head_g > $i_w+1) { + $dir_g = 'right'; + } else { + # token links to itself; should never happen in correct gold standard + $dir_g = 'self'; + } + my $dir_s; + if ($head_s == 0) { + $dir_s = 'to_root'; + } elsif ($head_s < $i_w+1) { + $dir_s = 'left'; + } elsif ($head_s > $i_w+1) { + $dir_s = 'right'; + } else { + # token links to itself; should not happen in good system + # (but not forbidden in shared task) + $dir_s = 'self'; + } + $counts{dir_g}{$dir_g}{tot}++ ; # counts for gold standard head direction + $counts{dir2}{$dir_g}{$dir_s}++ ; # counts for confusions + $counts{dir_s}{$dir_s}{tot}++ ; # counts for system head direction + + # for precision and recall of HEAD distance + my $dist_g; + if ($head_g == 0) { + $dist_g = 'to_root'; + } 
elsif ( abs($head_g - ($i_w+1)) <= 1 ) { + $dist_g = '1'; # includes the 'self' cases + } elsif ( abs($head_g - ($i_w+1)) <= 2 ) { + $dist_g = '2'; + } elsif ( abs($head_g - ($i_w+1)) <= 6 ) { + $dist_g = '3-6'; + } else { + $dist_g = '7-...'; + } + my $dist_s; + if ($head_s == 0) { + $dist_s = 'to_root'; + } elsif ( abs($head_s - ($i_w+1)) <= 1 ) { + $dist_s = '1'; # includes the 'self' cases + } elsif ( abs($head_s - ($i_w+1)) <= 2 ) { + $dist_s = '2'; + } elsif ( abs($head_s - ($i_w+1)) <= 6 ) { + $dist_s = '3-6'; + } else { + $dist_s = '7-...'; + } + $counts{dist_g}{$dist_g}{tot}++ ; # counts for gold standard head distance + $counts{dist2}{$dist_g}{$dist_s}++ ; # counts for confusions + $counts{dist_s}{$dist_s}{tot}++ ; # counts for system head distance + + + $err_head = ($head_g ne $head_s) ; # error in head + $err_dep = ($dep_g ne $dep_s) ; # error in deprel + + $head_err = '-' ; + $dep_err = '-' ; + + # for accuracy per sentence + $sent_counts{tot}++ ; + if ($err_dep || $err_head) { + $sent_counts{err_any}++ ; + } + if ($err_head) { + $sent_counts{err_head}++ ; + } + + # total counts and counts for CPOS involved in errors + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 
'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + + $err_sent[$sent_num]{head}++ ; + $counts{err_head}{tot}++ ; + $counts{err_head}{$head_err}++ ; + + $counts{word}{err_head}{$wp}++ ; + $counts{pos}{$pos}{err_head}{tot}++ ; + $counts{pos}{$pos}{err_head}{$head_err}++ ; + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + $err_sent[$sent_num]{dep}++ ; + $counts{err_dep}{tot}++ ; + $counts{err_dep}{$dep_err}++ ; + + $counts{word}{err_dep}{$wp}++ ; + $counts{pos}{$pos}{err_dep}{tot}++ ; + $counts{pos}{$pos}{err_dep}{$dep_err}++ ; + + if ($err_head) + { + $counts{err_both}++ ; + $counts{pos}{$pos}{err_both}++ ; + } + } + + ### DEPREL + ATTACHMENT + if ((!$err_dep) && ($err_head)) { + $counts{err_head_corr_dep}{tot}++ ; + $counts{err_head_corr_dep}{$dep_s}++ ; + } + ### DEPREL + ATTACHMENT + + # counts for words involved in errors + + if (! ($err_head || $err_dep)) + { + next ; + } + + $err_sent[$sent_num]{word}++ ; + $counts{err_any}++ ; + $counts{word}{err_any}{$wp}++ ; + $counts{pos}{$pos}{err_any}++ ; + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + if ($w_2 ne $START) + { + $wp_2 = $w_2.' / '.$p_2 ; + } + else + { + $wp_2 = $w_2 ; + } + + if ($w_1 ne $START) + { + $wp_1 = $w_1.' / '.$p_1 ; + } + else + { + $wp_1 = $w_1 ; + } + + if ($w1 ne $END) + { + $wp1 = $w1.' / '.$p1 ; + } + else + { + $wp1 = $w1 ; + } + + if ($w2 ne $END) + { + $wp2 = $w2.' / '.$p2 ; + } + else + { + $wp2 = $w2 ; + } + + $con_bef = $wp_1 ; + $con_bef_2 = $wp_2.' + '.$wp_1 ; + $con_aft = $wp1 ; + $con_aft_2 = $wp1.' 
+ '.$wp2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + if ($w_1 ne $START) + { + # do not count '.S' as a word context + $counts{con_bef_2}{tot}{$con_bef_2}++ ; + $counts{con_bef_2}{err_head}{$con_bef_2} += $err_head ; + $counts{con_bef_2}{err_dep}{$con_bef_2} += $err_dep ; + $counts{con_bef}{tot}{$con_bef}++ ; + $counts{con_bef}{err_head}{$con_bef} += $err_head ; + $counts{con_bef}{err_dep}{$con_bef} += $err_dep ; + } + + if ($w1 ne $END) + { + # do not count '.E' as a word context + $counts{con_aft_2}{tot}{$con_aft_2}++ ; + $counts{con_aft_2}{err_head}{$con_aft_2} += $err_head ; + $counts{con_aft_2}{err_dep}{$con_aft_2} += $err_dep ; + $counts{con_aft}{tot}{$con_aft}++ ; + $counts{con_aft}{err_head}{$con_aft} += $err_head ; + $counts{con_aft}{err_dep}{$con_aft} += $err_dep ; + } + + $counts{con_pos_bef_2}{tot}{$con_pos_bef_2}++ ; + $counts{con_pos_bef_2}{err_head}{$con_pos_bef_2} += $err_head ; + $counts{con_pos_bef_2}{err_dep}{$con_pos_bef_2} += $err_dep ; + $counts{con_pos_bef}{tot}{$con_pos_bef}++ ; + $counts{con_pos_bef}{err_head}{$con_pos_bef} += $err_head ; + $counts{con_pos_bef}{err_dep}{$con_pos_bef} += $err_dep ; + + $counts{con_pos_aft_2}{tot}{$con_pos_aft_2}++ ; + $counts{con_pos_aft_2}{err_head}{$con_pos_aft_2} += $err_head ; + $counts{con_pos_aft_2}{err_dep}{$con_pos_aft_2} += $err_dep ; + $counts{con_pos_aft}{tot}{$con_pos_aft}++ ; + $counts{con_pos_aft}{err_head}{$con_pos_aft} += $err_head ; + $counts{con_pos_aft}{err_dep}{$con_pos_aft} += $err_dep ; + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + $freq_err{$err}++ ; + + } # loop on words + + foreach $i_w (0 .. 
$word_num) # including one for the virtual root + { # loop on words + if ($frames_g[$i_w] ne $frames_s[$i_w]) { + $counts{frame2}{"$frames_g[$i_w]/ $frames_s[$i_w]"}++ ; + } + } + + if (defined $opt_b) { # produce output similar to evalb + if ($word_num > 0) { + my ($unlabeled,$labeled) = ('NaN', 'NaN'); + if ($sent_counts{tot} > 0) { # there are scoring tokens + $unlabeled = 100-$sent_counts{err_head}*100.0/$sent_counts{tot}; + $labeled = 100-$sent_counts{err_any} *100.0/$sent_counts{tot}; + } + printf OUT " %4d %4d 0 %6.2f %6.2f %4d %4d %4d 0 0 0 0\n", + $sent_num, $word_num, + $unlabeled, $labeled, + $sent_counts{tot}-$sent_counts{err_head}, + $sent_counts{tot}-$sent_counts{err_any}, + $sent_counts{tot},; + } + } + +} # main reading loop + +################################################################################ +### printing output ### +################################################################################ + +if (defined $opt_b) { # produce output similar to evalb + print OUT "\n\n"; +} +printf OUT " Labeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_any}, $counts{tot}, 100-$counts{err_any}*100.0/$counts{tot} ; +printf OUT " Unlabeled attachment score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_head}{tot}, $counts{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot} ; +printf OUT " Label accuracy score: %d / %d * 100 = %.2f %%\n", + $counts{tot}-$counts{err_dep}{tot}, $counts{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot} ; + +if ($short_output) +{ + exit(0) ; +} +printf OUT "\n %s\n\n", '=' x 80 ; +printf OUT " Evaluation of the results in %s\n vs. 
gold standard %s:\n\n", $opt_s, $opt_g ; + +printf OUT " Legend: '%s' - the beginning of a sentence, '%s' - the end of a sentence\n\n", $START, $END ; + +printf OUT " Number of non-scoring tokens: $counts{punct}\n\n"; + +printf OUT " The overall accuracy and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Accuracy', 'words', 'right', 'right', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + ' ', ' ', 'head', ' dep', 'right' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{tot}-$counts{err_head}{tot}, 100-$counts{err_head}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_dep}{tot}, 100-$counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{tot}-$counts{err_any}, 100-$counts{err_any}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! defined($counts{pos}{$pos}{err_head}{tot})) + { + $counts{pos}{$pos}{err_head}{tot} = 0 ; + } + if (! defined($counts{pos}{$pos}{err_dep}{tot})) + { + $counts{pos}{$pos}{err_dep}{tot} = 0 ; + } + if (! 
defined($counts{pos}{$pos}{err_any})) + { + $counts{pos}{$pos}{err_any} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_head}{tot}, 100-$counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_dep}{tot}, 100-$counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{tot}-$counts{pos}{$pos}{err_any}, 100-$counts{pos}{$pos}{err_any}*100.0/$counts{pos}{$pos}{tot} ; +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT "\n\n" ; + +printf OUT " The overall error rate and its distribution over CPOSTAGs\n\n" ; +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %-5s | %-5s | %% | %-5s | %% | %-5s | %%\n", + 'Error', 'words', 'head', ' dep', 'both' ; +printf OUT " %-10s | %-5s | %-5s | | %-5s | | %-5s |\n", + + 'Rate', ' ', 'err', ' err', 'wrong' ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + 'total', $counts{tot}, + $counts{err_head}{tot}, $counts{err_head}{tot}*100.0/$counts{tot}, + $counts{err_dep}{tot}, $counts{err_dep}{tot}*100.0/$counts{tot}, + $counts{err_both}, $counts{err_both}*100.0/$counts{tot} ; + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +foreach $pos (sort {$counts{pos}{$b}{tot} <=> $counts{pos}{$a}{tot}} keys %{$counts{pos}}) +{ + if (! 
defined($counts{pos}{$pos}{err_both})) + { + $counts{pos}{$pos}{err_both} = 0 ; + } + + printf OUT " %-10s | %5d | %5d | %3.0f%% | %5d | %3.0f%% | %5d | %3.0f%%\n", + $pos, $counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_head}{tot}, $counts{pos}{$pos}{err_head}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_dep}{tot}, $counts{pos}{$pos}{err_dep}{tot}*100.0/$counts{pos}{$pos}{tot}, + $counts{pos}{$pos}{err_both}, $counts{pos}{$pos}{err_both}*100.0/$counts{pos}{$pos}{tot} ; + +} + +printf OUT "%s\n", " -----------+-------+-------+------+-------+------+-------+-------" ; + +### added by Sabine Buchholz +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +### DEPREL + ATTACHMENT: +### Same as Sabine's DEPREL apart from $tot_corr calculation +printf OUT "\n\n"; +printf OUT " Precision and recall of DEPREL + ATTACHMENT\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " deprel | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +foreach my $dep (sort keys %{$counts{all_dep}}) { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dep2}{$dep}{$dep})) { + if (defined($counts{err_head_corr_dep}{$dep})) { + $tot_corr = $counts{dep2}{$dep}{$dep} - $counts{err_head_corr_dep}{$dep}; + } else { + $tot_corr = $counts{dep2}{$dep}{$dep}; + } + } + if (defined($counts{dep}{$dep}{tot})) { + $tot_g = $counts{dep}{$dep}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dep_s}{$dep}{tot})) { + $tot_s = $counts{dep_s}{$dep}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dep, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} +### DEPREL + ATTACHMENT + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD direction\n\n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +printf OUT " direction | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dir ('to_root', 'left', 'right', 'self') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dir2}{$dir}{$dir})) { + $tot_corr = $counts{dir2}{$dir}{$dir}; + } + if (defined($counts{dir_g}{$dir}{tot})) { + $tot_g = $counts{dir_g}{$dir}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dir_s}{$dir}{tot})) { + $tot_s = $counts{dir_s}{$dir}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dir, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Precision and recall of binned HEAD distance\n\n"; +printf OUT " 
----------------+------+---------+--------+------------+---------------\n"; +printf OUT " distance | gold | correct | system | recall (%%) | precision (%%) \n"; +printf OUT " ----------------+------+---------+--------+------------+---------------\n"; +foreach my $dist ('to_root', '1', '2', '3-6', '7-...') { + # initialize + my ($tot_corr, $tot_g, $tot_s, $prec, $rec) = (0, 0, 0, 'NaN', 'NaN'); + + if (defined($counts{dist2}{$dist}{$dist})) { + $tot_corr = $counts{dist2}{$dist}{$dist}; + } + if (defined($counts{dist_g}{$dist}{tot})) { + $tot_g = $counts{dist_g}{$dist}{tot}; + $rec = sprintf("%.2f",$tot_corr / $tot_g * 100); + } + if (defined($counts{dist_s}{$dist}{tot})) { + $tot_s = $counts{dist_s}{$dist}{tot}; + $prec = sprintf("%.2f",$tot_corr / $tot_s * 100); + } + printf OUT " %-15s | %4d | %7d | %6d | %10s | %13s\n", + $dist, $tot_g, $tot_corr, $tot_s, $rec, $prec; +} + +printf OUT "\n\n"; +printf OUT " Frame confusions (gold versus system; *...* marks the head token)\n\n"; +foreach my $frame (sort {$counts{frame2}{$b} <=> $counts{frame2}{$a}} keys %{$counts{frame2}}) +{ + if ($counts{frame2}{$frame} >= 5) # (make 5 a changeable threshold later) + { + printf OUT " %3d %s\n", $counts{frame2}{$frame}, $frame; + } +} +### end of: added by Sabine Buchholz + + +# +# Leave only the 5 words mostly involved in errors +# + + +$thresh = (sort {$b <=> $a} values %{$counts{word}{err_any}})[4] ; + +# ensure enough space for title +$max_word_len = length('word') ; + +foreach $word (keys %{$counts{word}{err_any}}) +{ + if ($counts{word}{err_any}{$word} < $thresh) + { + delete $counts{word}{err_any}{$word} ; + next ; + } + + $l = uni_len($word) ; + if ($l > $max_word_len) + { + $max_word_len = $l ; + } +} + +# filter a case when the difference between the error counts +# for 2-word and 1-word contexts is small +# (leave the 2-word context) + +foreach $con (keys %{$counts{con_aft_2}{tot}}) +{ + ($w1) = split(/\+/, $con) ; + + if (defined $counts{con_aft}{tot}{$w1} && + 
$counts{con_aft}{tot}{$w1}-$counts{con_aft_2}{tot}{$con} <= 1) + { + delete $counts{con_aft}{tot}{$w1} ; + } +} + +foreach $con (keys %{$counts{con_bef_2}{tot}}) +{ + ($w_2, $w_1) = split(/\+/, $con) ; + + if (defined $counts{con_bef}{tot}{$w_1} && + $counts{con_bef}{tot}{$w_1}-$counts{con_bef_2}{tot}{$con} <= 1) + { + delete $counts{con_bef}{tot}{$w_1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + ($p1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_aft}{tot}{$p1}) && + $counts{con_pos_aft}{tot}{$p1}-$counts{con_pos_aft_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_aft}{tot}{$p1} ; + } +} + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + ($p_2, $p_1) = split(/\+/, $con_pos) ; + + if (defined($counts{con_pos_bef}{tot}{$p_1}) && + $counts{con_pos_bef}{tot}{$p_1}-$counts{con_pos_bef_2}{tot}{$con_pos} <= 1) + { + delete $counts{con_pos_bef}{tot}{$p_1} ; + } +} + +# for each context type, take the three contexts most involved in errors + +$max_con_len = 0 ; + +filter_context_counts($counts{con_bef_2}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_bef}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft}{tot}, $con_err_num, \$max_con_len) ; + +filter_context_counts($counts{con_aft_2}{tot}, $con_err_num, \$max_con_len) ; + +# for each CPOS context type, take the three CPOS contexts most involved in errors + +$max_con_pos_len = 0 ; + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef_2}{tot}}) +{ + if ($counts{con_pos_bef_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_bef}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_bef}{tot}}) +{ + if 
($counts{con_pos_bef}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_bef}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft}{tot}}) +{ + if ($counts{con_pos_aft}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +$thresh = (sort {$b <=> $a} values %{$counts{con_pos_aft_2}{tot}})[$con_err_num-1] ; + +foreach $con_pos (keys %{$counts{con_pos_aft_2}{tot}}) +{ + if ($counts{con_pos_aft_2}{tot}{$con_pos} < $thresh) + { + delete $counts{con_pos_aft_2}{tot}{$con_pos} ; + next ; + } + if (length($con_pos) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos) ; + } +} + +# printing + +# ------------- focus words + +printf OUT "\n\n" ; +printf OUT " %d focus words where most of the errors occur:\n\n", scalar keys %{$counts{word}{err_any}} ; + +printf OUT " %-*s | %-4s | %-4s | %-4s | %-4s\n", $max_word_len, ' ', 'any', 'head', 'dep', 'both' ; +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +foreach $word (sort {$counts{word}{err_any}{$b} <=> $counts{word}{err_any}{$a}} keys %{$counts{word}{err_any}}) +{ + if (!defined($counts{word}{err_head}{$word})) + { + $counts{word}{err_head}{$word} = 0 ; + } + if (! defined($counts{word}{err_dep}{$word})) + { + $counts{word}{err_dep}{$word} = 0 ; + } + if (! 
defined($counts{word}{err_any}{$word})) + { + $counts{word}{err_any}{$word} = 0; + } + printf OUT " %-*s | %4d | %4d | %4d | %4d\n", + $max_word_len+length($word)-uni_len($word), $word, $counts{word}{err_any}{$word}, + $counts{word}{err_head}{$word}, + $counts{word}{err_dep}{$word}, + $counts{word}{err_dep}{$word}+$counts{word}{err_head}{$word}-$counts{word}{err_any}{$word} ; +} + +printf OUT " %s-+------+------+------+------\n", '-' x $max_word_len; + +# ------------- contexts + +printf OUT "\n\n" ; + +printf OUT " one-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef}, $counts{con_pos_bef}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token preceeding contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_bef_2}, $counts{con_pos_bef_2}, $max_con_len, $max_con_pos_len) ; + +printf OUT " one-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft}, $counts{con_pos_aft}, $max_con_len, $max_con_pos_len) ; + +printf OUT " two-token following contexts where most of the errors occur:\n\n" ; + +print_context($counts{con_aft_2}, $counts{con_pos_aft_2}, $max_con_len, $max_con_pos_len) ; + +# ------------- Sentences + +printf OUT " Sentence with the highest number of word errors:\n" ; +$i = (sort { (defined($err_sent[$b]{word}) && $err_sent[$b]{word}) + <=> (defined($err_sent[$a]{word}) && $err_sent[$a]{word}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of head errors:\n" ; +$i = (sort { (defined($err_sent[$b]{head}) && $err_sent[$b]{head}) + <=> (defined($err_sent[$a]{head}) && $err_sent[$a]{head}) } 1 .. 
$sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +printf OUT "\n\n" ; + +printf OUT " Sentence with the highest number of dependency errors:\n" ; +$i = (sort { (defined($err_sent[$b]{dep}) && $err_sent[$b]{dep}) + <=> (defined($err_sent[$a]{dep}) && $err_sent[$a]{dep}) } 1 .. $sent_num)[0] ; +printf OUT " Sentence %d line %d, ", $i, $starts[$i-1] ; +printf OUT "%d head errors, %d dependency errors, %d word errors\n", + $err_sent[$i]{head}, $err_sent[$i]{dep}, $err_sent[$i]{word} ; + +# +# Second pass, collect statistics of the frequent errors +# + +# filter the errors, leave the most frequent $freq_err_num errors + +$i = 0 ; + +$thresh = (sort {$b <=> $a} values %freq_err)[$freq_err_num-1] ; + +foreach $err (keys %freq_err) +{ + if ($freq_err{$err} < $thresh) + { + delete $freq_err{$err} ; + } +} + +# in case there are several errors with the threshold count + +$freq_err_num = scalar keys %freq_err ; + +%err_counts = () ; + +$eof = 0 ; + +seek (GOLD, 0, 0) ; +seek (SYS, 0, 0) ; + +while (! $eof) +{ # second reading loop + + $eof = read_sent(\@sent_gold, \@sent_sys) ; + $sent_num++ ; + + $word_num = scalar @sent_gold ; + + # printf "$sent_num $word_num\n" ; + + foreach $i_w (0 .. $word_num-1) + { # loop on words + ($word, $pos, $head_g, $dep_g) + = @{$sent_gold[$i_w]}{'word', 'pos', 'head', 'dep'} ; + + # printf "%d: %s %s %s %s\n", $i_w, $word, $pos, $head_g, $dep_g ; + + if ((! $score_on_punct) && is_uni_punct($word)) + { + # ignore punctuations + next ; + } + + ($head_s, $dep_s) = @{$sent_sys[$i_w]}{'head', 'dep'} ; + + $err_head = ($head_g ne $head_s) ; + $err_dep = ($dep_g ne $dep_s) ; + + $head_err = '-' ; + $dep_err = '-' ; + + if ($head_g eq '0') + { + $head_aft_bef_g = '0' ; + } + elsif ($head_g eq $i_w+1) + { + $head_aft_bef_g = 'e' ; + } + else + { + $head_aft_bef_g = ($head_g <= $i_w+1 ? 
'b' : 'a') ; + } + + if ($head_s eq '0') + { + $head_aft_bef_s = '0' ; + } + elsif ($head_s eq $i_w+1) + { + $head_aft_bef_s = 'e' ; + } + else + { + $head_aft_bef_s = ($head_s <= $i_w+1 ? 'b' : 'a') ; + } + + $head_aft_bef = $head_aft_bef_g.$head_aft_bef_s ; + + if ($err_head) + { + if ($head_aft_bef_s eq '0') + { + $head_err = 0 ; + } + else + { + $head_err = $head_s-$head_g ; + } + } + + if ($err_dep) + { + $dep_err = $dep_g.'->'.$dep_s ; + } + + if (! ($err_head || $err_dep)) + { + next ; + } + + # handle only the most frequent errors + + $err = $head_err.$sep.$head_aft_bef.$sep.$dep_err ; + + if (! exists $freq_err{$err}) + { + next ; + } + + ($w_2, $w_1, $w1, $w2, $p_2, $p_1, $p1, $p2) = get_context(\@sent_gold, $i_w) ; + + $con_bef = $w_1 ; + $con_bef_2 = $w_2.' + '.$w_1 ; + $con_aft = $w1 ; + $con_aft_2 = $w1.' + '.$w2 ; + + $con_pos_bef = $p_1 ; + $con_pos_bef_2 = $p_2.'+'.$p_1 ; + $con_pos_aft = $p1 ; + $con_pos_aft_2 = $p1.'+'.$p2 ; + + @cur_err = ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) ; + + # printf "# %-25s %-15s %-10s %-25s %-3s %-30s\n", + # $con_bef, $word, $pos, $con_aft, $head_err, $dep_err ; + + @bits = (0, 0, 0, 0, 0, 0) ; + $j = 0 ; + + while ($j == 0) + { + for ($i = 0; $i <= $#bits; $i++) + { + if ($bits[$i] == 0) + { + $bits[$i] = 1 ; + $j = 0 ; + last ; + } + else + { + $bits[$i] = 0 ; + $j = 1 ; + } + } + + @e_bits = @cur_err ; + + for ($i = 0; $i <= $#bits; $i++) + { + if (! 
$bits[$i]) + { + $e_bits[$i] = '*' ; + } + } + + # include also the last case which is the most general + # (wildcards for everything) + $err_counts{$err}{join($sep, @e_bits)}++ ; + + } + + } # loop on words +} # second reading loop + +printf OUT "\n\n" ; +printf OUT " Specific errors, %d most frequent errors:", $freq_err_num ; +printf OUT "\n %s\n", '=' x 41 ; + + +# deleting local contexts which are too general + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + @cur_err = split(/\Q$sep\E/, $loc_con) ; + + # In this loop, one or two elements of the local context are + # replaced with '*' to make it more general. If the entry for + # the general context has the same count it is removed. + + foreach $i (0 .. $#cur_err) + { + $w1 = $cur_err[$i] ; + if ($cur_err[$i] eq '*') + { + next ; + } + $cur_err[$i] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + for ($j = $i+1; $j <=$#cur_err; $j++) + { + if ($cur_err[$j] eq '*') + { + next ; + } + $w2 = $cur_err[$j] ; + $cur_err[$j] = '*' ; + $con1 = join($sep, @cur_err) ; + if ( defined($err_counts{$err}{$con1}) && defined($err_counts{$err}{$loc_con}) + && ($err_counts{$err}{$con1} == $err_counts{$err}{$loc_con})) + { + delete $err_counts{$err}{$con1} ; + } + $cur_err[$j] = $w2 ; + } + $cur_err[$i] = $w1 ; + } + } +} + +# Leaving only the topmost local contexts for each error + +foreach $err (keys %err_counts) +{ + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[$spec_err_loc_con-1] || 0 ; + + # of the threshold is too low, take the 2nd highest count + # (the highest may be the total which is the generic case + # and not relevant for printing) + + if ($thresh < 5) + { + $thresh = (sort {$b <=> $a} values %{$err_counts{$err}})[1] ; + } + + 
foreach $loc_con (keys %{$err_counts{$err}}) + { + if ($err_counts{$err}{$loc_con} < $thresh) + { + delete $err_counts{$err}{$loc_con} ; + } + else + { + if ($loc_con ne join($sep, ('*', '*', '*', '*', '*', '*'))) + { + $loc_con_err_counts{$loc_con}{$err} = $err_counts{$err}{$loc_con} ; + } + } + } +} + +# printing an error summary + +# calculating the context field length + +$max_word_spec_len= length('word') ; +$max_con_aft_len = length('word') ; +$max_con_bef_len = length('word') ; +$max_con_pos_len = length('CPOS') ; + +foreach $err (keys %err_counts) +{ + foreach $loc_con (sort keys %{$err_counts{$err}}) + { + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $loc_con) ; + + $l = uni_len($word) ; + if ($l > $max_word_spec_len) + { + $max_word_spec_len = $l ; + } + + $l = uni_len($con_bef) ; + if ($l > $max_con_bef_len) + { + $max_con_bef_len = $l ; + } + + $l = uni_len($con_aft) ; + if ($l > $max_con_aft_len) + { + $max_con_aft_len = $l ; + } + + if (length($con_pos_aft) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_aft) ; + } + + if (length($con_pos_bef) > $max_con_pos_len) + { + $max_con_pos_len = length($con_pos_bef) ; + } + } +} + +$err_counter = 0 ; + +foreach $err (sort {$freq_err{$b} <=> $freq_err{$a}} keys %freq_err) +{ + + ($head_err, $head_aft_bef, $dep_err) = split(/\Q$sep\E/, $err) ; + + $err_counter++ ; + $err_desc{$err} = sprintf("%2d. ", $err_counter). 
+ describe_err($head_err, $head_aft_bef, $dep_err) ; + + # printf OUT " %-3s %-30s %d\n", $head_err, $dep_err, $freq_err{$err} ; + printf OUT "\n" ; + printf OUT " %s : %d times\n", $err_desc{$err}, $freq_err{$err} ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s | %s\n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After', + 'Count' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s |\n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $loc_con (sort {$err_counts{$err}{$b} <=> $err_counts{$err}{$a}} + keys %{$err_counts{$err}}) + { + if ($loc_con eq join($sep, ('*', '*', '*', '*', '*', '*'))) + { + next ; + } + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s | %3d\n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft, + $err_counts{$err}{$loc_con} ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-+------\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + +} + +printf OUT "\n\n" ; +printf 
OUT " Local contexts involved in several frequent errors:" ; +printf OUT "\n %s\n", '=' x 51 ; +printf OUT "\n\n" ; + +foreach $loc_con (sort {scalar keys %{$loc_con_err_counts{$b}} <=> + scalar keys %{$loc_con_err_counts{$a}}} + keys %loc_con_err_counts) +{ + + if (scalar keys %{$loc_con_err_counts{$loc_con}} == 1) + { + next ; + } + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + printf OUT " %-*s | %-*s | %-*s \n", + $max_con_pos_len+$max_con_bef_len+3, ' Before', + $max_word_spec_len+$max_pos_len+3, ' Focus', + $max_con_pos_len+$max_con_aft_len+3, ' After' ; + + printf OUT " %-*s %-*s | %-*s %-*s | %-*s %-*s \n", + $max_con_pos_len, 'CPOS', $max_con_bef_len, 'word', + $max_pos_len, 'CPOS', $max_word_spec_len, 'word', + $max_con_pos_len, 'CPOS', $max_con_aft_len, 'word' ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + $con1 = $loc_con ; + $con1 =~ s/\*/ /g ; + + ($con_pos_bef, $con_bef, $word, $pos, $con_pos_aft, $con_aft) = + split(/\Q$sep\E/, $con1) ; + + printf OUT " %-*s | %-*s | %-*s | %-*s | %-*s | %-*s \n", + $max_con_pos_len, $con_pos_bef, $max_con_bef_len+length($con_bef)-uni_len($con_bef), $con_bef, + $max_pos_len, $pos, $max_word_spec_len+length($word)-uni_len($word), $word, + $max_con_pos_len, $con_pos_aft, $max_con_aft_len+length($con_aft)-uni_len($con_aft), $con_aft ; + + printf OUT " %s-+-%s-+-%s-+-%s-+-%s-+-%s-\n", + '-' x $max_con_pos_len, '-' x $max_con_bef_len, + '-' x $max_pos_len, '-' x $max_word_spec_len, + '-' x $max_con_pos_len, '-' x $max_con_aft_len ; + + foreach $err (sort {$loc_con_err_counts{$loc_con}{$b} <=> + $loc_con_err_counts{$loc_con}{$a}} + keys %{$loc_con_err_counts{$loc_con}}) + { + printf OUT " %s : %d times\n", 
class ConllEntry:
    """One token (row) of a CoNLL dependency file.

    Stores the gold annotation passed in by the caller plus two slots,
    ``pred_parent_id`` and ``pred_relation``, that start out as ``None`` and
    are the fields ``write_conll`` serialises.
    """

    # NOTE: ``id`` shadows the builtin, but it is part of the public
    # signature (callers may pass it by keyword), so the name is kept.
    def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
        """
        :param id: the index of the token within its sentence
        :param form: the surface form of the token
        :param pos: the fine-grained POS tag (stored upper-cased)
        :param cpos: the coarse-grained POS tag (stored upper-cased)
        :param parent_id: the index of the gold head, if known
        :param relation: the gold dependency relation, if known
        """
        self.id = id
        self.form = form
        self.norm = normalize(form)
        self.cpos = cpos.upper()
        self.pos = pos.upper()
        self.parent_id = parent_id
        self.relation = relation

        self.pred_parent_id = None
        self.pred_relation = None


def vocab(conll_path):
    """
    Create the vocabulary from a CoNLL file.
    :param conll_path: the path of the CoNLL file
    :return: the words count, a word-to-id mapping, a list of pos count keys,
             a list of rel count keys
    """
    with open(conll_path, 'r') as conllFP:
        # read_conll is a generator, so it has to be consumed while the file
        # is still open; vocab_conll does that before the block exits
        return vocab_conll(read_conll(conllFP))


def vocab_conll(conll_entries):
    """
    Create the vocabulary directly from CoNLL entries.
    :param conll_entries: a list of lists of CoNLL entries
    :return: the words count, a word-to-id mapping, a list of pos count keys,
             a list of rel count keys
    """
    wordsCount = Counter()
    posCount = Counter()
    relCount = Counter()

    for sentence in conll_entries:
        wordsCount.update(node.norm for node in sentence)
        posCount.update(node.pos for node in sentence)
        relCount.update(node.relation for node in sentence)

    # ids follow the order in which the normalised words were first seen
    word2id = {w: i for i, w in enumerate(wordsCount)}
    return wordsCount, word2id, list(posCount), list(relCount)


def read_conll(fh):
    """
    Read sentences from an open CoNLL file.
    :param fh: an iterable of lines with whitespace-separated columns; column
               0 is the token id, 1 the form, 3 the coarse POS tag, 4 the
               fine POS tag, 6 the head id ('_' is read as -1), and 7 the
               relation
    :return: a generator of sentences; each sentence is a list of ConllEntry
             that starts with an artificial root entry (the same root object
             is reused for every sentence)
    """
    root = ConllEntry(0, '*root*', 'ROOT-POS', 'ROOT-CPOS', -1, 'rroot')
    tokens = [root]
    for line in fh:
        tok = line.strip().split()
        if not tok:
            # a blank line ends the current sentence; runs of blank lines
            # do not produce empty sentences
            if len(tokens) > 1:
                yield tokens
            tokens = [root]
        else:
            parent_id = int(tok[6]) if tok[6] != '_' else -1
            tokens.append(ConllEntry(int(tok[0]), tok[1], tok[4], tok[3],
                                     parent_id, tok[7]))
    # the last sentence may not be followed by a blank line
    if len(tokens) > 1:
        yield tokens


def _write_sentences(fn, sentences, head_attr, relation_attr):
    """Write sentences to fn, taking head and relation from the named attributes."""
    with open(fn, 'w') as fh:
        for sentence in sentences:
            # the first entry is the artificial root, which is not written
            for entry in sentence[1:]:
                fh.write('\t'.join([str(entry.id), entry.form, '_',
                                    entry.cpos, entry.pos, '_',
                                    str(getattr(entry, head_attr)),
                                    getattr(entry, relation_attr), '_', '_']))
                fh.write('\n')
            fh.write('\n')


def write_conll(fn, conll_gen):
    """
    Write predicted CoNLL entries to file.
    :param fn: the path of the file to which the entries should be written
    :param conll_gen: an iterable of sentences whose entries have
                      pred_parent_id and pred_relation set
    """
    _write_sentences(fn, conll_gen, 'pred_parent_id', 'pred_relation')


def write_original_conll(fn, conll_original):
    """
    Write original CoNLL entries to file (in contrast to predicted/generated CoNLL entries).
    :param fn: the path of the file to which the CoNLL entries should be written
    :param conll_original: the original CoNLL entries that should be written to the file
    """
    _write_sentences(fn, conll_original, 'parent_id', 'relation')


# The pattern is only used with match(), which anchors at the start of the
# word but not at the end, so every token that begins with a digit matches.
numberRegex = re.compile(r"[0-9]+|[0-9]+\.[0-9]+|[0-9]+[0-9,]+")


def normalize(word):
    """Return 'NUM' for tokens that start with a digit, else the lower-cased word."""
    return 'NUM' if numberRegex.match(word) else word.lower()
def load(self):
    """
    Fill the word-to-id and id-to-word mappings from the vocabulary file.

    Each line of the file is expected to be ``word<TAB>id``. At most
    ``max_vocab_size`` lines are read; if the file has more, a notice is
    printed and the remaining lines are ignored.
    """
    assert self.size == 0, 'Vocabulary has already been loaded or built.'
    print('Reading vocabulary from %s...' % self.vocab_path)
    with codecs.open(self.vocab_path, 'r', encoding='utf-8') as vocab_file:
        for line_no, entry in enumerate(vocab_file):
            if line_no < self.max_vocab_size:
                token, raw_index = entry.split('\t')
                self.word2id[token] = int(raw_index.strip())
                continue
            # the file holds more entries than we are allowed to keep
            print('Vocab in file is larger than max vocab size. '
                  'Only using top %d words.' % self.max_vocab_size)
            break
    self.size = len(self.word2id)
    # invert the mapping so that ids can be turned back into words
    self.id2word = dict((index, token)
                        for token, index in self.word2id.items())
    assert self.size <= self.max_vocab_size, \
        'Loaded vocab is of size %d., max vocab size is %d.' % (
            self.size, self.max_vocab_size)
def get_all_docs(domain_data_pairs, unlabeled=True):
    """
    Return all labeled and unlabeled documents of multiple domains.
    :param domain_data_pairs: a list of (domain, (labeled_reviews, labels,
                              unlabeled_reviews)) tuples as obtained by
                              domain2data.items()
    :param unlabeled: whether unlabeled documents should be incorporated
    :return: a list containing the documents from all domains (or a stacked
             sparse matrix if the labeled documents of the last domain are
             sparse), the corresponding labels, and a list containing the
             domain of each example
    """
    docs, labels, domains = [], [], []
    stack_sparse = False
    for domain, (labeled_docs, doc_labels, unlabeled_docs) in domain_data_pairs:
        stack_sparse = scipy.sparse.issparse(labeled_docs)
        length_of_docs = 0
        if not stack_sparse:
            # if the labeled documents are not a sparse matrix, i.e.
            # a tf-idf matrix, we can just flatten them into one array
            docs += labeled_docs
            length_of_docs += len(labeled_docs)
            # the unlabeled slot can be None when a domain has no unlabeled
            # data, so it has to be checked before it is used
            if unlabeled and unlabeled_docs is not None:
                docs += unlabeled_docs
                # count the unlabeled documents themselves so that `domains`
                # stays aligned with `docs`
                length_of_docs += len(unlabeled_docs)
        else:
            # if it is a sparse matrix, we just append the docs as a list and
            # then stack the list in the end
            docs.append(labeled_docs)
            length_of_docs += labeled_docs.shape[0]
            if unlabeled and unlabeled_docs is not None:
                docs.append(unlabeled_docs)
                length_of_docs += unlabeled_docs.shape[0]
        labels.append(doc_labels)

        # we just add the corresponding domain for each document so that we can
        # later see where the docs came from
        domains += [domain] * length_of_docs
    if stack_sparse:
        # finally, if the matrix was sparse, we can stack the documents together
        docs = scipy.sparse.vstack(docs)
    # np.hstack raises on an empty sequence, so handle "no domains" explicitly
    all_labels = np.hstack(labels) if labels else np.array([])
    return docs, all_labels, domains
def log_to_file(log_file, run_dict, trg_domain, args):
    """
    Append one tab-separated summary line per method to the log file.

    Each line holds the target domain, the method, the feature sets, mean and
    standard deviation of the validation and test scores, the individual
    scores, the best feature weights (an empty list for methods other than
    bayes-opt), and all script arguments as ``name=value`` pairs.
    :param log_file: the file used for logging; it is opened in append mode
    :param run_dict: a dictionary mapping a method name to a list of
                     (val_accuracy, test_accuracy) tuples or a list
                     of (val_accuracy, test_accuracy, best_feature_weight)
                     tuples for the bayes-opt method
    :param trg_domain: the target domain
    :param args: the arguments used as input to the script
    """
    line_template = ('%s\t%s\t%s\t%.4f (+-%.4f)\t%.4f (+-%.4f)\t[%s]\t[%s]'
                     '\t%s\t%s\n')
    with open(log_file, 'a') as log:
        for method, scores in run_dict.items():
            # methods without any finished run are not logged
            if len(scores) == 0:
                continue

            # turn the list of per-run tuples into one tuple per column
            columns = zip(*scores)
            if method.startswith('bayes-opt'):
                val_accuracies, test_accuracies, weights = columns
            else:
                val_accuracies, test_accuracies = columns
                weights = ''

            val_listing = ', '.join('%.4f' % score for score in val_accuracies)
            test_listing = ', '.join('%.4f' % score
                                     for score in test_accuracies)
            arg_listing = ' '.join('%s=%s' % (name, str(getattr(args, name)))
                                   for name in vars(args))
            log.write(line_template % (
                trg_domain, method, ' '.join(args.feature_sets),
                np.mean(val_accuracies), np.std(val_accuracies),
                np.mean(test_accuracies), np.std(test_accuracies),
                val_listing, test_listing, str(list(weights)), arg_listing))
def task2read_data_func(task):
    """
    Look up the function that reads the data of a task.
    :param task: the task id; sentiment, one of the two POS tagging tasks,
                 or parsing
    :return: the data reading function of the task
    :raises ValueError: if no reading function is registered for the task
    """
    readers = (
        ((SENTIMENT,), read_processed),
        ((POS, POS_BILSTM), read_tagging_data),
        ((PARSING,), read_parsing_data),
    )
    for tasks, reader in readers:
        if task in tasks:
            return reader
    raise ValueError(
        'No data reading function available for task %s.' % task)
def sentiment2id(sentiment):
    """
    Translate a sentiment name into its label id.
    :param sentiment: the sentiment; one of [positive, pos, negative, neg]
    :return: POS_ID for a positive and NEG_ID for a negative sentiment
    :raises ValueError: if the sentiment is not one of the four names
    """
    positive_names = ('positive', 'pos')
    negative_names = ('negative', 'neg')
    # reject unknown names first so that only valid ones reach the mapping
    if sentiment not in positive_names + negative_names:
        raise ValueError('%s is not a valid sentiment.' % sentiment)
    return POS_ID if sentiment in positive_names else NEG_ID
def read_parsing_data(dir_path, top_k_unlabeled=2000):
    """
    Reads the CoNLL parsing files in the ``parse`` sub-directory of dir_path
    and the raw text files in its ``unlabeled`` sub-directory.
    :param dir_path: the directory that contains the parse and unlabeled
                     folders
    :param top_k_unlabeled: only use the top k unlabeled examples
    :return: a dictionary that maps domains to a list of
             [labeled_conll_entries, pseudo_labels, unlabeled_sentences];
             labeled_conll_entries holds the sentences returned by
             read_conll for the dev and test files; pseudo_labels is a numpy
             array with one 0 per labeled sentence, since the entries
             themselves already carry the target labels;
             unlabeled_sentences is a list of whitespace-tokenized lines
    """
    parse_root = os.path.join(dir_path, 'parse')
    assert os.path.exists(parse_root), ('Error: %s does not exist.' %
                                        parse_root)
    domains = list(os.listdir(parse_root))
    print(domains)
    assert set(domains) == set(POS_PARSING_TRG_DOMAINS)
    domain2data = dict((domain, [[], [], None]) for domain in domains)
    for domain in domains:
        print('Processing %s...' % domain)
        entry = domain2data[domain]
        for split in ('dev', 'test', 'unlabeled'):
            print('Processing %s/%s...' % (domain, split), end='')
            if split != 'unlabeled':
                # labeled files are parse/{domain}/gweb-{domain}-{split}.conll;
                # only the wsj test file carries the ontonotes prefix instead
                is_wsj_test = domain == 'wsj' and split == 'test'
                prefix = 'ontonotes' if is_wsj_test else 'gweb'
                conll_path = os.path.join(
                    parse_root, domain,
                    '%s-%s-%s.conll' % (prefix, domain, split))
                assert os.path.exists(conll_path), ('%s does not exist.' %
                                                    conll_path)
                with open(conll_path, 'r') as conll_file:
                    sentences = list(read_conll(conll_file))
                entry[0] += sentences
                # add pseudo-labels since the model doesn't use explicit
                # labels for training
                entry[1] += [0] * len(sentences)
                continue

            txt_path = os.path.join(dir_path, 'unlabeled',
                                    'gweb-%s.unlabeled.txt' % (domain))
            assert os.path.exists(txt_path), ('%s does not exist.' %
                                              txt_path)
            # the file is read as bytes so that undecodable bytes can be
            # dropped instead of aborting the whole read
            with open(txt_path, 'rb') as raw_file:
                sentences = [raw.decode('utf-8', 'ignore').strip().split()
                             for raw in raw_file]
            print('Read %s number of unlabeled sentences' % len(sentences))
            print('Took top {} documents '.format(top_k_unlabeled))
            # the unlabeled sentences go to the third position of the entry
            entry[2] = sentences[:top_k_unlabeled]
        entry[1] = np.array(entry[1])
    return domain2data
The beginning of the file looks like this: + Labeled attachment score: 6995 / 9615 * 100 = 72.75 % + Unlabeled attachment score: 7472 / 9615 * 100 = 77.71 % + Label accuracy score: 8038 / 9615 * 100 = 83.60 % + ... + :param evaluation_file_path: the path of the evaluation file produced by the perl script + :return: the labeled attachment score, the unlabeled attachment score, and the label accuracy score + """ + try: + with open(evaluation_file_path, 'r') as f: + lines = f.readlines() + las = float(lines[0].split('=')[1].strip('% \n')) + uas = float(lines[1].split('=')[1].strip('% \n')) + acc = float(lines[2].split('=')[1].strip('% \n')) + except Exception: + las = 0.0 + uas = 0.0 + acc = 0.0 + return las, uas, acc diff --git a/similarity.py b/similarity.py new file mode 100644 index 0000000..fdf91a0 --- /dev/null +++ b/similarity.py @@ -0,0 +1,342 @@ +""" +Methods for measuring domain similarity according to different metrics based on +different representations. +""" + +import os + +from sklearn.feature_extraction.text import CountVectorizer +import gensim + +import numpy as np +import scipy.stats +import scipy.spatial.distance + + +# SIMILARITY MEASURES + +def jensen_shannon_divergence(repr1, repr2): + """Calculates Jensen-Shannon divergence (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence).""" + avg_repr = 0.5 * (repr1 + repr2) + sim = 1 - 0.5 * (scipy.stats.entropy(repr1, avg_repr) + scipy.stats.entropy(repr1, avg_repr)) + if np.isinf(sim): + # the similarity is -inf if no term in the document is in the vocabulary + return 0 + return sim + + +def renyi_divergence(repr1, repr2, alpha=0.99): + """Calculates Renyi divergence (https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R.C3.A9nyi_divergence).""" + log_sum = np.sum([np.power(p, alpha) / np.power(q, alpha-1) for (p, q) in zip(repr1, repr2)]) + sim = 1 / (alpha - 1) * np.log(log_sum) + if np.isinf(sim): + # the similarity is -inf if no term in the document is in the vocabulary + return 0 + return 
sim + + +def cosine_similarity(repr1, repr2): + """Calculates cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity).""" + if repr1 is None or repr2 is None: + return 0 + assert not (np.isnan(repr2).any() or np.isinf(repr2).any()) + assert not (np.isnan(repr1).any() or np.isinf(repr1).any()) + sim = 1 - scipy.spatial.distance.cosine(repr1, repr2) + if np.isnan(sim): + # the similarity is nan if no term in the document is in the vocabulary + return 0 + return sim + + +def euclidean_distance(repr1, repr2): + """Calculates Euclidean distance (https://en.wikipedia.org/wiki/Euclidean_distance).""" + sim = np.sqrt(np.sum([np.power(p-q, 2) for (p, q) in zip(repr1, repr2)])) + return sim + + +def variational_distance(repr1, repr2): + """Also known as L1 or Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry).""" + sim = np.sum([np.abs(p-q) for (p, q) in zip(repr1, repr2)]) + return sim + + +def kl_divergence(repr1, repr2): + """Calculates Kullback-Leibler divergence (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence).""" + sim = scipy.stats.entropy(repr1, repr2) + return sim + + +def bhattacharyya_distance(repr1, repr2): + """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance).""" + sim = - np.log(np.sum([np.sqrt(p*q) for (p, q) in zip(repr1, repr2)])) + assert not np.isnan(sim), 'Error: Similarity is nan.' 
+ if np.isinf(sim): + # the similarity is -inf if no term in the review is in the vocabulary + return 0 + return sim + + +def similarity_name2value(s_name, repr1, repr2): + """Given a similarity function name, return the corresponding similarity function value.""" + if s_name == 'jensen-shannon': + return jensen_shannon_divergence(repr1, repr2) + if s_name == 'renyi': + return renyi_divergence(repr1, repr2) + if s_name == 'cos' or s_name == 'cosine': + return cosine_similarity(repr1, repr2) + if s_name == 'euclidean': + return euclidean_distance(repr1, repr2) + if s_name == 'variational': + return variational_distance(repr1, repr2) + if s_name == 'kl': + return kl_divergence(repr1, repr2) + if s_name == 'bhattacharyya': + return bhattacharyya_distance(repr1, repr2) + raise ValueError('%s is not a valid feature name.' % s_name) + + +# TERM DISTRIBUTIONS + +def get_domain_term_dists(term_dist_path, domain2data, vocab, lowercase=True): + """ + Retrieves relative term distributions from the provided domains. + :param term_dist_path: the path where the term distributions of the domains + should be saved + :param domain2data: the mapping of domains to (labeled_examples, labels, + unlabeled_examples) tuples + :param vocab: the Vocabulary object + :param lowercase: lower-case the input data + :return: a mapping of domains to their term distributions, + i.e. a numpy array of shape (vocab_size,) + """ + domain2term_dist = {} + if os.path.exists(term_dist_path): + print('Loading the term distributions from file...') + with open(term_dist_path, 'r') as f: + for line in f: + domain, term_dist = line.strip().split('\t') + term_dist = np.fromstring(term_dist, count=vocab.size, sep=' ') + assert len(term_dist) == vocab.size,\ + ('Length of term dist for %s should be %d, is %d.' % + (domain, vocab.size, len(term_dist))) + assert np.round(np.sum(term_dist), 6) == 1,\ + ('Sum of term distribution is %.6f instead of 1. 
The ' + 'vocabulary was likely created with a larger ' + 'max_vocab_size.' % np.sum(term_dist)) + domain2term_dist[domain] = term_dist + assert set(domain2term_dist.keys()) == set(domain2data.keys()),\ + ('Term distributions are not saved for all domains: "%s" and "%s"' + 'are not equal.' % (' '.join(domain2term_dist.keys()), + ' '.join(domain2data.keys()))) + return domain2term_dist + + if lowercase: + print('Lower-casing the data for calculating the term distributions...') + + # get the term domain counts for the term distributions + for domain, (examples, _, unlabeled_examples) in domain2data.items(): + domain2term_dist[domain] = get_term_dist( + examples + unlabeled_examples, vocab, lowercase) + + print('Writing relative frequency distributions to %s...' % term_dist_path) + with open(term_dist_path, 'w') as f: + for domain, term_dist in domain2term_dist.items(): + f.write('%s\t%s\n' % (domain, ' '.join([str(c) for c in term_dist]))) + return domain2term_dist + + +def get_term_dist(docs, vocab, lowercase=True): + """ + Calculates the term distribution of a list of documents. + :param docs: a list of tokenized docs; can also contain a single document + :param vocab: the Vocabulary object + :param lowercase: lower-case the input data + :return: the term distribution of the input documents, + i.e. 
a numpy array of shape (vocab_size,) + """ + term_dist = np.zeros(vocab.size) + for doc in docs: + for word in doc: + if lowercase: + word = word.lower() + if word in vocab.word2id: + term_dist[vocab.word2id[word]] += 1 + + # normalize absolute freqs to obtain a relative frequency term distribution + term_dist /= np.sum(term_dist) + if np.isnan(np.sum(term_dist)): + # the sum is nan if docs only contains one document and that document + # has no words in the vocabulary + term_dist = np.zeros(vocab.size) + return term_dist + + +def get_most_similar_domain(trg_domain, domain2term_dists, + similarity_name='jensen-shannon'): + """ + Given a target domain, retrieve the domain that is most similar to it + according to some domain similarity measure (default: Jensen-Shannon + divergence). + :param trg_domain: the target domain + :param domain2term_dists: a mapping of domain names to their term distribution + (a numpy array of shape (vocab_size,) ) + :param similarity_name: a string indicating the name of the similarity + measure used (default: 'jensen-shannon') + :return: the domain most similar to the target domain + """ + highest_sim_score, most_similar_domain = 0, None + trg_term_dist = domain2term_dists[trg_domain] + for domain, src_term_dist in domain2term_dists.items(): + if domain == trg_domain: + continue + sim_score = similarity_name2value(similarity_name, src_term_dist, trg_term_dist) + if sim_score > highest_sim_score: + highest_sim_score, most_similar_domain = sim_score, domain + return most_similar_domain + + +# TOPIC DISTRIBUTIONS + +def train_topic_model(examples, vocab, num_topics=50, num_iterations=2000, + num_passes=10): + """ + Trains an LDA topic model on the provided list of tokenised documents and + returns the vectorizer used for the transformation and the trained LDA + model. 
+ :param examples: a list of tokenised documents of all domains + :param vocab: the Vocabulary object + :param num_topics: the number of topics that should be used + :param num_iterations: the number of iterations + :param num_passes: the number of passes over the corpus that should be + performed + :return: the CountVectorizer used for transforming the corpus and the + trained LDA topic model + """ + # the text is already tokenized and pre-processed; we only need to + # transform it to vectors + vectorizer = CountVectorizer(vocabulary=vocab.word2id, + tokenizer=lambda x: x, + preprocessor=lambda x: x) + lda_corpus = vectorizer.fit_transform(examples) + + # the gensim LDA implementation requires a sparse corpus; + # we could also use sci-kit learn instead + lda_corpus = gensim.matutils.Sparse2Corpus(lda_corpus, + documents_columns=False) + print('Training LDA model on data of all domains with %d topics, ' + '%d iterations, %d passes...' % (num_topics, num_iterations, + num_passes)) + lda_model = gensim.models.LdaMulticore( + lda_corpus, num_topics=num_topics, id2word=vocab.id2word, + iterations=num_iterations, passes=num_passes) + return vectorizer, lda_model + + +def get_topic_distributions(examples, vectorizer, lda_model): + """ + Retrieve the topic distributions of a collection of documents. 
+ :param examples: a list of tokenised documents + :param vectorizer: the CountVectorizer used for transforming the documents + :param lda_model: the trained LDA model + :return: an array of shape (num_examples, num_topics) containing the topic + distribution of each example + """ + vectorized_corpus = vectorizer.transform(examples) + gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus, + documents_columns=False) + topic_representations = [] + for doc in gensim_corpus: + topic_representations.append( + [topic_prob for (topic_id, topic_prob) in + lda_model.get_document_topics(doc, minimum_probability=0.)]) + return np.array(topic_representations) + + +# PRE-TRAINED WORD EMBEDDINGS METHODS + +def load_word_vectors(file, vocab_word_vec_file, word2id, vector_size=300, + header=False): + """ + Loads word vectors from a text file, e.g. the one obtained from + http://nlp.stanford.edu/projects/glove/. + :param file: the file the word vectors should be loaded from + :param vocab_word_vec_file: the file where the word embeddings in the + vocabulary can be stored for faster retrieval + :param word2id: the mapping of words to their ids in the vocabulary + :param vector_size: the size of the word vectors + :param header: whether the word vectors text file contains a header; + default is False + :return a dictionary mapping each word to its numpy word vector + """ + word2vector = {} + if os.path.exists(vocab_word_vec_file): + print('Loading vocabulary word vectors from %s...' % vocab_word_vec_file) + with open(vocab_word_vec_file, 'r', encoding='utf-8') as f: + for line in f: + word = line.split(' ')[0] + assert word in word2id, ('Error: %s in vocab word vec file is ' + 'not in vocab.' % word) + line = ' '.join(line.split(' ')[1:]).strip() + vector = np.fromstring(line, dtype=float, sep=' ') + assert len(vector) == vector_size,\ + ('Error: %d != vector size %d for word %s.' 
+ % (len(vector), vector_size, word)) + word2vector[word] = vector + return word2vector + + print('Reading word vectors from %s...' % file) + with open(file, 'r', encoding='utf-8') as f: + for i, line in enumerate(f): + if i == 0 and header: + continue + if i % 100000 == 0 and i > 0: + print('Processed %d vectors.' % i) + word = line.split(' ')[0] + if word not in word2id: + continue + line = ' '.join(line.split(' ')[1:]).strip() + vector = np.fromstring(line, dtype=float, sep=' ') + assert len(vector) == vector_size + word2vector[word] = vector + + print('Writing word vectors to %s...' % vocab_word_vec_file) + with open(vocab_word_vec_file, 'w', encoding='utf-8') as f: + for word, vector in word2vector.items(): + f.write('%s %s\n' % (word, ' '.join([str(c) for c in vector]))) + return word2vector + + +def weighted_sum_of_embeddings(docs, word2id, word2vector, term_dist): + """ + Get a weighted sum of embeddings representation for a list of documents + belonging to one domain. The documents are represented as a list of + ngrams. Also works if the list only contains a single document. + :param docs: a list of documents + :param word2id: the mapping of words to their ids in the vocabulary + :param word2vector: the mapping of words to their vector representations + :param term_dist: the term distribution of the data the words belong to + :return: the vector representation of the provided list of documents + """ + # the factor with which the word probability is smoothed, we empirically + # set this to the value used in Mikolov et al. 
(2013) + t = 10e-5 + word_embed_representations = [] + for doc in docs: + doc_vector = np.zeros(len(list(word2vector.values())[0])) + word_vector_count = 0 + for word in doc: + if word in word2vector: + vector = word2vector[word] + + # weight the vector with the smoothed inverse probability of + # the word + doc_vector += np.sqrt(t / (term_dist[word2id[word]])) * vector + word_vector_count += 1 + if word_vector_count == 0: + # this might be because the review is in another language by + # accident; set count to 1 to avoid division by 0 + word_vector_count = 1 + doc_vector /= word_vector_count + assert not (np.isnan(doc_vector).any() or np.isinf(doc_vector).any()) + word_embed_representations.append(doc_vector) + return np.array(word_embed_representations) diff --git a/simpletagger.py b/simpletagger.py new file mode 100644 index 0000000..5b39a05 --- /dev/null +++ b/simpletagger.py @@ -0,0 +1,359 @@ +#!/bin/env python3 +# -*- coding: utf-8 -*- + +# Simple structured perceptron tagger (bplank, parts by andersjo) - Language Proc 2 +import argparse +import codecs +from collections import defaultdict, Counter +import json +import re +import numpy as np +import sys +import random + +np.set_printoptions(precision=4) + + +def read_conll_file(file_name): + """ + read in a file with format: + word1 tag1 + ... ... + wordN tagN + + Sentences MUST be separated by newlines! 
+ + :param file_name: file to read in + :return: generator of instances ((list of words, list of tags) pairs) + """ + current_words = [] + current_tags = [] + + for line in codecs.open(file_name, encoding='utf-8'): + line = line.strip() + + if line: + word, tag = line.split('\t') + current_words.append(word) + current_tags.append(tag) + + else: + yield (current_words, current_tags) + current_words = [] + current_tags = [] + + # if file does not end in newline (it should...), check whether there is an instance in the buffer + if current_tags != []: + yield (current_words, current_tags) + + +def memoize(f): + """ + helper function to be used as decorator to memoize features + :param f: + :return: + """ + memo = {} + def helper(*args): + key = tuple(args[1:]) + try: + return memo[key] + except KeyError: + memo[key] = f(*args) + return memo[key] + return helper + + +class StructuredPerceptron(object): + """ + implements a structured perceptron as described in Collins 2002 + """ + + def __init__(self, seed=1512141834): + """ + initialize model + :return: + """ + self.feature_weights = defaultdict(float) + self.tags = set() + + self.START = "__START__" + self.END = "__END__" + print("using seed: {}".format(seed)) + random.seed(seed) + np.random.seed(seed) + + def fit(self, train_data, iterations=5, learning_rate=0.2): + """ + read in a CoNLL file, extract emission features iterate over instances to train weight vector + :param file_name: + :return: + """ + averaged_weights = Counter() + + for iteration in range(iterations): + correct = 0 + total = 0.0 + sys.stderr.write('iteration %s\n************\n' % (iteration+1)) + + for i, (words, tags) in enumerate(train_data): + if i%100==0: + sys.stderr.write('%s'%i) + elif i%10==0: + sys.stderr.write('.') + + for tag in tags: + self.tags.add(tag) + + # get prediction + prediction = self.decode(words) + + # derive global features + global_gold_features = self.get_global_features(words, tags) + global_prediction_features = 
self.get_global_features(words, prediction) + + # update weight vector + for fid, count in global_gold_features.items(): + self.feature_weights[fid] += learning_rate * count + for fid, count in global_prediction_features.items(): + self.feature_weights[fid] -= learning_rate * count + + # compute training accuracy for this iteration + correct += sum([1 for (predicted, gold) in zip(prediction, tags) if predicted == gold]) + total += len(tags) + + #sys.stderr.write('\n\t%s features\n' % (len(self.feature_weights))) + averaged_weights.update(self.feature_weights) + sys.stderr.write('\tTraining accuracy: %.4f\n\n' % (correct/total)) + + random.shuffle(train_data) + + self.feature_weights = averaged_weights + + def get_global_features(self, words, tags): + """ + count how often each feature fired for the whole sentence + :param words: + :param tags: + :return: + """ + feature_counts = Counter() + + for i, (word, tag) in enumerate(zip(words, tags)): + previous_tag = self.START if i == 0 else tags[i-1] + feature_counts.update(self.get_features(word, tag, previous_tag)) + + return feature_counts + + @memoize + def get_features(self, word, tag, previous_tag): + """ + get all features that can be derived from the word and tags + :param word: + :param tag: + :param previous_tag: + :return: + """ + word_lower = word.lower() + prefix = word_lower[:3] + suffix = word_lower[-3:] + + features = [ + 'TAG_%s' % (tag), # current tag + 'TAG_BIGRAM_%s_%s' % (previous_tag, tag), # tag bigrams + 'WORD+TAG_%s_%s' % (word, tag), # word-tag combination + 'WORD_LOWER+TAG_%s_%s' % (word_lower, tag),# word-tag combination (lowercase) + 'UPPER_%s_%s' % (word[0].isupper(), tag), # word starts with uppercase letter + 'DASH_%s_%s' % ('-' in word, tag), # word contains a dash + 'PREFIX+TAG_%s_%s' % (prefix, tag), # prefix and tag + 'SUFFIX+TAG_%s_%s' % (suffix, tag), # suffix and tag + + ######################### + # ADD MOAAAAR FEATURES! 
# + ######################### + ('WORDSHAPE', self.shape(word), tag), + 'WORD+TAG_BIGRAM_%s_%s_%s' % (word, tag, previous_tag), + 'SUFFIX+2TAGS_%s_%s_%s' % (suffix, previous_tag, tag), + 'PREFIX+2TAGS_%s_%s_%s' % (prefix, previous_tag, tag) + ] + + return features + + @memoize + def shape(self, x): + result = [] + for c in x: + if c.isupper(): + result.append('X') + elif c.islower(): + result.append('x') + elif c in '0123456789': + result.append('d') + else: + result.append(c) + + # replace multiple occurrences of a character with 'x*' and return it + return re.sub(r"x+", "x*", ''.join(result)) + + def decode(self,words): + """ + Find best sequence + :param words: + :return: + """ + N=len(words) + M=len(self.tags) #number of tags + tags=list(self.tags) + + # create trellis of size M (number of tags) x N (sentence length) + Q = np.ones((len(self.tags), N)) * float('-Inf') + backp = np.ones((len(self.tags), N), dtype=np.int16) * -1 #backpointers + + ### initialization step + cur_word=words[0] + for j in range(M): + # initialize probs for tags j at position 1 (first word) + cur_tag=tags[j] + features = self.get_features(words[0], cur_tag, self.START) + feature_weights = sum((self.feature_weights[x] for x in features)) + Q[j,0]=feature_weights + + # iteration step + # filling the lattice, for every position and every tag find viterbi score Q + for i in range(1,N): + # for every tag + for j in range(M): + # checks if we are at end or start + tag=tags[j] + + best_score = float('-Inf') + + # for every possible previous tag + for k in range(M): + + # k=previous tag + previous_tag=tags[k] + + best_before=Q[k,i-1] # score until best step before + + features = self.get_features(words[i], tag, previous_tag) + feature_weights = sum((self.feature_weights[x] for x in features)) + + score = best_before + feature_weights + + if score > best_score: + Q[j,i]=score + best_score = score + backp[j,i]=k #best tag + + # final best + #best_id=np.argmax(Q[:, -1]) #the same + 
best_id=Q[:,-1].argmax() + + ## print best tags in reverse order + predtags=[] + predtags.append(tags[best_id]) + + for i in range(N-1,0,-1): + idx=int(backp[best_id,i]) + predtags.append(tags[idx]) + best_id=idx + + #return reversed predtags + #return (words,predtags[::-1]) + return predtags[::-1] + + def predict(self, test_data): + """ + Get predictions for entire test set + :param test_data: + :return: + """ + return [self.decode(words) for words in test_data] + + def predict_eval(self, test_data, output=False): + """ + compute accuracy on a test file + :param file_name: + :param output: + :return: + """ + correct = 0 + total = 0.0 + sys.stderr.write('\nTesting\n') + sys.stderr.write('*******\n') + + for i, (words, tags) in enumerate(test_data): + if i%100==0: + sys.stderr.write('%s'%i) + elif i%10==0: + sys.stderr.write('.') + + # get prediction + prediction = self.decode(words) + + if output: + for word, gold, pred in zip(words, tags, prediction): + print("{}\t{}\t{}".format(word, gold, pred)) + print("") + + correct += sum([1 for (predicted, gold) in zip(prediction, tags) if predicted == gold]) + total += len(tags) + print("\nTest accuracy on %s items: %.4f" % (i+1, correct/total), file=sys.stderr) + + def save(self, file_name): + """ + save model + :param file_name: + :return: + """ + print("saving model...", end=' ', file=sys.stderr) + with codecs.open(file_name, "w", encoding='utf-8') as model: + model.write("%s\n" % json.dumps({'tags': list(self.tags), 'weights': dict(self.feature_weights)})) + print("done", file=sys.stderr) + + def load(self, file_name): + """ + load model from JSON file + :param file_name: + :return: + """ + print("loading model...", end=' ', file=sys.stderr) + model_data = codecs.open(file_name, 'r', encoding='utf-8').readline().strip() + model = json.loads(model_data) + self.tags = set(model['tags']) + self.feature_weights = model['weights'] + print("done", file=sys.stderr) + + +# if script is run from command line, automatically 
execute the following +if __name__=="__main__": + + # parse command line options + parser = argparse.ArgumentParser(description="""Run a structured perceptron""") + parser.add_argument("--train", help="train model on a file (CoNLL format)", required=False) + parser.add_argument("--test", help="test model on a file (CoNLL format)", required=False) + parser.add_argument("--output", help="output predictions to stdout", required=False,action="store_true") + parser.add_argument("--load", help="load model from JSON file", required=False) + parser.add_argument("--save", help="save model as JSON file", required=False) + parser.add_argument("--iterations", help="number of training iterations", required=False, default=5, type=int) + parser.add_argument("--learning_rate", help="learning rate during training", required=False, default=0.2, type=float) + args = parser.parse_args() + + # create new model + sp = StructuredPerceptron() + + if args.load: + sp.load(args.load) + + if args.train: + train_data = list(read_conll_file(args.train)) + sp.fit(train_data, iterations=args.iterations, learning_rate=args.learning_rate) + + if args.save: + sp.save(args.save) + + # check whether to show predictions + if args.test: + test_data = list(read_conll_file(args.test)) + sp.predict_eval(test_data, output=args.output) diff --git a/task_utils.py b/task_utils.py new file mode 100644 index 0000000..99ab066 --- /dev/null +++ b/task_utils.py @@ -0,0 +1,409 @@ +""" +Utility methods that are used for training and evaluation of the tasks. 
+""" + +import os +import operator +import numpy as np +import random +from collections import namedtuple + +from sklearn import svm +from sklearn.metrics import accuracy_score + +import data_utils +from constants import POS_ID, NEG_ID, SENTIMENT, POS, POS_BILSTM, PARSING,\ + BAYES_OPT +from simpletagger import StructuredPerceptron + +from bist_parser.bmstparser.src import mstlstm +from bist_parser.bmstparser.src.utils import vocab_conll, write_conll,\ + write_original_conll + +from bilstm_tagger.src.simplebilty import SimpleBiltyTagger, load + +NUM_EPOCHS = 50 +PATIENCE = 2 + + +def get_data_subsets(feature_vals, feature_weights, train_data, train_labels, + task, num_train_examples): + """ + Given the feature values and the feature weights, return the stratified + subset of the training data with the highest feature scores. + :param feature_vals: a numpy array of shape (num_train_data, num_features) + containing the feature values + :param feature_weights: a numpy array of shape (num_features, ) containing + the weight for each feature + :param train_data: a sparse numpy array of shape (num_train_data, vocab_size) + containing the training data + :param train_labels: a numpy array of shape (num_train_data) containing the + training labels + :param task: the task; this determines whether we use stratification + :param num_train_examples: the number of training examples for the + respective task + :return: subsets of the training data and its labels as a tuple of two + numpy arrays + """ + # calculate the scores as the dot product between feature values and weights + scores = feature_vals.dot(np.transpose(feature_weights)) + + # sort the indices by their scores + sorted_index_score_pairs = sorted(zip(range(len(scores)), scores), + key=operator.itemgetter(1), reverse=True) + + # get the top indices + top_indices, _ = zip(*sorted_index_score_pairs) + + if task == SENTIMENT: + # for sentiment, rather than taking the top n indices, we still want to + # have a stratified 
training set so we take the top n/2 positive and + # top n/2 negative indices + top_pos_indices = [idx for idx in top_indices if train_labels[idx] == + POS_ID][:int(num_train_examples/2)] + top_neg_indices = [idx for idx in top_indices if train_labels[idx] == + NEG_ID][:int(num_train_examples/2)] + top_indices = top_pos_indices + top_neg_indices + elif task in [POS, POS_BILSTM, PARSING]: + # for POS tagging and parsing, we don't need a stratified train set + top_indices = list(top_indices[:num_train_examples]) + else: + raise ValueError('Top index retrieval not implemented for %s.' % task) + + if isinstance(train_data, list): + # numpy indexing does not work if train_data is a list + return [train_data[idx] for idx in top_indices],\ + train_labels[top_indices] + + # we get the corresponding subsets of the training data and the labels + return train_data[top_indices], train_labels[top_indices] + + +def task2train_and_evaluate_func(task): + """Return the train_and_evaluate function for a task.""" + if task == SENTIMENT: + return train_and_evaluate_sentiment + if task == POS: + return train_and_evaluate_pos + if task == POS_BILSTM: + return train_and_evaluate_pos_bilstm + if task == PARSING: + return train_and_evaluate_parsing + raise ValueError('Train_and_evaluate is not implemented for %s.' % task) + + +def train_and_evaluate_sentiment(train_data, train_labels, val_data, val_labels, + test_data=None, test_labels=None, + parser_output_path=None, perl_script_path=None): + """ + Trains an SVM on the provided training data. Calculates accuracy on the + validation set and (optionally) on the test set. 
+ :param train_data: the training data; a sparse numpy matrix of shape + (num_examples, max_vocab_size) + :param train_labels: the training labels; a numpy array of shape (num_labels) + :param val_data: the validation data; same format as the training data + :param val_labels: the validation labels + :param test_data: the test data + :param test_labels: the test labels + :param parser_output_path: only necessary for parsing; is ignored here + :param perl_script_path: only necessary for parsing; is ignored here + :return: the validation accuracy and (optionally) the test data; + otherwise None + """ + print('Training the SVM on %d examples...' % train_data.shape[0]) + clf = svm.SVC() + clf.fit(train_data, train_labels) + + # validate the configuration on the validation and test set (if provided) + val_predictions = clf.predict(val_data) + val_accuracy = accuracy_score(val_labels, val_predictions) + print('Val acc: %.5f' % val_accuracy) + test_accuracy = None + if test_data is not None and test_labels is not None: + test_predictions = clf.predict(test_data) + test_accuracy = accuracy_score(test_labels, test_predictions) + print('Test acc: %.5f' % test_accuracy) + return val_accuracy, test_accuracy + + +def train_and_evaluate_pos(train_data, train_labels, val_data, val_labels, + test_data=None, test_labels=None, + parser_output_path=None, perl_script_path=None): + """ + Trains the tagger on the provided training data. Calculates accuracy on the + validation set and (optionally) on the test set. 
def train_and_evaluate_pos(train_data, train_labels, val_data, val_labels,
                           test_data=None, test_labels=None,
                           parser_output_path=None, perl_script_path=None):
    """
    Trains the Structured Perceptron tagger on the provided training data.
    Calculates accuracy on the validation set and (optionally) on the test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: the validation accuracy and the test accuracy; the latter is
             None unless both test_data and test_labels are provided
    """
    print('Training the tagger on %d examples...' % len(train_data))
    sp = StructuredPerceptron()
    # the perceptron is fit on (words, tags) pairs
    tr_data = list(zip(train_data, train_labels))
    pos_iterations, pos_learning_rate = 5, 0.2
    sp.fit(tr_data, iterations=pos_iterations, learning_rate=pos_learning_rate)

    # validate the configuration on the validation and test set (if provided)
    val_predictions = sp.predict(val_data)
    val_accuracy = pos_accuracy_score(val_labels, val_predictions)
    print('Val acc: %.5f' % val_accuracy)

    test_accuracy = None
    if test_data is not None and test_labels is not None:
        test_predictions = sp.predict(test_data)
        test_accuracy = pos_accuracy_score(test_labels, test_predictions)
        print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy


def train_and_evaluate_pos_bilstm(train_data, train_labels, val_data, val_labels,
                                  test_data=None, test_labels=None,
                                  parser_output_path=None, perl_script_path=None):
    """
    Trains the BiLSTM tagger on the provided training data with early stopping.
    Calculates accuracy on the validation set and (optionally) on the test set.
    :param train_data: the training data; a list of lists of shape
                       (num_examples, sequence_length)
    :param train_labels: the training labels; a list of lists of tags
    :param val_data: the validation data; same format as the training data
    :param val_labels: the validation labels
    :param test_data: the test data
    :param test_labels: the test labels
    :param parser_output_path: only necessary for parsing; is ignored here
    :param perl_script_path: only necessary for parsing; is ignored here
    :return: the validation accuracy and the test accuracy; the latter is
             None unless both test_data and test_labels are provided
    """
    import shutil
    import tempfile

    print('Training the BiLSTM tagger on %d examples...' % len(train_data))
    in_dim = 64
    h_dim = 100
    c_in_dim = 100
    h_layers = 1
    trainer = "adam"

    # The best model is checkpointed to disk and restored after training. A
    # private scratch directory guarantees that parallel runs never share a
    # path and lets every file written next to the model (e.g. the '.pickle'
    # file holding the parameters) be removed in one go, even on failure.
    model_dir = tempfile.mkdtemp(prefix='bilstm_tagger_model_')
    model_path = os.path.join(model_dir, 'model')
    try:
        tagger = SimpleBiltyTagger(in_dim, h_dim, c_in_dim, h_layers,
                                   embeds_file=None)
        train_X, train_Y = tagger.get_train_data_from_instances(train_data,
                                                                train_labels)
        val_X, val_Y = tagger.get_data_as_indices_from_instances(val_data,
                                                                 val_labels)

        # train the model with early stopping
        tagger.fit(train_X, train_Y, NUM_EPOCHS, trainer, val_X=val_X,
                   val_Y=val_Y, patience=PATIENCE, model_path=model_path)

        # load the best model before the model files are removed
        tagger = load(model_path)
    finally:
        shutil.rmtree(model_dir, ignore_errors=True)

    val_correct, val_total = tagger.evaluate(val_X, val_Y)
    # explicit float division, consistent with pos_accuracy_score
    val_accuracy = val_correct / float(val_total)
    print('Val acc: %.5f' % val_accuracy)

    test_accuracy = None
    if test_data is not None and test_labels is not None:
        test_X, test_Y = tagger.get_data_as_indices_from_instances(test_data,
                                                                   test_labels)
        test_correct, test_total = tagger.evaluate(test_X, test_Y)
        test_accuracy = test_correct / float(test_total)
        print('Test acc: %.5f' % test_accuracy)
    return val_accuracy, test_accuracy


def _run_conll_evaluation(perl_script_path, gold_path, pred_path):
    """
    Scores a prediction file against a gold file with the perl evaluation
    script and reads the scores back from the script's output.
    :param perl_script_path: the path to the perl evaluation script
    :param gold_path: the path to the gold CoNLL file
    :param pred_path: the path to the predicted CoNLL file
    :return: the scores read by data_utils.read_parsing_evaluation and the
             path of the evaluation file that was written
    """
    import subprocess

    eval_path = pred_path + '.eval'
    # an argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact
    command = ['perl', perl_script_path, '-g', gold_path, '-s', pred_path]
    print('Evaluating with %s > %s...' % (' '.join(command), eval_path))
    with open(eval_path, 'w') as eval_file:
        exit_code = subprocess.call(command, stdout=eval_file)
    if exit_code != 0:
        print('Warning: the evaluation script exited with code %d.'
              % exit_code)
    return data_utils.read_parsing_evaluation(eval_path), eval_path


def train_and_evaluate_parsing(train_data, train_labels, val_data, val_labels,
                               test_data=None, test_labels=None,
                               parser_output_path=None, perl_script_path=None):
    """
    Trains the parser on the provided training data with early stopping on the
    validation LAS. Calculates LAS on the validation set and (optionally) on
    the test set.
    :param train_data: the training data; a list of CoNLL entries
    :param train_labels: pseudo-labels; not used as labels as labels are
                         contained in train_data
    :param val_data: the validation data; same format as the training data
    :param val_labels: pseudo-labels; not used as contained in val_data
    :param test_data: the test data
    :param test_labels: pseudo-labels; not used as contained in test_data
    :param parser_output_path: the directory that the CoNLL files, evaluation
                               files and the best model are written to
    :param perl_script_path: the path to the perl evaluation script
    :return: the best validation LAS and the test LAS; the latter is None if
             no test data is provided
    :raises ValueError: if parser_output_path or perl_script_path is missing
    """
    if parser_output_path is None or perl_script_path is None:
        raise ValueError('parser_output_path and perl_script_path are '
                         'required for parsing.')

    print('Training the parser on %d examples...' % len(train_data))
    vocab_entries = [train_data, val_data]
    if test_data is not None:
        # incorporate the test data as some POS tags (e.g. XX) might only
        # appear in the target domain
        vocab_entries.append(test_data)
    words, w2i, pos, rels = vocab_conll(np.hstack(vocab_entries))

    # set the variables used for initializing the parser and initialize the
    # parser
    ParserOptions = namedtuple('parser_options',
                               'activation, blstmFlag, labelsFlag, costaugFlag,'
                               ' bibiFlag, lstm_dims, wembedding_dims, '
                               'pembedding_dims, rembedding_dims, lstm_layers, '
                               'external_embedding, hidden_units, '
                               'hidden2_units, epochs')
    parser_options = ParserOptions(
        epochs=NUM_EPOCHS,
        activation='tanh',
        blstmFlag=True,
        labelsFlag=True,
        costaugFlag=True,
        bibiFlag=False,
        lstm_dims=125,
        wembedding_dims=100,
        pembedding_dims=25,
        rembedding_dims=25,
        lstm_layers=2,
        external_embedding=None,
        hidden_units=100,
        hidden2_units=0
    )
    parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)

    # write the dev data to a file
    dev_data_path = os.path.join(parser_output_path, 'dev.conll')
    write_original_conll(dev_data_path, val_data)

    # set the variables used for tracking training progress for early stopping
    best_dev_las, epochs_no_improvement = 0., 0
    best_model_path = os.path.join(parser_output_path, 'parser')
    print('Training model for %d max epochs with early stopping with patience '
          '%d...' % (NUM_EPOCHS, PATIENCE))
    for epoch in range(parser_options.epochs):
        print('Starting epoch', epoch)
        parser.TrainOnEntries(train_data)

        # write the predictions to a file and score them
        pred_path = os.path.join(parser_output_path,
                                 'dev_pred_epoch_' + str(epoch + 1) + '.conll')
        write_conll(pred_path, parser.PredictOnEntries(val_data))
        (las, uas, acc), eval_path = _run_conll_evaluation(
            perl_script_path, dev_data_path, pred_path)

        # remove the predictions and the evaluation file
        if os.path.exists(pred_path):
            os.unlink(pred_path)
        if os.path.exists(eval_path):
            os.unlink(eval_path)

        if las > best_dev_las:
            print('LAS %.2f is better than best dev LAS %.2f.'
                  % (las, best_dev_las))
            best_dev_las = las
            epochs_no_improvement = 0
            parser.Save(best_model_path)
        else:
            print('LAS %.2f is worse than best dev LAS %.2f.'
                  % (las, best_dev_las))
            epochs_no_improvement += 1
        if epochs_no_improvement == PATIENCE:
            print('No improvement for %d epochs. Early stopping...'
                  % epochs_no_improvement)
            print('Best dev LAS:', best_dev_las)
            break

    test_las = None
    if test_data is not None:
        # load the best model
        parser = mstlstm.MSTParserLSTM(words, pos, rels, w2i, parser_options)
        parser.Load(best_model_path)

        # first write the test data to a file
        test_data_path = os.path.join(parser_output_path, 'test.conll')
        write_original_conll(test_data_path, test_data)

        # then write the prediction to another file and score it
        pred_path = os.path.join(parser_output_path, 'test_pred.conll')
        write_conll(pred_path, parser.PredictOnEntries(test_data))
        (test_las, test_uas, test_acc), _ = _run_conll_evaluation(
            perl_script_path, test_data_path, pred_path)
        print('Test LAS:', test_las, 'test UAS:', test_uas,
              'test acc:', test_acc)

    # remove the saved parser
    if os.path.exists(best_model_path):
        os.unlink(best_model_path)
    return best_dev_las, test_las


def train_pretrained_weights(feature_values, X_train, y_train, train_domains,
                             num_train_examples, X_val, y_val, X_test, y_test,
                             trg_domain, args, feature_names,
                             parser_output_path, perl_script_path):
    """
    Train a model using pre-trained data selection weights (which could have
    been trained on an other model/domain/task).
    :param feature_values: a numpy array of shape (num_examples, num_features)
    :param X_train: the training data
    :param y_train: the training labels
    :param train_domains: a list of training domains, only used for counting
    :param num_train_examples: the number of examples used for training
    :param X_val: the validation data
    :param y_val: the validation labels
    :param X_test: the test data
    :param y_test: the test labels
    :param trg_domain: the target domain
    :param args: the arguments used for calling the script; used for logging
    :param feature_names: a list of the feature names
    :param parser_output_path: the output path of the parser
    :param perl_script_path: the path to the perl script
    :return: None; the scores are written to args.log_file
    """
    for feat_weights_domain, feat_weights_feats, feature_weights in \
            data_utils.read_feature_weights_file(args.feature_weights_file):
        assert len(feature_weights) == len(feature_names), \
            'Number of feature weights and feature names differ.'
        assert set(args.feature_sets) == set(feat_weights_feats.split(' ')), \
            'Feature sets of the weights file and the arguments differ.'

        # only weights that belong to the target domain are used
        if trg_domain != feat_weights_domain:
            continue

        # count how many examples belong to each source domain
        train_domain_subset, _ = get_data_subsets(
            feature_values, feature_weights, train_domains, y_train, args.task,
            num_train_examples)
        for subset_domain in set(train_domain_subset):
            print('# of %s in train data for trg domain %s: %d'
                  % (subset_domain, trg_domain,
                     train_domain_subset.count(subset_domain)))
        # NOTE: no `continue` here, so that the selected subset is actually
        # trained on and logged, as described in the docstring.

        # get the train subset with the highest scores and train
        train_subset, labels_subset = get_data_subsets(
            feature_values, feature_weights, X_train, y_train, args.task,
            num_train_examples)
        val_accuracy, test_accuracy = task2train_and_evaluate_func(args.task)(
            train_subset, labels_subset, X_val, y_val, X_test, y_test,
            parser_output_path=parser_output_path,
            perl_script_path=perl_script_path)
        dict_key = ('%s-X-domain-%s-%s' % (BAYES_OPT, feat_weights_domain,
                                           feat_weights_feats))

        # log the result to the log file
        data_utils.log_to_file(args.log_file, {dict_key: [(
            val_accuracy, test_accuracy, feature_weights)]}, trg_domain, args)


def pos_accuracy_score(gold, predicted):
    """
    Calculate the token-level accuracy for POS.
    Sentences and tags are compared pairwise; the total is the number of gold
    tags, so gold tags without a prediction count as errors.
    :param gold: a list of lists of gold tags
    :param predicted: a list of lists of predicted tags
    :return: the accuracy score; 0.0 if there are no gold tags
    """
    tags_total = sum(len(gold_tags) for gold_tags in gold)
    if tags_total == 0:
        # nothing to score; avoid dividing by zero
        return 0.0
    tags_correct = sum(1 for gold_tags, pred_tags in zip(gold, predicted)
                       for g, p in zip(gold_tags, pred_tags) if g == p)
    return tags_correct / float(tags_total)