
Commit 08ef862

Docker (#11)
* add dockerfile
* use CLI
* elaborate README.md
* pretty print table
1 parent ce1eed2 commit 08ef862

1,060 files changed: +533 -21,103 lines


.dockerignore (new file, +4)

**/__pycache__/**/*
**/__pycache__
**/.cachier/**/*
**/.cachier

.gitignore (+1)

@@ -3,6 +3,7 @@ checkpoints
 .ipynb_checkpoints
 .idea
 .vscode
+.cachier
 group*-shard*
 models
 final_model

Dockerfile (new file, +23)

FROM tensorflow/tensorflow:latest-gpu

#ENV VIRTUAL_ENV=/opt/venv
#RUN python -m venv $VIRTUAL_ENV
#ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install --upgrade pip

WORKDIR /app
COPY README.md README.md
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

COPY hebrew_diacritized hebrew_diacritized
COPY tests tests
COPY models models
COPY nakdimon nakdimon

RUN chown -R 1000:1000 .
RUN chmod -R 755 .

#RUN nohup python nakdimon server &

#ENTRYPOINT ["python", "nakdimon"]
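The commit does not show a build step for this image; a minimal sketch, assuming the image is built from the repository root and tagged `nakdimon-gpu` to match the `docker run` example in the README below:

```
# hypothetical build command; the nakdimon-gpu tag matches the README's run example
$ docker build -t nakdimon-gpu .
```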

README.md (+55 -5)

@@ -1,15 +1,65 @@
-# Nakdimon - a simple Hebrew Diacritizer
+## Running the Docker container
+```
+$ docker run --rm --gpus all --user 1000:1000 -it nakdimon-gpu
+```
+
+The `--gpus all` flag is required to run the container with GPU support.
 
-Repository for the paper [Restoring Hebrew Diacritics Without a Dictionary](https://arxiv.org/abs/2105.05209) by Elazar Gershuni and Yuval Pinter.
+## Training and evaluating
+To train, test and evaluate the system, run the following commands:
+```
+> python nakdimon train --model=models/Nakdimon.h5
+> python nakdimon run_test --test_set=tests/new --model=models/Nakdimon.h5
+> python nakdimon results --test_set=tests/new --systems Snopi Morfix Dicta MajAllWithDicta Nakdimon
+```
+The first step trains the model and creates a file named `Nakdimon.h5` in the `models` directory.
+By default, the model is the one described in the paper: `models/Nakdimon.h5`.
+If the model already exists, you may skip this step.
 
-Demo: http://www.nakdimon.org/
+The second step asks the Nakdimon server to predict the diacritics for the test set.
+A folder for the results is created in the chosen test folder, with the same name as the model; in this case, `tests/new/NakdimonNew`.
+By default, the test set is the one used in the paper (`tests/new`); you can use `tests/dicta` instead.
+If the test results already exist, you may skip this step. If you are not sure, use the `--skip-existing` flag.
+
+The third step calculates and prints the results (the DEC, CHA, WOR and VOC metrics, as well as OOV_WOR and OOV_VOC).
+By default, the systems evaluated are the folders in the chosen test folder.
+For the Dicta test set (`tests/dicta`), use `MajAllNoDicta` instead of `MajAllWithDicta`; otherwise the vocabulary for the Majority baseline would include the test set itself.
+
+## Diacritizing a single file
+```
+> python nakdimon predict input_file.txt output_file.txt
+```
 
-Citation (until NAACL 2022 prceedings are available):
+## Using other systems
+You can use the `run_test` command to run the test set through other systems, such as Dicta:
+```
+> python nakdimon run_test --test_set=tests/new --system=Dicta
+```
+This creates a folder named `Dicta` for the results in the `tests/new` folder.
+Note that `Morfix` cannot be used in this manner, as its license prohibits automatic use.
+
+## Running ablation tests
+You can use the `--ablation` flag to train different models for the ablation tests and other experiments:
+```
+> python nakdimon train --model=models/SingleLayer.h5 --ablation=SingleLayer
+```
+See the file `ablations.py` for the list of available ablation parameters.
+
+## Important folders
+* `hebrew_diacritized` is the training set.
+* `tests` contains three test sets: `new`, `dicta` and `validation`.
+  Each test set has an `expected` folder that contains the ground truth.
+  The results of `python nakdimon run_test` are stored in a sibling folder, named after the model.
+* `models` contains the trained models.
+* `nakdimon` holds the source code.
+
+## Citation
+(until the NAACL 2022 proceedings are available):
 ```
 @article{gershuni2021restoring,
   title={Restoring Hebrew Diacritics Without a Dictionary},
   author={Gershuni, Elazar and Pinter, Yuval},
   journal={arXiv preprint arXiv:2105.05209},
   year={2021}
 }
-```
+```
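With the Dockerfile's `ENTRYPOINT` commented out, the container does not start Nakdimon on its own; a usage sketch, assuming the image was built as `nakdimon-gpu` and that the input file is mounted into the container (the `-v` mount and the `/data` paths are illustrative assumptions, not part of the commit):

```
# run a one-off diacritization inside the container; /data is a hypothetical mount point
$ docker run --rm --gpus all --user 1000:1000 -v "$PWD:/data" -it nakdimon-gpu \
    python nakdimon predict /data/input_file.txt /data/output_file.txt
```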

hebrew_diacritized

Submodule hebrew_diacritized updated 90 files (the 8 shown were renamed without changes).

nakdimon/__main__.py (new file, +97)

import argparse
import sys
import os
import logging


def do_train(**kwargs):
    import train
    train.main(**kwargs)


def do_run_test(**kwargs):
    import run_test
    run_test.main(**kwargs)


def do_metrics(**kwargs):
    import metrics
    metrics.main(**kwargs)


def do_predict(**kwargs):
    import predict
    predict.main(**kwargs)


def do_server(**kwargs):
    # Replace the current process with the flask server module.
    import os
    import sys
    import pkgutil
    package = pkgutil.get_loader("server")
    assert package is not None
    logging.info("Executing flask server...")
    os.execv(sys.executable, [sys.executable, package.get_filename()])
    exit(1)  # unreachable unless execv fails


if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description="""Train and evaluate Nakdimon and other diacritizers. Reproduce the Nakdimon paper.""",
    )
    parser.add_argument('-q', '--quiet', action='store_true', help='suppress info logging.', default=False)

    subparsers = parser.add_subparsers(help='sub-command help', dest="command", required=True)

    parser_train = subparsers.add_parser('train', help='train Nakdimon')
    parser_train.add_argument('--wandb', action='store_true', help='use wandb.', default=False)
    parser_train.add_argument('--model', help='path to output model (.h5 file)', default='models/Full.h5', dest='model_path')
    parser_train.add_argument('--ablation', help='ablation test', default=None, dest='ablation_name')
    parser_train.set_defaults(func=do_train)

    test_systems = ['Snopi', 'Morfix', 'Dicta', 'Nakdimon', 'MajMod', 'MajAllWithDicta', 'MajAllWithoutDicta']
    # iterate over folders to find available options:
    available_tests = [f'tests/{folder}' for folder in os.listdir('tests/')
                       if os.path.isdir(f'tests/{folder}') and os.path.isdir(f'tests/{folder}/expected')]

    parser_test = subparsers.add_parser('run_test', help='diacritize a test set')
    parser_test.add_argument('--test_set', choices=available_tests, help='choose test set', default='tests/new')
    parser_test.add_argument('--system', choices=test_systems, help='diacritization system to use', default='Nakdimon')
    parser_test.add_argument('--model', help='path to model (.h5 file)', default='models/Nakdimon.h5', dest='model_path')
    parser_test.add_argument('--skip-existing', action='store_true', help='skip existing files')
    parser_test.set_defaults(func=do_run_test)

    parser_predict = subparsers.add_parser('predict', help='diacritize a text file')
    parser_predict.add_argument('input_path', help='input file')
    parser_predict.add_argument('output_path', help='output file')
    parser_predict.set_defaults(func=do_predict)

    # parser_daemon = subparsers.add_parser('server', help='run Nakdimon server as a daemon')
    # parser_daemon.set_defaults(func=do_server)

    parser_eval = subparsers.add_parser('results', help='evaluate the results of a test run')
    parser_eval.add_argument('--test_set', choices=available_tests, help='choose test set', default='tests/new')
    # Peek at the arguments first, so the choices for --systems reflect the chosen test set:
    partial_result, _ = parser.parse_known_args()
    if partial_result.command == 'results':
        test_systems = [folder for folder in os.listdir(partial_result.test_set)
                        if os.path.isdir(f'{partial_result.test_set}/{folder}') and folder != 'expected']
    parser_eval.add_argument('--systems', choices=test_systems, nargs='+', help='list of systems to evaluate',
                             default=test_systems)
    parser_eval.set_defaults(func=do_metrics)

    args = parser.parse_args()

    if args.quiet:
        logging.disable(logging.INFO)
    del args.quiet

    # Dispatch to the selected subcommand with the remaining arguments:
    kwargs = vars(args).copy()
    del kwargs['command']
    del kwargs['func']
    args.func(**kwargs)

    sys.exit(0)
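The new entry point makes each workflow step a subcommand; a few illustrative invocations built from the flags defined above (run from the repository root, since `tests/` is scanned at startup, and note the top-level `--quiet` flag precedes the subcommand):

```
# train with Weights & Biases logging enabled
$ python nakdimon train --model=models/Nakdimon.h5 --wandb

# diacritize the paper's test set, skipping files that already have results
$ python nakdimon --quiet run_test --test_set=tests/new --skip-existing

# evaluate every system folder found under tests/new
$ python nakdimon results --test_set=tests/new
```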

nakdimon/ablations.py (+3 -15)

@@ -1,8 +1,6 @@
-import keras
 import tensorflow as tf
 
-import train
-from train import TrainingParams, train_ablation
+from train import TrainingParams
 import schedulers
 
 
@@ -97,7 +95,7 @@ def epoch_params(self, data):
 class Chunk(TrainingParams):
     def __init__(self, maxlen):
         super().__init__()
-        self.maxlen = maxlen
+        self.maxlen = int(maxlen)
 
     @property
     def name(self):
@@ -107,7 +105,7 @@ def name(self):
 class Batch(TrainingParams):
     def __init__(self, batch_size):
         super().__init__()
-        self.batch_size = batch_size
+        self.batch_size = int(batch_size)
 
     @property
     def name(self):
@@ -211,13 +209,3 @@ def epoch_params(self, data):
         yield ('automatic', len(lrs1), tf.keras.callbacks.LearningRateScheduler(lambda epoch, lr: lrs1[epoch-1-1]))
         lrs2 = [10e-4, 10e-4, 3e-4]
         yield ('modern', len(lrs2), tf.keras.callbacks.LearningRateScheduler(lambda epoch, lr: lrs2[epoch-len(lrs1)-1-1]))
-
-
-if __name__ == '__main__':
-    # units = 400
-    print(train.Full().build_model().count_params())
-    # for cls in [train.TwoLevelLSTM]:
-    #     for i in range(1):
-    #         print(cls(units).build_model().count_params())
-    #         train_ablation(cls(units), group=f"{cls.__name__}:2022")
-

nakdimon/app.py (deleted, -27)

This file was deleted.

nakdimon/dataset.py (-5)

@@ -3,8 +3,6 @@
 import random
 import numpy as np
 
-from cachier import cachier
-
 import hebrew
 import utils
 
@@ -128,7 +126,6 @@ def read_corpora(base_paths) -> tuple[tuple[str, list[hebrew.HebrewItem]], ...]:
     return tuple([(filename, list(hebrew.iterate_file(filename))) for filename in utils.iterate_files(base_paths)])
 
 
-@cachier()
 def load_data(base_paths, maxlen: int) -> Data:
     corpora = read_corpora(base_paths)
     corpus = [(filename, Data.from_text(heb_items, maxlen)) for (filename, heb_items) in corpora]
@@ -145,5 +142,3 @@ def load_data(base_paths, maxlen: int) -> Data:
     # print(res)
     print_tables()
     print(letters_table.to_ids(["שלום"]))
-
-# load_data.clear_cache()
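With the `@cachier()` decorator removed, `load_data` no longer reads or writes `.cachier` caches, which is why both ignore files now list them; a hedged cleanup sketch for stale caches left by earlier runs (the in-tree location is an assumption, since cachier can also cache under the home directory):

```
# hypothetical cleanup of leftover cachier caches in the working tree
$ find . -type d -name ".cachier" -prune -exec rm -rf {} +
```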
Binary file not shown (-24.3 KB).
