Add test for TPST and support jsonl format

wq2012 · wq2012 · commit 4090f82c1419 · 2024-02-06T17:04:29.000-05:00
diff --git a/DiarizationLM/README.md b/DiarizationLM/README.md
@@ -75,16 +75,17 @@ We support 3 different output file formats:
 | Format | Description |
 | ------ | ----------- |
 | `tfrecord` | The [TFRecord format](https://www.tensorflow.org/tutorials/load_data/tfrecord) can be used by various machine learning libraries.|
-| `csv` | This format can be used by [OpenAI API](https://platform.openai.com/docs/api-reference/) for finetuning GPT models. OpenAI will usually convert these csv files to jsonl files.|
 | `json` | This format is more human readable and can be used for debugging. It's also useful for finetuning PaLM models via the [Google Cloud API](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-text-models-supervised#text).|
+| `csv` | This format can be used by many existing tools. OpenAI also provides a tool to convert csv files to jsonl files.|
+| `jsonl` | This format can be directly used by the [OpenAI API](https://platform.openai.com/docs/api-reference/) for finetuning GPT models.|
 
 Example command:
 
 ```bash
 python3 train_data_prep.py \
 --input="testdata/example_data.json" \
---output="/tmp/example_data.csv" \
---output_type=csv \
+--output="/tmp/example_data.jsonl" \
+--output_type=jsonl \
 --emit_input_length=1000 \
 --emit_target_length=1000 \
 --prompt_suffix=" --> " \
diff --git a/DiarizationLM/run_tests.sh b/DiarizationLM/run_tests.sh
diff --git a/DiarizationLM/run_tools.sh b/DiarizationLM/run_tools.sh
@@ -14,6 +14,14 @@ python3 train_data_prep.py \
 --output=/tmp/example_data.tfrecord \
 --output_type=tfrecord
 
+python3 train_data_prep.py \
+--input=testdata/example_data.json \
+--output=/tmp/example_data.jsonl \
+--input_feature_key=prompt \
+--output_feature_key=completion \
+--completion_suffix=" [eod]" \
+--output_type=jsonl
+
 python3 postprocess_completions.py \
 --input=testdata/example_completion_with_bad_completion.json \
 --output=/tmp/example_postprocessed.json
diff --git a/DiarizationLM/train_data_prep.py b/DiarizationLM/train_data_prep.py
@@ -14,7 +14,7 @@
 flags.DEFINE_enum(
     "output_type",
     "tfrecord",
-    ["tfrecord", "json", "csv"],
+    ["tfrecord", "json", "csv", "jsonl"],
     "Output container formats for different use cases.",
 )
 flags.DEFINE_string("text_field", "hyp_text", "Name of field to get text")
@@ -100,6 +100,13 @@ def main(argv: Sequence[str]) -> None:
       csv_lines.append('"{}","{}"'.format(prompt, target))
     with open(FLAGS.output, "wt") as f:
       f.write("\n".join(csv_lines))
+  elif FLAGS.output_type == "jsonl":
+    import json as _json  # dumps() escapes the text; alias avoids shadowing.
+    json_lines = [
+        _json.dumps({FLAGS.input_feature_key: p, FLAGS.output_feature_key: t})
+        for _, p, t in reader.generate_data_tuple()]
+    with open(FLAGS.output, "wt") as f:
+      f.write("\n".join(json_lines))
 
   print("Output has been written to:", FLAGS.output)
 
diff --git a/DiarizationLM/utils_test.py b/DiarizationLM/utils_test.py
@@ -23,6 +23,16 @@ def test_get_oracle_speakers(self):
     expected = [1, 1, 1, 1, 2, 2, 2, 2]
     self.assertEqual(expected, hyp_spk_oracle)
 
+  def test_transcript_preserving_speaker_transfer(self):
+    """Checks the speaker string returned for a fixed source/target pair."""
+    transferred_spk = utils.transcript_preserving_speaker_transfer(
+        "hello good morning hi how are you pretty good",  # Source words.
+        "1 1 1 2 2 2 2 1 1",  # Source speakers, one label per source word.
+        "hello morning hi hey are you be good",  # Target words.
+        "1 2 2 2 1 1 2 1",  # Target speakers, one label per target word.
+    )
+    self.assertEqual("1 1 2 2 2 2 1 1", transferred_spk)
+
   def test_ref_to_oracle(self):
     test_data = {
         "hyp_text": "yo hello hi wow great",