Skip to content

Commit

Permalink
Add scripts to convert raw file to text-only json
Browse files Browse the repository at this point in the history
Raw text file format (line by line):
    """
    <text_1>
    <text_2>
    ...
    <text_n>
    """
  • Loading branch information
research4pan committed Sep 6, 2023
1 parent bc569db commit 4dd031b
Showing 1 changed file with 81 additions and 0 deletions.
81 changes: 81 additions & 0 deletions scripts/data_preprocess/raw2textonly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
"""
Converts a raw text file, separated by lines, into a "text-only" formatted json.
"""
from __future__ import absolute_import

import argparse
import json
import textwrap
import sys

def parse_argument(sys_argv):
"""Parses arguments from command line.
Args:
sys_argv: the list of arguments (strings) from command line.
Returns:
A struct whose member corresponds to the required (optional) variable.
For example,
```
args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10'])
args.input # 'a.txt'
args.num # 10
```
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter)

# Training parameters
parser.add_argument(
"--dataset_path", type=str,
default=None,
help=textwrap.dedent("input dataset path, reads from stdin by default")
)
parser.add_argument(
"--output_path", type=str,
default=None,
help=textwrap.dedent("output dataset path, writes to stdout by default")
)

# Parses from commandline
args = parser.parse_args(sys_argv[1:])

return args


def raw2textonly(fin):
"""
Converts raw text to text-only format.
Args:
fin: the input file description of the raw text file.
Returns:
a dict with "text-only" format.
"""
data_dict = {
"type": "text_only",
"instances": [ { "text": line.strip() } for line in fin ],
}
return data_dict


def main():
args = parse_argument(sys.argv)

if args.dataset_path is not None:
with open(args.dataset_path, "r") as fin:
data_dict = raw2textonly(fin)
else:
data_dict = raw2textonly(sys.stdin)

if args.output_path is not None:
with open(args.output_path, "w") as fout:
json.dump(data_dict, fout, indent=4, ensure_ascii=False)
else:
json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False)


if __name__ == "__main__":
main()

0 comments on commit 4dd031b

Please sign in to comment.