
Commit

Refactor to remove requirement that the file parsed is seekable.
EmilStenstrom committed Oct 13, 2024
1 parent e2ef74a commit a76d996
Showing 3 changed files with 21 additions and 58 deletions.
26 changes: 19 additions & 7 deletions conllu/__init__.py
@@ -3,13 +3,13 @@
 
 from conllu.models import Metadata, SentenceGenerator, SentenceList, Token, TokenList, TokenTree
 from conllu.parser import (
-    _FieldParserType, _MetadataParserType, parse_conllu_plus_fields, parse_sentences, parse_token_and_metadata,
+    DEFAULT_FIELDS, _FieldParserType, _MetadataParserType, parse_sentences, parse_token_and_metadata,
 )
 
 __all__ = [
     "parse", "parse_incr", "parse_tree", "parse_tree_incr",
     "SentenceGenerator", "SentenceList", "TokenList", "TokenTree", "Token", "Metadata",
-    "parse_conllu_plus_fields", "parse_sentences", "parse_token_and_metadata",
+    "parse_sentences", "parse_token_and_metadata",
 ]
 
 def parse(data: str, fields: T.Optional[T.Sequence[str]] = None,
@@ -31,14 +31,26 @@ def parse_incr(in_file: T.TextIO, fields: T.Optional[T.Sequence[str]] = None,
     if not hasattr(in_file, 'read'):
         raise FileNotFoundError("Invalid file, 'parse_incr' needs an opened file as input")
 
-    if not fields:
-        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)
-
     def generator():
+        global_columns = None
+
         for sentence in parse_sentences(in_file):
+            lines = sentence.strip().split('\n')
+            current_metadata = [line for line in lines if line.startswith('#')]
+            current_sentence = [line for line in lines if not line.startswith('#')]
+
+            if any(line.startswith('# global.columns = ') for line in current_metadata):
+                global_columns = next(
+                    line.split('=', 1)[1].strip().split()
+                    for line in current_metadata if line.startswith('# global.columns = ')
+                )
+
+            used_fields = global_columns if global_columns else (fields if fields else DEFAULT_FIELDS)
+            used_fields = [field.lower() for field in used_fields]
+
             yield parse_token_and_metadata(
-                sentence,
-                fields=fields,
+                '\n'.join(current_metadata + current_sentence),
+                fields=used_fields,
                 field_parsers=field_parsers,
                 metadata_parsers=metadata_parsers
             )
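
The practical effect: parse_incr no longer calls tell() or seek() on its input, so forward-only streams (pipes, sockets, stdin) can be parsed incrementally. A minimal usage sketch follows; it is not part of the commit, and NonSeekableReader is a hypothetical stand-in for such a stream:

from conllu import parse_incr

DATA = (
    "# global.columns = ID FORM UPOS HEAD DEPREL MISC PARSEME:MWE\n"
    "1\tDer\tDET\t2\tdet\t_\t*\n"
    "\n"
)

class NonSeekableReader:
    """Minimal forward-only file-like object: supports read() and iteration, but no seek()/tell()."""
    def __init__(self, text):
        self._lines = iter(text.splitlines(keepends=True))

    def read(self, size=-1):
        # Only present so parse_incr's hasattr(in_file, 'read') check passes.
        return "".join(self._lines)

    def __iter__(self):
        return self._lines

for tokenlist in parse_incr(NonSeekableReader(DATA)):
    # Fields come from the in-stream "# global.columns" comment.
    print(tokenlist[0]["form"], tokenlist[0]["parseme:mwe"])  # Der *

Before this change, the same call would have gone through parse_conllu_plus_fields, which peeks at the first sentence and then rewinds the stream.
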
27 changes: 0 additions & 27 deletions conllu/parser.py
@@ -22,33 +22,6 @@
     "newdoc": lambda key, value: (key, value),
 }
 
-def parse_conllu_plus_fields(in_file: T.TextIO,
-                             metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
-                             ) -> T.Optional[T.Sequence[str]]:
-    pos = in_file.tell()
-
-    # Get first line
-    try:
-        first_sentence = next(parse_sentences(in_file))
-        first_line = first_sentence.split("\n")[0]
-    except StopIteration:
-        first_line = ""
-
-    # parse_sentences moves to file cursor, so reset it here
-    in_file.seek(pos)
-
-    if not first_line.startswith("#"):
-        return None
-
-    tokenlist = parse_token_and_metadata(first_line, metadata_parsers=metadata_parsers)
-    metadata = tokenlist.metadata
-
-    fields = None
-    if "global.columns" in metadata and metadata["global.columns"]:
-        fields = [value.lower() for value in metadata["global.columns"].split(" ")]
-
-    return fields
-
 def parse_sentences(in_file: T.TextIO) -> T.Iterator[str]:
     buf: T.List[str] = []
     for line in in_file:
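
The deleted helper is what imposed the seekability requirement: it consumed the first sentence via next(parse_sentences(in_file)) and then rewound with in_file.tell()/in_file.seek(pos). A quick illustration, not from the repository, of why that pattern breaks on a pipe:

import os

read_fd, write_fd = os.pipe()
os.write(write_fd, b"# global.columns = ID FORM\n1\tDer\n\n")
os.close(write_fd)

pipe_reader = os.fdopen(read_fd, "r")   # text-mode reader over a pipe
print(pipe_reader.seekable())           # False

try:
    pipe_reader.tell()                  # the first thing the removed function did
except OSError as exc:                  # io.UnsupportedOperation subclasses OSError
    print("tell() failed:", exc)

With the "# global.columns" handling now done sentence-by-sentence inside parse_incr's generator, no rewind is needed.
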
26 changes: 2 additions & 24 deletions tests/test_parser.py
@@ -4,34 +4,12 @@
 
 from conllu.models import Token, TokenList
 from conllu.parser import (
-    DEFAULT_FIELDS, ParseException, head_to_token, parse_comment_line, parse_conllu_plus_fields, parse_dict_value,
-    parse_id_value, parse_int_value, parse_line, parse_nullable_value, parse_paired_list_value, parse_sentences,
+    DEFAULT_FIELDS, ParseException, head_to_token, parse_comment_line, parse_dict_value, parse_id_value,
+    parse_int_value, parse_line, parse_nullable_value, parse_paired_list_value, parse_sentences,
     parse_token_and_metadata, serialize, serialize_field,
 )
 
 
-class TestParseConlluPlusFields(unittest.TestCase):
-    def test_empty(self):
-        self.assertEqual(parse_conllu_plus_fields(StringIO("")), None)
-        self.assertEqual(parse_conllu_plus_fields(StringIO(None)), None)
-
-    def test_simple(self):
-        data = dedent("""\
-            # global.columns = ID FORM UPOS HEAD DEPREL MISC PARSEME:MWE
-            1\tDer\tDET\t2\tdet\t_\t*
-        """)
-        self.assertEqual(
-            parse_conllu_plus_fields(StringIO(data)),
-            ["id", "form", "upos", "head", "deprel", "misc", "parseme:mwe"]
-        )
-
-    def test_empty_columns(self):
-        data = dedent("""\
-            # global.columns =
-            1\tDer\tDET\t2\tdet\t_\t*
-        """)
-        self.assertEqual(parse_conllu_plus_fields(StringIO(data)), None)
-
 class TestParseSentencesGenerator(unittest.TestCase):
     def test_empty(self):
         self.assertEqual(list(parse_sentences(StringIO(""))), [])
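
The TestParseConlluPlusFields cases are dropped along with the helper. A hypothetical replacement test, not part of this commit, that exercises the same behaviour (an in-stream "# global.columns" comment selecting the fields) through the public parse_incr API could look like this:

import unittest
from io import StringIO
from textwrap import dedent

from conllu import parse_incr


class TestParseIncrGlobalColumns(unittest.TestCase):
    def test_global_columns_select_fields(self):
        data = dedent("""\
            # global.columns = ID FORM UPOS HEAD DEPREL MISC PARSEME:MWE
            1\tDer\tDET\t2\tdet\t_\t*
        """)
        sentences = list(parse_incr(StringIO(data)))

        self.assertEqual(len(sentences), 1)
        self.assertEqual(sentences[0][0]["form"], "Der")
        self.assertEqual(sentences[0][0]["parseme:mwe"], "*")


if __name__ == "__main__":
    unittest.main()
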
