From 836ed45821cbdf29fdaae3e957db7d76547d1c54 Mon Sep 17 00:00:00 2001 From: KOLANICH Date: Tue, 5 May 2020 19:58:06 +0300 Subject: [PATCH] Initial commit --- .editorconfig | 15 + .github/.templateMarker | 1 + .github/dependabot.yml | 8 + .github/workflows/CI.yml | 15 + .gitignore | 14 + .gitlab-ci.yml | 55 ++++ Code_Of_Conduct.md | 1 + MANIFEST.in | 6 + ReadMe.md | 62 +++++ UNLICENSE | 24 ++ UniGrammarRuntime/DSLMetadata.py | 13 + UniGrammarRuntime/FormatMetadata.py | 20 ++ UniGrammarRuntime/IParser.py | 125 +++++++++ UniGrammarRuntime/IParsingBackend.py | 119 ++++++++ UniGrammarRuntime/IWrapper.py | 34 +++ UniGrammarRuntime/ParserBundle.py | 183 ++++++++++++ UniGrammarRuntime/ToolMetadata.py | 22 ++ UniGrammarRuntime/__init__.py | 0 UniGrammarRuntime/backends/__init__.py | 2 + .../backends/multilanguage/CoCoR.py | 46 +++ .../backends/multilanguage/__init__.py | 0 .../backends/multilanguage/antlr4.py | 116 ++++++++ .../backends/multilanguage/waxeye.py | 181 ++++++++++++ UniGrammarRuntime/backends/python/PyDSL.py | 98 +++++++ UniGrammarRuntime/backends/python/TatSu.py | 147 ++++++++++ UniGrammarRuntime/backends/python/__init__.py | 0 UniGrammarRuntime/backends/python/arpeggio.py | 148 ++++++++++ UniGrammarRuntime/backends/python/lark.py | 101 +++++++ UniGrammarRuntime/backends/python/parglare.py | 101 +++++++ .../backends/python/parsimonious.py | 149 ++++++++++ .../backends/regExps/__init__.py | 0 UniGrammarRuntime/backends/regExps/python.py | 89 ++++++ UniGrammarRuntime/backends/rust/pest.py | 41 +++ UniGrammarRuntime/benchmark.py | 263 ++++++++++++++++++ UniGrammarRuntime/dslsMetadata.py | 11 + UniGrammarRuntime/grammarClasses.py | 81 ++++++ UniGrammarRuntime/py.typed | 0 UniGrammarRuntime/utils/__init__.py | 85 ++++++ pyproject.toml | 42 +++ 39 files changed, 2418 insertions(+) create mode 100644 .editorconfig create mode 100644 .github/.templateMarker create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/CI.yml create mode 100644 .gitignore create mode 100644 .gitlab-ci.yml create mode 100644 Code_Of_Conduct.md create mode 100644 MANIFEST.in create mode 100644 ReadMe.md create mode 100644 UNLICENSE create mode 100644 UniGrammarRuntime/DSLMetadata.py create mode 100644 UniGrammarRuntime/FormatMetadata.py create mode 100644 UniGrammarRuntime/IParser.py create mode 100644 UniGrammarRuntime/IParsingBackend.py create mode 100644 UniGrammarRuntime/IWrapper.py create mode 100644 UniGrammarRuntime/ParserBundle.py create mode 100644 UniGrammarRuntime/ToolMetadata.py create mode 100644 UniGrammarRuntime/__init__.py create mode 100644 UniGrammarRuntime/backends/__init__.py create mode 100644 UniGrammarRuntime/backends/multilanguage/CoCoR.py create mode 100644 UniGrammarRuntime/backends/multilanguage/__init__.py create mode 100644 UniGrammarRuntime/backends/multilanguage/antlr4.py create mode 100644 UniGrammarRuntime/backends/multilanguage/waxeye.py create mode 100644 UniGrammarRuntime/backends/python/PyDSL.py create mode 100644 UniGrammarRuntime/backends/python/TatSu.py create mode 100644 UniGrammarRuntime/backends/python/__init__.py create mode 100644 UniGrammarRuntime/backends/python/arpeggio.py create mode 100644 UniGrammarRuntime/backends/python/lark.py create mode 100644 UniGrammarRuntime/backends/python/parglare.py create mode 100644 UniGrammarRuntime/backends/python/parsimonious.py create mode 100644 UniGrammarRuntime/backends/regExps/__init__.py create mode 100644 UniGrammarRuntime/backends/regExps/python.py create mode 100644 
UniGrammarRuntime/backends/rust/pest.py create mode 100644 UniGrammarRuntime/benchmark.py create mode 100644 UniGrammarRuntime/dslsMetadata.py create mode 100644 UniGrammarRuntime/grammarClasses.py create mode 100644 UniGrammarRuntime/py.typed create mode 100644 UniGrammarRuntime/utils/__init__.py create mode 100644 pyproject.toml diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..62d9a3f --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +root = true + +[*] +charset = utf-8 +indent_style = tab +indent_size = 4 +insert_final_newline = true +end_of_line = lf + +[*.{yml,yaml,yug}] +indent_style = space +indent_size = 2 + +[grammars/*.txt] +insert_final_newline = false diff --git a/.github/.templateMarker b/.github/.templateMarker new file mode 100644 index 0000000..5e3a3e0 --- /dev/null +++ b/.github/.templateMarker @@ -0,0 +1 @@ +KOLANICH/python_project_boilerplate.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..89ff339 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + allow: + - dependency-type: "all" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..7fe33b3 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,15 @@ +name: CI +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - name: typical python workflow + uses: KOLANICH-GHActions/typical-python-workflow@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b8fd4e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__ +*.pyc +*.pyo +*.pgt +*.dot +/*.egg-info +/build +/dist +/.eggs +/tests/grammars +monkeytype.sqlite3 +*.srctrlprj +*.srctrldb +*.srctrlbm diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..e84ca87 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,55 @@ +image: registry.gitlab.com/kolanich-subgroups/docker-images/fixed_python:latest + +variables: + DOCKER_DRIVER: overlay2 + SAST_ANALYZER_IMAGE_TAG: latest + SAST_DISABLE_DIND: "true" + SAST_CONFIDENCE_LEVEL: 5 + CODECLIMATE_VERSION: latest + +include: + - template: SAST.gitlab-ci.yml + - template: Code-Quality.gitlab-ci.yml + +.build: + tags: + - shared + - linux + stage: build + interruptible: true + variables: + GIT_DEPTH: "1" + PYTHONUSERBASE: ${CI_PROJECT_DIR}/python_user_packages + + before_script: + - export PATH="$PATH:$PYTHONUSERBASE/bin" # don't move into `variables` + #- git clone --depth=1 --filter=sparse:path=src/python https://github.com/waxeye-org/waxeye.git + - git clone --depth=1 https://github.com/waxeye-org/waxeye.git + - cd ./waxeye/src/python + - python3 ./setup.py bdist_wheel + - pip3 install --upgrade ./dist/*.whl + - cd ../../../ + + cache: + paths: + - /usr/local/site-packages + - /usr/local/lib/python*/site-packages + + script: + - python3 setup.py bdist_wheel + - pip3 install --user --upgrade ./dist/*.whl + - cd ./tests + #- coverage run -a --branch --source=UniGrammar -m pytest --junitxml=./rspec.xml --forked ./test*.py + #- coverage report -m || true + #- coveralls || true + #- codecov || true + #- cd .. 
+ - mkdir wheels + - mv ./dist/*.whl ./wheels/UniGrammarRuntime-0.CI-py3-none-any.whl + + artifacts: + paths: + - wheels + - $PYTHONUSERBASE + reports: + junit: ./rspec.xml diff --git a/Code_Of_Conduct.md b/Code_Of_Conduct.md new file mode 100644 index 0000000..bcaa2bf --- /dev/null +++ b/Code_Of_Conduct.md @@ -0,0 +1 @@ +No codes of conduct! \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..602af25 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +include UNLICENSE +include *.md +include tests +global-include .editorconfig +global-include *.pgt +global-include *.pglr diff --git a/ReadMe.md b/ReadMe.md new file mode 100644 index 0000000..831f32e --- /dev/null +++ b/ReadMe.md @@ -0,0 +1,62 @@ +UniGrammarRuntime.py [![Unlicensed work](https://raw.githubusercontent.com/unlicense/unlicense.org/master/static/favicon.png)](https://unlicense.org/) +=================== +~~![GitLab Build Status](https://gitlab.com/UniGrammar/UniGrammarRuntime.py/badges/master/pipeline.svg)~~ +~~![GitLab Coverage](https://gitlab.com/UniGrammar/UniGrammarRuntime.py/badges/master/coverage.svg)~~ +[![Libraries.io Status](https://img.shields.io/librariesio/github/UniGrammar/UniGrammarRuntime.py.svg)](https://libraries.io/github/UniGrammar/UniGrammarRuntime.py) +[![Code style: antiflash](https://img.shields.io/badge/code%20style-antiflash-FFF.svg)](https://codeberg.org/KOLANICH-tools/antiflash.py) + +Runtime for UniGrammar-generated wrappers around generated parsers. Generated parsers can be used without wrappers, but wrappers allow using them uniformly, swapping the implementation while keeping the interface. + +This allows you to +* get rid of hard dependencies on specific libraries: instead, any supported parser library for which a parser has been generated can be used; +* benchmark and compare the performance of various parsing libraries; +* use the most performant of the available libraries. + + +How-to use +----------- + +* Generate (or construct manually) a `parser bundle`. A parser bundle is an object storing and giving out + * pregenerated parsers for different backends (can be generated separately using `transpile`) + * auxiliary information (can be generated using `gen-aux`): + * mappings from production names to capture groups, for parser generators not supporting capturing; + * mappings from production names to booleans telling whether an AST node is a collection, for parser generators that cannot tell the difference between an iterable and a plain node in the AST; + * benchmark results + * a wrapper, transforming the backend-specific AST into a backend-agnostic one + A parser bundle can be constructed from a dir on storage or compiled directly into an object in memory. Either way, it can be used by a backend. + +* Construct a backend. A backend here is an object + * storing the underlying parser objects + * providing the functions needed by a wrapper to transform the backend-specific AST into a backend-agnostic one. + +There are 2 ways to construct a backend: + * You can import the backend manually: `from UniGrammarRuntime.backends.<group> import <backend module>` and construct it: `b = <BackendClass>(<grammarResources>)`. + * Or you can just call a method of the bundle, constructing the needed backend. Pass `None` to select the backend automatically based on benchmarking results. + +* Now you can do low-level stuff using backend methods (a sketch follows this list): + * You can parse text into the backend-native AST format using the `b.parse("<your text>")` method. + * You can preprocess the AST generated by `parse` and observe the result, using `preprocessAST`. + * You can check if preprocessed AST nodes represent a collection using `isList` and iterate over them using `iterateList`. + * You can transform terminal nodes into `str`s using `getTextFromToken`. + * You can merge subtrees into a single `str` using `mergeShit`.
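+
+For example, a minimal sketch of the low-level flow (the bundle dir `./parsers` and the grammar name `myGrammar` are made-up; the methods are the backend API defined in this repo):
+
+```python
+from pathlib import Path
+from UniGrammarRuntime.ParserBundle import ParserBundle
+
+bundle = ParserBundle(Path("./parsers"))  # a dir with `compiled/`, `schemas/`, `metrics/`, `wrappers/`
+grammar = bundle.grammars["myGrammar"]
+b = grammar.getBackend(None)  # `None` picks the fastest backend from stored benchmark results
+
+rawAST = b.parse("<your text>")
+ast = b.preprocessAST(rawAST)
+print(b.getSubTreeText(ast))  # merges a subtree of text tokens back into a single `str`
+```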
+ +This all can be useful if you + * don't want to use a generated wrapper + * are designing a new Template, so you need the generator to emit custom postprocessing; to do that, you first need to craft it manually + * are debugging + * are just playing around + +* Now we go a level higher. You can use a wrapper to get a prettified backend-agnostic postprocessed AST. + * Import the generated wrapper module: + * manually: `import <wrapper module>` + * via a backend. + * The module contains some classes. The class you usually need is aliased to `__MAIN_PARSER__`. + * Construct the wrapper, initializing it with the backend: `w = <wrapper module>.__MAIN_PARSER__(b)` + * Parse what you need: `ast = w("<your text>")` + +Examples +-------- + +* https://codeberg.org/prebuilder/pyMetakitDefinitionString/src/branch/master/pyMetakitDefinitionString/__init__.py +* https://codeberg.org/KOLANICH-libs/FullingMotorModelDecoder.py/src/branch/master/FullingMotorModelDecoder/__init__.py +* https://codeberg.org/KOLANICH-libs/AptSourcesList.py/src/branch/master/AptSourcesList/__init__.py diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..efb9808 --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <https://unlicense.org/> diff --git a/UniGrammarRuntime/DSLMetadata.py b/UniGrammarRuntime/DSLMetadata.py new file mode 100644 index 0000000..f128560 --- /dev/null +++ b/UniGrammarRuntime/DSLMetadata.py @@ -0,0 +1,13 @@ +import typing +import warnings + +from .FormatMetadata import FormatMetadata +from .ToolMetadata import Product + + +class DSLMetadata(FormatMetadata): + __slots__ = ("officialLibraryRepo",) + + def __init__(self, officialLibraryRepo: typing.Optional[str] = None, grammarExtensions: typing.Optional[typing.Union[typing.Tuple[str, str], typing.Tuple[str]]] = None, product: typing.Optional[Product] = None) -> None: + super().__init__(grammarExtensions, product) + self.officialLibraryRepo = officialLibraryRepo diff --git a/UniGrammarRuntime/FormatMetadata.py b/UniGrammarRuntime/FormatMetadata.py new file mode 100644 index 0000000..49ed330 --- /dev/null +++ b/UniGrammarRuntime/FormatMetadata.py @@ -0,0 +1,20 @@ +import typing +import warnings + +from UniGrammarRuntime.ToolMetadata import Product + + +class FormatMetadata: + __slots__ = ("product", "grammarExtensions") + + def __init__(self, grammarExtensions: typing.Optional[typing.Union[typing.Tuple[str, str], typing.Tuple[str]]] = None, product: typing.Optional[Product] = None) -> None: + self.product = product + self.grammarExtensions = grammarExtensions + + @property + def mainExtension(self): + if self.grammarExtensions: + return self.grammarExtensions[0] + else: + warnings.warn(self.product.name + " has no well-known extension for grammar files. Using DSL name (" + self.product.name + ") instead of the extension.") + return self.product.name
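+
+# Illustrative sketch (not part of this module's API surface; the `Product` values are made-up):
+# `mainExtension` returns the first registered extension, or falls back to the product name with a warning.
+#
+#   FormatMetadata(grammarExtensions=("peg",), product=Product("arpeggio", "https://github.com/textX/Arpeggio")).mainExtension  # -> "peg"
+#   FormatMetadata(grammarExtensions=None, product=Product("sometool", "https://example.org")).mainExtension  # warns and returns "sometool"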
diff --git a/UniGrammarRuntime/IParser.py b/UniGrammarRuntime/IParser.py new file mode 100644 index 0000000..af705ab --- /dev/null +++ b/UniGrammarRuntime/IParser.py @@ -0,0 +1,125 @@ +import typing +from abc import abstractmethod, ABCMeta +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser +from UniGrammarRuntimeCore.IParser import IParserFactory as IParserFactoryCore +from UniGrammarRuntimeCore.IParser import IParserFactoryFromPrecompiled as IParserFactoryFromPrecompiledCore +from UniGrammarRuntimeCore.IParser import IParserFactoryFromSource as IParserFactoryFromSourceCore + +from .FormatMetadata import FormatMetadata +from .ToolMetadata import Product + +# pylint:disable=too-few-public-methods + + +class IParserFactoryMeta(ABCMeta): + __slots__ = () + + def __new__(cls: typing.Type["IParserFactoryCore"], className: str, parents: typing.Tuple[typing.Type, ...], attrs: typing.Dict[str, typing.Any]) -> "Tool": # pylint:disable=arguments-differ + + FORMAT = attrs.get("FORMAT", None) + META = attrs.get("META", None) + if FORMAT is not None and META is not None: + if FORMAT.product is None: + FORMAT.product = META.product + + return super().__new__(cls, className, parents, attrs) + + +class IParserFactory(IParserFactoryCore, metaclass=IParserFactoryMeta): + __slots__ = () + + FORMAT = None # type: FormatMetadata + + @abstractmethod + def fromBundle(self, grammarResources: "InMemoryGrammarResources"): + """Creates an executor from the files within the bundle""" + raise NotImplementedError + + +class IParserFactoryFromSource(IParserFactoryFromSourceCore, metaclass=IParserFactoryMeta): # pylint:disable=abstract-method + __slots__ = () + + FORMAT = None # type: FormatMetadata + + def fromBundle(self, grammarResources: "InMemoryGrammarResources") -> IParser: + return self.fromInternal(self.getSource(grammarResources)) # since they cannot be precompiled, for them the internal representation is the source text
+ + @classmethod + def _getExt(cls): + if cls.FORMAT is not None: + return cls.FORMAT.mainExtension + else: + return cls.META.product.name + + def getSource(self, grammarResources: "InMemoryGrammarResources") -> str: + """Must return the source code of the grammar in its DSL""" + return grammarResources.parent.backendsTextData[self.__class__.META.product.name, grammarResources.name + "." + self.__class__._getExt()] + + +class IParserFactoryFromPrecompiled(IParserFactoryFromPrecompiledCore): # pylint:disable=abstract-method + __slots__ = () + + FORMAT = FormatMetadata( + grammarExtensions=("py",), + product=Product( + name="python", + website="https://docs.python.org/3/tutorial/index.html", + ), + ) + + def fromBundle(self, grammarResources: "InMemoryGrammarResources") -> IParser: + ctor = self.compile(self.getSource(grammarResources), grammarResources.name) + return self.fromInternal(ctor()) + + def getSource(self, grammarResources: "InMemoryGrammarResources") -> "ast.Module": + """Must return the Python AST of the precompiled parser module""" + return grammarResources.parent.backendsPythonAST[self.__class__.META.product.name, grammarResources.name] + + +class IParserFactoryFromPrecompiledOrSource(IParserFactoryFromSourceCore): + """Hybrid between `IParserFactoryFromPrecompiled` and `IParserFactoryFromSource`: + tries to find and use a precompiled parser first; + if there is none, falls back to the source + """ + + PRECOMPILED = None + SOURCE = None + + __slots__ = ("_precompiled", "_source") + + def __init__(self): + self._precompiled = None + self._source = None + super().__init__() + + @property + def precompiled(self) -> IParserFactoryFromPrecompiled: + res = self._precompiled + if res is None: + self._precompiled = res = self.__class__.PRECOMPILED() + return res + + @property + def source(self) -> IParserFactoryFromSource: + res = self._source + if res is None: + self._source = res = self.__class__.SOURCE() + return res + + def fromBundle(self, grammarResources: "InMemoryGrammarResources"): + """Tries to find and use a precompiled parser first; + if there is none, falls back to the source""" + try: + return self.precompiled.fromBundle(grammarResources) + except FileNotFoundError: + return self.source.fromBundle(grammarResources) + + def compileStr(self, grammarText: str, target: typing.Any = None, fileName: typing.Optional[typing.Union[Path, str]] = None): + """Proxies to the factory defined by `SOURCE`""" + return self.source.compileStr(grammarText, target, fileName) + + def compileFile(self, grammarFile: Path, target: typing.Any = None): + """Proxies to the factory defined by `SOURCE`""" + return self.source.compileFile(grammarFile, target)
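+
+# Illustrative sketch (hypothetical subclasses; mirrors how the concrete backends in this repo wire it up):
+#
+#   class MyToolParserFactory(IParserFactoryFromPrecompiledOrSource):
+#       PRECOMPILED = MyToolParserFactoryFromPrecompiled
+#       SOURCE = MyToolParserFactoryFromSource
+#
+#   parser = MyToolParserFactory().fromBundle(grammarResources)  # precompiled if present, else compiled from source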
diff --git a/UniGrammarRuntime/IParsingBackend.py b/UniGrammarRuntime/IParsingBackend.py new file mode 100644 index 0000000..7a96845 --- /dev/null +++ b/UniGrammarRuntime/IParsingBackend.py @@ -0,0 +1,119 @@ +import typing +from abc import ABCMeta, abstractmethod + +backendsRegistry = {} + + +class ParserNotFoundException(Exception): + """Means that not all parser components have been found""" + + +class IParsingBackendMeta(ABCMeta): + __slots__ = () + + def __new__(cls: typing.Type["TemplateMeta"], className: str, parents: typing.Tuple[typing.Type, ...], attrs: typing.Dict[str, typing.Any]) -> "Template": # pylint:disable=arguments-differ + res = super().__new__(cls, className, parents, attrs) + + parserFactoryClass = attrs.get("PARSER", None) + if parserFactoryClass is not None: + parserClass = getattr(parserFactoryClass, "PARSER_CLASS", None) + if parserClass is not None and parserFactoryClass.META is not None: + backendsRegistry[parserFactoryClass.META.product.name] = res + + return res + + +class ToolSpecificGrammarASTWalkStrategy: + """Very generic methods to walk either + * ASTs of tool-specific grammars themselves; + * ASTs parsed using tool-specific grammars. + + They are in the same class because tools often have similar interfaces for both. Very often the nodes of tools' grammars are parsed with the tools themselves. + """ + + __slots__ = ("parserFactory",) + + def __init__(self, parserFactory): + self.parserFactory = parserFactory + + def iterateChildren(self, node): + """Gets an iterable of the children nodes of a tool-specific AST node""" + raise NotImplementedError + + def isTerminal(self, node): + """Returns whether a node is a terminal that should not be iterated further""" + raise NotImplementedError + + def iterateCollection(self, lst) -> typing.Any: + """Gets an iterable of the items of a tool-specific collection AST node""" + raise NotImplementedError + + def isCollection(self, lst: typing.Any) -> bool: + """Returns whether a tool-specific AST node represents a collection""" + raise NotImplementedError + + def enterOptional(self, optional: typing.Any, childProcessor) -> bool: + """Enters a tool-specific optional AST node, applying `childProcessor` to its content""" + raise NotImplementedError + + def isOptionalPresent(self, optional) -> bool: + return optional is not None + + def getOptional(self, optional) -> typing.Any: + return optional + + +class IParsingBackend(metaclass=IParsingBackendMeta): + """A class commanding the parsing. Calls the generated parser and postprocesses its output""" + + __slots__ = ("parser", "wstr") + + PARSER = None + WSTR = None # type: typing.Type[ToolSpecificGrammarASTWalkStrategy] + + @property + def NAME(self): + return self.PARSER.NAME + + EX_CLASS = Exception + ITER_INTROSPECTION = True + CAP_INTROSPECTION = True + + def __init__(self, grammarResources: "InMemoryGrammarResources") -> None: + self.parser = self.__class__.PARSER().fromBundle(grammarResources) + self.wstr = self.__class__.WSTR(self.__class__) + + def _getSubTreeText(self, lst: typing.Any) -> typing.Iterator[str]: + if self.wstr.isCollection(lst): + for t in self.wstr.iterateCollection(lst): + yield from self._getSubTreeText(t) + elif self.wstr.isTerminal(lst): + yield self.terminalNodeToStr(lst) + else: + for t in self.wstr.iterateChildren(lst): + yield from self._getSubTreeText(t) + + def enterOptional(self, optional: typing.Any, childProcessor) -> bool: + """Applies `childProcessor` to the content of an optional AST node if it is present, otherwise returns None""" + if self.wstr.isOptionalPresent(optional): + return childProcessor(self.wstr.getOptional(optional)) + + return None + + def getSubTreeText(self, node: typing.Any) -> str: + """Merges a tree of text tokens into a single string""" + return "".join(self._getSubTreeText(node)) + + #@abstractmethod + #def isCollection(self, lst): + # raise NotImplementedError() + + def preprocessAST(self, ast: typing.Any) -> typing.Any: + return ast + + def parse(self, s: str) -> typing.Any: + return self.parser(s) + + def terminalNodeToStr(self, token: typing.Optional[typing.Any]) -> typing.Optional[typing.Any]: + return token diff --git a/UniGrammarRuntime/IWrapper.py b/UniGrammarRuntime/IWrapper.py new file mode 100644 index 0000000..80cbcfd --- /dev/null +++ b/UniGrammarRuntime/IWrapper.py @@ -0,0 +1,34 @@ +import typing +from abc import ABC + +# pylint:disable=too-few-public-methods + + +class
IParseResult: + __slots__ = () + + def __repr__(self): + return "".join( + ( + self.__class__.__name__, + "<", + ", ".join( + "=".join((k, repr(getattr(self, k)))) + for k in self.__class__.__slots__ + ), + ">", + ) + ) + + +class IWrapper(ABC): + __slots__ = ("backend",) + + __MAIN_PRODUCTION__ = None + + def __init__(self, backend): + self.backend = backend + + def __call__(self, s: str) -> typing.Union[typing.Iterable[IParseResult], IParseResult]: + preprocessed = self.backend.preprocessAST(self.backend.parse(s)) + return self.__MAIN_PRODUCTION__(preprocessed) diff --git a/UniGrammarRuntime/ParserBundle.py b/UniGrammarRuntime/ParserBundle.py new file mode 100644 index 0000000..9e94c0d --- /dev/null +++ b/UniGrammarRuntime/ParserBundle.py @@ -0,0 +1,183 @@ +import typing +from collections import defaultdict +from pathlib import Path +from warnings import warn + +from transformerz import dummyTransformer +from transformerz.core import Transformer, TransformerBase +from transformerz.serialization.json import jsonFancySerializer +from transformerz.serialization.python import pythonASTFancySerializer +from transformerz.text import utf8Transformer +from urm.core import Dynamic +from urm.fields import Field0D, FieldND +from urm.mappers import ColdMapper, HotMapper +from urm.mappers.key import PrefixKeyMapper, fieldNameKeyMapper +from urm.mappers.serializer import JustReturnSerializerMapper +from urm.ProtoBundle import ProtoBundle +from urm.storers.cold import FileSaver +from urm.storers.hot import PrefixCacher + +from . import backends # pylint:disable=unused-import # Imports all the stuff this way creating classes auto-registered to the registry via a metaclass +from .benchmark import BenchmarkData, benchmark +from .IParsingBackend import backendsRegistry +from .utils import getPythonModule + +#fileSaverIGR = FileSaver(Dynamic(("parent", "bundleDir")), Dynamic(("parent", "serializer", "fileExtension"))) +fileSaverIGR = FileSaver(Dynamic(("parent", "bundleDir")), "json") +nameD = Dynamic("name") +ourCacher = HotMapper(fieldNameKeyMapper, PrefixCacher()) + + +class OurTransformer(Transformer): + registry = None + + +benchmarkDataNormalizer = OurTransformer("benchmarkDataNormalizer", lambda d: d.toNormalizedDict(), BenchmarkData.fromNormalizedDict, dict, BenchmarkData) + + +def constantParamsSerializerMapper(parent: ProtoBundle) -> TransformerBase: + return parent.parent.serializer + + +def benchmarkDataSerializerMapper(parent: ProtoBundle) -> TransformerBase: + return parent.parent.serializer + benchmarkDataNormalizer + + +pythonASTSerializerMapper = JustReturnSerializerMapper(utf8Transformer + pythonASTFancySerializer) + + +class InMemoryGrammarResources(ProtoBundle): + __slots__ = ("parent", "name", "_backendsData", "_metrics", "_capSchema", "_iterSchema", "_wraperClass") + + capSchema = Field0D(ColdMapper(PrefixKeyMapper("schemas", "capless", nameD), fileSaverIGR, constantParamsSerializerMapper), ourCacher) + iterSchema = Field0D(ColdMapper(PrefixKeyMapper("schemas", "iterless", nameD), fileSaverIGR, constantParamsSerializerMapper), ourCacher) + + metrics = Field0D(ColdMapper(PrefixKeyMapper("metrics", nameD), fileSaverIGR, benchmarkDataSerializerMapper), ourCacher) + wrapperAST = Field0D(ColdMapper(PrefixKeyMapper("wrappers", nameD), FileSaver(Dynamic(("parent", "bundleDir")), "py"), pythonASTSerializerMapper)) + + def __init__(self, name: str) -> None: + self.name = name + self.parent = None + self._backendsData = defaultdict(dict) + self._capSchema = None + self._iterSchema = None + 
self._wraperClass = None + self._metrics = None + + def getWrapperModule(self): + return getPythonModule(self.wrapperAST, self.parent.bundleDir / self.__class__.wrapperAST.strategy.cold.key.prefix[0] / (self.name + ".py")) + + @property + def wrapperClass(self): + res = self._wraperClass + if res is None: + self._wraperClass = res = self.getWrapperModule()["__MAIN_PARSER__"] + return res + + def getWrapper(self, backendName: typing.Optional[str] = None): + return self.wrapperClass(self.getBackend(backendName)) + + #def __repr__(self): + # return self.__class__.__name__ + "" + + def getBackend(self, backendName: typing.Optional[str] = None): + if backendName is None: + backendName = self.getFastestBackendName() + return self.parent.backends[backendName](self) + + def getFastestBackendName(self, criteria=None): + fastestMetrics = self.metrics.getFastest(criteria) + fastestBackendName = fastestMetrics[0] + return fastestBackendName + + def benchmark(self, testData: typing.Iterable[str], backendNames: typing.Union[str, typing.Iterable[str], None] = None, timeBudget: float = 10, benchmarkModes=None, smallCount=100): + + if isinstance(backendNames, str): + backendNames = (backendNames,) + elif backendNames is None: + backendNames = tuple(self.parent.backends.keys()) + + return benchmark(self, testData, backendNames, timeBudget, benchmarkModes, smallCount) + + def benchmarkAndUpdate(self, *args, **kwargs): + metrics = self.benchmark(*args, **kwargs) + self.metrics = metrics + #self.save("metrics") + return metrics + + benchmarkAndUpdate.__wraps__ = benchmark + +
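+
+# Illustrative sketch (the bundle, grammar name and test corpus are made-up): measure all
+# discovered backends, persist the metrics, then let the bundle pick the fastest one.
+#
+#   res = bundle.grammars["myGrammar"]
+#   res.benchmarkAndUpdate(["sample input 1", "sample input 2"])
+#   res.getFastestBackendName()  # reads the stored metrics
+#   w = res.getWrapper()  # wrapper over the fastest backend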
Resources are not moved automatically") + v.parent = self.parent + self.underlyingCollection[k] = v + + def __getattr__(self, k: str) -> typing.Callable: + return getattr(self.underlyingCollection, k) + + +parseBundleCompiledKeyMapper = PrefixKeyMapper("compiled") +parseBundleCompiledParentDir = Dynamic(("bundleDir",)) +compiledMapperColdSaver = FileSaver(parseBundleCompiledParentDir, None) + + +class ParserBundle(ProtoBundle): + """A class to manage components of a parser""" + + __slots__ = ("backends", "grammars", "bundleDir") + + serializer = utf8Transformer + jsonFancySerializer + + backendsBinaryData = FieldND(ColdMapper(parseBundleCompiledKeyMapper, compiledMapperColdSaver, JustReturnSerializerMapper(dummyTransformer))) + backendsTextData = FieldND(ColdMapper(parseBundleCompiledKeyMapper, compiledMapperColdSaver, JustReturnSerializerMapper(utf8Transformer))) + backendsPythonAST = FieldND(ColdMapper(parseBundleCompiledKeyMapper, FileSaver(parseBundleCompiledParentDir, "py"), pythonASTSerializerMapper)) + + def __init__(self, path: typing.Optional[Path] = None) -> None: + self.bundleDir = path + self.initBackends() + self.grammars = GrammarsCollection(self) + + def initBackends(self): + self.backends = {b.PARSER.META.product.name: b for b in self.discoverBackends()} + + def discoverBackends(self) -> None: + """Used to discover backends for which parsers are present in a bundle""" + for name in self._discoverBackends(): + if name in backendsRegistry: + yield backendsRegistry[name] + else: + warn("Backend " + name + " is not in the registry, skipping") + + def _discoverBackends(self) -> None: + """Upstream stuff to discover backends. Must return backends names present in a bundle""" + backendsDataDir = self.bundleDir / "compiled" + if backendsDataDir.is_dir(): + for p in backendsDataDir.iterdir(): + if p.is_dir() and p.name[0] != "_": + yield p.name + + def save(self, propName: typing.Optional[str] = None) -> None: + self.bundleDir.mkdir(parents=True, exist_ok=True) + for el in self.grammars.values(): + el.save() + super().save(propName) diff --git a/UniGrammarRuntime/ToolMetadata.py b/UniGrammarRuntime/ToolMetadata.py new file mode 100644 index 0000000..eb66eb0 --- /dev/null +++ b/UniGrammarRuntime/ToolMetadata.py @@ -0,0 +1,22 @@ +import typing +import warnings + +from .grammarClasses import GrammarClassType + + +class Product: + __slots__ = ("name", "website") + + def __init__(self, name: str, website: typing.Union[typing.Tuple[str, str], str]) -> None: + self.name = name + self.website = website + + +class ToolMetadata(Product): + __slots__ = ("product", "buildsTree", "grammarClasses", "runtimeLib") + + def __init__(self, product: typing.Optional[Product], runtimeLib: typing.Dict[str, typing.Optional[str]], buildsTree: typing.Optional[bool], grammarClasses: typing.Iterable[GrammarClassType]) -> None: + self.product = product + self.runtimeLib = runtimeLib + self.buildsTree = buildsTree + self.grammarClasses = grammarClasses diff --git a/UniGrammarRuntime/__init__.py b/UniGrammarRuntime/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/UniGrammarRuntime/backends/__init__.py b/UniGrammarRuntime/backends/__init__.py new file mode 100644 index 0000000..79026e4 --- /dev/null +++ b/UniGrammarRuntime/backends/__init__.py @@ -0,0 +1,2 @@ +from .multilanguage import antlr4, waxeye +from .python import TatSu, arpeggio, parglare, parsimonious diff --git a/UniGrammarRuntime/backends/multilanguage/CoCoR.py b/UniGrammarRuntime/backends/multilanguage/CoCoR.py new file mode 100644 
index 0000000..f3f7083 --- /dev/null +++ b/UniGrammarRuntime/backends/multilanguage/CoCoR.py @@ -0,0 +1,46 @@ +import typing +from collections import OrderedDict +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser, IParserFactory + +from ...grammarClasses import LL +from ...IParser import IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend +from ...ToolMetadata import Product, ToolMetadata +from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin + + +class CoCoRParser(IParser): + NAME = "CoCo/R" + #EXT = "CoCo/R" + + __slots__ = ("parser",) + + def __init__(self, parser) -> None: + super().__init__() + self.parser = parser + + def __call__(self, s: str): + return self.parser.parse(s) + + +class CoCoRParserFactory(IParserFactory): + __slots__ = () + PARSER_CLASS = CoCoRParser + META = ToolMetadata( + Product( + name="CoCo/R", + website=("https://codeberg.org/KOLANICH/CoCoPy",), + ), + runtimeLib={ + "python": None, + "java": None, + "c#": None, + "c++": None, + "basic": None, + "pascal": None, + }, + grammarClasses=(LL(1),), + buildsTree=False, + ) diff --git a/UniGrammarRuntime/backends/multilanguage/__init__.py b/UniGrammarRuntime/backends/multilanguage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/UniGrammarRuntime/backends/multilanguage/antlr4.py b/UniGrammarRuntime/backends/multilanguage/antlr4.py new file mode 100644 index 0000000..ebc027d --- /dev/null +++ b/UniGrammarRuntime/backends/multilanguage/antlr4.py @@ -0,0 +1,116 @@ +import typing + +from ...grammarClasses import LL +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata + +try: + from antlrCompile.backends.python import ANTLRInternalClassesPython + from antlrCompile.core import ANTLRParserFactory as ANTLRCompileANTLRParserFactory + from antlrCompile.core import backendsPool +except ImportError: + from warnings import warn + + antlrCompileNotInstalledErrorMessage = "antlrCompile is not installed, generation of ANTLR bundles and visualization of results is not available" + warn(antlrCompileNotInstalledErrorMessage) + + class ANTLRDummy: + __slots__ = () + + def compileStr(self, *args, **kwargs): + raise NotImplementedError(antlrCompileNotInstalledErrorMessage) + + class ANTLRCompileDummy: + __slots__ = () + + def __init__(self, *args, **kwargs): + raise NotImplementedError(antlrCompileNotInstalledErrorMessage) + + ANTLR = ANTLRDummy + ANTLRCompileVis = ANTLRCompileDummy + ANTLRCompileANTLRParserFactory = ANTLRCompileDummy + ANTLRInternalClassesPython = ANTLRCompileDummy + + +toolGithubOrg = "https://github.com/antlr" +toolRepoBase = toolGithubOrg + "/antlr4" +toolRuntimesBase = toolRepoBase + "/tree/master/runtime" + +languagesRemap = { + "python": "Python3", + "js": "JavaScript", + "java": "Java", + "go": "Go", + "c++": "Cpp", + "c#": "CSharp", + "swift": "Swift", +} + + +class ANTLRParserFactory(ANTLRCompileANTLRParserFactory): + __slots__ = () + + META = ToolMetadata( + Product( + name="antlr4", + #website=toolRepoBase, + website="https://codeberg.org/UniGrammar/antlr4", # temporarily till our changes are merged + ), + runtimeLib={ + lang: (toolRuntimesBase + "/" + antlrLang) for lang, antlrLang in languagesRemap.items() + }, + grammarClasses=(LL,), + buildsTree=True, + ) + + def _bundleToIterable(self, backend, grammarResources: "InMemoryGrammarResources") -> typing.Iterable[typing.Any]: + return backend._somethingToIterable(grammarResources, lambda 
grammarResources, role, className: grammarResources.parent.backendsPythonAST[self.__class__.PARSER_CLASS.NAME, className]) + + antlr4 = None + + def fromBundle(self, grammarResources: "InMemoryGrammarResources") -> "antlrCompile.core.ANTLRParser": + pythonBackend = backendsPool(ANTLRInternalClassesPython) + self.__class__.antlr4 = pythonBackend.antlr4 + return self._fromAttrIterable(pythonBackend, self._bundleToIterable(pythonBackend, grammarResources)) + + +class ANTLRWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + return node.children + + def isTerminal(self, node: "antlr4.tree.Tree.TerminalNodeImpl") -> bool: + return isinstance(node, (str, self.parserFactory.PARSER.antlr4.tree.Tree.TerminalNode, self.parserFactory.PARSER.antlr4.Token)) + + def iterateCollection(self, lst: "antlr4.ParserRuleContext.ParserRuleContext") -> typing.Any: + if lst: + if lst.children: + return lst.children + + return () + + def isCollection(self, lst: typing.Any) -> bool: + return isinstance(lst, self.parserFactory.PARSER.antlr4.RuleContext) + + def isOptionalPresent(self, optional) -> bool: + return optional is not None and bool(optional.children) + + def getOptional(self, optional) -> typing.Any: + assert len(optional.children) == 1 + return optional.children[0] + + +class ANTLRParsingBackend(IParsingBackend): + __slots__ = () + PARSER = ANTLRParserFactory + WSTR = ANTLRWalkStrategy + + def terminalNodeToStr(self, token: typing.Union["antlr4.Token.CommonToken", "antlr4.tree.Tree.TerminalNodeImpl"]) -> typing.Optional[str]: + if token is not None: + if isinstance(token, str): + return token + if isinstance(token, self.__class__.PARSER.antlr4.Token): + return token.text + return token.getText() + return None diff --git a/UniGrammarRuntime/backends/multilanguage/waxeye.py b/UniGrammarRuntime/backends/multilanguage/waxeye.py new file mode 100644 index 0000000..9611d9b --- /dev/null +++ b/UniGrammarRuntime/backends/multilanguage/waxeye.py @@ -0,0 +1,181 @@ +import typing +from collections import OrderedDict + +from UniGrammarRuntimeCore.IParser import IParser + +from ...grammarClasses import PEG +from ...IParser import IParserFactoryFromPrecompiled +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata +from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin, TerminalNodeMixin + +waxeye = None + + +toolGitRepo = "https://github.com/waxeye-org/waxeye" +masterBranchURI = toolGitRepo + "/tree/master" +srcURI = masterBranchURI + "/src" + + +def decapitalizeFirst(s: str) -> str: + return "".join((s[0].lower(), s[1:])) + + +def capitalizeFirst(s: str) -> str: + return "".join((s[0].upper(), s[1:])) + + +class WaxeyeParser(IParser): + NAME = "waxeye" + + __slots__ = ("parser",) + + def __init__(self, parser): + super().__init__() + self.parser = parser + + def __call__(self, s: str) -> "waxeye.AST": + return self.parser.parse(s) + + +class WaxeyeParserFactory(IParserFactoryFromPrecompiled): + __slots__ = () + PARSER_CLASS = WaxeyeParser + META = ToolMetadata( + Product( + name="waxeye", + website=toolGitRepo, + ), + runtimeLib={ + "python": srcURI + "/python", + "js": srcURI + "/javascript", + "java": srcURI + "/java", + "c++": srcURI + "/c", + "racket": srcURI + "/racket", + "ruby": srcURI + "/ruby", + "sml": srcURI + "/sml", + }, + grammarClasses=(PEG,), + buildsTree=True, + ) + + NodeWithAttrChildren = None + ListNodes = None +
TerminalNode = None + + def processEvaledGlobals(self, globalz: dict, grammarName: str): + return globalz[grammarName.capitalize() + "Parser"] + + def getSource(self, grammarResources: "InMemoryGrammarResources") -> "ast.Module": + return grammarResources.parent.backendsPythonAST[self.__class__.META.product.name, grammarResources.name + "_parser"] + + def __init__(self) -> None: + global waxeye + if waxeye is None: + import waxeye # pylint:disable=import-outside-toplevel,redefined-outer-name + + class NodeWithAttrChildren(waxeye.AST, NodeWithAttrChildrenMixin): # pylint:disable=redefined-outer-name,unused-variable + __slots__ = () + self.__class__.NodeWithAttrChildren = NodeWithAttrChildren + + class ListNodes(waxeye.AST, ListNodesMixin): # pylint:disable=redefined-outer-name,unused-variable + __slots__ = () + self.__class__.ListNodes = ListNodes + + class TerminalNode(waxeye.AST, TerminalNodeMixin): # pylint:disable=redefined-outer-name,unused-variable + __slots__ = () + self.__class__.TerminalNode = TerminalNode + + super().__init__() + + +class WaxeyeParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + return node.children + + def isTerminal(self, node): + return isinstance(node, (str, WaxeyeParserFactory.TerminalNode)) + + def iterateCollection(self, lst): + return lst + + def isCollection(self, lst: typing.Union["waxeye.AST", str]) -> bool: + return isinstance(lst, WaxeyeParserFactory.ListNodes) + + +class WaxeyeParsingBackend(IParsingBackend): + __slots__ = ("parser", "capSchema", "iterSchema") + + PARSER = WaxeyeParserFactory + WSTR = WaxeyeParserBackendWalkStrategy + ITER_INTROSPECTION = False + CAP_INTROSPECTION = False + + #EX_CLASS = waxeye.ParseError # not an Exception + + def __init__(self, grammarResources: "InMemoryGrammarResources") -> None: + super().__init__(grammarResources) + + self.capSchema = grammarResources.capSchema # type: typing.Dict[str, typing.Dict[str, str]] + self.iterSchema = grammarResources.iterSchema # type: typing.List[str] + + def _transformWaxeyeAST(self, node: "waxeye.AST") -> None: + """ + Fucking waxeye decapitalizes all the identifiers, destroying uniformity between backends. So we have 2 lookups instead of one. It is definitely a bug in waxeye. 
+ capitalizedType = capitalizeFirst(node.type) + + if node.type not in self.iterSchema and capitalizedType not in self.iterSchema: + newChildren = OrderedDict() + thisElMapping = None + if node.type in self.capSchema: + thisElMapping = self.capSchema[node.type] + elif capitalizedType in self.capSchema: + thisElMapping = self.capSchema[capitalizedType] + + for i, child in enumerate(node.children): + nameToUse = str(i) # we cannot use just ints as keys for ListLikeDict because it also supports positional indexing + if not isinstance(child, str): + childProdName = child.type + self._transformWaxeyeAST(child) + if thisElMapping: + childProdNameCapitalized = capitalizeFirst(childProdName) + + if childProdName in thisElMapping: + nameToUse = thisElMapping[childProdName] # recovered name + elif childProdNameCapitalized in thisElMapping: + nameToUse = thisElMapping[childProdNameCapitalized] # recovered name + + if isinstance(nameToUse, int): + # we have to insert something, and in this case it's better to have the prod name than just a number + nameToUse = childProdName + newChildren[nameToUse] = child + node.children = ListLikeDict(newChildren) + + if len(node.children) == 1 and isinstance(node.children[0], str): + node.__class__ = self.__class__.PARSER.TerminalNode + else: + node.__class__ = self.__class__.PARSER.NodeWithAttrChildren + else: + for child in node.children: + self._transformWaxeyeAST(child) + node.__class__ = self.__class__.PARSER.ListNodes + + def parse(self, s: str) -> "waxeye.AST": + import waxeye + + res = self.parser(s) + if isinstance(res, waxeye.ParseError): + raise ValueError(res) + + return res + + def preprocessAST(self, ast): + self._transformWaxeyeAST(ast) + return ast + + def terminalNodeToStr(self, token: typing.Union[str, "waxeye.AST"]) -> str: + return str(token) diff --git a/UniGrammarRuntime/backends/python/PyDSL.py b/UniGrammarRuntime/backends/python/PyDSL.py new file mode 100644 index 0000000..c23805c --- /dev/null +++ b/UniGrammarRuntime/backends/python/PyDSL.py @@ -0,0 +1,98 @@ +import typing +from collections import OrderedDict +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser + +from ...DSLMetadata import DSLMetadata +from ...grammarClasses import PEG +from ...IParser import IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata +from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin + +DSL = None +NodeWithAttrChildren = None +ListNodes = None + + +toolGitRepo = "https://github.com/coquelicot/PyDSL" +masterBranchURI = toolGitRepo + "/tree/master" + + +class PyDSLParser(IParser): + NAME = "PyDSL" + + __slots__ = ("parser", "lexer") + + def __init__(self, parser) -> None: + super().__init__() + self.parser = parser + + def __call__(self, s: str): + return self.parser.parse(s) + + +class PyDSLParserFactory(IParserFactoryFromSource): + __slots__ = () + PARSER_CLASS = PyDSLParser + META = ToolMetadata( + Product( + name="PyDSL", + website=toolGitRepo, + ), + runtimeLib={ + "python": masterBranchURI, + }, + grammarClasses=(None,), + buildsTree=True, + ) + + def __init__(self) -> None: + global DSL + if DSL is None: + import DSL # pylint:disable=import-outside-toplevel,redefined-outer-name + + super().__init__() + + def compileStr(self, grammarText: str, target=None, fileName: Path = None): + return DSL.makeDSL(grammarText) + + def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any: + return
self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target)) + + +class PyDSLParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + raise NotImplementedError + + def isTerminal(self, node): + raise NotImplementedError + + def iterateCollection(self, lst) -> typing.Any: + raise NotImplementedError + + def isCollection(self, lst: typing.Any) -> bool: + raise NotImplementedError + + +class PyDSLParsingBackend(IParsingBackend): + __slots__ = ("parser", "capSchema") + ITER_INTROSPECTION = True + CAP_INTROSPECTION = True + PARSER = PyDSLParserFactory + WSTR = PyDSLParserBackendWalkStrategy + + def __init__(self, grammarResources: "InMemoryGrammarResources") -> None: + global NodeWithAttrChildren, ListNodes + + super().__init__(grammarResources) + + def terminalNodeToStr(self, token) -> typing.Optional[str]: + raise NotImplementedError + + def getSubTreeText(self, node) -> str: + """Merges a tree of text tokens into a single string""" + raise NotImplementedError diff --git a/UniGrammarRuntime/backends/python/TatSu.py b/UniGrammarRuntime/backends/python/TatSu.py new file mode 100644 index 0000000..e0bf359 --- /dev/null +++ b/UniGrammarRuntime/backends/python/TatSu.py @@ -0,0 +1,147 @@ +import ast +import typing +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser + +from ...DSLMetadata import DSLMetadata +from ...grammarClasses import PEG +from ...IParser import IParserFactoryFromPrecompiled, IParserFactoryFromPrecompiledOrSource, IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata + +toolGitRepo = "https://github.com/neogeny/TatSu" + + +class TatSuParser(IParser): + __slots__ = ("parser",) + NAME = "TatSu" + + def __init__(self, parser): + super().__init__() + self.parser = parser + + def __call__(self, s: str): + return self.parser.parse(s, getattr(self.parser, "_MAIN_PRODUCTION_NAME", None)) + + +class TatSuParserFactoryFromPrecompiled(IParserFactoryFromPrecompiled): + __slots__ = () + + PARSER_CLASS = TatSuParser + + @classmethod + def ensureInitialized(cls): + TatSuParserFactory.ensureInitialized() + + def getSource(self, grammarResources: "InMemoryGrammarResources"): + parserAST = super().getSource(grammarResources) + parserClassNode = _getParserClass(parserAST, grammarResources.name) + firstRuleName = _getFirstRuleNameFromCompiled(parserClassNode) + parserClassNode.body.append(ast.Assign( + targets=[ast.Name( + "_MAIN_PRODUCTION_NAME", ctx=ast.Store(), + lineno=-1, + col_offset=-1 + )], + value=ast.Str(firstRuleName, + lineno=-1, + col_offset=-1 + ), + type_comment=None, + lineno=-1, + col_offset=-1 + )) + return parserAST + + def processEvaledGlobals(self, globalz: dict, grammarName: str): + return globalz[grammarName + "Parser"] + + +class TatSuParserFactoryFromSource(IParserFactoryFromSource): + __slots__ = () + + PARSER_CLASS = TatSuParser + FORMAT = DSLMetadata( + officialLibraryRepo=toolGitRepo + "/tree/master/examples", + grammarExtensions=("ebnf",), + ) + + @classmethod + def ensureInitialized(cls): + TatSuParserFactory.ensureInitialized() + + def compileStr(self, grammarText: str, target: str = None, fileName: Path = None): + return TatSuParserFactory.tatsu.compile(grammarText, None, filename=(str(fileName) if fileName else None)) + + +class TatSuParserFactory(IParserFactoryFromPrecompiledOrSource): + PRECOMPILED = TatSuParserFactoryFromPrecompiled + SOURCE = 
TatSuParserFactoryFromSource + PARSER_CLASS = TatSuParser + + META = ToolMetadata( + Product( + name="TatSu", + website=toolGitRepo, + ), + runtimeLib={ + "python": toolGitRepo, + }, + grammarClasses=(PEG,), + buildsTree=True, + ) + + tatsu = None + + @classmethod + def ensureInitialized(cls): + if cls.tatsu is None: + import tatsu # pylint:disable=import-outside-toplevel,redefined-outer-name + + cls.tatsu = tatsu + + +def _getParserClass(m: ast.Module, grammarName: str): + """TatSu has a bug: to call a Python-precompiled grammar one needs to explicitly provide the first rule name (for a grammar created from source one doesn't), but it is not available in it in machine-readable form. Fortunately, it is the first function in the class.""" + className = grammarName + "Parser" + + for n in m.body: + if isinstance(n, ast.ClassDef) and n.name == className: + return n + raise Exception("Parser class has not been found") + + +def _getFirstRuleNameFromCompiled(classNode: ast.ClassDef) -> str: + for cn in classNode.body: + if isinstance(cn, ast.FunctionDef) and cn.decorator_list: + firstDecorator = cn.decorator_list[0] + if isinstance(firstDecorator, ast.Call) and firstDecorator.func.id == "tatsumasu": + return cn.name[1:-1] + raise Exception("No productions have been found") + + +class TatSuParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + #return node.children + raise NotImplementedError + + def isTerminal(self, node): + return isinstance(node, str) + + def iterateCollection(self, lst) -> typing.Any: + return lst + + def isCollection(self, lst: typing.Any) -> bool: + return isinstance(lst, list) + + +class TatSuParsingBackend(IParsingBackend): + __slots__ = () + PARSER = TatSuParserFactory + WSTR = TatSuParserBackendWalkStrategy + + def terminalNodeToStr(self, token) -> typing.Optional[str]: + return token
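+
+# Illustrative sketch (the grammar text is a made-up TatSu EBNF snippet): the hybrid factory
+# falls back to compiling from source when no precompiled parser is available.
+#
+#   TatSuParserFactory.ensureInitialized()
+#   factory = TatSuParserFactory()
+#   parser = factory.compileStr('start = "a" $ ;')  # proxied to TatSuParserFactoryFromSource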
diff --git a/UniGrammarRuntime/backends/python/__init__.py b/UniGrammarRuntime/backends/python/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/UniGrammarRuntime/backends/python/arpeggio.py b/UniGrammarRuntime/backends/python/arpeggio.py new file mode 100644 index 0000000..eef9418 --- /dev/null +++ b/UniGrammarRuntime/backends/python/arpeggio.py @@ -0,0 +1,148 @@ +import typing +from collections import OrderedDict +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser + +from ...DSLMetadata import DSLMetadata +from ...grammarClasses import PEG +from ...IParser import IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata +from ...utils import AttrDict, flattenDictsIntoIterable + + +class ArpeggioParser(IParser): + __slots__ = ("parser",) + + def __init__(self, parser) -> None: + super().__init__() + self.parser = parser + + def __call__(self, s: str): + return self.parser.parse(s) + + +toolGitRepo = "https://github.com/textX/Arpeggio" + + +class ArpeggioParserFactory(IParserFactoryFromSource): + __slots__ = () + PARSER_CLASS = ArpeggioParser + FORMAT = DSLMetadata( + officialLibraryRepo=toolGitRepo + "/tree/master/examples", + grammarExtensions=("peg",) + ) + + META = ToolMetadata( + Product( + name="arpeggio", + website=toolGitRepo, + ), + runtimeLib={ + "python": toolGitRepo, + }, + grammarClasses=(PEG,), + buildsTree=None, + ) + + arpeggio = None + + @classmethod + def ensureInitialized(cls): + # pylint:disable=import-outside-toplevel,redefined-outer-name + if cls.arpeggio is None: + import arpeggio + import arpeggio.peg + + cls.arpeggio = arpeggio + + @classmethod + def getFirstRuleName(cls, grammarSrc: str) -> str: + parser = cls.arpeggio.ParserPython(cls.arpeggio.peg.peggrammar, cls.arpeggio.peg.comment, reduce_tree=False) + parsedAST = parser.parse(grammarSrc) + for el in parsedAST: + if el.rule_name == "rule": + if el[0].rule_name == "rule_name": + return el[0].flat_str() + + def compileStr(self, grammarText: str, target=None, fileName: Path = None): + firstRuleName = self.__class__.getFirstRuleName(grammarText) + return self.__class__.arpeggio.peg.ParserPEG(grammarText, firstRuleName, skipws=False, debug=False) + + def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any: + return self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target)) + + +TransformedASTElT = typing.Union["arpeggio.Terminal", "TransformedASTT"] +TransformedASTT = typing.Mapping[str, TransformedASTElT] + + +class ArpeggioParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + yield from node + + def isTerminal(self, node): + return isinstance(node, str) + + def iterateCollection(self, lst) -> typing.Any: + yield from lst + + def isCollection(self, lst) -> bool: + return isinstance(lst, list) + + +class ArpeggioParsingBackend(IParsingBackend): + __slots__ = ("parser", "capSchema", "iterSchema") + ITER_INTROSPECTION = False + CAP_INTROSPECTION = False + PARSER = ArpeggioParserFactory + WSTR = ArpeggioParserBackendWalkStrategy + + @classmethod + def _transformArpeggioAST(cls, node, capSchema: typing.Dict[str, typing.Dict[str, str]], iterSchema: typing.List[str]) -> TransformedASTElT: + if node.rule_name not in iterSchema: + newChildren = AttrDict() + thisElMapping = None + if node.rule_name in capSchema: + thisElMapping = capSchema[node.rule_name] + + if not isinstance(node, cls.PARSER.arpeggio.Terminal): + for i, child in enumerate(node): + nameToUse = str(i) # we cannot use just ints as keys for ListLikeDict because it also supports positional indexing + if not isinstance(child, str): + childProdName = child.rule_name + newChild = cls._transformArpeggioAST(child, capSchema, iterSchema) + if thisElMapping: + if childProdName in thisElMapping: + nameToUse = thisElMapping[childProdName] # recovered name + + if isinstance(nameToUse, int): + # we have to insert something, and in this case it's better to have the prod name than just a number + nameToUse = childProdName + else: + newChild = child + newChildren[nameToUse] = newChild + return newChildren + return node.flat_str() + else: + return [cls._transformArpeggioAST(child, capSchema, iterSchema) for child in node] + + def __init__(self, grammarResources: "InMemoryGrammarResources") -> None: + super().__init__(grammarResources) + self.capSchema = grammarResources.capSchema + self.iterSchema = grammarResources.iterSchema + + self.__class__.PARSER.ensureInitialized() + + def preprocessAST(self, ast): + return self.__class__._transformArpeggioAST(ast, self.capSchema, self.iterSchema) + + def terminalNodeToStr(self, token) -> typing.Optional[str]: + return "".join(flattenDictsIntoIterable(token)) + + def getSubTreeText(self, node) -> str: + """Merges a tree of text tokens into a single string""" + return "".join(flattenDictsIntoIterable(node)) diff --git a/UniGrammarRuntime/backends/python/lark.py b/UniGrammarRuntime/backends/python/lark.py new file mode 100644 index 0000000..f338804 --- /dev/null +++ b/UniGrammarRuntime/backends/python/lark.py @@ -0,0 +1,101 @@ +import
typing +from collections import OrderedDict +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser + +from ...DSLMetadata import DSLMetadata +from ...grammarClasses import PEG +from ...IParser import IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata +from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin + +lark = None +NodeWithAttrChildren = None +ListNodes = None + + +toolGitRepo = "https://github.com/lark-parser/lark" +masterBranchURI = toolGitRepo + "/tree/master" +srcURI = masterBranchURI + "/src" + + +class LarkParser(IParser): + NAME = "lark" + #EXT = "lark" + + __slots__ = ("parser",) + + def __init__(self, parser) -> None: + super().__init__() + self.parser = parser + + def __call__(self, s: str): + return self.parser.parse(s) + + +class LarkParserFactory(IParserFactoryFromSource): + __slots__ = () + PARSER_CLASS = LarkParser + META = ToolMetadata( + Product( + name="lark", + website=toolGitRepo, + ), + runtimeLib={ + "python": srcURI, + }, + grammarClasses=(PEG,), + buildsTree=True, + ) + + def __init__(self) -> None: + global lark + + if lark is None: + import lark # pylint:disable=import-outside-toplevel,redefined-outer-name + + super().__init__() + + def compileStr(self, grammarText: str, target=None, fileName: Path = None): + return lark.Lark(grammarText, parser="lalr", lexer="auto") + + def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any: + return self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target)) + + +class LarkParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy): + __slots__ = () + + def iterateChildren(self, node): + raise NotImplementedError + + def isTerminal(self, node): + raise NotImplementedError + + def iterateCollection(self, lst) -> typing.Any: + raise NotImplementedError + + def isCollection(self, lst: typing.Any) -> bool: + raise NotImplementedError + + +class LarkParsingBackend(IParsingBackend): + __slots__ = ("parser", "capSchema") + ITER_INTROSPECTION = True + CAP_INTROSPECTION = True + PARSER = LarkParserFactory + WSTR = LarkParserBackendWalkStrategy + + def __init__(self, grammarResources: "InMemoryGrammarResources") -> None: + global NodeWithAttrChildren, ListNodes + + super().__init__(grammarResources) + + def terminalNodeToStr(self, token: "lark.nodes.RegexNode") -> typing.Optional[str]: + raise NotImplementedError + + def getSubTreeText(self, node: "lark.nodes.Node") -> str: + """Merges a tree of text tokens into a single string""" + raise NotImplementedError diff --git a/UniGrammarRuntime/backends/python/parglare.py b/UniGrammarRuntime/backends/python/parglare.py new file mode 100644 index 0000000..a190372 --- /dev/null +++ b/UniGrammarRuntime/backends/python/parglare.py @@ -0,0 +1,101 @@ +import typing +from pathlib import Path + +from UniGrammarRuntimeCore.IParser import IParser + +from ...DSLMetadata import DSLMetadata +from ...grammarClasses import GLR, LR +from ...IParser import IParserFactoryFromSource +from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy +from ...ToolMetadata import Product, ToolMetadata + +thisDir = Path(__file__).parent + +toolGitRepo = "https://github.com/igordejanovic/parglare" + + +class ParglareParser(IParser): + NAME = "parglare" + + __slots__ = ("parser",) + + def __init__(self, parser: "parglare.parser.Parser") -> None: + super().__init__() + self.parser = parser + + def __call__(self, s: 
diff --git a/UniGrammarRuntime/backends/python/parglare.py b/UniGrammarRuntime/backends/python/parglare.py
new file mode 100644
index 0000000..a190372
--- /dev/null
+++ b/UniGrammarRuntime/backends/python/parglare.py
@@ -0,0 +1,101 @@
+import typing
+from pathlib import Path
+
+from UniGrammarRuntimeCore.IParser import IParser
+
+from ...DSLMetadata import DSLMetadata
+from ...grammarClasses import GLR, LR
+from ...IParser import IParserFactoryFromSource
+from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy
+from ...ToolMetadata import Product, ToolMetadata
+
+thisDir = Path(__file__).parent
+
+toolGitRepo = "https://github.com/igordejanovic/parglare"
+
+
+class ParglareParser(IParser):
+	NAME = "parglare"
+
+	__slots__ = ("parser",)
+
+	def __init__(self, parser: "parglare.parser.Parser") -> None:
+		super().__init__()
+		self.parser = parser
+
+	def __call__(self, s: str):
+		return self.parser.parse(s)
+
+
+class ParglareParserFactory(IParserFactoryFromSource):
+	__slots__ = ()
+	parglare = None
+	PARSER_CLASS = ParglareParser
+	FORMAT = DSLMetadata(
+		officialLibraryRepo=toolGitRepo + "/tree/master/examples",
+		grammarExtensions=("pg", "pgt"),
+	)
+
+	META = ToolMetadata(
+		Product(
+			name="parglare",
+			website=toolGitRepo,
+		),
+		runtimeLib={
+			"python": toolGitRepo,
+		},
+		grammarClasses=(LR, GLR),
+		buildsTree=True,
+	)
+
+	@classmethod
+	def ensureInitialized(cls):
+		if cls.parglare is None:
+			import parglare  # pylint:disable=import-outside-toplevel,redefined-outer-name
+
+			cls.parglare = parglare
+
+	def __init__(self) -> None:
+		super().__init__()
+
+	def compileStr(self, grammarText: str, target: str = None, fileName: Path = None) -> "parglare.parser.Parser":
+		return self.__class__.parglare.Parser(self.__class__.parglare.Grammar.from_string(grammarText), ws="", debug=False)
+
+	def compileFile(self, grammarFile: Path, target: str = None):
+		return self.__class__.parglare.Parser(self.__class__.parglare.Grammar.from_file(grammarFile), ws="", debug=False)
+
+	def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any:
+		return self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target))
+
+
+class ParglareParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy):
+	__slots__ = ()
+
+	def iterateChildren(self, node):
+		if node is not None:
+			for elName in node._pg_attrs:
+				yield getattr(node, elName, None)
+
+	def isTerminal(self, node: str) -> bool:
+		return isinstance(node, str)
+
+	def iterateCollection(self, lst: typing.Any) -> typing.List[typing.Any]:
+		return lst
+
+	def isCollection(self, lst: typing.Any) -> bool:
+		return isinstance(lst, list)
+
+
+class ParglareParsingBackend(IParsingBackend):
+	__slots__ = ()
+	EX_CLASS = None
+	PARSER = ParglareParserFactory
+	WSTR = ParglareParserBackendWalkStrategy
+
+	def __init__(self, grammarResources: "InMemoryGrammarResources") -> None:
+		super().__init__(grammarResources)
+		if self.__class__.EX_CLASS is None:
+			self.__class__.EX_CLASS = self.__class__.PARSER.parglare.exceptions.ParseError
+
+	def terminalNodeToStr(self, token: typing.Optional[typing.Any]) -> typing.Optional[typing.Any]:
+		return token
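A minimal usage sketch, assuming `parglare` is installed; the grammar text is illustrative, and `ensureInitialized` is called explicitly here in case the base class does not do it:

	ParglareParserFactory.ensureInitialized()  # lazily imports parglare
	factory = ParglareParserFactory()
	parser = factory.fromInternal('S: A B;\nterminals\nA: "a";\nB: "b";')
	result = parser("ab")  # result structure is determined by parglare's default actions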
diff --git a/UniGrammarRuntime/backends/python/parsimonious.py b/UniGrammarRuntime/backends/python/parsimonious.py
new file mode 100644
index 0000000..66d23a2
--- /dev/null
+++ b/UniGrammarRuntime/backends/python/parsimonious.py
@@ -0,0 +1,149 @@
+import typing
+from collections import OrderedDict
+from pathlib import Path
+
+from UniGrammarRuntimeCore.IParser import IParser
+
+from ...DSLMetadata import DSLMetadata
+from ...grammarClasses import PEG
+from ...IParser import IParserFactoryFromSource
+from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy
+from ...ToolMetadata import Product, ToolMetadata
+from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin
+
+parsimonious = None
+NodeWithAttrChildren = None
+ListNodes = None
+
+
+toolGitRepo = "https://github.com/erikrose/parsimonious"
+
+
+class ParsimoniousParser(IParser):
+	__slots__ = ("parser",)
+
+	def __init__(self, parser: "parsimonious.grammar.Grammar") -> None:
+		super().__init__()
+		self.parser = parser
+
+	def __call__(self, s: str) -> "parsimonious.nodes.Node":
+		return self.parser.parse(s)
+
+
+class ParsimoniousParserFactory(IParserFactoryFromSource):
+	__slots__ = ()
+	PARSER_CLASS = ParsimoniousParser
+
+	FORMAT = DSLMetadata(
+		officialLibraryRepo=None,
+		grammarExtensions=("ppeg",),
+	)
+
+	META = ToolMetadata(
+		Product(
+			name="parsimonious",
+			website=toolGitRepo,
+		),
+		runtimeLib={
+			"python": toolGitRepo,
+		},
+		grammarClasses=(PEG,),
+		buildsTree=None,
+	)
+
+	parsimonious = None
+
+	@classmethod
+	def ensureInitialized(cls):
+		if cls.parsimonious is None:
+			import parsimonious  # pylint:disable=import-outside-toplevel,redefined-outer-name
+
+			cls.parsimonious = parsimonious
+
+	def compileStr(self, grammarText: str, target=None, fileName: Path = None) -> "parsimonious.grammar.Grammar":
+		return self.__class__.parsimonious.Grammar(grammarText)
+
+	def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any:
+		return self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target))
+
+
+def _transformParsimoniousAST(node: typing.Union["parsimonious.nodes.Node", "parsimonious.nodes.RegexNode"], capSchema: typing.Dict[str, typing.Dict[str, str]]) -> None:
+	"""Walks a parsimonious AST to make it friendlier for our processing:
+	1. Replaces lists of children with `ListLikeDict`s, using `expr_name`s as keys
+	2. Adds `__getattr__` to the nodes, looking attrs up in the dicts of children
+
+	All of this is needed because our postprocessing is attr-based.
+	"""
+
+	if not isinstance(node, ParsimoniousParserFactory.parsimonious.nodes.RegexNode):
+		if not isinstance(node.expr, ParsimoniousParserFactory.parsimonious.expressions.Quantifier):  # or (node.expr.min == 0 and node.expr.max == 1): # in the past this branch handled only ZeroOrMore and OneOrMore; parsimonious has since abstracted both into Quantifier (which also covers Optional), so was it a mistake not to handle Optional here too?
+			newChildren = OrderedDict()
+			for child in node.children:
+				childProdName = child.expr_name
+				_transformParsimoniousAST(child, capSchema)
+				nameToUse = None
+				if node.expr_name in capSchema:
+					thisElMapping = capSchema[node.expr_name]
+
+					if childProdName in thisElMapping:
+						nameToUse = thisElMapping[childProdName]  # recovered name
+
+				if nameToUse is None:
+					# we have to insert something
+					nameToUse = childProdName
+				newChildren[nameToUse] = child
+			node.children = ListLikeDict(newChildren)
+			node.__class__ = NodeWithAttrChildren
+		else:
+			for child in node.children:
+				_transformParsimoniousAST(child, capSchema)
+			node.__class__ = ListNodes
+
+
+class ParsimoniousParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy):
+	__slots__ = ()
+
+	def iterateChildren(self, node):
+		return node.children
+
+	def isTerminal(self, node):
+		return isinstance(node, self.parserFactory.parsimonious.nodes.RegexNode)
+
+	def iterateCollection(self, lst) -> typing.Any:
+		return lst.children
+
+	def isCollection(self, lst: typing.Any) -> bool:
+		return isinstance(lst.expr, (self.parserFactory.parsimonious.expressions.ZeroOrMore, self.parserFactory.parsimonious.expressions.OneOrMore))
+
+
+class ParsimoniousParsingBackend(IParsingBackend):
+	__slots__ = ("parser", "capSchema")
+	ITER_INTROSPECTION = True
+	CAP_INTROSPECTION = False
+	PARSER = ParsimoniousParserFactory
+	WSTR = ParsimoniousParserBackendWalkStrategy
+
+	def __init__(self, grammarResources: "InMemoryGrammarResources") -> None:
+		global NodeWithAttrChildren, ListNodes
+
+		super().__init__(grammarResources)
+		self.capSchema = grammarResources.capSchema
+
+		if NodeWithAttrChildren is None:
+
+			class NodeWithAttrChildren(self.__class__.PARSER.parsimonious.nodes.Node, NodeWithAttrChildrenMixin):  # pylint:disable=redefined-outer-name
+				__slots__ = ()
+
+			class ListNodes(self.__class__.PARSER.parsimonious.nodes.Node, ListNodesMixin):  # pylint:disable=redefined-outer-name,unused-variable
+				__slots__ = ()
+
+	def preprocessAST(self, ast):
+		_transformParsimoniousAST(ast, self.capSchema)
+		return ast
+
+	def terminalNodeToStr(self, token: "parsimonious.nodes.RegexNode") -> typing.Optional[str]:
+		return token.text
+
+	def getSubTreeText(self, node: "parsimonious.nodes.Node") -> str:
+		"""Merges a tree of text tokens into a single string"""
+		return node.text
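A minimal usage sketch, assuming `parsimonious` is installed; the grammar text is illustrative. Note that attribute access on nodes only appears after `ParsimoniousParsingBackend.preprocessAST` has rewritten the tree; the factory alone returns plain parsimonious nodes:

	ParsimoniousParserFactory.ensureInitialized()  # lazily imports parsimonious
	factory = ParsimoniousParserFactory()
	parser = factory.fromInternal('greeting = word " " word\nword = ~"[a-z]+"')
	node = parser("hello world")  # parsimonious.nodes.Node
	node.text  # 'hello world'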
diff --git a/UniGrammarRuntime/backends/regExps/__init__.py b/UniGrammarRuntime/backends/regExps/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/UniGrammarRuntime/backends/regExps/python.py b/UniGrammarRuntime/backends/regExps/python.py
new file mode 100644
index 0000000..bb521e3
--- /dev/null
+++ b/UniGrammarRuntime/backends/regExps/python.py
@@ -0,0 +1,89 @@
+import re
+import typing
+from pathlib import Path
+
+from UniGrammarRuntimeCore.IParser import IParser
+
+from ...DSLMetadata import DSLMetadata
+from ...grammarClasses import RegExp
+from ...IParser import IParserFactoryFromSource
+from ...IParsingBackend import IParsingBackend, ToolSpecificGrammarASTWalkStrategy
+from ...ToolMetadata import Product, ToolMetadata
+
+thisDir = Path(__file__).parent
+
+
+toolGitRepo = "https://github.com/python/cpython"
+
+
+class PythonRegExpParser(IParser):
+	NAME = "python_re"
+
+	__slots__ = ("parser",)
+
+	def __init__(self, parser: "re.Pattern") -> None:
+		super().__init__()
+		self.parser = parser
+
+	def __call__(self, s: str):
+		return self.parser.fullmatch(s)  # `re.Pattern` has no `exec` method (that is JavaScript); parsing the whole input maps to `fullmatch`
+
+
+class PythonRegExpParserFactory(IParserFactoryFromSource):
+	__slots__ = ()
+	PARSER_CLASS = PythonRegExpParser
+	FORMAT = DSLMetadata(
+		grammarExtensions=(),
+	)
+
+	META = ToolMetadata(
+		Product(
+			name="py_re",
+			website=toolGitRepo,
+		),
+		runtimeLib={
+			"python": toolGitRepo,
+		},
+		grammarClasses=(RegExp,),
+		buildsTree=True,
+	)
+
+	def compileStr(self, grammarText: str, target: str = None, fileName: Path = None) -> "re.Pattern":
+		return re.compile(grammarText)
+
+	def fromInternal(self, internalRepr: str, target: str = None) -> typing.Any:
+		return self.__class__.PARSER_CLASS(self.compileStr(internalRepr, target))
+
+
+class PythonRegExpParserBackendWalkStrategy(ToolSpecificGrammarASTWalkStrategy):
+	__slots__ = ()
+
+	def iterateChildren(self, node):
+		raise NotImplementedError
+
+	def isTerminal(self, node):
+		raise NotImplementedError
+
+	def iterateCollection(self, lst) -> typing.Any:
+		raise NotImplementedError
+
+	def isCollection(self, lst: typing.Any) -> bool:
+		raise NotImplementedError
+
+
+# copied from the parglare backend, not yet implemented
+class PythonRegExpParsingBackend(IParsingBackend):
+	__slots__ = ()
+	EX_CLASS = None
+	PARSER = PythonRegExpParserFactory
+	WSTR = PythonRegExpParserBackendWalkStrategy
+
+	def __init__(self, grammarResources: "InMemoryGrammarResources") -> None:
+		super().__init__(grammarResources)
+		raise NotImplementedError
+
+	def terminalNodeToStr(self, token: typing.Optional[typing.Any]) -> typing.Optional[typing.Any]:
+		raise NotImplementedError
+		#return token
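A minimal sketch of the part of this backend that already works, the factory plus parser wrapper; the pattern is illustrative:

	factory = PythonRegExpParserFactory()
	parser = factory.fromInternal(r"(?P<key>[a-z]+):(?P<value>\d+)")
	m = parser("a:1")   # re.Match on success, None otherwise
	m.group("value")    # '1'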
diff --git a/UniGrammarRuntime/backends/rust/pest.py b/UniGrammarRuntime/backends/rust/pest.py
new file mode 100644
index 0000000..80ba50d
--- /dev/null
+++ b/UniGrammarRuntime/backends/rust/pest.py
@@ -0,0 +1,41 @@
+import typing
+from collections import OrderedDict
+from pathlib import Path
+
+from UniGrammarRuntimeCore.IParser import IParser, IParserFactory
+
+from ...grammarClasses import PEG
+from ...IParser import IParserFactoryFromSource
+from ...IParsingBackend import IParsingBackend
+from ...ToolMetadata import Product, ToolMetadata
+from ...utils import ListLikeDict, ListNodesMixin, NodeWithAttrChildrenMixin
+
+
+class PestParser(IParser):
+	#NAME = "pest"
+	#EXT = "pest"
+
+	__slots__ = ("parser",)
+
+	def __init__(self, parser) -> None:
+		super().__init__()
+		self.parser = parser
+
+	def __call__(self, s: str):
+		return self.parser.parse(s)
+
+
+class PestParserFactory(IParserFactory):
+	__slots__ = ()
+	PARSER_CLASS = PestParser
+	META = ToolMetadata(
+		Product(
+			name="pest",
+			website="https://github.com/pest-parser/pest",
+		),
+		runtimeLib={
+			"rust": None,
+		},
+		grammarClasses=(PEG,),
+		buildsTree=False,
+	)
diff --git a/UniGrammarRuntime/benchmark.py b/UniGrammarRuntime/benchmark.py
new file mode 100644
index 0000000..accba63
--- /dev/null
+++ b/UniGrammarRuntime/benchmark.py
@@ -0,0 +1,263 @@
+import typing
+from collections import OrderedDict, defaultdict
+from functools import partial
+from math import sqrt
+
+
+class _BenchmarkMode:
+	__all__ = ()
+
+
+class BenchmarkModeMeta(type):
+	__slots__ = ()
+
+	def __new__(cls: typing.Type["BenchmarkModeMeta"], className: str, parents: typing.Tuple[type, ...], attrs: typing.Dict[str, typing.Any], *args, **kwargs) -> typing.Type["BenchmarkMode"]:
+		attrs = type(attrs)(attrs)
+		attrs["__all__"] = parents[0].__all__ + tuple(v for k, v in attrs.items() if k[0] != "_")
+		res = super().__new__(cls, className, parents, attrs, *args, **kwargs)
+		return res
+
+
+StatementIncompleteFuncT = typing.Callable[[typing.Any], None]
+SetupFuncT = typing.Callable[[], typing.Any]
+CriteriaFuncRetT = typing.Tuple[StatementIncompleteFuncT, SetupFuncT]
+CriteriaFuncT = typing.Callable[["InMemoryGrammarResources", str], CriteriaFuncRetT]
+CriteriaT = typing.Union[str, CriteriaFuncT]
+
+
+class BenchmarkMode(_BenchmarkMode, metaclass=BenchmarkModeMeta):
+	"""All the methods are static, but we cannot use @classmethod and @staticmethod because they cause problems with __name__;
+	also pylint considers the first arg to be `self`, so we disable `no-member`.
+	"""
+
+	# pylint:disable=no-self-argument,no-member
+
+	def parseRaw(grammarData: "InMemoryGrammarResources", backendName: str) -> CriteriaFuncRetT:
+		b = grammarData.getBackend(backendName)
+		return b.parse, lambda s: s
+
+	def preprocess(grammarData: "InMemoryGrammarResources", backendName: str) -> CriteriaFuncRetT:
+		b = grammarData.getBackend(backendName)
+		return b.preprocessAST, b.parse
+
+	def wrapper(grammarData: "InMemoryGrammarResources", backendName: str) -> CriteriaFuncRetT:
+		w = grammarData.getWrapper(backendName)
+		return w.__MAIN_PRODUCTION__, lambda s: w.backend.preprocessAST(w.backend.parse(s))
+
+
+def normalizeCriteria(criteria: typing.Iterable[CriteriaT]) -> typing.Tuple[typing.Iterable[str], typing.Iterable[CriteriaFuncT]]:
+	criteriaStr = []
+	criteriaFunc = []
+	for c in criteria:
+		if isinstance(c, str):
+			criteriaStr.append(c)
+			criteriaFunc.append(getattr(BenchmarkMode, c))
+		else:
+			criteriaStr.append(c.__name__)
+			criteriaFunc.append(c)
+	return tuple(criteriaStr), tuple(criteriaFunc)
+
+
+class _BenchmarkRecords:
+	__slots__ = ("root",)
+
+	NAME = None
+	DOWNSTREAM = None
+
+	def __init__(self, root):
+		self.root = root
+
+	def getIndexer(self):
+		return getattr(self.root, self.__class__.NAME)
+
+	def _getIndex(self, k):
+		return self.getIndexer()[k]
+
+	def _getItem(self, idx):
+		return self.__class__.DOWNSTREAM(self.root, self, idx)
+
+	def __getitem__(self, k):
+		return self._getItem(self._getIndex(k))
+
+	def __len__(self):
+		return len(self.getIndexer())
+
+	def __iter__(self):
+		return self.keys()
+
+	def items(self):
+		for k, idx in self.getIndexer().items():
+			yield k, self._getItem(idx)
+
+	def keys(self):
+		return iter(self.getIndexer().keys())
+
+	def values(self):
+		for idx in self.getIndexer().values():
+			yield self._getItem(idx)
+
+
+class RecordsLayer(_BenchmarkRecords):
+	__slots__ = ("parent", "index")
+
+	def __init__(self, root, parent, index):
+		super().__init__(root)
+		self.parent = parent
+		self.index = index
+
+	@property
+	def denormMatrix(self):
+		return self.parent.denormMatrix[self.index]
+
+
+class LastLayer(RecordsLayer):
+	__slots__ = ()
+
+	def _getItem(self, idx):
+		return self.denormMatrix[idx]
+
+
+class BenchmarkStatistics:
+	__slots__ = ("min", "max", "mean", "std", "iters", "repeats")
+
+	def __init__(self, min: float, max: float, mean: float, std: float, iters: int, repeats: int):  # pylint:disable=redefined-builtin
+		self.min = min
+		self.max = max
+		self.mean = mean
+		self.std = std
+		self.iters = iters
+		self.repeats = repeats
+
+	def __iter__(self):
+		for k in __class__.__slots__:  # pylint:disable=undefined-variable
+			yield getattr(self, k)
+
+	def toTuple(self):
+		return tuple(self)
+
+	def __repr__(self):
+		return self.__class__.__name__ + "(" + ", ".join(k + "=" + repr(getattr(self, k)) for k in __class__.__slots__) + ")"  # pylint:disable=undefined-variable
+
+	@classmethod
+	def fromSamples(cls, samples: typing.Sequence[float], iters: int) -> "BenchmarkStatistics":
+		repeats = len(samples)
+		mi = min(samples) / iters
+		ma = max(samples) / iters
+		me = sum(samples) / repeats
+		vari = sum(bt * bt for bt in samples) / repeats - me * me
+		std = sqrt(vari) / iters
+		me /= iters
+		return cls(mi, ma, me, std, iters, repeats)
+
+
+class BenchmarksPerCriteria(LastLayer):
+	__slots__ = ()
+	NAME = "criteria"
+
+	def _getItem(self, idx):
+		return BenchmarkStatistics(*super()._getItem(idx))
+
+
+class BenchmarksPerBackends(RecordsLayer):
+	__slots__ = ()
+	NAME = "backends"
+	DOWNSTREAM = BenchmarksPerCriteria
+
+
+class BenchmarkData(_BenchmarkRecords):
+	__slots__ = ("criteria", "backends", "testData", "denormMatrix")
+	NAME = "testData"
+	DOWNSTREAM = BenchmarksPerBackends
+
+	def __init__(self, criteria, backends: typing.Iterable[str], testData: typing.Iterable[str], denormMatrix: typing.Optional[typing.List[typing.List[typing.List[float]]]] = None) -> None:
+		self.criteria = OrderedDict((k, i) for i, k in enumerate(criteria))
+		self.backends = OrderedDict((k, i) for i, k in enumerate(backends))
+		self.testData = OrderedDict((k, i) for i, k in enumerate(testData))
+		if denormMatrix is None:
+			denormMatrix = [
+				[
+					[None for i in range(len(self.criteria))]
+					for j in range(len(self.backends))
+				]
+				for k in range(len(self.testData))
+			]
+		self.denormMatrix = denormMatrix
+		super().__init__(self)
+
+	def toNormalizedDict(self) -> typing.Mapping[str, typing.Any]:
+		return {
+			"criteria": tuple(self.criteria.keys()),
+			"backends": tuple(self.backends.keys()),
+			"testData": tuple(self.testData.keys()),
+			"matrix": self.denormMatrix
+		}
+
+	def aggregateMetrics(self, criteria: typing.Optional[str] = None, stat: str = "min"):
+		res = defaultdict(float)
+
+		for dPMetrics in self.values():
+			for backendName, backendMetricsPerCriteria in dPMetrics.items():
+				if criteria is not None:
+					res[backendName] += getattr(backendMetricsPerCriteria[criteria], stat)
+				else:
+					res[backendName] += sum(getattr(stats, stat) for stats in backendMetricsPerCriteria.values())
+		return tuple(res.items())
+
+	def getFastest(self, criteria: typing.Optional[str] = None):
+		return min(self.aggregateMetrics(criteria, stat="min"), key=lambda it: it[1])  # 1 is for the value
+
+	def getSorted(self, criteria: typing.Optional[str] = None, reverse: bool = False):
+		return sorted(self.aggregateMetrics(criteria, stat="min"), reverse=reverse, key=lambda it: it[1])  # 1 is for the value
+
+	@classmethod
+	def fromNormalizedDict(cls, d: typing.Mapping[str, typing.Any]) -> "BenchmarkData":
+		return cls(criteria=d["criteria"], backends=d["backends"], testData=d["testData"], denormMatrix=d["matrix"])
+
+
+def _benchmarkSingle(Timer, stmtIncomplete, setup, dataPiece, smallCount, timeBudget) -> BenchmarkStatistics:
+	stmtArg = setup(dataPiece)
+	stmt = partial(stmtIncomplete, stmtArg)
+
+	stmt()  # smoke-test the statement and warm it up
+
+	t = Timer(stmt=stmt)
+
+	smallTime = t.timeit(number=smallCount)
+	timePerIterPrelim = smallTime / smallCount
+	restItersCount = max(round((timeBudget - smallTime) / timePerIterPrelim), 1)  # guard: a too-small `timeBudget` must not yield a non-positive count
+
+	iters = round(sqrt(restItersCount))
+	repeats = restItersCount // iters
+
+	bigTimes = t.repeat(repeat=repeats, number=iters)
+
+	return BenchmarkStatistics.fromSamples(bigTimes, iters)
+
+
+def _reBenchmark(res, grammarData, smallCount, timeBudget, testData, backendNames, benchmarkModesFuncs):
+	from timeit import Timer  # pylint:disable=import-outside-toplevel
+
+	for backendIndex, backendName in enumerate(backendNames):
+		for modeIndex, benchmarkMode in enumerate(benchmarkModesFuncs):
+			stmtIncomplete, setup = benchmarkMode(grammarData, backendName)
+
+			for dataIndex, dataPiece in enumerate(testData):
+				res.denormMatrix[dataIndex][backendIndex][modeIndex] = _benchmarkSingle(Timer, stmtIncomplete, setup, dataPiece, smallCount, timeBudget).toTuple()
+
+
+def benchmark(grammarData: "InMemoryGrammarResources", testData: typing.Iterable[str], backendNames: typing.Iterable[str], timeBudget: float, benchmarkModes: typing.Iterable[CriteriaT], smallCount, prevRes=None):
+	if isinstance(testData, str):
+		testData = (testData,)
+	if benchmarkModes is None:
+		benchmarkModes = BenchmarkMode.__all__
+	elif callable(benchmarkModes) or isinstance(benchmarkModes, str):
+		benchmarkModes = (benchmarkModes,)
+
+	benchmarkModesStrs, benchmarkModesFuncs = normalizeCriteria(benchmarkModes)
+
+	if prevRes is None:
+		res = BenchmarkData(benchmarkModesStrs, backendNames, testData)
+	else:
+		raise NotImplementedError("Editing an existing result is not implemented yet")
+
+	_reBenchmark(res, grammarData, smallCount, timeBudget, testData, backendNames, benchmarkModesFuncs)
+
+	return res
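The benchmark flow in brief: `benchmark` builds a `BenchmarkData` cube over testData × backends × criteria, and `_benchmarkSingle` first times `smallCount` iterations to calibrate, then spends roughly `timeBudget` seconds split into `repeats` batches of `iters` iterations each. A hypothetical call; the `resources` object, backend names, and test strings are assumptions, not part of this file:

	data = benchmark(resources, testData=["a:1", "b:22"], backendNames=("parglare", "parsimonious"), timeBudget=2.0, benchmarkModes=("parseRaw", "wrapper"), smallCount=10)
	data.getFastest("parseRaw")               # (backendName, aggregated min time)
	data["a:1"]["parglare"]["parseRaw"].mean  # drill down: testData -> backend -> criterion -> BenchmarkStatistics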
diff --git a/UniGrammarRuntime/dslsMetadata.py b/UniGrammarRuntime/dslsMetadata.py
new file mode 100644
index 0000000..a7eeb9b
--- /dev/null
+++ b/UniGrammarRuntime/dslsMetadata.py
@@ -0,0 +1,11 @@
+from .DSLMetadata import DSLMetadata
+from .ToolMetadata import Product
+
+packrat = DSLMetadata(
+	officialLibraryRepo=None,
+	grammarExtensions=("peg",),
+	product=Product(
+		name="packrat",
+		website="https://bford.info/pub/lang/packrat-icfp02/",
+	),
+)
diff --git a/UniGrammarRuntime/grammarClasses.py b/UniGrammarRuntime/grammarClasses.py
new file mode 100644
index 0000000..c083cf8
--- /dev/null
+++ b/UniGrammarRuntime/grammarClasses.py
@@ -0,0 +1,81 @@
+import re
+import typing
+from abc import ABCMeta
+
+# LL(1) <= LL(2) <= LL(*) <= LR <= PEG <= GLR
+
+
+_grammarClassRx = None
+_registry = {}
+_infiniteCounts = {"*", "∞", "INF"}
+
+GrammarClassType = typing.Union["GrammarClass", typing.Type["GrammarClass"]]
+
+
+class GrammarClassMeta(ABCMeta):
+	def __new__(cls: typing.Type["GrammarClassMeta"], className: str, parents: typing.Iterable[typing.Type["GrammarClass"]], attrs: typing.Dict[str, typing.Any], *args, **kwargs) -> "GrammarClass":
+		res = super().__new__(cls, className, parents, attrs, *args, **kwargs)
+		_registry[className.upper()] = res  # keys are uppercased because `fromStr` normalizes its input with `.upper()`
+		return res
+
+
+class _GrammarClass(metaclass=GrammarClassMeta):
+	__slots__ = ()
+
+	@classmethod
+	def __le__(cls, other: GrammarClassType):  # was misspelled `__leq__`, a name Python never calls
+		return isinstance(other, cls) or issubclass(other, cls)
+
+	@classmethod
+	def fromStr(cls, s: str) -> GrammarClassType:
+		return _registry[s.upper()]
+
+
+class GrammarClass(_GrammarClass):
+	__slots__ = ("count",)
+
+	def __init__(self, count: typing.Optional[int]) -> None:
+		self.count = count
+
+	@classmethod
+	def fromStr(cls, s: str) -> GrammarClassType:
+		s = s.upper()
+		m = _grammarClassRx.match(s)
+		if not m:
+			raise KeyError("Unknown grammar class", s, list(_registry.keys()))
+		gc = super().fromStr(m.group(1))
+		count = m.group(2)
+		if count is not None:
+			if count in _infiniteCounts:
+				count = None
+			else:
+				count = int(count)
+			return gc(count)
+		return gc
+
+
+class RegExp(_GrammarClass):
+	__slots__ = ()
+
+
+class LL(GrammarClass):
+	__slots__ = ()
+
+
+class LR(LL):
+	__slots__ = ()
+
+
+class GLR(LR):
+	__slots__ = ()
+
+
+class LALR(LR):
+	__slots__ = ()
+
+
+class PEG(LR):
+	__slots__ = ()
+
+
+_grammarClassRx = re.compile("(" + "|".join(_registry.keys()) + r")(?:\((\d+|\*|∞|INF)\))?")
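With the registry keys normalized as above, `fromStr` accepts a class name with an optional lookahead count. A few examples of what it returns, as written:

	GrammarClass.fromStr("LL(1)")  # -> LL instance with count == 1
	GrammarClass.fromStr("ll(*)")  # case-insensitive; -> LL instance with count == None (unbounded)
	GrammarClass.fromStr("PEG")    # bare name, no count: returns the class itself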
diff --git a/UniGrammarRuntime/py.typed b/UniGrammarRuntime/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/UniGrammarRuntime/utils/__init__.py b/UniGrammarRuntime/utils/__init__.py
new file mode 100644
index 0000000..29f0a20
--- /dev/null
+++ b/UniGrammarRuntime/utils/__init__.py
@@ -0,0 +1,85 @@
+import typing
+from collections import OrderedDict
+from weakref import ref
+
+try:
+	from math import inf
+except ImportError:  # Python < 3.5
+	inf = float("inf")
+
+
+class AttrDict(dict):
+	__slots__ = ()
+
+	def __getattr__(self, k: str) -> typing.Any:
+		try:
+			return self[k]
+		except KeyError:
+			raise AttributeError(k)
+
+	def __dir__(self):
+		return super().__dir__() + list(self.keys())  # `dict_keys` cannot be concatenated to a list without conversion
+
+
+def flattenDictsIntoIterable(el) -> typing.Iterable:
+	if isinstance(el, dict):
+		for sel in el.values():
+			yield from flattenDictsIntoIterable(sel)
+	else:
+		yield el
+
+
+class ListLikeDict(OrderedDict):
+	"""An admittedly redundant and limited hack: an OrderedDict whose items can also be fetched by integer position and which iterates over its values, so it can stand in for a list of children."""
+
+	__slots__ = ("_list",)
+
+	def __init__(self, data: OrderedDict) -> None:
+		super().__init__(data)
+		self._list = list(super().keys())
+
+	def __getitem__(self, k: str) -> typing.Any:
+		if isinstance(k, int):
+			return self[self._list[k]]
+		return super().__getitem__(k)
+
+	def __iter__(self):
+		return iter(self.values())
+
+
+class ListLikeAttrDict(ListLikeDict):
+	__slots__ = ()
+
+	def __getattr__(self, k: str) -> typing.Any:
+		return self[k]
+
+
+def getPythonModule(fileText: str, fileName: str):
+	compiled = compile(fileText, fileName, "exec", optimize=2)
+	globalz = {}
+	exec(compiled, globalz)  # pylint:disable=exec-used
+	return globalz
+
+
+class NodeWithAttrChildrenMixin:
+	__slots__ = ()
+
+	def __getattr__(self, k):
+		try:
+			return self.children[k]
+		except KeyError:
+			raise AttributeError(k)
+
+
+class ListNodesMixin:
+	__slots__ = ()
+
+	def __iter__(self):
+		return iter(self.children)
+
+
+class TerminalNodeMixin:
+	__slots__ = ()
+
+	def __str__(self):
+		return self.children[0]
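A short sketch of the two container hacks, exactly as defined above:

	d = ListLikeDict(OrderedDict((("a", 1), ("b", 2))))
	d["a"]   # 1, by key
	d[0]     # 1, by position
	list(d)  # [1, 2]: iteration yields values, like a list of child nodes
	ad = AttrDict(x=1)
	ad.x     # 1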
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..c5bdbf7
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,42 @@
+[build-system]
+requires = ["setuptools>=61.2.0", "wheel", "setuptools_scm[toml]>=3.4.3"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "UniGrammarRuntime"
+authors = [{name = "KOLANICH"}]
+description = "The runtime for UniGrammar. Needed for auto-generated wrappers; may also be useful for handcrafted ones."
+readme = "ReadMe.md"
+keywords = ["grammars", "UniGrammar", "ANTLR", "CoCo/R", "parglare", "waxeye", "TatSu", "parsimonious", "YAML"]
+license = {text = "Unlicense"}
+classifiers = [
+	"Programming Language :: Python",
+	"Programming Language :: Python :: 3",
+	"Development Status :: 4 - Beta",
+	"Environment :: Other Environment",
+	"Intended Audience :: Developers",
+	"License :: Public Domain",
+	"Operating System :: OS Independent",
+	"Topic :: Software Development :: Libraries :: Python Modules",
+	"Topic :: Security",
+	"Topic :: Text Processing",
+]
+requires-python = ">=3.4"
+dependencies = [
+	"UniGrammarRuntimeCore",  # @ git+https://codeberg.org/UniGrammar/UniGrammarRuntimeCore.py.git
+	"urm",  # @ git+https://codeberg.org/KOLANICH/urm.py.git
+	"transformerz",  # @ git+https://codeberg.org/KOLANICH/transformerz.py.git
+]
+dynamic = ["version"]
+
+[project.urls]
+Homepage = "https://codeberg.org/UniGrammar/UniGrammarRuntime.py"
+
+[tool.setuptools]
+zip-safe = true
+include-package-data = true
+
+[tool.setuptools.packages]
+find = {namespaces = false}
+
+[tool.setuptools_scm]