Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrote protobuf generation scripts in Python #12527

Merged
merged 14 commits into from
Sep 19, 2024
Merged
2 changes: 2 additions & 0 deletions requirements-tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ ruff==0.5.4 # must match .pre-commit-config.yaml

# Libraries used by our various scripts.
aiohttp==3.10.2
grpcio-tools
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sure there's a minimal version of protoc that is shipped with grpcio-tools that we should be using, but I can't recall off the top of my head and would have to search through past PRs to find what it was.

mypy-protobuf==3.6.0
packaging==24.1
pathspec>=0.11.1
pre-commit
Expand Down
66 changes: 66 additions & 0 deletions scripts/sync_protobuf/_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from __future__ import annotations

import subprocess
import sys
from http.client import HTTPResponse
from pathlib import Path
from typing import TYPE_CHECKING, Iterable
from urllib.request import urlopen
Copy link
Collaborator Author

@Avasam Avasam Aug 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm purposefully avoiding requests here, as to not add requests and types-requests in requirements-tests.txt

from zipfile import ZipFile

import tomlkit
from mypy_protobuf.main import ( # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs]
__version__ as mypy_protobuf__version__,
)

if TYPE_CHECKING:
from _typeshed import StrOrBytesPath, StrPath

REPO_ROOT = Path(__file__).absolute().parent.parent.parent
MYPY_PROTOBUF_VERSION = mypy_protobuf__version__


def download_file(url: str, destination: StrPath) -> None:
    """Download `url` into the file at `destination`.

    Raises:
        RuntimeError: If the server responds with a non-200 status code.
        urllib.error.URLError: On network-level failures (raised by `urlopen`).
    """
    print(f"Downloading '{url}' to '{destination}'")
    # Context manager guarantees the HTTP connection is closed even if
    # the status check or the file write raises.
    with urlopen(url) as resp:
        if resp.getcode() != 200:
            raise RuntimeError(f"Error downloading {url}")
        with open(destination, "wb") as file:
            file.write(resp.read())


def extract_archive(archive_path: StrPath, destination: StrPath) -> None:
    """Unpack the zip archive at `archive_path` into the `destination` folder."""
    print(f"Extracting '{archive_path}' to '{destination}'")
    archive = ZipFile(archive_path)
    try:
        archive.extractall(destination)
    finally:
        archive.close()


def update_metadata(metadata_folder: StrPath, new_extra_description: str) -> None:
    """Replace the `extra_description` entry in `metadata_folder`'s METADATA.toml."""
    metadata_path = Path(metadata_folder) / "METADATA.toml"
    document = tomlkit.loads(metadata_path.read_text())
    document["extra_description"] = new_extra_description
    # tomlkit.dumps has partially unknown return type
    metadata_path.write_text(tomlkit.dumps(document))  # pyright: ignore[reportUnknownMemberType]
    print(f"Updated {metadata_path}")


def run_protoc(
    proto_paths: Iterable[StrPath], mypy_out: StrPath, proto_globs: Iterable[str], cwd: StrOrBytesPath | None = None
) -> str:
    """Run protoc (bundled with grpcio-tools) with the mypy-protobuf plugin.

    Args:
        proto_paths: Directories passed as `--proto_path`, used to resolve proto imports.
        mypy_out: Folder where the generated `*_pb2.pyi` stubs are written.
        proto_globs: The `.proto` files (or globs) to generate stubs for.
        cwd: Working directory for the protoc invocation; globs and relative
            proto paths are resolved against it.

    Returns:
        The version string reported by `protoc --version`.
    """
    # check=True so a missing or broken grpc_tools install fails loudly here,
    # instead of silently producing an empty version string.
    protoc_version = (
        subprocess.run([sys.executable, "-m", "grpc_tools.protoc", "--version"], capture_output=True, check=True)
        .stdout.decode()
        .strip()
    )
    print()
    print(protoc_version)
    protoc_args = [
        *[f"--proto_path={proto_path}" for proto_path in proto_paths],
        "--mypy_out",
        f"relax_strict_optional_primitives:{mypy_out}",
        *proto_globs,
    ]
    print("Running: protoc\n " + "\n ".join(protoc_args) + "\n")
    subprocess.run((sys.executable, "-m", "grpc_tools.protoc", *protoc_args), cwd=cwd, check=True)
    return protoc_version
91 changes: 91 additions & 0 deletions scripts/sync_protobuf/google_protobuf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Generates the protobuf stubs for the given protobuf version using mypy-protobuf.
Generally, new minor versions are a good time to update the stubs.
"""

from __future__ import annotations

import json
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

from _helpers import MYPY_PROTOBUF_VERSION, REPO_ROOT, download_file, extract_archive, run_protoc, update_metadata

# Whenever you update PACKAGE_VERSION here, version should be updated
# in stubs/protobuf/METADATA.toml and vice-versa.
PACKAGE_VERSION = "27.1"

# Where the generated stubs live, and where the protobuf release is fetched from.
STUBS_FOLDER = REPO_ROOT / "stubs" / "protobuf"
ARCHIVE_FILENAME = f"protobuf-{PACKAGE_VERSION}.zip"
ARCHIVE_URL = f"https://github.com/protocolbuffers/protobuf/releases/download/v{PACKAGE_VERSION}/{ARCHIVE_FILENAME}"
# Name of the top-level directory inside the release archive.
EXTRACTED_PACKAGE_DIR = f"protobuf-{PACKAGE_VERSION}"

# NOTE(review): VERSION_PATTERN looks copied from the s2clientprotocol sync script
# and appears unused in this module — confirm and consider removing.
VERSION_PATTERN = re.compile(r'def game_version\(\):\n return "(.+?)"')
# Matches proto target names such as "//:descriptor_proto" in BUILD.bazel.
PROTO_FILE_PATTERN = re.compile(r'"//:(.*)_proto"')


def extract_python_version(file_path: Path) -> str:
    """Extract the Python version from https://github.com/protocolbuffers/protobuf/blob/main/version.json"""
    with open(file_path) as file:
        data: dict[str, dict[str, dict[str, str]]] = json.load(file)
    # The root key will be the protobuf source code version
    version = next(iter(data.values()))["languages"]["python"]
    # Runtime validation: this comes from an outside source, so make sure the
    # structure of version.json hasn't changed (e.g. the value becoming a dict).
    assert isinstance(version, str)
    return version
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to see some validation of the version, considering it's coming from an outside source. Something like:

Suggested change
data: dict[str, dict[str, dict[str, str]]] = json.load(file)
# The root key will be the protobuf source code version
return next(iter(data.values()))["languages"]["python"]
data = json.load(file)
# The root key will be the protobuf source code version
version = next(iter(data.values()))["languages"]["python"]
assert isinstance(version, str)
assert re.fullmatch(r"...", version) # proper re here
return version

This way we're also sure (at runtime) that version has the correct type and format.

Copy link
Collaborator Author

@Avasam Avasam Sep 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like validating the version string is unnecessary extra work. If they somehow write an invalid Python version, our script doesn't need to fail. We're not doing anything with it other than displaying it. Proper validation should probably use a Python packaging library (I don't remember which).
The str assertion I still find valuable in case protobuf changes the structure of that file and the value becomes an object (dict).



def extract_proto_file_paths(temp_dir: Path) -> list[str]:
    """
    Roughly reproduce the subset of .proto files on the public interface
    as described in py_proto_library calls in
    https://github.com/protocolbuffers/protobuf/blob/main/python/dist/BUILD.bazel
    """
    bazel_build_file = temp_dir / EXTRACTED_PACKAGE_DIR / "python" / "dist" / "BUILD.bazel"
    proto_files: list[str] = []
    with open(bazel_build_file) as file:
        for line in file:
            match = re.search(PROTO_FILE_PATTERN, line)
            if match is None:
                continue
            # "compiler_plugin" targets live under the compiler/ subdirectory.
            stem = match.group(1).replace("compiler_", "compiler/")
            proto_files.append(EXTRACTED_PACKAGE_DIR + "/src/google/protobuf/" + stem + ".proto")
    return proto_files


def main() -> None:
    """Fetch the protobuf release, regenerate the stubs, and refresh METADATA.toml."""
    temp_dir = Path(tempfile.mkdtemp())
    try:
        # Fetch protobuf (which contains all the .proto files)
        archive_path = temp_dir / ARCHIVE_FILENAME
        download_file(ARCHIVE_URL, archive_path)
        extract_archive(archive_path, temp_dir)

        # Remove existing pyi
        for old_stub in STUBS_FOLDER.rglob("*_pb2.pyi"):
            old_stub.unlink()

        protoc_version = run_protoc(
            proto_paths=(f"{EXTRACTED_PACKAGE_DIR}/src",),
            mypy_out=STUBS_FOLDER,
            proto_globs=extract_proto_file_paths(temp_dir),
            cwd=temp_dir,
        )

        python_protobuf_version = extract_python_version(temp_dir / EXTRACTED_PACKAGE_DIR / "version.json")
    finally:
        # Cleanup even on failure: this is a temp dir, but it can still grow fast if run multiple times
        shutil.rmtree(temp_dir)

    update_metadata(
        STUBS_FOLDER,
        f"""Partially generated using \
[mypy-protobuf=={MYPY_PROTOBUF_VERSION}](https://github.com/nipunn1313/mypy-protobuf/tree/v{MYPY_PROTOBUF_VERSION}) \
and {protoc_version} on \
[protobuf v{PACKAGE_VERSION}](https://github.com/protocolbuffers/protobuf/releases/tag/v{PACKAGE_VERSION}) \
(python `protobuf=={python_protobuf_version}`).""",
    )

    # Run pre-commit to cleanup the stubs
    subprocess.run((sys.executable, "-m", "pre_commit", "run", "--files", *STUBS_FOLDER.rglob("*_pb2.pyi")))


if __name__ == "__main__":
main()
72 changes: 72 additions & 0 deletions scripts/sync_protobuf/s2clientprotocol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Generates the protobuf stubs for the given s2clientprotocol version using mypy-protobuf.
Generally, new minor versions are a good time to update the stubs.
"""

from __future__ import annotations

import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

from _helpers import MYPY_PROTOBUF_VERSION, REPO_ROOT, download_file, extract_archive, run_protoc, update_metadata

# Whenever you update PACKAGE_VERSION here, version should be updated
# in stubs/s2clientprotocol/METADATA.toml and vice-versa.
PACKAGE_VERSION = "c04df4adbe274858a4eb8417175ee32ad02fd609"

STUBS_FOLDER = REPO_ROOT / "stubs" / "s2clientprotocol"
ARCHIVE_FILENAME = f"{PACKAGE_VERSION}.zip"
ARCHIVE_URL = f"https://github.com/Blizzard/s2client-proto/archive/{ARCHIVE_FILENAME}"
EXTRACTED_PACKAGE_DIR = f"s2client-proto-{PACKAGE_VERSION}"

VERSION_PATTERN = re.compile(r'def game_version\(\):\n return "(.+?)"')


def extract_python_version(file_path: Path) -> str:
    """Extract Python version from s2clientprotocol's build file"""
    build_source = file_path.read_text()
    version_match = VERSION_PATTERN.search(build_source)
    assert version_match
    return version_match.group(1)


def main() -> None:
    """Fetch s2client-proto, regenerate the stubs, and refresh METADATA.toml."""
    temp_dir = Path(tempfile.mkdtemp())
    try:
        # Fetch s2clientprotocol (which contains all the .proto files)
        archive_path = temp_dir / ARCHIVE_FILENAME
        download_file(ARCHIVE_URL, archive_path)
        extract_archive(archive_path, temp_dir)

        # Remove existing pyi
        for old_stub in STUBS_FOLDER.rglob("*_pb2.pyi"):
            old_stub.unlink()

        protoc_version = run_protoc(
            proto_paths=(EXTRACTED_PACKAGE_DIR,),
            mypy_out=STUBS_FOLDER,
            proto_globs=(f"{EXTRACTED_PACKAGE_DIR}/s2clientprotocol/*.proto",),
            cwd=temp_dir,
        )

        python_s2_client_proto_version = extract_python_version(
            temp_dir / EXTRACTED_PACKAGE_DIR / "s2clientprotocol" / "build.py"
        )
    finally:
        # Cleanup even on failure: this is a temp dir, but it can still grow fast if run multiple times
        shutil.rmtree(temp_dir)

    update_metadata(
        STUBS_FOLDER,
        f"""Partially generated using \
[mypy-protobuf=={MYPY_PROTOBUF_VERSION}](https://github.com/nipunn1313/mypy-protobuf/tree/v{MYPY_PROTOBUF_VERSION}) \
and {protoc_version} on \
[s2client-proto {python_s2_client_proto_version}](https://github.com/Blizzard/s2client-proto/tree/{PACKAGE_VERSION}).""",
    )

    # Run pre-commit to cleanup the stubs
    subprocess.run((sys.executable, "-m", "pre_commit", "run", "--files", *STUBS_FOLDER.rglob("*_pb2.pyi")))


if __name__ == "__main__":
main()
137 changes: 137 additions & 0 deletions scripts/sync_protobuf/tensorflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""
Generates the protobuf stubs for the given tensorflow version using mypy-protobuf.
Generally, new minor versions are a good time to update the stubs.
"""

from __future__ import annotations

import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

from _helpers import MYPY_PROTOBUF_VERSION, REPO_ROOT, download_file, extract_archive, run_protoc, update_metadata

# Whenever you update PACKAGE_VERSION here, version should be updated
# in stubs/tensorflow/METADATA.toml and vice-versa.
PACKAGE_VERSION = "2.17.0"

STUBS_FOLDER = REPO_ROOT / "stubs" / "tensorflow"
ARCHIVE_FILENAME = f"v{PACKAGE_VERSION}.zip"
ARCHIVE_URL = f"https://github.com/tensorflow/tensorflow/archive/refs/tags/{ARCHIVE_FILENAME}"
EXTRACTED_PACKAGE_DIR = f"tensorflow-{PACKAGE_VERSION}"

PROTOS_TO_REMOVE = (
"compiler/xla/autotune_results_pb2.pyi",
"compiler/xla/autotuning_pb2.pyi",
"compiler/xla/service/buffer_assignment_pb2.pyi",
"compiler/xla/service/hlo_execution_profile_data_pb2.pyi",
"core/protobuf/autotuning_pb2.pyi",
"core/protobuf/conv_autotuning_pb2.pyi",
"core/protobuf/critical_section_pb2.pyi",
"core/protobuf/eager_service_pb2.pyi",
"core/protobuf/master_pb2.pyi",
"core/protobuf/master_service_pb2.pyi",
"core/protobuf/replay_log_pb2.pyi",
"core/protobuf/tpu/compile_metadata_pb2.pyi",
"core/protobuf/worker_pb2.pyi",
"core/protobuf/worker_service_pb2.pyi",
"core/util/example_proto_fast_parsing_test_pb2.pyi",
)
"""
These protos exist in a folder with protos used in python,
but are not included in the python wheel.
They are likely only used for other language builds.
stubtest was used to identify them by looking for ModuleNotFoundError.
(comment out ".*_pb2.*" from the allowlist)
"""

TSL_IMPORT_PATTERN = re.compile(r"(\[|\s)tsl\.")
XLA_IMPORT_PATTERN = re.compile(r"(\[|\s)xla\.")


def post_creation() -> None:
    """Move third-party and fix imports"""
    print()
    # Can't use shutil.move because it can't merge existing directories.
    for source, target in (
        (f"{STUBS_FOLDER}/tsl", f"{STUBS_FOLDER}/tensorflow/tsl"),
        (f"{STUBS_FOLDER}/xla", f"{STUBS_FOLDER}/tensorflow/compiler/xla"),
    ):
        print(f"Moving '{source}' to '{target}'")
        shutil.copytree(source, target, dirs_exist_ok=True)
        shutil.rmtree(source)

    for path in STUBS_FOLDER.rglob("*_pb2.pyi"):
        print(f"Fixing imports in '{path}'")
        contents = path.read_text()
        # Rewrite absolute tsl./xla. imports to their new location under tensorflow.
        contents = TSL_IMPORT_PATTERN.sub("\\1tensorflow.tsl.", contents)
        contents = XLA_IMPORT_PATTERN.sub("\\1tensorflow.compiler.xla.", contents)
        path.write_text(contents)

    print()
    for to_remove in PROTOS_TO_REMOVE:
        stub_path = STUBS_FOLDER / "tensorflow" / to_remove
        os.remove(stub_path)
        print(f"Removed '{stub_path}'")


def main() -> None:
    """Fetch the tensorflow sources, regenerate the stubs, and refresh METADATA.toml."""
    temp_dir = Path(tempfile.mkdtemp())
    try:
        # Fetch tensorflow (which contains all the .proto files)
        archive_path = temp_dir / ARCHIVE_FILENAME
        download_file(ARCHIVE_URL, archive_path)
        extract_archive(archive_path, temp_dir)

        # Remove existing pyi
        for old_stub in STUBS_FOLDER.rglob("*_pb2.pyi"):
            old_stub.unlink()

        protoc_version = run_protoc(
            proto_paths=(
                f"{EXTRACTED_PACKAGE_DIR}/third_party/xla/third_party/tsl",
                f"{EXTRACTED_PACKAGE_DIR}/third_party/xla",
                f"{EXTRACTED_PACKAGE_DIR}",
            ),
            mypy_out=STUBS_FOLDER,
            proto_globs=(
                f"{EXTRACTED_PACKAGE_DIR}/third_party/xla/xla/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/third_party/xla/xla/service/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/core/example/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/core/framework/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/core/protobuf/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/core/protobuf/tpu/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/core/util/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/tensorflow/python/keras/protobuf/*.proto",
                f"{EXTRACTED_PACKAGE_DIR}/third_party/xla/third_party/tsl/tsl/protobuf/*.proto",
            ),
            cwd=temp_dir,
        )
    finally:
        # Cleanup even on failure: this is a temp dir, but it can still grow fast if run multiple times
        shutil.rmtree(temp_dir)

    post_creation()

    # Note: fixed a stray unmatched ")" that was previously emitted at the end
    # of this description.
    update_metadata(
        STUBS_FOLDER,
        f"""Partially generated using \
[mypy-protobuf=={MYPY_PROTOBUF_VERSION}](https://github.com/nipunn1313/mypy-protobuf/tree/v{MYPY_PROTOBUF_VERSION}) \
and {protoc_version} on `tensorflow=={PACKAGE_VERSION}`.""",
    )

    # Run pre-commit to cleanup the stubs
    subprocess.run((sys.executable, "-m", "pre_commit", "run", "--files", *STUBS_FOLDER.rglob("*_pb2.pyi")))


if __name__ == "__main__":
main()
Loading
Loading