Skip to content

Commit a42b08b

Browse files
authored
Cache results of static analysis (#15)
* Cache collected types in JSON files
* Use serializer protocol for serialization only and not for file IO.
* Include version in cache path. This will be useful in case of incompatibilities between different versions.
* Move `FileCache` into its own submodule
* Improve docstrings of `ImportSerializer`
1 parent 019e4e8 commit a42b08b

File tree

6 files changed

+387
-16
lines changed

6 files changed

+387
-16
lines changed

src/docstub/_analysis.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@
22

33
import builtins
44
import collections.abc
5+
import json
56
import logging
67
import re
78
import typing
8-
from dataclasses import dataclass
9+
from dataclasses import asdict, dataclass
910
from pathlib import Path
1011

1112
import libcst as cst
1213

13-
from ._utils import accumulate_qualname, module_name_from_path
14+
from ._utils import accumulate_qualname, module_name_from_path, pyfile_checksum
1415

1516
logger = logging.getLogger(__name__)
1617

@@ -260,6 +261,38 @@ def common_known_imports():
260261

261262

262263
class TypeCollector(cst.CSTVisitor):
264+
"""Collect types from a given Python file.
265+
266+
Examples
267+
--------
268+
>>> types = TypeCollector.collect(__file__)
269+
>>> types[f"{__name__}.TypeCollector"]
270+
<KnownImport 'from docstub._analysis import TypeCollector'>
271+
"""
272+
273+
class ImportSerializer:
    """Implements the `FuncSerializer` protocol to cache `TypeCollector.collect`.

    Results are stored as compact JSON, mapping each qualified name to the
    keyword arguments of the corresponding `KnownImport`.
    """

    suffix = ".json"
    encoding = "utf-8"

    def hash_args(self, path: Path) -> str:
        """Compute a unique hash from the path passed to `TypeCollector.collect`."""
        # `pyfile_checksum` computes a CRC-32 value; coerce to `str` so the
        # return value matches this method's annotated return type (the
        # `FuncSerializer` protocol promises a string key).
        key = str(pyfile_checksum(path))
        return key

    def serialize(self, data: dict[str, KnownImport]) -> bytes:
        """Serialize results from `TypeCollector.collect`."""
        # `KnownImport` is a dataclass, so `asdict` yields JSON-safe kwargs.
        primitives = {qualname: asdict(imp) for qualname, imp in data.items()}
        raw = json.dumps(primitives, separators=(",", ":")).encode(self.encoding)
        return raw

    def deserialize(self, raw: bytes) -> dict[str, KnownImport]:
        """Deserialize results from `TypeCollector.collect`."""
        primitives = json.loads(raw.decode(self.encoding))
        data = {qualname: KnownImport(**kw) for qualname, kw in primitives.items()}
        return data
295+
263296
@classmethod
264297
def collect(cls, file):
265298
"""Collect importable type annotations in given file.

src/docstub/_cache.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import logging
2+
from functools import cached_property
3+
from typing import Protocol
4+
5+
logger = logging.getLogger(__name__)
6+
7+
8+
# Content of the CACHEDIR.TAG marker file that identifies the cache directory
# to backup tools; see https://bford.info/cachedir/ for the convention.
CACHEDIR_TAG_CONTENT = (
    "Signature: 8a477f597d28d172789f06886806bc55\n"
    "# This file is a cache directory tag automatically created by docstub.\n"
    "# For information about cache directory tags see https://bford.info/cachedir/\n"
)
13+
14+
15+
def _directory_size(path):
16+
"""Estimate total size of a directory's content in bytes.
17+
18+
Parameters
19+
----------
20+
path : Path
21+
22+
Returns
23+
-------
24+
total_bytes : int
25+
Total size of all objects in bytes.
26+
"""
27+
if not path.is_dir():
28+
msg = f"{path} doesn't exist, can't determine size"
29+
raise FileNotFoundError(msg)
30+
files = path.rglob("*")
31+
total_bytes = sum(f.stat().st_size for f in files)
32+
return total_bytes
33+
34+
35+
def create_cache(path):
    """Create a cache directory.

    Besides creating the directory itself, this drops a ``CACHEDIR.TAG``
    file (see https://bford.info/cachedir/) and a ``.gitignore`` so that
    backup and version control tools skip the cache. Existing files are
    never overwritten, so repeated calls are safe.

    Parameters
    ----------
    path : Path
        Directory of the cache. The directory and its parents will be created
        if they don't exist yet.
    """
    path.mkdir(parents=True, exist_ok=True)

    cachedir_tag_path = path / "CACHEDIR.TAG"
    cachedir_tag_content = (
        "Signature: 8a477f597d28d172789f06886806bc55\n"
        "# This file is a cache directory tag automatically created by docstub.\n"
        "# For information about cache directory tags see https://bford.info/cachedir/\n"
    )
    if not cachedir_tag_path.is_file():
        cachedir_tag_path.write_text(cachedir_tag_content)

    gitignore_path = path / ".gitignore"
    gitignore_content = (
        "# This file is a cache directory tag automatically created by docstub.\n" "*\n"
    )
    if not gitignore_path.is_file():
        gitignore_path.write_text(gitignore_content)
62+
63+
64+
class FuncSerializer[T](Protocol):
    """Defines an interface to serialize and deserialize results of a function.

    This interface is used by `FileCache` to cache results of a function call
    to files on disk.

    Attributes
    ----------
    suffix :
        A suffix corresponding to the format of the serialized data, e.g. ".json".
    """

    suffix: str

    def hash_args(self, *args, **kwargs) -> str:
        """Compute a unique hash from the arguments passed to a function."""

    def serialize(self, data: T) -> bytes:
        """Serialize results of a function from `T` to bytes."""

    def deserialize(self, raw: bytes) -> T:
        """Deserialize results of a function from bytes back to `T`."""
85+
86+
87+
class FileCache:
    """Cache results from a function call as files on disk.

    A unique key is generated from the arguments to the function, and the
    result is cached inside a file named after this key.
    """

    def __init__(self, *, func, serializer, cache_dir, name):
        """
        Parameters
        ----------
        func : callable
            The function whose output shall be cached.
        serializer : FuncSerializer
            An interface that matches the given `func`. It must implement the
            `FuncSerializer` protocol.
        cache_dir : Path
            The directory of the cache.
        name : str
            A unique name to separate parallel caches inside `cache_dir`.
        """
        self.func = func
        self.serializer = serializer
        self._cache_dir = cache_dir
        self.name = name

    @cached_property
    def named_cache_dir(self):
        """Path to the named subdirectory inside the cache.

        Created (together with the cache directory itself) on first access.
        Warns when cache size exceeds 512 MiB.
        """
        cache_dir = self._cache_dir
        create_cache(cache_dir)
        if _directory_size(cache_dir) > 512 * 1024**2:
            logger.warning("cache size at %r exceeds 512 MiB", cache_dir)
        _named_cache_dir = cache_dir / self.name
        _named_cache_dir.mkdir(parents=True, exist_ok=True)
        return _named_cache_dir

    def __call__(self, *args, **kwargs):
        """Call the wrapped `func` and cache each result in a file.

        If a cached entry exists for the given arguments, it is deserialized
        and returned without calling `func`.
        """
        key = self.serializer.hash_args(*args, **kwargs)
        entry_path = self.named_cache_dir / f"{key}{self.serializer.suffix}"
        if entry_path.is_file():
            data = self.serializer.deserialize(entry_path.read_bytes())
        else:
            data = self.func(*args, **kwargs)
            raw = self.serializer.serialize(data)
            # "xb" fails if the entry already exists, so two concurrent
            # writers can't silently clobber each other's file.
            with entry_path.open("xb") as fp:
                fp.write(raw)
        return data

src/docstub/_cli.py

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import logging
22
import sys
3+
import time
4+
from contextlib import contextmanager
35
from pathlib import Path
46

57
import click
@@ -10,6 +12,7 @@
1012
TypeCollector,
1113
common_known_imports,
1214
)
15+
from ._cache import FileCache
1316
from ._config import Config
1417
from ._stubs import Py2StubTransformer, walk_source, walk_source_and_targets
1518
from ._version import __version__
@@ -26,7 +29,7 @@ def _load_configuration(config_path=None):
2629
2730
Returns
2831
-------
29-
config : dict[str, Any]
32+
config : ~.Config
3033
"""
3134
config = Config.from_toml(Config.DEFAULT_CONFIG_PATH)
3235

@@ -65,6 +68,58 @@ def _setup_logging(*, verbose):
6568
)
6669

6770

71+
def _build_import_map(config, source_dir):
    """Build a map of known imports.

    Parameters
    ----------
    config : ~.Config
    source_dir : Path

    Returns
    -------
    imports : dict[str, ~.KnownImport]
    """
    imports = common_known_imports()

    # Wrap the (expensive) static analysis in a disk cache keyed by file
    # content; the docstub version is part of the cache path so results from
    # incompatible versions never mix.
    cached_collect = FileCache(
        func=TypeCollector.collect,
        serializer=TypeCollector.ImportSerializer(),
        cache_dir=Path.cwd() / ".docstub_cache",
        name=f"{__version__}/collected_types",
    )
    for source_path in walk_source(source_dir):
        logger.info("collecting types in %s", source_path)
        imports.update(cached_collect(source_path))

    # Imports declared in the configuration take precedence.
    imports.update(KnownImport.many_from_config(config.known_imports))

    return imports
99+
100+
101+
@contextmanager
def report_execution_time():
    """Report wall-clock time spent inside the context (or decorated function).

    On exit, prints the elapsed time as ``[H h] [M min] S.SSS s``. Time is
    always reported, even if the wrapped code raises.
    """
    start = time.time()
    try:
        yield
    finally:
        stop = time.time()
        total_seconds = stop - start

        # Keep fractional seconds, but use integer minutes/hours so the
        # report reads "1 min 5.000 s" instead of "1.0 min 5.000 s".
        minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(int(minutes), 60)

        formatted_duration = f"{seconds:.3f} s"
        if minutes:
            formatted_duration = f"{minutes} min {formatted_duration}"
        if hours:
            formatted_duration = f"{hours} h {formatted_duration}"

        click.echo()
        click.echo(f"Finished in {formatted_duration}")
121+
122+
68123
@click.command()
69124
@click.version_option(__version__)
70125
@click.argument("source_dir", type=click.Path(exists=True, file_okay=False))
@@ -82,19 +137,13 @@ def _setup_logging(*, verbose):
82137
)
83138
@click.option("-v", "--verbose", count=True, help="Log more details.")
84139
@click.help_option("-h", "--help")
140+
@report_execution_time()
85141
def main(source_dir, out_dir, config_path, verbose):
86142
_setup_logging(verbose=verbose)
87143

88144
source_dir = Path(source_dir)
89145
config = _load_configuration(config_path)
90-
91-
# Build map of known imports
92-
known_imports = common_known_imports()
93-
for source_path in walk_source(source_dir):
94-
logger.info("collecting types in %s", source_path)
95-
known_imports_in_source = TypeCollector.collect(source_path)
96-
known_imports.update(known_imports_in_source)
97-
known_imports.update(KnownImport.many_from_config(config.known_imports))
146+
known_imports = _build_import_map(config, source_dir)
98147

99148
inspector = StaticInspector(
100149
source_pkgs=[source_dir.parent.resolve()], known_imports=known_imports

src/docstub/_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from functools import lru_cache
55
from pathlib import Path
66
from textwrap import indent
7+
from zlib import crc32
78

89
import click
910

@@ -105,6 +106,28 @@ def module_name_from_path(path):
105106
return name
106107

107108

109+
def pyfile_checksum(path):
    """Compute a unique key for a Python file.

    The key takes into account the given `path`, the relative position if the
    file is part of a Python package and the file's content.

    Parameters
    ----------
    path : Path

    Returns
    -------
    key : str
        CRC-32 checksum over the file's content, module name, and absolute
        path, formatted as a string.
    """
    module_name = module_name_from_path(path).encode()
    absolute_path = str(path.resolve()).encode()
    with open(path, "rb") as fp:
        content = fp.read()
    # `crc32` returns an int; format as `str` so the key matches the
    # documented return type and its downstream use as a cache file name.
    key = str(crc32(content + module_name + absolute_path))
    return key
return key
129+
130+
108131
@dataclasses.dataclass(kw_only=True, slots=True, frozen=True)
109132
class ContextFormatter:
110133
"""Format messages in context of a location in a file.

0 commit comments

Comments
 (0)