formatting

DavidBuchanan314 · Nov 15, 2024 · f02446c
1 parent ad56c83
commit f02446c
Show file tree

Hide file tree

Showing 9 changed files with 174 additions and 114 deletions.
diff --git a/atheris_fuzz.py b/atheris_fuzz.py
@@ -8,13 +8,17 @@
 
 ATJSON_MODE = True
 
+
 def TestOneInput(data):
 	try:
-		roundtrip = cbrrr.encode_dag_cbor(cbrrr.decode_dag_cbor(data, ATJSON_MODE), ATJSON_MODE)
+		roundtrip = cbrrr.encode_dag_cbor(
+			cbrrr.decode_dag_cbor(data, ATJSON_MODE), ATJSON_MODE
+		)
 	except:
 		return
-	if not ATJSON_MODE: # atjson does not roundtrip consistently!!!
-		assert(roundtrip==data)
+	if not ATJSON_MODE:  # atjson does not roundtrip consistently!!!
+		assert roundtrip == data
+
 
 atheris.Setup(sys.argv, TestOneInput)
 atheris.Fuzz()
diff --git a/car_parse_benchmark.py b/car_parse_benchmark.py
@@ -7,56 +7,64 @@
 
 sys.setrecursionlimit(99999999)
 
+
 # LEB128 (has not been strictly tested!)
 def parse_varint(stream):
 	n = 0
 	shift = 0
 	while True:
 		val = stream.read(1)[0]
-		n |= (val & 0x7f) << shift
+		n |= (val & 0x7F) << shift
 		if not val & 0x80:
 			return n
 		shift += 7
 
+
 enctime = 00
 dectime = 0
 
+
 def parse_car(stream, length):
 	global enctime
 	global dectime
 
 	header_len = parse_varint(stream)
 	header_bytes = stream.read(header_len)
-	assert(len(header_bytes) == header_len)
+	assert len(header_bytes) == header_len
 	car_header = decode_dag_cbor(header_bytes)
-	assert(car_header.get("version") == 1)
-	assert(len(car_header.get("roots", [])) == 1)
+	assert car_header.get("version") == 1
+	assert len(car_header.get("roots", [])) == 1
 
 	root = car_header["roots"][0]
 	nodes = {}
 
 	while stream.tell() != length:
 		block_len = parse_varint(stream)
-		cid = CID(stream.read(36)) # XXX: this needs to be parsed properly, length might not be 36
-		assert(cid.is_cidv1_dag_cbor_sha256_32()) # this is enough to validate atproto-flavoured CIDs
-
-		block_data = stream.read(block_len-36)
-		assert(len(block_data) == block_len-36)
-		#content_hash = hashlib.sha256(block_data).digest()
-		#assert(cid_raw.endswith(content_hash))
+		cid = CID(
+			stream.read(36)
+		)  # XXX: this needs to be parsed properly, length might not be 36
+		assert (
+			cid.is_cidv1_dag_cbor_sha256_32()
+		)  # this is enough to validate atproto-flavoured CIDs
+
+		block_data = stream.read(block_len - 36)
+		assert len(block_data) == block_len - 36
+		# content_hash = hashlib.sha256(block_data).digest()
+		# assert(cid_raw.endswith(content_hash))
 		start = time.time()
 		block = decode_dag_cbor(block_data, atjson_mode=ATJSON_MODE)
-		#block = libipld.decode_dag_cbor(block_data)
-		dectime += time.time()-start
+		# block = libipld.decode_dag_cbor(block_data)
+		dectime += time.time() - start
 		start = time.time()
 		roundtrip = encode_dag_cbor(block, atjson_mode=ATJSON_MODE)
-		enctime += time.time()-start
-		assert(block_data == roundtrip)
-		#print(block)
+		enctime += time.time() - start
+		assert block_data == roundtrip
+		# print(block)
 		nodes[cid] = block
-	
+
 	return root, nodes
 
+
 if __name__ == "__main__":
 	import sys
 
@@ -65,16 +73,16 @@ def parse_car(stream, length):
 
 	root, nodes = parse_car(io.BytesIO(car), len(car))
 
-	dec_speed = (len(car)/(1024*1024))/dectime
+	dec_speed = (len(car) / (1024 * 1024)) / dectime
 	print(f"Parsed {len(car)} bytes at {dec_speed:.2f}MB/s")
 
-	enc_speed = (len(car)/(1024*1024))/enctime
+	enc_speed = (len(car) / (1024 * 1024)) / enctime
 	print(f"Encoded {len(car)} bytes at {enc_speed:.2f}MB/s")
 
-	#start = time.time()
-	#libipld.decode_car(car)
-	#duration = time.time()-start
-	#car_speed = (len(car)/(1024*1024))/duration
-	#print(f"libipld.decode_car {len(car)} bytes at {car_speed:.2f}MB/s")
+	# start = time.time()
+	# libipld.decode_car(car)
+	# duration = time.time()-start
+	# car_speed = (len(car)/(1024*1024))/duration
+	# print(f"libipld.decode_car {len(car)} bytes at {car_speed:.2f}MB/s")
 
-	#print(nodes[root])
+	# print(nodes[root])
diff --git a/fuzz.py b/fuzz.py
@@ -1,27 +1,27 @@
 import cbrrr
 import os
 import gc
-#import sys
+# import sys
 
-#gc.collect()
-#print(len(gc.get_objects()))
+# gc.collect()
+# print(len(gc.get_objects()))
 gc.collect()
 prev_heap = {}
 
 for i in range(9999999999):
 	try:
-		#print(sys.getrefcount(cbrrr.CID))
-		res = cbrrr.decode_dag_cbor(os.urandom(1024), atjson_mode=i&1)
-		#print(res)
-	except Exception as e:
-		#print(e)
+		# print(sys.getrefcount(cbrrr.CID))
+		res = cbrrr.decode_dag_cbor(os.urandom(1024), atjson_mode=i & 1)
+		# print(res)
+	except Exception:# as e:
+		# print(e)
 		pass
-	if i%100000 == 0:
+	if i % 100000 == 0:
 		gc.collect()
 		print(len(gc.get_objects()))
 		if 1:
-			print("="*128)
-			this_heap = {id(x):x for x in gc.get_objects()}
+			print("=" * 128)
+			this_heap = {id(x): x for x in gc.get_objects()}
 			if id(prev_heap) in this_heap:
 				del this_heap[id(prev_heap)]
 			if prev_heap:
@@ -30,4 +30,4 @@
 						continue
 					print(this_heap[obj])
 			prev_heap = this_heap
-			print("="*128)
+			print("=" * 128)
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,3 +38,7 @@ fuzz = [
 [project.urls]
 Homepage = "https://github.com/DavidBuchanan314/dag-cbrrr"
 Issues = "https://github.com/DavidBuchanan314/dag-cbrrr/issues"
+
+
+[tool.ruff.format]
+indent-style = "tab"
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 		Extension(
 			"cbrrr._cbrrr",
 			sources=["src/cbrrr/_cbrrr.c"],
-			extra_compile_args=["-O3", "-Wall", "-Wextra", "-Wpedantic", "-std=c99"]
+			extra_compile_args=["-O3", "-Wall", "-Wextra", "-Wpedantic", "-std=c99"],
 		),
-	]
+	],
 )
diff --git a/src/cbrrr/__init__.py b/src/cbrrr/__init__.py
@@ -5,14 +5,17 @@
 
 CbrrrDecodeError = _cbrrr.CbrrrDecodeError
 
+
 class CID:
 	"""
 	This class is very minimal, intended to support atproto use cases and not
 	much else.
 	"""
 
+	# fmt: off
 	CIDV1_DAG_CBOR_SHA256_32_PFX = b"\x01\x71\x12\x20"
 	CIDV1_RAW_SHA256_32_PFX      = b"\x01\x55\x12\x20"
+	# fmt: on
 
 	__slots__ = ("cid_bytes",)
 
@@ -27,35 +30,29 @@ def __init__(self, cid_bytes: bytes) -> None:
 		and is_cidv1_raw_sha256_32() methods may be useful for this.
 		"""
 		self.cid_bytes = cid_bytes
-	
+
 	@classmethod
 	def cidv1_dag_cbor_sha256_32_from(cls, data: bytes) -> "CID":
-		return cls(
-			cls.CIDV1_DAG_CBOR_SHA256_32_PFX
-			+ hashlib.sha256(data).digest()
-		)
+		return cls(cls.CIDV1_DAG_CBOR_SHA256_32_PFX + hashlib.sha256(data).digest())
 
 	@classmethod
 	def cidv1_raw_sha256_32_from(cls, data: bytes) -> "CID":
-		return cls(
-			cls.CIDV1_RAW_SHA256_32_PFX
-			+ hashlib.sha256(data).digest()
-		)
-
+		return cls(cls.CIDV1_RAW_SHA256_32_PFX + hashlib.sha256(data).digest())
+
 	@classmethod
 	def decode(cls, data: Union[bytes, str]) -> "CID":
 		"""
 		Currently supported codecs: identity/raw, base32
 		"""
 
-		if type(data) is str:
+		if isinstance(data, str):
 			data = data.encode()
 
-		if data.startswith(b"\x00"): # identity multibase codec
+		if data.startswith(b"\x00"):  # identity multibase codec
 			return cls(data[1:])
 
-		if data.startswith(b"b"): # base32 multibase codec
-			data = data[1:] # strip prefix
+		if data.startswith(b"b"):  # base32 multibase codec
+			data = data[1:]  # strip prefix
 			if data.endswith(b"="):
 				raise ValueError("unexpected base32 padding")
 			# add back correct amount of padding (python is fussy)
@@ -67,8 +64,7 @@ def decode(cls, data: Union[bytes, str]) -> "CID":
 
 	def encode(self, base="base32") -> str:
 		if base == "base32":
-			return "b" + base64.b32encode(self.cid_bytes) \
-				.decode().lower().rstrip("=")
+			return "b" + base64.b32encode(self.cid_bytes).decode().lower().rstrip("=")
 		# this function might support other encodings in the future
 		raise ValueError("unsupported base encoding")
 
@@ -89,22 +85,22 @@ def __bytes__(self):
 
 	def __repr__(self):
 		return f"CID({self.encode()})"
-	
+
 	def __hash__(self) -> int:
 		return self.cid_bytes.__hash__()
-	
+
 	def __eq__(self, __value: object) -> bool:
 		if not isinstance(__value, CID):
 			return False
 		return self.cid_bytes == __value.cid_bytes
 
+
 # nb: | syntax not supported in <=py3.9
 DagCborTypes = Union[str, bytes, int, bool, float, CID, list, dict, None]
 
+
 def decode_dag_cbor(
-	data: bytes,
-	atjson_mode: bool=False,
-	cid_ctor: Callable[[bytes], Any]=CID
+	data: bytes, atjson_mode: bool = False, cid_ctor: Callable[[bytes], Any] = CID
 ) -> DagCborTypes:
 	"""
 	Decode DAG-CBOR bytes into python objects.
@@ -119,10 +115,9 @@ def decode_dag_cbor(
 		raise ValueError("did not parse to end of buffer")
 	return parsed
 
+
 def decode_multi_dag_cbor_in_violation_of_the_spec(
-	data: bytes,
-	atjson_mode: bool=False,
-	cid_ctor: Callable[[bytes], Any]=CID
+	data: bytes, atjson_mode: bool = False, cid_ctor: Callable[[bytes], Any] = CID
 ) -> Iterator[DagCborTypes]:
 	"""
 	https://ipld.io/specs/codecs/dag-cbor/spec/#strictness
@@ -137,12 +132,11 @@ def decode_multi_dag_cbor_in_violation_of_the_spec(
 		parsed, length = _cbrrr.decode_dag_cbor(view[offset:], cid_ctor, atjson_mode)
 		yield parsed
 		offset += length
-	assert(offset == len(data)) # should never fail!
+	assert offset == len(data)  # should never fail!
+
 
 def encode_dag_cbor(
-	obj: DagCborTypes,
-	atjson_mode: bool=False,
-	cid_type: Type=CID
+	obj: DagCborTypes, atjson_mode: bool = False, cid_type: Type = CID
 ) -> bytes:
 	"""
 	Encode python objects to DAG-CBOR bytes.
@@ -153,6 +147,7 @@ def encode_dag_cbor(
 	"""
 	return _cbrrr.encode_dag_cbor(obj, cid_type, atjson_mode)
 
+
 __all__ = [
 	"CbrrrDecodeError",
 	"CID",

diff --git a/src/cbrrr/_cbrrr.pyi b/src/cbrrr/_cbrrr.pyi
@@ -3,8 +3,7 @@ from typing import Type, TypeVar, Tuple, Callable, Any
 CbrrrDecodeErrorType = TypeVar("CbrrrDecodeErrorType", bound=ValueError)
 CbrrrDecodeError: CbrrrDecodeErrorType
 
-def decode_dag_cbor(buf: bytes, cid_ctor: Callable[[bytes], Any], atjson_mode: bool) -> Tuple[Any, int]:
-	...
-
-def encode_dag_cbor(obj: Any, cid_type: Type, atjson_mode: bool) -> bytes:
-	...
+def decode_dag_cbor(
+	buf: bytes, cid_ctor: Callable[[bytes], Any], atjson_mode: bool
+) -> Tuple[Any, int]: ...
+def encode_dag_cbor(obj: Any, cid_type: Type, atjson_mode: bool) -> bytes: ...