Initial prototype improving malloc invalidation efficiency

sternj · sternj · commit 9ed9e680d888 · 2024-11-24T16:42:14.000-05:00
diff --git a/.github/workflows/test-smoketests.yml b/.github/workflows/test-smoketests.yml
@@ -51,6 +51,9 @@ jobs:
     - name: decorator smoke test
       run: python test/smoketest_profile_decorator.py
 
+    - name: line invalidation test
+      run: python test/test_line_invalidation.py
+
     # Note: This test doesn't need to read an output,
     # it is meant to determine if there is an ImportError
     # or anything related if relative imports are used.
diff --git a/scalene/scalene_profiler.py b/scalene/scalene_profiler.py
@@ -546,7 +546,7 @@ def malloc_signal_handler(
             ByteCodeIndex(f.f_lasti),
         ]
         Scalene.__alloc_sigq.put([0])
-        pywhere.enable_settrace()
+        pywhere.enable_settrace(this_frame)
         del this_frame
 
     @staticmethod
diff --git a/src/source/pywhere.cpp b/src/source/pywhere.cpp
@@ -268,6 +268,15 @@ static void allocate_newline() {
 
 static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
                       PyObject* arg) {
+  if (what == PyTrace_CALL || what == PyTrace_C_CALL) {
+    PyThreadState* tstate = PyThreadState_Get();
+    frame->f_trace_lines = 0;
+    frame->f_trace = NULL;
+    #if PY_VERSION_HEX < 0x030c0000
+    tstate->cframe->use_tracing = 0;
+    #endif
+   
+  }
   if (what != PyTrace_LINE) {
     return 0;
   }
@@ -297,7 +306,6 @@ static int trace_func(PyObject* obj, PyFrameObject* frame, int what,
   // Needed because decref will be called in on_stack
   Py_INCREF(frame);
   if (on_stack(last_fname_s, lineno_l, static_cast<PyFrameObject*>(frame))) {
-    frame->f_trace_lines = 0;
     return 0;
   }
 
@@ -370,7 +378,14 @@ static PyObject* depopulate_struct(PyObject* self, PyObject* args) {
 }
 
 static PyObject* enable_settrace(PyObject* self, PyObject* args) {
+  PyObject* frame;
+  if (!PyArg_ParseTuple(args, "O", &frame)) {
+    return NULL;
+  }
+  PyFrameObject* frame_obj = (PyFrameObject*) frame;
   PyEval_SetTrace(trace_func, NULL);
+  // frame_obj->f_trace = &trace_func;
+  frame_obj->f_trace_lines = 1;
   Py_RETURN_NONE;
 }
 
@@ -389,7 +404,7 @@ static PyMethodDef EmbMethods[] = {
     {"print_files_to_profile", print_files_to_profile, METH_NOARGS,
      "printing for debug"},
     //  {"return_buffer", return_buffer, METH_NOARGS, ""},
-    {"enable_settrace", enable_settrace, METH_NOARGS, ""},
+    {"enable_settrace", enable_settrace, METH_VARARGS, ""},
     {"disable_settrace", disable_settrace, METH_NOARGS, ""},
     {"populate_struct", populate_struct, METH_NOARGS, ""},
     {"depopulate_struct", depopulate_struct, METH_NOARGS, ""},
diff --git a/test/line_attribution_tests/line_after_final_alloc.py b/test/line_attribution_tests/line_after_final_alloc.py
@@ -0,0 +1,14 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(10485767 * 2) 
+        
+    print(len(accum))
+
+    asdf = bytes(2 * 10485767)
+    some_dead_line = None
+
+    
+if __name__ == '__main__':
+    main()
diff --git a/test/line_attribution_tests/loop_below_threshold.py b/test/line_attribution_tests/loop_below_threshold.py
@@ -0,0 +1,13 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(10485767 // 4) # far below the allocation sampling window
+
+    print(len(accum))
+
+    asdf = bytes(2 * 10485767)
+
+    
+if __name__ == '__main__':
+    main()
diff --git a/test/line_attribution_tests/loop_with_multiple_lines.py b/test/line_attribution_tests/loop_with_multiple_lines.py
@@ -0,0 +1,13 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767) # 2x the allocation sampling window
+        bogus = None
+    print(len(accum))
+
+    asdf = bytes(2 * 10485767)
+
+    
+if __name__ == '__main__':
+    main()
diff --git a/test/line_attribution_tests/loop_with_one_alloc.py b/test/line_attribution_tests/loop_with_one_alloc.py
@@ -0,0 +1,13 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767) # 2x the allocation sampling window
+        
+    print(len(accum))
+
+    asdf = bytes(2 * 10485767)
+
+    
+if __name__ == '__main__':
+    main()
diff --git a/test/line_attribution_tests/loop_with_two_allocs.py b/test/line_attribution_tests/loop_with_two_allocs.py
@@ -0,0 +1,12 @@
+
+def main():
+    accum = bytes()
+    for i in range(31):
+        accum += bytes(2 * 10485767) + bytes(2 * 10485767) # 2x the allocation sampling window
+        
+    print(len(accum))
+
+    asdf = bytes(2 * 10485767)
+
+if __name__ == '__main__':
+    main()
diff --git a/test/test_line_invalidation.py b/test/test_line_invalidation.py
@@ -0,0 +1,104 @@
+"""
+This is bound very closely to the current implementation of 
+the tests in `test/line_attribution_tests. 
+
+The two things that matter are the number of loops, the size
+of the allocations, and the exact line numbers. 
+
+
+"""
+
+expected_md5_sums = {
+    "line_attribution_tests/loop_below_threshold.py": "7664a7dcc0f4ab94a44e431448b5f348",
+    "line_attribution_tests/loop_with_one_alloc.py": "da9ff0aa223123c956049e940c3ef093",
+    "line_attribution_tests/loop_with_multiple_lines.py": "48ce0e8693fe43b1ebb7eb75a0fd5832",
+    "line_attribution_tests/loop_with_two_allocs.py": "71f337140aa25383525e56a6167cabf8",
+    "line_attribution_tests/line_after_final_alloc.py": "ca8cdd44ea6e4a9c286c05facae6a721"
+}
+
+import subprocess
+import tempfile
+import sys
+from typing import List
+from pathlib import Path
+from hashlib import md5
+from scalene.scalene_json import ScaleneJSONSchema
+
+N_LOOPS = 31
+LOOP_ALLOC_LINENO = 5  #
+OUT_OF_LOOP_ALLOC_LINENO = 9
+
+def check_for_changes():
+    errors = []
+    for fname, expected_sum in expected_md5_sums.items():
+        with open(fname, 'rb') as f:
+            digest = md5(f.read()).hexdigest()
+        if digest != expected_sum:
+            errors.append(fname)
+    assert len(errors) == 0, f'Detected change in file(s) {','.join(errors)}'
+
+def get_line(scalene_profile: ScaleneJSONSchema, lineno: int):
+    files = list(scalene_profile.files.keys())
+    assert len(files) == 1
+    filename = files[0]
+    return scalene_profile.files[filename].lines[lineno - 1]
+
+
+
+
+def get_profile(test_stem, outdir_p, test_dir):
+    proc = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "scalene",
+            "--cli", 
+            "--json",
+            "--outfile",
+            outdir_p / f"{test_stem}.json",
+            test_dir / f"{test_stem}.py",
+        ],
+        capture_output=True,
+        check=True,
+    )
+    with open(outdir_p / f"{test_stem}.json", "r") as f:
+        profile = ScaleneJSONSchema.model_validate_json(f.read())
+    return (test_stem, profile)
+
+
+def main():
+    test_dir = Path(__file__).parent / "line_attribution_tests"
+    with tempfile.TemporaryDirectory() as outdir:
+        outdir_p = Path(outdir)
+        one_alloc = get_profile("loop_with_one_alloc", outdir_p, test_dir)
+        two_on_one_line = get_profile("loop_with_two_allocs", outdir_p, test_dir)
+        below_threshold = get_profile("loop_below_threshold", outdir_p, test_dir)
+        line_after_final_alloc = get_profile(
+            "line_after_final_alloc", outdir_p, test_dir
+        )
+    errors = []
+    for stem, profile in [one_alloc, two_on_one_line, line_after_final_alloc]:
+        line = get_line(profile, LOOP_ALLOC_LINENO)
+        if not line.n_mallocs == N_LOOPS:
+            errors.append(f"Expected {N_LOOPS} distinct lines on {stem}, got {line.n_mallocs} on {LOOP_ALLOC_LINENO}")
+
+    bt_stem, bt_prof = below_threshold
+    bt_line = get_line(bt_prof, LOOP_ALLOC_LINENO)
+    if not bt_line.n_mallocs < N_LOOPS:
+       errors.append(f"{bt_stem} makes smaller allocations than the allocation sampling window, so fewer than {N_LOOPS} allocations on {LOOP_ALLOC_LINENO} should be reported. Got {bt_line.n_mallocs}")
+
+    for stem, profile in [one_alloc, two_on_one_line, below_threshold, line_after_final_alloc]:
+        line = get_line(profile, OUT_OF_LOOP_ALLOC_LINENO)
+        if not line.n_mallocs == 1:
+            errors.append(f'Line {OUT_OF_LOOP_ALLOC_LINENO} in {stem} makes a large allocation, so it should be reported.')
+    
+    if len(errors) > 0:
+        for error in errors:
+            print(f'ERROR: {error}')
+        exit(1)
+    else:
+        print("PASS")
+        exit(0)
+    
+if __name__ == '__main__':
+    main()

Original file line number	Diff line number	Diff line change
`@@ -546,7 +546,7 @@ def malloc_signal_handler(`
`546`	`546`	`ByteCodeIndex(f.f_lasti),`
`547`	`547`	`]`
`548`	`548`	`Scalene.__alloc_sigq.put([0])`
`549`		`- pywhere.enable_settrace()`
	`549`	`+ pywhere.enable_settrace(this_frame)`
`550`	`550`	`del this_frame`
`551`	`551`
`552`	`552`	`@staticmethod`