From 9411528f6edd0ab65a77c0f9877cfe8f9cc3b8d2 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sat, 22 Aug 2020 12:57:34 -0500 Subject: [PATCH 1/2] RC update --- README.md | 11 +++++++++-- mypy.ini | 3 --- setup.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c35b1d1..f7715fc 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,6 @@ As you can see, the 8-bit integer array decreases the memory usage by 87.5%. ### 2. Appropriate machine representation -**🚧 This feature is in progress** - Compressio uses visions to infer the semantic type of data and coerce it into alternative computational representations which minimize memory impact while maintaining it's semantic meaning. @@ -127,3 +125,12 @@ The key insights from this analysis are: - The size of the Series is _not_ decisive for the string representation choice. You can find the full analysis [here](examples/notebooks/pandas%20string%20type%20analysis.ipynb). + +## Gotcha's +Compression has some obvious limitations. +- Overflow: dropping precision can lead to overflow. 
+(TODO: Mitigate by specifying bandwidth) +- Compatibility: we cannot expect all libs to be compatible with Sparse, RLE + for example observed must be set to True with spars: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html +Another example of that [here]( +https://pythonspeed.com/articles/numpy-memory-footprint/#when-these-strategies-wont-work) \ No newline at end of file diff --git a/mypy.ini b/mypy.ini index e1330ba..7576216 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,6 +17,3 @@ ignore_missing_imports = True [mypy-pint] ignore_missing_imports = True - -[mypy-visions] -ignore_missing_imports = True diff --git a/setup.py b/setup.py index 9f86c79..dbaab79 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name="compressio", - version="0.0.4", + version="0.1.0", url="https://github.com/dylan-profiler/compressio", description="compressio", author="Ian Eaves, Simon Brugman", From cec48df79b77c26ef335a058911e1f8ca6f7ddb6 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Mon, 24 Aug 2020 01:03:42 +0200 Subject: [PATCH 2/2] Update README.md --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f7715fc..3cfd0f9 100644 --- a/README.md +++ b/README.md @@ -127,10 +127,13 @@ The key insights from this analysis are: You can find the full analysis [here](examples/notebooks/pandas%20string%20type%20analysis.ipynb). ## Gotcha's -Compression has some obvious limitations. -- Overflow: dropping precision can lead to overflow. 
-(TODO: Mitigate by specifying bandwidth) -- Compatibility: we cannot expect all libs to be compatible with Sparse, RLE - for example observed must be set to True with spars: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html -Another example of that [here]( -https://pythonspeed.com/articles/numpy-memory-footprint/#when-these-strategies-wont-work) \ No newline at end of file + +Compressing DataFrames can be helpful in many situations, but not all. +Be mindful of how to apply it in the following cases: + +- _Overflow_: compression by dropping precision can lead to overflows if the array is manipulated afterwards. +This can be an issue, for instance, for [numpy integers](https://mortada.net/can-integer-operations-overflow-in-python.html). In case this is a problem for your application, you can explicitly choose a precision. + +- _Compatibility_: other libraries may make different decisions on how to handle your compressed data. +One example where code needs to be adjusted to the compressed data is when the sparse data structure is used in combination with [`.groupby`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html) (`observed` must be set to `True`). +This [article](https://pythonspeed.com/articles/numpy-memory-footprint/#when-these-strategies-wont-work) provides another example: scikit-image, which for some functions immediately converts a given array to a float64 dtype.