Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addition of cuDF and HoloViews for GPU acceleration #485

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Personal Changes to LUX
This branch of LUX is optimized for big data. It can run both the original LUX and a GPU-optimized version that uses cuDF with HoloViews as the plotting engine. It can achieve a 9X speedup on datasets with millions of rows (measured on an NVIDIA RTX A3000 Laptop GPU). To run the cuDF + HoloViews implementation, open main.ipynb and follow the comments there.

## Further Instructions:

1) Create a [<code>RAPIDS</code>](https://rapids.ai/start.html) environment containing cuDF (suggested RAPIDS version: 22.10). <br />
2) After installing the packages in requirements.txt, run the following commands: <br />
```bash
conda install -c conda-forge cartopy -y
pip install pyogrio
conda install -c pyviz spatialpandas
```
3) Intents are not currently supported.
<br />

<p align="center"><a href="#"><img width=60% alt="" src="https://github.com/lux-org/lux-resources/blob/master/readme_img/logo.png?raw=true"></a></p>
<h2 align="center">A Python API for Intelligent Visual Discovery</h2>

Expand Down
7 changes: 7 additions & 0 deletions global_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class backend:
    """Hold the name of the currently selected backend.

    The name is stored in the ``backend`` attribute and defaults to
    ``"pandas"``. Other modules in this change test for the string
    ``"holoviews"`` to select the cuDF + HoloViews code path.

    NOTE(review): those modules compare ``backend.set_back`` (the method
    object on the class) against a string, which is never equal. They
    presumably mean to read the ``backend`` attribute of a shared
    instance -- confirm and fix at the call sites.
    """

    def __init__(self) -> None:
        # Start on the pandas backend until a caller switches it.
        self.backend = "pandas"

    def set_back(self, type: str) -> str:
        """Store ``type`` as the active backend name and return it.

        Parameters
        ----------
        type : str
            Backend name to select (e.g. ``"pandas"`` or ``"holoviews"``).
            The value is stored as-is; no validation is performed.

        Returns
        -------
        str
            The backend name that was just stored.
        """
        # The parameter name shadows the builtin ``type``; it is kept
        # unchanged so existing keyword callers keep working.
        self.backend = type
        return self.backend
2 changes: 2 additions & 0 deletions lux/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@
# limitations under the License.

# Register the commonly used modules (similar to how pandas does it: https://github.com/pandas-dev/pandas/blob/master/pandas/__init__.py)
from global_backend import backend
from lux.vis.Clause import Clause
from lux.core.frame import LuxDataFrame
from lux.core.sqltable import LuxSQLTable
from lux.core.joinedsqltable import JoinedSQLTable
from lux.utils.tracing_utils import LuxTracer
from ._version import __version__, version_info
from lux._config import config

from lux._config.config import warning_format
from lux.utils.debug_utils import debug_info, check_luxwidget_enabled

Expand Down
14 changes: 11 additions & 3 deletions lux/action/column_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

import lux
from lux.interestingness.interestingness import interestingness
from lux.processor.Compiler import Compiler
Expand All @@ -20,7 +22,8 @@
from lux.vis.Vis import Vis
from lux.vis.VisList import VisList
import pandas as pd

from global_backend import backend
if backend.set_back =="holoviews": import cudf

def column_group(ldf):
recommendation = {
Expand All @@ -34,7 +37,12 @@ def column_group(ldf):
}
collection = []
ldf_flat = ldf
if isinstance(ldf.columns, pd.DatetimeIndex):
if backend.set_back !="holoviews":
date_ind = pd.DatetimeIndex
else:
date_ind = cudf.DatetimeIndex

if isinstance(ldf.columns, date_ind):
ldf_flat.columns = ldf_flat.columns.format()

# use a single shared ldf_flat so that metadata doesn't need to be computed for every vis
Expand All @@ -44,7 +52,7 @@ def column_group(ldf):
index_column_name = ldf.index.name
else:
index_column_name = "index"
if isinstance(ldf.columns, pd.DatetimeIndex):
if isinstance(ldf.columns, date_ind):
ldf.columns = ldf.columns.to_native_types()
for attribute in ldf.columns:
if ldf[attribute].dtype != "object" and (attribute != "index"):
Expand Down
5 changes: 4 additions & 1 deletion lux/action/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

import lux
from lux.interestingness.interestingness import interestingness
from lux.processor.Compiler import Compiler
from lux.core.frame import LuxDataFrame
from lux.vis.VisList import VisList
from lux.utils import utils

from global_backend import backend
if backend.set_back =="holoviews": import cudf

# change ignore_transpose to false for now.
def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
Expand Down
5 changes: 4 additions & 1 deletion lux/action/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

from lux.interestingness.interestingness import interestingness
import lux
from lux.executor.PandasExecutor import PandasExecutor
from lux.executor.SQLExecutor import SQLExecutor
import lux

from global_backend import backend
if backend.set_back =="holoviews": import cudf

def custom(ldf):
"""
Expand Down
2 changes: 1 addition & 1 deletion lux/action/generalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def generalize(ldf):
"""
# takes in a dataObject and generates a list of new dataObjects, each with a single measure from the original object removed
# --> return list of dataObjects with corresponding interestingness scores

output = []
excluded_columns = []
attributes = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf._intent))
Expand Down
56 changes: 38 additions & 18 deletions lux/action/temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.


import lux
from lux.vis.VisList import VisList
from lux.vis.Vis import Vis
import pandas as pd
from lux.core.frame import LuxDataFrame
from lux.interestingness.interestingness import interestingness
from lux.utils import utils

from global_backend import backend
if backend.set_back =="holoviews": import cudf

def temporal(ldf):
"""
Expand All @@ -41,11 +45,15 @@ def temporal(ldf):
}
for c in ldf.columns:
if ldf.data_type[c] == "temporal":
try:
if backend.set_back !="holoviews":
try:
generated_vis = create_temporal_vis(ldf, c)
vlist.extend(generated_vis)
except:
pass
else:
generated_vis = create_temporal_vis(ldf, c)
vlist.extend(generated_vis)
except:
pass

# If no temporal visualizations were generated via parsing datetime, fallback to default behavior.
if len(vlist) == 0:
Expand All @@ -69,7 +77,6 @@ def temporal(ldf):
recommendation["collection"] = vlist
return recommendation


def create_temporal_vis(ldf, col):
"""
Creates and populates Vis objects for different timescales in the provided temporal column.
Expand All @@ -86,43 +93,56 @@ def create_temporal_vis(ldf, col):
vlist : [Vis]
Collection of Vis objects.
"""
formatted_date = pd.to_datetime(ldf[col], format="%Y-%m-%d")

if backend.set_back !="holoviews":
formatted_date = pd.to_datetime(ldf[col], format="%Y-%m-%d")
year_type = pd.to_datetime(formatted_date.dt.year, format="%Y")
day_type = formatted_date.dt.day
month_type = formatted_date.dt.month
dow_type = formatted_date.dt.dayofweek
else:
formatted_date = cudf.to_datetime(ldf[col])
year_type = cudf.to_datetime(formatted_date.year)
day_type = formatted_date.day
month_type = formatted_date.month
dow_type = formatted_date.dayofweek

overall_vis = Vis([lux.Clause(col, data_type="temporal")], source=ldf, score=5)

year_col = col + " (year)"
year_df = LuxDataFrame({year_col: pd.to_datetime(formatted_date.dt.year, format="%Y")})

year_df = LuxDataFrame({year_col: year_type})

year_vis = Vis([lux.Clause(year_col, data_type="temporal")], source=year_df, score=4)

month_col = col + " (month)"
month_df = LuxDataFrame({month_col: formatted_date.dt.month})
month_df = LuxDataFrame({month_col: month_type})
month_vis = Vis(
[lux.Clause(month_col, data_type="temporal", timescale="month")], source=month_df, score=3
)

day_col = col + " (day)"
day_df = LuxDataFrame({day_col: formatted_date.dt.day})
day_df = LuxDataFrame({day_col: day_type})
day_df.set_data_type(
{day_col: "nominal"}
) # Since day is high cardinality 1-31, it can get recognized as quantitative
day_vis = Vis([lux.Clause(day_col, data_type="temporal", timescale="day")], source=day_df, score=2)

week_col = col + " (day of week)"
week_df = lux.LuxDataFrame({week_col: formatted_date.dt.dayofweek})
week_df = lux.LuxDataFrame({week_col: dow_type})
week_vis = Vis(
[lux.Clause(week_col, data_type="temporal", timescale="day of week")], source=week_df, score=1
)

unique_year_values = len(year_df[year_col].unique())
unique_month_values = len(month_df[month_col].unique())
unique_week_values = len(week_df[week_col].unique())
vlist = []

vlist.append(overall_vis)

if unique_year_values != 1:
vlist.append(year_vis)
if unique_month_values != 1:
vlist.append(month_vis)
if unique_week_values != 1:
vlist.append(week_vis)
return vlist
return vlist

5 changes: 3 additions & 2 deletions lux/action/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
from lux.vis.VisList import VisList
import lux
from lux.utils import utils

from global_backend import backend
#if backend.set_back =="holoviews": import cudf

def univariate(ldf, *args):
"""
Expand All @@ -36,7 +37,6 @@ def univariate(ldf, *args):
object with a collection of visualizations that result from the Distribution action.
"""
import numpy as np

if len(args) == 0:
data_type_constraint = "quantitative"
else:
Expand All @@ -61,6 +61,7 @@ def univariate(ldf, *args):
# Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
if len(ldf) < 5:
ignore_rec_flag = True

elif data_type_constraint == "nominal":
possible_attributes = [
c for c in ldf.columns if ldf.data_type[c] == "nominal" and c != "Number of Records"
Expand Down
28 changes: 21 additions & 7 deletions lux/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

import pandas as pd
from global_backend import backend
# If the backend is set to holoviews, override cuDF instead of pandas
if backend.set_back =="holoviews":
import cudf
overrideCudf=True
else:
overrideCudf=False

from .frame import LuxDataFrame
from .groupby import LuxDataFrameGroupBy, LuxSeriesGroupBy
from .series import LuxSeries

global originalDF
# Keep variable scope of original pandas df
originalDF = pd.core.frame.DataFrame
originalSeries = pd.core.series.Series

originalDF = pd.core.frame.DataFrame if backend.set_back !="holoviews" else cudf.core.dataframe.DataFrame
originalSeries = pd.core.series.Series if backend.set_back !="holoviews" else cudf.core.series.Series

def setOption(overridePandas=True):
if overridePandas:
def setOption(overridePandas=True,overrideCudf=False):
if overrideCudf:
cudf.DataFrame = cudf.core.dataframe.DataFrame = LuxDataFrame
cudf.Series = cudf.core.series.Series = LuxSeries
cudf.core.groupby.groupby.DataFrameGroupBy = LuxDataFrameGroupBy
cudf.core.groupby.groupby.SeriesGroupBy = LuxSeriesGroupBy
elif overridePandas:
pd.DataFrame = (
pd.io.json._json.DataFrame
) = (
Expand Down Expand Up @@ -69,5 +83,5 @@ def setOption(overridePandas=True):
pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF
pd.Series = originalSeries


setOption(overridePandas=True)
overridePandas=True
setOption(overridePandas, overrideCudf)
Loading