Skip to content

Commit

Permalink
fix: Histogram chart not able to use decimal datatype column (#30416)
Browse files: browse the repository at this point in the history
(cherry picked from commit 4834390)
  • Loading branch information
michael-s-molina authored and sadpandajoe committed Sep 30, 2024
1 parent ca5ed8b commit f743ae3
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ export default function buildQuery(formData: HistogramFormData) {
return buildQueryContext(formData, baseQueryObject => [
{
...baseQueryObject,
extras: { where: `${column} IS NOT NULL` },
columns: [...groupby, column],
post_processing: [histogramOperator(formData, baseQueryObject)],
metrics: undefined,
Expand Down
14 changes: 9 additions & 5 deletions superset/utils/pandas_postprocessing/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from __future__ import annotations

import numpy as np
from pandas import DataFrame, Series
from pandas import DataFrame, Series, to_numeric


# pylint: disable=too-many-arguments
Expand Down Expand Up @@ -48,12 +48,15 @@ def histogram(
if groupby is None:
groupby = []

# check if the column is numeric
if not np.issubdtype(df[column].dtype, np.number):
raise ValueError(f"The column '{column}' must be numeric.")
# convert to numeric, coercing errors to NaN
df[column] = to_numeric(df[column], errors="coerce")

# check if the column contains non-numeric values
if df[column].isna().any():
raise ValueError(f"Column '{column}' contains non-numeric values")

# calculate the histogram bin edges
bin_edges = np.histogram_bin_edges(df[column].dropna(), bins=bins)
bin_edges = np.histogram_bin_edges(df[column], bins=bins)

# convert the bin edges to strings
bin_edges_str = [
Expand All @@ -62,6 +65,7 @@ def histogram(
]

def hist_values(series: Series) -> np.ndarray:
# we might have NaN values as the result of grouping so we need to drop them
result = np.histogram(series.dropna(), bins=bin_edges)[0]
return result if not cumulative else np.cumsum(result)

Expand Down
28 changes: 10 additions & 18 deletions tests/unit_tests/pandas_postprocessing/test_histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,28 +117,20 @@ def test_histogram_with_groupby_and_cumulative_and_normalize():

def test_histogram_with_non_numeric_column():
try:
histogram(data, "b", ["group"], bins)
histogram(data, "group", None, bins)
except ValueError as e:
assert str(e) == "The column 'b' must be numeric."
assert str(e) == "Column 'group' contains non-numeric values"


# test histogram ignore null values
def test_histogram_ignore_null_values():
data_with_null = DataFrame(
def test_histogram_with_some_non_numeric_values():
data_with_non_numeric = DataFrame(
{
"group": ["A", "A", "B", "B", "A", "A", "B", "B", "A", "A"],
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
"b": [1, 2, 3, 4, 5, 6, 7, 8, 9, None],
"a": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
"b": [1, 2, 3, 4, 5, 6, 7, 8, 9, "10"],
}
)
result = histogram(data_with_null, "a", ["group"], bins)
assert result.shape == (2, bins + 1)
assert result.columns.tolist() == [
"group",
"1 - 2",
"2 - 4",
"4 - 5",
"5 - 7",
"7 - 9",
]
assert result.values.tolist() == [["A", 2, 0, 1, 1, 1], ["B", 0, 2, 0, 1, 1]]
try:
histogram(data_with_non_numeric, "a", ["group"], bins)
except ValueError as e:
assert str(e) == "Column 'group' contains non-numeric values"

0 comments on commit f743ae3

Please sign in to comment.