From e8494701890d306aef7dbe1b523e2b53eec03831 Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Mon, 18 Dec 2023 16:21:13 -0500
Subject: [PATCH 1/6] add groupby and docs
---
src/DataFramesMeta.jl | 1 +
src/macros.jl | 79 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 80 insertions(+)
diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl
index a16cca5a..1f42a9f3 100644
--- a/src/DataFramesMeta.jl
+++ b/src/DataFramesMeta.jl
@@ -21,6 +21,7 @@ export @with,
@distinct, @rdistinct, @distinct!, @rdistinct!,
@eachrow, @eachrow!,
@byrow, @passmissing, @astable, @kwarg,
+ @groupby,
@based_on, @where # deprecated
const DOLLAR = raw"$"
diff --git a/src/macros.jl b/src/macros.jl
index 055cb4a2..d9ecce98 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -3008,3 +3008,82 @@ macro rename!(x, args...)
esc(rename!_helper(x, args...))
end
+function groupby_helper(df, args...)
+ cols = map(get_column_expr, args)
+ if any(isnothing, cols)
+ throw(ArgumentError("All inputs to @groupby must be valid column selectors"))
+ end
+
+ :($groupby($df, $make_source_concrete($reduce($vcat, $(Expr(:tuple, cols...))))))
+end
+
+function groupby_helper(df, arg)
+ if arg isa Expr && arg.head == :block
+ argsvec = MacroTools.rmlines(arg).args
+ return groupby_helper(df, argsvec...)
+ end
+ col = get_column_expr(arg)
+ if isnothing(col)
+ throw(ArgumentError("All inputs to @groupby must be valid column selectors"))
+ end
+ :($groupby($df, $col))
+end
+
+
+"""
+ groupby(df, args...)
+
+Group a data frame by columns. Similar
+
+```
+groupby(df, [args...])
+```
+
+but with a few convenience features.
+
+## Details
+
+`@groupby` does not perform any transformations or allow the
+generation of new columns. New column generation must be done
+before `@groupby` is called.
+
+Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol`
+and `String` inputs, such that `@groupby df :A $DOLLAR"B"`
+is supported. However integers cannot be mixed with strings or symbols.
+`@groupby(df, :A, 1)` fails.
+
+To use vectors as a single argument for grouping, escaping with `$DOLLAR`
+must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`.
+This behavior ensures consistency with other DataFramesMeta.jl macros.
+
+`@groupby` automatically concatenates together multiple inputs such that mixing
+vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]`
+
+`@groupby` also allows for the "block" style of DataFramesMeta.jl macros,
+as in
+
+```
+@grouby df begin
+ :A
+ :B
+end
+```
+
+## Examples
+```julia-repl
+julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]);
+julia> @groupby df :A;
+julia> @groupby df :A :B;
+julia> @groupby df $DOLLAR[:A, :B];
+julia> @groupby df begin
+ :A
+ :B
+ end;
+julia> @groupby df :A $DOLLAR[:B, :C];
+```
+
+"""
+macro groupby(df, args...)
+ esc(groupby_helper(df, args...))
+end
+
From acfda82bb095c0d78deb009cfb8555eb032bbb31 Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Wed, 20 Dec 2023 13:01:15 -0500
Subject: [PATCH 2/6] implementation
---
src/macros.jl | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/macros.jl b/src/macros.jl
index d9ecce98..6fdc1119 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -3009,12 +3009,19 @@ macro rename!(x, args...)
end
function groupby_helper(df, args...)
- cols = map(get_column_expr, args)
+ cols = map(args) do a
+ if a isa Expr && a.head == :call && a.args[1] in (:All, :Between, :Cols, :Not)
+ a
+ else
+ get_column_expr(a)
+ end
+ end
if any(isnothing, cols)
throw(ArgumentError("All inputs to @groupby must be valid column selectors"))
end
- :($groupby($df, $make_source_concrete($reduce($vcat, $(Expr(:tuple, cols...))))))
+ t = Expr(:tuple, cols...)
+ :($groupby($df, ($Cols($t...))))
end
function groupby_helper(df, arg)
From e2bfcfac0e15b00c4be02b44d75d6eed165e60f8 Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Fri, 22 Dec 2023 12:33:09 -0500
Subject: [PATCH 3/6] whatever
---
src/macros.jl | 87 ---------------------------------------------------
1 file changed, 87 deletions(-)
diff --git a/src/macros.jl b/src/macros.jl
index 6fdc1119..d36393f7 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -3007,90 +3007,3 @@ julia> @rename!(df, :new1 = $DOLLAR("old_col" * "1"), :new2 = :old_col2)
macro rename!(x, args...)
esc(rename!_helper(x, args...))
end
-
-function groupby_helper(df, args...)
- cols = map(args) do a
- if a isa Expr && a.head == :call && a.args[1] in (:All, :Between, :Cols, :Not)
- a
- else
- get_column_expr(a)
- end
- end
- if any(isnothing, cols)
- throw(ArgumentError("All inputs to @groupby must be valid column selectors"))
- end
-
- t = Expr(:tuple, cols...)
- :($groupby($df, ($Cols($t...))))
-end
-
-function groupby_helper(df, arg)
- if arg isa Expr && arg.head == :block
- argsvec = MacroTools.rmlines(arg).args
- return groupby_helper(df, argsvec...)
- end
- col = get_column_expr(arg)
- if isnothing(col)
- throw(ArgumentError("All inputs to @groupby must be valid column selectors"))
- end
- :($groupby($df, $col))
-end
-
-
-"""
- groupby(df, args...)
-
-Group a data frame by columns. Similar
-
-```
-groupby(df, [args...])
-```
-
-but with a few convenience features.
-
-## Details
-
-`@groupby` does not perform any transformations or allow the
-generation of new columns. New column generation must be done
-before `@groupby` is called.
-
-Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol`
-and `String` inputs, such that `@groupby df :A $DOLLAR"B"`
-is supported. However integers cannot be mixed with strings or symbols.
-`@groupby(df, :A, 1)` fails.
-
-To use vectors as a single argument for grouping, escaping with `$DOLLAR`
-must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`.
-This behavior ensures consistency with other DataFramesMeta.jl macros.
-
-`@groupby` automatically concatenates together multiple inputs such that mixing
-vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]`
-
-`@groupby` also allows for the "block" style of DataFramesMeta.jl macros,
-as in
-
-```
-@grouby df begin
- :A
- :B
-end
-```
-
-## Examples
-```julia-repl
-julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]);
-julia> @groupby df :A;
-julia> @groupby df :A :B;
-julia> @groupby df $DOLLAR[:A, :B];
-julia> @groupby df begin
- :A
- :B
- end;
-julia> @groupby df :A $DOLLAR[:B, :C];
-```
-
-"""
-macro groupby(df, args...)
- esc(groupby_helper(df, args...))
-end
-
From e285f83578bc4162688d09a7868538f6d675cb64 Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Fri, 22 Dec 2023 12:33:35 -0500
Subject: [PATCH 4/6] rebase
---
src/macros.jl | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
diff --git a/src/macros.jl b/src/macros.jl
index d36393f7..9c0395e1 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -3007,3 +3007,86 @@ julia> @rename!(df, :new1 = $DOLLAR("old_col" * "1"), :new2 = :old_col2)
macro rename!(x, args...)
esc(rename!_helper(x, args...))
end
+
+function fix_c(x)
+ c = get_column_expr(x; allow_multicol = true)
+ if isnothing(c)
+ throw(ArgumentError("Invalid column selector in @groupby"))
+ end
+ c
+end
+
+function groupby_helper(df, args...)
+ cols = map(fix_c, args)
+ t = Expr(:tuple, cols...)
+ :($groupby($df, ($Cols($t...))))
+end
+
+function groupby_helper(df, arg::Expr)
+ if arg isa Expr && arg.head == :block
+ argsvec = MacroTools.rmlines(arg).args
+ return groupby_helper(df, argsvec...)
+ else
+ c = fix_c(arg)
+ return :($groupby($df, $c))
+ end
+end
+
+
+"""
+ groupby(df, args...)
+
+Group a data frame by columns. Similar
+
+```
+groupby(df, [args...])
+```
+
+but with a few convenience features.
+
+## Details
+
+`@groupby` does not perform any transformations or allow the
+generation of new columns. New column generation must be done
+before `@groupby` is called.
+
+Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol`
+and `String` inputs, such that `@groupby df :A $DOLLAR"B"`
+is supported. However integers cannot be mixed with strings or symbols.
+`@groupby(df, :A, 1)` fails.
+
+To use vectors as a single argument for grouping, escaping with `$DOLLAR`
+must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`.
+This behavior ensures consistency with other DataFramesMeta.jl macros.
+
+`@groupby` automatically concatenates together multiple inputs such that mixing
+vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]`
+
+`@groupby` also allows for the "block" style of DataFramesMeta.jl macros,
+as in
+
+```
+@grouby df begin
+ :A
+ :B
+end
+```
+
+## Examples
+```julia-repl
+julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]);
+julia> @groupby df :A;
+julia> @groupby df :A :B;
+julia> @groupby df $DOLLAR[:A, :B];
+julia> @groupby df begin
+ :A
+ :B
+ end;
+julia> @groupby df :A $DOLLAR[:B, :C];
+```
+
+"""
+macro groupby(df, args...)
+ esc(groupby_helper(df, args...))
+end
+
From b381960d79f1748395d226ca5dd796e4a835b11d Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Fri, 22 Dec 2023 13:07:06 -0500
Subject: [PATCH 5/6] change implementation
---
src/macros.jl | 60 +++++++++------------------------------------------
1 file changed, 10 insertions(+), 50 deletions(-)
diff --git a/src/macros.jl b/src/macros.jl
index 9c0395e1..d66f13e5 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -3008,38 +3008,18 @@ macro rename!(x, args...)
esc(rename!_helper(x, args...))
end
-function fix_c(x)
- c = get_column_expr(x; allow_multicol = true)
- if isnothing(c)
- throw(ArgumentError("Invalid column selector in @groupby"))
- end
- c
-end
-
function groupby_helper(df, args...)
- cols = map(fix_c, args)
- t = Expr(:tuple, cols...)
+ t = Expr(:tuple, args...)
:($groupby($df, ($Cols($t...))))
end
-function groupby_helper(df, arg::Expr)
- if arg isa Expr && arg.head == :block
- argsvec = MacroTools.rmlines(arg).args
- return groupby_helper(df, argsvec...)
- else
- c = fix_c(arg)
- return :($groupby($df, $c))
- end
-end
-
-
"""
groupby(df, args...)
-Group a data frame by columns. Similar
+Group a data frame by columns. An alias for
```
-groupby(df, [args...])
+groupby(df, Cols(args...))
```
but with a few convenience features.
@@ -3050,41 +3030,21 @@ but with a few convenience features.
generation of new columns. New column generation must be done
before `@groupby` is called.
-Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol`
-and `String` inputs, such that `@groupby df :A $DOLLAR"B"`
-is supported. However integers cannot be mixed with strings or symbols.
-`@groupby(df, :A, 1)` fails.
+`@groupby` allows mixing of `Symbol`
+and `String` inputs, such that `@groupby df :A "B"`
+is supported.
-To use vectors as a single argument for grouping, escaping with `$DOLLAR`
-must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`.
-This behavior ensures consistency with other DataFramesMeta.jl macros.
-
-`@groupby` automatically concatenates together multiple inputs such that mixing
-vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]`
-
-`@groupby` also allows for the "block" style of DataFramesMeta.jl macros,
-as in
-
-```
-@grouby df begin
- :A
- :B
-end
-```
+Arguments are not escaped and DataFramesMeta.jl rules for column
+selection, such as `$DOLLAR` for escaping, do not apply.
## Examples
```julia-repl
julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]);
julia> @groupby df :A;
julia> @groupby df :A :B;
-julia> @groupby df $DOLLAR[:A, :B];
-julia> @groupby df begin
- :A
- :B
- end;
-julia> @groupby df :A $DOLLAR[:B, :C];
+julia> @groupby df [:A, :B];
+julia> @groupby df :A [:B, :C];
```
-
"""
macro groupby(df, args...)
esc(groupby_helper(df, args...))
From 237462f00706f479b437daa11174bf581381b4df Mon Sep 17 00:00:00 2001
From: Peter Deffebach
Date: Fri, 22 Dec 2023 13:49:51 -0500
Subject: [PATCH 6/6] docs
---
docs/src/dplyr.md | 10 +++++-----
docs/src/index.md | 27 +++++++++++++++++++++------
test/grouping.jl | 17 +++++++++++++++++
3 files changed, 43 insertions(+), 11 deletions(-)
diff --git a/docs/src/dplyr.md b/docs/src/dplyr.md
index b42d6d0d..f7ae6412 100644
--- a/docs/src/dplyr.md
+++ b/docs/src/dplyr.md
@@ -93,7 +93,7 @@ DataFramesMeta.jl macro | By-row version | Description | `dplyr` equivalent
`@subset` | `@rsubset` | filter rows | `filter`
`@orderby` | `@rorderby` | re-order or arrange rows | `arrange`
`@combine` | | summarise values | `summarize` (but `@combine` is more flexible)
-`groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by`
+`@groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by`
# DataFramesMeta.jl Verbs In Action
@@ -341,15 +341,15 @@ DataFrames.jl also provides the function `describe` which performs many of these
describe(msleep)
```
-## Group Operations using `groupby` and `@combine`
+## Group Operations using `@groupby` and `@combine`
-The `groupby` verb is an important function in DataFrames.jl (it does not live in DataFramesMeta.jl). As we mentioned before it's related to concept of "split-apply-combine". We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output.
+The `@groupby` verb is the first step in the "split-apply-combine" workflow. We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output.
Let's do that: split the `msleep` data frame by the taxonomic order, then ask for the same summary statistics as above. We expect a set of summary statistics for each taxonomic order.
```@repl 1
@chain msleep begin
- groupby(:order)
+ @groupby :order
@combine begin
:avg_sleep = mean(:sleep_total)
:min_sleep = minimum(:sleep_total)
@@ -363,7 +363,7 @@ Split-apply-combine can also be used with `@transform` to add new variables to a
```@repl 1
@chain msleep begin
- groupby(:order)
+ @groupby :order
@transform :sleep_genus = :sleep_total .- mean(:sleep_total)
end
```
diff --git a/docs/src/index.md b/docs/src/index.md
index 31e8970f..15ef9c9b 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -16,6 +16,7 @@ In addition, DataFramesMeta provides
* Row-wise versions of the above macros in the form of `@rtransform`, `@rtransform!`,
`@rselect`, `@rselect!`, `@rorderby`, `@rsubset`, and `@rsubset!`.
* `@rename` and `@rename!` for renaming columns
+* `@groupby` for grouping data
* `@by`, for grouping and combining a data frame in a single step
* `@with`, for working with the columns of a data frame with high performance and
convenient syntax
@@ -64,7 +65,7 @@ data frame.
```julia
df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
@select(df, :x, :y)
@select(df, :x2 = 2 * :x, :y)
@select(gd, :x2 = 2 .* :y .* first(:y))
@@ -98,7 +99,7 @@ data frame.
```julia
df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
@transform(df, :x2 = 2 * :x, :y)
@transform(gd, :x2 = 2 .* :y .* first(:y))
@transform!(df, :x, :y)
@@ -115,7 +116,7 @@ Select row subsets. Operates on both a `DataFrame` and a `GroupedDataFrame`.
```julia
using Statistics
df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
outside_var = 1;
@subset(df, :x .> 1)
@subset(df, :x .> outside_var)
@@ -134,11 +135,14 @@ acts like a `GroupedDataFrame` with one group.
Like `@select` and `@transform`, transformations are called with the keyword-like
syntax `:y = f(:x)`.
+To group data together into a `GroupedDataFrame`, use `@groupby`, a short-hand for
+the DataFrames.jl function `groupby`.
+
Examples:
```julia
df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
@combine(gd, :x2 = sum(:y))
@combine(gd, :x2 = :y .- sum(:y))
@combine(gd, $AsTable = (n1 = sum(:y), n2 = first(:y)))
@@ -161,6 +165,17 @@ gd = groupby(df, :x);
@combine(gd, $AsTable = (a = sum(:x), b = sum(:y)))
```
+### `@by`
+
+Perform the grouping and combining operations in one step with `@by`:
+
+```julia
+df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
+@by df :x begin
+ :x2 = sum(:y)
+end
+```
+
## `@orderby`
Sort rows in a `DataFrame` by values in one of several columns or a
@@ -355,7 +370,7 @@ julia> @subset df @byrow begin
however, like with `ByRow` in DataFrames.jl, when `@byrow` is
used, functions do not take into account the grouping, so for
example the result of `@transform(df, @byrow :y = f(:x))` and
-`@transform(groupby(df, :g), @byrow :y = f(:x))` is the same.
+`@transform(@groupby(df, :g), @byrow :y = f(:x))` is the same.
## Propagating missing values with `@passmissing`
@@ -912,7 +927,7 @@ functions.
| `@subset` | `filter` | `Where` |
| `@transform` | `mutate` | `Select` (?) |
| `@by` | | `GroupBy` |
-| `groupby` | `group_by` | `GroupBy` |
+| `@groupby` | `group_by` | `GroupBy` |
| `@combine` | `summarise`/`do` | |
| `@orderby` | `arrange` | `OrderBy` |
| `@select` | `select` | `Select` |
diff --git a/test/grouping.jl b/test/grouping.jl
index 0db08b6e..e06a1ed6 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -349,4 +349,21 @@ end
@test @select(g, :a, @byrow :t = :a ^ 2).t ≅ d.a .^ 2
end
+@testset "@groupby" begin # each selector form must reproduce the plain `groupby` result
+ df = DataFrame(a = [1, 2], b = [3, 4], c = [5, 6])
+ resa = groupby(df, [:a]) # expected grouping on :a
+ resab = groupby(df, [:a, :b]) # expected grouping on :a and :b
+ resabc = groupby(df, [:a, :b, :c]) # expected grouping on every column
+ ab = [:a, :b] # selector held in a variable, passed to the macro by name below
+
+ @test @groupby(df, :a) == resa
+ @test @groupby(df, :a, :b) == resab
+ @test (@groupby df ab) == resab # variable used directly, no `$` prefix
+ @test (@groupby df :a 2) == resab # Symbol mixed with an integer column index
+ @test (@groupby df [:a, :b]) == resab # literal vector selector
+ @test (@groupby df :a "b") == resab # Symbol mixed with a String
+ @test (@groupby df All()) == resabc
+ @test (@groupby df Cols(:a, 2, "c")) == resabc # nested multi-column selector
+end
+
end # module