From e8494701890d306aef7dbe1b523e2b53eec03831 Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Mon, 18 Dec 2023 16:21:13 -0500 Subject: [PATCH 1/6] add groupby and docs --- src/DataFramesMeta.jl | 1 + src/macros.jl | 79 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl index a16cca5a..1f42a9f3 100644 --- a/src/DataFramesMeta.jl +++ b/src/DataFramesMeta.jl @@ -21,6 +21,7 @@ export @with, @distinct, @rdistinct, @distinct!, @rdistinct!, @eachrow, @eachrow!, @byrow, @passmissing, @astable, @kwarg, + @groupby, @based_on, @where # deprecated const DOLLAR = raw"$" diff --git a/src/macros.jl b/src/macros.jl index 055cb4a2..d9ecce98 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3008,3 +3008,82 @@ macro rename!(x, args...) esc(rename!_helper(x, args...)) end +function groupby_helper(df, args...) + cols = map(get_column_expr, args) + if any(isnothing, cols) + throw(ArgumentError("All inputs to @groupby must be valid column selectors")) + end + + :($groupby($df, $make_source_concrete($reduce($vcat, $(Expr(:tuple, cols...)))))) +end + +function groupby_helper(df, arg) + if arg isa Expr && arg.head == :block + argsvec = MacroTools.rmlines(arg).args + return groupby_helper(df, argsvec...) + end + col = get_column_expr(arg) + if isnothing(col) + throw(ArgumentError("All inputs to @groupby must be valid column selectors")) + end + :($groupby($df, $col)) +end + + +""" + groupby(df, args...) + +Group a data frame by columns. Similar + +``` +groupby(df, [args...]) +``` + +but with a few convenience features. + +## Details + +`@groupby` does not perform any transformations or allow the +generation of new columns. New column generation must be done +before `@groupby` is called. + +Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol` +and `String` inputs, such that `@groupby df :A $DOLLAR"B"` +is supported. However integers cannot be mixed with strings or symbols. 
+`@groupby(df, :A, 1)` fails. + +To use vectors as a single argument for grouping, escaping with `$DOLLAR` +must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`. +This behavior ensures consistency with other DataFramesMeta.jl macros. + +`@groupby` automatically concatenates together multiple inputs such that mixing +vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]` + +`@groupby` also allows for the "block" style of DataFramesMeta.jl macros, +as in + +``` +@grouby df begin + :A + :B +end +``` + +## Examples +```julia-repl +julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]); +julia> @groupby df :A; +julia> @groupby df :A :B; +julia> @groupby df $DOLLAR[:A, :B]; +julia> @groupby df begin + :A + :B + end; +julia> @groupby df :A $DOLLAR[:B, :C]; +``` + +""" +macro groupby(df, args...) + esc(groupby_helper(df, args...)) +end + From acfda82bb095c0d78deb009cfb8555eb032bbb31 Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Wed, 20 Dec 2023 13:01:15 -0500 Subject: [PATCH 2/6] implementation --- src/macros.jl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index d9ecce98..6fdc1119 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3009,12 +3009,19 @@ macro rename!(x, args...) end function groupby_helper(df, args...) - cols = map(get_column_expr, args) + cols = map(args) do a + if a isa Expr && a.head == :call && a.args[1] in (:All, :Between, :Cols, :Not) + a + else + get_column_expr(a) + end + end if any(isnothing, cols) throw(ArgumentError("All inputs to @groupby must be valid column selectors")) end - :($groupby($df, $make_source_concrete($reduce($vcat, $(Expr(:tuple, cols...)))))) + t = Expr(:tuple, cols...) 
+ :($groupby($df, ($Cols($t...)))) end function groupby_helper(df, arg) From e2bfcfac0e15b00c4be02b44d75d6eed165e60f8 Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Fri, 22 Dec 2023 12:33:09 -0500 Subject: [PATCH 3/6] whatever --- src/macros.jl | 87 --------------------------------------------------- 1 file changed, 87 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 6fdc1119..d36393f7 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3007,90 +3007,3 @@ julia> @rename!(df, :new1 = $DOLLAR("old_col" * "1"), :new2 = :old_col2) macro rename!(x, args...) esc(rename!_helper(x, args...)) end - -function groupby_helper(df, args...) - cols = map(args) do a - if a isa Expr && a.head == :call && a.args[1] in (:All, :Between, :Cols, :Not) - a - else - get_column_expr(a) - end - end - if any(isnothing, cols) - throw(ArgumentError("All inputs to @groupby must be valid column selectors")) - end - - t = Expr(:tuple, cols...) - :($groupby($df, ($Cols($t...)))) -end - -function groupby_helper(df, arg) - if arg isa Expr && arg.head == :block - argsvec = MacroTools.rmlines(arg).args - return groupby_helper(df, argsvec...) - end - col = get_column_expr(arg) - if isnothing(col) - throw(ArgumentError("All inputs to @groupby must be valid column selectors")) - end - :($groupby($df, $col)) -end - - -""" - groupby(df, args...) - -Group a data frame by columns. Similar - -``` -groupby(df, [args...]) -``` - -but with a few convenience features. - -## Details - -`@groupby` does not perform any transformations or allow the -generation of new columns. New column generation must be done -before `@groupby` is called. - -Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol` -and `String` inputs, such that `@groupby df :A $DOLLAR"B"` -is supported. However integers cannot be mixed with strings or symbols. -`@groupby(df, :A, 1)` fails. - -To use vectors as a single argument for grouping, escaping with `$DOLLAR` -must be used. `@groupby df [:a, :b]` fails. 
Rather, use `@groupby df $DOLLAR[:a, :b]`. -This behavior ensures consistency with other DataFramesMeta.jl macros. - -`@groupby` automatically concatenates together multiple inputs such that mixing -vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]` - -`@groupby` also allows for the "block" style of DataFramesMeta.jl macros, -as in - -``` -@grouby df begin - :A - :B -end -``` - -## Examples -```julia-repl -julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]); -julia> @groupby df :A; -julia> @groupby df :A :B; -julia> @groupby df $DOLLAR[:A, :B]; -julia> @groupby df begin - :A - :B - end; -julia> @groupby df :A $DOLLAR[:B, :C]; -``` - -""" -macro groupby(df, args...) - esc(groupby_helper(df, args...)) -end - From e285f83578bc4162688d09a7868538f6d675cb64 Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Fri, 22 Dec 2023 12:33:35 -0500 Subject: [PATCH 4/6] rebase --- src/macros.jl | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/macros.jl b/src/macros.jl index d36393f7..9c0395e1 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3007,3 +3007,86 @@ julia> @rename!(df, :new1 = $DOLLAR("old_col" * "1"), :new2 = :old_col2) macro rename!(x, args...) esc(rename!_helper(x, args...)) end + +function fix_c(x) + c = get_column_expr(x; allow_multicol = true) + if isnothing(c) + throw(ArgumentError("Invalid column selector in @groupby")) + end + c +end + +function groupby_helper(df, args...) + cols = map(fix_c, args) + t = Expr(:tuple, cols...) + :($groupby($df, ($Cols($t...)))) +end + +function groupby_helper(df, arg::Expr) + if arg isa Expr && arg.head == :block + argsvec = MacroTools.rmlines(arg).args + return groupby_helper(df, argsvec...) + else + c = fix_c(arg) + return :($groupby($df, $c)) + end +end + + +""" + groupby(df, args...) + +Group a data frame by columns. Similar + +``` +groupby(df, [args...]) +``` + +but with a few convenience features. 
+ +## Details + +`@groupby` does not perform any transformations or allow the +generation of new columns. New column generation must be done +before `@groupby` is called. + +Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol` +and `String` inputs, such that `@groupby df :A $DOLLAR"B"` +is supported. However integers cannot be mixed with strings or symbols. +`@groupby(df, :A, 1)` fails. + +To use vectors as a single argument for grouping, escaping with `$DOLLAR` +must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`. +This behavior ensures consistency with other DataFramesMeta.jl macros. + +`@groupby` automatically concatenates together multiple inputs such that mixing +vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]` + +`@groupby` also allows for the "block" style of DataFramesMeta.jl macros, +as in + +``` +@grouby df begin + :A + :B +end +``` + +## Examples +```julia-repl +julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]); +julia> @groupby df :A; +julia> @groupby df :A :B; +julia> @groupby df $DOLLAR[:A, :B]; +julia> @groupby df begin + :A + :B + end; +julia> @groupby df :A $DOLLAR[:B, :C]; +``` + +""" +macro groupby(df, args...) + esc(groupby_helper(df, args...)) +end + From b381960d79f1748395d226ca5dd796e4a835b11d Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Fri, 22 Dec 2023 13:07:06 -0500 Subject: [PATCH 5/6] change implemetation --- src/macros.jl | 60 +++++++++------------------------------------------ 1 file changed, 10 insertions(+), 50 deletions(-) diff --git a/src/macros.jl b/src/macros.jl index 9c0395e1..d66f13e5 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -3008,38 +3008,18 @@ macro rename!(x, args...) esc(rename!_helper(x, args...)) end -function fix_c(x) - c = get_column_expr(x; allow_multicol = true) - if isnothing(c) - throw(ArgumentError("Invalid column selector in @groupby")) - end - c -end - function groupby_helper(df, args...) 
- cols = map(fix_c, args) - t = Expr(:tuple, cols...) + t = Expr(:tuple, args...) :($groupby($df, ($Cols($t...)))) end -function groupby_helper(df, arg::Expr) - if arg isa Expr && arg.head == :block - argsvec = MacroTools.rmlines(arg).args - return groupby_helper(df, argsvec...) - else - c = fix_c(arg) - return :($groupby($df, $c)) - end -end - - """ groupby(df, args...) -Group a data frame by columns. Similar +Group a data frame by columns. An alias for ``` -groupby(df, [args...]) +groupby(df, Cols(args...)) ``` but with a few convenience features. @@ -3050,41 +3030,21 @@ but with a few convenience features. generation of new columns. New column generation must be done before `@groupby` is called. -Unlike `DataFrames.groupby`, `@groupby` allows mixing of `Symbol` -and `String` inputs, such that `@groupby df :A $DOLLAR"B"` -is supported. However integers cannot be mixed with strings or symbols. -`@groupby(df, :A, 1)` fails. +`@groupby` allows mixing of `Symbol` +and `String` inputs, such that `@groupby df :A "B"` +is supported. -To use vectors as a single argument for grouping, escaping with `$DOLLAR` -must be used. `@groupby df [:a, :b]` fails. Rather, use `@groupby df $DOLLAR[:a, :b]`. -This behavior ensures consistency with other DataFramesMeta.jl macros. - -`@groupby` automatically concatenates together multiple inputs such that mixing -vector and scalar column selectors is supported, as in `@groupby df :A $DOLLAR[:B, :C]` - -`@groupby` also allows for the "block" style of DataFramesMeta.jl macros, -as in - -``` -@grouby df begin - :A - :B -end -``` +Arguments are not escaped and DataFramesMeta.jl rules for column +selection, such as `$DOLLAR` for escaping, do not apply. 
## Examples ```julia-repl julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]); julia> @groupby df :A; julia> @groupby df :A :B; -julia> @groupby df $DOLLAR[:A, :B]; -julia> @groupby df begin - :A - :B - end; -julia> @groupby df :A $DOLLAR[:B, :C]; +julia> @groupby df [:A, :B]; +julia> @groupby df :A [:B, :C]; ``` - """ macro groupby(df, args...) esc(groupby_helper(df, args...)) From 237462f00706f479b437daa11174bf581381b4df Mon Sep 17 00:00:00 2001 From: Peter Deffebach Date: Fri, 22 Dec 2023 13:49:51 -0500 Subject: [PATCH 6/6] docs --- docs/src/dplyr.md | 10 +++++----- docs/src/index.md | 27 +++++++++++++++++++++------ test/grouping.jl | 17 +++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/docs/src/dplyr.md b/docs/src/dplyr.md index b42d6d0d..f7ae6412 100644 --- a/docs/src/dplyr.md +++ b/docs/src/dplyr.md @@ -93,7 +93,7 @@ DataFramesMeta.jl macro | By-row version | Description | `dplyr` equivalent `@subset` | `@rsubset` | filter rows | `filter` `@orderby` | `@rorderby` | re-order or arrange rows | `arrange` `@combine` | | summarise values | `summarize` (but `@combine` is more flexible) -`groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by` +`@groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by` # DataFramesMeta.jl Verbs In Action @@ -341,15 +341,15 @@ DataFrames.jl also provides the function `describe` which performs many of these describe(msleep) ``` -## Group Operations using `groupby` and `@combine` +## Group Operations using `@groupby` and `@combine` -The `groupby` verb is an important function in DataFrames.jl (it does not live in DataFramesMeta.jl). As we mentioned before it's related to concept of "split-apply-combine". We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output. 
+The `@groupby` verb is the first step in the "split-apply-combine" workflow. We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output. Let's do that: split the `msleep` data frame by the taxonomic order, then ask for the same summary statistics as above. We expect a set of summary statistics for each taxonomic order. ```@repl 1 @chain msleep begin - groupby(:order) + @groupby :order @combine begin :avg_sleep = mean(:sleep_total) :min_sleep = minimum(:sleep_total) @@ -363,7 +363,7 @@ Split-apply-combine can also be used with `@transform` to add new variables to a ```@repl 1 @chain msleep begin - groupby(:order) + @groupby :order @transform :sleep_genus = :sleep_total .- mean(:sleep_total) end ``` diff --git a/docs/src/index.md b/docs/src/index.md index 31e8970f..15ef9c9b 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -16,6 +16,7 @@ In addition, DataFramesMeta provides * Row-wise versions of the above macros in the form of `@rtransform`, `@rtransform!`, `@rselect`, `@rselect!`, `@rorderby`, `@rsubset`, and `@rsubset!`. * `@rename` and `@rename!` for renaming columns +* `@groupby` for grouping data * `@by`, for grouping and combining a data frame in a single step * `@with`, for working with the columns of a data frame with high performance and convenient syntax @@ -64,7 +65,7 @@ data frame. ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @select(df, :x, :y) @select(df, :x2 = 2 * :x, :y) @select(gd, :x2 = 2 .* :y .* first(:y)) @@ -98,7 +99,7 @@ data frame. ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @transform(df, :x2 = 2 * :x, :y) @transform(gd, :x2 = 2 .* :y .* first(:y)) @transform!(df, :x, :y) @@ -115,7 +116,7 @@ Select row subsets. Operates on both a `DataFrame` and a `GroupedDataFrame`. 
```julia using Statistics df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); outside_var = 1; @subset(df, :x .> 1) @subset(df, :x .> outside_var) @@ -134,11 +135,14 @@ acts like a `GroupedDataFrame` with one group. Like `@select` and `@transform`, transformations are called with the keyword-like syntax `:y = f(:x)`. +To group data together into a `GroupedDataFrame`, use `@groupby`, a short-hand for +the DataFrames.jl function `groupby`. + Examples: ```julia df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); -gd = groupby(df, :x); +gd = @groupby(df, :x); @combine(gd, :x2 = sum(:y)) @combine(gd, :x2 = :y .- sum(:y)) @combine(gd, $AsTable = (n1 = sum(:y), n2 = first(:y))) @@ -161,6 +165,17 @@ gd = groupby(df, :x); @combine(gd, $AsTable = (a = sum(:x), b = sum(:y))) ``` +### `@by` + +Perform the grouping and combining operations in one step with `@by`. + +```julia +df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]); +@by df :x begin + :x2 = sum(:y) +end +``` + ## `@orderby` Sort rows in a `DataFrame` by values in one of several columns or a @@ -355,7 +370,7 @@ julia> @subset df @byrow begin however, like with `ByRow` in DataFrames.jl, when `@byrow` is used, functions do not take into account the grouping, so for example the result of `@transform(df, @byrow :y = f(:x))` and -`@transform(groupby(df, :g), @byrow :y = f(:x))` is the same. +`@transform(@groupby(df, :g), @byrow :y = f(:x))` is the same. ## Propagating missing values with `@passmissing` @@ -912,7 +927,7 @@ functions. | `@subset` | `filter` | `Where` | | `@transform` | `mutate` | `Select` (?) 
| | `@by` | | `GroupBy` | -| `groupby` | `group_by` | `GroupBy` | +| `@groupby` | `group_by` | `GroupBy` | | `@combine` | `summarise`/`do` | | | `@orderby` | `arrange` | `OrderBy` | | `@select` | `select` | `Select` | diff --git a/test/grouping.jl b/test/grouping.jl index 0db08b6e..e06a1ed6 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -349,4 +349,21 @@ end @test @select(g, :a, @byrow :t = :a ^ 2).t ≅ d.a .^ 2 end +@testset "@groupby" begin + df = DataFrame(a = [1, 2], b = [3, 4], c = [5, 6]) + resa = groupby(df, [:a]) + resab = groupby(df, [:a, :b]) + resabc = groupby(df, [:a, :b, :c]) + ab = [:a, :b] + + @test @groupby(df, :a) == resa + @test @groupby(df, :a, :b) == resab + @test (@groupby df ab) == resab + @test (@groupby df :a 2) == resab + @test (@groupby df [:a, :b]) == resab + @test (@groupby df :a "b") == resab + @test (@groupby df All()) == resabc + @test (@groupby df Cols(:a, 2, "c")) == resabc +end + end # module