Skip to content

Commit

Permalink
Merge pull request #2 from maxhumber/easygather
Browse files Browse the repository at this point in the history
Easygather
  • Loading branch information
maxhumber authored Oct 18, 2022
2 parents d99c7d1 + d92c83f commit 0d1bea1
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 42 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
- 1.3b1
- NEW: `gather(beside=...)` argument!
- IMPROVED: `sample` errors are more explicit
- 1.2
- NEW: `cross` join verb!
- NEW: `join(..., postfix=("_lhs", "_rhs"))` argument
Expand Down
3 changes: 2 additions & 1 deletion TODO
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
- remove summarize
- explode, collapse verbs
- more research into `.assign` mutate(..., vectorized=True)?
- datasets
Expand All @@ -8,4 +9,4 @@
- hide/protect/private
- 10 minutes tutorial
- cheatsheet (pandas/dplyr/tidyr)
- anaconda
- anaconda?
82 changes: 60 additions & 22 deletions redframes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,50 +819,88 @@ def filter(self, func: Func) -> DataFrame:
def gather(
self,
columns: Columns | None = None,
beside: LazyColumns | None = None,
into: tuple[Column, Column] = ("variable", "value"),
):
"""Lengthen data, increase rows, decrease columns (opposite of `spread`)
"""Lengthen data by increasing rows and decreasing columns (opposite of `spread`)
pandas: `melt`
tidyverse: `gather`, `pivot_longer`
Examples:
```python
df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]})
df = rf.DataFrame({
"foo": [1, 2, 1, 2],
"bar": ["A", "B", "C", "D"],
"baz": ["!", "@", "#", "$"],
"jaz": range(4)
})
```
| foo | bar | baz |
|------:|------:|------:|
| 1 | 3 | 4 |
| 2 | 4 | 5 |
| foo | bar | baz | jaz |
|------:|:------|:------|------:|
| 1 | A | ! | 0 |
| 2 | B | @ | 1 |
| 1 | C | # | 2 |
| 2 | D | $ | 3 |
All columns:
```python
df.gather()
```
| variable | value |
|:-----------|--------:|
| foo | 1 |
| foo | 2 |
| bar | 3 |
| bar | 4 |
| baz | 4 |
| baz | 5 |
| variable | value |
|:-----------|:--------|
| foo | 1 |
| foo | 2 |
| foo | 1 |
| foo | 2 |
| bar | A |
| bar | B |
| bar | C |
| bar | D |
| baz | ! |
| baz | @ |
| baz | # |
| baz | $ |
| jaz | 0 |
| jaz | 1 |
| jaz | 2 |
| jaz | 3 |
Multiple columns:
```python
df.gather(["foo", "bar"], into=("var", "val"))
```
| baz | var | val |
|------:|:------|------:|
| 4 | foo | 1 |
| 5 | foo | 2 |
| 4 | bar | 3 |
| 5 | bar | 4 |
| baz | jaz | var | val |
|:------|------:|:------|:------|
| ! | 0 | foo | 1 |
| @ | 1 | foo | 2 |
| # | 2 | foo | 1 |
| $ | 3 | foo | 2 |
| ! | 0 | bar | A |
| @ | 1 | bar | B |
| # | 2 | bar | C |
| $ | 3 | bar | D |
All columns except:
```python
df.gather(beside=["foo", "bar"])
```
| foo | bar | variable | value |
|------:|:------|:-----------|:--------|
| 1 | A | baz | ! |
| 2 | B | baz | @ |
| 1 | C | baz | # |
| 2 | D | baz | $ |
| 1 | A | jaz | 0 |
| 2 | B | jaz | 1 |
| 1 | C | jaz | 2 |
| 2 | D | jaz | 3 |
"""
return _wrap(gather(self._data, columns, into))
return _wrap(gather(self._data, columns, beside, into))

def group(self, by: LazyColumns) -> GroupedFrame:
"""Create a GroupedFrame overwhich split-apply-combine operations can be applied
Expand Down Expand Up @@ -1324,7 +1362,7 @@ def split(
return _wrap(split(self._data, column, into, sep, drop))

def spread(self, column: Column, using: Column) -> DataFrame:
"""Widen data, increase columns, decreas rows (opposite of `gather`)
"""Widen data by increasing columns and decreasing rows (opposite of `gather`)
pandas: `pivot_table`
tidyverse: `spread`, `pivot_wider`
Expand Down
30 changes: 20 additions & 10 deletions redframes/verbs/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,41 @@
import pandas as pd

from ..checks import _check_type
from ..types import Column, Columns, PandasDataFrame
from ..types import Column, Columns, LazyColumns, PandasDataFrame


def gather(
df: PandasDataFrame,
columns: Columns | None = None,
beside: LazyColumns | None = None,
into: tuple[Column, Column] = ("variable", "value"),
) -> PandasDataFrame:
_check_type(columns, {list, None})
_check_type(beside, {str, list, None})
_check_type(into, tuple)
if not (isinstance(into, tuple) and len(into) == 2):
if not (isinstance(into, tuple) and (len(into) == 2)):
raise TypeError("must be tuple[str, str]")
if into[0] == into[1]:
raise TypeError("must be unique")
if into[0] in df.columns:
if (into[0] in df.columns) or (into[1] in df.columns):
raise TypeError("must not be an existing column key")
if into[1] in df.columns:
raise TypeError("must not be an existing column key")
if columns == None:
columns = list(df.columns)
index = [col for col in df.columns if col not in columns] # type: ignore
if (columns != None) and (beside != None):
raise ValueError("columns OR beside must be None")
if (columns == None) and (beside == None):
id_vars = []
value_vars = list(df.columns)
if isinstance(beside, str):
beside = [beside]
if isinstance(beside, list):
id_vars = beside
value_vars = [col for col in df.columns if col not in id_vars]
if isinstance(columns, list):
id_vars = [col for col in df.columns if col not in columns]
value_vars = columns
df = pd.melt(
df,
id_vars=index,
value_vars=columns,
id_vars=id_vars,
value_vars=value_vars,
var_name=into[0],
value_name=into[1],
)
Expand Down
4 changes: 2 additions & 2 deletions redframes/verbs/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ def sample(
_check_type(rows, {int, float})
if rows >= 1:
if isinstance(rows, float):
raise ValueError("rows (int) must be >= 1")
raise ValueError("must be int if > 1")
df = df.sample(rows, random_state=seed)
elif 0 < rows < 1:
df = df.sample(frac=rows, random_state=seed)
else:
raise ValueError("rows (float) must be (0, 1)")
raise ValueError("must be > 0")
df = df.reset_index(drop=True)
return df
2 changes: 1 addition & 1 deletion redframes/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2"
__version__ = "1.3b1"
64 changes: 58 additions & 6 deletions tests/test_docstrings.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,24 +218,76 @@ def test_filter(self):
self.assertEqual(result3, expected3)

def test_gather(self):
df = rf.DataFrame({"foo": [1, 2], "bar": [3, 4], "baz": [4, 5]})
df = rf.DataFrame(
{
"foo": [1, 2, 1, 2],
"bar": ["A", "B", "C", "D"],
"baz": ["!", "@", "#", "$"],
"jaz": range(4),
}
)
result1 = df.gather()
result2 = df.gather(["foo", "bar"], into=("var", "val"))
result3 = df.gather(beside=["foo", "bar"])
expected1 = rf.DataFrame(
{
"variable": ["foo", "foo", "bar", "bar", "baz", "baz"],
"value": [1, 2, 3, 4, 4, 5],
"variable": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"baz",
"baz",
"baz",
"baz",
"jaz",
"jaz",
"jaz",
"jaz",
],
"value": [
1,
2,
1,
2,
"A",
"B",
"C",
"D",
"!",
"@",
"#",
"$",
0,
1,
2,
3,
],
}
)
expected2 = rf.DataFrame(
{
"baz": [4, 5, 4, 5],
"var": ["foo", "foo", "bar", "bar"],
"val": [1, 2, 3, 4],
"baz": ["!", "@", "#", "$", "!", "@", "#", "$"],
"jaz": [0, 1, 2, 3, 0, 1, 2, 3],
"var": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
"val": [1, 2, 1, 2, "A", "B", "C", "D"],
}
)
expected3 = rf.DataFrame(
{
"foo": [1, 2, 1, 2, 1, 2, 1, 2],
"bar": ["A", "B", "C", "D", "A", "B", "C", "D"],
"variable": ["baz", "baz", "baz", "baz", "jaz", "jaz", "jaz", "jaz"],
"value": ["!", "@", "#", "$", 0, 1, 2, 3],
}
)
self.assertEqual(result1, expected1)
self.assertEqual(result2, expected2)
self.assertEqual(result3, expected3)

def test_group(self):
df = rf.DataFrame(
Expand Down
10 changes: 10 additions & 0 deletions tests/test_ladybugs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,13 @@ def test_comine_overwrite_and_drop_other(self):
result = df.combine(["foo", "bar"], into="foo", sep="-", drop=True)
expected = rf.DataFrame({"foo": ["1-1", "2-2", "3-3"]})
self.assertEqual(result, expected)

def test_sample_float_1_point_0(self):
    # A float of exactly 1.0 falls in the `rows >= 1` branch of `sample`, where
    # only ints are accepted, so it must be rejected instead of being treated
    # as a fraction or a row count.
    frame = rf.DataFrame({"foo": range(100)})
    expected_message = "must be int if > 1"
    with self.assertRaisesRegex(ValueError, expected_message):
        frame.sample(1.0)

def test_sample_negative_1(self):
    # A negative row count is neither a valid count (>= 1) nor a valid
    # fraction (0 < rows < 1), so `sample` must raise.
    frame = rf.DataFrame({"foo": range(100)})
    expected_message = "must be > 0"
    with self.assertRaisesRegex(ValueError, expected_message):
        frame.sample(-1)
13 changes: 13 additions & 0 deletions tests/test_type_hints.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,23 @@ def test_gather_bad_columns(self):
with self.assertRaisesRegex(TypeError, "must be list | None"):
self.df.gather(1)

def test_gather_bad_beside(self):
    # `beside` accepts str, list or None; an int must be rejected with TypeError.
    # NOTE(review): the pattern is a regular expression, so each `|` acts as
    # alternation and any one of the three alternatives satisfies the check.
    # Escaping it would require the exact message emitted by `_check_type`,
    # which is not visible here -- confirm before tightening.
    expected_message = "must be str | list | None"
    with self.assertRaisesRegex(TypeError, expected_message):
        self.df.gather(beside=1)

def test_gather_bad_into_column(self):
    # `into` must be a tuple; passing an int has to fail the type check.
    expected_message = "must be tuple"
    with self.assertRaisesRegex(TypeError, expected_message):
        self.df.gather(["foo", "bar"], into=1)

def test_gather_bad_into_tuple(self):
    # `gather` requires `into` to be a tuple of exactly two names; a
    # three-element tuple passes the plain tuple type check but must still be
    # rejected with TypeError("must be tuple[str, str]").
    import re

    # assertRaisesRegex interprets its second argument as a regular expression.
    # Unescaped, "[str, str]" is a character class, so the pattern cannot match
    # the literal brackets in the message; escape it to compare literally.
    expected_message = re.escape("must be tuple[str, str]")
    with self.assertRaisesRegex(TypeError, expected_message):
        self.df.gather(into=("one", "two", "three"))

def test_gather_bad_both_not_none(self):
    # `columns` and `beside` are mutually exclusive selectors; supplying both
    # must raise ValueError.
    expected_message = "columns OR beside must be None"
    with self.assertRaisesRegex(ValueError, expected_message):
        self.df.gather(columns=["foo", "bar"], beside=["baz"])

def test_group_bad_by_columns(self):
with self.assertRaisesRegex(TypeError, "must be list | str"):
self.df.group(1)
Expand Down

0 comments on commit 0d1bea1

Please sign in to comment.