Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOC: correct docstring examples (#3439) #16432

Merged
merged 14 commits into from
May 31, 2017
9 changes: 9 additions & 0 deletions ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ if [ "$DOC" ]; then
git remote -v

git push origin gh-pages -f

echo "Running doctests"
cd "$TRAVIS_BUILD_DIR"
pytest --doctest-modules \
pandas/core/reshape/concat.py \
pandas/core/reshape/pivot.py \
pandas/core/reshape/reshape.py \
pandas/core/reshape/tile.py

fi

exit 0
2 changes: 2 additions & 0 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
0
a 2
>>> pd.concat([df5, df6], verify_integrity=True)
Traceback (most recent call last):
...
ValueError: Indexes have overlapping values: ['a']
"""
op = _Concatenator(objs, axis=axis, join_axes=join_axes,
Expand Down
72 changes: 41 additions & 31 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,26 +50,36 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',

Examples
--------
>>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
... "bar", "bar", "bar", "bar"],
... "B": ["one", "one", "one", "two", "two",
... "one", "one", "two", "two"],
... "C": ["small", "large", "large", "small",
... "small", "large", "small", "small",
... "large"],
... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
>>> df
A B C D
0 foo one small 1
1 foo one large 2
2 foo one large 2
3 foo two small 3
4 foo two small 3
5 bar one large 4
6 bar one small 5
7 bar two small 6
8 bar two large 7
A B C D
0 foo one small 1
1 foo one large 2
2 foo one large 2
3 foo two small 3
4 foo two small 3
5 bar one large 4
6 bar one small 5
7 bar two small 6
8 bar two large 7

>>> table = pivot_table(df, values='D', index=['A', 'B'],
... columns=['C'], aggfunc=np.sum)
>>> table
small large
foo one 1 4
two 6 NaN
bar one 5 4
two 6 7
... # doctest: +NORMALIZE_WHITESPACE
C large small
A B
bar one 4.0 5.0
two 7.0 6.0
foo one 4.0 1.0
two NaN 6.0

Returns
-------
Expand Down Expand Up @@ -438,27 +448,27 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,

Examples
--------
>>> a
array([foo, foo, foo, foo, bar, bar,
bar, bar, foo, foo, foo], dtype=object)
>>> b
array([one, one, one, two, one, one,
one, two, two, two, one], dtype=object)
>>> c
array([dull, dull, shiny, dull, dull, shiny,
shiny, dull, shiny, shiny, shiny], dtype=object)

>>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
b one two
c dull shiny dull shiny
>>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
... "bar", "bar", "foo", "foo", "foo"], dtype=object)
>>> b = np.array(["one", "one", "one", "two", "one", "one",
... "one", "two", "two", "two", "one"], dtype=object)
>>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
... "shiny", "dull", "shiny", "shiny", "shiny"],
... dtype=object)

>>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
... # doctest: +NORMALIZE_WHITESPACE
b one two
c dull shiny dull shiny
a
bar 1 2 1 0
foo 2 2 1 2
bar 1 2 1 0
foo 2 2 1 2

>>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
>>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
>>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data,
# but they still will be counted in the output
... # but they still will be counted in the output
... # doctest: +SKIP
col_0 d e f
row_0
a 1 0 0
Expand Down
81 changes: 43 additions & 38 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,23 @@ class _Unstacker(object):
>>> import pandas as pd
>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
... ('two', 'a'), ('two', 'b')])
>>> s = pd.Series(np.arange(1.0, 5.0), index=index)
>>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
>>> s
one a 1
b 2
two a 3
b 4
dtype: float64
one a 1
b 2
two a 3
b 4
dtype: int64

>>> s.unstack(level=-1)
a b
a b
one 1 2
two 3 4

>>> s.unstack(level=0)
one two
a 1 2
b 3 4
a 1 3
b 2 4

Returns
-------
Expand Down Expand Up @@ -789,18 +789,18 @@ def lreshape(data, groups, dropna=True, label=None):
>>> import pandas as pd
>>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
... 'team': ['Red Sox', 'Yankees'],
... 'year1': [2007, 2008], 'year2': [2008, 2008]})
... 'year1': [2007, 2007], 'year2': [2008, 2008]})
>>> data
hr1 hr2 team year1 year2
0 514 545 Red Sox 2007 2008
1 573 526 Yankees 2007 2008

>>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
team hr year
0 Red Sox 514 2007
1 Yankees 573 2007
2 Red Sox 545 2008
3 Yankees 526 2008
team year hr
0 Red Sox 2007 514
1 Yankees 2007 573
2 Red Sox 2008 545
3 Yankees 2008 526

Returns
-------
Expand Down Expand Up @@ -905,11 +905,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
... })
>>> df["id"] = df.index
>>> df
A1970 A1980 B1970 B1980 X id
A1970 A1980 B1970 B1980 X id
0 a d 2.5 3.2 -1.085631 0
1 b e 1.2 1.3 0.997345 1
2 c f 0.7 0.1 0.282978 2
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
... # doctest: +NORMALIZE_WHITESPACE
X A B
id year
0 1970 -1.085631 a 2.5
Expand Down Expand Up @@ -940,6 +941,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
8 3 3 2.1 2.9
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
>>> l
... # doctest: +NORMALIZE_WHITESPACE
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you give some explanation why this is needed in this case ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorisvandenbossche I think the issue was when there's a df.index.name the output has whitespace for the rest of that line (which we don't want to include in the source)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha, ...
Is that actually something we might want to solve in pandas, in the repr? (it's not a bug, but I also don't think it is a feature someone relies upon? So could change that (if it is easy) to not have to do this here) But that is certainly for another PR

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or we could set this flag globally if this occurs a lot? (if that is possible)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what @TomAugspurger was thinking, but I don't think he knows either?

ht
famid birth age
1 1 1 2.8
Expand Down Expand Up @@ -979,41 +981,44 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):

Less wieldy column names are also handled

>>> np.random.seed(0)
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
... 'A(quarterly)-2011': np.random.rand(3),
... 'B(quarterly)-2010': np.random.rand(3),
... 'B(quarterly)-2011': np.random.rand(3),
... 'X' : np.random.randint(3, size=3)})
>>> df['id'] = df.index
>>> df
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
0 0.531828 0.724455 0.322959 0.293714
1 0.634401 0.611024 0.361789 0.630976
2 0.849432 0.722443 0.228263 0.092105
\
>>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ...
0 0.548814 0.544883 0.437587 ...
1 0.715189 0.423655 0.891773 ...
2 0.602763 0.645894 0.963663 ...
X id
0 0 0
1 1 1
2 2 2
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
i='id', j='year', sep='-')
X A(quarterly) B(quarterly)
2 1 2

>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id',
... j='year', sep='-')
... # doctest: +NORMALIZE_WHITESPACE
X A(quarterly) B(quarterly)
id year
0 2010 0 0.531828 0.322959
1 2010 2 0.634401 0.361789
2 2010 2 0.849432 0.228263
0 2011 0 0.724455 0.293714
1 2011 2 0.611024 0.630976
2 2011 2 0.722443 0.092105
0 2010 0 0.548814 0.437587
1 2010 1 0.715189 0.891773
2 2010 1 0.602763 0.963663
0 2011 0 0.544883 0.383442
1 2011 1 0.423655 0.791725
2 2011 1 0.645894 0.528895

If we have many columns, we could also use a regex to find our
stubnames and pass that list on to wide_to_long

>>> stubnames = set([match[0] for match in
df.columns.str.findall('[A-B]\(.*\)').values
if match != [] ])
>>> stubnames = sorted(
... set([match[0] for match in df.columns.str.findall(
... r'[A-B]\(.*\)').values if match != [] ])
... )
>>> list(stubnames)
['B(quarterly)', 'A(quarterly)']
['A(quarterly)', 'B(quarterly)']

Notes
-----
Expand Down Expand Up @@ -1133,7 +1138,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
2 0 0 1

>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
'C': [1, 2, 3]})
... 'C': [1, 2, 3]})

>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
Expand All @@ -1149,7 +1154,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
3 1 0 0
4 1 0 0

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True))
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
Expand Down
24 changes: 13 additions & 11 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,18 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
Examples
--------
>>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
(6.533, 9.7], (0.191, 3.367]]
Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]],
array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))
... # doctest: +ELLIPSIS
([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ...
Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ...

>>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3,
labels=["good","medium","bad"])
>>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]),
... 3, labels=["good", "medium", "bad"])
... # doctest: +SKIP
[good, good, good, medium, bad, good]
Categories (3, object): [good < medium < bad]

>>> pd.cut(np.ones(5), 4, labels=False)
array([1, 1, 1, 1, 1], dtype=int64)
array([1, 1, 1, 1, 1])
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0

Expand Down Expand Up @@ -182,15 +182,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
Examples
--------
>>> pd.qcut(range(5), 4)
[[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]]
Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]]
... # doctest: +ELLIPSIS
[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...

>>> pd.qcut(range(5), 3, labels=["good","medium","bad"])
>>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
... # doctest: +SKIP
[good, good, medium, bad, bad]
Categories (3, object): [good < medium < bad]

>>> pd.qcut(range(5), 4, labels=False)
array([0, 0, 1, 2, 3], dtype=int64)
array([0, 0, 1, 2, 3])
"""
x_is_series, series_index, name, x = _preprocess_for_cut(x)

Expand Down