1
- # Copyright (c) 2018, NVIDIA CORPORATION.
1
+ # Copyright (c) 2018-2019 , NVIDIA CORPORATION.
2
2
3
3
from __future__ import print_function , division
4
4
29
29
30
30
from librmm_cffi import librmm as rmm
31
31
32
+ import cudf
32
33
from cudf import formatting
33
34
from cudf .utils import cudautils , queryutils , applyutils , utils , ioutils
34
35
from cudf .dataframe .index import as_index , Index , RangeIndex
@@ -224,10 +225,14 @@ def __getitem__(self, arg):
224
225
>>> print(df[[True, False, True, False]]) # mask the entire dataframe,
225
226
# returning the rows specified in the boolean mask
226
227
"""
228
+ if isinstance (self .columns , cudf .dataframe .multiindex .MultiIndex ) and \
229
+ isinstance (arg , tuple ):
230
+ return self .columns ._get_column_major (self , arg )
227
231
if isinstance (arg , str ) or isinstance (arg , numbers .Integral ) or \
228
232
isinstance (arg , tuple ):
229
233
s = self ._cols [arg ]
230
234
s .name = arg
235
+ s .index = self .index
231
236
return s
232
237
elif isinstance (arg , slice ):
233
238
df = DataFrame ()
@@ -247,7 +252,7 @@ def __getitem__(self, arg):
247
252
index = self .index .take (selinds .to_gpu_array ())
248
253
for col in self ._cols :
249
254
df [col ] = Series (self ._cols [col ][arg ], index = index )
250
- df .set_index (index )
255
+ df = df .set_index (index )
251
256
else :
252
257
for col in arg :
253
258
df [col ] = self [col ]
@@ -272,7 +277,6 @@ def mask(self, other):
272
277
def __setitem__ (self , name , col ):
273
278
"""Add/set column by *name or DataFrame*
274
279
"""
275
- # div[div < 0] = 0
276
280
if isinstance (name , DataFrame ):
277
281
for col_name in self ._cols :
278
282
mask = name [col_name ]
@@ -399,6 +403,11 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET):
399
403
>>> df.to_string()
400
404
' key val\\ n0 0 10.0\\ n1 1 11.0\\ n2 2 12.0'
401
405
"""
406
+ if isinstance (self .index , cudf .dataframe .multiindex .MultiIndex ) or \
407
+ isinstance (self .columns , cudf .dataframe .multiindex .MultiIndex ):
408
+ raise TypeError ("You're trying to print a DataFrame that contains "
409
+ "a MultiIndex. Print this dataframe with "
410
+ ".to_pandas()" )
402
411
if nrows is NOTSET :
403
412
nrows = settings .formatting .get ('nrows' )
404
413
if ncols is NOTSET :
@@ -420,9 +429,12 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET):
420
429
# Prepare cells
421
430
cols = OrderedDict ()
422
431
dtypes = OrderedDict ()
423
- use_cols = list (self .columns [:ncols - 1 ])
424
- if ncols > 0 :
425
- use_cols .append (self .columns [- 1 ])
432
+ if hasattr (self , 'multi_cols' ):
433
+ use_cols = list (range (len (self .columns )))
434
+ else :
435
+ use_cols = list (self .columns [:ncols - 1 ])
436
+ if ncols > 0 :
437
+ use_cols .append (self .columns [- 1 ])
426
438
427
439
for h in use_cols :
428
440
cols [h ] = self [h ].values_to_string (nrows = nrows )
@@ -664,19 +676,41 @@ def iloc(self):
664
676
def columns(self):
    """Return the column labels of this DataFrame.

    If a ``multi_cols`` attribute has been stored on the instance (the
    ``columns`` setter does this when it is handed an ``Index``), that
    object is returned unchanged. Otherwise a ``pandas.Index`` is built
    from the keys of ``self._cols``.
    """
    try:
        return self.multi_cols
    except AttributeError:
        # No Index-style labels were assigned; expose the stored names.
        return pd.Index(self._cols)
668
683
669
684
@columns .setter
670
685
def columns(self, columns):
    """Assign new column labels.

    Parameters
    ----------
    columns : Index or sequence of labels
        An ``Index`` instance is stored as-is in ``self.multi_cols`` and
        is what the ``columns`` getter returns afterwards; the underlying
        columns are not renamed. Any other value is treated as a flat
        sequence of replacement names and handed to ``_rename_columns``.

    Raises
    ------
    ValueError
        If ``columns`` is an ``Index`` whose length differs from the
        current number of column labels. ``_rename_columns`` raises the
        same exception type for a flat sequence of the wrong length.
    """
    if isinstance(columns, Index):
        n_current = len(self.columns)
        n_new = len(columns)
        if n_new != n_current:
            # Plain %-formatting: the message has no f-string fields.
            raise ValueError(
                "Length mismatch: Expected axis has %d elements, "
                "new values have %d elements" % (n_current, n_new)
            )
        self.multi_cols = columns
    else:
        # Flat names supersede any previously stored Index labels, so the
        # getter must stop returning the stale ``multi_cols`` object.
        if hasattr(self, 'multi_cols'):
            delattr(self, 'multi_cols')
        self._rename_columns(columns)
703
+
704
def _rename_columns(self, new_names):
    """Rename every existing column, in order, to the labels in *new_names*.

    Parameters
    ----------
    new_names : sized iterable of labels
        Replacement names, paired positionally with the current keys of
        ``self._cols``.

    Raises
    ------
    ValueError
        If the number of new names differs from the number of existing
        columns.
    """
    old_cols = list(self._cols.keys())
    l_old_cols = len(old_cols)
    l_new_cols = len(new_names)

    if l_new_cols != l_old_cols:
        # Both literals need the f prefix: implicit concatenation does not
        # extend it, so a plain second half would print "{l_old_cols}"
        # verbatim instead of the count.
        raise ValueError(
            f'Length of new column names: {l_new_cols} does not '
            f'match length of previous column names: {l_old_cols}'
        )

    # Positional old-name -> new-name mapping applied in place.
    mapper = dict(zip(old_cols, new_names))
    self.rename(mapper=mapper, inplace=True)
681
715
682
716
@property
@@ -687,12 +721,26 @@ def index(self):
687
721
688
722
@index .setter
689
723
def index (self , _index ):
724
+ if isinstance (_index , cudf .dataframe .multiindex .MultiIndex ):
725
+ if len (_index ) != len (self [self .columns [0 ]]):
726
+ msg = f"Length mismatch: Expected axis has " \
727
+ "%d elements, new values " \
728
+ "have %d elements" \
729
+ % (len (self [self .columns [0 ]]), len (_index ))
730
+ raise ValueError (msg )
731
+ self ._index = _index
732
+ for k in self .columns :
733
+ self [k ].index = _index
734
+ return
735
+
690
736
new_length = len (_index )
691
737
old_length = len (self ._index )
692
738
693
739
if new_length != old_length :
694
- msg = f'Length mismatch: Expected index has { old_length } ' \
695
- ' elements, new values have {new_length} elements'
740
+ msg = f"Length mismatch: Expected axis has " \
741
+ "%d elements, new values " \
742
+ "have %d elements" \
743
+ % (old_length , new_length )
696
744
raise ValueError (msg )
697
745
698
746
# try to build an index from generic _index
@@ -906,8 +954,8 @@ def drop(self, labels, axis=None):
906
954
if axis == 0 :
907
955
raise NotImplementedError ("Can only drop columns, not rows" )
908
956
909
- columns = [labels ] if isinstance (labels , str ) else list ( labels )
910
-
957
+ columns = [labels ] if isinstance (
958
+ labels , ( str , numbers . Number )) else list ( labels )
911
959
outdf = self .copy ()
912
960
for c in columns :
913
961
outdf ._drop_column (c )
@@ -2240,6 +2288,13 @@ def to_pandas(self):
2240
2288
out = pd .DataFrame (index = index )
2241
2289
for c , x in self ._cols .items ():
2242
2290
out [c ] = x .to_pandas (index = index )
2291
+ if isinstance (self .columns , Index ):
2292
+ out .columns = self .columns
2293
+ if isinstance (self .columns , cudf .dataframe .multiindex .MultiIndex ):
2294
+ if self .columns .names is not None :
2295
+ out .columns .names = self .columns .names
2296
+ else :
2297
+ out .columns .name = self .columns .name
2243
2298
return out
2244
2299
2245
2300
@classmethod
@@ -2269,7 +2324,12 @@ def from_pandas(cls, dataframe, nan_as_null=True):
2269
2324
vals = dataframe [colk ].values
2270
2325
df [colk ] = Series (vals , nan_as_null = nan_as_null )
2271
2326
# Set index
2272
- return df .set_index (dataframe .index )
2327
+ if isinstance (dataframe .index , pd .MultiIndex ):
2328
+ import cudf
2329
+ index = cudf .from_pandas (dataframe .index )
2330
+ else :
2331
+ index = dataframe .index
2332
+ return df .set_index (index )
2273
2333
2274
2334
def to_arrow (self , preserve_index = True ):
2275
2335
"""
@@ -2696,6 +2756,13 @@ def __getitem__(self, arg):
2696
2756
row_slice = None
2697
2757
row_label = None
2698
2758
2759
+ if isinstance (self ._df .index , cudf .dataframe .multiindex .MultiIndex )\
2760
+ and isinstance (arg , tuple ): # noqa: E501
2761
+ # Explicitly ONLY support tuple indexes into MultiIndex.
2762
+ # Pandas allows non tuple indices and warns "results may be
2763
+ # undefined."
2764
+ return self ._df ._index ._get_row_major (self ._df , arg )
2765
+
2699
2766
if isinstance (arg , int ):
2700
2767
if arg < 0 or arg >= len (self ._df ):
2701
2768
raise IndexError ("label scalar %s is out of bound" % arg )
@@ -2785,7 +2852,9 @@ def __setitem__(self, key, value):
2785
2852
2786
2853
def from_pandas (obj ):
2787
2854
"""
2788
- Convert a Pandas DataFrame or Series object into the cudf equivalent
2855
+ Convert certain Pandas objects into the cudf equivalent.
2856
+
2857
+ Supports DataFrame, Series, or MultiIndex.
2789
2858
2790
2859
Raises
2791
2860
------
@@ -2804,9 +2873,12 @@ def from_pandas(obj):
2804
2873
return DataFrame .from_pandas (obj )
2805
2874
elif isinstance (obj , pd .Series ):
2806
2875
return Series .from_pandas (obj )
2876
+ elif isinstance (obj , pd .MultiIndex ):
2877
+ return cudf .dataframe .multiindex .MultiIndex .from_pandas (obj )
2807
2878
else :
2808
2879
raise TypeError (
2809
- "from_pandas only accepts Pandas Dataframes and Series objects. "
2880
+ "from_pandas only accepts Pandas Dataframes, Series, and "
2881
+ "MultiIndex objects. "
2810
2882
"Got %s" % type (obj )
2811
2883
)
2812
2884
0 commit comments