clean/reorg tests · mattip/pandas@db3c6e4 (original) (raw)
`@@ -3706,292 +3706,6 @@ def test_index_label_overlaps_location(self):
`
3706
3706
`expected = ser.take([1, 3, 4])
`
3707
3707
`assert_series_equal(actual, expected)
`
3708
3708
``
3709
``
`-
def test_groupby_selection_with_methods(self):
`
3710
``
`-
# some methods which require DatetimeIndex
`
3711
``
`-
rng = pd.date_range('2014', periods=len(self.df))
`
3712
``
`-
self.df.index = rng
`
3713
``
-
3714
``
`-
g = self.df.groupby(['A'])[['C']]
`
3715
``
`-
g_exp = self.df[['C']].groupby(self.df['A'])
`
3716
``
`-
# TODO check groupby with > 1 col ?
`
3717
``
-
3718
``
`-
# methods which are called as .foo()
`
3719
``
`-
methods = ['count',
`
3720
``
`-
'corr',
`
3721
``
`-
'cummax',
`
3722
``
`-
'cummin',
`
3723
``
`-
'cumprod',
`
3724
``
`-
'describe',
`
3725
``
`-
'rank',
`
3726
``
`-
'quantile',
`
3727
``
`-
'diff',
`
3728
``
`-
'shift',
`
3729
``
`-
'all',
`
3730
``
`-
'any',
`
3731
``
`-
'idxmin',
`
3732
``
`-
'idxmax',
`
3733
``
`-
'ffill',
`
3734
``
`-
'bfill',
`
3735
``
`-
'pct_change',
`
3736
``
`-
'tshift']
`
3737
``
-
3738
``
`-
for m in methods:
`
3739
``
`-
res = getattr(g, m)()
`
3740
``
`-
exp = getattr(g_exp, m)()
`
3741
``
`-
assert_frame_equal(res, exp) # should always be frames!
`
3742
``
-
3743
``
`-
# methods which aren't just .foo()
`
3744
``
`-
assert_frame_equal(g.fillna(0), g_exp.fillna(0))
`
3745
``
`-
assert_frame_equal(g.dtypes, g_exp.dtypes)
`
3746
``
`-
assert_frame_equal(g.apply(lambda x: x.sum()),
`
3747
``
`-
g_exp.apply(lambda x: x.sum()))
`
3748
``
-
3749
``
`-
assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean())
`
3750
``
`-
assert_frame_equal(g.resample('D').ohlc(),
`
3751
``
`-
g_exp.resample('D').ohlc())
`
3752
``
-
3753
``
`-
assert_frame_equal(g.filter(lambda x: len(x) == 3),
`
3754
``
`-
g_exp.filter(lambda x: len(x) == 3))
`
3755
``
-
3756
``
`-
# The methods returned by these attributes don't have a __name__ attribute
`
3757
``
`-
# that matches that attribute.
`
3758
``
`-
# TODO: Fix these inconsistencies
`
3759
``
`-
DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([
`
3760
``
`-
'boxplot',
`
3761
``
`-
'bfill',
`
3762
``
`-
'ffill'
`
3763
``
`-
])
`
3764
``
`-
S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([
`
3765
``
`-
'bfill',
`
3766
``
`-
'ffill'
`
3767
``
`-
])
`
3768
``
-
3769
``
`-
def test_groupby_whitelist(self):
`
3770
``
`-
from string import ascii_lowercase
`
3771
``
`-
letters = np.array(list(ascii_lowercase))
`
3772
``
`-
N = 10
`
3773
``
`-
random_letters = letters.take(np.random.randint(0, 26, N))
`
3774
``
`-
df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
`
3775
``
`-
'letters': Series(random_letters)})
`
3776
``
`-
s = df.floats
`
3777
``
-
3778
``
`-
df_whitelist = frozenset([
`
3779
``
`-
'last',
`
3780
``
`-
'first',
`
3781
``
`-
'mean',
`
3782
``
`-
'sum',
`
3783
``
`-
'min',
`
3784
``
`-
'max',
`
3785
``
`-
'head',
`
3786
``
`-
'tail',
`
3787
``
`-
'cumcount',
`
3788
``
`-
'resample',
`
3789
``
`-
'rank',
`
3790
``
`-
'quantile',
`
3791
``
`-
'fillna',
`
3792
``
`-
'mad',
`
3793
``
`-
'any',
`
3794
``
`-
'all',
`
3795
``
`-
'take',
`
3796
``
`-
'idxmax',
`
3797
``
`-
'idxmin',
`
3798
``
`-
'shift',
`
3799
``
`-
'tshift',
`
3800
``
`-
'ffill',
`
3801
``
`-
'bfill',
`
3802
``
`-
'pct_change',
`
3803
``
`-
'skew',
`
3804
``
`-
'plot',
`
3805
``
`-
'boxplot',
`
3806
``
`-
'hist',
`
3807
``
`-
'median',
`
3808
``
`-
'dtypes',
`
3809
``
`-
'corrwith',
`
3810
``
`-
'corr',
`
3811
``
`-
'cov',
`
3812
``
`-
'diff',
`
3813
``
`-
])
`
3814
``
`-
s_whitelist = frozenset([
`
3815
``
`-
'last',
`
3816
``
`-
'first',
`
3817
``
`-
'mean',
`
3818
``
`-
'sum',
`
3819
``
`-
'min',
`
3820
``
`-
'max',
`
3821
``
`-
'head',
`
3822
``
`-
'tail',
`
3823
``
`-
'cumcount',
`
3824
``
`-
'resample',
`
3825
``
`-
'rank',
`
3826
``
`-
'quantile',
`
3827
``
`-
'fillna',
`
3828
``
`-
'mad',
`
3829
``
`-
'any',
`
3830
``
`-
'all',
`
3831
``
`-
'take',
`
3832
``
`-
'idxmax',
`
3833
``
`-
'idxmin',
`
3834
``
`-
'shift',
`
3835
``
`-
'tshift',
`
3836
``
`-
'ffill',
`
3837
``
`-
'bfill',
`
3838
``
`-
'pct_change',
`
3839
``
`-
'skew',
`
3840
``
`-
'plot',
`
3841
``
`-
'hist',
`
3842
``
`-
'median',
`
3843
``
`-
'dtype',
`
3844
``
`-
'corr',
`
3845
``
`-
'cov',
`
3846
``
`-
'diff',
`
3847
``
`-
'unique',
`
3848
``
`-
'nlargest',
`
3849
``
`-
'nsmallest',
`
3850
``
`-
])
`
3851
``
-
3852
``
`-
names_dont_match_pair = (
`
3853
``
`-
self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE,
`
3854
``
`-
self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE)
`
3855
``
`-
for obj, whitelist, names_dont_match in (
`
3856
``
`-
zip((df, s),
`
3857
``
`-
(df_whitelist, s_whitelist),
`
3858
``
`-
names_dont_match_pair)):
`
3859
``
-
3860
``
`-
gb = obj.groupby(df.letters)
`
3861
``
-
3862
``
`-
assert whitelist == gb._apply_whitelist
`
3863
``
`-
for m in whitelist:
`
3864
``
`-
f = getattr(type(gb), m)
`
3865
``
-
3866
``
`-
# name
`
3867
``
`-
try:
`
3868
``
`-
n = f.__name__
`
3869
``
`-
except AttributeError:
`
3870
``
`-
continue
`
3871
``
`-
if m not in names_dont_match:
`
3872
``
`-
assert n == m
`
3873
``
-
3874
``
`-
# qualname
`
3875
``
`-
if compat.PY3:
`
3876
``
`-
try:
`
3877
``
`-
n = f.__qualname__
`
3878
``
`-
except AttributeError:
`
3879
``
`-
continue
`
3880
``
`-
if m not in names_dont_match:
`
3881
``
`-
assert n.endswith(m)
`
3882
``
-
3883
``
`-
def test_groupby_method_names_that_dont_match_attribute(self):
`
3884
``
`-
from string import ascii_lowercase
`
3885
``
`-
letters = np.array(list(ascii_lowercase))
`
3886
``
`-
N = 10
`
3887
``
`-
random_letters = letters.take(np.random.randint(0, 26, N))
`
3888
``
`-
df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
`
3889
``
`-
'letters': Series(random_letters)})
`
3890
``
`-
gb = df.groupby(df.letters)
`
3891
``
`-
s = df.floats
`
3892
``
-
3893
``
`-
names_dont_match_pair = (
`
3894
``
`-
self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE,
`
3895
``
`-
self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE)
`
3896
``
`-
for obj, names_dont_match in zip((df, s), names_dont_match_pair):
`
3897
``
`-
gb = obj.groupby(df.letters)
`
3898
``
`-
for m in names_dont_match:
`
3899
``
`-
f = getattr(gb, m)
`
3900
``
`-
self.assertNotEqual(f.__name__, m)
`
3901
``
-
3902
``
`-
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
`
3903
``
`-
'mad', 'std', 'var', 'sem']
`
3904
``
`-
AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']
`
3905
``
-
3906
``
`-
def test_regression_whitelist_methods(self):
`
3907
``
-
3908
``
`-
# GH6944
`
3909
``
`-
# explicity test the whitelest methods
`
3910
``
`-
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
`
3911
``
`-
'three']],
`
3912
``
`-
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
`
3913
``
`-
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
`
3914
``
`-
names=['first', 'second'])
`
3915
``
`-
raw_frame = DataFrame(np.random.randn(10, 3), index=index,
`
3916
``
`-
columns=Index(['A', 'B', 'C'], name='exp'))
`
3917
``
`-
raw_frame.iloc[1, [1, 2]] = np.nan
`
3918
``
`-
raw_frame.iloc[7, [0, 1]] = np.nan
`
3919
``
-
3920
``
`-
for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
`
3921
``
`-
lrange(2), lrange(2),
`
3922
``
`-
[True, False]):
`
3923
``
-
3924
``
`-
if axis == 0:
`
3925
``
`-
frame = raw_frame
`
3926
``
`-
else:
`
3927
``
`-
frame = raw_frame.T
`
3928
``
-
3929
``
`-
if op in self.AGG_FUNCTIONS_WITH_SKIPNA:
`
3930
``
`-
grouped = frame.groupby(level=level, axis=axis)
`
3931
``
`-
result = getattr(grouped, op)(skipna=skipna)
`
3932
``
`-
expected = getattr(frame, op)(level=level, axis=axis,
`
3933
``
`-
skipna=skipna)
`
3934
``
`-
assert_frame_equal(result, expected)
`
3935
``
`-
else:
`
3936
``
`-
grouped = frame.groupby(level=level, axis=axis)
`
3937
``
`-
result = getattr(grouped, op)()
`
3938
``
`-
expected = getattr(frame, op)(level=level, axis=axis)
`
3939
``
`-
assert_frame_equal(result, expected)
`
3940
``
-
3941
``
`-
def test_groupby_blacklist(self):
`
3942
``
`-
from string import ascii_lowercase
`
3943
``
`-
letters = np.array(list(ascii_lowercase))
`
3944
``
`-
N = 10
`
3945
``
`-
random_letters = letters.take(np.random.randint(0, 26, N))
`
3946
``
`-
df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),
`
3947
``
`-
'letters': Series(random_letters)})
`
3948
``
`-
s = df.floats
`
3949
``
-
3950
``
`-
blacklist = [
`
3951
``
`-
'eval', 'query', 'abs', 'where',
`
3952
``
`-
'mask', 'align', 'groupby', 'clip', 'astype',
`
3953
``
`-
'at', 'combine', 'consolidate', 'convert_objects',
`
3954
``
`-
]
`
3955
``
`-
to_methods = [method for method in dir(df) if method.startswith('to_')]
`
3956
``
-
3957
``
`-
blacklist.extend(to_methods)
`
3958
``
-
3959
``
`-
# e.g., to_csv
`
3960
``
`-
defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "
`
3961
``
`-
"'apply' method$)")
`
3962
``
-
3963
``
`-
# e.g., query, eval
`
3964
``
`-
not_defined = "(?:^{1!r} object has no attribute {0!r}$)"
`
3965
``
`-
fmt = defined_but_not_allowed + '|' + not_defined
`
3966
``
`-
for bl in blacklist:
`
3967
``
`-
for obj in (df, s):
`
3968
``
`-
gb = obj.groupby(df.letters)
`
3969
``
`-
msg = fmt.format(bl, type(gb).__name__)
`
3970
``
`-
with tm.assertRaisesRegexp(AttributeError, msg):
`
3971
``
`-
getattr(gb, bl)
`
3972
``
-
3973
``
`-
def test_tab_completion(self):
`
3974
``
`-
grp = self.mframe.groupby(level='second')
`
3975
``
`-
results = set([v for v in dir(grp) if not v.startswith('_')])
`
3976
``
`-
expected = set(
`
3977
``
`-
['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',
`
3978
``
`-
'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
`
3979
``
`-
'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
`
3980
``
`-
'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
`
3981
``
`-
'nunique', 'head', 'describe', 'cummax', 'quantile',
`
3982
``
`-
'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
`
3983
``
`-
'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
`
3984
``
`-
'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
`
3985
``
`-
'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
`
3986
``
`-
'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
`
3987
``
`-
self.assertEqual(results, expected)
`
3988
``
-
3989
``
`-
def test_groupby_function_rename(self):
`
3990
``
`-
grp = self.mframe.groupby(level='second')
`
3991
``
`-
for name in ['sum', 'prod', 'min', 'max', 'first', 'last']:
`
3992
``
`-
f = getattr(grp, name)
`
3993
``
`-
self.assertEqual(f.__name__, name)
`
3994
``
-
3995
3709
`def test_lower_int_prec_count(self):
`
3996
3710
`df = DataFrame({'a': np.array(
`
3997
3711
` [0, 1, 2, 100], np.int8),
`