clean/reorg tests · mattip/pandas@db3c6e4 (original) (raw)

`@@ -3706,292 +3706,6 @@ def test_index_label_overlaps_location(self):

`

3706

3706

`expected = ser.take([1, 3, 4])

`

3707

3707

`assert_series_equal(actual, expected)

`

3708

3708

``

3709

``

`-

def test_groupby_selection_with_methods(self):

`

3710

``

`-

some methods which require DatetimeIndex

`

3711

``

`-

rng = pd.date_range('2014', periods=len(self.df))

`

3712

``

`-

self.df.index = rng

`

3713

``

-

3714

``

`-

g = self.df.groupby(['A'])[['C']]

`

3715

``

`-

g_exp = self.df[['C']].groupby(self.df['A'])

`

3716

``

`-

TODO check groupby with > 1 col ?

`

3717

``

-

3718

``

`-

methods which are called as .foo()

`

3719

``

`-

methods = ['count',

`

3720

``

`-

'corr',

`

3721

``

`-

'cummax',

`

3722

``

`-

'cummin',

`

3723

``

`-

'cumprod',

`

3724

``

`-

'describe',

`

3725

``

`-

'rank',

`

3726

``

`-

'quantile',

`

3727

``

`-

'diff',

`

3728

``

`-

'shift',

`

3729

``

`-

'all',

`

3730

``

`-

'any',

`

3731

``

`-

'idxmin',

`

3732

``

`-

'idxmax',

`

3733

``

`-

'ffill',

`

3734

``

`-

'bfill',

`

3735

``

`-

'pct_change',

`

3736

``

`-

'tshift']

`

3737

``

-

3738

``

`-

for m in methods:

`

3739

``

`-

res = getattr(g, m)()

`

3740

``

`-

exp = getattr(g_exp, m)()

`

3741

``

`-

assert_frame_equal(res, exp) # should always be frames!

`

3742

``

-

3743

``

`-

methods which aren't just .foo()

`

3744

``

`-

assert_frame_equal(g.fillna(0), g_exp.fillna(0))

`

3745

``

`-

assert_frame_equal(g.dtypes, g_exp.dtypes)

`

3746

``

`-

assert_frame_equal(g.apply(lambda x: x.sum()),

`

3747

``

`-

g_exp.apply(lambda x: x.sum()))

`

3748

``

-

3749

``

`-

assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean())

`

3750

``

`-

assert_frame_equal(g.resample('D').ohlc(),

`

3751

``

`-

g_exp.resample('D').ohlc())

`

3752

``

-

3753

``

`-

assert_frame_equal(g.filter(lambda x: len(x) == 3),

`

3754

``

`-

g_exp.filter(lambda x: len(x) == 3))

`

3755

``

-

3756

``

`-

The methods returned by these attributes don't have a __name__ attribute

`

3757

``

`-

that matches that attribute.

`

3758

``

`-

TODO: Fix these inconsistencies

`

3759

``

`-

DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([

`

3760

``

`-

'boxplot',

`

3761

``

`-

'bfill',

`

3762

``

`-

'ffill'

`

3763

``

`-

])

`

3764

``

`-

S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([

`

3765

``

`-

'bfill',

`

3766

``

`-

'ffill'

`

3767

``

`-

])

`

3768

``

-

3769

``

`-

def test_groupby_whitelist(self):

`

3770

``

`-

from string import ascii_lowercase

`

3771

``

`-

letters = np.array(list(ascii_lowercase))

`

3772

``

`-

N = 10

`

3773

``

`-

random_letters = letters.take(np.random.randint(0, 26, N))

`

3774

``

`-

df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),

`

3775

``

`-

'letters': Series(random_letters)})

`

3776

``

`-

s = df.floats

`

3777

``

-

3778

``

`-

df_whitelist = frozenset([

`

3779

``

`-

'last',

`

3780

``

`-

'first',

`

3781

``

`-

'mean',

`

3782

``

`-

'sum',

`

3783

``

`-

'min',

`

3784

``

`-

'max',

`

3785

``

`-

'head',

`

3786

``

`-

'tail',

`

3787

``

`-

'cumcount',

`

3788

``

`-

'resample',

`

3789

``

`-

'rank',

`

3790

``

`-

'quantile',

`

3791

``

`-

'fillna',

`

3792

``

`-

'mad',

`

3793

``

`-

'any',

`

3794

``

`-

'all',

`

3795

``

`-

'take',

`

3796

``

`-

'idxmax',

`

3797

``

`-

'idxmin',

`

3798

``

`-

'shift',

`

3799

``

`-

'tshift',

`

3800

``

`-

'ffill',

`

3801

``

`-

'bfill',

`

3802

``

`-

'pct_change',

`

3803

``

`-

'skew',

`

3804

``

`-

'plot',

`

3805

``

`-

'boxplot',

`

3806

``

`-

'hist',

`

3807

``

`-

'median',

`

3808

``

`-

'dtypes',

`

3809

``

`-

'corrwith',

`

3810

``

`-

'corr',

`

3811

``

`-

'cov',

`

3812

``

`-

'diff',

`

3813

``

`-

])

`

3814

``

`-

s_whitelist = frozenset([

`

3815

``

`-

'last',

`

3816

``

`-

'first',

`

3817

``

`-

'mean',

`

3818

``

`-

'sum',

`

3819

``

`-

'min',

`

3820

``

`-

'max',

`

3821

``

`-

'head',

`

3822

``

`-

'tail',

`

3823

``

`-

'cumcount',

`

3824

``

`-

'resample',

`

3825

``

`-

'rank',

`

3826

``

`-

'quantile',

`

3827

``

`-

'fillna',

`

3828

``

`-

'mad',

`

3829

``

`-

'any',

`

3830

``

`-

'all',

`

3831

``

`-

'take',

`

3832

``

`-

'idxmax',

`

3833

``

`-

'idxmin',

`

3834

``

`-

'shift',

`

3835

``

`-

'tshift',

`

3836

``

`-

'ffill',

`

3837

``

`-

'bfill',

`

3838

``

`-

'pct_change',

`

3839

``

`-

'skew',

`

3840

``

`-

'plot',

`

3841

``

`-

'hist',

`

3842

``

`-

'median',

`

3843

``

`-

'dtype',

`

3844

``

`-

'corr',

`

3845

``

`-

'cov',

`

3846

``

`-

'diff',

`

3847

``

`-

'unique',

`

3848

``

`-

'nlargest',

`

3849

``

`-

'nsmallest',

`

3850

``

`-

])

`

3851

``

-

3852

``

`-

names_dont_match_pair = (

`

3853

``

`-

self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE,

`

3854

``

`-

self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE)

`

3855

``

`-

for obj, whitelist, names_dont_match in (

`

3856

``

`-

zip((df, s),

`

3857

``

`-

(df_whitelist, s_whitelist),

`

3858

``

`-

names_dont_match_pair)):

`

3859

``

-

3860

``

`-

gb = obj.groupby(df.letters)

`

3861

``

-

3862

``

`-

assert whitelist == gb._apply_whitelist

`

3863

``

`-

for m in whitelist:

`

3864

``

`-

f = getattr(type(gb), m)

`

3865

``

-

3866

``

`-

name

`

3867

``

`-

try:

`

3868

``

`-

n = f.__name__

`

3869

``

`-

except AttributeError:

`

3870

``

`-

continue

`

3871

``

`-

if m not in names_dont_match:

`

3872

``

`-

assert n == m

`

3873

``

-

3874

``

`-

qualname

`

3875

``

`-

if compat.PY3:

`

3876

``

`-

try:

`

3877

``

`-

n = f.__qualname__

`

3878

``

`-

except AttributeError:

`

3879

``

`-

continue

`

3880

``

`-

if m not in names_dont_match:

`

3881

``

`-

assert n.endswith(m)

`

3882

``

-

3883

``

`-

def test_groupby_method_names_that_dont_match_attribute(self):

`

3884

``

`-

from string import ascii_lowercase

`

3885

``

`-

letters = np.array(list(ascii_lowercase))

`

3886

``

`-

N = 10

`

3887

``

`-

random_letters = letters.take(np.random.randint(0, 26, N))

`

3888

``

`-

df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),

`

3889

``

`-

'letters': Series(random_letters)})

`

3890

``

`-

gb = df.groupby(df.letters)

`

3891

``

`-

s = df.floats

`

3892

``

-

3893

``

`-

names_dont_match_pair = (

`

3894

``

`-

self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE,

`

3895

``

`-

self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE)

`

3896

``

`-

for obj, names_dont_match in zip((df, s), names_dont_match_pair):

`

3897

``

`-

gb = obj.groupby(df.letters)

`

3898

``

`-

for m in names_dont_match:

`

3899

``

`-

f = getattr(gb, m)

`

3900

``

`-

self.assertNotEqual(f.__name__, m)

`

3901

``

-

3902

``

`-

AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',

`

3903

``

`-

'mad', 'std', 'var', 'sem']

`

3904

``

`-

AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad']

`

3905

``

-

3906

``

`-

def test_regression_whitelist_methods(self):

`

3907

``

-

3908

``

`-

GH6944

`

3909

``

`-

explicitly test the whitelist methods

`

3910

``

`-

index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',

`

3911

``

`-

'three']],

`

3912

``

`-

labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],

`

3913

``

`-

[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],

`

3914

``

`-

names=['first', 'second'])

`

3915

``

`-

raw_frame = DataFrame(np.random.randn(10, 3), index=index,

`

3916

``

`-

columns=Index(['A', 'B', 'C'], name='exp'))

`

3917

``

`-

raw_frame.iloc[1, [1, 2]] = np.nan

`

3918

``

`-

raw_frame.iloc[7, [0, 1]] = np.nan

`

3919

``

-

3920

``

`-

for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,

`

3921

``

`-

lrange(2), lrange(2),

`

3922

``

`-

[True, False]):

`

3923

``

-

3924

``

`-

if axis == 0:

`

3925

``

`-

frame = raw_frame

`

3926

``

`-

else:

`

3927

``

`-

frame = raw_frame.T

`

3928

``

-

3929

``

`-

if op in self.AGG_FUNCTIONS_WITH_SKIPNA:

`

3930

``

`-

grouped = frame.groupby(level=level, axis=axis)

`

3931

``

`-

result = getattr(grouped, op)(skipna=skipna)

`

3932

``

`-

expected = getattr(frame, op)(level=level, axis=axis,

`

3933

``

`-

skipna=skipna)

`

3934

``

`-

assert_frame_equal(result, expected)

`

3935

``

`-

else:

`

3936

``

`-

grouped = frame.groupby(level=level, axis=axis)

`

3937

``

`-

result = getattr(grouped, op)()

`

3938

``

`-

expected = getattr(frame, op)(level=level, axis=axis)

`

3939

``

`-

assert_frame_equal(result, expected)

`

3940

``

-

3941

``

`-

def test_groupby_blacklist(self):

`

3942

``

`-

from string import ascii_lowercase

`

3943

``

`-

letters = np.array(list(ascii_lowercase))

`

3944

``

`-

N = 10

`

3945

``

`-

random_letters = letters.take(np.random.randint(0, 26, N))

`

3946

``

`-

df = DataFrame({'floats': N / 10 * Series(np.random.random(N)),

`

3947

``

`-

'letters': Series(random_letters)})

`

3948

``

`-

s = df.floats

`

3949

``

-

3950

``

`-

blacklist = [

`

3951

``

`-

'eval', 'query', 'abs', 'where',

`

3952

``

`-

'mask', 'align', 'groupby', 'clip', 'astype',

`

3953

``

`-

'at', 'combine', 'consolidate', 'convert_objects',

`

3954

``

`-

]

`

3955

``

`-

to_methods = [method for method in dir(df) if method.startswith('to_')]

`

3956

``

-

3957

``

`-

blacklist.extend(to_methods)

`

3958

``

-

3959

``

`-

e.g., to_csv

`

3960

``

`-

defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the "

`

3961

``

`-

"'apply' method$)")

`

3962

``

-

3963

``

`-

e.g., query, eval

`

3964

``

`-

not_defined = "(?:^{1!r} object has no attribute {0!r}$)"

`

3965

``

`-

fmt = defined_but_not_allowed + '|' + not_defined

`

3966

``

`-

for bl in blacklist:

`

3967

``

`-

for obj in (df, s):

`

3968

``

`-

gb = obj.groupby(df.letters)

`

3969

``

`-

msg = fmt.format(bl, type(gb).__name__)

`

3970

``

`-

with tm.assertRaisesRegexp(AttributeError, msg):

`

3971

``

`-

getattr(gb, bl)

`

3972

``

-

3973

``

`-

def test_tab_completion(self):

`

3974

``

`-

grp = self.mframe.groupby(level='second')

`

3975

``

`-

results = set([v for v in dir(grp) if not v.startswith('_')])

`

3976

``

`-

expected = set(

`

3977

``

`-

['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter',

`

3978

``

`-

'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',

`

3979

``

`-

'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',

`

3980

``

`-

'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',

`

3981

``

`-

'nunique', 'head', 'describe', 'cummax', 'quantile',

`

3982

``

`-

'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',

`

3983

``

`-

'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',

`

3984

``

`-

'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',

`

3985

``

`-

'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',

`

3986

``

`-

'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])

`

3987

``

`-

self.assertEqual(results, expected)

`

3988

``

-

3989

``

`-

def test_groupby_function_rename(self):

`

3990

``

`-

grp = self.mframe.groupby(level='second')

`

3991

``

`-

for name in ['sum', 'prod', 'min', 'max', 'first', 'last']:

`

3992

``

`-

f = getattr(grp, name)

`

3993

``

`-

self.assertEqual(f.__name__, name)

`

3994

``

-

3995

3709

`def test_lower_int_prec_count(self):

`

3996

3710

`df = DataFrame({'a': np.array(

`

3997

3711

` [0, 1, 2, 100], np.int8),

`