ENH: IntervalIndex by jreback · Pull Request #15309 · pandas-dev/pandas (original) (raw)
reprise of #8707
- redid the construction implementation a bit; it now uses the proper Index calling conventions (as well as other sub-classing conventions).
- more testing, including all generic index tests (but it probably still needs some more)
- add ._mask to track internal NaNs (it's lazily computed); in other words, you can have null entries in the index.
- using tempita, so have intervalindex.pxi.in
- the extension is now housed in _interval, rather than being part of lib
- IntervalDtype is fully fledged
In [11]: df = DataFrame({'A': range(10)})
...: s = pd.cut(df.A, 5)
...: df['B'] = s
...: df['C'] = np.array(s)
...: df2 = df.set_index('B')
...: df3 = df.set_index('C')
...:
...:
In [12]: df
Out[12]:
A B C
0 0 (-0.009, 1.8] (-0.009, 1.8]
1 1 (-0.009, 1.8] (-0.009, 1.8]
2 2 (1.8, 3.6] (1.8, 3.6]
3 3 (1.8, 3.6] (1.8, 3.6]
4 4 (3.6, 5.4] (3.6, 5.4]
5 5 (3.6, 5.4] (3.6, 5.4]
6 6 (5.4, 7.2] (5.4, 7.2]
7 7 (5.4, 7.2] (5.4, 7.2]
8 8 (7.2, 9.0] (7.2, 9.0]
9 9 (7.2, 9.0] (7.2, 9.0]
In [13]: df.dtypes
Out[13]:
A int64
B category
C object
dtype: object
In [14]: df.C.values
Out[14]:
array([Interval(-0.0089999999999999993, 1.8, closed='right'),
Interval(-0.0089999999999999993, 1.8, closed='right'),
Interval(1.8, 3.6000000000000001, closed='right'),
Interval(1.8, 3.6000000000000001, closed='right'),
Interval(3.6000000000000001, 5.4000000000000004, closed='right'),
Interval(3.6000000000000001, 5.4000000000000004, closed='right'),
Interval(5.4000000000000004, 7.2000000000000002, closed='right'),
Interval(5.4000000000000004, 7.2000000000000002, closed='right'),
Interval(7.2000000000000002, 9.0, closed='right'),
Interval(7.2000000000000002, 9.0, closed='right')], dtype=object)
Similar indices
In [6]: df3.index
Out[6]:
IntervalIndex(left=[-0.009, -0.009, 1.8, 1.8, 3.6, 3.6, 5.4, 5.4, 7.2, 7.2],
right=[1.8, 1.8, 3.6, 3.6, 5.4, 5.4, 7.2, 7.2, 9.0, 9.0],
closed='right',
name='C',
dtype='interval[float64]')
In [7]: df2.index
Out[7]: CategoricalIndex([(-0.009, 1.8], (-0.009, 1.8], (1.8, 3.6], (1.8, 3.6], (3.6, 5.4], (3.6, 5.4], (5.4, 7.2], (5.4, 7.2], (7.2, 9.0], (7.2, 9.0]], categories=[(-0.009, 1.8], (1.8, 3.6], (3.6, 5.4], (5.4, 7.2], (7.2, 9.0]], ordered=True, name='B', dtype='category')
indexing
In [2]: df2
Out[2]:
A C
B
(-0.009, 1.8] 0 (-0.009, 1.8]
(-0.009, 1.8] 1 (-0.009, 1.8]
(1.8, 3.6] 2 (1.8, 3.6]
(1.8, 3.6] 3 (1.8, 3.6]
(3.6, 5.4] 4 (3.6, 5.4]
(3.6, 5.4] 5 (3.6, 5.4]
(5.4, 7.2] 6 (5.4, 7.2]
(5.4, 7.2] 7 (5.4, 7.2]
(7.2, 9.0] 8 (7.2, 9.0]
(7.2, 9.0] 9 (7.2, 9.0]
In [3]: df2.loc[[2, 5]]
Out[3]:
A C
B
(1.8, 3.6] 2 (1.8, 3.6]
(1.8, 3.6] 3 (1.8, 3.6]
(3.6, 5.4] 4 (3.6, 5.4]
(3.6, 5.4] 5 (3.6, 5.4]
# indexing with the IntervalIndex (df3) works similarly
In [5]: df3.loc[[2, 5]]
Out[5]:
A B
C
(1.8, 3.6] 2 (1.8, 3.6]
(1.8, 3.6] 3 (1.8, 3.6]
(3.6, 5.4] 4 (3.6, 5.4]
(3.6, 5.4] 5 (3.6, 5.4]