ENH: Intervalindex by jreback · Pull Request #15309 · pandas-dev/pandas (original) (raw)

closes #7640
closes #8625

reprise of #8707

redid the construction implementation a bit; it now uses the proper Index calling conventions (as well as other sub-classing conventions).
more testing, including all generic index tests (but it probably still needs some more)
add ._mask to track internal nans (it's lazily computed); IOW, you can have null entries in the index.
using tempita, so have intervalindex.pxi.in
the extension is housed in _interval now, rather than being a part of lib
IntervalDtype is full-fledged

In [11]: df = DataFrame({'A': range(10)})
    ...: s = pd.cut(df.A, 5)
    ...: df['B'] = s
    ...: df['C'] = np.array(s)
    ...: df2 = df.set_index('B')
    ...: df3 = df.set_index('C')
    ...: 
    ...: 

In [12]: df
Out[12]: 
   A              B              C
0  0  (-0.009, 1.8]  (-0.009, 1.8]
1  1  (-0.009, 1.8]  (-0.009, 1.8]
2  2     (1.8, 3.6]     (1.8, 3.6]
3  3     (1.8, 3.6]     (1.8, 3.6]
4  4     (3.6, 5.4]     (3.6, 5.4]
5  5     (3.6, 5.4]     (3.6, 5.4]
6  6     (5.4, 7.2]     (5.4, 7.2]
7  7     (5.4, 7.2]     (5.4, 7.2]
8  8     (7.2, 9.0]     (7.2, 9.0]
9  9     (7.2, 9.0]     (7.2, 9.0]

In [13]: df.dtypes
Out[13]: 
A       int64
B    category
C      object
dtype: object

In [14]: df.C.values
Out[14]: 
array([Interval(-0.0089999999999999993, 1.8, closed='right'),
       Interval(-0.0089999999999999993, 1.8, closed='right'),
       Interval(1.8, 3.6000000000000001, closed='right'),
       Interval(1.8, 3.6000000000000001, closed='right'),
       Interval(3.6000000000000001, 5.4000000000000004, closed='right'),
       Interval(3.6000000000000001, 5.4000000000000004, closed='right'),
       Interval(5.4000000000000004, 7.2000000000000002, closed='right'),
       Interval(5.4000000000000004, 7.2000000000000002, closed='right'),
       Interval(7.2000000000000002, 9.0, closed='right'),
       Interval(7.2000000000000002, 9.0, closed='right')], dtype=object)

Similar indices

In [6]: df3.index
Out[6]: 
IntervalIndex(left=[-0.009, -0.009, 1.8, 1.8, 3.6, 3.6, 5.4, 5.4, 7.2, 7.2],
              right=[1.8, 1.8, 3.6, 3.6, 5.4, 5.4, 7.2, 7.2, 9.0, 9.0],
              closed='right',
              name='C',
              dtype='interval[float64]')

In [7]: df2.index
Out[7]: CategoricalIndex([(-0.009, 1.8], (-0.009, 1.8], (1.8, 3.6], (1.8, 3.6], (3.6, 5.4], (3.6, 5.4], (5.4, 7.2], (5.4, 7.2], (7.2, 9.0], (7.2, 9.0]], categories=[(-0.009, 1.8], (1.8, 3.6], (3.6, 5.4], (5.4, 7.2], (7.2, 9.0]], ordered=True, name='B', dtype='category')

indexing

In [2]: df2
Out[2]: 
               A              C
B                              
(-0.009, 1.8]  0  (-0.009, 1.8]
(-0.009, 1.8]  1  (-0.009, 1.8]
(1.8, 3.6]     2     (1.8, 3.6]
(1.8, 3.6]     3     (1.8, 3.6]
(3.6, 5.4]     4     (3.6, 5.4]
(3.6, 5.4]     5     (3.6, 5.4]
(5.4, 7.2]     6     (5.4, 7.2]
(5.4, 7.2]     7     (5.4, 7.2]
(7.2, 9.0]     8     (7.2, 9.0]
(7.2, 9.0]     9     (7.2, 9.0]

In [3]: df2.loc[[2, 5]]
Out[3]: 
            A           C
B                        
(1.8, 3.6]  2  (1.8, 3.6]
(1.8, 3.6]  3  (1.8, 3.6]
(3.6, 5.4]  4  (3.6, 5.4]
(3.6, 5.4]  5  (3.6, 5.4]

# work similarly
In [5]: df3.loc[[2, 5]]
Out[5]: 
            A           B
C                        
(1.8, 3.6]  2  (1.8, 3.6]
(1.8, 3.6]  3  (1.8, 3.6]
(3.6, 5.4]  4  (3.6, 5.4]
(3.6, 5.4]  5  (3.6, 5.4]