Irregular errors when reading certain categorical strings from hdf (original) (raw)

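Something goes wrong when a categorical Series contains both a string with non-ASCII characters AND the empty string: round-tripping it through HDF (`to_hdf` with `format='table'`, then `read_hdf`) raises an error, while either kind of string on its own works fine: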

# -*- coding: latin-1 -*-
import pandas
import os

# Categorical series to round-trip. The value lists combine non-ASCII
# strings and the empty string in different ways so that the failing
# combination can be isolated.
examples = [
    pandas.Series(values, dtype='category')
    for values in (
        ['EÉ, 17', '', 'a', 'b', 'c'],    # non-ASCII + empty string
        ['EÉ, 17', 'a', 'b', 'c'],        # non-ASCII only
        ['', 'a', 'b', 'c'],              # empty string only
        ['EE, 17', '', 'a', 'b', 'c'],    # ASCII + empty string
        ['øü', 'a', 'b', 'c'],            # non-ASCII only
        ['Aøü', '', 'a', 'b', 'c'],       # non-ASCII + empty string
        ['EÉ, 17', 'øü', 'a', 'b', 'c'],  # two non-ASCII strings
    )
]

def test_hdf(s):
    """Round-trip ``s`` through an HDF5 file stored in table format.

    ``s`` is written with ``s.to_hdf(..., key='data', format='table')`` and
    immediately read back with ``pandas.read_hdf``.

    The file lives in a private temporary directory, so the call never
    deletes or overwrites an existing ``testhdf.h5`` in the current working
    directory, and nothing is left on disk afterwards, even when the write
    or the read raises.

    Parameters
    ----------
    s
        Object exposing ``to_hdf(path, key=..., format=...)``; this script
        passes categorical ``pandas.Series`` instances.

    Returns
    -------
    The object ``pandas.read_hdf`` yields for the key ``'data'``.

    Raises
    ------
    Any exception raised by ``to_hdf`` or ``read_hdf`` propagates unchanged.
    """
    # Function-scope imports keep this block self-contained.
    import shutil
    import tempfile

    # mkdtemp + try/finally rather than TemporaryDirectory so the snippet
    # also runs on interpreters that lack the latter.
    tmp_dir = tempfile.mkdtemp()
    try:
        path = os.path.join(tmp_dir, 'testhdf.h5')
        # The key is passed by keyword: newer pandas releases deprecate
        # positional arguments after the path.
        s.to_hdf(path, key='data', format='table')
        return pandas.read_hdf(path, key='data')
    finally:
        # Best-effort cleanup; never mask an error from the round trip.
        shutil.rmtree(tmp_dir, ignore_errors=True)

# Round-trip every example and print one tab-separated line per series:
# its index, 'pass' or 'fail', the series values, and the exception raised
# (an empty string when the round trip succeeded).
for index, series in enumerate(examples):
    try:
        test_hdf(series)
    except Exception as exc:
        # Deliberately broad: any failure is reported, not propagated.
        outcome, error = 'fail', exc
    else:
        outcome, error = 'pass', ''
    print('%d: %s\t%s\t%s' % (index, outcome, series.tolist(), error))

Results in:

    0: fail ['EÉ, 17', '', 'a', 'b', 'c']   Categorical categories must be unique
    1: pass ['EÉ, 17', 'a', 'b', 'c']
    2: pass ['', 'a', 'b', 'c']
    3: pass ['EE, 17', '', 'a', 'b', 'c']
    4: pass ['øü', 'a', 'b', 'c']
    5: fail ['Aøü', '', 'a', 'b', 'c']  Categorical categories must be unique
    6: pass ['EÉ, 17', 'øü', 'a', 'b', 'c']

I am not sure whether I am using this incorrectly or whether this is a genuine corner-case bug.