Iterating through TableIterator with where clause can incorrectly ignore data · Issue #8014 · pandas-dev/pandas (original) (raw)
Expected behaviour: For an appendable table stored using HDFStore, the summed length of the DataFrames returned by an iterator with a where clause should equal the length of the DataFrame returned using the same where clause but with iterator=False (e.g. via TableIterator.get_values()).
The attached code generates appendable tables of size 100064, 200064, ..., 400064. It uses a where clause whose range is a superset of all stored index values to get DataFrames with iterator=False, with and without the where clause, and with iterator=True, also with and without the where clause. In all cases except iterator=True with the where clause, the length of the returned DataFrames is correct.
For the failure cases, closer inspection in IPython shows that it is the last 64 rows which are not being returned.
Note: in create_file() the appending of DataFrames with lengths of 58689 and 41375 was chosen specifically to reproduce the problem. I originally encountered the problem with a dataset with length 174000064 and the last append was size 41375. I attempted to reproduce the problem by creating various length tables in chunks of 100000 with a final append of 64 and wasn't able to do so.
Creating the table with a last chunk of 41375 rows and a total length exceeding 300000 does, in my tests, reproduce the problem.
Output:
iteration: 0 PASSED
expected: 100064, df len: 100064, it (no where clause) len: 100064, it len: 100064
iteration: 1 PASSED
expected: 200064, df len: 200064, it (no where clause) len: 200064, it len: 200064
iteration: 2 FAILED
expected: 300064, df len: 300064, it (no where clause) len: 300064, it len: 300000
iteration: 3 FAILED
expected: 400064, df len: 400064, it (no where clause) len: 400064, it len: 400000
pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 2.7.6.final.0
python-bits: 64
OS: Linux
OS-release: 3.13.0-32-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
pandas: 0.13.1
Cython: 0.20.1
numpy: 1.8.1
scipy: 0.14.0
statsmodels: None
IPython: 1.2.1
sphinx: None
patsy: None
scikits.timeseries: None
dateutil: 1.5
pytz: 2012c
bottleneck: 0.8.0
tables: 3.1.1
numexpr: 2.4
matplotlib: 1.3.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
sqlalchemy: None
lxml: 3.3.3
bs4: 4.3.2
html5lib: 0.999
bq: None
apiclient: None
import os from dateutil.relativedelta import relativedelta
import numpy as np randn = np.random.randn from pandas import DataFrame, HDFStore, date_range
def create_df(beg_dt, periods=1e5):
    """Create a DataFrame of random values indexed at one-second steps.

    Parameters
    ----------
    beg_dt : str or datetime-like
        Timestamp of the first row; passed to ``date_range`` as the start.
    periods : int or float, optional
        Number of rows to generate.  The float default ``1e5`` is kept for
        backward compatibility and is coerced to ``int`` before use.

    Returns
    -------
    DataFrame
        ``int(periods)`` rows of ``np.random.randn`` samples in the columns
        ``bid_price``, ``bid_vol``, ``ask_price`` and ``ask_vol``.
    """
    # Both randn and date_range need an integer count: numpy rejects float
    # dimensions and pandas has deprecated non-integer ``periods``.
    n_rows = int(periods)
    # Lower-case 's' is the seconds alias current pandas expects; the
    # upper-case 'S' spelling is deprecated there.
    dr = date_range(beg_dt, periods=n_rows, freq='s')
    df = DataFrame(index=dr, data=np.random.randn(n_rows, 4),
                   columns=['bid_price', 'bid_vol', 'ask_price', 'ask_vol'])
    return df
def create_file(iterations=1):
    """Append test frames to the module-level ``store`` under key ``"df"``.

    Writes ``iterations`` frames of 100000 rows each, followed by one frame
    of 58689 rows and one of 41375 rows (100064 rows for the two tail
    frames).  Each frame starts one second after the last timestamp of the
    previous one, so the stored index is contiguous.

    Parameters
    ----------
    iterations : int, optional
        Number of full 100000-row frames to write before the two tail frames.

    Returns
    -------
    DataFrame
        The last frame appended (the 41375-row one).
    """
    beg_dt = '2014-08-12 13:30:00.000000'
    one_second = relativedelta(seconds=1)
    # Integer sizes throughout: the full-size chunks first, then the two
    # odd-sized tails that were chosen specifically to trigger the problem.
    chunk_sizes = [100000] * iterations + [58689, 41375]
    df = None
    for n_rows in chunk_sizes:
        df = create_df(beg_dt, n_rows)
        store_append(store, df, key="df")
        # The next frame begins right after the last row just written.
        beg_dt = df.index[-1] + one_second
    return df
def store_open(fname):
    """Return an ``HDFStore`` constructed for the path ``fname``."""
    hdf_store = HDFStore(fname)
    return hdf_store
def store_get(store, key="df", where=None, start=None, stop=None, iterator=False, chunksize=None):
    """Run ``store.select`` for ``key`` and return whatever it yields.

    All keyword arguments are forwarded to ``store.select`` unchanged.  A
    ``KeyError`` or ``TypeError`` raised by the selection is swallowed and
    ``None`` is returned instead; any other exception propagates.
    """
    select_kwargs = dict(where=where, start=start, stop=stop,
                         iterator=iterator, chunksize=chunksize)
    try:
        return store.select(key, **select_kwargs)
    except (KeyError, TypeError):
        # Best-effort lookup: report the failure as "nothing selected".
        return None
def store_append(store, df, key="df", where=""):
    """Append ``df`` to ``store`` under ``key`` using the table format.

    ``where`` is accepted but not used by this function.
    """
    append_options = {"format": "table"}
    store.append(key, df, **append_options)
# Driver: for n = 0..3 build a fresh table of (n * 100000 + 100064) rows and
# compare the row counts obtained through four different select paths.
path = '.'
fname = os.path.join(path, 'delme_test.h5')

store = None
for n in range(4):
    # Close a store left open by an earlier iteration.  Compare against None
    # rather than truth-testing the store object itself.
    if store is not None:
        try:
            store.close()
        except Exception:
            pass
    # Start from a clean file; it is fine if the file does not exist yet.
    try:
        os.unlink(fname)
    except OSError:
        pass

    # create_file() writes through the module-level ``store``.
    store = store_open(fname)
    try:
        create_file(n)
    finally:
        store.close()

    store = store_open(fname)
    try:
        # Baseline: whole table, no where clause, no iterator.
        where = None
        df = store_get(store, 'df', where=where, iterator=False)
        expected_ln = len(df)

        # The bounds enclose every timestamp written by create_file().
        beg_dt = '2014-08-12 13:30:00.000000'
        end_dt = '2032-12-31 13:30:00.000000'
        where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)

        # where clause, iterator=False
        df = store_get(store, 'df', where=where, iterator=False)
        ln_df = len(df)

        # no where clause, iterator=True
        it = store_get(store, 'df', where=None, iterator=True)
        dfs = [chunk for chunk in it if not chunk.empty]
        ln_it_no_where_clause = sum(len(chunk) for chunk in dfs)

        # where clause, iterator=True
        it = store_get(store, 'df', where=where, iterator=True)
        dfs = [chunk for chunk in it if not chunk.empty]
        ln_it = sum(len(chunk) for chunk in dfs)
    finally:
        # Release the file handle even if one of the selections failed.
        store.close()

    # Every access path must report the same number of rows.
    lengths = (ln_df, ln_it_no_where_clause, ln_it)
    if all(length == expected_ln for length in lengths):
        print("iteration: %d PASSED" % n)
    else:
        print("iteration: %d FAILED" % n)
    print("expected: %d, df len: %d, it (no where clause) len: %d, it len: %d" %
          (expected_ln, ln_df, ln_it_no_where_clause, ln_it))