MultiIndexes and large CSV files · Issue #4516 · pandas-dev/pandas (original) (raw)

I've run into a data-dependent bug with MultiIndex. Attached are a test script and a data file generation script:

My problem is that sales-data.py breaks with large CSV files but works with smaller CSV files. When it fails it displays the following error message:

'MultiIndex lexsort depth 0, key was length 4'

I know that MultiIndexes must be pre-sorted. So sales-gen.sh makes sure that the first four columns in the auto-generated CSV files are ordered. BTW, I'm not sure why lexsort is getting called.

sales-data.py

#!/usr/bin/env python

import numpy as np
import pandas as pd
from memory_profiler import profile

pd.set_option('display.height',            400)
pd.set_option('display.width',             400)
pd.set_option('display.max_rows',         1000)
pd.set_option('display.max_columns',        30)
pd.set_option('display.line_width',        200)

# @profile
def load_data():
    try:
        df = pd.read_csv(
            './sales-large.csv',
          # './sales-small.csv',
            header = None,
            na_values = ['NULL'],
            names = [
                'salesperson',
                'customer',
                'invoice_date',
                'ship_date',
                'product',
                'quantity',
                'price',
            ],
            index_col = [
                'salesperson',
                'customer',
                'invoice_date',
                'ship_date',
            ],
            parse_dates = [
                'invoice_date',
                'ship_date',
            ],
        )

        print(df.loc[(
            'A00000',                        # salesperson
            'A',                             # customer
            pd.datetime(2011,3,1,0,0,0),     # invoice_date
            pd.datetime(2011,3,6,0,0,0),     # ship_date
        )])

    except Exception as e:
        print(e)

if __name__== '__main__':
    load_data()

sales-gen.sh

#!/usr/bin/env bash

# 'salesperson',
# 'customer',
# 'invoice_date',
# 'ship_date',
# 'product',
# 'quantity',
# 'price',

awk "BEGIN {                                                                                              \
          first = 65;                                                                                     \
          last = 91;                                                                                      \
          s_char = first;                                                                                 \
          c_char = first;                                                                                 \
                                                                                                          \
          n_sa = 100;                                                                                     \
          n_cu = 100;                                                                                     \
          n_dt = 10;                                                                                      \
          n_pr = 100;                                                                                     \
                                                                                                          \
          for (i = 0; i < n_sa; i++ s_char++) {                                                           \
              if (s_char == last) {                                                                       \
                  s_char = first;                                                                         \
              }                                                                                           \
              c_char = first;                                                                             \
              for (j = 0; j < n_cu; j++ c_char++) {                                                       \
                  if (c_char == last) {                                                                   \
                      c_char = first;                                                                     \
                  }                                                                                       \
                  for (k = 1; k <= n_dt; k++) {                                                           \
                      for (l = 0; l < n_pr; l++) {                                                        \
                          printf(\"%c%05d,%c,2011-03-%02d 00:00:00,2011-03-%02d 00:00:00,%d,%d,%f\n\",    \
                              s_char, i, c_char, k, k+5, l, 1 + int(rand() * 10), rand())                 \
                      }                                                                                   \
                  }                                                                                       \
              }                                                                                           \
          }                                                                                               \
      }"