BUG: Use size_t to avoid array index overflow; add missing malloc of … · pandas-dev/pandas@8d7d3fb (original) (raw)

`@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":

`

121

121

` io_callback cb_io

`

122

122

` io_cleanup cb_cleanup

`

123

123

``

124

``

`-

int chunksize # Number of bytes to prepare for each chunk

`

125

``

`-

char *data # pointer to data to be processed

`

126

``

`-

int datalen # amount of data available

`

127

``

`-

int datapos

`

``

124

`+

int64_t chunksize # Number of bytes to prepare for each chunk

`

``

125

`+

char *data # pointer to data to be processed

`

``

126

`+

int64_t datalen # amount of data available

`

``

127

`+

int64_t datapos

`

128

128

``

129

129

`# where to write out tokenized data

`

130

130

`char *stream

`

131

``

`-

int stream_len

`

132

``

`-

int stream_cap

`

``

131

`+

int64_t stream_len

`

``

132

`+

int64_t stream_cap

`

133

133

``

134

134

`# Store words in (potentially ragged) matrix for now, hmm

`

135

135

`char **words

`

136

``

`-

int *word_starts # where we are in the stream

`

137

``

`-

int words_len

`

138

``

`-

int words_cap

`

``

136

`+

int64_t *word_starts # where we are in the stream

`

``

137

`+

int64_t words_len

`

``

138

`+

int64_t words_cap

`

139

139

``

140

``

`-

char *pword_start # pointer to stream start of current field

`

141

``

`-

int word_start # position start of current field

`

``

140

`+

char *pword_start # pointer to stream start of current field

`

``

141

`+

int64_t word_start # position start of current field

`

142

142

``

143

``

`-

int *line_start # position in words for start of line

`

144

``

`-

int *line_fields # Number of fields in each line

`

145

``

`-

int lines # Number of lines observed

`

146

``

`-

int file_lines # Number of file lines observed (with bad/skipped)

`

147

``

`-

int lines_cap # Vector capacity

`

``

143

`+

int64_t *line_start # position in words for start of line

`

``

144

`+

int64_t *line_fields # Number of fields in each line

`

``

145

`+

int64_t lines # Number of lines observed

`

``

146

`+

int64_t file_lines # Number of lines observed (with bad/skipped)

`

``

147

`+

int64_t lines_cap # Vector capacity

`

148

148

``

149

149

`# Tokenizing stuff

`

150

150

` ParserState state

`

`@@ -177,14 +177,14 @@ cdef extern from "parser/tokenizer.h":

`

177

177

`# thousands separator (comma, period)

`

178

178

`char thousands

`

179

179

``

180

``

`-

int header # Boolean: 1: has header, 0: no header

`

181

``

`-

int header_start # header row start

`

182

``

`-

int header_end # header row end

`

``

180

`+

int header # Boolean: 1: has header, 0: no header

`

``

181

`+

int64_t header_start # header row start

`

``

182

`+

int64_t header_end # header row end

`

183

183

``

184

184

`void *skipset

`

185

185

` PyObject *skipfunc

`

186

186

` int64_t skip_first_N_rows

`

187

``

`-

int skipfooter

`

``

187

`+

int64_t skipfooter

`

188

188

`# pick one, depending on whether the converter requires GIL

`

189

189

`double (*double_converter_nogil)(const char *, char **,

`

190

190

`char, char, char, int) nogil

`

`@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":

`

195

195

`char *warn_msg

`

196

196

`char *error_msg

`

197

197

``

198

``

`-

int skip_empty_lines

`

``

198

`+

int64_t skip_empty_lines

`

199

199

``

200

200

` ctypedef struct coliter_t:

`

201

201

`char **words

`

202

``

`-

int *line_start

`

203

``

`-

int col

`

``

202

`+

int64_t *line_start

`

``

203

`+

int64_t col

`

204

204

``

205

205

` ctypedef struct uint_state:

`

206

206

`int seen_sint

`

`@@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h":

`

210

210

`void uint_state_init(uint_state *self)

`

211

211

`int uint64_conflict(uint_state *self)

`

212

212

``

213

``

`-

void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil

`

``

213

`+

void coliter_setup(coliter_t *it, parser_t *parser,

`

``

214

`+

int64_t i, int64_t start) nogil

`

214

215

`void COLITER_NEXT(coliter_t, const char *) nogil

`

215

216

``

216

217

` parser_t* parser_new()

`

`@@ -289,14 +290,14 @@ cdef class TextReader:

`

289

290

`object true_values, false_values

`

290

291

`object handle

`

291

292

` bint na_filter, verbose, has_usecols, has_mi_columns

`

292

``

`-

int parser_start

`

``

293

`+

int64_t parser_start

`

293

294

`list clocks

`

294

295

`char *c_encoding

`

295

296

` kh_str_t *false_set

`

296

297

` kh_str_t *true_set

`

297

298

``

298

299

` cdef public:

`

299

``

`-

int leading_cols, table_width, skipfooter, buffer_lines

`

``

300

`+

int64_t leading_cols, table_width, skipfooter, buffer_lines

`

300

301

`object allow_leading_cols

`

301

302

`object delimiter, converters, delim_whitespace

`

302

303

`object na_values

`

`@@ -730,7 +731,8 @@ cdef class TextReader:

`

730

731

` Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa

`

731

732

`char *word

`

732

733

`object name

`

733

``

`-

int status, hr, data_line

`

``

734

`+

int status

`

``

735

`+

int64_t hr, data_line

`

734

736

`char *errors = "strict"

`

735

737

` cdef StringPath path = _string_path(self.c_encoding)

`

736

738

``

`@@ -949,8 +951,8 @@ cdef class TextReader:

`

949

951

``

950

952

` cdef _read_rows(self, rows, bint trim):

`

951

953

` cdef:

`

952

``

`-

int buffered_lines

`

953

``

`-

int irows, footer = 0

`

``

954

`+

int64_t buffered_lines

`

``

955

`+

int64_t irows, footer = 0

`

954

956

``

955

957

`self._start_clock()

`

956

958

``

`@@ -1018,12 +1020,13 @@ cdef class TextReader:

`

1018

1020

``

1019

1021

`def _convert_column_data(self, rows=None, upcast_na=False, footer=0):

`

1020

1022

` cdef:

`

1021

``

`-

Py_ssize_t i, nused

`

``

1023

`+

int64_t i

`

``

1024

`+

int nused

`

1022

1025

` kh_str_t *na_hashset = NULL

`

1023

``

`-

int start, end

`

``

1026

`+

int64_t start, end

`

1024

1027

`object name, na_flist, col_dtype = None

`

1025

1028

` bint na_filter = 0

`

1026

``

`-

Py_ssize_t num_cols

`

``

1029

`+

int64_t num_cols

`

1027

1030

``

1028

1031

` start = self.parser_start

`

1029

1032

``

`@@ -1195,7 +1198,7 @@ cdef class TextReader:

`

1195

1198

`return col_res, na_count

`

1196

1199

``

1197

1200

` cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,

`

1198

``

`-

int start, int end,

`

``

1201

`+

int64_t start, int64_t end,

`

1199

1202

` bint na_filter,

`

1200

1203

` bint user_dtype,

`

1201

1204

` kh_str_t *na_hashset,

`

`@@ -1275,7 +1278,7 @@ cdef class TextReader:

`

1275

1278

`raise TypeError("the dtype %s is not "

`

1276

1279

`"supported for parsing" % dtype)

`

1277

1280

``

1278

``

`-

cdef _string_convert(self, Py_ssize_t i, int start, int end,

`

``

1281

`+

cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,

`

1279

1282

` bint na_filter, kh_str_t *na_hashset):

`

1280

1283

``

1281

1284

` cdef StringPath path = _string_path(self.c_encoding)

`

`@@ -1336,6 +1339,7 @@ cdef class TextReader:

`

1336

1339

` kh_destroy_str(table)

`

1337

1340

``

1338

1341

` cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):

`

``

1342

`+

cdef int64_t j

`

1339

1343

`if self.has_usecols and self.names is not None:

`

1340

1344

`if (not callable(self.usecols) and

`

1341

1345

`len(self.names) == len(self.usecols)):

`

`@@ -1427,8 +1431,8 @@ cdef inline StringPath _string_path(char *encoding):

`

1427

1431

`# ----------------------------------------------------------------------

`

1428

1432

`# Type conversions / inference support code

`

1429

1433

``

1430

``

`-

cdef _string_box_factorize(parser_t *parser, int col,

`

1431

``

`-

int line_start, int line_end,

`

``

1434

`+

cdef _string_box_factorize(parser_t *parser, int64_t col,

`

``

1435

`+

int64_t line_start, int64_t line_end,

`

1432

1436

` bint na_filter, kh_str_t *na_hashset):

`

1433

1437

` cdef:

`

1434

1438

`int error, na_count = 0

`

`@@ -1480,8 +1484,8 @@ cdef _string_box_factorize(parser_t *parser, int col,

`

1480

1484

``

1481

1485

`return result, na_count

`

1482

1486

``

1483

``

`-

cdef _string_box_utf8(parser_t *parser, int col,

`

1484

``

`-

int line_start, int line_end,

`

``

1487

`+

cdef _string_box_utf8(parser_t *parser, int64_t col,

`

``

1488

`+

int64_t line_start, int64_t line_end,

`

1485

1489

` bint na_filter, kh_str_t *na_hashset):

`

1486

1490

` cdef:

`

1487

1491

`int error, na_count = 0

`

`@@ -1533,8 +1537,8 @@ cdef _string_box_utf8(parser_t *parser, int col,

`

1533

1537

``

1534

1538

`return result, na_count

`

1535

1539

``

1536

``

`-

cdef _string_box_decode(parser_t *parser, int col,

`

1537

``

`-

int line_start, int line_end,

`

``

1540

`+

cdef _string_box_decode(parser_t *parser, int64_t col,

`

``

1541

`+

int64_t line_start, int64_t line_end,

`

1538

1542

` bint na_filter, kh_str_t *na_hashset,

`

1539

1543

`char *encoding):

`

1540

1544

` cdef:

`

`@@ -1592,8 +1596,8 @@ cdef _string_box_decode(parser_t *parser, int col,

`

1592

1596

``

1593

1597

``

1594

1598

`@cython.boundscheck(False)

`

1595

``

`-

cdef _categorical_convert(parser_t *parser, int col,

`

1596

``

`-

int line_start, int line_end,

`

``

1599

`+

cdef _categorical_convert(parser_t *parser, int64_t col,

`

``

1600

`+

int64_t line_start, int64_t line_end,

`

1597

1601

` bint na_filter, kh_str_t *na_hashset,

`

1598

1602

`char *encoding):

`

1599

1603

`"Convert column data into codes, categories"

`

`@@ -1663,8 +1667,8 @@ cdef _categorical_convert(parser_t *parser, int col,

`

1663

1667

` kh_destroy_str(table)

`

1664

1668

`return np.asarray(codes), result, na_count

`

1665

1669

``

1666

``

`-

cdef _to_fw_string(parser_t *parser, int col, int line_start,

`

1667

``

`-

int line_end, size_t width):

`

``

1670

`+

cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,

`

``

1671

`+

int64_t line_end, int64_t width):

`

1668

1672

` cdef:

`

1669

1673

` Py_ssize_t i

`

1670

1674

` coliter_t it

`

`@@ -1680,11 +1684,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,

`

1680

1684

``

1681

1685

`return result

`

1682

1686

``

1683

``

`-

cdef inline void _to_fw_string_nogil(parser_t *parser, int col,

`

1684

``

`-

int line_start, int line_end,

`

``

1687

`+

cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,

`

``

1688

`+

int64_t line_start, int64_t line_end,

`

1685

1689

` size_t width, char *data) nogil:

`

1686

1690

` cdef:

`

1687

``

`-

Py_ssize_t i

`

``

1691

`+

int64_t i

`

1688

1692

` coliter_t it

`

1689

1693

` const char *word = NULL

`

1690

1694

``

`@@ -1699,7 +1703,8 @@ cdef char* cinf = b'inf'

`

1699

1703

`cdef char* cposinf = b'+inf'

`

1700

1704

`cdef char* cneginf = b'-inf'

`

1701

1705

``

1702

``

`-

cdef _try_double(parser_t *parser, int col, int line_start, int line_end,

`

``

1706

`+

cdef _try_double(parser_t *parser, int64_t col,

`

``

1707

`+

int64_t line_start, int64_t line_end,

`

1703

1708

` bint na_filter, kh_str_t *na_hashset, object na_flist):

`

1704

1709

` cdef:

`

1705

1710

`int error, na_count = 0

`

`@@ -1808,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser,

`

1808

1813

``

1809

1814

`return 0

`

1810

1815

``

1811

``

`-

cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,

`

``

1816

`+

cdef _try_uint64(parser_t *parser, int64_t col,

`

``

1817

`+

int64_t line_start, int64_t line_end,

`

1812

1818

` bint na_filter, kh_str_t *na_hashset):

`

1813

1819

` cdef:

`

1814

1820

`int error

`

`@@ -1842,8 +1848,9 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,

`

1842

1848

``

1843

1849

`return result

`

1844

1850

``

1845

``

`-

cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,

`

1846

``

`-

int line_end, bint na_filter,

`

``

1851

`+

cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,

`

``

1852

`+

int64_t line_start,

`

``

1853

`+

int64_t line_end, bint na_filter,

`

1847

1854

` const kh_str_t *na_hashset,

`

1848

1855

` uint64_t *data, uint_state *state) nogil:

`

1849

1856

` cdef:

`

`@@ -1879,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,

`

1879

1886

``

1880

1887

`return 0

`

1881

1888

``

1882

``

`-

cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

`

``

1889

`+

cdef _try_int64(parser_t *parser, int64_t col,

`

``

1890

`+

int64_t line_start, int64_t line_end,

`

1883

1891

` bint na_filter, kh_str_t *na_hashset):

`

1884

1892

` cdef:

`

1885

1893

`int error, na_count = 0

`

`@@ -1906,8 +1914,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

`

1906

1914

``

1907

1915

`return result, na_count

`

1908

1916

``

1909

``

`-

cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,

`

1910

``

`-

int line_end, bint na_filter,

`

``

1917

`+

cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,

`

``

1918

`+

int64_t line_start,

`

``

1919

`+

int64_t line_end, bint na_filter,

`

1911

1920

` const kh_str_t *na_hashset, int64_t NA,

`

1912

1921

` int64_t *data, int *na_count) nogil:

`

1913

1922

` cdef:

`

`@@ -1944,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,

`

1944

1953

``

1945

1954

`return 0

`

1946

1955

``

1947

``

`-

cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

`

``

1956

`+

cdef _try_bool(parser_t *parser, int64_t col,

`

``

1957

`+

int64_t line_start, int64_t line_end,

`

1948

1958

` bint na_filter, kh_str_t *na_hashset):

`

1949

1959

` cdef:

`

1950

1960

`int na_count

`

`@@ -1966,8 +1976,9 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

`

1966

1976

`return None, None

`

1967

1977

`return result.view(np.bool_), na_count

`

1968

1978

``

1969

``

`-

cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,

`

1970

``

`-

int line_end, bint na_filter,

`

``

1979

`+

cdef inline int _try_bool_nogil(parser_t *parser, int64_t col,

`

``

1980

`+

int64_t line_start,

`

``

1981

`+

int64_t line_end, bint na_filter,

`

1971

1982

` const kh_str_t *na_hashset, uint8_t NA,

`

1972

1983

` uint8_t *data, int *na_count) nogil:

`

1973

1984

` cdef:

`

`@@ -2006,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,

`

2006

2017

` data += 1

`

2007

2018

`return 0

`

2008

2019

``

2009

``

`-

cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

`

``

2020

`+

cdef _try_bool_flex(parser_t *parser, int64_t col,

`

``

2021

`+

int64_t line_start, int64_t line_end,

`

2010

2022

` bint na_filter, const kh_str_t *na_hashset,

`

2011

2023

` const kh_str_t *true_hashset,

`

2012

2024

` const kh_str_t *false_hashset):

`

`@@ -2032,8 +2044,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

`

2032

2044

`return None, None

`

2033

2045

`return result.view(np.bool_), na_count

`

2034

2046

``

2035

``

`-

cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,

`

2036

``

`-

int line_end, bint na_filter,

`

``

2047

`+

cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,

`

``

2048

`+

int64_t line_start,

`

``

2049

`+

int64_t line_end, bint na_filter,

`

2037

2050

` const kh_str_t *na_hashset,

`

2038

2051

` const kh_str_t *true_hashset,

`

2039

2052

` const kh_str_t *false_hashset,

`

`@@ -2251,8 +2264,8 @@ for k in list(na_values):

`

2251

2264

` na_values[np.dtype(k)] = na_values[k]

`

2252

2265

``

2253

2266

``

2254

``

`-

cdef _apply_converter(object f, parser_t *parser, int col,

`

2255

``

`-

int line_start, int line_end,

`

``

2267

`+

cdef _apply_converter(object f, parser_t *parser, int64_t col,

`

``

2268

`+

int64_t line_start, int64_t line_end,

`

2256

2269

`char* c_encoding):

`

2257

2270

` cdef:

`

2258

2271

`int error

`

`@@ -2296,7 +2309,7 @@ def _to_structured_array(dict columns, object names, object usecols):

`

2296

2309

``

2297

2310

`object name, fnames, field_type

`

2298

2311

` Py_ssize_t i, offset, nfields, length

`

2299

``

`-

int stride, elsize

`

``

2312

`+

int64_t stride, elsize

`

2300

2313

`char *buf

`

2301

2314

``

2302

2315

`if names is None:

`

`@@ -2344,10 +2357,10 @@ def _to_structured_array(dict columns, object names, object usecols):

`

2344

2357

``

2345

2358

`return recs

`

2346

2359

``

2347

``

`-

cdef _fill_structured_column(char *dst, char* src, int elsize,

`

2348

``

`-

int stride, int length, bint incref):

`

``

2360

`+

cdef _fill_structured_column(char *dst, char* src, int64_t elsize,

`

``

2361

`+

int64_t stride, int64_t length, bint incref):

`

2349

2362

` cdef:

`

2350

``

`-

Py_ssize_t i

`

``

2363

`+

int64_t i

`

2351

2364

``

2352

2365

`if incref:

`

2353

2366

` util.transfer_object_column(dst, src, stride, length)

`