BUG: Use size_t to avoid array index overflow; add missing malloc of … · pandas-dev/pandas@8d7d3fb (original) (raw)

`@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":

121

` io_callback cb_io

122

` io_cleanup cb_cleanup

123

124

int chunksize # Number of bytes to prepare for each chunk

125

char *data # pointer to data to be processed

126

int datalen # amount of data available

127

int datapos

124

int64_t chunksize # Number of bytes to prepare for each chunk

125

char *data # pointer to data to be processed

126

int64_t datalen # amount of data available

127

int64_t datapos

128

129

`# where to write out tokenized data

130

`char *stream

131

int stream_len

132

int stream_cap

131

int64_t stream_len

132

int64_t stream_cap

133

134

`# Store words in (potentially ragged) matrix for now, hmm

135

`char **words

136

int *word_starts # where we are in the stream

137

int words_len

138

int words_cap

136

int64_t *word_starts # where we are in the stream

137

int64_t words_len

138

int64_t words_cap

139

140

char *pword_start # pointer to stream start of current field

141

int word_start # position start of current field

140

char *pword_start # pointer to stream start of current field

141

int64_t word_start # position start of current field

142

143

int *line_start # position in words for start of line

144

int *line_fields # Number of fields in each line

145

int lines # Number of lines observed

146

int file_lines # Number of file lines observed (with bad/skipped)

147

int lines_cap # Vector capacity

143

int64_t *line_start # position in words for start of line

144

int64_t *line_fields # Number of fields in each line

145

int64_t lines # Number of lines observed

146

int64_t file_lines # Number of lines observed (with bad/skipped)

147

int64_t lines_cap # Vector capacity

148

149

`# Tokenizing stuff

150

` ParserState state

`@@ -177,14 +177,14 @@ cdef extern from "parser/tokenizer.h":

177

`# thousands separator (comma, period)

178

`char thousands

179

180

int header # Boolean: 1: has header, 0: no header

181

int header_start # header row start

182

int header_end # header row end

180

int header # Boolean: 1: has header, 0: no header

181

int64_t header_start # header row start

182

int64_t header_end # header row end

183

184

`void *skipset

185

` PyObject *skipfunc

186

` int64_t skip_first_N_rows

187

int skipfooter

187

int64_t skipfooter

188

`# pick one, depending on whether the converter requires GIL

189

`double (*double_converter_nogil)(const char *, char **,

190

`char, char, char, int) nogil

`@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":

195

`char *warn_msg

196

`char *error_msg

197

198

int skip_empty_lines

198

int64_t skip_empty_lines

199

200

` ctypedef struct coliter_t:

201

`char **words

202

int *line_start

203

int col

202

int64_t *line_start

203

int64_t col

204

205

` ctypedef struct uint_state:

206

`int seen_sint

`@@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h":

210

`void uint_state_init(uint_state *self)

211

`int uint64_conflict(uint_state *self)

212

213

void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil

213

void coliter_setup(coliter_t *it, parser_t *parser,

214

int64_t i, int64_t start) nogil

214

215

`void COLITER_NEXT(coliter_t, const char *) nogil

215

216

217

` parser_t* parser_new()

`@@ -289,14 +290,14 @@ cdef class TextReader:

289

290

`object true_values, false_values

290

291

`object handle

291

292

` bint na_filter, verbose, has_usecols, has_mi_columns

292

int parser_start

293

int64_t parser_start

293

294

`list clocks

294

295

`char *c_encoding

295

296

` kh_str_t *false_set

296

297

` kh_str_t *true_set

297

298

299

` cdef public:

299

int leading_cols, table_width, skipfooter, buffer_lines

300

int64_t leading_cols, table_width, skipfooter, buffer_lines

300

301

`object allow_leading_cols

301

302

`object delimiter, converters, delim_whitespace

302

303

`object na_values

`@@ -730,7 +731,8 @@ cdef class TextReader:

730

731

` Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa

731

732

`char *word

732

733

`object name

733

int status, hr, data_line

734

int status

735

int64_t hr, data_line

734

736

`char *errors = "strict"

735

737

` cdef StringPath path = _string_path(self.c_encoding)

736

738

`@@ -949,8 +951,8 @@ cdef class TextReader:

949

951

950

952

` cdef _read_rows(self, rows, bint trim):

951

953

` cdef:

952

int buffered_lines

953

int irows, footer = 0

954

int64_t buffered_lines

955

int64_t irows, footer = 0

954

956

955

957

`self._start_clock()

956

958

`@@ -1018,12 +1020,13 @@ cdef class TextReader:

1018

1020

1019

1021

`def _convert_column_data(self, rows=None, upcast_na=False, footer=0):

1020

1022

` cdef:

1021

Py_ssize_t i, nused

1023

int64_t i

1024

int nused

1022

1025

` kh_str_t *na_hashset = NULL

1023

int start, end

1026

int64_t start, end

1024

1027

`object name, na_flist, col_dtype = None

1025

1028

` bint na_filter = 0

1026

Py_ssize_t num_cols

1029

int64_t num_cols

1027

1030

1028

1031

` start = self.parser_start

1029

1032

`@@ -1195,7 +1198,7 @@ cdef class TextReader:

1195

1198

`return col_res, na_count

1196

1199

1197

1200

` cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,

1198

int start, int end,

1201

int64_t start, int64_t end,

1199

1202

` bint na_filter,

1200

1203

` bint user_dtype,

1201

1204

` kh_str_t *na_hashset,

`@@ -1275,7 +1278,7 @@ cdef class TextReader:

1275

1278

`raise TypeError("the dtype %s is not "

1276

1279

`"supported for parsing" % dtype)

1277

1280

1278

cdef _string_convert(self, Py_ssize_t i, int start, int end,

1281

cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,

1279

1282

` bint na_filter, kh_str_t *na_hashset):

1280

1283

1281

1284

` cdef StringPath path = _string_path(self.c_encoding)

`@@ -1336,6 +1339,7 @@ cdef class TextReader:

1336

1339

` kh_destroy_str(table)

1337

1340

1338

1341

` cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):

1342

cdef int64_t j

1339

1343

`if self.has_usecols and self.names is not None:

1340

1344

`if (not callable(self.usecols) and

1341

1345

`len(self.names) == len(self.usecols)):

`@@ -1427,8 +1431,8 @@ cdef inline StringPath _string_path(char *encoding):

1427

1431

`# ----------------------------------------------------------------------

1428

1432

`# Type conversions / inference support code

1429

1433

1430

cdef _string_box_factorize(parser_t *parser, int col,

1431

int line_start, int line_end,

1434

cdef _string_box_factorize(parser_t *parser, int64_t col,

1435

int64_t line_start, int64_t line_end,

1432

1436

` bint na_filter, kh_str_t *na_hashset):

1433

1437

` cdef:

1434

1438

`int error, na_count = 0

`@@ -1480,8 +1484,8 @@ cdef _string_box_factorize(parser_t *parser, int col,

1480

1484

1481

1485

`return result, na_count

1482

1486

1483

cdef _string_box_utf8(parser_t *parser, int col,

1484

int line_start, int line_end,

1487

cdef _string_box_utf8(parser_t *parser, int64_t col,

1488

int64_t line_start, int64_t line_end,

1485

1489

` bint na_filter, kh_str_t *na_hashset):

1486

1490

` cdef:

1487

1491

`int error, na_count = 0

`@@ -1533,8 +1537,8 @@ cdef _string_box_utf8(parser_t *parser, int col,

1533

1537

1534

1538

`return result, na_count

1535

1539

1536

cdef _string_box_decode(parser_t *parser, int col,

1537

int line_start, int line_end,

1540

cdef _string_box_decode(parser_t *parser, int64_t col,

1541

int64_t line_start, int64_t line_end,

1538

1542

` bint na_filter, kh_str_t *na_hashset,

1539

1543

`char *encoding):

1540

1544

` cdef:

`@@ -1592,8 +1596,8 @@ cdef _string_box_decode(parser_t *parser, int col,

1592

1596

1593

1597

1594

1598

`@cython.boundscheck(False)

1595

cdef _categorical_convert(parser_t *parser, int col,

1596

int line_start, int line_end,

1599

cdef _categorical_convert(parser_t *parser, int64_t col,

1600

int64_t line_start, int64_t line_end,

1597

1601

` bint na_filter, kh_str_t *na_hashset,

1598

1602

`char *encoding):

1599

1603

`"Convert column data into codes, categories"

`@@ -1663,8 +1667,8 @@ cdef _categorical_convert(parser_t *parser, int col,

1663

1667

` kh_destroy_str(table)

1664

1668

`return np.asarray(codes), result, na_count

1665

1669

1666

cdef _to_fw_string(parser_t *parser, int col, int line_start,

1667

int line_end, size_t width):

1670

cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,

1671

int64_t line_end, int64_t width):

1668

1672

` cdef:

1669

1673

` Py_ssize_t i

1670

1674

` coliter_t it

`@@ -1680,11 +1684,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,

1680

1684

1681

1685

`return result

1682

1686

1683

cdef inline void _to_fw_string_nogil(parser_t *parser, int col,

1684

int line_start, int line_end,

1687

cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,

1688

int64_t line_start, int64_t line_end,

1685

1689

` size_t width, char *data) nogil:

1686

1690

` cdef:

1687

Py_ssize_t i

1691

int64_t i

1688

1692

` coliter_t it

1689

1693

` const char *word = NULL

1690

1694

`@@ -1699,7 +1703,8 @@ cdef char* cinf = b'inf'

1699

1703

`cdef char* cposinf = b'+inf'

1700

1704

`cdef char* cneginf = b'-inf'

1701

1705

1702

cdef _try_double(parser_t *parser, int col, int line_start, int line_end,

1706

cdef _try_double(parser_t *parser, int64_t col,

1707

int64_t line_start, int64_t line_end,

1703

1708

` bint na_filter, kh_str_t *na_hashset, object na_flist):

1704

1709

` cdef:

1705

1710

`int error, na_count = 0

`@@ -1808,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser,

1808

1813

1809

1814

`return 0

1810

1815

1811

cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,

1816

cdef _try_uint64(parser_t *parser, int64_t col,

1817

int64_t line_start, int64_t line_end,

1812

1818

` bint na_filter, kh_str_t *na_hashset):

1813

1819

` cdef:

1814

1820

`int error

`@@ -1842,8 +1848,9 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,

1842

1848

1843

1849

`return result

1844

1850

1845

cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,

1846

int line_end, bint na_filter,

1851

cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,

1852

int64_t line_start,

1853

int64_t line_end, bint na_filter,

1847

1854

` const kh_str_t *na_hashset,

1848

1855

` uint64_t *data, uint_state *state) nogil:

1849

1856

` cdef:

`@@ -1879,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,

1879

1886

1880

1887

`return 0

1881

1888

1882

cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

1889

cdef _try_int64(parser_t *parser, int64_t col,

1890

int64_t line_start, int64_t line_end,

1883

1891

` bint na_filter, kh_str_t *na_hashset):

1884

1892

` cdef:

1885

1893

`int error, na_count = 0

`@@ -1906,8 +1914,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,

1906

1914

1907

1915

`return result, na_count

1908

1916

1909

cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,

1910

int line_end, bint na_filter,

1917

cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,

1918

int64_t line_start,

1919

int64_t line_end, bint na_filter,

1911

1920

` const kh_str_t *na_hashset, int64_t NA,

1912

1921

` int64_t *data, int *na_count) nogil:

1913

1922

` cdef:

`@@ -1944,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,

1944

1953

1945

1954

`return 0

1946

1955

1947

cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

1956

cdef _try_bool(parser_t *parser, int64_t col,

1957

int64_t line_start, int64_t line_end,

1948

1958

` bint na_filter, kh_str_t *na_hashset):

1949

1959

` cdef:

1950

1960

`int na_count

`@@ -1966,8 +1976,9 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,

1966

1976

`return None, None

1967

1977

`return result.view(np.bool_), na_count

1968

1978

1969

cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,

1970

int line_end, bint na_filter,

1979

cdef inline int _try_bool_nogil(parser_t *parser, int64_t col,

1980

int64_t line_start,

1981

int64_t line_end, bint na_filter,

1971

1982

` const kh_str_t *na_hashset, uint8_t NA,

1972

1983

` uint8_t *data, int *na_count) nogil:

1973

1984

` cdef:

`@@ -2006,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,

2006

2017

` data += 1

2007

2018

`return 0

2008

2019

2009

cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

2020

cdef _try_bool_flex(parser_t *parser, int64_t col,

2021

int64_t line_start, int64_t line_end,

2010

2022

` bint na_filter, const kh_str_t *na_hashset,

2011

2023

` const kh_str_t *true_hashset,

2012

2024

` const kh_str_t *false_hashset):

`@@ -2032,8 +2044,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,

2032

2044

`return None, None

2033

2045

`return result.view(np.bool_), na_count

2034

2046

2035

cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,

2036

int line_end, bint na_filter,

2047

cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,

2048

int64_t line_start,

2049

int64_t line_end, bint na_filter,

2037

2050

` const kh_str_t *na_hashset,

2038

2051

` const kh_str_t *true_hashset,

2039

2052

` const kh_str_t *false_hashset,

`@@ -2251,8 +2264,8 @@ for k in list(na_values):

2251

2264

` na_values[np.dtype(k)] = na_values[k]

2252

2265

2253

2266

2254

cdef _apply_converter(object f, parser_t *parser, int col,

2255

int line_start, int line_end,

2267

cdef _apply_converter(object f, parser_t *parser, int64_t col,

2268

int64_t line_start, int64_t line_end,

2256

2269

`char* c_encoding):

2257

2270

` cdef:

2258

2271

`int error

`@@ -2296,7 +2309,7 @@ def _to_structured_array(dict columns, object names, object usecols):

2296

2309

2297

2310

`object name, fnames, field_type

2298

2311

` Py_ssize_t i, offset, nfields, length

2299

int stride, elsize

2312

int64_t stride, elsize

2300

2313

`char *buf

2301

2314

2302

2315

`if names is None:

`@@ -2344,10 +2357,10 @@ def _to_structured_array(dict columns, object names, object usecols):

2344

2357

2345

2358

`return recs

2346

2359

2347

cdef _fill_structured_column(char *dst, char* src, int elsize,

2348

int stride, int length, bint incref):

2360

cdef _fill_structured_column(char *dst, char* src, int64_t elsize,

2361

int64_t stride, int64_t length, bint incref):

2349

2362

` cdef:

2350

Py_ssize_t i

2363

int64_t i

2351

2364

2352

2365

`if incref:

2353

2366

` util.transfer_object_column(dst, src, stride, length)