BUG: Use size_t to avoid array index overflow; add missing malloc of … · pandas-dev/pandas@8d7d3fb
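This commit widens the tokenizer's buffer offsets and row/column counters (chunksize, stream_len, word_start, lines, and friends) from C int to int64_t, so that positions past 2**31 - 1 bytes no longer wrap negative while parsing very large files. As a rough illustration only, here is a minimal stand-alone C sketch, not pandas code; the stream_len name merely echoes the struct field in the diff below:

#include <stdint.h>
#include <stdio.h>

/* Sketch: why 32-bit positions are not enough for a large token stream.
 * An offset into a buffer bigger than ~2 GiB does not fit in int32_t; on
 * typical platforms the narrowing conversion below produces a negative
 * value, which then corrupts any indexing done with it. */
int main(void) {
    int64_t stream_len = 3LL * 1024 * 1024 * 1024;  /* 3 GiB of buffered data */

    int32_t pos32 = (int32_t)stream_len;  /* narrowing: implementation-defined, negative here */
    int64_t pos64 = stream_len;           /* exact */

    printf("as int32: %d\n", pos32);
    printf("as int64: %lld\n", (long long)pos64);
    return 0;
}

The diff below applies the same widening throughout the Cython declarations and converter helpers.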
@@ -121,30 +121,30 @@ cdef extern from "parser/tokenizer.h":
         io_callback cb_io
         io_cleanup cb_cleanup
 
-        int chunksize  # Number of bytes to prepare for each chunk
-        char *data     # pointer to data to be processed
-        int datalen    # amount of data available
-        int datapos
+        int64_t chunksize  # Number of bytes to prepare for each chunk
+        char *data         # pointer to data to be processed
+        int64_t datalen    # amount of data available
+        int64_t datapos
 
         # where to write out tokenized data
         char *stream
-        int stream_len
-        int stream_cap
+        int64_t stream_len
+        int64_t stream_cap
 
         # Store words in (potentially ragged) matrix for now, hmm
         char **words
-        int *word_starts  # where we are in the stream
-        int words_len
-        int words_cap
+        int64_t *word_starts  # where we are in the stream
+        int64_t words_len
+        int64_t words_cap
 
-        char *pword_start  # pointer to stream start of current field
-        int word_start     # position start of current field
+        char *pword_start    # pointer to stream start of current field
+        int64_t word_start   # position start of current field
 
-        int *line_start    # position in words for start of line
-        int *line_fields   # Number of fields in each line
-        int lines          # Number of lines observed
-        int file_lines     # Number of file lines observed (with bad/skipped)
-        int lines_cap      # Vector capacity
+        int64_t *line_start    # position in words for start of line
+        int64_t *line_fields   # Number of fields in each line
+        int64_t lines          # Number of lines observed
+        int64_t file_lines     # Number of lines observed (with bad/skipped)
+        int64_t lines_cap      # Vector capacity
 
         # Tokenizing stuff
         ParserState state
@@ -177,14 +177,14 @@ cdef extern from "parser/tokenizer.h":
         # thousands separator (comma, period)
         char thousands
 
-        int header  # Boolean: 1: has header, 0: no header
-        int header_start  # header row start
-        int header_end    # header row end
+        int header            # Boolean: 1: has header, 0: no header
+        int64_t header_start  # header row start
+        int64_t header_end    # header row end
 
         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
-        int skipfooter
+        int64_t skipfooter
         # pick one, depending on whether the converter requires GIL
         double (*double_converter_nogil)(const char *, char **,
                                          char, char, char, int) nogil
@@ -195,12 +195,12 @@ cdef extern from "parser/tokenizer.h":
         char *warn_msg
         char *error_msg
 
-        int skip_empty_lines
+        int64_t skip_empty_lines
 
     ctypedef struct coliter_t:
         char **words
-        int *line_start
-        int col
+        int64_t *line_start
+        int64_t col
 
     ctypedef struct uint_state:
         int seen_sint
@@ -210,7 +210,8 @@ cdef extern from "parser/tokenizer.h":
     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)
 
-    void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
+    void coliter_setup(coliter_t *it, parser_t *parser,
+                       int64_t i, int64_t start) nogil
     void COLITER_NEXT(coliter_t, const char *) nogil
 
    parser_t* parser_new()
@@ -289,14 +290,14 @@ cdef class TextReader:
         object true_values, false_values
         object handle
         bint na_filter, verbose, has_usecols, has_mi_columns
-        int parser_start
+        int64_t parser_start
         list clocks
         char *c_encoding
         kh_str_t *false_set
         kh_str_t *true_set
 
     cdef public:
-        int leading_cols, table_width, skipfooter, buffer_lines
+        int64_t leading_cols, table_width, skipfooter, buffer_lines
         object allow_leading_cols
         object delimiter, converters, delim_whitespace
         object na_values
@@ -730,7 +731,8 @@ cdef class TextReader:
             Py_ssize_t i, start, field_count, passed_count, unnamed_count  # noqa
             char *word
            object name
-            int status, hr, data_line
+            int status
+            int64_t hr, data_line
            char *errors = "strict"
            cdef StringPath path = _string_path(self.c_encoding)
 
@@ -949,8 +951,8 @@ cdef class TextReader:
 
     cdef _read_rows(self, rows, bint trim):
         cdef:
-            int buffered_lines
-            int irows, footer = 0
+            int64_t buffered_lines
+            int64_t irows, footer = 0
 
         self._start_clock()
 
@@ -1018,12 +1020,13 @@ cdef class TextReader:
 
     def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
         cdef:
-            Py_ssize_t i, nused
+            int64_t i
+            int nused
             kh_str_t *na_hashset = NULL
-            int start, end
+            int64_t start, end
            object name, na_flist, col_dtype = None
            bint na_filter = 0
-            Py_ssize_t num_cols
+            int64_t num_cols
 
         start = self.parser_start
 
@@ -1195,7 +1198,7 @@ cdef class TextReader:
         return col_res, na_count
 
     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
-                             int start, int end,
+                             int64_t start, int64_t end,
                              bint na_filter,
                              bint user_dtype,
                              kh_str_t *na_hashset,
@@ -1275,7 +1278,7 @@ cdef class TextReader:
            raise TypeError("the dtype %s is not "
                            "supported for parsing" % dtype)
 
-    cdef _string_convert(self, Py_ssize_t i, int start, int end,
+    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                          bint na_filter, kh_str_t *na_hashset):
 
         cdef StringPath path = _string_path(self.c_encoding)
@@ -1336,6 +1339,7 @@ cdef class TextReader:
            kh_destroy_str(table)
 
     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
+        cdef int64_t j
        if self.has_usecols and self.names is not None:
            if (not callable(self.usecols) and
                    len(self.names) == len(self.usecols)):
@@ -1427,8 +1431,8 @@ cdef inline StringPath _string_path(char *encoding):
 # ----------------------------------------------------------------------
 # Type conversions / inference support code
 
-cdef _string_box_factorize(parser_t *parser, int col,
-                           int line_start, int line_end,
+cdef _string_box_factorize(parser_t *parser, int64_t col,
+                           int64_t line_start, int64_t line_end,
                            bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1480,8 +1484,8 @@ cdef _string_box_factorize(parser_t *parser, int col,
 
     return result, na_count
 
-cdef _string_box_utf8(parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _string_box_utf8(parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1533,8 +1537,8 @@ cdef _string_box_utf8(parser_t *parser, int col,
 
     return result, na_count
 
-cdef _string_box_decode(parser_t *parser, int col,
-                        int line_start, int line_end,
+cdef _string_box_decode(parser_t *parser, int64_t col,
+                        int64_t line_start, int64_t line_end,
                         bint na_filter, kh_str_t *na_hashset,
                         char *encoding):
     cdef:
@@ -1592,8 +1596,8 @@ cdef _string_box_decode(parser_t *parser, int col,
 
 
 @cython.boundscheck(False)
-cdef _categorical_convert(parser_t *parser, int col,
-                          int line_start, int line_end,
+cdef _categorical_convert(parser_t *parser, int64_t col,
+                          int64_t line_start, int64_t line_end,
                           bint na_filter, kh_str_t *na_hashset,
                           char *encoding):
     "Convert column data into codes, categories"
@@ -1663,8 +1667,8 @@ cdef _categorical_convert(parser_t *parser, int col,
         kh_destroy_str(table)
     return np.asarray(codes), result, na_count
 
-cdef _to_fw_string(parser_t *parser, int col, int line_start,
-                   int line_end, size_t width):
+cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
+                   int64_t line_end, int64_t width):
     cdef:
         Py_ssize_t i
         coliter_t it
@@ -1680,11 +1684,11 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
 
     return result
 
-cdef inline void _to_fw_string_nogil(parser_t *parser, int col,
-                                     int line_start, int line_end,
+cdef inline void _to_fw_string_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start, int64_t line_end,
                                      size_t width, char *data) nogil:
     cdef:
-        Py_ssize_t i
+        int64_t i
         coliter_t it
         const char *word = NULL
 
@@ -1699,7 +1703,8 @@ cdef char* cinf = b'inf'
 cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'
 
-cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_double(parser_t *parser, int64_t col,
+                 int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset, object na_flist):
     cdef:
         int error, na_count = 0
@@ -1808,7 +1813,8 @@ cdef inline int _try_double_nogil(parser_t *parser,
 
     return 0
 
-cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_uint64(parser_t *parser, int64_t col,
+                 int64_t line_start, int64_t line_end,
                  bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error
@@ -1842,8 +1848,9 @@ cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
 
     return result
 
-cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
-                                  int line_end, bint na_filter,
+cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col,
+                                  int64_t line_start,
+                                  int64_t line_end, bint na_filter,
                                   const kh_str_t *na_hashset,
                                   uint64_t *data, uint_state *state) nogil:
     cdef:
@@ -1879,7 +1886,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
 
     return 0
 
-cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_int64(parser_t *parser, int64_t col,
+                int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
         int error, na_count = 0
@@ -1906,8 +1914,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
 
     return result, na_count
 
-cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
-                                 int line_end, bint na_filter,
+cdef inline int _try_int64_nogil(parser_t *parser, int64_t col,
+                                 int64_t line_start,
+                                 int64_t line_end, bint na_filter,
                                  const kh_str_t *na_hashset, int64_t NA,
                                  int64_t *data, int *na_count) nogil:
     cdef:
@@ -1944,7 +1953,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start,
 
     return 0
 
-cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool(parser_t *parser, int64_t col,
+               int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_t *na_hashset):
     cdef:
         int na_count
@@ -1966,8 +1976,9 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count
 
-cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
-                                int line_end, bint na_filter,
+cdef inline int _try_bool_nogil(parser_t *parser, int64_t col,
+                                int64_t line_start,
+                                int64_t line_end, bint na_filter,
                                 const kh_str_t *na_hashset, uint8_t NA,
                                 uint8_t *data, int *na_count) nogil:
     cdef:
@@ -2006,7 +2017,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start,
            data += 1
     return 0
 
-cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
+cdef _try_bool_flex(parser_t *parser, int64_t col,
+                    int64_t line_start, int64_t line_end,
                     bint na_filter, const kh_str_t *na_hashset,
                     const kh_str_t *true_hashset,
                     const kh_str_t *false_hashset):
@@ -2032,8 +2044,9 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end,
         return None, None
     return result.view(np.bool_), na_count
 
-cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start,
-                                     int line_end, bint na_filter,
+cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col,
+                                     int64_t line_start,
+                                     int64_t line_end, bint na_filter,
                                      const kh_str_t *na_hashset,
                                      const kh_str_t *true_hashset,
                                      const kh_str_t *false_hashset,
@@ -2251,8 +2264,8 @@ for k in list(na_values):
     na_values[np.dtype(k)] = na_values[k]
 
 
-cdef _apply_converter(object f, parser_t *parser, int col,
-                      int line_start, int line_end,
+cdef _apply_converter(object f, parser_t *parser, int64_t col,
+                      int64_t line_start, int64_t line_end,
                       char* c_encoding):
     cdef:
         int error
@@ -2296,7 +2309,7 @@ def _to_structured_array(dict columns, object names, object usecols):
 
         object name, fnames, field_type
         Py_ssize_t i, offset, nfields, length
-        int stride, elsize
+        int64_t stride, elsize
         char *buf
 
    if names is None:
@@ -2344,10 +2357,10 @@ def _to_structured_array(dict columns, object names, object usecols):
 
     return recs
 
-cdef _fill_structured_column(char *dst, char* src, int elsize,
-                             int stride, int length, bint incref):
+cdef _fill_structured_column(char *dst, char* src, int64_t elsize,
+                             int64_t stride, int64_t length, bint incref):
     cdef:
-        Py_ssize_t i
+        int64_t i
 
     if incref:
         util.transfer_object_column(dst, src, stride, length)
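The final hunk applies the same widening to the structured-array fill: elsize, stride, length, and the loop index become int64_t, since the byte offsets walked by the copy grow as stride times the row count and can exceed 32-bit range well before either factor alone looks unusual. Below is a small stand-alone sketch of that arithmetic, plain C with hypothetical values, not the pandas routine itself:

#include <stdint.h>
#include <stdio.h>

/* Sketch: byte offsets for a strided column copy.  Keeping the row index
 * and stride in 64-bit variables keeps the product exact; truncating the
 * same product to 32 bits (shown only for comparison) goes negative on
 * typical platforms. */
int main(void) {
    int64_t stride = 1024;      /* hypothetical bytes per record */
    int64_t i = 3000000;        /* hypothetical row index        */

    int64_t offset = i * stride;        /* exact 64-bit byte offset: 3,072,000,000 */
    int32_t clipped = (int32_t)offset;  /* what 32-bit arithmetic would keep       */

    printf("64-bit offset: %lld\n", (long long)offset);
    printf("truncated to 32 bits: %d\n", clipped);
    return 0;
}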