Infer compression from URL extension · pandas-dev/pandas@83b2bc5 (original) (raw)

Original file line number | Diff line number | Diff line change
@@ -63,6 +63,13 @@ def urlopen(*args, **kwargs):
63 63 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
64 64 _VALID_URLS.discard('')
65 65
66 +_compression_to_extension = {
67 +'gzip': '.gz',
68 +'bz2': '.bz2',
69 +'zip': '.zip',
70 +'xz': '.xz',
71 +}
72 +
66 73
67 74 class ParserError(ValueError):
68 75 """
@@ -234,20 +241,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
234 241 -------
235 242 a filepath_or_buffer, the encoding, the compression
236 243 """
237 -
244 +
238 245 if _is_url(filepath_or_buffer):
239 -req = _urlopen(str(filepath_or_buffer))
246 +url = str(filepath_or_buffer)
247 +req = _urlopen(url)
240 248 if compression == 'infer':
241 -content_encoding = req.headers.get('Content-Encoding', None)
242 -if content_encoding == 'gzip':
243 -compression = 'gzip'
249 +for compression, extension in _compression_to_extension.items():
250 + if url.endswith(extension):
251 + break
244 252 else:
245 -compression = None
246 -# cat on the compression to the tuple returned by the function
247 -to_return = (list(maybe_read_encoded_stream(req, encoding,
248 -compression)) +
249 - [compression])
250 -return tuple(to_return)
253 +content_encoding = req.headers.get('Content-Encoding', None)
254 +compression = 'gzip' if content_encoding == 'gzip' else None
255 +reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
256 +return reader, encoding, compression
251 257
252 258 if _is_s3_url(filepath_or_buffer):
253 259 from pandas.io.s3 import get_filepath_or_buffer