Infer compression from URL extension · pandas-dev/pandas@83b2bc5 (original) (raw)

Original file line number	Diff line number	Diff line change
@@ -63,6 +63,13 @@ def urlopen(*args, **kwargs):
63	63	_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
64	64	_VALID_URLS.discard('')
65	65
	66	+_compression_to_extension = {
	67	+'gzip': '.gz',
	68	+'bz2': '.bz2',
	69	+'zip': '.zip',
	70	+'xz': '.xz',
	71	+}
	72	+
66	73
67	74	class ParserError(ValueError):
68	75	"""
@@ -234,20 +241,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
234	241	-------
235	242	a filepath_or_buffer, the encoding, the compression
236	243	"""
237		-
	244	+
238	245	if _is_url(filepath_or_buffer):
239		-req = _urlopen(str(filepath_or_buffer))
	246	+url = str(filepath_or_buffer)
	247	+req = _urlopen(url)
240	248	if compression == 'infer':
241		-content_encoding = req.headers.get('Content-Encoding', None)
242		-if content_encoding == 'gzip':
243		-compression = 'gzip'
	249	+for compression, extension in _compression_to_extension.items():
	250	+ if url.endswith(extension):
	251	+ break
244	252	else:
245		-compression = None
246		-# cat on the compression to the tuple returned by the function
247		-to_return = (list(maybe_read_encoded_stream(req, encoding,
248		-compression)) +
249		- [compression])
250		-return tuple(to_return)
	253	+content_encoding = req.headers.get('Content-Encoding', None)
	254	+compression = 'gzip' if content_encoding == 'gzip' else None
	255	+reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
	256	+return reader, encoding, compression
251	257
252	258	if _is_s3_url(filepath_or_buffer):
253	259	from pandas.io.s3 import get_filepath_or_buffer