ENH: Add read support for Google Cloud Storage. · pandas-dev/pandas@4c9196a (original) (raw)

2 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -168,6 +168,14 @@ def is_s3_url(url):
168 168 return False
169 169
170 170
def is_google_cloud_storage_url(url):
    """
    Check whether ``url`` uses the Google Cloud Storage ``gs`` scheme.

    Parameters
    ----------
    url : object
        Candidate location. Typically a string, but any object may be
        passed; objects that cannot be parsed as a URL are reported as
        not being a ``gs`` URL.

    Returns
    -------
    bool
        True if the parsed scheme equals ``'gs'``, False otherwise
        (including when parsing ``url`` raises).
    """
    try:
        return parse_url(url).scheme == 'gs'
    except Exception:
        # Non-URL inputs (buffers, numbers, ...) make the parser raise; they
        # are simply "not a gs url".  Catching ``Exception`` rather than
        # using a bare ``except`` lets KeyboardInterrupt / SystemExit
        # propagate instead of being silently turned into ``False``.
        return False
177 +
178 +
171 179 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
172 180 compression=None, mode=None):
173 181 """
@@ -203,6 +211,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
203 211 compression=compression,
204 212 mode=mode)
205 213
214 +if is_google_cloud_storage_url(filepath_or_buffer):
215 +from pandas.io import google_cloud_storage
216 +return google_cloud_storage.get_filepath_or_buffer(
217 +filepath_or_buffer,
218 +encoding=encoding,
219 +compression=compression,
220 +mode=mode)
221 +
206 222 if isinstance(filepath_or_buffer, (compat.string_types,
207 223 compat.binary_type,
208 224 mmap.mmap)):
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
1 +""" Google Cloud Storage support for remote file interactivity """
2 +from io import BytesIO
3 +from pandas import compat
try:
    from google.cloud.storage import Client
except ImportError:
    # Only a failed import is translated into the "library is required"
    # message; any other error raised while importing (or an interrupt)
    # propagates unchanged instead of being mislabelled.
    raise ImportError("The google-cloud-storage library is required to "
                      "read gs:// files")
9 +
10 +if compat.PY3:
11 +from urllib.parse import urlparse as parse_url
12 +else:
13 +from urlparse import urlparse as parse_url
14 +
15 +
16 +def _get_bucket_name(url):
17 +"""Returns the bucket name from the gs:// url"""
18 +result = parse_url(url)
19 +return result.netloc
20 +
21 +
22 +def _get_object_path(url):
23 +"""Returns the object path from the gs:// url"""
24 +result = parse_url(url)
25 +return result.path.lstrip('/')
26 +
27 +
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    Download the object named by a gs:// url into an in-memory buffer.

    Parameters
    ----------
    filepath_or_buffer : str
        A gs://<bucket>/<object> url; the bucket name and object path are
        extracted from it.
    encoding : optional
        Accepted for signature compatibility; not used by this function.
    compression : optional
        Returned unchanged as the third element of the result.
    mode : optional
        Defaults to 'rb' when None; not consulted by the download below.

    Returns
    -------
    tuple
        ``(buffer, None, compression)`` where ``buffer`` is a ``BytesIO``
        holding the downloaded bytes, positioned at offset 0.
    """
    if mode is None:
        mode = 'rb'

    bucket_name = _get_bucket_name(filepath_or_buffer)
    object_path = _get_object_path(filepath_or_buffer)

    gcs_bucket = Client().get_bucket(bucket_name)
    gcs_blob = gcs_bucket.blob(object_path)

    buffer = BytesIO()
    gcs_blob.download_to_file(buffer)
    # Rewind so the caller reads from the start of the downloaded content.
    buffer.seek(0)

    return buffer, None, compression