Add remote file fetcher and N3 test suite · RDFLib/rdflib@02dd2bf (original) (raw)
``
1
`+
#!/usr/bin/env python
`
``
2
`+
import argparse
`
``
3
`+
import enum
`
``
4
`+
import logging
`
``
5
`+
import os
`
``
6
`+
import random
`
``
7
`+
import re
`
``
8
`+
import shutil
`
``
9
`+
import string
`
``
10
`+
import sys
`
``
11
`+
import tarfile
`
``
12
`+
from contextlib import ExitStack, contextmanager
`
``
13
`+
from dataclasses import dataclass, field
`
``
14
`+
from pathlib import Path
`
``
15
`+
from tarfile import TarFile, TarInfo
`
``
16
`+
from tempfile import TemporaryDirectory, mkdtemp
`
``
17
`+
from typing import IO, Generator, List, Pattern, Union
`
``
18
`+
from urllib.request import Request, urlopen
`
``
19
`+
from zipfile import ZipFile, ZipInfo
`
``
20
+
``
21
`+
# Directory containing this script; every resource's ``local_path`` in this
# file is built relative to it.
DATA_PATH = Path(__file__).parent
`
``
22
+
``
23
+
``
24
`+
@dataclass
class Resource:
    """
    Base description of a downloadable resource: where it comes from and
    where it is stored locally.

    This class only carries the data; :meth:`fetch` must be provided by
    subclasses.
    """

    # URL string, or a ready-made ``Request`` (e.g. to send custom headers).
    remote: Union[str, Request]
    # File or directory the fetched content is written to.
    local_path: Path

    def fetch(self, tmp_path: Path) -> None:
        """
        Retrieve ``remote`` and store it at ``local_path``.

        :param tmp_path: directory available for intermediate files.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
`
``
31
+
``
32
+
``
33
`+
@dataclass
class FileResource(Resource):
    """A single remote file that is copied verbatim to ``local_path``."""

    def fetch(self, tmp_path: Path) -> None:
        """
        Download ``remote`` and write its bytes to ``local_path``.

        A file already present at ``local_path`` is removed first.

        :param tmp_path: not used by this resource type.
        """
        target = self.local_path
        if target.exists():
            logging.debug("info %s", target)
            os.remove(target)

        # Accept either a plain URL or a caller-prepared Request.
        if isinstance(self.remote, Request):
            request = self.remote
        else:
            request = Request(self.remote)

        with urlopen(request) as remote_io, target.open("wb+") as local_io:
            shutil.copyfileobj(remote_io, local_io)

        logging.info("Downloaded %s to %s", request.full_url, target)
`
``
53
+
``
54
+
``
55
`+
class ArchiveType(enum.Enum):
    """
    Archive formats understood by the fetcher.

    Each member's value is the file-name suffix of that format.
    """

    ZIP = "zip"
    TAR_GZ = "tar.gz"
`
``
58
+
``
59
+
``
60
`+
@dataclass
class ArchiveResource(Resource):
    """
    A remote archive from which selected members are extracted into the
    directory ``local_path``.

    Every non-directory member whose name matches ``pattern`` is written
    below ``local_path``. If the pattern has a capture group, the first
    group is the destination path relative to ``local_path``; otherwise
    the full member name is used.
    """

    # Archive format; selects the reader and the temp-file suffix.
    type: ArchiveType
    # Applied with ``match()`` to each member name.
    pattern: Pattern[str]

    def fetch(self, tmp_path: Path) -> None:
        """
        Download the archive into ``tmp_path`` and extract matching members.

        An existing ``local_path`` directory tree is removed first.

        :param tmp_path: directory in which the downloaded archive is stored.
        :raises ValueError: if ``type`` is not a supported archive type, if
            a member's destination would fall outside ``local_path``, or if
            a matching tar member has no file content.
        """
        if self.local_path.exists():
            logging.debug("info %s", self.local_path)
            shutil.rmtree(self.local_path)

        request = (
            self.remote if isinstance(self.remote, Request) else Request(self.remote)
        )
        with ExitStack() as xstack:
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
            # Random stem so that several downloads can share ``tmp_path``.
            name = (
                "".join(
                    random.choices(
                        string.ascii_uppercase + string.digits + string.ascii_lowercase,
                        k=10,
                    )
                )
                + f".{self.type.value}"
            )
            tmp_file = tmp_path / name
            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
            with tmp_file.open("wb+") as tmp_io:
                shutil.copyfileobj(remote_io, tmp_io)

            archive_file: Union[ZipFile, TarFile]
            if self.type is ArchiveType.ZIP:
                archive_file = xstack.enter_context(ZipFile(tmp_file))
            elif self.type is ArchiveType.TAR_GZ:
                archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
            else:
                raise ValueError(f"invalid type {self.type}")

            for member_info in self._member_list(archive_file):
                member_filename = self._member_filename(member_info)
                if self._member_isdir(member_info):
                    logging.debug("Ignoring directory %s", member_filename)
                    continue

                match = self.pattern.match(member_filename)
                if match is None:
                    logging.debug("Ignoring unmatched %s", member_filename)
                    continue

                # Without a (non-empty) first capture group, keep the
                # member's own name instead of leaving the destination
                # undefined or stale from a previous iteration.
                groups = match.groups()
                dest_filename = groups[0] if groups and groups[0] else member_filename

                # Archive content is downloaded data: never let a member
                # name address a location outside ``local_path``.
                relative_dest = Path(dest_filename)
                if relative_dest.anchor or ".." in relative_dest.parts:
                    raise ValueError(
                        f"unsafe member path {member_filename!r} in {request.full_url}"
                    )

                local_file = self.local_path / relative_dest
                local_file.parent.mkdir(parents=True, exist_ok=True)
                logging.debug("writing %s to %s", member_filename, local_file)
                with self._member_io(
                    archive_file, member_info
                ) as member_io, local_file.open("wb") as local_io:
                    # Stream instead of loading the whole member into memory.
                    shutil.copyfileobj(member_io, local_io)

        logging.info(
            "Downloaded %s and extracted files matching %s to %s",
            request.full_url,
            self.pattern,
            self.local_path,
        )

    @classmethod
    def _member_list(
        cls, archive: Union[ZipFile, TarFile]
    ) -> Union[List[ZipInfo], List[TarInfo]]:
        """Return the member descriptors of ``archive``."""
        if isinstance(archive, ZipFile):
            return archive.infolist()
        return archive.getmembers()

    @classmethod
    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
        """Return whether ``member_info`` describes a directory entry."""
        if isinstance(member_info, ZipInfo):
            return member_info.is_dir()
        return member_info.isdir()

    @classmethod
    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
        """Return the name of the member inside its archive."""
        if isinstance(member_info, ZipInfo):
            return member_info.filename
        return member_info.name

    @classmethod
    @contextmanager
    def _member_io(
        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
    ) -> Generator[IO[bytes], None, None]:
        """
        Context manager yielding a binary reader for one archive member.

        The reader is closed when the context exits.

        :raises ValueError: if a tar member has no extractable content.
        """
        if isinstance(archive, ZipFile):
            assert isinstance(member_info, ZipInfo)
            with archive.open(member_info) as member_io:
                yield member_io
        else:
            assert isinstance(member_info, TarInfo)
            opt_io = archive.extractfile(member_info)
            if opt_io is None:
                raise ValueError(f"tar member {member_info.name!r} has no file content")
            with opt_io:
                yield opt_io
`
``
163
+
``
164
+
``
165
`+
# Everything this script can fetch. For archive resources, the first capture
# group of ``pattern`` is the path a member is written to, relative to
# ``local_path``.
RESOURCES: List[Resource] = [
    ArchiveResource(
        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
        type=ArchiveType.ZIP,
        # Keep only what is below "<top-level dir>/tests/".
        pattern=re.compile(r"^[^/]+[/]tests[/](.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
        type=ArchiveType.TAR_GZ,
        # Strip the single top-level directory of the archive.
        pattern=re.compile(r"^[^/]+[/](.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # NOTE: Commented out as these files contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
    #     type=ArchiveType.ZIP,
    #     pattern=re.compile(r"^(.+)$"),
    # ),
    # NOTE: Commented out as this contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
    #     type=ArchiveType.TAR_GZ,
    #     pattern=re.compile(r"^[^/]+[/](.+)$"),
    # ),
    FileResource(
        # Ask the server for the Turtle serialization of the RDFS vocabulary.
        remote=Request(
            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
        ),
        local_path=(DATA_PATH / "rdfs.ttl"),
    ),
]
`
``
217
+
``
218
+
``
219
`+
@dataclass
class Application:
    """
    Command-line front end that fetches the entries of ``RESOURCES``.

    The argument parser is configured in ``__post_init__``; :meth:`run`
    parses arguments, adjusts logging verbosity and dispatches to
    :meth:`handle`.
    """

    parser: argparse.ArgumentParser = field(
        default_factory=lambda: argparse.ArgumentParser(add_help=True)
    )

    def __post_init__(self) -> None:
        """Register the command-line options on ``self.parser``."""
        parser = self.parser
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            dest="verbosity",
            help="increase verbosity level",
        )
        parser.add_argument(
            "--keep-tmp",
            action="store_true",
            default=False,
            help="do not delete the temporary download directory",
        )
        parser.add_argument(
            "paths",
            nargs="*",
            type=str,
            help="only fetch resources located at or below these paths",
        )
        parser.set_defaults(handler=self.handle)

    def run(self, args: List[str]) -> None:
        """
        Parse ``args``, apply the requested verbosity and run the handler.

        :param args: command-line arguments without the program name.
        """
        parse_result = self.parser.parse_args(args)

        verbosity = parse_result.verbosity
        if verbosity is not None:
            root_logger = logging.getLogger("")
            root_logger.propagate = True
            # The first -v lowers the level by 10; each further -v lowers
            # it by 1 more, capped at 9 extra steps.
            new_level = (
                root_logger.getEffectiveLevel()
                - (min(1, verbosity)) * 10
                - min(max(0, verbosity - 1), 9) * 1
            )
            root_logger.setLevel(new_level)

        logging.debug(
            "args = %s, parse_result = %s, logging.level = %s",
            args,
            parse_result,
            logging.getLogger("").getEffectiveLevel(),
        )

        parse_result.handler(parse_result)

    def handle(self, parse_result: argparse.Namespace) -> None:
        """
        Fetch every resource selected by ``parse_result.paths``.

        With no paths given, all resources are fetched. Otherwise a resource
        is fetched only if its absolute ``local_path`` equals, or lies
        below, one of the given paths.

        :param parse_result: namespace produced by ``self.parser``.
        """
        logging.debug("entry ...")

        paths = {Path(path).absolute() for path in parse_result.paths}

        logging.debug("paths = %s", paths)

        with ExitStack() as xstack:
            if parse_result.keep_tmp:
                # mkdtemp() directories are never removed automatically.
                tmp_path = Path(mkdtemp())
            else:
                # Removed deterministically when the ExitStack unwinds.
                tmp_path = Path(xstack.enter_context(TemporaryDirectory()))

            for resource in RESOURCES:
                if paths:
                    target = resource.local_path.absolute()
                    include = any(
                        path == target or path in target.parents for path in paths
                    )
                    if not include:
                        logging.info("skipping %s", resource.local_path)
                        continue
                resource.fetch(tmp_path)
`
``
292
+
``
293
+
``
294
`+
def main() -> None:
    """
    Script entry point: set up logging on stderr, then run the
    application with the process's command-line arguments.

    The log level is taken from the ``PYLOGGING_LEVEL`` environment
    variable when set, and defaults to ``logging.INFO`` otherwise.
    """
    log_format = (
        "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
        "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
    )
    log_level = os.environ.get("PYLOGGING_LEVEL", logging.INFO)
    logging.basicConfig(
        level=log_level,
        stream=sys.stderr,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format=log_format,
    )

    cli_args = sys.argv[1:]
    Application().run(cli_args)
`
``
306
+
``
307
+
``
308
`+
# Run the fetcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
`