Add remote file fetcher and N3 test suite · RDFLib/rdflib@02dd2bf (original) (raw)

``

1

`+

#!/usr/bin/env python

`

``

2

`+

import argparse

`

``

3

`+

import enum

`

``

4

`+

import logging

`

``

5

`+

import os

`

``

6

`+

import random

`

``

7

`+

import re

`

``

8

`+

import shutil

`

``

9

`+

import string

`

``

10

`+

import sys

`

``

11

`+

import tarfile

`

``

12

`+

from contextlib import ExitStack, contextmanager

`

``

13

`+

from dataclasses import dataclass, field

`

``

14

`+

from pathlib import Path

`

``

15

`+

from tarfile import TarFile, TarInfo

`

``

16

`+

from tempfile import TemporaryDirectory, mkdtemp

`

``

17

`+

from typing import IO, Generator, List, Pattern, Union

`

``

18

`+

from urllib.request import Request, urlopen

`

``

19

`+

from zipfile import ZipFile, ZipInfo

`

``

20

+

``

21

`+

# Directory containing this script; used as the root for local resource paths.
DATA_PATH = Path(__file__).parent

`

``

22

+

``

23

+

``

24

`+

@dataclass
class Resource:
    """A remote resource paired with the local path it is fetched to.

    This base class only defines the interface; ``fetch`` must be overridden.
    """

    # URL string, or a prepared ``Request`` (e.g. one carrying custom headers).
    remote: Union[str, Request]
    # Location on disk that the fetched content is written to.
    local_path: Path

    def fetch(self, tmp_path: Path) -> None:
        """Fetch the resource to ``local_path``.

        :param tmp_path: directory that may be used for temporary files.
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

`

``

31

+

``

32

+

``

33

`+

@dataclass
class FileResource(Resource):
    """A resource consisting of a single file that is downloaded verbatim."""

    def fetch(self, tmp_path: Path) -> None:
        """Replace ``local_path`` with the content retrieved from ``remote``.

        Any existing file at ``local_path`` is removed before downloading.

        :param tmp_path: not used by this implementation.
        """
        if self.local_path.exists():
            logging.debug("info %s", self.local_path)
            os.remove(self.local_path)

        # Accept either a ready-made Request or a plain URL string.
        request = (
            self.remote if isinstance(self.remote, Request) else Request(self.remote)
        )
        with ExitStack() as xstack:
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
            local_io = xstack.enter_context(self.local_path.open("wb+"))
            shutil.copyfileobj(remote_io, local_io)

        logging.info("Downloaded %s to %s", request.full_url, self.local_path)

`

``

53

+

``

54

+

``

55

`+

class ArchiveType(enum.Enum):
    """Archive formats that can be fetched; each value is the usual file extension."""

    ZIP = "zip"
    TAR_GZ = "tar.gz"

`

``

58

+

``

59

+

``

60

`+

@dataclass
class ArchiveResource(Resource):
    """An archive that is downloaded and then selectively extracted.

    Members whose names match ``pattern`` are written below ``local_path``.
    """

    # Archive format; selects how the downloaded file is opened.
    type: ArchiveType
    # Matched against each member name. Capture group 1, when the pattern has
    # one, is the destination path relative to ``local_path``; otherwise the
    # member name itself is used.
    pattern: Pattern[str]

    def fetch(self, tmp_path: Path) -> None:
        """Download the archive and extract the members matching ``pattern``.

        Any existing ``local_path`` directory tree is removed first.

        :param tmp_path: directory the downloaded archive is stored in.
        :raises ValueError: if ``type`` is not a supported archive type, or a
            matching tar member cannot be read as a file.
        """
        if self.local_path.exists():
            logging.debug("info %s", self.local_path)
            shutil.rmtree(self.local_path)

        # Accept either a ready-made Request or a plain URL string.
        request = (
            self.remote if isinstance(self.remote, Request) else Request(self.remote)
        )
        with ExitStack() as xstack:
            remote_io: IO[bytes] = xstack.enter_context(urlopen(request))
            name = "".join(
                random.choices(
                    string.ascii_uppercase + string.digits + string.ascii_lowercase,
                    k=10,
                )
            )
            # The suffix makes the downloaded file's format recognisable.
            tmp_file = tmp_path / f"{name}.{self.type.value}"
            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
            with tmp_file.open("wb+") as tmp_io:
                shutil.copyfileobj(remote_io, tmp_io)

            archive_file: Union[ZipFile, TarFile]
            if self.type is ArchiveType.ZIP:
                archive_file = xstack.enter_context(ZipFile(tmp_file))
            elif self.type is ArchiveType.TAR_GZ:
                archive_file = xstack.enter_context(
                    tarfile.open(tmp_file, mode="r:gz")
                )
            else:
                raise ValueError(f"invalid type {self.type}")

            local_root = self.local_path.resolve()
            for member_info in self._member_list(archive_file):
                member_filename = self._member_filename(member_info)
                if self._member_isdir(member_info):
                    logging.debug("Ignoring directory %s", member_filename)
                    continue

                match = self.pattern.match(member_filename)
                if match is None:
                    logging.debug("Ignoring unmatched %s", member_filename)
                    continue
                groups = match.groups()
                dest_filename = groups[0] if groups else member_filename

                local_file = self.local_path / dest_filename
                # Member names come from the downloaded archive, so refuse any
                # destination that would land outside ``local_path``.
                try:
                    local_file.resolve().relative_to(local_root)
                except ValueError:
                    logging.warning(
                        "Ignoring %s: destination %s is outside %s",
                        member_filename,
                        local_file,
                        self.local_path,
                    )
                    continue

                member_io: IO[bytes]
                with self._member_io(archive_file, member_info) as member_io:
                    local_file.parent.mkdir(parents=True, exist_ok=True)
                    logging.debug("writing %s to %s", member_filename, local_file)
                    local_file.write_bytes(member_io.read())

        logging.info(
            "Downloaded %s and extracted files matching %s to %s",
            request.full_url,
            self.pattern,
            self.local_path,
        )

    @classmethod
    def _member_list(
        cls, archive: Union[ZipFile, TarFile]
    ) -> Union[List[ZipInfo], List[TarInfo]]:
        """Return the metadata entries of every member of ``archive``."""
        if isinstance(archive, ZipFile):
            return archive.infolist()
        return archive.getmembers()

    @classmethod
    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
        """Return whether ``member_info`` describes a directory."""
        if isinstance(member_info, ZipInfo):
            return member_info.is_dir()
        return member_info.isdir()

    @classmethod
    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
        """Return the member's name as recorded in the archive."""
        if isinstance(member_info, ZipInfo):
            return member_info.filename
        return member_info.name

    @classmethod
    @contextmanager
    def _member_io(
        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
    ) -> Generator[IO[bytes], None, None]:
        """Yield a binary stream over the member's content, closed on exit.

        :raises ValueError: if a tar member has no readable content.
        """
        if isinstance(archive, ZipFile):
            assert isinstance(member_info, ZipInfo)
            with archive.open(member_info) as member_io:
                yield member_io
        else:
            assert isinstance(member_info, TarInfo)
            opt_io = archive.extractfile(member_info)
            if opt_io is None:
                # extractfile() yields None for members it cannot read as files.
                raise ValueError(f"cannot read archive member {member_info.name}")
            with opt_io:
                yield opt_io

`

``

163

+

``

164

+

``

165

`+

# Remote resources to fetch, each with the local path it is stored at.
RESOURCES: List[Resource] = [
    ArchiveResource(
        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
        type=ArchiveType.ZIP,
        # Keep only files below "<top-level dir>/tests/"; the captured group
        # is the path they are stored at.
        pattern=re.compile(r"^[^/]+/tests/(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
        type=ArchiveType.TAR_GZ,
        # Drop the leading path component of every member name.
        pattern=re.compile(r"^[^/]+/(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    ArchiveResource(
        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
        type=ArchiveType.TAR_GZ,
        pattern=re.compile(r"^(.+)$"),
    ),
    # NOTE: Commented out as these files contain local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
    #     type=ArchiveType.ZIP,
    #     pattern=re.compile(r"^(.+)$"),
    # ),
    # NOTE: Commented out as this contains local modifications.
    # ArchiveResource(
    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
    #     type=ArchiveType.TAR_GZ,
    #     pattern=re.compile(r"^[^/]+/(.+)$"),
    # ),
    FileResource(
        remote=Request(
            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
        ),
        local_path=(DATA_PATH / "rdfs.ttl"),
    ),
]

`

``

217

+

``

218

+

``

219

`+

@dataclass
class Application:
    """Command-line front end that fetches the entries of ``RESOURCES``."""

    parser: argparse.ArgumentParser = field(
        default_factory=lambda: argparse.ArgumentParser(add_help=True)
    )

    def __post_init__(self) -> None:
        """Register the command-line arguments on ``parser``."""
        parser = self.parser
        parser.add_argument(
            "-v",
            "--verbose",
            action="count",
            dest="verbosity",
            help="increase verbosity level",
        )
        parser.add_argument(
            "--keep-tmp",
            action="store_true",
            default=False,
        )
        parser.add_argument("paths", nargs="*", type=str)
        parser.set_defaults(handler=self.handle)

    def run(self, args: List[str]) -> None:
        """Parse ``args``, adjust the log level and invoke the handler.

        :param args: command-line arguments, without the program name.
        """
        parse_result = self.parser.parse_args(args)

        verbosity = parse_result.verbosity
        if verbosity is not None:
            root_logger = logging.getLogger("")
            root_logger.propagate = True
            # The first -v lowers the threshold by 10; each further -v lowers
            # it by 1 more, for at most 9 extra steps.
            new_level = (
                root_logger.getEffectiveLevel()
                - min(1, verbosity) * 10
                - min(max(0, verbosity - 1), 9) * 1
            )
            root_logger.setLevel(new_level)

        logging.debug(
            "args = %s, parse_result = %s, logging.level = %s",
            args,
            parse_result,
            logging.getLogger("").getEffectiveLevel(),
        )

        parse_result.handler(parse_result)

    def handle(self, parse_result: argparse.Namespace) -> None:
        """Fetch every resource selected by the parsed arguments.

        When paths were given, only resources whose local path equals or lies
        below one of them are fetched; otherwise all resources are fetched.

        :param parse_result: namespace produced by ``parser``.
        """
        logging.debug("entry ...")

        paths = {Path(path).absolute() for path in parse_result.paths}

        logging.debug("paths = %s", paths)

        with ExitStack() as xstack:
            if parse_result.keep_tmp:
                # mkdtemp() directories are never removed automatically.
                tmp_path = Path(mkdtemp())
            else:
                # Removed again when the ExitStack unwinds.
                tmp_path = Path(xstack.enter_context(TemporaryDirectory()))

            for resource in RESOURCES:
                if paths and not any(
                    self._is_under(resource.local_path, path) for path in paths
                ):
                    logging.info("skipping %s", resource.local_path)
                    continue
                resource.fetch(tmp_path)

    @staticmethod
    def _is_under(candidate: Path, parent: Path) -> bool:
        """Return whether ``candidate`` equals or lies below ``parent``."""
        try:
            candidate.absolute().relative_to(parent)
        except ValueError:
            # not relative to, ignoring
            return False
        return True

`

``

292

+

``

293

+

``

294

`+

def main() -> None:
    """Configure logging to stderr and run the application on ``sys.argv``.

    The log level is read from the ``PYLOGGING_LEVEL`` environment variable
    and defaults to ``logging.INFO``.
    """
    logging.basicConfig(
        level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
        stream=sys.stderr,
        datefmt="%Y-%m-%dT%H:%M:%S",
        format=(
            "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
            "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
        ),
    )

    Application().run(sys.argv[1:])

`

``

306

+

``

307

+

``

308

`+

# Run the fetcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

`