optimizations due to Fred Drake; added urldefrag() function · python/cpython@3fd32ec (original) (raw)
`@@ -3,6 +3,7 @@
`
3
3
``
4
4
`# Standard/builtin Python modules
`
5
5
`import string
`
``
6
`+
from string import joinfields, splitfields, find, rfind
`
6
7
``
7
8
`# A classification of schemes ('' means apply by default)
`
8
9
`uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
`
`@@ -18,17 +19,23 @@
`
18
19
`# Characters valid in scheme names
`
19
20
`scheme_chars = string.letters + string.digits + '+-.'
`
20
21
``
``
22
`+
# Cache of urlparse() results, keyed by the (url, scheme, allow_framents)
# argument tuple; emptied by clear_cache().
_parse_cache = {}
`
``
23
+
``
24
`+
def clear_cache():
    """Discard every cached urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object itself is left untouched.
    """
    global _parse_cache
    # Rebind rather than mutate, so the old dict is simply dropped.
    _parse_cache = {}
`
``
27
+
``
28
+
21
29
`# Parse a URL into 6 components:
`
22
30
`# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
`
23
31
`# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
`
24
32
`# Note that we don't break the components up in smaller bits
`
25
33
`# (e.g. netloc is a single string) and we don't expand % escapes.
`
26
34
`def urlparse(url, scheme = '', allow_framents = 1):
`
27
``
`-
netloc = ''
`
28
``
`-
path = ''
`
29
``
`-
params = ''
`
30
``
`-
query = ''
`
31
``
`-
fragment = ''
`
``
35
`+
key = url, scheme, allow_framents
`
``
36
`+
if _parse_cache.has_key(key):
`
``
37
`+
return _parse_cache[key]
`
``
38
`+
netloc = path = params = query = fragment = ''
`
32
39
`i = string.find(url, ':')
`
33
40
`if i > 0:
`
34
41
`for c in url[:i]:
`
`@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):
`
54
61
`i = string.find(url, ';')
`
55
62
`if i >= 0:
`
56
63
`url, params = url[:i], url[i+1:]
`
57
``
`-
return scheme, netloc, url, params, query, fragment
`
``
64
`+
tuple = scheme, netloc, url, params, query, fragment
`
``
65
`+
_parse_cache[key] = tuple
`
``
66
`+
return tuple
`
58
67
``
59
68
`# Put a parsed URL back together again. This may result in a slightly
`
60
69
`# different, but equivalent URL, if the URL that was parsed originally
`
`@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):
`
80
89
`if not base:
`
81
90
`return url
`
82
91
`bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
`
83
``
`-
urlparse(base, '', allow_framents)
`
``
92
`+
urlparse(base, '', allow_framents)
`
84
93
`scheme, netloc, path, params, query, fragment = \
`
85
94
`urlparse(url, bscheme, allow_framents)
`
86
95
`# XXX Unofficial hack: default netloc to bnetloc even if
`
`@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):
`
90
99
`scheme in uses_netloc and bscheme in uses_netloc:
`
91
100
`netloc = bnetloc
`
92
101
`# Strip the port number
`
93
``
`-
i = string.find(netloc, '@')
`
``
102
`+
i = find(netloc, '@')
`
94
103
`if i < 0: i = 0
`
95
``
`-
i = string.find(netloc, ':', i)
`
``
104
`+
i = find(netloc, ':', i)
`
96
105
`if i >= 0:
`
97
106
`netloc = netloc[:i]
`
98
107
`if scheme != bscheme or scheme not in uses_relative:
`
`@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):
`
107
116
`return urlunparse((scheme, netloc, path,
`
108
117
`params, query, fragment))
`
109
118
`if not path:
`
110
``
`-
path = bpath
`
111
``
`-
if not query:
`
112
``
`-
query = bquery
`
113
``
`-
return urlunparse((scheme, netloc, path,
`
114
``
`-
params, query, fragment))
`
115
``
`-
i = string.rfind(bpath, '/')
`
``
119
`+
return urlunparse((scheme, netloc, bpath,
`
``
120
`+
params, query or bquery, fragment))
`
``
121
`+
i = rfind(bpath, '/')
`
116
122
`if i >= 0:
`
117
123
`path = bpath[:i] + '/' + path
`
118
``
`-
segments = string.splitfields(path, '/')
`
``
124
`+
segments = splitfields(path, '/')
`
119
125
`if segments[-1] == '.':
`
120
126
`segments[-1] = ''
`
121
127
`while '.' in segments:
`
`@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):
`
132
138
`break
`
133
139
`if len(segments) >= 2 and segments[-1] == '..':
`
134
140
`segments[-2:] = ['']
`
135
``
`-
path = string.joinfields(segments, '/')
`
136
``
`-
return urlunparse((scheme, netloc, path,
`
``
141
`+
return urlunparse((scheme, netloc, joinfields(segments, '/'),
`
137
142
`params, query, fragment))
`
138
143
``
``
144
`+
def urldefrag(url):
    """Split a URL into its fragment-free form and its fragment.

    The URL is parsed with urlparse(), then reassembled with
    urlunparse() using an empty fragment component.

    Returns a 2-tuple (url_without_fragment, fragment).  When the URL
    has no fragment, the second item is the empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Rebuild from the same components, blanking only the fragment slot.
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
`
``
154
+
``
155
+
139
156
`test_input = """
`
140
157
`
141
158
``