optimizations due to Fred Drake; added urldefrag() function · python/cpython@3fd32ec (original) (raw)

`@@ -3,6 +3,7 @@

`

3

3

``

4

4

`# Standard/builtin Python modules

`

5

5

`import string

`

``

6

`+

from string import joinfields, splitfields, find, rfind

`

6

7

``

7

8

`# A classification of schemes ('' means apply by default)

`

8

9

`uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',

`

`@@ -18,17 +19,23 @@

`

18

19

`# Characters valid in scheme names

`

19

20

`scheme_chars = string.letters + string.digits + '+-.'

`

20

21

``

``

22

`+

_parse_cache = {}

`

``

23

+

``

24

`+

def clear_cache():

`

``

25

`+

global _parse_cache

`

``

26

`+

_parse_cache = {}

`

``

27

+

``

28

+

21

29

`# Parse a URL into 6 components:

`

22

30

`# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

`

23

31

`# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

`

24

32

`# Note that we don't break the components up in smaller bits

`

25

33

`# (e.g. netloc is a single string) and we don't expand % escapes.

`

26

34

`def urlparse(url, scheme = '', allow_framents = 1):

`

27

``

`-

netloc = ''

`

28

``

`-

path = ''

`

29

``

`-

params = ''

`

30

``

`-

query = ''

`

31

``

`-

fragment = ''

`

``

35

`+

key = url, scheme, allow_framents

`

``

36

`+

if _parse_cache.has_key(key):

`

``

37

`+

return _parse_cache[key]

`

``

38

`+

netloc = path = params = query = fragment = ''

`

32

39

`i = string.find(url, ':')

`

33

40

`if i > 0:

`

34

41

`for c in url[:i]:

`

`@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):

`

54

61

`i = string.find(url, ';')

`

55

62

`if i >= 0:

`

56

63

`url, params = url[:i], url[i+1:]

`

57

``

`-

return scheme, netloc, url, params, query, fragment

`

``

64

`+

tuple = scheme, netloc, url, params, query, fragment

`

``

65

`+

_parse_cache[key] = tuple

`

``

66

`+

return tuple

`

58

67

``

59

68

`# Put a parsed URL back together again. This may result in a slightly

`

60

69

`# different, but equivalent URL, if the URL that was parsed originally

`

`@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):

`

80

89

`if not base:

`

81

90

`return url

`

82

91

`bscheme, bnetloc, bpath, bparams, bquery, bfragment = \

`

83

``

`-

urlparse(base, '', allow_framents)

`

``

92

`+

urlparse(base, '', allow_framents)

`

84

93

`scheme, netloc, path, params, query, fragment = \

`

85

94

`urlparse(url, bscheme, allow_framents)

`

86

95

`# XXX Unofficial hack: default netloc to bnetloc even if

`

`@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):

`

90

99

`scheme in uses_netloc and bscheme in uses_netloc:

`

91

100

`netloc = bnetloc

`

92

101

`# Strip the port number

`

93

``

`-

i = string.find(netloc, '@')

`

``

102

`+

i = find(netloc, '@')

`

94

103

`if i < 0: i = 0

`

95

``

`-

i = string.find(netloc, ':', i)

`

``

104

`+

i = find(netloc, ':', i)

`

96

105

`if i >= 0:

`

97

106

`netloc = netloc[:i]

`

98

107

`if scheme != bscheme or scheme not in uses_relative:

`

`@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):

`

107

116

`return urlunparse((scheme, netloc, path,

`

108

117

`params, query, fragment))

`

109

118

`if not path:

`

110

``

`-

path = bpath

`

111

``

`-

if not query:

`

112

``

`-

query = bquery

`

113

``

`-

return urlunparse((scheme, netloc, path,

`

114

``

`-

params, query, fragment))

`

115

``

`-

i = string.rfind(bpath, '/')

`

``

119

`+

return urlunparse((scheme, netloc, bpath,

`

``

120

`+

params, query or bquery, fragment))

`

``

121

`+

i = rfind(bpath, '/')

`

116

122

`if i >= 0:

`

117

123

`path = bpath[:i] + '/' + path

`

118

``

`-

segments = string.splitfields(path, '/')

`

``

124

`+

segments = splitfields(path, '/')

`

119

125

`if segments[-1] == '.':

`

120

126

`segments[-1] = ''

`

121

127

`while '.' in segments:

`

`@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):

`

132

138

`break

`

133

139

`if len(segments) >= 2 and segments[-1] == '..':

`

134

140

`segments[-2:] = ['']

`

135

``

`-

path = string.joinfields(segments, '/')

`

136

``

`-

return urlunparse((scheme, netloc, path,

`

``

141

`+

return urlunparse((scheme, netloc, joinfields(segments, '/'),

`

137

142

`params, query, fragment))

`

138

143

``

``

144

`+

def urldefrag(url):

`

``

145

`+

"""Removes any existing fragment from URL.

`

``

146

+

``

147

`+

Returns a tuple of the defragmented URL and the fragment. If

`

``

148

`+

the URL contained no fragments, the second element is the

`

``

149

`+

empty string.

`

``

150

`+

"""

`

``

151

`+

s, n, p, a, q, frag = urlparse(url)

`

``

152

`+

defrag = urlunparse((s, n, p, a, q, ''))

`

``

153

`+

return defrag, frag

`

``

154

+

``

155

+

139

156

`test_input = """

`

140

157

` http://a/b/c/d

`

141

158

``