optimizations due to Fred Drake; added urldefrag() function · python/cpython@3fd32ec (original) (raw)

`@@ -3,6 +3,7 @@

3

4

`# Standard/builtin Python modules

5

`import string

6

from string import joinfields, splitfields, find, rfind

6

7

8

`# A classification of schemes ('' means apply by default)

8

9

`uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',

`@@ -18,17 +19,23 @@

18

19

`# Characters valid in scheme names

19

20

`scheme_chars = string.letters + string.digits + '+-.'

20

21

22

_parse_cache = {}

23

+

24

def clear_cache():
    """Discard every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object is left untouched.
    """
    global _parse_cache
    _parse_cache = dict()

27

+

28

+

21

29

`# Parse a URL into 6 components:

22

30

`# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

23

31

`# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

24

32

`# Note that we don't break the components up in smaller bits

25

33

`# (e.g. netloc is a single string) and we don't expand % escapes.

26

34

`def urlparse(url, scheme = '', allow_framents = 1):

27

netloc = ''

28

path = ''

29

params = ''

30

query = ''

31

fragment = ''

35

key = url, scheme, allow_framents

36

if _parse_cache.has_key(key):

37

return _parse_cache[key]

38

netloc = path = params = query = fragment = ''

32

39

`i = string.find(url, ':')

33

40

`if i > 0:

34

41

`for c in url[:i]:

`@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):

54

61

`i = string.find(url, ';')

55

62

`if i >= 0:

56

63

`url, params = url[:i], url[i+1:]

57

return scheme, netloc, url, params, query, fragment

64

tuple = scheme, netloc, url, params, query, fragment

65

_parse_cache[key] = tuple

66

return tuple

58

67

59

68

`# Put a parsed URL back together again. This may result in a slightly

60

69

`# different, but equivalent URL, if the URL that was parsed originally

`@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):

80

89

`if not base:

81

90

`return url

82

91

`bscheme, bnetloc, bpath, bparams, bquery, bfragment = \

83

urlparse(base, '', allow_framents)

92

urlparse(base, '', allow_framents)

84

93

`scheme, netloc, path, params, query, fragment = \

85

94

`urlparse(url, bscheme, allow_framents)

86

95

`# XXX Unofficial hack: default netloc to bnetloc even if

`@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):

90

99

`scheme in uses_netloc and bscheme in uses_netloc:

91

100

`netloc = bnetloc

92

101

`# Strip the port number

93

i = string.find(netloc, '@')

102

i = find(netloc, '@')

94

103

`if i < 0: i = 0

95

i = string.find(netloc, ':', i)

104

i = find(netloc, ':', i)

96

105

`if i >= 0:

97

106

`netloc = netloc[:i]

98

107

`if scheme != bscheme or scheme not in uses_relative:

`@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):

107

116

`return urlunparse((scheme, netloc, path,

108

117

`params, query, fragment))

109

118

`if not path:

110

path = bpath

111

if not query:

112

query = bquery

113

return urlunparse((scheme, netloc, path,

114

params, query, fragment))

115

i = string.rfind(bpath, '/')

119

return urlunparse((scheme, netloc, bpath,

120

params, query or bquery, fragment))

121

i = rfind(bpath, '/')

116

122

`if i >= 0:

117

123

`path = bpath[:i] + '/' + path

118

segments = string.splitfields(path, '/')

124

segments = splitfields(path, '/')

119

125

`if segments[-1] == '.':

120

126

`segments[-1] = ''

121

127

`while '.' in segments:

`@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):

132

138

`break

133

139

`if len(segments) >= 2 and segments[-1] == '..':

134

140

`segments[-2:] = ['']

135

path = string.joinfields(segments, '/')

136

return urlunparse((scheme, netloc, path,

141

return urlunparse((scheme, netloc, joinfields(segments, '/'),

137

142

`params, query, fragment))

138

143

144

def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contains no fragment, the URL is returned unmodified and
    the second element is the empty string.
    """
    if '#' not in url:
        # No fragment delimiter present: hand the URL back untouched
        # instead of round-tripping it through urlparse()/urlunparse(),
        # which may yield a slightly different (though equivalent) URL.
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    # Reassemble with an empty fragment slot to drop the '#...' part.
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag

154

+

155

+

139

156

`test_input = """

140

157

` http://a/b/c/d

141

158