bpo-40328: Add tool for generating cjk mapping headers (GH-19602) · python/cpython@113feb3 (original) (raw)

``

1

`+

`

``

2

`+

# genmap_ja_codecs.py: Japanese Codecs Map Generator

`

``

3

`+

`

``

4

`+

# Original Author: Hye-Shik Chang <perky@FreeBSD.org>

`

``

5

`+

# Modified Author: Dong-hee Na <donghee.na92@gmail.com>

`

``

6

`+

`

``

7

`+

import os

`

``

8

+

``

9

`+

from genmap_support import *

`

``

10

+

``

11

`+

# Byte-range pairs handed to DecodeMapWriter.update_decode_map() in main().
# NOTE(review): each tuple looks like an inclusive (low, high) bound, with
# *_C1 covering the first byte and *_C2 the second byte -- confirm against
# genmap_support.DecodeMapWriter.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)

JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)

JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)

# patches between shift-jis and cp932
CP932P0_C1 = (0x81, 0x81)
CP932P0_C2 = (0x5f, 0xca)

# CP932 P1
CP932P1_C1 = (0x87, 0x87)
CP932P1_C2 = (0x40, 0x9c)

# CP932 P2
CP932P2_C1 = (0xed, 0xfc)
CP932P2_C2 = (0x40, 0xfc)

# Upstream locations of the mapping tables; main() passes each one to
# open_mapping_file() together with the expected local path.
MAPPINGS_JIS0208 = (
    'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
)
MAPPINGS_JIS0212 = (
    'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
)
MAPPINGS_CP932 = (
    'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
)
MAPPINGS_JISX0213_2004 = (
    'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
)

`

``

28

+

``

29

+

``

30

`+

def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into nested decode dictionaries.

    *fo* is any iterable of text lines.  On each line everything from the
    first ``#`` onward is discarded, and lines left with fewer than two
    whitespace-separated columns are skipped.

    Column 1 is read as ``<level><sep><hex code>``: the first character is
    the level digit and the hexadecimal code starts at index 2.
    Column 2 is either ``U+XXXX`` (a single code point) or ``U+XXXX+YYYY``
    (a body/modifier pair).

    Returns a 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2,
    decmap3_pair)``.  The first four map ``code >> 8`` to a dict mapping
    ``code & 0xff`` to the code point (with 0x20000 subtracted in the
    ``*_2`` maps).  ``decmap3_pair`` maps the body code point to the same
    two-level structure holding the modifier.

    Raises ValueError("invalid map") for a pair on a level other than 3, or
    a single code point whose level / range has no destination map.
    ValueError is also raised by int() for malformed numeric fields.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # int() instead of eval(): the fields come from a data file and
        # must only ever be interpreted as numbers.
        loc = int(row[0][2:], 16)
        level = int(row[0][0])
        m = None
        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching m, so an unsupported level or code point
        # range reports "invalid map" rather than failing on None.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair

`

``

73

+

``

74

+

``

75

`+

def main():
    """Build the Japanese codec mapping headers.

    Opens the four mapping tables under ``python-mappings/`` through
    open_mapping_file(), loads them into decode maps, derives the matching
    encode maps, and writes ``mappings_jp.h`` and
    ``mappings_jisx0213_pair.h`` into the current working directory.

    Raises SystemExit when the loaded JIS X 0213 table does not map
    0x2124 to U+FF0C, and ValueError when JIS X 0208 and JIS X 0213
    plane 1 disagree on a shared code.
    """
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # NOTE(review): jisx0208file is handed to loadmap() twice with different
    # columns; this presumably relies on loadmap() rewinding the file --
    # confirm in genmap_support.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # Sanity check on the loaded table before generating anything.
    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # The first multibyte code seen for a code point wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Keep only the CP932 entries that differ from the Shift-JIS encode map.
    # Copies are iterated because entries are deleted along the way.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213bmpencmap = {}
    # Copies are iterated because jis3decmap is pruned inside the loop.
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Make sure the row exists before storing into it; the
                    # other branch below does the same.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    # Already covered by JIS X 0208: drop it from plane 1.
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Entries from the level-4 map are stored with the 0x8000 bit set.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Entries are emitted in sorted (body, modifier, code) order.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")

`

``

249

+

``

250

`+

# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

`