bpo-40328: Add tool for generating cjk mapping headers (GH-19602) · python/cpython@113feb3 (original) (raw)
``
1
`+
`
``
2
`+
# genmap_ja_codecs.py: Japanese Codecs Map Generator
`
``
3
`+
`
``
4
`+
# Original Author: Hye-Shik Chang <perky@FreeBSD.org>
`
``
5
`+
# Modified Author: Dong-hee Na <donghee.na92@gmail.com>
`
``
6
`+
`
``
7
`+
import os
`
``
8
+
``
9
`+
from genmap_support import *
`
``
10
+
``
11
`+
# Byte ranges handed to DecodeMapWriter.update_decode_map() in main().
# Each *_C1 / *_C2 pair is a (low, high) tuple.
# NOTE(review): presumably C1 bounds the first (lead) byte and C2 the second
# (trail) byte of a double-byte code, both inclusive -- confirm against
# genmap_support.
JISX0208_C1 = (0x21, 0x74)
JISX0208_C2 = (0x21, 0x7e)
JISX0212_C1 = (0x22, 0x6d)
JISX0212_C2 = (0x21, 0x7e)
JISX0213_C1 = (0x21, 0x7e)
JISX0213_C2 = (0x21, 0x7e)
# The three CP932 ranges below are all applied to the same "cp932ext"
# decode map writer in main().
CP932P0_C1 = (0x81, 0x81) # patches between shift-jis and cp932
CP932P0_C2 = (0x5f, 0xca)
CP932P1_C1 = (0x87, 0x87) # CP932 P1
CP932P1_C2 = (0x40, 0x9c)
CP932P2_C1 = (0xed, 0xfc) # CP932 P2
CP932P2_C2 = (0x40, 0xfc)

# Source URLs of the mapping tables.  main() passes each one to
# open_mapping_file() together with a local path under python-mappings/.
# NOTE(review): presumably the URL is only fetched when the local copy is
# missing -- confirm against genmap_support.open_mapping_file.
MAPPINGS_JIS0208 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
MAPPINGS_JIS0212 = 'http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT'
MAPPINGS_CP932 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT'
MAPPINGS_JISX0213_2004 = 'http://wakaba-web.hp.infoseek.co.jp/table/jisx0213-2004-std.txt'
`
``
28
+
``
29
+
``
30
`+
def loadmap_jisx0213(fo):
    """Parse a JIS X 0213 mapping table into per-level decode maps.

    ``#`` comments are stripped from every line, and lines with fewer than
    two whitespace-separated columns are skipped.  The first column has the
    form ``<level>-<hex code>`` (for example ``3-2121``); the second is
    either ``U+<hex>`` for a single code point or ``U+<4 hex>+<4 hex>`` for
    a body character followed by a modifier.

    Args:
        fo: Iterable of text lines, e.g. an open text file.

    Returns:
        A 5-tuple ``(decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair)``.
        The first four are ``{code >> 8: {code & 0xff: unicode}}`` dicts:
        ``decmap3``/``decmap4`` hold level 3/4 entries below U+10000, and
        ``decmap3_2``/``decmap4_2`` hold level 3/4 entries in
        U+20000..U+2FFFF with 0x20000 subtracted.  ``decmap3_pair`` is keyed
        by the body code point and maps to
        ``{code >> 8: {code & 0xff: modifier}}``.

    Raises:
        ValueError: If an entry has a level other than 3 or 4, a single code
            point outside the two supported ranges, a pair on a level other
            than 3, or a field that is not a valid number.
    """
    decmap3, decmap4 = {}, {}  # maps to BMP for level 3 and 4
    decmap3_2, decmap4_2 = {}, {}  # maps to U+2xxxx for level 3 and 4
    decmap3_pair = {}  # maps to BMP-pair for level 3
    bmp_maps = {3: decmap3, 4: decmap4}
    plane2_maps = {3: decmap3_2, 4: decmap4_2}

    for line in fo:
        row = line.split('#', 1)[0].split()
        if len(row) < 2:
            continue

        # The table is downloaded data: parse the numbers with int() rather
        # than handing file contents to eval().
        loc = int(row[0][2:], 16)
        level = int(row[0][0])

        if len(row[1].split('+')) == 2:  # single unicode
            uni = int(row[1][2:], 16)
            if uni < 0x10000:
                m = bmp_maps.get(level)
            elif 0x20000 <= uni < 0x30000:
                uni -= 0x20000
                m = plane2_maps.get(level)
            else:
                m = None
        else:  # pair
            uniprefix = int(row[1][2:6], 16)  # body
            uni = int(row[1][7:11], 16)  # modifier
            if level != 3:
                raise ValueError("invalid map")
            m = decmap3_pair.setdefault(uniprefix, {})

        # Validate before touching the map so an unmappable entry is
        # reported as ValueError instead of failing on ``None.setdefault``.
        if m is None:
            raise ValueError("invalid map")
        m.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap3, decmap4, decmap3_2, decmap4_2, decmap3_pair
`
``
73
+
``
74
+
``
75
`+
def main():
    """Generate the Japanese codec mapping headers.

    Loads the JIS X 0208, JIS X 0212, CP932 and JIS X 0213 mapping tables,
    derives the encode maps from the decode maps, and writes two C headers
    into the current working directory: ``mappings_jp.h`` and
    ``mappings_jisx0213_pair.h``.  Progress messages go to stdout.

    Raises:
        SystemExit: If the JIS X 0213 table does not map 0x2124 to U+FF0C.
        ValueError: If JIS X 0213 plane 1 disagrees with JIS X 0208 on a
            shared code, or the JIS X 0213 table is malformed.
    """
    # NOTE(review): the objects returned by open_mapping_file() are never
    # closed here; presumably they are ordinary file objects -- confirm in
    # genmap_support and consider closing them.
    jisx0208file = open_mapping_file('python-mappings/JIS0208.TXT', MAPPINGS_JIS0208)
    jisx0212file = open_mapping_file('python-mappings/JIS0212.TXT', MAPPINGS_JIS0212)
    cp932file = open_mapping_file('python-mappings/CP932.TXT', MAPPINGS_CP932)
    jisx0213file = open_mapping_file('python-mappings/jisx0213-2004-std.txt', MAPPINGS_JISX0213_2004)

    print("Loading Mapping File...")

    # NOTE(review): jisx0208file is read twice with different columns;
    # presumably loadmap() rewinds the file first -- confirm in
    # genmap_support.
    sjisdecmap = loadmap(jisx0208file, natcol=0, unicol=2)
    jisx0208decmap = loadmap(jisx0208file, natcol=1, unicol=2)
    jisx0212decmap = loadmap(jisx0212file)
    cp932decmap = loadmap(cp932file)
    jis3decmap, jis4decmap, jis3_2_decmap, jis4_2_decmap, jis3_pairdecmap = loadmap_jisx0213(jisx0213file)

    # Sanity check on the JIS X 0213 table before anything is generated.
    if jis3decmap[0x21][0x24] != 0xff0c:
        raise SystemExit('Please adjust your JIS X 0213 map using jisx0213-2000-std.txt.diff')

    # Encode maps are the decode maps inverted: they are keyed by the high
    # and low byte of the Unicode code point and store the two-byte native
    # code as (c1 << 8 | c2).
    sjisencmap, cp932encmap = {}, {}
    jisx0208_0212encmap = {}
    for c1, m in sjisdecmap.items():
        for c2, code in m.items():
            sjisencmap.setdefault(code >> 8, {})
            sjisencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in cp932decmap.items():
        for c2, code in m.items():
            cp932encmap.setdefault(code >> 8, {})
            # The first native code seen for a code point wins.
            if (code & 0xff) not in cp932encmap[code >> 8]:
                cp932encmap[code >> 8][code & 0xff] = c1 << 8 | c2
    # Drop every CP932 encode entry that Shift-JIS encodes identically, so
    # only the differences remain.  Iterate over copies because entries are
    # deleted along the way.
    for c1, m in cp932encmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in sjisencmap and c2 in sjisencmap[c1] and sjisencmap[c1][c2] == code:
                del cp932encmap[c1][c2]
                if not cp932encmap[c1]:
                    del cp932encmap[c1]

    # Re-key the pair table by native code.  The decode value packs
    # body << 16 | modifier; the encode list holds (body, modifier, code).
    jisx0213pairdecmap = {}
    jisx0213pairencmap = []
    for unibody, m1 in jis3_pairdecmap.items():
        for c1, m2 in m1.items():
            for c2, modifier in m2.items():
                jisx0213pairencmap.append((unibody, modifier, c1 << 8 | c2))
                jisx0213pairdecmap.setdefault(c1, {})
                jisx0213pairdecmap[c1][c2] = unibody << 16 | modifier

    # Twinmap for both of JIS X 0208 (MSB unset) and JIS X 0212 (MSB set)
    for c1, m in jisx0208decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            jisx0208_0212encmap[code >> 8][code & 0xff] = c1 << 8 | c2

    for c1, m in jisx0212decmap.items():
        for c2, code in m.items():
            jisx0208_0212encmap.setdefault(code >> 8, {})
            # A code point present in both tables is reported and then
            # overwritten by the JIS X 0212 entry.
            if (code & 0xff) in jisx0208_0212encmap[code >> 8]:
                print("OOPS!!!", (code))
            jisx0208_0212encmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    # BMP encode map for JIS X 0213.  Plane 1 entries identical to
    # JIS X 0208 are removed from jis3decmap; code points that also start a
    # pair are stored as the marker (0,) and added to the pair list with
    # modifier 0.
    jisx0213bmpencmap = {}
    for c1, m in jis3decmap.copy().items():
        for c2, code in m.copy().items():
            if c1 in jisx0208decmap and c2 in jisx0208decmap[c1]:
                if code in jis3_pairdecmap:
                    # Create the row first, as the branch below does, so a
                    # not-yet-seen high byte cannot raise KeyError.
                    jisx0213bmpencmap.setdefault(code >> 8, {})
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))
                elif jisx0208decmap[c1][c2] == code:
                    del jis3decmap[c1][c2]
                    if not jis3decmap[c1]:
                        del jis3decmap[c1]
                else:
                    raise ValueError("Difference between JIS X 0208 and JIS X 0213 Plane 1 is found.")
            else:
                jisx0213bmpencmap.setdefault(code >> 8, {})
                if code not in jis3_pairdecmap:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = c1 << 8 | c2
                else:
                    jisx0213bmpencmap[code >> 8][code & 0xff] = (0,)  # pair
                    jisx0213pairencmap.append((code, 0, c1 << 8 | c2))

    # Plane 2 entries share the map and are flagged with 0x8000.
    for c1, m in jis4decmap.items():
        for c2, code in m.items():
            jisx0213bmpencmap.setdefault(code >> 8, {})
            jisx0213bmpencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    # Same layout for the U+2xxxx code points (already reduced by 0x20000).
    jisx0213empencmap = {}
    for c1, m in jis3_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = c1 << 8 | c2
    for c1, m in jis4_2_decmap.items():
        for c2, code in m.items():
            jisx0213empencmap.setdefault(code >> 8, {})
            jisx0213empencmap[code >> 8][code & 0xff] = 0x8000 | c1 << 8 | c2

    with open("mappings_jp.h", "w") as fp:
        print_autogen(fp, os.path.basename(__file__))
        print("Generating JIS X 0208 decode map...")
        writer = DecodeMapWriter(fp, "jisx0208", jisx0208decmap)
        writer.update_decode_map(JISX0208_C1, JISX0208_C2)
        writer.generate()

        print("Generating JIS X 0212 decode map...")
        writer = DecodeMapWriter(fp, "jisx0212", jisx0212decmap)
        writer.update_decode_map(JISX0212_C1, JISX0212_C2)
        writer.generate()

        print("Generating JIS X 0208 && JIS X 0212 encode map...")
        writer = EncodeMapWriter(fp, "jisxcommon", jisx0208_0212encmap)
        writer.generate()

        print("Generating CP932 Extension decode map...")
        writer = DecodeMapWriter(fp, "cp932ext", cp932decmap)
        writer.update_decode_map(CP932P0_C1, CP932P0_C2)
        writer.update_decode_map(CP932P1_C1, CP932P1_C2)
        writer.update_decode_map(CP932P2_C1, CP932P2_C2)
        writer.generate()

        print("Generating CP932 Extension encode map...")
        writer = EncodeMapWriter(fp, "cp932ext", cp932encmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_bmp", jis3decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 BMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_bmp", jis4decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 BMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_bmp", jisx0213bmpencmap)
        writer.generate()

        print("Generating JIS X 0213 Plane 1 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_1_emp", jis3_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 Plane 2 EMP decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_2_emp", jis4_2_decmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate()

        print("Generating JIS X 0213 EMP encode map...")
        writer = EncodeMapWriter(fp, "jisx0213_emp", jisx0213empencmap)
        writer.generate()

    with open('mappings_jisx0213_pair.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        fp.write(f"#define JISX0213_ENCPAIRS {len(jisx0213pairencmap)}\n")
        fp.write("""\
#ifdef EXTERN_JISX0213_PAIR
static const struct widedbcs_index *jisx0213_pair_decmap;
static const struct pair_encodemap *jisx0213_pair_encmap;
#else
""")

        print("Generating JIS X 0213 unicode-pair decode map...")
        writer = DecodeMapWriter(fp, "jisx0213_pair", jisx0213pairdecmap)
        writer.update_decode_map(JISX0213_C1, JISX0213_C2)
        writer.generate(wide=True)

        print("Generating JIS X 0213 unicode-pair encode map...")
        # Sorted by (body, modifier, code) before being written out.
        jisx0213pairencmap.sort()
        fp.write("static const struct pair_encodemap jisx0213_pair_encmap[JISX0213_ENCPAIRS] = {\n")
        filler = BufferedFiller()
        for body, modifier, jis in jisx0213pairencmap:
            filler.write('{', '0x%04x%04x,' % (body, modifier), '0x%04x' % jis, '},')
        filler.printout(fp)
        fp.write("};\n")
        fp.write("#endif\n")

    print("Done!")
`
``
249
+
``
250
`+
# Run the generator only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
`