2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
# We grab files off the web, left and right.
# NOTE(review): these are historical plain-http URLs; several of the hosts
# (gitweb.freedesktop.org, git.gnome.org) have been reorganised since this
# script was written — verify the URLs still resolve before re-running.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
# Optional local "lookaside" file with supplementary compose sequences; if it
# exists in the current directory it is merged with the upstream Compose file.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'
# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
# Current max compose sequence length; in case it gets increased.
# (Maximum number of keysyms per compose sequence in the compact table.)
WIDTHOFCOMPOSETABLE = 5
# Maps keysym name -> Unicode codepoint; populated later by process_keysymstxt().
keysymunicodedatabase = {}
# Boilerplate emitted verbatim at the top of the generated
# gtkimcontextsimpleseqs.h file, before the table rows.  Do not edit the
# string content casually: it is program output, reproduced byte-for-byte.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
 * file for a list of people on the GTK+ Team. See the ChangeLog
 * files for a list of changes. These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
/* === These are the original comments of the file; we keep for historical purposes ===
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 * The following compose letter letter sequences confliced
 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 * spanish. atilde and otilde are used at least for Portuguese ]
 * at and Aring; resolved to Aring [ AA ]
 * guillemotleft and caron; resolved to guillemotleft [ << ]
 * ogonek and cedilla; resolved to cedilla [ ,, ]
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
static const guint16 gtk_compose_seqs_compact[] = {"""
111 headerfile_end = """};
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """Convert a base-16 string (with or without a '0x' prefix) to an integer.

    Replaces string.atoi(str, 16): atoi has been deprecated since Python 2.0
    and was removed in Python 3; the builtin int(s, 16) is the exact drop-in.
    NOTE(review): the parameter shadows the builtin 'str'; kept as-is so any
    keyword callers remain compatible.
    """
    return int(str, 16)
122 return n * factorial(n-1)
125 """ Performs a uniq operation on a list or lists """
128 theInputList += theList
130 for elem in theInputList:
131 if elem not in theFinalList:
132 theFinalList.append(elem)
137 def all_permutations(seq):
138 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
139 """ Produces all permutations of the items of a list """
143 for perm in all_permutations(seq[1:]):
144 for i in range(len(perm)+1):
145 #nb str[0:1] works in both string and list contexts
146 yield perm[:i] + seq[0:1] + perm[i:]
149 print """compose-parse available parameters:
150 -h, --help this craft
151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
152 -a, --algorithmic show sequences saved with algorithmic optimisation
153 -g, --gtk show entries that go to GTK+
154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
155 -v, --verbose show verbose output
156 -p, --plane1 show plane1 compose sequences
157 -n, --numeric when used with --gtk, create file with numeric values only
158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
160 Default is to show statistics.
164 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
170 opt_statistics = False
171 opt_algorithmic = False
173 opt_unicodedatatxt = False
177 opt_gtkexpanded = False
180 if o in ("-h", "--help"):
183 if o in ("-s", "--statistics"):
184 opt_statistics = True
185 if o in ("-a", "--algorithmic"):
186 opt_algorithmic = True
187 if o in ("-g", "--gtk"):
189 if o in ("-u", "--unicodedatatxt"):
190 opt_unicodedatatxt = True
191 if o in ("-v", "--verbose"):
193 if o in ("-p", "--plane1"):
195 if o in ("-n", "--numeric"):
197 if o in ("-e", "--gtk-expanded"):
198 opt_gtkexpanded = True
200 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
201 opt_statistics = True
203 def download_hook(blocks_transferred, block_size, file_size):
204 """ A download hook to provide some feedback when downloading """
205 if blocks_transferred == 0:
208 print "Downloading", file_size, "bytes: ",
211 print "Downloading: ",
212 sys.stdout.write('#')
216 def download_file(url):
217 """ Downloads a file provided a URL. Returns the filename. """
218 """ Borks on failure """
219 localfilename = url.split('/')[-1]
220 if not isfile(localfilename) or getsize(localfilename) <= 0:
222 print "Downloading ", url, "..."
224 urlretrieve(url, localfilename, download_hook)
225 except IOError, (errno, strerror):
226 print "I/O error(%s): %s" % (errno, strerror)
229 print "Unexpected error: ", sys.exc_info()[0]
234 print "Using cached file for ", url
237 def process_gdkkeysymsh():
238 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
239 """ Fills up keysymdb with contents """
240 filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
242 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
243 except IOError, (errno, strerror):
244 print "I/O error(%s): %s" % (errno, strerror)
247 print "Unexpected error: ", sys.exc_info()[0]
250 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
251 linenum_gdkkeysymsh = 0
253 for line in gdkkeysymsh.readlines():
254 linenum_gdkkeysymsh += 1
256 if line == "" or not match('^#define GDK_KEY_', line):
258 components = split('\s+', line)
259 if len(components) < 3:
260 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
261 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
262 print "Was expecting 3 items in the line"
264 if not match('^GDK_KEY_', components[1]):
265 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
266 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
267 print "Was expecting a keysym starting with GDK_KEY_"
269 if match('^0x[0-9a-fA-F]+$', components[2]):
270 unival = long(components[2][2:], 16)
273 keysymdb[components[1][8:]] = unival
275 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
276 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
277 print "Was expecting a hexadecimal number at the end of the line"
281 """ Patch up the keysymdb with some of our own stuff """
283 """ This is for a missing keysym from the currently upstream file """
284 keysymdb['dead_stroke'] = 0x338
286 """ This is for a missing keysym from the currently upstream file """
287 ###keysymdb['dead_belowring'] = 0x323
288 ###keysymdb['dead_belowmacron'] = 0x331
289 ###keysymdb['dead_belowcircumflex'] = 0x32d
290 ###keysymdb['dead_belowtilde'] = 0x330
291 ###keysymdb['dead_belowbreve'] = 0x32e
292 ###keysymdb['dead_belowdiaeresis'] = 0x324
294 """ This is^Wwas preferential treatment for Greek """
295 # keysymdb['dead_tilde'] = 0x342
296 """ This is^was preferential treatment for Greek """
297 #keysymdb['combining_tilde'] = 0x342
299 """ Fixing VoidSymbol """
300 keysymdb['VoidSymbol'] = 0xFFFF
304 def process_keysymstxt():
305 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
306 """ This file keeps a record between keysyms <-> unicode chars """
307 filename_keysymstxt = download_file(URL_KEYSYMSTXT)
309 keysymstxt = open(filename_keysymstxt, 'r')
310 except IOError, (errno, strerror):
311 print "I/O error(%s): %s" % (errno, strerror)
314 print "Unexpected error: ", sys.exc_info()[0]
317 """ Parse the keysyms.txt file and place content in keysymdb """
318 linenum_keysymstxt = 0
320 for line in keysymstxt.readlines():
321 linenum_keysymstxt += 1
323 if line == "" or match('^#', line):
325 components = split('\s+', line)
326 if len(components) < 5:
327 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
328 % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
329 print "Was expecting 5 items in the line"
331 if match('^U[0-9a-fA-F]+$', components[1]):
332 unival = long(components[1][1:], 16)
335 keysymdb[components[4]] = unival
338 """ Patch up the keysymdb with some of our own stuff """
339 """ This is for a missing keysym from the currently upstream file """
340 ###keysymdb['dead_belowring'] = 0x323
341 ###keysymdb['dead_belowmacron'] = 0x331
342 ###keysymdb['dead_belowcircumflex'] = 0x32d
343 ###keysymdb['dead_belowtilde'] = 0x330
344 ###keysymdb['dead_belowbreve'] = 0x32e
345 ###keysymdb['dead_belowdiaeresis'] = 0x324
347 """ This is preferential treatment for Greek """
348 """ => we get more savings if used for Greek """
349 # keysymdb['dead_tilde'] = 0x342
350 """ This is preferential treatment for Greek """
351 # keysymdb['combining_tilde'] = 0x342
353 """ This is for a missing keysym from Markus Kuhn's db """
354 keysymdb['dead_stroke'] = 0x338
355 """ This is for a missing keysym from Markus Kuhn's db """
356 keysymdb['Oslash'] = 0x0d8
357 """ This is for a missing keysym from Markus Kuhn's db """
358 keysymdb['Ssharp'] = 0x1e9e
360 """ This is for a missing (recently added) keysym """
361 keysymdb['dead_psili'] = 0x313
362 """ This is for a missing (recently added) keysym """
363 keysymdb['dead_dasia'] = 0x314
365 """ Allows to import Multi_key sequences """
366 keysymdb['Multi_key'] = 0xff20
368 keysymdb['zerosubscript'] = 0x2080
369 keysymdb['onesubscript'] = 0x2081
370 keysymdb['twosubscript'] = 0x2082
371 keysymdb['threesubscript'] = 0x2083
372 keysymdb['foursubscript'] = 0x2084
373 keysymdb['fivesubscript'] = 0x2085
374 keysymdb['sixsubscript'] = 0x2086
375 keysymdb['sevensubscript'] = 0x2087
376 keysymdb['eightsubscript'] = 0x2088
377 keysymdb['ninesubscript'] = 0x2089
378 keysymdb['dead_doublegrave'] = 0x030F
379 keysymdb['dead_invertedbreve'] = 0x0311
383 def keysymvalue(keysym, file = "n/a", linenum = 0):
384 """ Extracts a value from the keysym """
385 """ Find the value of keysym, using the data from keysyms """
386 """ Use file and linenum to when reporting errors """
389 if keysymdatabase.has_key(keysym):
390 return keysymdatabase[keysym]
391 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
392 return atoi(keysym[1:], 16)
393 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
394 return atoi(keysym[2:], 16)
396 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
400 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
401 """ Extracts a value from the keysym """
402 """ Find the value of keysym, using the data from keysyms """
403 """ Use file and linenum to when reporting errors """
406 if keysymunicodedatabase.has_key(keysym):
407 return keysymunicodedatabase[keysym]
408 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
409 return atoi(keysym[1:], 16)
410 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
411 return atoi(keysym[2:], 16)
413 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
416 def rename_combining(seq):
417 filtered_sequence = []
419 if findall('^combining_', ks):
420 ks = sub('^combining_', 'dead_', ks)
421 if ks == 'dead_double_grave':
422 ks = 'dead_doublegrave'
423 if ks == 'dead_inverted_breve':
424 ks = 'dead_invertedbreve'
425 filtered_sequence.append(ks)
426 return filtered_sequence
429 keysymunicodedatabase = process_keysymstxt()
430 keysymdatabase = process_gdkkeysymsh()
432 """ Grab and open the compose file from upstream """
433 filename_compose = download_file(URL_COMPOSE)
435 composefile = open(filename_compose, 'r')
436 except IOError, (errno, strerror):
437 print "I/O error(%s): %s" % (errno, strerror)
440 print "Unexpected error: ", sys.exc_info()[0]
443 """ Look if there is a lookaside (supplementary) compose file in the current
444 directory, and if so, open, then merge with upstream Compose file.
446 xorg_compose_sequences_raw = []
447 for seq in composefile.readlines():
448 xorg_compose_sequences_raw.append(seq)
451 composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
452 for seq in composefile_lookaside.readlines():
453 xorg_compose_sequences_raw.append(seq)
454 except IOError, (errno, strerror):
456 print "I/O error(%s): %s" % (errno, strerror)
457 print "Did not find lookaside compose file. Continuing..."
459 print "Unexpected error: ", sys.exc_info()[0]
462 """ Parse the compose file in xorg_compose_sequences"""
463 xorg_compose_sequences = []
464 xorg_compose_sequences_algorithmic = []
466 comment_nest_depth = 0
467 for line in xorg_compose_sequences_raw:
470 if match("^XCOMM", line) or match("^#", line):
473 line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
475 comment_start = line.find("/*")
477 if comment_start >= 0:
478 if comment_nest_depth == 0:
479 line = line[:comment_start]
483 comment_nest_depth += 1
485 comment_end = line.find("*/")
488 comment_nest_depth -= 1
490 if comment_nest_depth < 0:
491 print "Invalid comment %(linenum_compose)d in %(filename)s: \
492 Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
495 if comment_nest_depth > 0:
498 line = line[comment_end + 2:]
504 components = split(':', line)
505 if len(components) != 2:
506 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
507 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
509 (seq, val ) = split(':', line)
512 raw_sequence = findall('\w+', seq)
513 values = split('\s+', val)
514 unichar_temp = split('"', values[0])
515 unichar = unichar_temp[1]
518 codepointstr = values[1]
520 # No codepoints that are >1 characters yet.
522 if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
523 raw_sequence[0] = '0x' + raw_sequence[0][1:]
524 if match('^U[0-9a-fA-F]+$', codepointstr):
525 codepoint = long(codepointstr[1:], 16)
526 elif keysymunicodedatabase.has_key(codepointstr):
527 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
528 #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
529 #print raw_sequence, codepointstr
530 codepoint = keysymunicodedatabase[codepointstr]
533 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
534 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
536 sequence = rename_combining(raw_sequence)
539 if keysymvalue(i) > 0xFFFF:
544 if keysymvalue(i) < 0:
549 if "U0342" in sequence or \
550 "U0313" in sequence or \
551 "U0314" in sequence or \
552 "0x0313" in sequence or \
553 "0x0342" in sequence or \
554 "0x0314" in sequence:
556 if "dead_belowring" in sequence or\
557 "dead_currency" in sequence or\
558 "dead_belowcomma" in sequence or\
559 "dead_belowmacron" in sequence or\
560 "dead_belowtilde" in sequence or\
561 "dead_belowbreve" in sequence or\
562 "dead_belowdiaeresis" in sequence or\
563 "dead_belowcircumflex" in sequence:
565 #for i in range(len(sequence)):
566 # if sequence[i] == "0x0342":
567 # sequence[i] = "dead_tilde"
568 if "Multi_key" not in sequence:
569 """ Ignore for now >0xFFFF keysyms """
570 if codepoint < 0xFFFF:
571 original_sequence = copy(sequence)
572 stats_sequence = copy(sequence)
573 base = sequence.pop()
574 basechar = keysymvalue(base, filename_compose, linenum_compose)
576 if basechar < 0xFFFF:
579 not_normalised = True
580 skipping_this = False
581 for i in range(0, len(sequence)):
582 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
583 because of lack of dead_perispomeni (i.e. conflict)
586 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
589 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
592 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
595 if sequence[-1] == "dead_psili":
596 sequence[i] = "dead_horn"
597 if sequence[-1] == "dead_dasia":
598 sequence[-1] = "dead_ogonek"
600 unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
604 for perm in all_permutations(unisequence):
605 # print counter, original_sequence, unichr(basechar) + "".join(perm)
606 # print counter, map(unichr, perm)
607 normalized = normalize('NFC', unichr(basechar) + "".join(perm))
608 if len(normalized) == 1:
609 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
610 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
611 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
612 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
613 stats_sequence_data.append(normalized)
614 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
615 not_normalised = False
619 original_sequence.append(codepoint)
620 xorg_compose_sequences.append(original_sequence)
621 """ print xorg_compose_sequences[-1] """
624 print "Error in base char !?!"
627 print "OVER", sequence
630 sequence.append(codepoint)
631 xorg_compose_sequences.append(sequence)
632 """ print xorg_compose_sequences[-1] """
634 def sequence_cmp(x, y):
635 if keysymvalue(x[0]) > keysymvalue(y[0]):
637 elif keysymvalue(x[0]) < keysymvalue(y[0]):
639 elif len(x) > len(y):
641 elif len(x) < len(y):
643 elif keysymvalue(x[1]) > keysymvalue(y[1]):
645 elif keysymvalue(x[1]) < keysymvalue(y[1]):
649 elif keysymvalue(x[2]) > keysymvalue(y[2]):
651 elif keysymvalue(x[2]) < keysymvalue(y[2]):
655 elif keysymvalue(x[3]) > keysymvalue(y[3]):
657 elif keysymvalue(x[3]) < keysymvalue(y[3]):
661 elif keysymvalue(x[4]) > keysymvalue(y[4]):
663 elif keysymvalue(x[4]) < keysymvalue(y[4]):
668 def sequence_unicode_cmp(x, y):
669 if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
671 elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
673 elif len(x) > len(y):
675 elif len(x) < len(y):
677 elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
679 elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
683 elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
685 elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
689 elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
691 elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
695 elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
697 elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
702 def sequence_algorithmic_cmp(x, y):
705 elif len(x) > len(y):
708 for i in range(len(x)):
716 xorg_compose_sequences.sort(sequence_cmp)
718 xorg_compose_sequences_uniqued = []
721 for next_item in xorg_compose_sequences:
725 if sequence_unicode_cmp(item, next_item) != 0:
726 xorg_compose_sequences_uniqued.append(item)
729 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
732 for item in xorg_compose_sequences:
733 if findall('Multi_key', "".join(item[:-1])) != []:
734 counter_multikey += 1
736 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
737 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
740 num_first_keysyms = 0
743 num_algorithmic_greek = 0
744 for sequence in xorg_compose_sequences:
745 if keysymvalue(firstitem) != keysymvalue(sequence[0]):
746 firstitem = sequence[0]
747 num_first_keysyms += 1
748 zeroes += 6 - len(sequence) + 1
751 for sequence in xorg_compose_sequences_algorithmic_uniqued:
752 ch = ord(sequence[-1:][0])
753 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
754 num_algorithmic_greek += 1
758 for sequence in xorg_compose_sequences_algorithmic_uniqued:
759 letter = "".join(sequence[-1:])
760 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
761 for elem in sequence[:-2]:
762 print "<0x%(keysym)04X>," % { 'keysym': elem },
763 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
764 print "], recomposed as", letter.encode('utf-8'), "verified"
766 def num_of_keysyms(seq):
769 def convert_UnotationToHex(arg):
770 if isinstance(arg, str):
771 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
772 return sub('^U', '0x', arg)
775 def addprefix_GDK(arg):
776 if match('^0x', arg):
777 return '%(arg)s, ' % { 'arg': arg }
779 return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
786 ct_sequence_width = 2
787 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
791 sequence_iterator = iter(xorg_compose_sequences)
792 sequence = sequence_iterator.next()
794 first_keysym = sequence[0] # Set the first keysym
795 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
796 while sequence[0] == first_keysym:
797 compose_table[counter][num_of_keysyms(sequence)-1] += 1
799 sequence = sequence_iterator.next()
800 except StopIteration:
807 ct_index = start_offset
808 for line_num in range(len(compose_table)):
809 for i in range(WIDTHOFCOMPOSETABLE):
810 occurences = compose_table[line_num][i+1]
811 compose_table[line_num][i+1] = ct_index
812 ct_index += occurences * (i+2)
814 for sequence in xorg_compose_sequences:
815 ct_second_part.append(map(convert_UnotationToHex, sequence))
817 print headerfile_start
818 for i in compose_table:
820 print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
821 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
822 elif not match('^0x', i[0]):
823 print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
825 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
826 for i in ct_second_part:
828 for ks in i[1:][:-1]:
829 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
830 print '0x%(cp)04X, ' % { 'cp':i[-1] }
833 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
834 print '0x%(cp)04X, ' % { 'cp':i[-1] }
836 elif opt_gtkexpanded:
837 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
839 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
842 def redecompose(codepoint):
843 (name, decomposition, combiningclass) = unicodedatabase[codepoint]
844 if decomposition[0] == '' or decomposition[0] == '0':
846 if match('<\w+>', decomposition[0]):
847 numdecomposition = map(stringtohex, decomposition[1:])
848 return map(redecompose, numdecomposition)
849 numdecomposition = map(stringtohex, decomposition)
850 return map(redecompose, numdecomposition)
852 def process_unicodedata_file(verbose = False):
853 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
854 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
856 unicodedatatxt = open(filename_unicodedatatxt, 'r')
857 except IOError, (errno, strerror):
858 print "I/O error(%s): %s" % (errno, strerror)
861 print "Unexpected error: ", sys.exc_info()[0]
863 for line in unicodedatatxt.readlines():
864 if line[0] == "" or line[0] == '#':
867 uniproperties = split(';', line)
868 codepoint = stringtohex(uniproperties[0])
869 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
870 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
872 name = uniproperties[1]
873 category = uniproperties[2]
874 combiningclass = uniproperties[3]
875 decomposition = uniproperties[5]
876 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
878 counter_combinations = 0
879 counter_combinations_greek = 0
881 counter_entries_greek = 0
883 for item in unicodedatabase.keys():
884 (name, decomposition, combiningclass) = unicodedatabase[item]
885 if decomposition[0] == '':
887 print name, "is empty"
888 elif match('<\w+>', decomposition[0]):
890 print name, "has weird", decomposition[0]
892 sequence = map(stringtohex, decomposition)
893 chrsequence = map(unichr, sequence)
894 normalized = normalize('NFC', "".join(chrsequence))
896 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
897 decomposedsequence = []
898 for subseq in map(redecompose, sequence):
899 for seqitem in subseq:
900 if isinstance(seqitem, list):
902 if isinstance(i, list):
904 decomposedsequence.append(j)
906 decomposedsequence.append(i)
908 decomposedsequence.append(seqitem)
909 recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
910 if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
912 counter_combinations += factorial(len(decomposedsequence)-1)
914 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
915 counter_entries_greek += 1
916 counter_combinations_greek += factorial(len(decomposedsequence)-1)
918 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
920 for elem in decomposedsequence:
921 print '<0x%(hex)04X>,' % { 'hex': elem },
922 print "], recomposed as", recomposedchar,
923 if unichr(item) == recomposedchar:
927 print "Unicode statistics from UnicodeData.txt"
928 print "Number of entries that can be algorithmically produced :", counter_entries
929 print " of which are for Greek :", counter_entries_greek
930 print "Number of compose sequence combinations requiring :", counter_combinations
931 print " of which are for Greek :", counter_combinations_greek
932 print "Note: We do not include partial compositions, "
933 print "thus the slight discrepancy in the figures"
936 if opt_unicodedatatxt:
937 process_unicodedata_file(True)
941 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
942 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic)
943 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences)
944 print " of which have Multi_key :", counter_multikey
946 print "Algorithmic (stats for Xorg Compose file)"
947 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
948 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued)
949 print " of which are for Greek :", num_algorithmic_greek
951 process_unicodedata_file()
952 print "Not algorithmic (stats from Xorg Compose file)"
953 print "Number of sequences :", len(xorg_compose_sequences)
954 print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
955 print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library"
956 print "Number of items in flat array :", len(xorg_compose_sequences) * 6
957 print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
958 print "Number of different first items :", num_first_keysyms
959 print "Number of max bytes (if using flat array) :", num_entries * 2 * 6
960 print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5
962 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
963 print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
965 print "Existing (old) implementation in GTK+"
966 print "Number of sequences in old gtkimcontextsimple.c :", 691
967 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"