2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
# We grab files off the web, left and right.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
# Optional local file with extra compose sequences, merged with the upstream
# Compose file when present in the current directory.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# keysym name -> unicode value; rebound later to the result of
# process_keysymstxt().
keysymunicodedatabase = {}
# Text emitted before the generated table: license header, provenance of the
# input files, and the historical comments carried over from the original
# gtkimcontextsimple.c implementation.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see see <http://www.gnu.org/licenses/>.
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
/* === These are the original comments of the file; we keep for historical purposes ===
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 * The following compose letter letter sequences confliced
 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 * ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
 * Omacron/Omacron and masculine; resolved to masculine  [ _O O_ o_ _o ]
 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 * spanish. atilde and otilde are used at least for Portuguese ]
 * at and Aring; resolved to Aring [ AA ]
 * guillemotleft and caron; resolved to guillemotleft [ << ]
 * ogonek and cedilla; resolved to cedilla [ ,, ]
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
static const guint16 gtk_compose_seqs_compact[] = {"""

# Text emitted after the generated table.
# NOTE(review): the closing triple-quote of this string is not visible in this
# chunk of the file.
headerfile_end = """};
#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """Convert a hexadecimal string (e.g. '0302') to its integer value.

    NOTE: the parameter name shadows the builtin `str`; it is kept for
    backward compatibility with any keyword callers.
    """
    # int() with an explicit base replaces string.atoi(), which was
    # deprecated and removed in Python 3; the result is identical.
    return int(str, 16)
    # Recursive step of factorial(n).
    # NOTE(review): the `def factorial(n):` header and the base case are not
    # visible in this chunk of the file.
    return n * factorial(n-1)
    """ Performs a uniq operation on a list or lists """
    # Accumulates all input lists into one, then keeps the first occurrence
    # of each element (order-preserving de-duplication).
    # NOTE(review): the enclosing `def` header, the loop over the argument
    # lists and the initialisation/return of the accumulator lists are not
    # visible in this chunk of the file.
    theInputList += theList
    for elem in theInputList:
        if elem not in theFinalList:
            theFinalList.append(elem)
def all_permutations(seq):
    """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
    """ Produces all permutations of the items of a list """
    # Generator: recursively permute the tail, then insert the head element
    # at every possible position of each tail permutation.
    # NOTE(review): the base case (yielding sequences of length <= 1) is not
    # visible in this chunk of the file; only the recursive branch appears.
    for perm in all_permutations(seq[1:]):
        for i in range(len(perm)+1):
            #nb str[0:1] works in both string and list contexts
            yield perm[:i] + seq[0:1] + perm[i:]
# Prints the command-line help text.
# NOTE(review): the enclosing function header and the closing triple-quote of
# this string are not visible in this chunk of the file.
print """compose-parse available parameters:
    -h, --help              this craft
    -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
    -a, --algorithmic       show sequences saved with algorithmic optimisation
    -g, --gtk               show entries that go to GTK+
    -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
    -v, --verbose           show verbose output
    -p, --plane1            show plane1 compose sequences
    -n, --numeric           when used with --gtk, create file with numeric values only
    -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

Default is to show statistics.
# --- Command-line handling ---------------------------------------------------
# NOTE(review): the `for (o, a) in opts:` loop header, several flag
# assignments (opt_gtk, opt_verbose, opt_plane1, opt_numeric) and the
# help/exit bodies are not visible in this chunk of the file.
opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
    "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])

# Output-mode flags; each defaults to False and is switched on below.
opt_statistics = False
opt_algorithmic = False
opt_unicodedatatxt = False
opt_gtkexpanded = False
if o in ("-h", "--help"):
if o in ("-s", "--statistics"):
    opt_statistics = True
if o in ("-a", "--algorithmic"):
    opt_algorithmic = True
if o in ("-g", "--gtk"):
if o in ("-u", "--unicodedatatxt"):
    opt_unicodedatatxt = True
if o in ("-v", "--verbose"):
if o in ("-p", "--plane1"):
if o in ("-n", "--numeric"):
if o in ("-e", "--gtk-expanded"):
    opt_gtkexpanded = True

# Default to statistics when no output mode was selected.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
    opt_statistics = True
def download_hook(blocks_transferred, block_size, file_size):
    """ A download hook to provide some feedback when downloading """
    # Progress callback handed to urlretrieve(); prints one '#' per block.
    if blocks_transferred == 0:
        # First call: announce the download.
        # NOTE(review): the branch structure choosing between the two
        # announcements (known vs unknown file_size) is not visible in this
        # chunk of the file.
        print "Downloading", file_size, "bytes: ",
        print "Downloading: ",
    sys.stdout.write('#')
def download_file(url):
    """ Downloads a file provided a URL. Returns the filename. """
    """ Borks on failure """
    # The download is cached in the current directory under the URL basename.
    localfilename = url.split('/')[-1]
    if not isfile(localfilename) or getsize(localfilename) <= 0:
        print "Downloading ", url, "..."
        # NOTE(review): the `try:` opening this handler, the exit statements
        # and the final `return localfilename` are not visible in this chunk
        # of the file.
        urlretrieve(url, localfilename, download_hook)
        except IOError, (errno, strerror):
            print "I/O error(%s): %s" % (errno, strerror)
            print "Unexpected error: ", sys.exc_info()[0]
        # (else-branch of the cache check in the original script)
        print "Using cached file for ", url
def process_gdkkeysymsh():
    """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
    """ Fills up keysymdb with contents """
    # NOTE(review): several lines of this function (the `try:`, the
    # continue/exit statements, the initialisation of keysymdb and the final
    # return) are not visible in this chunk of the file; the visible
    # statements are kept unchanged.
    filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        print "Unexpected error: ", sys.exc_info()[0]
    """ Parse the gdkkeysyms.h file and place contents in keysymdb """
    linenum_gdkkeysymsh = 0
    for line in gdkkeysymsh.readlines():
        linenum_gdkkeysymsh += 1
        # Only '#define GDK_KEY_...' lines are of interest.
        if line == "" or not match('^#define GDK_KEY_', line):
        components = split('\s+', line)
        if len(components) < 3:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting 3 items in the line"
        if not match('^GDK_KEY_', components[1]):
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a keysym starting with GDK_KEY_"
        if match('^0x[0-9a-fA-F]+$', components[2]):
            # Strip the '0x' prefix for the value and the 'GDK_KEY_' prefix
            # (8 characters) for the database key.
            unival = long(components[2][2:], 16)
            keysymdb[components[1][8:]] = unival
            # (else-branch in the original: malformed value)
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
            % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a hexadecimal number at the end of the line"
    """ Patch up the keysymdb with some of our own stuff """
    """ This is for a missing keysym from the currently upstream file """
    keysymdb['dead_stroke'] = 0x338
    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324
    """ This is^Wwas preferential treatment for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is^was preferential treatment for Greek """
    #keysymdb['combining_tilde'] = 0x342
    """ Fixing VoidSymbol """
    keysymdb['VoidSymbol'] = 0xFFFF
def process_keysymstxt():
    """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
    """ This file keeps a record between keysyms <-> unicode chars """
    # NOTE(review): several lines of this function (the `try:`, the
    # continue/exit statements, the initialisation of keysymdb and the final
    # return) are not visible in this chunk of the file; the visible
    # statements are kept unchanged.
    filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    keysymstxt = open(filename_keysymstxt, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        print "Unexpected error: ", sys.exc_info()[0]
    """ Parse the keysyms.txt file and place content in keysymdb """
    linenum_keysymstxt = 0
    for line in keysymstxt.readlines():
        linenum_keysymstxt += 1
        # Skip empty and comment lines.
        if line == "" or match('^#', line):
        components = split('\s+', line)
        if len(components) < 5:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
            % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
            print "Was expecting 5 items in the line"
        # Column 2 holds the 'Uxxxx' unicode value, column 5 the keysym name.
        if match('^U[0-9a-fA-F]+$', components[1]):
            unival = long(components[1][1:], 16)
            keysymdb[components[4]] = unival
    """ Patch up the keysymdb with some of our own stuff """
    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324
    """ This is preferential treatment for Greek """
    """ => we get more savings if used for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is preferential treatment for Greek """
    # keysymdb['combining_tilde'] = 0x342
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['dead_stroke'] = 0x338
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Oslash'] = 0x0d8
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Ssharp'] = 0x1e9e
    """ This is for a missing (recently added) keysym """
    keysymdb['dead_psili'] = 0x313
    """ This is for a missing (recently added) keysym """
    keysymdb['dead_dasia'] = 0x314
    """ Allows to import Multi_key sequences """
    keysymdb['Multi_key'] = 0xff20
    # Subscript digits and two dead keys that keysyms.txt does not carry.
    keysymdb['zerosubscript'] = 0x2080
    keysymdb['onesubscript'] = 0x2081
    keysymdb['twosubscript'] = 0x2082
    keysymdb['threesubscript'] = 0x2083
    keysymdb['foursubscript'] = 0x2084
    keysymdb['fivesubscript'] = 0x2085
    keysymdb['sixsubscript'] = 0x2086
    keysymdb['sevensubscript'] = 0x2087
    keysymdb['eightsubscript'] = 0x2088
    keysymdb['ninesubscript'] = 0x2089
    keysymdb['dead_doublegrave'] = 0x030F
    keysymdb['dead_invertedbreve'] = 0x0311
def keysymvalue(keysym, file = "n/a", linenum = 0):
    """ Extracts a value from the keysym """
    """ Find the value of keysym, using the data from keysyms """
    """ Use file and linenum to when reporting errors """
    # Resolution order: gdkkeysyms.h database, then 'U<hex>' notation, then
    # raw '0x<hex>' notation.
    # NOTE(review): the empty-keysym guard and the else/exit lines around the
    # final print are not visible in this chunk of the file.
    if keysymdatabase.has_key(keysym):
        return keysymdatabase[keysym]
    elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return atoi(keysym[1:], 16)
    elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return atoi(keysym[2:], 16)
    print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
    """ Extracts a value from the keysym """
    """ Find the value of keysym, using the data from keysyms """
    """ Use file and linenum to when reporting errors """
    # Same lookup chain as keysymvalue(), but against the keysyms.txt-derived
    # unicode database.
    # NOTE(review): the empty-keysym guard and the else/exit lines around the
    # final print are not visible in this chunk of the file.
    if keysymunicodedatabase.has_key(keysym):
        return keysymunicodedatabase[keysym]
    elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return atoi(keysym[1:], 16)
    elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return atoi(keysym[2:], 16)
    print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
def rename_combining(seq):
    """Map 'combining_*' keysym names in *seq* to their 'dead_*' equivalents.

    Also renames the two multi-word combining keysyms to the single-word
    dead-key names used elsewhere in this script. Returns a new list; the
    input list is left untouched.
    """
    filtered_sequence = []
    # The per-element loop header was lost in this copy of the file; without
    # it the two renaming conditionals never execute. Restored here.
    for ks in seq:
        if findall('^combining_', ks):
            ks = sub('^combining_', 'dead_', ks)
        if ks == 'dead_double_grave':
            ks = 'dead_doublegrave'
        if ks == 'dead_inverted_breve':
            ks = 'dead_invertedbreve'
        filtered_sequence.append(ks)
    return filtered_sequence
# Build the two keysym databases used throughout the script.
keysymunicodedatabase = process_keysymstxt()
keysymdatabase = process_gdkkeysymsh()

""" Grab and open the compose file from upstream """
filename_compose = download_file(URL_COMPOSE)
# NOTE(review): the `try:` lines opening the two error handlers below and the
# exit statements are not visible in this chunk of the file.
composefile = open(filename_compose, 'r')
except IOError, (errno, strerror):
    print "I/O error(%s): %s" % (errno, strerror)
    print "Unexpected error: ", sys.exc_info()[0]

""" Look if there is a lookaside (supplementary) compose file in the current
    directory, and if so, open, then merge with upstream Compose file.
"""
xorg_compose_sequences_raw = []
for seq in composefile.readlines():
    xorg_compose_sequences_raw.append(seq)
# The lookaside file is optional: a missing file is reported and ignored.
composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
for seq in composefile_lookaside.readlines():
    xorg_compose_sequences_raw.append(seq)
except IOError, (errno, strerror):
    print "I/O error(%s): %s" % (errno, strerror)
    print "Did not find lookaside compose file. Continuing..."
    print "Unexpected error: ", sys.exc_info()[0]

""" Parse the compose file in xorg_compose_sequences"""
xorg_compose_sequences = []
xorg_compose_sequences_algorithmic = []
# Nesting depth of /* ... */ comments while scanning the Compose file.
comment_nest_depth = 0
# Main parse loop: strip comments, split each Compose line into its keysym
# sequence and resulting character, resolve keysyms, and route each sequence
# either to the algorithmic list (when NFC normalisation can reproduce it) or
# to the explicit table (xorg_compose_sequences).
# NOTE(review): many structural lines of this loop (continue/exit statements,
# else branches, loop and variable initialisations, the bodies of the reject
# checks) are not visible in this chunk of the file; the visible statements
# are kept unchanged.
for line in xorg_compose_sequences_raw:
    # Skip comment lines in both Compose syntaxes.
    if match("^XCOMM", line) or match("^#", line):
    # Remove single-line C-style comments.
    line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
    # Track multi-line /* ... */ comments via comment_nest_depth.
    comment_start = line.find("/*")
    if comment_start >= 0:
        if comment_nest_depth == 0:
            line = line[:comment_start]
        comment_nest_depth += 1
    comment_end = line.find("*/")
    comment_nest_depth -= 1
    if comment_nest_depth < 0:
        print "Invalid comment %(linenum_compose)d in %(filename)s: \
Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
    if comment_nest_depth > 0:
    line = line[comment_end + 2:]
    # A valid data line looks like '<keysym sequence> : "<char>" <codepoint>'.
    components = split(':', line)
    if len(components) != 2:
        print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
    (seq, val ) = split(':', line)
    raw_sequence = findall('\w+', seq)
    values = split('\s+', val)
    unichar_temp = split('"', values[0])
    unichar = unichar_temp[1]
    codepointstr = values[1]
    # No codepoints that are >1 characters yet.
    # Normalise a leading 'U<hex>' keysym to '0x<hex>' form.
    if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
        raw_sequence[0] = '0x' + raw_sequence[0][1:]
    # Resolve the resulting codepoint from 'U<hex>' notation or the keysym db.
    if match('^U[0-9a-fA-F]+$', codepointstr):
        codepoint = long(codepointstr[1:], 16)
    elif keysymunicodedatabase.has_key(codepointstr):
        #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
        #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
        #print raw_sequence, codepointstr
        codepoint = keysymunicodedatabase[codepointstr]
    # (else-branch in the original: unresolvable codepoint)
    print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
%(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
    sequence = rename_combining(raw_sequence)
    # Reject sequences containing keysyms outside the 16-bit range or ones
    # that could not be resolved.
    if keysymvalue(i) > 0xFFFF:
    if keysymvalue(i) < 0:
    # Greek breathing/perispomeni marks are kept in the explicit table.
    if "U0342" in sequence or \
       "U0313" in sequence or \
       "U0314" in sequence or \
       "0x0313" in sequence or \
       "0x0342" in sequence or \
       "0x0314" in sequence:
    if "dead_belowring" in sequence or\
       "dead_currency" in sequence or\
       "dead_belowcomma" in sequence or\
       "dead_belowmacron" in sequence or\
       "dead_belowtilde" in sequence or\
       "dead_belowbreve" in sequence or\
       "dead_belowdiaeresis" in sequence or\
       "dead_belowcircumflex" in sequence:
    #for i in range(len(sequence)):
    #    if sequence[i] == "0x0342":
    #        sequence[i] = "dead_tilde"
    if "Multi_key" not in sequence:
        """ Ignore for now >0xFFFF keysyms """
        if codepoint < 0xFFFF:
            original_sequence = copy(sequence)
            stats_sequence = copy(sequence)
            # The last keysym of the sequence is the base character.
            base = sequence.pop()
            basechar = keysymvalue(base, filename_compose, linenum_compose)
            if basechar < 0xFFFF:
                not_normalised = True
                skipping_this = False
                for i in range(0, len(sequence)):
                    """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
                    because of lack of dead_perispomeni (i.e. conflict)
                    """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                    if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                    if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                    if sequence[-1] == "dead_psili":
                        sequence[i] = "dead_horn"
                    if sequence[-1] == "dead_dasia":
                        sequence[-1] = "dead_ogonek"
                # Convert the remaining keysyms to unicode characters.
                unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
                # Try every permutation of the combining marks; if any NFC-
                # normalises with the base char to a single character, the
                # sequence can be produced algorithmically.
                for perm in all_permutations(unisequence):
                    # print counter, original_sequence, unichr(basechar) + "".join(perm)
                    # print counter, map(unichr, perm)
                    normalized = normalize('NFC', unichr(basechar) + "".join(perm))
                    if len(normalized) == 1:
                        # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
                        # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
                        # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
                        stats_sequence_data = map(keysymunicodevalue, stats_sequence)
                        stats_sequence_data.append(normalized)
                        xorg_compose_sequences_algorithmic.append(stats_sequence_data)
                        not_normalised = False
                # (when not normalisable, keep the sequence in the explicit table)
                original_sequence.append(codepoint)
                xorg_compose_sequences.append(original_sequence)
                """ print xorg_compose_sequences[-1] """
                print "Error in base char !?!"
            print "OVER", sequence
    # Multi_key sequences always go to the explicit table.
    sequence.append(codepoint)
    xorg_compose_sequences.append(sequence)
    """ print xorg_compose_sequences[-1] """
def sequence_cmp(x, y):
    # Comparator for sorting compose sequences: primary key is the first
    # keysym's value, then sequence length, then the remaining keysyms at
    # positions 1..4 in order.
    # NOTE(review): the `return 1` / `return -1` / `return 0` lines between
    # the conditions are not visible in this chunk of the file.
    if keysymvalue(x[0]) > keysymvalue(y[0]):
    elif keysymvalue(x[0]) < keysymvalue(y[0]):
    elif len(x) > len(y):
    elif len(x) < len(y):
    elif keysymvalue(x[1]) > keysymvalue(y[1]):
    elif keysymvalue(x[1]) < keysymvalue(y[1]):
    elif keysymvalue(x[2]) > keysymvalue(y[2]):
    elif keysymvalue(x[2]) < keysymvalue(y[2]):
    elif keysymvalue(x[3]) > keysymvalue(y[3]):
    elif keysymvalue(x[3]) < keysymvalue(y[3]):
    elif keysymvalue(x[4]) > keysymvalue(y[4]):
    elif keysymvalue(x[4]) < keysymvalue(y[4]):
def sequence_unicode_cmp(x, y):
    # Same ordering as sequence_cmp(), but comparing by unicode values from
    # keysymunicodevalue(); used when de-duplicating the sequence list.
    # NOTE(review): the `return 1` / `return -1` / `return 0` lines between
    # the conditions are not visible in this chunk of the file.
    if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
    elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
    elif len(x) > len(y):
    elif len(x) < len(y):
    elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
    elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
    elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
    elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
    elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
    elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
    elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
    elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
def sequence_algorithmic_cmp(x, y):
    # Comparator for the algorithmic sequence list: shorter sequences sort
    # first, then element-wise comparison.
    # NOTE(review): the length branches' return statements and the loop body
    # are not visible in this chunk of the file.
    elif len(x) > len(y):
    for i in range(len(x)):
# --- Sort, de-duplicate and gather statistics --------------------------------
# NOTE(review): several lines here (the de-duplication loop's first-iteration
# handling, counter initialisations such as counter_multikey/zeroes/
# num_entries/firstitem, and the `if opt_algorithmic:` guard) are not visible
# in this chunk of the file.
xorg_compose_sequences.sort(sequence_cmp)

# Keep only one entry per unicode-equivalent sequence.
xorg_compose_sequences_uniqued = []
for next_item in xorg_compose_sequences:
    if sequence_unicode_cmp(item, next_item) != 0:
        xorg_compose_sequences_uniqued.append(item)
xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)

# Count sequences that start with (or contain) Multi_key.
for item in xorg_compose_sequences:
    if findall('Multi_key', "".join(item[:-1])) != []:
        counter_multikey += 1

xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Count distinct first keysyms and the padding zeroes a flat 6-wide table
# would need (each row holds up to 5 keysyms plus the resulting codepoint).
num_first_keysyms = 0
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences:
    if keysymvalue(firstitem) != keysymvalue(sequence[0]):
        firstitem = sequence[0]
        num_first_keysyms += 1
    zeroes += 6 - len(sequence) + 1

# Count how many algorithmic sequences produce Greek characters.
for sequence in xorg_compose_sequences_algorithmic_uniqued:
    ch = ord(sequence[-1:][0])
    if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
        num_algorithmic_greek += 1

# Verbose dump of the algorithmic sequences (for --algorithmic).
for sequence in xorg_compose_sequences_algorithmic_uniqued:
    letter = "".join(sequence[-1:])
    print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
    for elem in sequence[:-2]:
        print "<0x%(keysym)04X>," % { 'keysym': elem },
    """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
    print "], recomposed as", letter.encode('utf-8'), "verified"
def num_of_keysyms(seq):
    """Return the number of keysyms in a compose sequence entry.

    Each entry in xorg_compose_sequences stores the keysyms followed by the
    resulting codepoint, so the keysym count is the entry length minus one.
    The function body was lost in this copy of the file; restored here.
    """
    return len(seq) - 1
def convert_UnotationToHex(arg):
    """Convert a 'UXXXX' keysym notation string to '0xXXXX' form.

    Strings not matching the U-plus-four-uppercase-hex-digits pattern, and
    non-string arguments (e.g. the integer codepoint at the end of a
    sequence entry), are returned unchanged.
    """
    if isinstance(arg, str):
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
            return sub('^U', '0x', arg)
    # Fall-through: the original fell off the end here (returning None for
    # every non-matching argument); return the argument untouched instead.
    return arg
def addprefix_GDK(arg):
    """Format one keysym for C output, with a trailing ', ' appended.

    Hex values ('0x...') pass through unchanged; named keysyms get the
    'GDK_KEY_' prefix.
    """
    if match('^0x', arg):
        return '%(arg)s, ' % { 'arg': arg }
    else:
        # The `else:` line was lost in this copy of the file, leaving the
        # second return unreachable; restored.
        return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
# --- Build and print the compact compose table (GTK+ output mode) ------------
# NOTE(review): the enclosing `if opt_gtk:` guard, several initialisations
# (compose_table, ct_second_part, counter, the outer while-loop header and
# its termination flag), the `try:` around the iterator advance and various
# if/else branch headers (opt_numeric, opt_gtkexpanded) are not visible in
# this chunk of the file; the visible statements are kept unchanged.
ct_sequence_width = 2
# First part of the table: one row per distinct first keysym, holding offsets
# (indexed by sequence length) into the second part.
start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
sequence_iterator = iter(xorg_compose_sequences)
sequence = sequence_iterator.next()
first_keysym = sequence[0]					# Set the first keysym
compose_table.append([first_keysym, 0, 0, 0, 0, 0])
while sequence[0] == first_keysym:
    # Count, per sequence length, the sequences sharing this first keysym.
    compose_table[counter][num_of_keysyms(sequence)-1] += 1
    sequence = sequence_iterator.next()
    except StopIteration:
# Convert the per-length counts into cumulative offsets into the second part.
ct_index = start_offset
for line_num in range(len(compose_table)):
    for i in range(WIDTHOFCOMPOSETABLE):
        occurences = compose_table[line_num][i+1]
        compose_table[line_num][i+1] = ct_index
        ct_index += occurences * (i+2)
# Second part: the sequences themselves (keysyms, then resulting codepoint).
for sequence in xorg_compose_sequences:
    ct_second_part.append(map(convert_UnotationToHex, sequence))
# Emit the generated header file.
print headerfile_start
for i in compose_table:
    print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
    print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
    elif not match('^0x', i[0]):
        print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
for i in ct_second_part:
    for ks in i[1:][:-1]:
        print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
    print '0x%(cp)04X, ' % { 'cp':i[-1] }
    print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
    print '0x%(cp)04X, ' % { 'cp':i[-1] }
    elif opt_gtkexpanded:
        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
def redecompose(codepoint):
    # Recursively expand a codepoint into its full decomposition, using the
    # data loaded into unicodedatabase by process_unicodedata_file().
    # NOTE(review): the body of the first if (returning the codepoint itself
    # when it has no decomposition) is not visible in this chunk of the file.
    (name, decomposition, combiningclass) = unicodedatabase[codepoint]
    if decomposition[0] == '' or decomposition[0] == '0':
    if match('<\w+>', decomposition[0]):
        # Compatibility decomposition: skip the leading '<tag>' marker.
        numdecomposition = map(stringtohex, decomposition[1:])
        return map(redecompose, numdecomposition)
    numdecomposition = map(stringtohex, decomposition)
    return map(redecompose, numdecomposition)
def process_unicodedata_file(verbose = False):
    """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
    # Downloads UnicodeData.txt, loads relevant fields into unicodedatabase,
    # then counts how many precomposed characters (and how many compose
    # sequence permutations) could be produced algorithmically via NFC
    # normalisation, printing per-character detail when verbose is True.
    # NOTE(review): numerous lines of this function (the `try:`, continue
    # statements, some counter initialisations, else branches and the
    # verbose guards) are not visible in this chunk of the file; the visible
    # statements are kept unchanged.
    filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
    unicodedatatxt = open(filename_unicodedatatxt, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        print "Unexpected error: ", sys.exc_info()[0]
    for line in unicodedatatxt.readlines():
        if line[0] == "" or line[0] == '#':
        uniproperties = split(';', line)
        codepoint = stringtohex(uniproperties[0])
        """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
        if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
        # Fields of interest: name (1), category (2), combining class (3)
        # and decomposition mapping (5).
        name = uniproperties[1]
        category = uniproperties[2]
        combiningclass = uniproperties[3]
        decomposition = uniproperties[5]
        unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
    counter_combinations = 0
    counter_combinations_greek = 0
    counter_entries_greek = 0
    for item in unicodedatabase.keys():
        (name, decomposition, combiningclass) = unicodedatabase[item]
        if decomposition[0] == '':
            print name, "is empty"
        elif match('<\w+>', decomposition[0]):
            print name, "has weird", decomposition[0]
            # (else-branch in the original: a canonical decomposition)
            sequence = map(stringtohex, decomposition)
            chrsequence = map(unichr, sequence)
            normalized = normalize('NFC', "".join(chrsequence))
            """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
            # Flatten the (possibly nested) recursive decomposition.
            decomposedsequence = []
            for subseq in map(redecompose, sequence):
                for seqitem in subseq:
                    if isinstance(seqitem, list):
                        if isinstance(i, list):
                            decomposedsequence.append(j)
                        decomposedsequence.append(i)
                    decomposedsequence.append(seqitem)
            recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
            # A single recomposed character from >1 parts means the compose
            # sequence can be produced algorithmically.
            if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
                counter_combinations += factorial(len(decomposedsequence)-1)
                if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
                    counter_entries_greek += 1
                    counter_combinations_greek += factorial(len(decomposedsequence)-1)
                print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
                for elem in decomposedsequence:
                    print '<0x%(hex)04X>,' % { 'hex': elem },
                print "], recomposed as", recomposedchar,
                if unichr(item) == recomposedchar:
    print "Unicode statistics from UnicodeData.txt"
    print "Number of entries that can be algorithmically produced     :", counter_entries
    print "  of which are for Greek                                   :", counter_entries_greek
    print "Number of compose sequence combinations requiring          :", counter_combinations
    print "  of which are for Greek                                   :", counter_combinations_greek
    print "Note: We do not include partial compositions, "
    print "thus the slight discrepancy in the figures"
# --- Final reporting ---------------------------------------------------------
# NOTE(review): the `if opt_statistics:` guard around the statistics prints
# and some blank-line prints are not visible in this chunk of the file.
if opt_unicodedatatxt:
    process_unicodedata_file(True)
# Overall statistics (default output mode).
print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
print "    of which have Multi_key                                :", counter_multikey
print "Algorithmic (stats for Xorg Compose file)"
print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
print "  of which are for Greek                                   :", num_algorithmic_greek
process_unicodedata_file()
print "Not algorithmic (stats from Xorg Compose file)"
print "Number of sequences                                        :", len(xorg_compose_sequences)
print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
print "Number of different first items                            :", num_first_keysyms
print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
print "Existing (old) implementation in GTK+"
print "Number of sequences in old gtkimcontextsimple.c            :", 691
print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"