2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
23 # We grab files off the web, left and right.
24 URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
25 URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26 URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
27 URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
28 FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'
30 # We currently support keysyms of size 2; once upstream xorg gets sorted,
31 # we might produce some tables with size 2 and some with size 4.
34 # Current max compose sequence length; in case it gets increased.
35 WIDTHOFCOMPOSETABLE = 5
38 keysymunicodedatabase = {}
41 headerfile_start = """/* GTK - The GIMP Tool Kit
42 * Copyright (C) 2007, 2008 GNOME Foundation
44 * This library is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU Lesser General Public
46 * License as published by the Free Software Foundation; either
47 * version 2 of the License, or (at your option) any later version.
49 * This library is distributed in the hope that it will be useful,
50 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 * Lesser General Public License for more details.
54 * You should have received a copy of the GNU Lesser General Public
55 * License along with this library; if not, write to the
56 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
57 * Boston, MA 02111-1307, USA.
61 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
62 * using the input files
63 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
64 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
65 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
67 * This table is optimised for space and requires special handling to access the content.
68 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
70 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
71 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
75 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
76 * file for a list of people on the GTK+ Team. See the ChangeLog
77 * files for a list of changes. These files are distributed with
78 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
81 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
84 /* === These are the original comments of the file; we keep for historical purposes ===
86 * The following table was generated from the X compose tables include with
87 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
88 * to obtain the relevant perl scripts.
90 * The following compose letter letter sequences confliced
91 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
92 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
93 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
94 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
95 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
96 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
98 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
99 * spanish. atilde and otilde are used at least for Portuguese ]
101 * at and Aring; resolved to Aring [ AA ]
102 * guillemotleft and caron; resolved to guillemotleft [ << ]
103 * ogonek and cedilla; resolved to cedilla [ ,, ]
105 * This probably should be resolved by first checking an additional set of compose tables
106 * that depend on the locale or selected input method.
109 static const guint16 gtk_compose_seqs_compact[] = {"""
111 headerfile_end = """};
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """Return the integer value of the hexadecimal string *str*.

    Uses the builtin int() instead of string.atoi(), which has been
    deprecated since Python 2.0 in favour of int() and is absent from
    Python 3.  The parameter keeps its original name (even though it
    shadows the builtin) so that existing callers are unaffected.

    Raises ValueError if *str* is not a valid base-16 number.
    """
    return int(str, 16)
122 return n * factorial(n-1)
125 """ Performs a uniq operation on a list or lists """
128 theInputList += theList
130 for elem in theInputList:
131 if elem not in theFinalList:
132 theFinalList.append(elem)
137 def all_permutations(seq):
138 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
139 """ Produces all permutations of the items of a list """
143 for perm in all_permutations(seq[1:]):
144 for i in range(len(perm)+1):
# NB: seq[0:1] works in both string and list contexts
146 yield perm[:i] + seq[0:1] + perm[i:]
149 print """compose-parse available parameters:
150 -h, --help this craft
151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
152 -a, --algorithmic show sequences saved with algorithmic optimisation
153 -g, --gtk show entries that go to GTK+
154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
155 -v, --verbose show verbose output
156 -p, --plane1 show plane1 compose sequences
157 -n, --numeric when used with --gtk, create file with numeric values only
158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
160 Default is to show statistics.
164 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
170 opt_statistics = False
171 opt_algorithmic = False
173 opt_unicodedatatxt = False
177 opt_gtkexpanded = False
180 if o in ("-h", "--help"):
183 if o in ("-s", "--statistics"):
184 opt_statistics = True
185 if o in ("-a", "--algorithmic"):
186 opt_algorithmic = True
187 if o in ("-g", "--gtk"):
189 if o in ("-u", "--unicodedatatxt"):
190 opt_unicodedatatxt = True
191 if o in ("-v", "--verbose"):
193 if o in ("-p", "--plane1"):
195 if o in ("-n", "--numeric"):
197 if o in ("-e", "--gtk-expanded"):
198 opt_gtkexpanded = True
200 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
201 opt_statistics = True
203 def download_hook(blocks_transferred, block_size, file_size):
204 """ A download hook to provide some feedback when downloading """
205 if blocks_transferred == 0:
208 print "Downloading", file_size, "bytes: ",
211 print "Downloading: ",
212 sys.stdout.write('#')
216 def download_file(url):
217 """ Downloads a file provided a URL. Returns the filename. """
218 """ Borks on failure """
219 localfilename = url.split('/')[-1]
220 if not isfile(localfilename) or getsize(localfilename) <= 0:
222 print "Downloading ", url, "..."
224 urlretrieve(url, localfilename, download_hook)
225 except IOError, (errno, strerror):
226 print "I/O error(%s): %s" % (errno, strerror)
229 print "Unexpected error: ", sys.exc_info()[0]
234 print "Using cached file for ", url
237 def process_gdkkeysymsh():
238 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
239 """ Fills up keysymdb with contents """
240 filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
242 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
243 except IOError, (errno, strerror):
244 print "I/O error(%s): %s" % (errno, strerror)
247 print "Unexpected error: ", sys.exc_info()[0]
250 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
251 linenum_gdkkeysymsh = 0
253 for line in gdkkeysymsh.readlines():
254 linenum_gdkkeysymsh += 1
256 if line == "" or not match('^#define GDK_', line):
258 components = split('\s+', line)
259 if len(components) < 3:
260 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
261 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
262 print "Was expecting 3 items in the line"
264 if not match('^GDK_', components[1]):
265 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
266 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
267 print "Was expecting a keysym starting with GDK_"
269 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
270 unival = atoi(components[2][2:], 16)
273 keysymdb[components[1][4:]] = unival
275 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
276 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
277 print "Was expecting a hexadecimal number at the end of the line"
281 """ Patch up the keysymdb with some of our own stuff """
283 """ This is for a missing keysym from the currently upstream file """
284 keysymdb['dead_stroke'] = 0x338
286 """ This is for a missing keysym from the currently upstream file """
287 ###keysymdb['dead_belowring'] = 0x323
288 ###keysymdb['dead_belowmacron'] = 0x331
289 ###keysymdb['dead_belowcircumflex'] = 0x32d
290 ###keysymdb['dead_belowtilde'] = 0x330
291 ###keysymdb['dead_belowbreve'] = 0x32e
292 ###keysymdb['dead_belowdiaeresis'] = 0x324
294 """ This is^Wwas preferential treatment for Greek """
295 # keysymdb['dead_tilde'] = 0x342
296 """ This is^was preferential treatment for Greek """
297 #keysymdb['combining_tilde'] = 0x342
299 """ Fixing VoidSymbol """
300 keysymdb['VoidSymbol'] = 0xFFFF
304 def process_keysymstxt():
305 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
306 """ This file keeps a record between keysyms <-> unicode chars """
307 filename_keysymstxt = download_file(URL_KEYSYMSTXT)
309 keysymstxt = open(filename_keysymstxt, 'r')
310 except IOError, (errno, strerror):
311 print "I/O error(%s): %s" % (errno, strerror)
314 print "Unexpected error: ", sys.exc_info()[0]
317 """ Parse the keysyms.txt file and place content in keysymdb """
318 linenum_keysymstxt = 0
320 for line in keysymstxt.readlines():
321 linenum_keysymstxt += 1
323 if line == "" or match('^#', line):
325 components = split('\s+', line)
326 if len(components) < 5:
327 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
328 % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
329 print "Was expecting 5 items in the line"
331 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
332 unival = atoi(components[1][1:], 16)
335 keysymdb[components[4]] = unival
338 """ Patch up the keysymdb with some of our own stuff """
339 """ This is for a missing keysym from the currently upstream file """
340 ###keysymdb['dead_belowring'] = 0x323
341 ###keysymdb['dead_belowmacron'] = 0x331
342 ###keysymdb['dead_belowcircumflex'] = 0x32d
343 ###keysymdb['dead_belowtilde'] = 0x330
344 ###keysymdb['dead_belowbreve'] = 0x32e
345 ###keysymdb['dead_belowdiaeresis'] = 0x324
347 """ This is preferential treatment for Greek """
348 """ => we get more savings if used for Greek """
349 # keysymdb['dead_tilde'] = 0x342
350 """ This is preferential treatment for Greek """
351 # keysymdb['combining_tilde'] = 0x342
353 """ This is for a missing keysym from Markus Kuhn's db """
354 keysymdb['dead_stroke'] = 0x338
355 """ This is for a missing keysym from Markus Kuhn's db """
356 keysymdb['Oslash'] = 0x0d8
358 """ This is for a missing (recently added) keysym """
359 keysymdb['dead_psili'] = 0x313
360 """ This is for a missing (recently added) keysym """
361 keysymdb['dead_dasia'] = 0x314
363 """ Allows to import Multi_key sequences """
364 keysymdb['Multi_key'] = 0xff20
366 keysymdb['zerosubscript'] = 0x2080
367 keysymdb['onesubscript'] = 0x2081
368 keysymdb['twosubscript'] = 0x2082
369 keysymdb['threesubscript'] = 0x2083
370 keysymdb['foursubscript'] = 0x2084
371 keysymdb['fivesubscript'] = 0x2085
372 keysymdb['sixsubscript'] = 0x2086
373 keysymdb['sevensubscript'] = 0x2087
374 keysymdb['eightsubscript'] = 0x2088
375 keysymdb['ninesubscript'] = 0x2089
379 def keysymvalue(keysym, file = "n/a", linenum = 0):
380 """ Extracts a value from the keysym """
381 """ Find the value of keysym, using the data from keysyms """
382 """ Use file and linenum to when reporting errors """
385 if keysymdatabase.has_key(keysym):
386 return keysymdatabase[keysym]
387 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
388 return atoi(keysym[1:], 16)
389 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
390 return atoi(keysym[2:], 16)
392 #print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
396 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
397 """ Extracts a value from the keysym """
398 """ Find the value of keysym, using the data from keysyms """
399 """ Use file and linenum to when reporting errors """
402 if keysymunicodedatabase.has_key(keysym):
403 return keysymunicodedatabase[keysym]
404 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
405 return atoi(keysym[1:], 16)
406 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
407 return atoi(keysym[2:], 16)
409 print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
412 def rename_combining(seq):
413 filtered_sequence = []
415 if findall('^combining_', ks):
416 filtered_sequence.append(sub('^combining_', 'dead_', ks))
418 filtered_sequence.append(ks)
419 return filtered_sequence
422 keysymunicodedatabase = process_keysymstxt()
423 keysymdatabase = process_gdkkeysymsh()
425 """ Grab and open the compose file from upstream """
426 filename_compose = download_file(URL_COMPOSE)
428 composefile = open(filename_compose, 'r')
429 except IOError, (errno, strerror):
430 print "I/O error(%s): %s" % (errno, strerror)
433 print "Unexpected error: ", sys.exc_info()[0]
436 """ Look if there is a lookaside (supplementary) compose file in the current
437 directory, and if so, open, then merge with upstream Compose file.
439 xorg_compose_sequences_raw = []
440 for seq in composefile.readlines():
441 xorg_compose_sequences_raw.append(seq)
444 composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
445 for seq in composefile_lookaside.readlines():
446 xorg_compose_sequences_raw.append(seq)
447 except IOError, (errno, strerror):
449 print "I/O error(%s): %s" % (errno, strerror)
450 print "Did not find lookaside compose file. Continuing..."
452 print "Unexpected error: ", sys.exc_info()[0]
455 """ Parse the compose file in xorg_compose_sequences"""
456 xorg_compose_sequences = []
457 xorg_compose_sequences_algorithmic = []
459 comment_nest_depth = 0
460 for line in xorg_compose_sequences_raw:
463 if match("^XCOMM", line) or match("^#", line):
466 line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
468 comment_start = line.find("/*")
470 if comment_start >= 0:
471 if comment_nest_depth == 0:
472 line = line[:comment_start]
476 comment_nest_depth += 1
478 comment_end = line.find("*/")
481 comment_nest_depth -= 1
483 if comment_nest_depth < 0:
484 print "Invalid comment %(linenum_compose)d in %(filename)s: \
485 Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
488 if comment_nest_depth > 0:
491 line = line[comment_end + 2:]
497 components = split(':', line)
498 if len(components) != 2:
499 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
500 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
502 (seq, val ) = split(':', line)
505 raw_sequence = findall('\w+', seq)
506 values = split('\s+', val)
507 unichar_temp = split('"', values[0])
508 unichar = unichar_temp[1]
511 codepointstr = values[1]
513 # No codepoints that are >1 characters yet.
515 if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
516 raw_sequence[0] = '0x' + raw_sequence[0][1:]
517 if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
518 codepoint = atoi(codepointstr[1:], 16)
519 elif keysymunicodedatabase.has_key(codepointstr):
520 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
521 print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
522 print raw_sequence, codepointstr
523 codepoint = keysymunicodedatabase[codepointstr]
526 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
527 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
529 sequence = rename_combining(raw_sequence)
532 if keysymvalue(i) > 0xFFFF:
537 if keysymvalue(i) < 0:
542 if "U0342" in sequence or \
543 "U0313" in sequence or \
544 "U0314" in sequence or \
545 "0x0313" in sequence or \
546 "0x0342" in sequence or \
547 "0x0314" in sequence:
549 if "dead_belowring" in sequence or\
550 "dead_belowcomma" in sequence or\
551 "dead_belowmacron" in sequence or\
552 "dead_belowtilde" in sequence or\
553 "dead_belowbreve" in sequence or\
554 "dead_belowdiaeresis" in sequence or\
555 "dead_belowcircumflex" in sequence:
557 #for i in range(len(sequence)):
558 # if sequence[i] == "0x0342":
559 # sequence[i] = "dead_tilde"
560 if "Multi_key" not in sequence:
561 """ Ignore for now >0xFFFF keysyms """
562 if codepoint < 0xFFFF:
563 original_sequence = copy(sequence)
564 stats_sequence = copy(sequence)
565 base = sequence.pop()
566 basechar = keysymvalue(base, filename_compose, linenum_compose)
568 if basechar < 0xFFFF:
571 not_normalised = True
572 skipping_this = False
573 for i in range(0, len(sequence)):
574 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
575 because of lack of dead_perispomeni (i.e. conflict)
578 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
581 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
584 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
587 if sequence[-1] == "dead_psili":
588 sequence[i] = "dead_horn"
589 if sequence[-1] == "dead_dasia":
590 sequence[-1] = "dead_ogonek"
592 unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
596 for perm in all_permutations(unisequence):
597 # print counter, original_sequence, unichr(basechar) + "".join(perm)
598 # print counter, map(unichr, perm)
599 normalized = normalize('NFC', unichr(basechar) + "".join(perm))
600 if len(normalized) == 1:
601 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
602 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
603 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
604 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
605 stats_sequence_data.append(normalized)
606 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
607 not_normalised = False
611 original_sequence.append(codepoint)
612 xorg_compose_sequences.append(original_sequence)
613 """ print xorg_compose_sequences[-1] """
616 print "Error in base char !?!"
619 print "OVER", sequence
622 sequence.append(codepoint)
623 xorg_compose_sequences.append(sequence)
624 """ print xorg_compose_sequences[-1] """
626 def sequence_cmp(x, y):
627 if keysymvalue(x[0]) > keysymvalue(y[0]):
629 elif keysymvalue(x[0]) < keysymvalue(y[0]):
631 elif len(x) > len(y):
633 elif len(x) < len(y):
635 elif keysymvalue(x[1]) > keysymvalue(y[1]):
637 elif keysymvalue(x[1]) < keysymvalue(y[1]):
641 elif keysymvalue(x[2]) > keysymvalue(y[2]):
643 elif keysymvalue(x[2]) < keysymvalue(y[2]):
647 elif keysymvalue(x[3]) > keysymvalue(y[3]):
649 elif keysymvalue(x[3]) < keysymvalue(y[3]):
653 elif keysymvalue(x[4]) > keysymvalue(y[4]):
655 elif keysymvalue(x[4]) < keysymvalue(y[4]):
660 def sequence_unicode_cmp(x, y):
661 if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
663 elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
665 elif len(x) > len(y):
667 elif len(x) < len(y):
669 elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
671 elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
675 elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
677 elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
681 elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
683 elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
687 elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
689 elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
694 def sequence_algorithmic_cmp(x, y):
697 elif len(x) > len(y):
700 for i in range(len(x)):
708 xorg_compose_sequences.sort(sequence_cmp)
710 xorg_compose_sequences_uniqued = []
713 for next_item in xorg_compose_sequences:
717 if sequence_unicode_cmp(item, next_item) != 0:
718 xorg_compose_sequences_uniqued.append(item)
721 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
724 for item in xorg_compose_sequences:
725 if findall('Multi_key', "".join(item[:-1])) != []:
726 counter_multikey += 1
728 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
729 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
732 num_first_keysyms = 0
735 num_algorithmic_greek = 0
736 for sequence in xorg_compose_sequences:
737 if keysymvalue(firstitem) != keysymvalue(sequence[0]):
738 firstitem = sequence[0]
739 num_first_keysyms += 1
740 zeroes += 6 - len(sequence) + 1
743 for sequence in xorg_compose_sequences_algorithmic_uniqued:
744 ch = ord(sequence[-1:][0])
745 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
746 num_algorithmic_greek += 1
750 for sequence in xorg_compose_sequences_algorithmic_uniqued:
751 letter = "".join(sequence[-1:])
752 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
753 for elem in sequence[:-2]:
754 print "<0x%(keysym)04X>," % { 'keysym': elem },
755 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
756 print "], recomposed as", letter, "verified"
758 def num_of_keysyms(seq):
761 def convert_UnotationToHex(arg):
762 if isinstance(arg, str):
763 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
764 return sub('^U', '0x', arg)
def addprefix_GDK(arg):
    """Format one keysym entry as an item of a C array initialiser.

    An entry that is already a hexadecimal literal (it starts with '0x')
    is emitted unchanged; any other entry is treated as a keysym name and
    gets the 'GDK_' prefix.  In both cases ', ' is appended so the
    returned pieces can be concatenated directly.

    *arg* must be a string.
    """
    # A literal prefix test does not need a regular expression.
    if arg.startswith('0x'):
        return arg + ', '
    return 'GDK_' + arg + ', '
778 ct_sequence_width = 2
779 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
783 sequence_iterator = iter(xorg_compose_sequences)
784 sequence = sequence_iterator.next()
786 first_keysym = sequence[0] # Set the first keysym
787 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
788 while sequence[0] == first_keysym:
789 compose_table[counter][num_of_keysyms(sequence)-1] += 1
791 sequence = sequence_iterator.next()
792 except StopIteration:
799 ct_index = start_offset
800 for line_num in range(len(compose_table)):
801 for i in range(WIDTHOFCOMPOSETABLE):
802 occurences = compose_table[line_num][i+1]
803 compose_table[line_num][i+1] = ct_index
804 ct_index += occurences * (i+2)
806 for sequence in xorg_compose_sequences:
807 ct_second_part.append(map(convert_UnotationToHex, sequence))
809 print headerfile_start
810 for i in compose_table:
812 print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
813 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
814 elif not match('^0x', i[0]):
815 print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
817 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
818 for i in ct_second_part:
820 for ks in i[1:][:-1]:
821 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
822 print '0x%(cp)04X, ' % { 'cp':i[-1] }
825 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
826 print '0x%(cp)04X, ' % { 'cp':i[-1] }
828 elif opt_gtkexpanded:
829 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
831 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
834 def redecompose(codepoint):
835 (name, decomposition, combiningclass) = unicodedatabase[codepoint]
836 if decomposition[0] == '' or decomposition[0] == '0':
838 if match('<\w+>', decomposition[0]):
839 numdecomposition = map(stringtohex, decomposition[1:])
840 return map(redecompose, numdecomposition)
841 numdecomposition = map(stringtohex, decomposition)
842 return map(redecompose, numdecomposition)
844 def process_unicodedata_file(verbose = False):
845 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
846 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
848 unicodedatatxt = open(filename_unicodedatatxt, 'r')
849 except IOError, (errno, strerror):
850 print "I/O error(%s): %s" % (errno, strerror)
853 print "Unexpected error: ", sys.exc_info()[0]
855 for line in unicodedatatxt.readlines():
856 if line[0] == "" or line[0] == '#':
859 uniproperties = split(';', line)
860 codepoint = stringtohex(uniproperties[0])
861 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
862 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
864 name = uniproperties[1]
865 category = uniproperties[2]
866 combiningclass = uniproperties[3]
867 decomposition = uniproperties[5]
868 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
870 counter_combinations = 0
871 counter_combinations_greek = 0
873 counter_entries_greek = 0
875 for item in unicodedatabase.keys():
876 (name, decomposition, combiningclass) = unicodedatabase[item]
877 if decomposition[0] == '':
879 print name, "is empty"
880 elif match('<\w+>', decomposition[0]):
882 print name, "has weird", decomposition[0]
884 sequence = map(stringtohex, decomposition)
885 chrsequence = map(unichr, sequence)
886 normalized = normalize('NFC', "".join(chrsequence))
888 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
889 decomposedsequence = []
890 for subseq in map(redecompose, sequence):
891 for seqitem in subseq:
892 if isinstance(seqitem, list):
894 if isinstance(i, list):
896 decomposedsequence.append(j)
898 decomposedsequence.append(i)
900 decomposedsequence.append(seqitem)
901 recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
902 if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
904 counter_combinations += factorial(len(decomposedsequence)-1)
906 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
907 counter_entries_greek += 1
908 counter_combinations_greek += factorial(len(decomposedsequence)-1)
910 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
912 for elem in decomposedsequence:
913 print '<0x%(hex)04X>,' % { 'hex': elem },
914 print "], recomposed as", recomposedchar,
915 if unichr(item) == recomposedchar:
919 print "Unicode statistics from UnicodeData.txt"
920 print "Number of entries that can be algorithmically produced :", counter_entries
921 print " of which are for Greek :", counter_entries_greek
922 print "Number of compose sequence combinations requiring :", counter_combinations
923 print " of which are for Greek :", counter_combinations_greek
924 print "Note: We do not include partial compositions, "
925 print "thus the slight discrepancy in the figures"
928 if opt_unicodedatatxt:
929 process_unicodedata_file(True)
933 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
934 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic)
935 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences)
936 print " of which have Multi_key :", counter_multikey
938 print "Algorithmic (stats for Xorg Compose file)"
939 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
940 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued)
941 print " of which are for Greek :", num_algorithmic_greek
943 process_unicodedata_file()
944 print "Not algorithmic (stats from Xorg Compose file)"
945 print "Number of sequences :", len(xorg_compose_sequences)
946 print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
947 print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library"
948 print "Number of items in flat array :", len(xorg_compose_sequences) * 6
949 print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
950 print "Number of different first items :", num_first_keysyms
951 print "Number of max bytes (if using flat array) :", num_entries * 2 * 6
952 print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5
954 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
955 print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
957 print "Existing (old) implementation in GTK+"
958 print "Number of sequences in old gtkimcontextsimple.c :", 691
959 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"