2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
# Remote input files (fetched through download_file()) and table-layout constants.
23 # We grab files off the web, left and right.
24 URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
25 URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26 URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
27 URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
29 # We currently support keysyms of size 2; once upstream xorg gets sorted,
30 # we might produce some tables with size 2 and some with size 4.
33 # Current max compose sequence length; in case it gets increased.
# Also the number of per-length offset columns stored after each first keysym
# when the index part of the GTK+ table is built further down.
34 WIDTHOFCOMPOSETABLE = 5
# Keysym name -> value table; rebound further down to the result of process_keysymstxt().
37 keysymunicodedatabase = {}
# Text printed ahead of the generated table (see `print headerfile_start` further down):
# the C licence comment, the include guard, the historical notes and the opening of the
# gtk_compose_seqs_compact[] array. It is emitted verbatim, so it must not be edited casually.
40 headerfile_start = """/* GTK - The GIMP Tool Kit
41 * Copyright (C) 2007, 2008 GNOME Foundation
43 * This library is free software; you can redistribute it and/or
44 * modify it under the terms of the GNU Lesser General Public
45 * License as published by the Free Software Foundation; either
46 * version 2 of the License, or (at your option) any later version.
48 * This library is distributed in the hope that it will be useful,
49 * but WITHOUT ANY WARRANTY; without even the implied warranty of
50 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
51 * Lesser General Public License for more details.
53 * You should have received a copy of the GNU Lesser General Public
54 * License along with this library; if not, write to the
55 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
56 * Boston, MA 02111-1307, USA.
60 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
61 * using the input files
62 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
63 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
64 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
66 * This table is optimised for space and requires special handling to access the content.
67 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
69 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
70 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
74 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
75 * file for a list of people on the GTK+ Team. See the ChangeLog
76 * files for a list of changes. These files are distributed with
77 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
80 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
81 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
83 /* === These are the original comments of the file; we keep for historical purposes ===
85 * The following table was generated from the X compose tables include with
86 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
87 * to obtain the relevant perl scripts.
89 * The following compose letter letter sequences confliced
90 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
91 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
92 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
93 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
94 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
95 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
97 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
98 * spanish. atilde and otilde are used at least for Portuguese ]
100 * at and Aring; resolved to Aring [ AA ]
101 * guillemotleft and caron; resolved to guillemotleft [ << ]
102 * ogonek and cedilla; resolved to cedilla [ ,, ]
104 * This probably should be resolved by first checking an additional set of compose tables
105 * that depend on the locale or selected input method.
108 static const guint16 gtk_compose_seqs_compact[] = {"""
# Closing text for the generated header: terminates the C array and the include guard.
# NOTE(review): the closing triple-quote of this string is not visible in this view — confirm
# against the full file.
110 headerfile_end = """};
112 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """Parse a hexadecimal string (e.g. "00E9") and return its integer value.

    Raises ValueError when the text is not valid hexadecimal.
    """
    # The built-in int() replaces string.atoi, which has long been deprecated
    # and no longer exists in Python 3; both accept an optional "0x" prefix.
    # The parameter keeps its historical name for keyword-call compatibility.
    return int(str, 16)
def factorial(n):
    """Return n! for an integer n; any n <= 1 yields 1."""
    # NOTE(review): only the recursive step `n * factorial(n-1)` was visible
    # in this view; the header and the `n <= 1` base case are the conventional
    # ones and should be confirmed against the full file.
    # Computed iteratively so large arguments cannot hit the recursion limit.
    result = 1
    for k in range(2, n + 1):
        result *= k
    return result
def uniq(*args):
    """Perform a uniq operation on a list or lists.

    All the given lists are treated as one concatenated sequence; the result
    keeps the first occurrence of every element, in order.
    """
    # NOTE(review): the signature and the return statement were not visible in
    # this view; `*args` follows from the "list or lists" wording and from the
    # way each argument was appended to one input list — confirm.
    # Membership is tested with `in` on a list (not a set) because the
    # elements may themselves be lists and therefore unhashable.
    unique_items = []
    for one_list in args:
        for elem in one_list:
            if elem not in unique_items:
                unique_items.append(elem)
    return unique_items
def all_permutations(seq):
    """Produce all permutations of the items of a list (or string).

    Generator; each permutation has the same type as ``seq``.
    Borrowed from
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178
    """
    # NOTE(review): the base case below was not visible in this view; it is
    # the one used by the cited recipe — confirm against the full file.
    # A sequence of zero or one items has exactly one ordering: itself.
    if len(seq) <= 1:
        yield seq
        return
    # nb seq[0:1] works in both string and list contexts
    head = seq[0:1]
    for perm in all_permutations(seq[1:]):
        # Insert the first item at every position of each shorter permutation.
        for i in range(len(perm) + 1):
            yield perm[:i] + head + perm[i:]
# Help text listing the command-line switches accepted by the script.
# NOTE(review): the enclosing definition and the closing triple-quote of this string are not
# visible in this view — confirm against the full file.
148 print """compose-parse available parameters:
149 -h, --help this craft
150 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
151 -a, --algorithmic show sequences saved with algorithmic optimisation
152 -g, --gtk show entries that go to GTK+
153 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
154 -v, --verbose show verbose output
155 -p, --plane1 show plane1 compose sequences
156 -n, --numeric when used with --gtk, create file with numeric values only
157 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
159 Default is to show statistics.
# Command-line handling: parse the switches with getopt, then set one opt_* flag per switch.
# NOTE(review): several lines are not visible in this view (any error handling around getopt,
# the loop that binds `o`, the bodies of some branches and some flag initialisations such as
# opt_gtk) — confirm against the full file.
163 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
164 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
# Every mode flag starts switched off.
169 opt_statistics = False
170 opt_algorithmic = False
172 opt_unicodedatatxt = False
176 opt_gtkexpanded = False
# One test per switch, short and long spelling.
179 if o in ("-h", "--help"):
182 if o in ("-s", "--statistics"):
183 opt_statistics = True
184 if o in ("-a", "--algorithmic"):
185 opt_algorithmic = True
186 if o in ("-g", "--gtk"):
188 if o in ("-u", "--unicodedatatxt"):
189 opt_unicodedatatxt = True
190 if o in ("-v", "--verbose"):
192 if o in ("-p", "--plane1"):
194 if o in ("-n", "--numeric"):
196 if o in ("-e", "--gtk-expanded"):
197 opt_gtkexpanded = True
# With none of --algorithmic / --gtk / --unicodedatatxt selected, fall back to statistics.
199 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
200 opt_statistics = True
# Progress callback passed to urlretrieve() by download_file().
202 def download_hook(blocks_transferred, block_size, file_size):
203 """ A download hook to provide some feedback when downloading """
# The banner is only considered on the very first callback.
204 if blocks_transferred == 0:
# NOTE(review): the conditions choosing between the two banners below are not visible in
# this view — confirm against the full file.
207 print "Downloading", file_size, "bytes: ",
210 print "Downloading: ",
# One '#' is written as a progress tick.
211 sys.stdout.write('#')
# Fetch `url` into the current directory unless a non-empty local copy already exists.
# NOTE(review): the `try:` line, whatever follows each error message, the `else:` branch
# header and the return statement are not visible in this view; the docstring says the
# local filename is returned — confirm against the full file.
215 def download_file(url):
216 """ Downloads a file provided a URL. Returns the filename. """
217 """ Borks on failure """
# The local name is the last path component of the URL.
218 localfilename = url.split('/')[-1]
# A missing or zero-length file triggers a (re)download.
219 if not isfile(localfilename) or getsize(localfilename) <= 0:
221 print "Downloading ", url, "..."
223 urlretrieve(url, localfilename, download_hook)
224 except IOError, (errno, strerror):
225 print "I/O error(%s): %s" % (errno, strerror)
228 print "Unexpected error: ", sys.exc_info()[0]
233 print "Using cached file for ", url
# Download gdkkeysyms.h and collect its "#define GDK_<name> 0x<hex>" entries into keysymdb
# (name without the "GDK_" prefix -> integer value), then patch in a few extra keysyms.
# NOTE(review): the creation of keysymdb, the `try:`/`else:` lines, whatever follows each
# error message and the function's return are not visible in this view; the module-level
# statement `keysymdatabase = process_gdkkeysymsh()` suggests keysymdb is returned — confirm.
236 def process_gdkkeysymsh():
237 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
238 """ Fills up keysymdb with contents """
239 filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
241 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
242 except IOError, (errno, strerror):
243 print "I/O error(%s): %s" % (errno, strerror)
246 print "Unexpected error: ", sys.exc_info()[0]
249 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
# Line counter, used only in the diagnostics below.
250 linenum_gdkkeysymsh = 0
252 for line in gdkkeysymsh.readlines():
253 linenum_gdkkeysymsh += 1
# Only "#define GDK_..." lines are of interest.
255 if line == "" or not match('^#define GDK_', line):
# Expected fields: "#define", "GDK_<name>", "0x<hex>".
257 components = split('\s+', line)
258 if len(components) < 3:
259 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
260 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
261 print "Was expecting 3 items in the line"
263 if not match('^GDK_', components[1]):
264 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
265 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
266 print "Was expecting a keysym starting with GDK_"
# The value must be a 0x-prefixed hexadecimal number.
268 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
269 unival = atoi(components[2][2:], 16)
# components[1][4:] drops the leading "GDK_" from the keysym name.
272 keysymdb[components[1][4:]] = unival
274 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
275 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
276 print "Was expecting a hexadecimal number at the end of the line"
280 """ Patch up the keysymdb with some of our own stuff """
282 """ This is for a missing keysym from the currently upstream file """
283 keysymdb['dead_stroke'] = 0x338
285 """ This is for a missing keysym from the currently upstream file """
286 ###keysymdb['dead_belowring'] = 0x323
287 ###keysymdb['dead_belowmacron'] = 0x331
288 ###keysymdb['dead_belowcircumflex'] = 0x32d
289 ###keysymdb['dead_belowtilde'] = 0x330
290 ###keysymdb['dead_belowbreve'] = 0x32e
291 ###keysymdb['dead_belowdiaeresis'] = 0x324
293 """ This is^Wwas preferential treatment for Greek """
294 # keysymdb['dead_tilde'] = 0x342
295 """ This is^was preferential treatment for Greek """
296 #keysymdb['combining_tilde'] = 0x342
298 """ Fixing VoidSymbol """
299 keysymdb['VoidSymbol'] = 0xFFFF
# Download keysyms.txt and build keysymdb from it: for each data line, the fifth
# whitespace-separated field (keysym name) is mapped to the integer value of the second
# field ("U<hex>"). A few keysyms are then added by hand.
# NOTE(review): the creation of keysymdb, the `try:` line, whatever follows each error
# message and the function's return are not visible in this view; the module-level
# statement `keysymunicodedatabase = process_keysymstxt()` suggests keysymdb is returned —
# confirm against the full file.
303 def process_keysymstxt():
304 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
305 """ This file keeps a record between keysyms <-> unicode chars """
306 filename_keysymstxt = download_file(URL_KEYSYMSTXT)
308 keysymstxt = open(filename_keysymstxt, 'r')
309 except IOError, (errno, strerror):
310 print "I/O error(%s): %s" % (errno, strerror)
313 print "Unexpected error: ", sys.exc_info()[0]
316 """ Parse the keysyms.txt file and place content in keysymdb """
# Line counter, used only in the diagnostics below.
317 linenum_keysymstxt = 0
319 for line in keysymstxt.readlines():
320 linenum_keysymstxt += 1
# Empty lines and '#' comment lines carry no data.
322 if line == "" or match('^#', line):
324 components = split('\s+', line)
325 if len(components) < 5:
326 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
327 % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
328 print "Was expecting 5 items in the line"
# Second field is the Unicode value written as "U<hex>".
330 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
331 unival = atoi(components[1][1:], 16)
# Fifth field is the keysym name.
334 keysymdb[components[4]] = unival
337 """ Patch up the keysymdb with some of our own stuff """
338 """ This is for a missing keysym from the currently upstream file """
339 ###keysymdb['dead_belowring'] = 0x323
340 ###keysymdb['dead_belowmacron'] = 0x331
341 ###keysymdb['dead_belowcircumflex'] = 0x32d
342 ###keysymdb['dead_belowtilde'] = 0x330
343 ###keysymdb['dead_belowbreve'] = 0x32e
344 ###keysymdb['dead_belowdiaeresis'] = 0x324
346 """ This is preferential treatment for Greek """
347 """ => we get more savings if used for Greek """
348 # keysymdb['dead_tilde'] = 0x342
349 """ This is preferential treatment for Greek """
350 # keysymdb['combining_tilde'] = 0x342
352 """ This is for a missing keysym from Markus Kuhn's db """
353 keysymdb['dead_stroke'] = 0x338
354 """ This is for a missing keysym from Markus Kuhn's db """
355 keysymdb['Oslash'] = 0x0d8
357 """ This is for a missing (recently added) keysym """
358 keysymdb['dead_psili'] = 0x313
359 """ This is for a missing (recently added) keysym """
360 keysymdb['dead_dasia'] = 0x314
362 """ Allows to import Multi_key sequences """
363 keysymdb['Multi_key'] = 0xff20
# Resolve a keysym name to an integer using keysymdatabase (the gdkkeysyms.h table), or by
# decoding a literal "U<hex>" / "0x<hex>" name.
# NOTE(review): the handling of unknown names is not visible in this view; callers elsewhere
# in the file test the result against `< 0` — confirm what is returned in that case.
367 def keysymvalue(keysym, file = "n/a", linenum = 0):
368 """ Extracts a value from the keysym """
369 """ Find the value of keysym, using the data from keysyms """
370 """ Use file and linenum to when reporting errors """
# Known keysym names take priority over the literal notations.
373 if keysymdatabase.has_key(keysym):
374 return keysymdatabase[keysym]
375 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
376 return atoi(keysym[1:], 16)
377 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
378 return atoi(keysym[2:], 16)
380 #print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
# Same lookup as keysymvalue(), but against keysymunicodedatabase (the keysyms.txt table).
# An unknown name is reported with an "UNKNOWN{...}" message.
# NOTE(review): what happens after that message is not visible in this view — confirm.
384 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
385 """ Extracts a value from the keysym """
386 """ Find the value of keysym, using the data from keysyms """
387 """ Use file and linenum to when reporting errors """
# Known keysym names take priority over the literal notations.
390 if keysymunicodedatabase.has_key(keysym):
391 return keysymunicodedatabase[keysym]
392 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
393 return atoi(keysym[1:], 16)
394 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
395 return atoi(keysym[2:], 16)
397 print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
def rename_combining(seq):
    """Return a new list in which every 'combining_*' keysym is renamed 'dead_*'.

    Only a leading 'combining_' is replaced; all other names are copied
    unchanged. ``seq`` itself is not modified.
    """
    # NOTE(review): the loop header and the `else:` line were not visible in
    # this view; they are restored from the names the visible lines use.
    # Plain string methods do the job of the anchored '^combining_' regex.
    prefix = 'combining_'
    filtered_sequence = []
    for ks in seq:
        if ks.startswith(prefix):
            filtered_sequence.append('dead_' + ks[len(prefix):])
        else:
            filtered_sequence.append(ks)
    return filtered_sequence
# Main script body, part 1: load both keysym tables, then read the upstream Compose file and
# sort its entries into two lists:
#   xorg_compose_sequences             - entries kept for the generated table
#   xorg_compose_sequences_algorithmic - entries whose base character plus combining marks
#                                        NFC-normalise to a single character
# NOTE(review): many lines of this section are not visible in this view (the `try:` line,
# line counting/stripping, `continue`/exit statements, several `else:` branches and loop
# headers) — read it together with the full file before changing anything.
410 keysymunicodedatabase = process_keysymstxt()
411 keysymdatabase = process_gdkkeysymsh()
413 """ Grab and open the compose file from upstream """
414 filename_compose = download_file(URL_COMPOSE)
416 composefile = open(filename_compose, 'r')
417 except IOError, (errno, strerror):
418 print "I/O error(%s): %s" % (errno, strerror)
421 print "Unexpected error: ", sys.exc_info()[0]
424 """ Parse the compose file in xorg_compose_sequences"""
425 xorg_compose_sequences = []
426 xorg_compose_sequences_algorithmic = []
428 for line in composefile.readlines():
# Empty lines and XCOMM / '#' comment lines are tested for here.
431 if line is "" or match("^XCOMM", line) or match("^#", line):
# An entry has the form "<keysyms> : <value>"; anything else is reported as invalid.
435 components = split(':', line)
436 if len(components) != 2:
437 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
438 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
440 (seq, val ) = split(':', line)
# Left side: the keysym names. Right side: a quoted character followed by a code point name.
443 raw_sequence = findall('\w+', seq)
444 values = split('\s+', val)
445 unichar_temp = split('"', values[0])
446 unichar = unichar_temp[1]
449 codepointstr = values[1]
451 # No codepoints that are >1 characters yet.
# A first keysym written as "U<hex>" is rewritten to "0x<hex>".
453 if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
454 raw_sequence[0] = '0x' + raw_sequence[0][1:]
# The resulting code point is either a literal "U<hex>" or a name known to keysyms.txt.
455 if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
456 codepoint = atoi(codepointstr[1:], 16)
457 elif keysymunicodedatabase.has_key(codepointstr):
# Disagreements between the two keysym tables are reported.
458 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
459 print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
460 print raw_sequence, codepointstr
461 codepoint = keysymunicodedatabase[codepointstr]
464 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
465 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
# combining_* keysyms are handled as their dead_* counterparts from here on.
467 sequence = rename_combining(raw_sequence)
# Keysyms above 0xFFFF and keysyms with a negative (unresolved) value are singled out.
470 if keysymvalue(i) > 0xFFFF:
475 if keysymvalue(i) < 0:
# Sequences using the code points this script associates with dead_tilde for Greek (0x342),
# dead_psili (0x313) and dead_dasia (0x314) are singled out as well.
# NOTE(review): the action taken for the three tests above is not visible in this view.
480 if "U0342" in sequence or \
481 "U0313" in sequence or \
482 "U0314" in sequence or \
483 "0x0313" in sequence or \
484 "0x0342" in sequence or \
485 "0x0314" in sequence:
487 #for i in range(len(sequence)):
488 # if sequence[i] == "0x0342":
489 # sequence[i] = "dead_tilde"
490 if "Multi_key" not in sequence:
491 """ Ignore for now >0xFFFF keysyms """
492 if codepoint < 0xFFFF:
493 original_sequence = copy(sequence)
494 stats_sequence = copy(sequence)
495 base = sequence.pop()
496 basechar = keysymvalue(base, filename_compose, linenum_compose)
498 if basechar < 0xFFFF:
501 not_normalised = True
502 skipping_this = False
503 for i in range(0, len(sequence)):
504 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
505 because of lack of dead_perispomeni (i.e. conflict)
508 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
511 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
514 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
517 if sequence[-1] == "dead_psili":
518 sequence[i] = "dead_horn"
519 if sequence[-1] == "dead_dasia":
520 sequence[-1] = "dead_ogonek"
522 unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
# Every ordering of the combining characters is tried after the base character; an ordering
# that normalises (NFC) to one character is recorded in the algorithmic list.
526 for perm in all_permutations(unisequence):
527 # print counter, original_sequence, unichr(basechar) + "".join(perm)
528 # print counter, map(unichr, perm)
529 normalized = normalize('NFC', unichr(basechar) + "".join(perm))
530 if len(normalized) == 1:
531 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
532 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
533 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
534 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
535 stats_sequence_data.append(normalized)
536 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
537 not_normalised = False
# Stored form of a table entry: the keysym names followed by the resulting code point.
541 original_sequence.append(codepoint)
542 xorg_compose_sequences.append(original_sequence)
543 """ print xorg_compose_sequences[-1] """
546 print "Error in base char !?!"
549 print "OVER", sequence
552 sequence.append(codepoint)
553 xorg_compose_sequences.append(sequence)
554 """ print xorg_compose_sequences[-1] """
# cmp-style comparison passed to xorg_compose_sequences.sort(): entries are compared by the
# keysymvalue() of their first keysym, then by length, then position by position (x[1]..x[4]).
# NOTE(review): the return statements, and possibly guards between the positional
# comparisons, are not visible in this view — confirm against the full file.
556 def sequence_cmp(x, y):
557 if keysymvalue(x[0]) > keysymvalue(y[0]):
559 elif keysymvalue(x[0]) < keysymvalue(y[0]):
561 elif len(x) > len(y):
563 elif len(x) < len(y):
565 elif keysymvalue(x[1]) > keysymvalue(y[1]):
567 elif keysymvalue(x[1]) < keysymvalue(y[1]):
571 elif keysymvalue(x[2]) > keysymvalue(y[2]):
573 elif keysymvalue(x[2]) < keysymvalue(y[2]):
577 elif keysymvalue(x[3]) > keysymvalue(y[3]):
579 elif keysymvalue(x[3]) < keysymvalue(y[3]):
583 elif keysymvalue(x[4]) > keysymvalue(y[4]):
585 elif keysymvalue(x[4]) < keysymvalue(y[4]):
# Same comparison order as sequence_cmp(), but using keysymunicodevalue(). The de-duplication
# pass further down treats a non-zero result as "these two entries differ".
# NOTE(review): the return statements, and possibly guards between the positional
# comparisons, are not visible in this view — confirm against the full file.
590 def sequence_unicode_cmp(x, y):
591 if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
593 elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
595 elif len(x) > len(y):
597 elif len(x) < len(y):
599 elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
601 elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
605 elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
607 elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
611 elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
613 elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
617 elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
619 elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
# cmp-style comparison passed to xorg_compose_sequences_algorithmic.sort().
# NOTE(review): most of the body is not visible in this view; the visible lines compare the
# lengths of x and y and then walk the positions of x — confirm the details in the full file.
624 def sequence_algorithmic_cmp(x, y):
627 elif len(x) > len(y):
630 for i in range(len(x)):
# Main script body, part 2: sort and de-duplicate both sequence lists, gather statistics and
# print the algorithmic sequences.
# NOTE(review): several lines are not visible in this view (initialisation of `item`,
# `firstitem`, `zeroes`, `counter_multikey`, and the conditions guarding the printing loop).
638 xorg_compose_sequences.sort(sequence_cmp)
# Pass over the sorted list keeping an entry when it compares different from its neighbour.
640 xorg_compose_sequences_uniqued = []
643 for next_item in xorg_compose_sequences:
647 if sequence_unicode_cmp(item, next_item) != 0:
648 xorg_compose_sequences_uniqued.append(item)
651 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
# Count the entries whose keysyms (everything but the final code point) mention Multi_key.
654 for item in xorg_compose_sequences:
655 if findall('Multi_key', "".join(item[:-1])) != []:
656 counter_multikey += 1
658 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
659 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
662 num_first_keysyms = 0
665 num_algorithmic_greek = 0
# num_first_keysyms counts how many times the first keysym changes along the sorted list;
# zeroes accumulates `6 - len(sequence) + 1` per entry (reported later as the zero cells
# of a flat 6-column table).
666 for sequence in xorg_compose_sequences:
667 if keysymvalue(firstitem) != keysymvalue(sequence[0]):
668 firstitem = sequence[0]
669 num_first_keysyms += 1
670 zeroes += 6 - len(sequence) + 1
# The last element of an algorithmic entry is the composed character; the two ranges
# tested here are what the script counts as Greek (0x370-0x3ff and 0x1f00-0x1fff).
673 for sequence in xorg_compose_sequences_algorithmic_uniqued:
674 ch = ord(sequence[-1:][0])
675 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
676 num_algorithmic_greek += 1
# Listing of the algorithmic entries: code point, character, then the element just before
# the character (printed as the base) followed by the remaining elements.
680 for sequence in xorg_compose_sequences_algorithmic_uniqued:
681 letter = "".join(sequence[-1:])
682 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
683 for elem in sequence[:-2]:
684 print "<0x%(keysym)04X>," % { 'keysym': elem },
685 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
686 print "], recomposed as", letter, "verified"
# Helper used when building the GTK+ index table: its result selects the per-length counter
# column, as in compose_table[counter][num_of_keysyms(sequence)-1].
# NOTE(review): only the signature is visible in this view; the body must be taken from the
# full file.
688 def num_of_keysyms(seq):
def convert_UnotationToHex(arg):
    """Rewrite a 'UXXXX' keysym name as '0xXXXX'; return anything else unchanged.

    Only strings made of 'U' followed by exactly four upper-case hex digits
    are converted. Non-string values (such as the integer code point that
    ends every sequence) pass straight through.
    """
    # NOTE(review): the fall-through `return arg` was not visible in this
    # view; without it every non-matching element would become None when the
    # function is mapped over a sequence — confirm against the full file.
    if isinstance(arg, str) and match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
        # Same result as substituting the leading 'U' with '0x'.
        return '0x' + arg[1:]
    return arg
def addprefix_GDK(arg):
    """Format one keysym cell of the generated C table.

    Names starting with '0x' are emitted as they are; every other name gets
    the 'GDK_' prefix. Either way the result ends with ', '.
    """
    # NOTE(review): the `else:` line was not visible in this view; the two
    # returns are restored as the two branches of one test — confirm.
    # str.startswith expresses the anchored '^0x' test without a regex.
    if arg.startswith('0x'):
        return '%(arg)s, ' % {'arg': arg}
    return 'GDK_%(arg)s, ' % {'arg': arg}
# Main script body, part 3 (GTK+ output): build the two-part compact table and print it
# between headerfile_start and the closing text.
#   compose_table  - one row per distinct first keysym: [keysym, five per-length cells]
#   ct_second_part - the sequences themselves, with "UXXXX" names rewritten to "0xXXXX"
# NOTE(review): many lines are not visible in this view (the enclosing condition, the
# initialisation of compose_table / ct_second_part / counter, the outer loop, and the
# conditions selecting among the printing variants) — confirm against the full file.
708 ct_sequence_width = 2
# The sequence data starts after the index part: one row of WIDTHOFCOMPOSETABLE+1 cells per
# distinct first keysym.
709 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
713 sequence_iterator = iter(xorg_compose_sequences)
714 sequence = sequence_iterator.next()
716 first_keysym = sequence[0] # Set the first keysym
717 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
# While the first keysym stays the same, bump the counter cell chosen by the number of keysyms.
718 while sequence[0] == first_keysym:
719 compose_table[counter][num_of_keysyms(sequence)-1] += 1
721 sequence = sequence_iterator.next()
722 except StopIteration:
# Turn each per-length count into the running offset at which that group starts; a group
# advances the offset by count * (i+2) cells.
729 ct_index = start_offset
730 for line_num in range(len(compose_table)):
731 for i in range(WIDTHOFCOMPOSETABLE):
732 occurences = compose_table[line_num][i+1]
733 compose_table[line_num][i+1] = ct_index
734 ct_index += occurences * (i+2)
736 for sequence in xorg_compose_sequences:
737 ct_second_part.append(map(convert_UnotationToHex, sequence))
# Output: header text, then the index rows, then the sequence rows.
739 print headerfile_start
740 for i in compose_table:
742 print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
743 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
744 elif not match('^0x', i[0]):
745 print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
747 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
748 for i in ct_second_part:
750 for ks in i[1:][:-1]:
751 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
752 print '0x%(cp)04X, ' % { 'cp':i[-1] }
755 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
756 print '0x%(cp)04X, ' % { 'cp':i[-1] }
# The expanded variant prints every keysym of the row; the other variant drops the first one.
758 elif opt_gtkexpanded:
759 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
761 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
def redecompose(codepoint):
    """Recursively expand a code point through its decomposition field.

    Looks ``codepoint`` up in the module-level ``unicodedatabase`` (filled by
    process_unicodedata_file) and returns a list: ``[codepoint]`` when there
    is nothing to decompose, otherwise one nested result per component.
    Raises KeyError for a code point that is not in the table.
    """
    (name, decomposition, combiningclass) = unicodedatabase[codepoint]
    if decomposition[0] == '' or decomposition[0] == '0':
        # NOTE(review): this leaf return was not visible in this view. A
        # one-element list is assumed because the caller iterates over every
        # result — confirm against the full file.
        return [codepoint]
    # A leading "<tag>" marks a compatibility decomposition; skip the tag.
    if match(r'<\w+>', decomposition[0]):
        parts = decomposition[1:]
    else:
        parts = decomposition
    # A list comprehension yields a real list on any Python version, which
    # the caller's isinstance(..., list) checks depend on.
    return [redecompose(stringtohex(part)) for part in parts]
# Download UnicodeData.txt, load name / decomposition / combining class per code point into
# unicodedatabase, then find the characters whose full decomposition re-composes (NFC) to a
# single character and print statistics about them (a per-character listing is also printed).
# NOTE(review): many lines are not visible in this view (the `try:` line, `continue`
# statements, the `if verbose:` guards, counter initialisation/increments and the inner
# flattening loops) — confirm against the full file.
774 def process_unicodedata_file(verbose = False):
775 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
776 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
778 unicodedatatxt = open(filename_unicodedatatxt, 'r')
779 except IOError, (errno, strerror):
780 print "I/O error(%s): %s" % (errno, strerror)
783 print "Unexpected error: ", sys.exc_info()[0]
785 for line in unicodedatatxt.readlines():
786 if line[0] == "" or line[0] == '#':
# Fields are ';'-separated; the first one is the code point in hexadecimal.
789 uniproperties = split(';', line)
790 codepoint = stringtohex(uniproperties[0])
791 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
792 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
794 name = uniproperties[1]
795 category = uniproperties[2]
796 combiningclass = uniproperties[3]
797 decomposition = uniproperties[5]
# The decomposition field is stored split on whitespace.
798 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
800 counter_combinations = 0
801 counter_combinations_greek = 0
803 counter_entries_greek = 0
805 for item in unicodedatabase.keys():
806 (name, decomposition, combiningclass) = unicodedatabase[item]
# Entries without a decomposition, or with a "<tag>" decomposition, are only reported.
807 if decomposition[0] == '':
809 print name, "is empty"
810 elif match('<\w+>', decomposition[0]):
812 print name, "has weird", decomposition[0]
814 sequence = map(stringtohex, decomposition)
815 chrsequence = map(unichr, sequence)
816 normalized = normalize('NFC', "".join(chrsequence))
818 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
# Flatten the nested lists returned by redecompose() into decomposedsequence.
819 decomposedsequence = []
820 for subseq in map(redecompose, sequence):
821 for seqitem in subseq:
822 if isinstance(seqitem, list):
824 if isinstance(i, list):
826 decomposedsequence.append(j)
828 decomposedsequence.append(i)
830 decomposedsequence.append(seqitem)
831 recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
# Counted only when several code points re-compose to exactly one character; each such entry
# contributes factorial(len - 1) combinations.
832 if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
834 counter_combinations += factorial(len(decomposedsequence)-1)
# Same Greek ranges as used elsewhere in the script.
836 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
837 counter_entries_greek += 1
838 counter_combinations_greek += factorial(len(decomposedsequence)-1)
840 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
842 for elem in decomposedsequence:
843 print '<0x%(hex)04X>,' % { 'hex': elem },
844 print "], recomposed as", recomposedchar,
845 if unichr(item) == recomposedchar:
849 print "Unicode statistics from UnicodeData.txt"
850 print "Number of entries that can be algorithmically produced :", counter_entries
851 print " of which are for Greek :", counter_entries_greek
852 print "Number of compose sequence combinations requiring :", counter_combinations
853 print " of which are for Greek :", counter_combinations_greek
854 print "Note: We do not include partial compositions, "
855 print "thus the slight discrepancy in the figures"
# Main script body, part 4: optional UnicodeData.txt listing, then the statistics report.
# NOTE(review): the condition guarding the report and the definition of num_entries are not
# visible in this view — confirm against the full file.
858 if opt_unicodedatatxt:
859 process_unicodedata_file(True)
# Totals for the upstream Compose file, split into algorithmic and table entries.
863 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
864 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic)
865 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences)
866 print " of which have Multi_key :", counter_multikey
868 print "Algorithmic (stats for Xorg Compose file)"
869 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
870 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued)
871 print " of which are for Greek :", num_algorithmic_greek
# Prints the UnicodeData.txt statistics (called with the default verbose=False).
873 process_unicodedata_file()
# Size estimates assume a flat table of 6 two-byte integers per row.
874 print "Not algorithmic (stats from Xorg Compose file)"
875 print "Number of sequences :", len(xorg_compose_sequences)
876 print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
877 print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library"
878 print "Number of items in flat array :", len(xorg_compose_sequences) * 6
879 print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
880 print "Number of different first items :", num_first_keysyms
881 print "Number of max bytes (if using flat array) :", num_entries * 2 * 6
882 print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5
884 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
885 print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
# Hard-coded figures for the previous GTK+ implementation, for comparison.
887 print "Existing (old) implementation in GTK+"
888 print "Number of sequences in old gtkimcontextsimple.c :", 691
889 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"