2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
# Remote inputs. download_file() caches each one in the current directory
# under the last '/'-separated component of its URL.
23 # We grab files off the web, left and right.
24 URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
25 URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26 URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
# NOTE(review): this pins the 5.0.0 release, while the generated header text
# (headerfile_start) advertises .../Public/UNIDATA/UnicodeData.txt -- confirm
# which one is intended.
27 URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
# Optional local compose file; its lines are appended after the upstream ones.
28 FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'
# NOTE(review): the statement the next comment describes is not visible here
# (listing numbers 32-33 are absent).
30 # We currently support keysyms of size 2; once upstream xorg gets sorted,
31 # we might produce some tables with size 2 and some with size 4.
34 # Current max compose sequence length; in case it gets increased.
# Used when sizing the index part of the compact table (one first-keysym cell
# plus WIDTHOFCOMPOSETABLE offset cells per row).
35 WIDTHOFCOMPOSETABLE = 5
# Placeholder; rebound to the result of process_keysymstxt() in the main script.
38 keysymunicodedatabase = {}
41 headerfile_start = """/* GTK - The GIMP Tool Kit
42 * Copyright (C) 2007, 2008 GNOME Foundation
44 * This library is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU Lesser General Public
46 * License as published by the Free Software Foundation; either
47 * version 2 of the License, or (at your option) any later version.
49 * This library is distributed in the hope that it will be useful,
50 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 * Lesser General Public License for more details.
54 * You should have received a copy of the GNU Lesser General Public
55 * License along with this library; if not, write to the
56 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
57 * Boston, MA 02111-1307, USA.
61 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
62 * using the input files
63 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
64 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
65 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
67 * This table is optimised for space and requires special handling to access the content.
68 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
70 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
71 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
75 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
76 * file for a list of people on the GTK+ Team. See the ChangeLog
77 * files for a list of changes. These files are distributed with
78 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
81 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
84 /* === These are the original comments of the file; we keep for historical purposes ===
86 * The following table was generated from the X compose tables include with
87 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
88 * to obtain the relevant perl scripts.
90 * The following compose letter letter sequences confliced
91 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
92 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
93 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
94 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
95 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
96 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
98 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
99 * spanish. atilde and otilde are used at least for Portuguese ]
101 * at and Aring; resolved to Aring [ AA ]
102 * guillemotleft and caron; resolved to guillemotleft [ << ]
103 * ogonek and cedilla; resolved to cedilla [ ,, ]
105 * This probably should be resolved by first checking an additional set of compose tables
106 * that depend on the locale or selected input method.
109 static const guint16 gtk_compose_seqs_compact[] = {"""
111 headerfile_end = """};
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """Parse a string of hexadecimal digits and return it as an integer.

    The parameter keeps its original name (which shadows the builtin) so
    that keyword callers are unaffected. `int(x, 16)` replaces the
    long-deprecated `string.atoi`, which no longer exists in Python 3.
    """
    return int(str, 16)
def factorial(n):
    """Return n! for an integer n; any n below 2 yields 1.

    NOTE(review): only the recursive `return n * factorial(n-1)` line of
    this helper survived in the listing; the def line and the base case are
    restored here. The product is computed iteratively so large inputs
    cannot hit the recursion limit.
    """
    result = 1
    for factor in range(2, n + 1):
        result *= factor
    return result
def uniq(*args):
    """Performs a uniq operation on a list or lists.

    All positional arguments are concatenated in order and the first
    occurrence of each element is kept. The inputs are not modified.

    NOTE(review): the def line, the accumulator initialisation and the
    return statement were missing from the listing and are restored here;
    the variadic signature follows from the body and from the call site,
    which passes one list of sequences.
    """
    theInputList = []
    for theList in args:
        theInputList += theList
    theFinalList = []
    for elem in theInputList:
        # Elements are themselves lists (unhashable), so a linear membership
        # test is used instead of a set.
        if elem not in theFinalList:
            theFinalList.append(elem)
    return theFinalList
def all_permutations(seq):
    """Produces all permutations of the items of a list (or string).

    Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

    This is a generator; each permutation has the same type as `seq`.

    NOTE(review): the base case was missing from the listing and is
    restored here; without it the recursion never terminates.
    """
    if len(seq) <= 1:
        yield seq
    else:
        for perm in all_permutations(seq[1:]):
            # Insert the first item at every position of each permutation
            # of the remaining items.
            for i in range(len(perm) + 1):
                # nb seq[0:1] works in both string and list contexts
                yield perm[:i] + seq[0:1] + perm[i:]
# Help text listing the command-line switches of this script.
# NOTE(review): the enclosing definition and the closing triple quote of this
# string are not visible here (listing numbers 148 and 161-163 are absent) --
# confirm against the pristine file.
149 print """compose-parse available parameters:
150 -h, --help this craft
151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
152 -a, --algorithmic show sequences saved with algorithmic optimisation
153 -g, --gtk show entries that go to GTK+
154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
155 -v, --verbose show verbose output
156 -p, --plane1 show plane1 compose sequences
157 -n, --numeric when used with --gtk, create file with numeric values only
158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
160 Default is to show statistics.
# Parse the command line into module-level opt_* flags.
# NOTE(review): several lines are absent from this view (listing numbers
# 165-169, 172, 174-176, 178-179, 181-182, 188, 192, 194, 196 are skipped):
# the loop header binding `o`, the initialisation of some flags, and the
# assignments for -h, -g, -v, -p and -n.
# NOTE(review): "stats" is accepted as a long option but no visible branch
# tests for "--stats" -- confirm whether it is meant to alias --statistics.
164 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
170 opt_statistics = False
171 opt_algorithmic = False
173 opt_unicodedatatxt = False
177 opt_gtkexpanded = False
180 if o in ("-h", "--help"):
183 if o in ("-s", "--statistics"):
184 opt_statistics = True
185 if o in ("-a", "--algorithmic"):
186 opt_algorithmic = True
187 if o in ("-g", "--gtk"):
189 if o in ("-u", "--unicodedatatxt"):
190 opt_unicodedatatxt = True
191 if o in ("-v", "--verbose"):
193 if o in ("-p", "--plane1"):
195 if o in ("-n", "--numeric"):
197 if o in ("-e", "--gtk-expanded"):
198 opt_gtkexpanded = True
# When none of algorithmic / gtk / unicodedatatxt was requested, fall back to
# showing statistics.
200 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
201 opt_statistics = True
# Progress callback handed to urlretrieve() by download_file().
# Parameters follow the order urlretrieve() passes them: number of blocks
# transferred so far, block size, total file size.
# NOTE(review): the conditions selecting between the two "Downloading"
# messages (listing numbers 206-207 and 209-210) and whatever followed the
# write (213-215) are missing from this view.
203 def download_hook(blocks_transferred, block_size, file_size):
204 """ A download hook to provide some feedback when downloading """
# Announce the download on the first callback only.
205 if blocks_transferred == 0:
208 print "Downloading", file_size, "bytes: ",
211 print "Downloading: ",
# Progress tick; presumably written on every callback -- indentation is lost,
# confirm.
212 sys.stdout.write('#')
# Fetch `url` into the current directory unless a non-empty local copy exists.
# NOTE(review): the `try:` line, the bare `except:` line, the statements that
# follow each error message, the `else:` before the cached-file message and
# the final return are missing from this view (listing numbers 221, 223,
# 227-228, 230-233, 235-236 are skipped). Callers use the return value as a
# file name to open.
216 def download_file(url):
217 """ Downloads a file provided a URL. Returns the filename. """
218 """ Borks on failure """
# The local file name is the last '/'-separated component of the URL.
219 localfilename = url.split('/')[-1]
# Download only when the file is absent or empty.
220 if not isfile(localfilename) or getsize(localfilename) <= 0:
222 print "Downloading ", url, "..."
224 urlretrieve(url, localfilename, download_hook)
225 except IOError, (errno, strerror):
226 print "I/O error(%s): %s" % (errno, strerror)
229 print "Unexpected error: ", sys.exc_info()[0]
234 print "Using cached file for ", url
# Build a keysym-name -> value table from GTK+'s gdkkeysyms.h.
# NOTE(review): the `try:` line, the bare `except:` line, the statements after
# each error message, the creation of `keysymdb`, the `continue`/`else` lines
# of the parse loop and the final return are missing from this view. The main
# script stores this function's result in `keysymdatabase`, so it presumably
# returns keysymdb -- confirm.
237 def process_gdkkeysymsh():
238 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
239 """ Fills up keysymdb with contents """
240 filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
242 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
243 except IOError, (errno, strerror):
244 print "I/O error(%s): %s" % (errno, strerror)
247 print "Unexpected error: ", sys.exc_info()[0]
250 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
251 linenum_gdkkeysymsh = 0
253 for line in gdkkeysymsh.readlines():
254 linenum_gdkkeysymsh += 1
# Only lines of the form "#define GDK_<name> <value>" are of interest.
256 if line == "" or not match('^#define GDK_', line):
258 components = split('\s+', line)
259 if len(components) < 3:
260 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
261 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
262 print "Was expecting 3 items in the line"
264 if not match('^GDK_', components[1]):
265 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
266 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
267 print "Was expecting a keysym starting with GDK_"
# The value must be written as 0x<hex digits>; it is stored under the macro
# name with its "GDK_" prefix (4 characters) removed.
269 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
270 unival = atoi(components[2][2:], 16)
273 keysymdb[components[1][4:]] = unival
275 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
276 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
277 print "Was expecting a hexadecimal number at the end of the line"
# Manual additions/overrides applied after parsing. Only dead_stroke and
# VoidSymbol are active; the remaining entries are commented out.
281 """ Patch up the keysymdb with some of our own stuff """
283 """ This is for a missing keysym from the currently upstream file """
284 keysymdb['dead_stroke'] = 0x338
286 """ This is for a missing keysym from the currently upstream file """
287 ###keysymdb['dead_belowring'] = 0x323
288 ###keysymdb['dead_belowmacron'] = 0x331
289 ###keysymdb['dead_belowcircumflex'] = 0x32d
290 ###keysymdb['dead_belowtilde'] = 0x330
291 ###keysymdb['dead_belowbreve'] = 0x32e
292 ###keysymdb['dead_belowdiaeresis'] = 0x324
294 """ This is^Wwas preferential treatment for Greek """
295 # keysymdb['dead_tilde'] = 0x342
296 """ This is^was preferential treatment for Greek """
297 #keysymdb['combining_tilde'] = 0x342
299 """ Fixing VoidSymbol """
300 keysymdb['VoidSymbol'] = 0xFFFF
# Build a keysym-name -> Unicode code point table from keysyms.txt.
# NOTE(review): the `try:` line, the bare `except:` line, the statements after
# each error message, the creation of `keysymdb`, the `continue`/`else` lines
# of the parse loop and the final return are missing from this view. The main
# script stores this function's result in `keysymunicodedatabase`, so it
# presumably returns keysymdb -- confirm.
304 def process_keysymstxt():
305 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
306 """ This file keeps a record between keysyms <-> unicode chars """
307 filename_keysymstxt = download_file(URL_KEYSYMSTXT)
309 keysymstxt = open(filename_keysymstxt, 'r')
310 except IOError, (errno, strerror):
311 print "I/O error(%s): %s" % (errno, strerror)
314 print "Unexpected error: ", sys.exc_info()[0]
317 """ Parse the keysyms.txt file and place content in keysymdb """
318 linenum_keysymstxt = 0
320 for line in keysymstxt.readlines():
321 linenum_keysymstxt += 1
# Lines starting with '#' are comments in keysyms.txt.
323 if line == "" or match('^#', line):
325 components = split('\s+', line)
326 if len(components) < 5:
327 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
328 % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
329 print "Was expecting 5 items in the line"
# The second field holds the code point in U<hex> notation and the fifth field
# the keysym name it is stored under.
331 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
332 unival = atoi(components[1][1:], 16)
335 keysymdb[components[4]] = unival
# Manual additions/overrides applied after parsing. The active ones are
# dead_stroke, Oslash, dead_psili, dead_dasia and Multi_key; the remaining
# entries are commented out.
338 """ Patch up the keysymdb with some of our own stuff """
339 """ This is for a missing keysym from the currently upstream file """
340 ###keysymdb['dead_belowring'] = 0x323
341 ###keysymdb['dead_belowmacron'] = 0x331
342 ###keysymdb['dead_belowcircumflex'] = 0x32d
343 ###keysymdb['dead_belowtilde'] = 0x330
344 ###keysymdb['dead_belowbreve'] = 0x32e
345 ###keysymdb['dead_belowdiaeresis'] = 0x324
347 """ This is preferential treatment for Greek """
348 """ => we get more savings if used for Greek """
349 # keysymdb['dead_tilde'] = 0x342
350 """ This is preferential treatment for Greek """
351 # keysymdb['combining_tilde'] = 0x342
353 """ This is for a missing keysym from Markus Kuhn's db """
354 keysymdb['dead_stroke'] = 0x338
355 """ This is for a missing keysym from Markus Kuhn's db """
356 keysymdb['Oslash'] = 0x0d8
358 """ This is for a missing (recently added) keysym """
359 keysymdb['dead_psili'] = 0x313
360 """ This is for a missing (recently added) keysym """
361 keysymdb['dead_dasia'] = 0x314
363 """ Allows to import Multi_key sequences """
364 keysymdb['Multi_key'] = 0xff20
# Resolve a keysym name to its numeric value using `keysymdatabase` (the table
# built from gdkkeysyms.h), falling back to U<hex> and 0x<hex> literals.
# `file` and `linenum` are not referenced in the visible lines.
# NOTE(review): listing numbers 372-373, 380 and 382-384 are absent, so any
# guard before the lookup and the behaviour for unknown names cannot be seen
# here. Callers compare the result with `< 0` and `> 0xFFFF`.
368 def keysymvalue(keysym, file = "n/a", linenum = 0):
369 """ Extracts a value from the keysym """
370 """ Find the value of keysym, using the data from keysyms """
371 """ Use file and linenum to when reporting errors """
374 if keysymdatabase.has_key(keysym):
375 return keysymdatabase[keysym]
# "U" followed only by hex digits: the digits are the value.
376 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
377 return atoi(keysym[1:], 16)
# "0x" followed only by hex digits: likewise.
378 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
379 return atoi(keysym[2:], 16)
381 #print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
# Resolve a keysym name to a Unicode code point using `keysymunicodedatabase`
# (the table built from keysyms.txt), falling back to U<hex> and 0x<hex>
# literals. `file` and `linenum` are not referenced in the visible lines.
# NOTE(review): listing numbers 389-390, 397 and 399-400 are absent, so any
# guard before the lookup and what follows the UNKNOWN message cannot be seen
# here.
385 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
386 """ Extracts a value from the keysym """
387 """ Find the value of keysym, using the data from keysyms """
388 """ Use file and linenum to when reporting errors """
391 if keysymunicodedatabase.has_key(keysym):
392 return keysymunicodedatabase[keysym]
393 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
394 return atoi(keysym[1:], 16)
395 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
396 return atoi(keysym[2:], 16)
# Unlike keysymvalue, the unknown-name diagnostic is active here.
398 print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
def rename_combining(seq):
    """Return a copy of `seq` with every `combining_*` name renamed to `dead_*`.

    Only a leading "combining_" prefix is rewritten; all other names are
    copied unchanged. The input sequence is not modified.

    NOTE(review): the loop header and the `else:` line were missing from
    the listing and are restored here.
    """
    old_prefix = 'combining_'
    filtered_sequence = []
    for ks in seq:
        if ks.startswith(old_prefix):
            filtered_sequence.append('dead_' + ks[len(old_prefix):])
        else:
            filtered_sequence.append(ks)
    return filtered_sequence
# ---- Main script: load both keysym tables, then read the compose file(s)
# and split the sequences into "algorithmic" ones (reproducible through
# Unicode NFC normalisation) and table entries (xorg_compose_sequences).
# NOTE(review): this listing has lost its indentation and many original lines
# (the embedded listing numbers skip, e.g. 416, 420-421, 449-450, 491-494);
# the comments below describe only what the visible lines show.
411 keysymunicodedatabase = process_keysymstxt()
412 keysymdatabase = process_gdkkeysymsh()
414 """ Grab and open the compose file from upstream """
415 filename_compose = download_file(URL_COMPOSE)
417 composefile = open(filename_compose, 'r')
418 except IOError, (errno, strerror):
419 print "I/O error(%s): %s" % (errno, strerror)
422 print "Unexpected error: ", sys.exc_info()[0]
425 """ Look if there is a lookaside (supplementary) compose file in the current
426 directory, and if so, open, then merge with upstream Compose file.
429 composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
430 except IOError, (errno, strerror):
432 print "I/O error(%s): %s" % (errno, strerror)
433 print "Did not find lookaside compose file. Continuing..."
435 print "Unexpected error: ", sys.exc_info()[0]
# Concatenate both inputs line by line: upstream first, lookaside second.
438 xorg_compose_sequences_raw = []
439 for seq in composefile.readlines():
440 xorg_compose_sequences_raw.append(seq)
441 for seq in composefile_lookaside.readlines():
442 xorg_compose_sequences_raw.append(seq)
444 """ Parse the compose file in xorg_compose_sequences"""
445 xorg_compose_sequences = []
446 xorg_compose_sequences_algorithmic = []
448 for line in xorg_compose_sequences_raw:
# Comment lines start with XCOMM or '#'.
# NOTE(review): `line is ""` is an identity test, not an equality test; `==`
# is the usual spelling -- confirm intent. The definition and increment of
# linenum_compose, used in the messages below, are not visible.
451 if line is "" or match("^XCOMM", line) or match("^#", line):
# A usable line must split on ':' into exactly two parts: key sequence, value.
455 components = split(':', line)
456 if len(components) != 2:
457 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
458 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
460 (seq, val ) = split(':', line)
# Key names are the \w+ runs of the left part. The right part is split on
# whitespace: values[0] carries the quoted result character and values[1]
# the result's keysym or U-notation name.
463 raw_sequence = findall('\w+', seq)
464 values = split('\s+', val)
465 unichar_temp = split('"', values[0])
466 unichar = unichar_temp[1]
469 codepointstr = values[1]
471 # No codepoints that are >1 characters yet.
# A first key written as U<hex> is rewritten to 0x<hex>.
473 if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
474 raw_sequence[0] = '0x' + raw_sequence[0][1:]
# Resolve the result code point: U-notation is parsed as hex, otherwise the
# name is looked up in keysymunicodedatabase.
475 if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
476 codepoint = atoi(codepointstr[1:], 16)
477 elif keysymunicodedatabase.has_key(codepointstr):
# NOTE(review): keysymdatabase is indexed here although only
# keysymunicodedatabase was checked for the key; a name absent from
# keysymdatabase would raise KeyError -- confirm.
478 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
479 print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
480 print raw_sequence, codepointstr
481 codepoint = keysymunicodedatabase[codepointstr]
484 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
485 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
# combining_* key names are mapped to their dead_* equivalents.
487 sequence = rename_combining(raw_sequence)
# NOTE(review): the loop header binding `i` and the bodies of the two range
# checks below are missing from this view, as are the statements guarded by
# the two membership tests that follow.
490 if keysymvalue(i) > 0xFFFF:
495 if keysymvalue(i) < 0:
500 if "U0342" in sequence or \
501 "U0313" in sequence or \
502 "U0314" in sequence or \
503 "0x0313" in sequence or \
504 "0x0342" in sequence or \
505 "0x0314" in sequence:
507 if "dead_belowring" in sequence or\
508 "dead_belowcomma" in sequence or\
509 "dead_belowmacron" in sequence or\
510 "dead_belowtilde" in sequence or\
511 "dead_belowbreve" in sequence or\
512 "dead_belowdiaeresis" in sequence or\
513 "dead_belowcircumflex" in sequence:
515 #for i in range(len(sequence)):
516 # if sequence[i] == "0x0342":
517 # sequence[i] = "dead_tilde"
# Sequences without Multi_key are candidates for algorithmic composition.
# Presumably the final append at the end of this block is the branch taken
# for Multi_key sequences -- indentation is lost, confirm.
518 if "Multi_key" not in sequence:
519 """ Ignore for now >0xFFFF keysyms """
# Note: strictly less-than, so a result of exactly 0xFFFF is excluded too.
520 if codepoint < 0xFFFF:
521 original_sequence = copy(sequence)
522 stats_sequence = copy(sequence)
# The last key is taken as the base character; the keys left in `sequence`
# are the candidate combining marks.
523 base = sequence.pop()
524 basechar = keysymvalue(base, filename_compose, linenum_compose)
526 if basechar < 0xFFFF:
529 not_normalised = True
530 skipping_this = False
531 for i in range(0, len(sequence)):
532 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
533 because of lack of dead_perispomeni (i.e. conflict)
536 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
539 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
542 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
# NOTE(review): the dead_psili branch assigns sequence[i] while the dead_dasia
# branch assigns sequence[-1] -- confirm the asymmetry is intended.
545 if sequence[-1] == "dead_psili":
546 sequence[i] = "dead_horn"
547 if sequence[-1] == "dead_dasia":
548 sequence[-1] = "dead_ogonek"
# Each mark is popped from the end of `sequence` and converted to a character.
550 unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
# Try every ordering of the marks: when base + marks normalises (NFC) to a
# single character, the sequence is recorded as algorithmic.
554 for perm in all_permutations(unisequence):
555 # print counter, original_sequence, unichr(basechar) + "".join(perm)
556 # print counter, map(unichr, perm)
557 normalized = normalize('NFC', unichr(basechar) + "".join(perm))
558 if len(normalized) == 1:
559 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
560 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
561 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
562 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
563 stats_sequence_data.append(normalized)
564 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
565 not_normalised = False
# Table entry: the original key names followed by the result code point.
# Presumably reached only while not_normalised is still True -- the guarding
# condition is not visible, confirm.
569 original_sequence.append(codepoint)
570 xorg_compose_sequences.append(original_sequence)
571 """ print xorg_compose_sequences[-1] """
574 print "Error in base char !?!"
577 print "OVER", sequence
580 sequence.append(codepoint)
581 xorg_compose_sequences.append(sequence)
582 """ print xorg_compose_sequences[-1] """
# cmp-style comparator passed to list.sort() for the compose table.
# Visible ordering criteria, in priority order: keysymvalue of the first key,
# then sequence length, then keysymvalue of the keys at positions 1 to 4.
# NOTE(review): every return statement and some branches are missing from this
# view (e.g. listing numbers 586, 588, 597-598, 603-604, 609-610, 615-617);
# presumably it returns a positive/negative/zero value as cmp functions do.
584 def sequence_cmp(x, y):
585 if keysymvalue(x[0]) > keysymvalue(y[0]):
587 elif keysymvalue(x[0]) < keysymvalue(y[0]):
589 elif len(x) > len(y):
591 elif len(x) < len(y):
593 elif keysymvalue(x[1]) > keysymvalue(y[1]):
595 elif keysymvalue(x[1]) < keysymvalue(y[1]):
599 elif keysymvalue(x[2]) > keysymvalue(y[2]):
601 elif keysymvalue(x[2]) < keysymvalue(y[2]):
605 elif keysymvalue(x[3]) > keysymvalue(y[3]):
607 elif keysymvalue(x[3]) < keysymvalue(y[3]):
611 elif keysymvalue(x[4]) > keysymvalue(y[4]):
613 elif keysymvalue(x[4]) < keysymvalue(y[4]):
# Comparator with the same visible shape as sequence_cmp, but comparing keys
# through keysymunicodevalue. The main script treats a result of 0 as "same
# sequence" when removing duplicates.
# NOTE(review): every return statement and some branches are missing from this
# view (e.g. listing numbers 620, 622, 631-632, 637-638, 643-644, 649-651).
618 def sequence_unicode_cmp(x, y):
619 if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
621 elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
623 elif len(x) > len(y):
625 elif len(x) < len(y):
627 elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
629 elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
633 elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
635 elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
639 elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
641 elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
645 elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
647 elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
# Comparator passed to list.sort() for the algorithmic sequences.
# NOTE(review): most of the body is missing from this view (listing numbers
# 653-654, 656-657, 659-665 are skipped); only a length comparison and an
# element-wise loop over x remain visible, so the exact ordering cannot be
# documented from here.
652 def sequence_algorithmic_cmp(x, y):
655 elif len(x) > len(y):
658 for i in range(len(x)):
# ---- Sort, de-duplicate and gather statistics.
# NOTE(review): several lines are missing from this view (e.g. listing numbers
# 669-670, 672-674, 677-678, 680-681, 691-692, 699-700, 705-707), including the
# initialisation of item, counter_multikey, firstitem and zeroes.
# list.sort() with a positional comparator is Python 2 only.
666 xorg_compose_sequences.sort(sequence_cmp)
# Keep an entry only when sequence_unicode_cmp reports it differs from the
# entry it is compared with; how `item` advances is not visible.
668 xorg_compose_sequences_uniqued = []
671 for next_item in xorg_compose_sequences:
675 if sequence_unicode_cmp(item, next_item) != 0:
676 xorg_compose_sequences_uniqued.append(item)
679 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
# Count entries whose key names (everything but the trailing code point)
# mention Multi_key.
682 for item in xorg_compose_sequences:
683 if findall('Multi_key', "".join(item[:-1])) != []:
684 counter_multikey += 1
686 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
687 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
# Count distinct first keysyms in the sorted table; `zeroes` grows by
# 6 - len(sequence) + 1.
690 num_first_keysyms = 0
693 num_algorithmic_greek = 0
694 for sequence in xorg_compose_sequences:
695 if keysymvalue(firstitem) != keysymvalue(sequence[0]):
696 firstitem = sequence[0]
697 num_first_keysyms += 1
698 zeroes += 6 - len(sequence) + 1
# Count algorithmic results in U+0370-U+03FF or U+1F00-U+1FFF (reported as
# "Greek" by the statistics output).
701 for sequence in xorg_compose_sequences_algorithmic_uniqued:
702 ch = ord(sequence[-1:][0])
703 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
704 num_algorithmic_greek += 1
# Dump the algorithmic sequences: the last element is the composed character,
# the one before it is printed as the base, the rest as the remaining keys.
# NOTE(review): the condition guarding this output is not visible (presumably
# opt_algorithmic) -- confirm.
708 for sequence in xorg_compose_sequences_algorithmic_uniqued:
709 letter = "".join(sequence[-1:])
710 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
711 for elem in sequence[:-2]:
712 print "<0x%(keysym)04X>," % { 'keysym': elem },
713 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
714 print "], recomposed as", letter, "verified"
def num_of_keysyms(seq):
    """Return how many table cells a sequence needs: len(seq) - 1.

    A table entry is a list of key names followed by the result code point.
    The compact table stores the first key once in the index row, so each
    entry occupies its remaining keys plus the code point.

    NOTE(review): the body was missing from the listing. It is restored
    from the table-building code, which increments column
    num_of_keysyms(sequence) - 1 and later sizes column i + 1 at i + 2
    cells per entry.
    """
    return len(seq) - 1
def convert_UnotationToHex(arg):
    """Rewrite a 'Uxxxx' string (four upper-case hex digits) as '0xxxxx'.

    Any other value -- other strings, or non-string items such as the
    integer code point that ends each table entry -- is returned unchanged.

    NOTE(review): the pass-through return was missing from the listing and
    is restored here; without it, unmatched items would become None.
    """
    if isinstance(arg, str) and match(r'^U[0-9A-F]{4}$', arg):
        return '0x' + arg[1:]
    return arg
def addprefix_GDK(arg):
    """Format one key of a sequence as a C array cell, with trailing ', '.

    Values already in 0x notation are emitted as they are; every other
    name gets the GDK_ prefix.

    NOTE(review): the `else:` line was missing from the listing and is
    restored here.
    """
    if arg.startswith('0x'):
        return '%(arg)s, ' % {'arg': arg}
    return 'GDK_%(arg)s, ' % {'arg': arg}
# ---- Build and print the compact table: an index part with one row per
# distinct first keysym, followed by the sequence data.
# NOTE(review): many lines are missing from this view (e.g. listing numbers
# 730-735, 738-740, 743, 748, 751-756, 763, 766, 769, 774, 777, 781-782, 785,
# 788), including the condition guarding this section, the initialisation of
# compose_table, ct_second_part and counter, and the conditions of the output
# branches below.
736 ct_sequence_width = 2
# The index part takes WIDTHOFCOMPOSETABLE + 1 cells per distinct first
# keysym; sequence data starts right after it.
737 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
741 sequence_iterator = iter(xorg_compose_sequences)
742 sequence = sequence_iterator.next()
# One index row per first keysym: [keysym, n, n, n, n, n]. For every sequence
# starting with that keysym, the column num_of_keysyms(sequence) - 1 is
# incremented.
744 first_keysym = sequence[0] # Set the first keysym
745 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
746 while sequence[0] == first_keysym:
747 compose_table[counter][num_of_keysyms(sequence)-1] += 1
749 sequence = sequence_iterator.next()
750 except StopIteration:
# Turn the per-column counts into running offsets: every sequence counted in
# column i + 1 occupies i + 2 cells of the data part.
757 ct_index = start_offset
758 for line_num in range(len(compose_table)):
759 for i in range(WIDTHOFCOMPOSETABLE):
760 occurences = compose_table[line_num][i+1]
761 compose_table[line_num][i+1] = ct_index
762 ct_index += occurences * (i+2)
# Data part: the table entries with U-notation keys rewritten to 0x notation.
764 for sequence in xorg_compose_sequences:
765 ct_second_part.append(map(convert_UnotationToHex, sequence))
# Emit the C header text, then the index rows, then the data rows. The
# visible variants print keys either numerically (via keysymvalue) or as
# GDK_ names, and either with or without the first key of each sequence.
767 print headerfile_start
768 for i in compose_table:
770 print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
771 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
772 elif not match('^0x', i[0]):
773 print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
775 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
776 for i in ct_second_part:
778 for ks in i[1:][:-1]:
779 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
780 print '0x%(cp)04X, ' % { 'cp':i[-1] }
783 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
784 print '0x%(cp)04X, ' % { 'cp':i[-1] }
786 elif opt_gtkexpanded:
787 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
789 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
def redecompose(codepoint):
    """Recursively decompose `codepoint` using the `unicodedatabase` table.

    Returns `[codepoint]` when the entry has no decomposition (first field
    is '' or '0'). Otherwise returns a list holding the recursive result
    for each code point of the decomposition, so the value is a nested
    list that the caller flattens. A leading `<tag>` field (compatibility
    decompositions) is skipped.

    Raises KeyError when `codepoint`, or a code point reached through a
    decomposition, is not present in `unicodedatabase`.

    NOTE(review): the base-case return was missing from the listing. It is
    restored as a one-element list because the caller iterates over each
    result and appends non-list items as code points.
    """
    (name, decomposition, combiningclass) = unicodedatabase[codepoint]
    if decomposition[0] == '' or decomposition[0] == '0':
        return [codepoint]
    if match(r'<\w+>', decomposition[0]):
        # Drop the formatting tag; the remaining fields are hex code points.
        parts = decomposition[1:]
    else:
        parts = decomposition
    return [redecompose(stringtohex(part)) for part in parts]
# Load UnicodeData.txt into `unicodedatabase`, then find the characters whose
# full decomposition recomposes (NFC) to a single character, and print
# statistics about them.
# NOTE(review): many lines are missing from this view (e.g. listing numbers
# 805, 809-810, 812, 815-816, 821, 830, 832, 836, 839, 841, 851, 853, 855,
# 857, 861, 863, 867, 869, 874-876), including the `try:` line, the conditions
# that use `verbose`, the initialisation of counter_entries and the assignment
# of `ch`.
802 def process_unicodedata_file(verbose = False):
803 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
804 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
806 unicodedatatxt = open(filename_unicodedatatxt, 'r')
807 except IOError, (errno, strerror):
808 print "I/O error(%s): %s" % (errno, strerror)
811 print "Unexpected error: ", sys.exc_info()[0]
813 for line in unicodedatatxt.readlines():
# NOTE(review): line[0] is a single character, so the comparison with "" can
# never be true -- confirm what was intended.
814 if line[0] == "" or line[0] == '#':
# Fields are ';'-separated; the first is the code point in hex.
817 uniproperties = split(';', line)
818 codepoint = stringtohex(uniproperties[0])
819 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
820 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
822 name = uniproperties[1]
823 category = uniproperties[2]
824 combiningclass = uniproperties[3]
825 decomposition = uniproperties[5]
# Stored per code point: [name, decomposition split on whitespace, combining
# class]. `category` is read above but not stored.
826 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
828 counter_combinations = 0
829 counter_combinations_greek = 0
831 counter_entries_greek = 0
833 for item in unicodedatabase.keys():
834 (name, decomposition, combiningclass) = unicodedatabase[item]
835 if decomposition[0] == '':
837 print name, "is empty"
# Decompositions that start with a <tag> are reported separately.
838 elif match('<\w+>', decomposition[0]):
840 print name, "has weird", decomposition[0]
842 sequence = map(stringtohex, decomposition)
843 chrsequence = map(unichr, sequence)
844 normalized = normalize('NFC', "".join(chrsequence))
846 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
# Flatten the nested lists returned by redecompose() into one list of code
# points.
847 decomposedsequence = []
848 for subseq in map(redecompose, sequence):
849 for seqitem in subseq:
850 if isinstance(seqitem, list):
852 if isinstance(i, list):
854 decomposedsequence.append(j)
856 decomposedsequence.append(i)
858 decomposedsequence.append(seqitem)
859 recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
# Counted only when more than one code point recomposes to one character;
# the combination count grows by (number of code points - 1) factorial.
860 if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
862 counter_combinations += factorial(len(decomposedsequence)-1)
# Separate tally for U+0370-U+03FF and U+1F00-U+1FFF (reported as Greek).
864 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
865 counter_entries_greek += 1
866 counter_combinations_greek += factorial(len(decomposedsequence)-1)
868 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
870 for elem in decomposedsequence:
871 print '<0x%(hex)04X>,' % { 'hex': elem },
872 print "], recomposed as", recomposedchar,
873 if unichr(item) == recomposedchar:
877 print "Unicode statistics from UnicodeData.txt"
878 print "Number of entries that can be algorithmically produced :", counter_entries
879 print " of which are for Greek :", counter_entries_greek
880 print "Number of compose sequence combinations requiring :", counter_combinations
881 print " of which are for Greek :", counter_combinations_greek
882 print "Note: We do not include partial compositions, "
883 print "thus the slight discrepancy in the figures"
# ---- Final reporting.
# NOTE(review): several lines are missing from this view (listing numbers
# 884-885, 888-890, 895, 900, 911, 914 are skipped), including the condition
# guarding the statistics output (presumably opt_statistics) and the
# definition of num_entries.
886 if opt_unicodedatatxt:
887 process_unicodedata_file(True)
# The total is the sum of the table entries and the algorithmic entries.
891 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
892 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic)
893 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences)
894 print " of which have Multi_key :", counter_multikey
896 print "Algorithmic (stats for Xorg Compose file)"
897 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
898 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued)
899 print " of which are for Greek :", num_algorithmic_greek
901 process_unicodedata_file()
# Size estimates below assume 6 cells of 2 bytes per flat-array row, as the
# printed text states.
902 print "Not algorithmic (stats from Xorg Compose file)"
903 print "Number of sequences :", len(xorg_compose_sequences)
904 print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
905 print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library"
906 print "Number of items in flat array :", len(xorg_compose_sequences) * 6
# Under Python 2 the percentage is an integer division (truncated).
907 print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
908 print "Number of different first items :", num_first_keysyms
909 print "Number of max bytes (if using flat array) :", num_entries * 2 * 6
910 print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5
912 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
913 print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
# Hard-coded figures for the previous implementation, for comparison.
915 print "Existing (old) implementation in GTK+"
916 print "Number of sequences in old gtkimcontextsimple.c :", 691
917 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"