]> Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py
stylecontext: Do invalidation on first resize container
[~andy/gtk] / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# Remote inputs.  Each one is fetched with download_file() and cached in the
# current directory under the last path component of its URL.
# Upstream X.org compose table.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
# Keysym <-> Unicode mapping maintained by Markus Kuhn.
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
# GDK keysym definitions (#define GDK_KEY_<name> 0x<hex>).
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
# NOTE(review): presumably consumed by the --unicodedatatxt mode; confirm.
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
# Optional local file whose lines are appended to the upstream compose table.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted, 
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current maximum compose sequence length; kept as a constant in case it is
# increased later.
WIDTHOFCOMPOSETABLE = 5

# Lookup tables.  They start empty; keysymdatabase and keysymunicodedatabase
# are rebound to the parsed tables once the process_*() functions have run.
keysymdatabase = {}
keysymunicodedatabase = {}
unicodedatabase = {}
40
# Text emitted verbatim at the top of the generated C header, up to and
# including the opening brace of the gtk_compose_seqs_compact[] initialiser.
# (The licence sentence used to read "see see <http://...>"; the duplicated
# word has been removed.)
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * 
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Text emitted after the table data: closes the array initialiser and the
# include guard opened in headerfile_start.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
113
def stringtohex(str):
    """Return the integer value of the hexadecimal string ``str``.

    An optional ``0x``/``0X`` prefix is accepted.  Raises ValueError when
    the text is not valid hexadecimal.  The parameter keeps its historical
    name for backward compatibility even though it shadows the builtin.
    """
    # int() with an explicit base replaces the deprecated string.atoi().
    return int(str, 16)
115
def factorial(n):
    """Return n! for n > 1, and 1 for any n <= 1 (including negatives)."""
    product = 1
    # Multiply n, n-1, ... down to 2.
    while n > 1:
        product *= n
        n -= 1
    return product
121
def uniq(*args):
    """Concatenate the given lists and drop repeated elements.

    The first occurrence of each element is kept and the original order is
    preserved.  Elements are compared with ``==`` (a list membership test),
    so they do not have to be hashable.
    """
    merged = []
    for chunk in args:
        merged.extend(chunk)

    distinct = []
    for candidate in merged:
        if candidate in distinct:
            continue
        distinct.append(candidate)
    return distinct
132
133
134
def all_permutations(seq):
    """Generate every permutation of ``seq`` (a list or a string).

    Borrowed from
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

    A sequence of length 0 or 1 yields itself.  Otherwise the first element
    is inserted at each position of every permutation of the remainder.
    """
    if len(seq) > 1:
        # One-element slices work for both strings and lists.
        head, tail = seq[0:1], seq[1:]
        for partial in all_permutations(tail):
            for cut in range(len(partial) + 1):
                yield partial[:cut] + head + partial[cut:]
    else:
        yield seq
145
def usage():
    """Print the list of supported command-line options to stdout."""
    help_text = """compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """
    print(help_text)
160
# Command-line parsing.  Only getopt's own error (unknown option, missing
# argument) leads to the usage message; any other exception now propagates
# instead of being swallowed by a bare "except:".
try:
    opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
        "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
except getopt.GetoptError:
    usage()
    sys.exit(2)

# One flag per supported option; all off until seen on the command line.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False

for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    if o in ("-s", "--statistics"):
        opt_statistics = True
    if o in ("-a", "--algorithmic"):
        opt_algorithmic = True
    if o in ("-g", "--gtk"):
        opt_gtk = True
    if o in ("-u", "--unicodedatatxt"):
        opt_unicodedatatxt = True
    if o in ("-v", "--verbose"):
        opt_verbose = True
    if o in ("-p", "--plane1"):
        opt_plane1 = True
    if o in ("-n", "--numeric"):
        opt_numeric = True
    if o in ("-e", "--gtk-expanded"):
        opt_gtkexpanded = True

# With no output mode selected, fall back to printing statistics.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
    opt_statistics = True
200
def download_hook(blocks_transferred, block_size, file_size):
    """Progress callback handed to urlretrieve() by download_file().

    On the first call (no blocks transferred yet) and only in verbose mode,
    a "Downloading" banner is written, including the byte count when
    ``file_size`` is positive.  Every call, verbose or not, writes one '#'
    and flushes stdout.  ``block_size`` is not used.
    """
    out = sys.stdout
    if blocks_transferred == 0 and opt_verbose:
        if file_size > 0:
            out.write("Downloading %s bytes: " % file_size)
        else:
            out.write("Downloading: ")
    out.write('#')
    out.flush()
212
213
def download_file(url):
    """Fetch ``url`` into the current directory unless a cached copy exists.

    The local file name is the text after the last '/' of the URL.  A file
    of that name that exists and is non-empty is reused as-is; otherwise the
    URL is retrieved with urlretrieve(), using download_hook for progress.

    Returns the local file name.  On a download failure a message is printed
    and the script terminates via sys.exit(-1).
    """
    localfilename = url.split('/')[-1]

    if isfile(localfilename) and getsize(localfilename) > 0:
        if opt_verbose:
            print("Using cached file for  %s" % url)
        return localfilename

    if opt_verbose:
        print("Downloading  %s ..." % url)
    try:
        urlretrieve(url, localfilename, download_hook)
    except IOError as err:
        # Not every IOError carries (errno, strerror); unpacking the
        # exception as a 2-tuple would itself raise, so fall back to the
        # exception text when strerror is missing.
        print("I/O error(%s): %s" % (err.errno, err.strerror or err))
        sys.exit(-1)
    except Exception:
        # "Exception" rather than a bare except, so that Ctrl-C and
        # SystemExit are not reported as download errors.
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)
    print(" done.")
    return localfilename
234
def process_gdkkeysymsh():
    """Build the keysym-name -> GDK keysym-value table from gdkkeysyms.h.

    The header is obtained through download_file(URL_GDKKEYSYMSH).  Each
    line of the form ``#define GDK_KEY_<name> 0x<hex>`` adds
    ``<name> -> value`` to the table; definitions whose value is 0 are
    skipped, and all other lines are ignored.  A few entries are then
    patched in by hand.

    Returns the resulting dict.  An unreadable file or a malformed
    ``#define GDK_KEY_`` line prints a diagnostic and ends the script via
    sys.exit(-1).
    """
    filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    try:
        gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    except IOError as err:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        sys.exit(-1)
    except Exception:
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)

    def invalid_line(linenum, line, expectation):
        # Report a malformed definition and abort the script.
        print("Invalid line %(linenum)d in %(filename)s: %(line)s"
              % {'linenum': linenum, 'filename': filename_gdkkeysymsh, 'line': line})
        print(expectation)
        sys.exit(-1)

    # Parse the gdkkeysyms.h file and place its contents in keysymdb.
    keysymdb = {}
    with gdkkeysymsh:  # closed on every exit path, including sys.exit()
        for linenum, line in enumerate(gdkkeysymsh, 1):
            line = line.strip()
            if not line.startswith('#define GDK_KEY_'):
                continue
            components = split(r'\s+', line)
            if len(components) < 3:
                invalid_line(linenum, line, "Was expecting 3 items in the line")
            if not components[1].startswith('GDK_KEY_'):
                invalid_line(linenum, line, "Was expecting a keysym starting with GDK_KEY_")
            if not match(r'^0x[0-9a-fA-F]+$', components[2]):
                invalid_line(linenum, line, "Was expecting a hexadecimal number at the end of the line")
            unival = int(components[2][2:], 16)
            if unival == 0:
                continue
            # Strip the 8-character "GDK_KEY_" prefix from the name.
            keysymdb[components[1][8:]] = unival

    # Patch up the keysymdb with some of our own stuff.

    # Missing from the currently upstream file.
    keysymdb['dead_stroke'] = 0x338

    # Disabled additions, kept for reference:
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    # This was preferential treatment for Greek (disabled):
    # keysymdb['dead_tilde'] = 0x342
    # keysymdb['combining_tilde'] = 0x342

    # Fixing VoidSymbol.
    keysymdb['VoidSymbol'] = 0xFFFF

    return keysymdb
301
def process_keysymstxt():
    """Build the keysym-name -> Unicode code point table from keysyms.txt.

    keysyms.txt is the keysym <-> Unicode record maintained by Markus Kuhn;
    it is obtained through download_file(URL_KEYSYMSTXT).  Blank lines and
    lines starting with '#' are ignored.  Every other line must split into
    at least five whitespace-separated fields: field 2 holds ``U<hex>`` and
    field 5 the keysym name.  Code point 0 is skipped.  A number of keysyms
    are then added by hand.

    Returns the resulting dict.  An unreadable file or a line with fewer
    than five fields prints a diagnostic and ends the script via
    sys.exit(-1).
    """
    filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    try:
        keysymstxt = open(filename_keysymstxt, 'r')
    except IOError as err:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        sys.exit(-1)
    except Exception:
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)

    # Parse the keysyms.txt file and place its content in keysymdb.
    keysymdb = {}
    with keysymstxt:  # closed on every exit path, including sys.exit()
        for linenum_keysymstxt, line in enumerate(keysymstxt, 1):
            line = line.strip()
            if line == "" or line.startswith('#'):
                continue
            components = split(r'\s+', line)
            if len(components) < 5:
                print("Invalid line %(linenum)d in %(filename)s: %(line)s"
                      % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line})
                print("Was expecting 5 items in the line")
                sys.exit(-1)
            if not match(r'^U[0-9a-fA-F]+$', components[1]):
                # No usable code point on this line.  Previously the value
                # parsed from the preceding line was silently reused here
                # (or a NameError was raised on the first data line).
                continue
            unival = int(components[1][1:], 16)
            if unival == 0:
                continue
            keysymdb[components[4]] = unival

    # Patch up the keysymdb with some of our own stuff.

    # Disabled additions, kept for reference:
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    # Preferential treatment for Greek (disabled; gives more savings if
    # used for Greek):
    # keysymdb['dead_tilde'] = 0x342
    # keysymdb['combining_tilde'] = 0x342

    keysymdb.update({
        # Missing from Markus Kuhn's db.
        'dead_stroke': 0x338,
        'Oslash': 0x0d8,
        'Ssharp': 0x1e9e,
        # Missing (recently added) keysyms.
        'dead_psili': 0x313,
        'dead_dasia': 0x314,
        # Allows to import Multi_key sequences.
        'Multi_key': 0xff20,
        'zerosubscript': 0x2080,
        'onesubscript': 0x2081,
        'twosubscript': 0x2082,
        'threesubscript': 0x2083,
        'foursubscript': 0x2084,
        'fivesubscript': 0x2085,
        'sixsubscript': 0x2086,
        'sevensubscript': 0x2087,
        'eightsubscript': 0x2088,
        'ninesubscript': 0x2089,
        'dead_doublegrave': 0x030F,
        'dead_invertedbreve': 0x0311,
    })

    return keysymdb
380
def keysymvalue(keysym, file = "n/a", linenum = 0):
    """Return the GDK keysym value for the keysym name ``keysym``.

    Resolution order: the empty string maps to 0; a name present in
    keysymdatabase maps to its stored value; ``U<hex>`` and ``0x<hex>``
    map to the integer value of ``<hex>``.  Anything else prints an
    "UNKNOWN" diagnostic and ends the script via sys.exit(-1).

    ``file`` and ``linenum`` are accepted for error reporting but are not
    currently included in the diagnostic.
    """
    if keysym == "":
        return 0
    if keysym in keysymdatabase:
        return keysymdatabase[keysym]
    if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return int(keysym[1:], 16)
    if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return int(keysym[2:], 16)
    print('keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym })
    sys.exit(-1)
397
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
    """Return the Unicode code point for the keysym name ``keysym``.

    Resolution order: the empty string maps to 0; a name present in
    keysymunicodedatabase maps to its stored value; ``U<hex>`` and
    ``0x<hex>`` map to the integer value of ``<hex>``.  Anything else
    prints an "UNKNOWN" diagnostic and ends the script via sys.exit(-1).

    ``file`` and ``linenum`` are accepted for error reporting but are not
    currently included in the diagnostic.
    """
    if keysym == "":
        return 0
    if keysym in keysymunicodedatabase:
        return keysymunicodedatabase[keysym]
    if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return int(keysym[1:], 16)
    if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return int(keysym[2:], 16)
    print('keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym })
    sys.exit(-1)
413
def rename_combining(seq):
    """Return a copy of ``seq`` with combining keysyms renamed to dead keys.

    A leading ``combining_`` becomes ``dead_``; afterwards the two spellings
    ``dead_double_grave`` and ``dead_inverted_breve`` are mapped to their
    underscore-free forms.  All other names pass through unchanged.
    """
    prefix = 'combining_'
    respelled = {
        'dead_double_grave': 'dead_doublegrave',
        'dead_inverted_breve': 'dead_invertedbreve',
    }
    renamed = []
    for keysym in seq:
        if keysym.startswith(prefix):
            keysym = 'dead_' + keysym[len(prefix):]
        renamed.append(respelled.get(keysym, keysym))
    return renamed
425
426
# Populate the two keysym lookup tables used by keysymunicodevalue() and
# keysymvalue().
keysymunicodedatabase = process_keysymstxt()
keysymdatabase = process_gdkkeysymsh()

# Grab and open the compose file from upstream.
filename_compose = download_file(URL_COMPOSE)
try:
    composefile = open(filename_compose, 'r')
except IOError as err:
    print("I/O error(%s): %s" % (err.errno, err.strerror))
    sys.exit(-1)
except Exception:
    print("Unexpected error:  %s" % (sys.exc_info()[0],))
    sys.exit(-1)

# Raw lines of the upstream Compose file, followed by the lines of the
# lookaside (supplementary) compose file when one exists in the current
# directory.
xorg_compose_sequences_raw = []
with composefile:
    xorg_compose_sequences_raw.extend(composefile.readlines())

try:
    with open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r') as composefile_lookaside:
        xorg_compose_sequences_raw.extend(composefile_lookaside.readlines())
except IOError as err:
    # The lookaside file is optional; its absence is only worth a note in
    # verbose mode.
    if opt_verbose:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        print("Did not find lookaside compose file. Continuing...")
except Exception:
    print("Unexpected error:  %s" % (sys.exc_info()[0],))
    sys.exit(-1)
459
# Parse the raw compose lines into two lists:
#   xorg_compose_sequences             - [keysym, ..., codepoint] entries that
#                                        must be stored in the table
#   xorg_compose_sequences_algorithmic - [codepoint, ..., composed_char]
#                                        entries that Unicode NFC
#                                        normalisation reproduces, so they
#                                        need no table entry
#
# Fixes relative to the previous version of this loop:
#   * a line without "*/" no longer loses its first character (the slice
#     used to run with comment_end == -1);
#   * the empty-line test uses == instead of the identity test "is";
#   * sys.exit() replaces the site-provided exit();
#   * error messages no longer contain the source indentation that the
#     backslash continuations inside the string literals dragged in.

# A sequence mentioning any of these keysyms is skipped.
_SKIPPED_KEYSYMS = (
    "U0342", "U0313", "U0314",
    "0x0313", "0x0342", "0x0314",
    "dead_belowring", "dead_currency", "dead_belowcomma",
    "dead_belowmacron", "dead_belowtilde", "dead_belowbreve",
    "dead_belowdiaeresis", "dead_belowcircumflex",
)

xorg_compose_sequences = []
xorg_compose_sequences_algorithmic = []
linenum_compose = 0
comment_nest_depth = 0
for line in xorg_compose_sequences_raw:
    linenum_compose += 1
    line = line.strip()
    if line.startswith("XCOMM") or line.startswith("#"):
        continue

    # Remove comments that open and close on this same line.
    line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)

    comment_start = line.find("/*")
    if comment_start >= 0:
        # A multi-line comment opens here: keep only the text before it,
        # unless we are already inside a comment.
        if comment_nest_depth == 0:
            line = line[:comment_start]
        else:
            line = ""
        comment_nest_depth += 1
    else:
        comment_end = line.find("*/")
        if comment_end >= 0:
            comment_nest_depth -= 1

        if comment_nest_depth < 0:
            print("Invalid comment %(linenum_compose)d in %(filename)s: Closing '*/' without opening '/*'"
                  % { "linenum_compose": linenum_compose, "filename": filename_compose })
            sys.exit(-1)

        if comment_nest_depth > 0:
            line = ""
        elif comment_end >= 0:
            # Keep only what follows the closing "*/".
            line = line[comment_end + 2:]

    if line == "":
        continue

    components = split(':', line)
    if len(components) != 2:
        print("Invalid line %(linenum_compose)d in %(filename)s: No sequence/value pair found"
              % { "linenum_compose": linenum_compose, "filename": filename_compose })
        sys.exit(-1)
    seq, val = components
    seq = seq.strip()
    val = val.strip()
    raw_sequence = findall(r'\w+', seq)
    values = split(r'\s+', val)
    # The quoted result character; only the disabled debug output used it.
    unichar = split('"', values[0])[1]
    if len(values) == 1:
        continue
    codepointstr = values[1]
    if codepointstr == '#':
        # No codepoints that are >1 characters yet.
        continue
    # A leading U<hex> keysym is rewritten to the 0x<hex> spelling.
    if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
        raw_sequence[0] = '0x' + raw_sequence[0][1:]
    if match('^U[0-9a-fA-F]+$', codepointstr):
        codepoint = int(codepointstr[1:], 16)
    elif codepointstr in keysymunicodedatabase:
        codepoint = keysymunicodedatabase[codepointstr]
    else:
        print("")
        print("Invalid codepoint at line %(linenum_compose)d in %(filename)s: %(line)s"
              % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line })
        sys.exit(-1)

    sequence = rename_combining(raw_sequence)

    # Reject sequences containing a keysym that does not fit in 16 bits.
    reject_this = False
    for keysym in sequence:
        value = keysymvalue(keysym)
        if value > 0xFFFF:
            reject_this = True
            if opt_plane1:
                print(sequence)
            break
        if value < 0:
            reject_this = True
            break
    if reject_this:
        continue

    if any(skipped in sequence for skipped in _SKIPPED_KEYSYMS):
        continue

    if "Multi_key" in sequence:
        # Multi_key sequences always go into the table as they are.
        sequence.append(codepoint)
        xorg_compose_sequences.append(sequence)
        continue

    # Ignore for now >0xFFFF keysyms.
    if codepoint >= 0xFFFF:
        print("OVER %s" % (sequence,))
        sys.exit(-1)

    original_sequence = copy(sequence)
    stats_sequence = copy(sequence)
    # The last keysym is the base character; the ones before it are the
    # dead keys applied to it.
    base = sequence.pop()
    basechar = keysymvalue(base, filename_compose, linenum_compose)
    if basechar >= 0xFFFF:
        print("Error in base char !?!")
        sys.exit(-2)

    # Unicode characters of the dead keys, last-typed first.
    # (Special-casing of Greek dead_tilde/dead_psili/dead_dasia used to
    # live here and is disabled.)
    unisequence = [unichr(keysymunicodevalue(keysym, filename_compose, linenum_compose))
                   for keysym in reversed(sequence)]

    for perm in all_permutations(unisequence):
        normalized = normalize('NFC', unichr(basechar) + "".join(perm))
        if len(normalized) == 1:
            # Base + marks compose to a single character under NFC, so
            # the sequence is recorded as algorithmic.
            stats_sequence_data = [keysymunicodevalue(keysym) for keysym in stats_sequence]
            stats_sequence_data.append(normalized)
            xorg_compose_sequences_algorithmic.append(stats_sequence_data)
            break
    else:
        # No ordering of the marks normalises to one character: the
        # sequence has to be stored explicitly.
        original_sequence.append(codepoint)
        xorg_compose_sequences.append(original_sequence)
631
def sequence_cmp(x, y):
    """cmp-style ordering of two compose entries by GDK keysym value.

    Entries are ordered by the value of their first keysym, then by length,
    then by the values at positions 1 to 4.  Positions 2 to 4 are only
    looked at when ``x`` is long enough to hold a keysym there in addition
    to its final element.  Returns 1, -1 or 0.
    """
    left, right = keysymvalue(x[0]), keysymvalue(y[0])
    if left != right:
        return 1 if left > right else -1
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    for position in range(1, 5):
        # Position 1 is always compared; later ones only if x is long enough.
        if position > 1 and len(x) < position + 2:
            return 0
        left, right = keysymvalue(x[position]), keysymvalue(y[position])
        if left != right:
            return 1 if left > right else -1
    return 0
665
def sequence_unicode_cmp(x, y):
    """cmp-style ordering of two compose entries by Unicode keysym value.

    Same scheme as sequence_cmp, but every keysym is resolved with
    keysymunicodevalue(): first keysym, then length, then positions 1 to 4,
    where positions 2 to 4 are only looked at when ``x`` is long enough to
    hold a keysym there in addition to its final element.  Returns 1, -1
    or 0.
    """
    left, right = keysymunicodevalue(x[0]), keysymunicodevalue(y[0])
    if left != right:
        return 1 if left > right else -1
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    for position in range(1, 5):
        # Position 1 is always compared; later ones only if x is long enough.
        if position > 1 and len(x) < position + 2:
            return 0
        left, right = keysymunicodevalue(x[position]), keysymunicodevalue(y[position])
        if left != right:
            return 1 if left > right else -1
    return 0
699
def sequence_algorithmic_cmp(x, y):
        """Compare two sequences: shorter ones first, then element by element.

        Returns -1, 1 or 0 in the manner of cmp(), for use with list.sort().
        """
        if len(x) != len(y):
                return -1 if len(x) < len(y) else 1

        # Same length from here on: the first differing pair decides.
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
712
713
# Sort with sequence_cmp() so that the uniquing pass below only has to look
# at neighbouring entries.
xorg_compose_sequences.sort(sequence_cmp)

# Collapse every run of consecutive sequences that sequence_unicode_cmp()
# reports as equal, keeping the last member of each run.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
        if item is not None and sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# A run is only emitted when the next run begins, so the final run has to be
# flushed explicitly; without this the last sequence would be lost.
if item is not None:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
728
# Count the sequences that mention Multi_key; the last element of each
# sequence is left out of the search.
counter_multikey = 0
for item in xorg_compose_sequences:
        keysym_names = "".join(item[:-1])
        if len(findall('Multi_key', keysym_names)) > 0:
                counter_multikey += 1

# Sort the algorithmic sequences, then pass them through uniq().
xorg_compose_sequences_algorithmic.sort(cmp=sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
736
# Figures used by the table generator and the statistics report.
num_first_keysyms = 0      # times the first keysym's value changes along the table
zeroes = 0                 # accumulates 6 - len(sequence) + 1 per sequence
num_entries = 0            # sequences seen
firstitem = ""
for sequence in xorg_compose_sequences:
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                num_first_keysyms += 1
                firstitem = sequence[0]
        num_entries += 1
        zeroes += 6 + 1 - len(sequence)

# Count the algorithmic sequences whose final character lies in
# 0x370-0x3ff or 0x1f00-0x1fff (reported as "Greek" in the statistics).
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1])
        if 0x370 <= ch <= 0x3ff or 0x1f00 <= ch <= 0x1fff:
                num_algorithmic_greek += 1
753                 
754
755 if opt_algorithmic:
756         for sequence in xorg_compose_sequences_algorithmic_uniqued:
757                 letter = "".join(sequence[-1:])
758                 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
759                 for elem in sequence[:-2]:
760                         print "<0x%(keysym)04X>," % { 'keysym': elem },
761                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
762                 print "], recomposed as", letter.encode('utf-8'), "verified"
763
def num_of_keysyms(seq):
        """Return len(seq) - 1: the number of elements of seq before its last one.

        An empty seq yields -1.
        """
        total_length = len(seq)
        return total_length - 1
766
def convert_UnotationToHex(arg):
        """Rewrite a 'UXXXX' string (four uppercase hex digits) as '0xXXXX'.

        Anything else, including values that are not str, is returned
        unchanged.
        """
        if not isinstance(arg, str):
                return arg
        if not match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
                return arg
        return sub('^U', '0x', arg)
772
def addprefix_GDK(arg):
        """Format one keysym as a table cell followed by ', '.

        Values starting with '0x' are emitted as they are; every other
        value gets the 'GDK_KEY_' prefix.
        """
        template = '%(arg)s, ' if match('^0x', arg) else 'GDK_KEY_%(arg)s, '
        return template % { 'arg': arg }
778
779 if opt_gtk:
780         first_keysym = ""
781         sequence = []
782         compose_table = []
783         ct_second_part = []
784         ct_sequence_width = 2
785         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
786         we_finished = False
787         counter = 0
788
789         sequence_iterator = iter(xorg_compose_sequences)
790         sequence = sequence_iterator.next()
791         while True:
792                 first_keysym = sequence[0]                                      # Set the first keysym
793                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
794                 while sequence[0] == first_keysym:
795                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
796                         try:
797                                 sequence = sequence_iterator.next()
798                         except StopIteration:
799                                 we_finished = True
800                                 break
801                 if we_finished:
802                         break
803                 counter += 1
804
805         ct_index = start_offset
806         for line_num in range(len(compose_table)):
807                 for i in range(WIDTHOFCOMPOSETABLE):
808                         occurences = compose_table[line_num][i+1]
809                         compose_table[line_num][i+1] = ct_index
810                         ct_index += occurences * (i+2)
811
812         for sequence in xorg_compose_sequences:
813                 ct_second_part.append(map(convert_UnotationToHex, sequence))
814
815         print headerfile_start
816         for i in compose_table:
817                 if opt_gtkexpanded:
818                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
819                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
820                 elif not match('^0x', i[0]):
821                         print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
822                 else:
823                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
824         for i in ct_second_part:
825                 if opt_numeric:
826                         for ks in i[1:][:-1]:
827                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
828                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
829                         """
830                         for ks in i[:-1]:
831                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
832                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
833                         """
834                 elif opt_gtkexpanded:
835                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
836                 else:
837                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
838         print headerfile_end 
839
def redecompose(codepoint):
        """Recursively expand codepoint through unicodedatabase.

        Returns [codepoint] when the first decomposition field is '' or '0'.
        Otherwise returns a list holding the recursive result for each
        codepoint of the decomposition; a leading <tag> field is skipped.
        """
        (_name, decomposition, _combiningclass) = unicodedatabase[codepoint]
        head = decomposition[0]
        if head in ('', '0'):
                return [codepoint]
        # Drop the leading <tag> field, if any, before converting the rest.
        fields = decomposition[1:] if match('<\w+>', head) else decomposition
        numdecomposition = map(stringtohex, fields)
        return map(redecompose, numdecomposition)
849
850 def process_unicodedata_file(verbose = False):
851         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
852         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
853         try: 
854                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
855         except IOError, (errno, strerror):
856                 print "I/O error(%s): %s" % (errno, strerror)
857                 sys.exit(-1)
858         except:
859                 print "Unexpected error: ", sys.exc_info()[0]
860                 sys.exit(-1)
861         for line in unicodedatatxt.readlines():
862                 if line[0] == "" or line[0] == '#':
863                         continue
864                 line = line[:-1]
865                 uniproperties = split(';', line)
866                 codepoint = stringtohex(uniproperties[0])
867                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
868                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
869                         continue
870                 name = uniproperties[1]
871                 category = uniproperties[2]
872                 combiningclass = uniproperties[3]
873                 decomposition = uniproperties[5]
874                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
875         
876         counter_combinations = 0
877         counter_combinations_greek = 0
878         counter_entries = 0
879         counter_entries_greek = 0
880
881         for item in unicodedatabase.keys():
882                 (name, decomposition, combiningclass) = unicodedatabase[item]
883                 if decomposition[0] == '':
884                         continue
885                         print name, "is empty"
886                 elif match('<\w+>', decomposition[0]):
887                         continue
888                         print name, "has weird", decomposition[0]
889                 else:
890                         sequence = map(stringtohex, decomposition)
891                         chrsequence = map(unichr, sequence)
892                         normalized = normalize('NFC', "".join(chrsequence))
893                         
894                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
895                         decomposedsequence = []
896                         for subseq in map(redecompose, sequence):
897                                 for seqitem in subseq:
898                                         if isinstance(seqitem, list):
899                                                 for i in seqitem:
900                                                         if isinstance(i, list):
901                                                                 for j in i:
902                                                                         decomposedsequence.append(j)
903                                                         else:
904                                                                 decomposedsequence.append(i)
905                                         else:
906                                                 decomposedsequence.append(seqitem)
907                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
908                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
909                                 counter_entries += 1
910                                 counter_combinations += factorial(len(decomposedsequence)-1)
911                                 ch = item
912                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
913                                         counter_entries_greek += 1
914                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
915                                 if verbose:
916                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
917                                         print "[",
918                                         for elem in decomposedsequence:
919                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
920                                         print "], recomposed as", recomposedchar,
921                                         if unichr(item) == recomposedchar:
922                                                 print "verified"
923         
924         if verbose == False:
925                 print "Unicode statistics from UnicodeData.txt"
926                 print "Number of entries that can be algorithmically produced     :", counter_entries
927                 print "  of which are for Greek                                   :", counter_entries_greek
928                 print "Number of compose sequence combinations requiring          :", counter_combinations
929                 print "  of which are for Greek                                   :", counter_combinations_greek
930                 print "Note: We do not include partial compositions, "
931                 print "thus the slight discrepancy in the figures"
932                 print
933
# With this option set, print the per-character listing from UnicodeData.txt
# rather than the summary statistics.
if opt_unicodedatatxt:
        process_unicodedata_file(verbose = True)
936
937 if opt_statistics:
938         print
939         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
940         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
941         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
942         print "    of which have Multi_key                                :", counter_multikey
943         print 
944         print "Algorithmic (stats for Xorg Compose file)"
945         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
946         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
947         print "  of which are for Greek                                   :", num_algorithmic_greek
948         print 
949         process_unicodedata_file()
950         print "Not algorithmic (stats from Xorg Compose file)"
951         print "Number of sequences                                        :", len(xorg_compose_sequences) 
952         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
953         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
954         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
955         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
956         print "Number of different first items                            :", num_first_keysyms
957         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
958         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
959         print 
960         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
961         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
962         print
963         print "Existing (old) implementation in GTK+"
964         print "Number of sequences in old gtkimcontextsimple.c            :", 691
965         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"