Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py
Closed Bug 550676 – Memory leak, update keyboard layout data table
[~andy/gtk] / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# We grab files off the web, left and right.
# Each URL below is handed to download_file(), which stores the file under the
# last '/'-separated component of the URL and reuses a non-empty local copy.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Lookup tables, empty until the parsers run:
#   keysymdatabase        is replaced by the result of process_gdkkeysymsh()
#   keysymunicodedatabase is replaced by the result of process_keysymstxt()
# NOTE(review): unicodedatabase is presumably filled from UnicodeData.txt
# further down the file - confirm.
keysymdatabase = {}
keysymunicodedatabase = {}
unicodedatabase = {}
39
40 headerfile_start = """/* GTK - The GIMP Tool Kit
41  * Copyright (C) 2007, 2008 GNOME Foundation
42  *
43  * This library is free software; you can redistribute it and/or
44  * modify it under the terms of the GNU Lesser General Public
45  * License as published by the Free Software Foundation; either
46  * version 2 of the License, or (at your option) any later version.
47  *
48  * This library is distributed in the hope that it will be useful,
49  * but WITHOUT ANY WARRANTY; without even the implied warranty of
50  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
51  * Lesser General Public License for more details.
52  *
53  * You should have received a copy of the GNU Lesser General Public
54  * License along with this library; if not, write to the
55  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
56  * Boston, MA 02111-1307, USA.
57  */
58
59 /*
60  * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
61  * using the input files
62  *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
63  *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
64  *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
65  *
66  * This table is optimised for space and requires special handling to access the content.
67  * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
68  * 
69  * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
70  * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
71  */
72
73 /*
74  * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
75  * file for a list of people on the GTK+ Team.  See the ChangeLog
76  * files for a list of changes.  These files are distributed with
77  * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
78  */
79
80 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
81 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82
83 /* === These are the original comments of the file; we keep for historical purposes ===
84  *
85  * The following table was generated from the X compose tables include with
86  * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
87  * to obtain the relevant perl scripts.
88  *
89  * The following compose letter letter sequences confliced
90  *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
91  *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
92  *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
93  *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
94  *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
95  *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
96  *
97  * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
98  *   spanish. atilde and otilde are used at least for Portuguese ]
99  *
100  *   at and Aring; resolved to Aring                                          [ AA ]
101  *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
102  *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
103  *
104  * This probably should be resolved by first checking an additional set of compose tables
105  * that depend on the locale or selected input method.
106  */
107
108 static const guint16 gtk_compose_seqs_compact[] = {"""
109
110 headerfile_end = """};
111
112 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
113 """
114
def stringtohex(str):
        """ Converts a string of hexadecimal digits (e.g. 'ff' or '0xff') to an integer """
        # The parameter name shadows the builtin; it is kept so that existing
        # callers are unaffected. int() is what the deprecated string.atoi()
        # delegated to, so the result is unchanged.
        return int(str, 16)
116
def factorial(n):
        """ Returns n! (1 for any n <= 1).

        Computed with a loop rather than recursion, so large values of n do
        not run into the interpreter's recursion limit.
        """
        result = 1
        while n > 1:
                result *= n
                n -= 1
        return result
122
def uniq(*args):
        """ Concatenates the given lists and removes repeated elements.

        The first occurrence of every element is kept, in its original
        position. Elements are compared with ==, so unhashable elements
        (e.g. lists) are accepted.
        """
        merged = []
        for chunk in args:
                merged.extend(chunk)
        distinct = []
        for candidate in merged:
                if candidate in distinct:
                        continue
                distinct.append(candidate)
        return distinct
133
134
135
def all_permutations(seq):
        """ Generates every permutation of the items of a list (or string).

        Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178
        A sequence with at most one item is yielded unchanged.
        """
        if len(seq) <= 1:
                yield seq
                return
        # Slices (rather than indexing) keep this working for strings and lists alike
        head = seq[0:1]
        tail = seq[1:]
        for perm in all_permutations(tail):
                # Insert the first item at every possible position of the sub-permutation
                for pos in range(len(perm) + 1):
                        yield perm[:pos] + head + perm[pos:]
146
def usage():
        """ Prints the list of supported command-line parameters to standard output """
        print """compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """
161
# Parse the command line. An option getopt does not recognise prints the help
# text and exits with status 2. Only getopt's own error is caught, so that
# unrelated failures are not misreported as usage errors.
try:
        opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
                "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
except getopt.GetoptError:
        usage()
        sys.exit(2)

opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False

for o, a in opts:
        if o in ("-h", "--help"):
                usage()
                sys.exit()
        # "--stats" is declared as a long option above, so it has to be handled
        # here as well; it is treated as an alias of --statistics.
        elif o in ("-s", "--statistics", "--stats"):
                opt_statistics = True
        elif o in ("-a", "--algorithmic"):
                opt_algorithmic = True
        elif o in ("-g", "--gtk"):
                opt_gtk = True
        elif o in ("-u", "--unicodedatatxt"):
                opt_unicodedatatxt = True
        elif o in ("-v", "--verbose"):
                opt_verbose = True
        elif o in ("-p", "--plane1"):
                opt_plane1 = True
        elif o in ("-n", "--numeric"):
                opt_numeric = True
        elif o in ("-e", "--gtk-expanded"):
                opt_gtkexpanded = True

# Statistics are the default action when no other output mode was requested
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
        opt_statistics = True
201
202 def download_hook(blocks_transferred, block_size, file_size):
203         """ A download hook to provide some feedback when downloading """
204         if blocks_transferred == 0:
205                 if file_size > 0:
206                         if opt_verbose:
207                                 print "Downloading", file_size, "bytes: ",
208                 else:   
209                         if opt_verbose:
210                                 print "Downloading: ",
211         sys.stdout.write('#')
212         sys.stdout.flush()
213
214
215 def download_file(url):
216         """ Downloads a file provided a URL. Returns the filename. """
217         """ Borks on failure """
218         localfilename = url.split('/')[-1]
219         if not isfile(localfilename) or getsize(localfilename) <= 0:
220                 if opt_verbose:
221                         print "Downloading ", url, "..."
222                 try: 
223                         urlretrieve(url, localfilename, download_hook)
224                 except IOError, (errno, strerror):
225                         print "I/O error(%s): %s" % (errno, strerror)
226                         sys.exit(-1)
227                 except:
228                         print "Unexpected error: ", sys.exc_info()[0]
229                         sys.exit(-1)
230                 print " done."
231         else:
232                 if opt_verbose:
233                         print "Using cached file for ", url
234         return localfilename
235
236 def process_gdkkeysymsh():
237         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
238         """ Fills up keysymdb with contents """
239         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
240         try: 
241                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
242         except IOError, (errno, strerror):
243                 print "I/O error(%s): %s" % (errno, strerror)
244                 sys.exit(-1)
245         except:
246                 print "Unexpected error: ", sys.exc_info()[0]
247                 sys.exit(-1)
248
249         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
250         linenum_gdkkeysymsh = 0
251         keysymdb = {}
252         for line in gdkkeysymsh.readlines():
253                 linenum_gdkkeysymsh += 1
254                 line = line.strip()
255                 if line == "" or not match('^#define GDK_', line):
256                         continue
257                 components = split('\s+', line)
258                 if len(components) < 3:
259                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
260                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
261                         print "Was expecting 3 items in the line"
262                         sys.exit(-1)
263                 if not match('^GDK_', components[1]):
264                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
265                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
266                         print "Was expecting a keysym starting with GDK_"
267                         sys.exit(-1)
268                 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
269                         unival = atoi(components[2][2:], 16)
270                         if unival == 0:
271                                 continue
272                         keysymdb[components[1][4:]] = unival
273                 else:
274                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
275                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
276                         print "Was expecting a hexadecimal number at the end of the line"
277                         sys.exit(-1)
278         gdkkeysymsh.close()
279
280         """ Patch up the keysymdb with some of our own stuff """
281
282         """ This is for a missing keysym from the currently upstream file """
283         keysymdb['dead_stroke'] = 0x338
284
285         """ This is for a missing keysym from the currently upstream file """
286         ###keysymdb['dead_belowring'] = 0x323
287         ###keysymdb['dead_belowmacron'] = 0x331
288         ###keysymdb['dead_belowcircumflex'] = 0x32d
289         ###keysymdb['dead_belowtilde'] = 0x330
290         ###keysymdb['dead_belowbreve'] = 0x32e
291         ###keysymdb['dead_belowdiaeresis'] = 0x324
292
293         """ This is^Wwas preferential treatment for Greek """
294         # keysymdb['dead_tilde'] = 0x342                
295         """ This is^was preferential treatment for Greek """
296         #keysymdb['combining_tilde'] = 0x342    
297
298         """ Fixing VoidSymbol """
299         keysymdb['VoidSymbol'] = 0xFFFF
300
301         return keysymdb
302
303 def process_keysymstxt():
304         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
305         """ This file keeps a record between keysyms <-> unicode chars """
306         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
307         try: 
308                 keysymstxt = open(filename_keysymstxt, 'r')
309         except IOError, (errno, strerror):
310                 print "I/O error(%s): %s" % (errno, strerror)
311                 sys.exit(-1)
312         except:
313                 print "Unexpected error: ", sys.exc_info()[0]
314                 sys.exit(-1)
315
316         """ Parse the keysyms.txt file and place content in  keysymdb """
317         linenum_keysymstxt = 0
318         keysymdb = {}
319         for line in keysymstxt.readlines():
320                 linenum_keysymstxt += 1
321                 line = line.strip()
322                 if line == "" or match('^#', line):
323                         continue
324                 components = split('\s+', line)
325                 if len(components) < 5:
326                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
327                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
328                         print "Was expecting 5 items in the line"
329                         sys.exit(-1)
330                 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
331                         unival = atoi(components[1][1:], 16)
332                 if unival == 0:
333                         continue
334                 keysymdb[components[4]] = unival
335         keysymstxt.close()
336
337         """ Patch up the keysymdb with some of our own stuff """
338         """ This is for a missing keysym from the currently upstream file """
339         ###keysymdb['dead_belowring'] = 0x323
340         ###keysymdb['dead_belowmacron'] = 0x331
341         ###keysymdb['dead_belowcircumflex'] = 0x32d
342         ###keysymdb['dead_belowtilde'] = 0x330
343         ###keysymdb['dead_belowbreve'] = 0x32e
344         ###keysymdb['dead_belowdiaeresis'] = 0x324
345
346         """ This is preferential treatment for Greek """
347         """ => we get more savings if used for Greek """
348         # keysymdb['dead_tilde'] = 0x342                
349         """ This is preferential treatment for Greek """
350         # keysymdb['combining_tilde'] = 0x342   
351
352         """ This is for a missing keysym from Markus Kuhn's db """
353         keysymdb['dead_stroke'] = 0x338
354         """ This is for a missing keysym from Markus Kuhn's db """
355         keysymdb['Oslash'] = 0x0d8              
356
357         """ This is for a missing (recently added) keysym """
358         keysymdb['dead_psili'] = 0x313          
359         """ This is for a missing (recently added) keysym """
360         keysymdb['dead_dasia'] = 0x314          
361
362         """ Allows to import Multi_key sequences """
363         keysymdb['Multi_key'] = 0xff20
364
365         return keysymdb
366
def keysymvalue(keysym, file = "n/a", linenum = 0):
        """ Returns the numeric value of a keysym, or -1 if it cannot be resolved.

        The empty string gives 0. Otherwise the keysym is looked up in
        keysymdatabase first; if it is not there, the notations 'U<hex>' and
        '0x<hex>' are converted to the number they spell.
        file and linenum are accepted for error reporting but are currently
        not used.
        """
        if keysym == "":
                return 0
        if keysym in keysymdatabase:
                return keysymdatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        # Unknown keysym: callers test for a negative value
        return -1
383
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
        """ Returns the value recorded for a keysym in keysymunicodedatabase.

        The empty string gives 0. Keysyms that are not in the database but are
        written as 'U<hex>' or '0x<hex>' are converted to the number they
        spell. Any other keysym is reported as UNKNOWN{...} and the script
        exits with status -1.
        file and linenum are accepted for error reporting but are currently
        not used.
        """
        if keysym == "":
                return 0
        if keysym in keysymunicodedatabase:
                return keysymunicodedatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        # A single parenthesised argument prints identically as a statement
        print('UNKNOWN{%(keysym)s}' % { "keysym": keysym })
        sys.exit(-1)
399
def rename_combining(seq):
        """ Returns a copy of seq in which a leading 'combining_' of any keysym becomes 'dead_' """
        # sub() returns the keysym untouched when the prefix is absent
        return [sub('^combining_', 'dead_', keysym) for keysym in seq]
408
409
410 keysymunicodedatabase = process_keysymstxt()
411 keysymdatabase = process_gdkkeysymsh()
412
413 """ Grab and open the compose file from upstream """
414 filename_compose = download_file(URL_COMPOSE)
415 try: 
416         composefile = open(filename_compose, 'r')
417 except IOError, (errno, strerror):
418         print "I/O error(%s): %s" % (errno, strerror)
419         sys.exit(-1)
420 except:
421         print "Unexpected error: ", sys.exc_info()[0]
422         sys.exit(-1)
423
424 """ Parse the compose file in  xorg_compose_sequences"""
425 xorg_compose_sequences = []
426 xorg_compose_sequences_algorithmic = []
427 linenum_compose = 0
428 for line in composefile.readlines():
429         linenum_compose += 1
430         line = line.strip()
431         if line is "" or match("^XCOMM", line) or match("^#", line):
432                 continue
433
434         line = line[:-1]
435         components = split(':', line)
436         if len(components) != 2:
437                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
438                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
439                 exit(-1)
440         (seq, val ) = split(':', line)
441         seq = seq.strip()
442         val = val.strip()
443         raw_sequence = findall('\w+', seq)
444         values = split('\s+', val)
445         unichar_temp = split('"', values[0])
446         unichar = unichar_temp[1]
447         if len(values) == 1:
448                 continue
449         codepointstr = values[1]
450         if values[1] == '#':
451                 # No codepoints that are >1 characters yet.
452                 continue
453         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
454                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
455         if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
456                 codepoint = atoi(codepointstr[1:], 16)
457         elif keysymunicodedatabase.has_key(codepointstr):
458                 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
459                         print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
460                         print raw_sequence, codepointstr
461                 codepoint = keysymunicodedatabase[codepointstr]
462         else:
463                 print
464                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
465                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
466                 exit(-1)
467         sequence = rename_combining(raw_sequence)
468         reject_this = False
469         for i in sequence:
470                 if keysymvalue(i) > 0xFFFF:
471                         reject_this = True
472                         if opt_plane1:
473                                 print sequence
474                         break
475                 if keysymvalue(i) < 0:
476                         reject_this = True
477                         break
478         if reject_this:
479                 continue
480         if "U0342" in sequence or \
481                 "U0313" in sequence or \
482                 "U0314" in sequence or \
483                 "0x0313" in sequence or \
484                 "0x0342" in sequence or \
485                 "0x0314" in sequence:
486                 continue
487         #for i in range(len(sequence)):
488         #       if sequence[i] == "0x0342":
489         #               sequence[i] = "dead_tilde"
490         if "Multi_key" not in sequence:
491                 """ Ignore for now >0xFFFF keysyms """
492                 if codepoint < 0xFFFF:
493                         original_sequence = copy(sequence)
494                         stats_sequence = copy(sequence)
495                         base = sequence.pop()
496                         basechar = keysymvalue(base, filename_compose, linenum_compose)
497                         
498                         if basechar < 0xFFFF:
499                                 counter = 1
500                                 unisequence = []
501                                 not_normalised = True
502                                 skipping_this = False
503                                 for i in range(0, len(sequence)):
504                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically 
505                                             because of lack of dead_perispomeni (i.e. conflict)
506                                         """
507                                         bc = basechar
508                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
509                                                 skipping_this = True
510                                                 break
511                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
512                                                 skipping_this = True
513                                                 break
514                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
515                                                 skipping_this = True
516                                                 break
517                                         if sequence[-1] == "dead_psili":
518                                                 sequence[i] = "dead_horn"
519                                         if sequence[-1] == "dead_dasia":
520                                                 sequence[-1] = "dead_ogonek"
521                                         """
522                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
523                                         
524                                 if skipping_this:
525                                         unisequence = []
526                                 for perm in all_permutations(unisequence):
527                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
528                                         # print counter, map(unichr, perm)
529                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
530                                         if len(normalized) == 1:
531                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
532                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
533                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
534                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
535                                                 stats_sequence_data.append(normalized)
536                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
537                                                 not_normalised = False
538                                                 break;
539                                         counter += 1
540                                 if not_normalised:
541                                         original_sequence.append(codepoint)
542                                         xorg_compose_sequences.append(original_sequence)
543                                         """ print xorg_compose_sequences[-1] """
544                                         
545                         else:
546                                 print "Error in base char !?!"
547                                 exit(-2)
548                 else:
549                         print "OVER", sequence
550                         exit(-1)
551         else:
552                 sequence.append(codepoint)
553                 xorg_compose_sequences.append(sequence)
554                 """ print xorg_compose_sequences[-1] """
555
def sequence_cmp(x, y):
        """ cmp-style comparison (-1, 0 or 1) of two compose sequences.

        Ordering criteria, in this order: keysymvalue() of the first element,
        the length of the sequence, then keysymvalue() of the elements at
        index 1 to 4. The comparison stops with 0 before index k (k = 2..4)
        when x has fewer than k + 2 elements.
        """
        left = keysymvalue(x[0])
        right = keysymvalue(y[0])
        if left > right:
                return 1
        if left < right:
                return -1
        if len(x) > len(y):
                return 1
        if len(x) < len(y):
                return -1
        for pos in range(1, 5):
                # Index 1 is always compared; later ones only if x is long enough
                if pos > 1 and len(x) < pos + 2:
                        return 0
                left = keysymvalue(x[pos])
                right = keysymvalue(y[pos])
                if left > right:
                        return 1
                if left < right:
                        return -1
        return 0
589
def sequence_unicode_cmp(x, y):
        """ cmp-style comparison (-1, 0 or 1) of two compose sequences.

        Ordering criteria, in this order: keysymunicodevalue() of the first
        element, the length of the sequence, then keysymunicodevalue() of the
        elements at index 1 to 4. The comparison stops with 0 before index k
        (k = 2..4) when x has fewer than k + 2 elements.
        """
        left = keysymunicodevalue(x[0])
        right = keysymunicodevalue(y[0])
        if left > right:
                return 1
        if left < right:
                return -1
        if len(x) > len(y):
                return 1
        if len(x) < len(y):
                return -1
        for pos in range(1, 5):
                # Index 1 is always compared; later ones only if x is long enough
                if pos > 1 and len(x) < pos + 2:
                        return 0
                left = keysymunicodevalue(x[pos])
                right = keysymunicodevalue(y[pos])
                if left > right:
                        return 1
                if left < right:
                        return -1
        return 0
623
def sequence_algorithmic_cmp(x, y):
        """ cmp-style comparison (-1, 0 or 1): shorter sequences sort first,
        sequences of equal length are compared element by element """
        size_x = len(x)
        size_y = len(y)
        if size_x != size_y:
                if size_x < size_y:
                        return -1
                return 1
        # Same length from here on, so zip() visits every position
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
636
637
xorg_compose_sequences.sort(sequence_cmp)

# Remove duplicates from the sorted list: of a run of entries that
# sequence_unicode_cmp() considers equal, only the last one is kept.
xorg_compose_sequences_uniqued = []
first_time = True
item = None
for next_item in xorg_compose_sequences:
        if first_time:
                first_time = False
                item = next_item
        # An entry is only emitted once its successor turns out to be different
        if sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# The final entry has no successor that could trigger its emission, so it has
# to be added here; without this the last sequence is lost from the table.
if not first_time:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
652
653 counter_multikey = 0
654 for item in xorg_compose_sequences:
655         if findall('Multi_key', "".join(item[:-1])) != []:
656                 counter_multikey += 1
657
658 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
659 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
660
661 firstitem = ""
662 num_first_keysyms = 0
663 zeroes = 0
664 num_entries = 0
665 num_algorithmic_greek = 0
666 for sequence in xorg_compose_sequences:
667         if keysymvalue(firstitem) != keysymvalue(sequence[0]): 
668                 firstitem = sequence[0]
669                 num_first_keysyms += 1
670         zeroes += 6 - len(sequence) + 1
671         num_entries += 1
672
673 for sequence in xorg_compose_sequences_algorithmic_uniqued:
674         ch = ord(sequence[-1:][0])
675         if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
676                 num_algorithmic_greek += 1
677                 
678
679 if opt_algorithmic:
680         for sequence in xorg_compose_sequences_algorithmic_uniqued:
681                 letter = "".join(sequence[-1:])
682                 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
683                 for elem in sequence[:-2]:
684                         print "<0x%(keysym)04X>," % { 'keysym': elem },
685                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
686                 print "], recomposed as", letter, "verified"
687
def num_of_keysyms(seq):
        """ Return the number of elements of seq that come before its last one. """
        # The last element is not counted as a keysym.
        keysym_count = len(seq) - 1
        return keysym_count
690
def convert_UnotationToHex(arg):
        """ Rewrite a 'UXXXX' string as '0xXXXX'; hand back anything else untouched.

        Only str values made of 'U' followed by exactly four uppercase hex
        digits are rewritten.
        """
        if not isinstance(arg, str):
                return arg
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg) is None:
                return arg
        return sub('^U', '0x', arg)
696
def addprefix_GDK(arg):
        """ Format arg as one table cell: 'GDK_<arg>, ', or '<arg>, ' if it starts with 0x. """
        # Values already in 0x form are emitted as they are.
        template = '%(arg)s, ' if match('^0x', arg) else 'GDK_%(arg)s, '
        return template % { 'arg': arg }
702
703 if opt_gtk:
704         first_keysym = ""
705         sequence = []
706         compose_table = []
707         ct_second_part = []
708         ct_sequence_width = 2
709         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
710         we_finished = False
711         counter = 0
712
713         sequence_iterator = iter(xorg_compose_sequences)
714         sequence = sequence_iterator.next()
715         while True:
716                 first_keysym = sequence[0]                                      # Set the first keysym
717                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
718                 while sequence[0] == first_keysym:
719                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
720                         try:
721                                 sequence = sequence_iterator.next()
722                         except StopIteration:
723                                 we_finished = True
724                                 break
725                 if we_finished:
726                         break
727                 counter += 1
728
729         ct_index = start_offset
730         for line_num in range(len(compose_table)):
731                 for i in range(WIDTHOFCOMPOSETABLE):
732                         occurences = compose_table[line_num][i+1]
733                         compose_table[line_num][i+1] = ct_index
734                         ct_index += occurences * (i+2)
735
736         for sequence in xorg_compose_sequences:
737                 ct_second_part.append(map(convert_UnotationToHex, sequence))
738
739         print headerfile_start
740         for i in compose_table:
741                 if opt_gtkexpanded:
742                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
743                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
744                 elif not match('^0x', i[0]):
745                         print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
746                 else:
747                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
748         for i in ct_second_part:
749                 if opt_numeric:
750                         for ks in i[1:][:-1]:
751                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
752                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
753                         """
754                         for ks in i[:-1]:
755                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
756                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
757                         """
758                 elif opt_gtkexpanded:
759                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
760                 else:
761                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
762         print headerfile_end 
763
def redecompose(codepoint):
        """ Recursively decompose codepoint using unicodedatabase.

        Returns [codepoint] when the stored decomposition starts with '' or '0';
        otherwise returns a list holding one redecompose() result (itself a
        list) per member of the decomposition, so the result is nested.
        """
        decomposition = unicodedatabase[codepoint][1]
        leading_field = decomposition[0]
        if leading_field in ('', '0'):
                return [codepoint]
        # A leading <tag> field is not a code point, so it is left out.
        if match(r'<\w+>', leading_field):
                members = decomposition[1:]
        else:
                members = decomposition
        return map(redecompose, map(stringtohex, members))
773
774 def process_unicodedata_file(verbose = False):
775         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
776         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
777         try: 
778                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
779         except IOError, (errno, strerror):
780                 print "I/O error(%s): %s" % (errno, strerror)
781                 sys.exit(-1)
782         except:
783                 print "Unexpected error: ", sys.exc_info()[0]
784                 sys.exit(-1)
785         for line in unicodedatatxt.readlines():
786                 if line[0] == "" or line[0] == '#':
787                         continue
788                 line = line[:-1]
789                 uniproperties = split(';', line)
790                 codepoint = stringtohex(uniproperties[0])
791                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
792                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
793                         continue
794                 name = uniproperties[1]
795                 category = uniproperties[2]
796                 combiningclass = uniproperties[3]
797                 decomposition = uniproperties[5]
798                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
799         
800         counter_combinations = 0
801         counter_combinations_greek = 0
802         counter_entries = 0
803         counter_entries_greek = 0
804
805         for item in unicodedatabase.keys():
806                 (name, decomposition, combiningclass) = unicodedatabase[item]
807                 if decomposition[0] == '':
808                         continue
809                         print name, "is empty"
810                 elif match('<\w+>', decomposition[0]):
811                         continue
812                         print name, "has weird", decomposition[0]
813                 else:
814                         sequence = map(stringtohex, decomposition)
815                         chrsequence = map(unichr, sequence)
816                         normalized = normalize('NFC', "".join(chrsequence))
817                         
818                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
819                         decomposedsequence = []
820                         for subseq in map(redecompose, sequence):
821                                 for seqitem in subseq:
822                                         if isinstance(seqitem, list):
823                                                 for i in seqitem:
824                                                         if isinstance(i, list):
825                                                                 for j in i:
826                                                                         decomposedsequence.append(j)
827                                                         else:
828                                                                 decomposedsequence.append(i)
829                                         else:
830                                                 decomposedsequence.append(seqitem)
831                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
832                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
833                                 counter_entries += 1
834                                 counter_combinations += factorial(len(decomposedsequence)-1)
835                                 ch = item
836                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
837                                         counter_entries_greek += 1
838                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
839                                 if verbose:
840                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
841                                         print "[",
842                                         for elem in decomposedsequence:
843                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
844                                         print "], recomposed as", recomposedchar,
845                                         if unichr(item) == recomposedchar:
846                                                 print "verified"
847         
848         if verbose == False:
849                 print "Unicode statistics from UnicodeData.txt"
850                 print "Number of entries that can be algorithmically produced     :", counter_entries
851                 print "  of which are for Greek                                   :", counter_entries_greek
852                 print "Number of compose sequence combinations requiring          :", counter_combinations
853                 print "  of which are for Greek                                   :", counter_combinations_greek
854                 print "Note: We do not include partial compositions, "
855                 print "thus the slight discrepancy in the figures"
856                 print
857
# Verbose mode: print the per-character listing instead of the summary counts.
if opt_unicodedatatxt:
        process_unicodedata_file(verbose = True)
860
861 if opt_statistics:
862         print
863         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
864         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
865         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
866         print "    of which have Multi_key                                :", counter_multikey
867         print 
868         print "Algorithmic (stats for Xorg Compose file)"
869         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
870         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
871         print "  of which are for Greek                                   :", num_algorithmic_greek
872         print 
873         process_unicodedata_file()
874         print "Not algorithmic (stats from Xorg Compose file)"
875         print "Number of sequences                                        :", len(xorg_compose_sequences) 
876         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
877         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
878         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
879         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
880         print "Number of different first items                            :", num_first_keysyms
881         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
882         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
883         print 
884         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
885         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
886         print
887         print "Existing (old) implementation in GTK+"
888         print "Number of sequences in old gtkimcontextsimple.c            :", 691
889         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"