]> Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py
Closes #557420 – Some compose sequences don't work anymore
[~andy/gtk] / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# We grab files off the web, left and right.
# Upstream X.org compose sequence definitions (fetched via download_file).
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
# Keysym <-> Unicode mapping maintained by Markus Kuhn (read by process_keysymstxt).
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
# GDK keysym definitions (read by process_gdkkeysymsh).
URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
# NOTE(review): not referenced in the part of the file visible here; presumably
# used by the --unicodedatatxt mode -- confirm.
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
# Optional local file whose lines are appended to the upstream Compose file.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
# NOTE(review): presumably the size is in bytes (the generated table is
# declared as guint16) -- confirm.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Lookup tables; the two keysym tables are replaced further down by the
# results of process_gdkkeysymsh() and process_keysymstxt().
keysymdatabase = {}
keysymunicodedatabase = {}
unicodedatabase = {}
40
# Text templates for a generated C header: headerfile_start holds the licence
# notice, provenance comments, include guard and the opening of the
# gtk_compose_seqs_compact[] array; headerfile_end closes the array and the
# include guard.  Both are runtime strings and must be kept exactly as they are.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * 
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Closing part of the generated header (see headerfile_start above).
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
115
def stringtohex(str):
        """Return the integer value of the hexadecimal string `str`.

        An optional "0x" prefix is accepted.  Raises ValueError when the
        text is not valid hexadecimal.
        """
        # int(..., 16) replaces the long-deprecated string.atoi(); the
        # parameter keeps its historical name for keyword callers.
        return int(str, 16)
117
def factorial(n):
        """Return n! computed recursively; any n <= 1 yields 1."""
        if n <= 1:
                return 1
        smaller = factorial(n - 1)
        return n * smaller
123
def uniq(*args):
        """Concatenate the given lists and drop repeated elements.

        The first occurrence of every element is kept and the original
        order is preserved.  Elements are compared with ==, so unhashable
        items (such as lists) are supported.
        """
        merged = []
        for chunk in args:
                merged += chunk
        seen_in_order = []
        for candidate in merged:
                if candidate in seen_in_order:
                        continue
                seen_in_order.append(candidate)
        return seen_in_order
134
135
136
def all_permutations(seq):
        """Yield every permutation of `seq` (a list or a string).

        Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178
        Each permutation is built by inserting the first item of `seq` at
        every position of every permutation of the remaining items.
        """
        if len(seq) <= 1:
                yield seq
                return
        # Slicing (rather than indexing) works for both strings and lists.
        head = seq[0:1]
        for partial in all_permutations(seq[1:]):
                for position in range(len(partial) + 1):
                        yield partial[:position] + head + partial[position:]
147
def usage():
        """Print the list of supported command-line parameters to stdout."""
        help_text = """compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """
        print(help_text)
162
# Parse the command line.  Only getopt's own error is treated as a usage
# problem; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
try:
        opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
                "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
except getopt.GetoptError:
        usage()
        sys.exit(2)

# One flag per supported option; all default to off.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False

for o, a in opts:
        if o in ("-h", "--help"):
                usage()
                sys.exit()
        # "--stats" is accepted by getopt above, so honour it as an alias
        # instead of silently ignoring it.
        if o in ("-s", "--statistics", "--stats"):
                opt_statistics = True
        if o in ("-a", "--algorithmic"):
                opt_algorithmic = True
        if o in ("-g", "--gtk"):
                opt_gtk = True
        if o in ("-u", "--unicodedatatxt"):
                opt_unicodedatatxt = True
        if o in ("-v", "--verbose"):
                opt_verbose = True
        if o in ("-p", "--plane1"):
                opt_plane1 = True
        if o in ("-n", "--numeric"):
                opt_numeric = True
        if o in ("-e", "--gtk-expanded"):
                opt_gtkexpanded = True

# With no output mode selected, fall back to showing statistics.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
        opt_statistics = True
202
203 def download_hook(blocks_transferred, block_size, file_size):
204         """ A download hook to provide some feedback when downloading """
205         if blocks_transferred == 0:
206                 if file_size > 0:
207                         if opt_verbose:
208                                 print "Downloading", file_size, "bytes: ",
209                 else:   
210                         if opt_verbose:
211                                 print "Downloading: ",
212         sys.stdout.write('#')
213         sys.stdout.flush()
214
215
216 def download_file(url):
217         """ Downloads a file provided a URL. Returns the filename. """
218         """ Borks on failure """
219         localfilename = url.split('/')[-1]
220         if not isfile(localfilename) or getsize(localfilename) <= 0:
221                 if opt_verbose:
222                         print "Downloading ", url, "..."
223                 try: 
224                         urlretrieve(url, localfilename, download_hook)
225                 except IOError, (errno, strerror):
226                         print "I/O error(%s): %s" % (errno, strerror)
227                         sys.exit(-1)
228                 except:
229                         print "Unexpected error: ", sys.exc_info()[0]
230                         sys.exit(-1)
231                 print " done."
232         else:
233                 if opt_verbose:
234                         print "Using cached file for ", url
235         return localfilename
236
def process_gdkkeysymsh():
        """Build the keysym name -> value table from GTK+'s gdk/gdkkeysyms.h.

        The header is obtained through download_file(URL_GDKKEYSYMSH).  Every
        line of the form "#define GDK_<name> 0x<hex>" adds
        keysymdb[<name>] = <value>; entries whose value is 0 are skipped and
        all other lines are ignored.  A malformed GDK_ line, or a file that
        cannot be opened, prints a message and exits with status -1.

        Returns the resulting dict.
        """
        filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
        try:
                gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
        except IOError:
                err = sys.exc_info()[1]
                print("I/O error(%s): %s" % (err.errno, err.strerror))
                sys.exit(-1)
        except Exception:
                print("Unexpected error:  %s" % (sys.exc_info()[0],))
                sys.exit(-1)

        def reject(linenum, line, expectation):
                """Report a malformed line and terminate the script."""
                print("Invalid line %(linenum)d in %(filename)s: %(line)s"
                      % {'linenum': linenum, 'filename': filename_gdkkeysymsh, 'line': line})
                print(expectation)
                sys.exit(-1)

        # Parse the gdkkeysyms.h file and place contents in keysymdb
        linenum_gdkkeysymsh = 0
        keysymdb = {}
        try:
                for line in gdkkeysymsh:
                        linenum_gdkkeysymsh += 1
                        line = line.strip()
                        if line == "" or not match('^#define GDK_', line):
                                continue
                        components = split(r'\s+', line)
                        if len(components) < 3:
                                reject(linenum_gdkkeysymsh, line, "Was expecting 3 items in the line")
                        if not match('^GDK_', components[1]):
                                reject(linenum_gdkkeysymsh, line, "Was expecting a keysym starting with GDK_")
                        hexdigits = components[2][2:]
                        if components[2][:2] != '0x' or not match('[0-9a-fA-F]+$', hexdigits):
                                reject(linenum_gdkkeysymsh, line, "Was expecting a hexadecimal number at the end of the line")
                        unival = int(hexdigits, 16)
                        if unival == 0:
                                continue
                        # Strip the "GDK_" prefix from the stored name.
                        keysymdb[components[1][4:]] = unival
        finally:
                # Close the header even when a malformed line aborts the script.
                gdkkeysymsh.close()

        # Patch up the keysymdb with some of our own stuff

        # This is for a missing keysym from the currently upstream file
        keysymdb['dead_stroke'] = 0x338

        # This is for a missing keysym from the currently upstream file
        ###keysymdb['dead_belowring'] = 0x323
        ###keysymdb['dead_belowmacron'] = 0x331
        ###keysymdb['dead_belowcircumflex'] = 0x32d
        ###keysymdb['dead_belowtilde'] = 0x330
        ###keysymdb['dead_belowbreve'] = 0x32e
        ###keysymdb['dead_belowdiaeresis'] = 0x324

        # This was preferential treatment for Greek
        # keysymdb['dead_tilde'] = 0x342
        # This was preferential treatment for Greek
        #keysymdb['combining_tilde'] = 0x342

        # Fixing VoidSymbol
        keysymdb['VoidSymbol'] = 0xFFFF

        return keysymdb
303
def process_keysymstxt():
        """Build the keysym name -> Unicode value table from keysyms.txt.

        The file (maintained by Markus Kuhn) is obtained through
        download_file(URL_KEYSYMSTXT).  Blank lines and lines starting with
        '#' are ignored.  For every other line the second column must be
        "U<hex>" and the fifth column is the keysym name; entries whose
        value is 0 are skipped.  A line with fewer than 5 columns, or a file
        that cannot be opened, prints a message and exits with status -1.

        Returns the resulting dict, extended with a few hand-made entries.
        """
        filename_keysymstxt = download_file(URL_KEYSYMSTXT)
        try:
                keysymstxt = open(filename_keysymstxt, 'r')
        except IOError:
                err = sys.exc_info()[1]
                print("I/O error(%s): %s" % (err.errno, err.strerror))
                sys.exit(-1)
        except Exception:
                print("Unexpected error:  %s" % (sys.exc_info()[0],))
                sys.exit(-1)

        # Parse the keysyms.txt file and place content in keysymdb
        linenum_keysymstxt = 0
        keysymdb = {}
        try:
                for line in keysymstxt:
                        linenum_keysymstxt += 1
                        line = line.strip()
                        if line == "" or match('^#', line):
                                continue
                        components = split(r'\s+', line)
                        if len(components) < 5:
                                print("Invalid line %(linenum)d in %(filename)s: %(line)s"
                                      % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line})
                                print("Was expecting 5 items in the line")
                                sys.exit(-1)
                        unicode_column = components[1]
                        if not (unicode_column[0] == 'U' and match('[0-9a-fA-F]+$', unicode_column[1:])):
                                # No usable Unicode value on this line.  Skip it rather
                                # than reuse the value left over from the previous line
                                # (or hit an unbound variable on the first line).
                                continue
                        unival = int(unicode_column[1:], 16)
                        if unival == 0:
                                continue
                        keysymdb[components[4]] = unival
        finally:
                # Close the file even when a malformed line aborts the script.
                keysymstxt.close()

        # Patch up the keysymdb with some of our own stuff
        # This is for a missing keysym from the currently upstream file
        ###keysymdb['dead_belowring'] = 0x323
        ###keysymdb['dead_belowmacron'] = 0x331
        ###keysymdb['dead_belowcircumflex'] = 0x32d
        ###keysymdb['dead_belowtilde'] = 0x330
        ###keysymdb['dead_belowbreve'] = 0x32e
        ###keysymdb['dead_belowdiaeresis'] = 0x324

        # This is preferential treatment for Greek
        # => we get more savings if used for Greek
        # keysymdb['dead_tilde'] = 0x342
        # This is preferential treatment for Greek
        # keysymdb['combining_tilde'] = 0x342

        # These are for keysyms missing from Markus Kuhn's db
        keysymdb['dead_stroke'] = 0x338
        keysymdb['Oslash'] = 0x0d8

        # These are for missing (recently added) keysyms
        keysymdb['dead_psili'] = 0x313
        keysymdb['dead_dasia'] = 0x314

        # Allows to import Multi_key sequences
        keysymdb['Multi_key'] = 0xff20

        return keysymdb
367
def keysymvalue(keysym, file = "n/a", linenum = 0):
        """Return the numeric keysym value for the keysym name `keysym`.

        Lookup order: the empty string gives 0; a name present in
        keysymdatabase gives its stored value; "U<hex>" and "0x<hex>"
        notations are converted directly.  Anything else yields -1, so
        callers can reject the sequence instead of aborting.

        `file` and `linenum` are accepted for error reporting but are
        currently unused.
        """
        if keysym == "":
                return 0
        # `in` replaces the deprecated dict.has_key(); int(x, 16) replaces
        # the deprecated string.atoi().
        if keysym in keysymdatabase:
                return keysymdatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        return -1
383                 #sys.exit(-1)
384
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
        """Return the Unicode value for the keysym name `keysym`.

        Lookup order: the empty string gives 0; a name present in
        keysymunicodedatabase gives its stored value; "U<hex>" and "0x<hex>"
        notations are converted directly.  An unknown keysym is fatal: it is
        reported as UNKNOWN{...} and the script exits with status -1.

        `file` and `linenum` are accepted for error reporting but are
        currently unused.
        """
        if keysym == "":
                return 0
        # `in` replaces the deprecated dict.has_key(); int(x, 16) replaces
        # the deprecated string.atoi().
        if keysym in keysymunicodedatabase:
                return keysymunicodedatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        print('UNKNOWN{%(keysym)s}' % { "keysym": keysym })
        sys.exit(-1)
400
def rename_combining(seq):
        """Return a copy of `seq` where a leading 'combining_' in any keysym
        name is replaced by 'dead_'; other names are passed through as-is.
        """
        def as_dead_key(name):
                if findall('^combining_', name):
                        return sub('^combining_', 'dead_', name)
                return name

        return [as_dead_key(ks) for ks in seq]
409
410
411 keysymunicodedatabase = process_keysymstxt()
412 keysymdatabase = process_gdkkeysymsh()
413
414 """ Grab and open the compose file from upstream """
415 filename_compose = download_file(URL_COMPOSE)
416 try: 
417         composefile = open(filename_compose, 'r')
418 except IOError, (errno, strerror):
419         print "I/O error(%s): %s" % (errno, strerror)
420         sys.exit(-1)
421 except:
422         print "Unexpected error: ", sys.exc_info()[0]
423         sys.exit(-1)
424
425 """ Look if there is a lookaside (supplementary) compose file in the current
426     directory, and if so, open, then merge with upstream Compose file.
427 """
428 try:
429         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
430 except IOError, (errno, strerror):
431         if not opt_quiet:
432                 print "I/O error(%s): %s" % (errno, strerror)
433                 print "Did not find lookaside compose file. Continuing..."
434 except:
435         print "Unexpected error: ", sys.exc_info()[0]
436         sys.exit(-1)
437
438 xorg_compose_sequences_raw = []
439 for seq in composefile.readlines():
440         xorg_compose_sequences_raw.append(seq)
441 for seq in composefile_lookaside.readlines():
442         xorg_compose_sequences_raw.append(seq)
443
444 """ Parse the compose file in  xorg_compose_sequences"""
445 xorg_compose_sequences = []
446 xorg_compose_sequences_algorithmic = []
447 linenum_compose = 0
448 for line in xorg_compose_sequences_raw:
449         linenum_compose += 1
450         line = line.strip()
451         if line is "" or match("^XCOMM", line) or match("^#", line):
452                 continue
453
454         #line = line[:-1]
455         components = split(':', line)
456         if len(components) != 2:
457                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
458                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
459                 exit(-1)
460         (seq, val ) = split(':', line)
461         seq = seq.strip()
462         val = val.strip()
463         raw_sequence = findall('\w+', seq)
464         values = split('\s+', val)
465         unichar_temp = split('"', values[0])
466         unichar = unichar_temp[1]
467         if len(values) == 1:
468                 continue
469         codepointstr = values[1]
470         if values[1] == '#':
471                 # No codepoints that are >1 characters yet.
472                 continue
473         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
474                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
475         if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
476                 codepoint = atoi(codepointstr[1:], 16)
477         elif keysymunicodedatabase.has_key(codepointstr):
478                 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
479                         print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
480                         print raw_sequence, codepointstr
481                 codepoint = keysymunicodedatabase[codepointstr]
482         else:
483                 print
484                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
485                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
486                 exit(-1)
487         sequence = rename_combining(raw_sequence)
488         reject_this = False
489         for i in sequence:
490                 if keysymvalue(i) > 0xFFFF:
491                         reject_this = True
492                         if opt_plane1:
493                                 print sequence
494                         break
495                 if keysymvalue(i) < 0:
496                         reject_this = True
497                         break
498         if reject_this:
499                 continue
500         if "U0342" in sequence or \
501                 "U0313" in sequence or \
502                 "U0314" in sequence or \
503                 "0x0313" in sequence or \
504                 "0x0342" in sequence or \
505                 "0x0314" in sequence:
506                 continue
507         if "dead_belowring" in sequence or\
508                 "dead_belowcomma" in sequence or\
509                 "dead_belowmacron" in sequence or\
510                 "dead_belowtilde" in sequence or\
511                 "dead_belowbreve" in sequence or\
512                 "dead_belowdiaeresis" in sequence or\
513                 "dead_belowcircumflex" in sequence:
514                 continue
515         #for i in range(len(sequence)):
516         #       if sequence[i] == "0x0342":
517         #               sequence[i] = "dead_tilde"
518         if "Multi_key" not in sequence:
519                 """ Ignore for now >0xFFFF keysyms """
520                 if codepoint < 0xFFFF:
521                         original_sequence = copy(sequence)
522                         stats_sequence = copy(sequence)
523                         base = sequence.pop()
524                         basechar = keysymvalue(base, filename_compose, linenum_compose)
525                         
526                         if basechar < 0xFFFF:
527                                 counter = 1
528                                 unisequence = []
529                                 not_normalised = True
530                                 skipping_this = False
531                                 for i in range(0, len(sequence)):
532                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically 
533                                             because of lack of dead_perispomeni (i.e. conflict)
534                                         """
535                                         bc = basechar
536                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
537                                                 skipping_this = True
538                                                 break
539                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
540                                                 skipping_this = True
541                                                 break
542                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
543                                                 skipping_this = True
544                                                 break
545                                         if sequence[-1] == "dead_psili":
546                                                 sequence[i] = "dead_horn"
547                                         if sequence[-1] == "dead_dasia":
548                                                 sequence[-1] = "dead_ogonek"
549                                         """
550                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
551                                         
552                                 if skipping_this:
553                                         unisequence = []
554                                 for perm in all_permutations(unisequence):
555                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
556                                         # print counter, map(unichr, perm)
557                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
558                                         if len(normalized) == 1:
559                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
560                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
561                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
562                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
563                                                 stats_sequence_data.append(normalized)
564                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
565                                                 not_normalised = False
566                                                 break;
567                                         counter += 1
568                                 if not_normalised:
569                                         original_sequence.append(codepoint)
570                                         xorg_compose_sequences.append(original_sequence)
571                                         """ print xorg_compose_sequences[-1] """
572                                         
573                         else:
574                                 print "Error in base char !?!"
575                                 exit(-2)
576                 else:
577                         print "OVER", sequence
578                         exit(-1)
579         else:
580                 sequence.append(codepoint)
581                 xorg_compose_sequences.append(sequence)
582                 """ print xorg_compose_sequences[-1] """
583
def sequence_cmp(x, y):
        """cmp-style ordering (-1, 0 or 1) of two compose sequences.

        Sequences are ordered by the keysym value of their first element,
        then by length, then by the keysym values at indexes 1 to 4.  The
        last element of a sequence is never compared, and nothing past
        index 4 is looked at.
        """
        def order(a, b):
                # Classic three-way comparison: 1, -1 or 0.
                return (a > b) - (a < b)

        verdict = order(keysymvalue(x[0]), keysymvalue(y[0]))
        if verdict:
                return verdict
        if len(x) != len(y):
                return order(len(x), len(y))
        for idx in (1, 2, 3, 4):
                # From index 2 on, stop once only the trailing element is left.
                if idx > 1 and len(x) < idx + 2:
                        return 0
                verdict = order(keysymvalue(x[idx]), keysymvalue(y[idx]))
                if verdict:
                        return verdict
        return 0
617
def sequence_unicode_cmp(x, y):
        """cmp-style ordering (-1, 0 or 1) of two compose sequences by the
        Unicode values of their keysyms.

        Sequences are ordered by the first element, then by length, then by
        the elements at indexes 1 to 4.  The last element of a sequence is
        never compared, and nothing past index 4 is looked at.
        """
        def sign(a, b):
                # Three-way comparison: 1, -1 or 0.
                return (a > b) - (a < b)

        outcome = sign(keysymunicodevalue(x[0]), keysymunicodevalue(y[0]))
        if outcome == 0:
                outcome = sign(len(x), len(y))
        position = 1
        while outcome == 0 and position <= 4:
                # From index 2 on, stop once only the trailing element is left.
                if position >= 2 and len(x) < position + 2:
                        break
                outcome = sign(keysymunicodevalue(x[position]), keysymunicodevalue(y[position]))
                position += 1
        return outcome
651
def sequence_algorithmic_cmp(x, y):
        """cmp-style ordering (-1, 0 or 1) for algorithmic sequences.

        Shorter sequences sort first; sequences of equal length are compared
        element by element, and the first difference decides.
        """
        size_x = len(x)
        size_y = len(y)
        if size_x < size_y:
                return -1
        if size_x > size_y:
                return 1
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
664
665
# The uniquing pass below only compares neighbours, so it relies on this sort
# placing sequences that sequence_unicode_cmp() treats as equal next to each other.
xorg_compose_sequences.sort(sequence_cmp)

# Collapse every run of consecutive sequences for which sequence_unicode_cmp()
# returns 0 into a single entry, keeping the last sequence of the run.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
        if item is not None and sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# An entry is only emitted once its successor is known to differ, so the final
# run has no successor to trigger it: flush it here, otherwise the last
# sequence of the sorted list would silently disappear from the table.
if item is not None:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
680
# Count the sequences that mention Multi_key; the last element of each
# sequence is left out of the search.
counter_multikey = 0
for item in xorg_compose_sequences:
        if 'Multi_key' in "".join(item[:-1]):
                counter_multikey += 1
685
# Sort the algorithmic sequences, then pass them through uniq().
xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Counters gathered over the table (non-algorithmic) sequences.
num_algorithmic_greek = 0
num_entries = 0
zeroes = 0
num_first_keysyms = 0
firstitem = ""
for sequence in xorg_compose_sequences:
        num_entries += 1
        # NOTE(review): presumably the unused slots this sequence would leave in a
        # fixed-width flat row -- confirm the 6 and the +1 against the table layout.
        zeroes += 6 - len(sequence) + 1
        # A change in the value of the first keysym starts a new group.
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                num_first_keysyms += 1
                firstitem = sequence[0]

# Algorithmic sequences whose last element lies in 0x370-0x3ff or 0x1f00-0x1fff
# are counted as Greek.
for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1])
        if 0x370 <= ch <= 0x3ff or 0x1f00 <= ch <= 0x1fff:
                num_algorithmic_greek += 1
705                 
706
707 if opt_algorithmic:
708         for sequence in xorg_compose_sequences_algorithmic_uniqued:
709                 letter = "".join(sequence[-1:])
710                 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
711                 for elem in sequence[:-2]:
712                         print "<0x%(keysym)04X>," % { 'keysym': elem },
713                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
714                 print "], recomposed as", letter, "verified"
715
def num_of_keysyms(seq):
        """ Return the length of seq minus one, i.e. every element except the last one is counted. """
        keysym_count = len(seq) - 1
        return keysym_count
718
def convert_UnotationToHex(arg):
        """ Rewrite a 'UXXXX' string (U plus exactly four uppercase hex digits) as '0xXXXX'.

        Anything else, including non-string arguments, is returned unchanged.
        """
        if not isinstance(arg, str):
                return arg
        if not match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
                return arg
        # The pattern guarantees a leading 'U'; swap it for the hex prefix.
        return '0x' + arg[1:]
724
def addprefix_GDK(arg):
        """ Format one table cell: values starting with '0x' are emitted as they are, everything else gets a GDK_ prefix. A trailing ', ' is appended in both cases. """
        is_hex_literal = match('^0x', arg) is not None
        if not is_hex_literal:
                return 'GDK_%(arg)s, ' % { 'arg': arg }
        return '%(arg)s, ' % { 'arg': arg }
730
731 if opt_gtk:
732         first_keysym = ""
733         sequence = []
734         compose_table = []
735         ct_second_part = []
736         ct_sequence_width = 2
737         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
738         we_finished = False
739         counter = 0
740
741         sequence_iterator = iter(xorg_compose_sequences)
742         sequence = sequence_iterator.next()
743         while True:
744                 first_keysym = sequence[0]                                      # Set the first keysym
745                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
746                 while sequence[0] == first_keysym:
747                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
748                         try:
749                                 sequence = sequence_iterator.next()
750                         except StopIteration:
751                                 we_finished = True
752                                 break
753                 if we_finished:
754                         break
755                 counter += 1
756
757         ct_index = start_offset
758         for line_num in range(len(compose_table)):
759                 for i in range(WIDTHOFCOMPOSETABLE):
760                         occurences = compose_table[line_num][i+1]
761                         compose_table[line_num][i+1] = ct_index
762                         ct_index += occurences * (i+2)
763
764         for sequence in xorg_compose_sequences:
765                 ct_second_part.append(map(convert_UnotationToHex, sequence))
766
767         print headerfile_start
768         for i in compose_table:
769                 if opt_gtkexpanded:
770                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
771                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
772                 elif not match('^0x', i[0]):
773                         print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
774                 else:
775                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
776         for i in ct_second_part:
777                 if opt_numeric:
778                         for ks in i[1:][:-1]:
779                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
780                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
781                         """
782                         for ks in i[:-1]:
783                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
784                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
785                         """
786                 elif opt_gtkexpanded:
787                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
788                 else:
789                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
790         print headerfile_end 
791
def redecompose(codepoint):
        """ Recursively expand the decomposition stored in unicodedatabase for codepoint.

        Returns [codepoint] when the first decomposition field is '' or '0'.
        Otherwise returns a list with the redecompose() result (itself a list)
        of every decomposition element; a leading <tag> field is dropped first.
        The result is therefore nested and has to be flattened by the caller.
        """
        (name, decomposition, combiningclass) = unicodedatabase[codepoint]
        first_field = decomposition[0]
        if first_field in ('', '0'):
                return [codepoint]
        if match(r'<\w+>', first_field):
                # Tagged decomposition: the tag itself is not a code point.
                members = decomposition[1:]
        else:
                members = decomposition
        return map(redecompose, map(stringtohex, members))
801
802 def process_unicodedata_file(verbose = False):
803         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
804         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
805         try: 
806                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
807         except IOError, (errno, strerror):
808                 print "I/O error(%s): %s" % (errno, strerror)
809                 sys.exit(-1)
810         except:
811                 print "Unexpected error: ", sys.exc_info()[0]
812                 sys.exit(-1)
813         for line in unicodedatatxt.readlines():
814                 if line[0] == "" or line[0] == '#':
815                         continue
816                 line = line[:-1]
817                 uniproperties = split(';', line)
818                 codepoint = stringtohex(uniproperties[0])
819                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
820                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
821                         continue
822                 name = uniproperties[1]
823                 category = uniproperties[2]
824                 combiningclass = uniproperties[3]
825                 decomposition = uniproperties[5]
826                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
827         
828         counter_combinations = 0
829         counter_combinations_greek = 0
830         counter_entries = 0
831         counter_entries_greek = 0
832
833         for item in unicodedatabase.keys():
834                 (name, decomposition, combiningclass) = unicodedatabase[item]
835                 if decomposition[0] == '':
836                         continue
837                         print name, "is empty"
838                 elif match('<\w+>', decomposition[0]):
839                         continue
840                         print name, "has weird", decomposition[0]
841                 else:
842                         sequence = map(stringtohex, decomposition)
843                         chrsequence = map(unichr, sequence)
844                         normalized = normalize('NFC', "".join(chrsequence))
845                         
846                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
847                         decomposedsequence = []
848                         for subseq in map(redecompose, sequence):
849                                 for seqitem in subseq:
850                                         if isinstance(seqitem, list):
851                                                 for i in seqitem:
852                                                         if isinstance(i, list):
853                                                                 for j in i:
854                                                                         decomposedsequence.append(j)
855                                                         else:
856                                                                 decomposedsequence.append(i)
857                                         else:
858                                                 decomposedsequence.append(seqitem)
859                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
860                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
861                                 counter_entries += 1
862                                 counter_combinations += factorial(len(decomposedsequence)-1)
863                                 ch = item
864                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
865                                         counter_entries_greek += 1
866                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
867                                 if verbose:
868                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
869                                         print "[",
870                                         for elem in decomposedsequence:
871                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
872                                         print "], recomposed as", recomposedchar,
873                                         if unichr(item) == recomposedchar:
874                                                 print "verified"
875         
876         if verbose == False:
877                 print "Unicode statistics from UnicodeData.txt"
878                 print "Number of entries that can be algorithmically produced     :", counter_entries
879                 print "  of which are for Greek                                   :", counter_entries_greek
880                 print "Number of compose sequence combinations requiring          :", counter_combinations
881                 print "  of which are for Greek                                   :", counter_combinations_greek
882                 print "Note: We do not include partial compositions, "
883                 print "thus the slight discrepancy in the figures"
884                 print
885
# Verbose run: list every composable character instead of the summary statistics.
if opt_unicodedatatxt:
        process_unicodedata_file(verbose = True)
888
889 if opt_statistics:
890         print
891         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
892         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
893         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
894         print "    of which have Multi_key                                :", counter_multikey
895         print 
896         print "Algorithmic (stats for Xorg Compose file)"
897         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
898         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
899         print "  of which are for Greek                                   :", num_algorithmic_greek
900         print 
901         process_unicodedata_file()
902         print "Not algorithmic (stats from Xorg Compose file)"
903         print "Number of sequences                                        :", len(xorg_compose_sequences) 
904         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
905         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
906         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
907         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
908         print "Number of different first items                            :", num_first_keysyms
909         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
910         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
911         print 
912         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
913         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
914         print
915         print "Existing (old) implementation in GTK+"
916         print "Number of sequences in old gtkimcontextsimple.c            :", 691
917         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"