]> Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py
Add some keysyms missing from keysyms.txt
[~andy/gtk] / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# We grab files off the web, left and right.
# Each URL below is handed to download_file(), which caches the download in
# the current directory under the last path component of the URL.
# Upstream X.org compose table (parsed by the main loop of this script).
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
# Keysym name <-> Unicode code point table (read by process_keysymstxt()).
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
# GDK keysym name -> value definitions (read by process_gdkkeysymsh()).
URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
# Unicode character database; presumably used by the --unicodedatatxt mode
# (not referenced in this part of the script -- confirm further down).
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
# Optional local file whose lines are appended to the upstream compose table.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted, 
# we might produce some tables with size 2 and some with size 4.
# NOTE(review): "size" presumably means bytes per table entry (the generated
# table is declared as guint16) -- confirm against the output code.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Placeholders; the first two are rebound once the input files are parsed
# (keysymdatabase <- process_gdkkeysymsh(), keysymunicodedatabase <-
# process_keysymstxt()).
keysymdatabase = {}
keysymunicodedatabase = {}
unicodedatabase = {}
40
41 headerfile_start = """/* GTK - The GIMP Tool Kit
42  * Copyright (C) 2007, 2008 GNOME Foundation
43  *
44  * This library is free software; you can redistribute it and/or
45  * modify it under the terms of the GNU Lesser General Public
46  * License as published by the Free Software Foundation; either
47  * version 2 of the License, or (at your option) any later version.
48  *
49  * This library is distributed in the hope that it will be useful,
50  * but WITHOUT ANY WARRANTY; without even the implied warranty of
51  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
52  * Lesser General Public License for more details.
53  *
54  * You should have received a copy of the GNU Lesser General Public
55  * License along with this library; if not, write to the
56  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
57  * Boston, MA 02111-1307, USA.
58  */
59
60 /*
61  * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
62  * using the input files
63  *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
64  *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
65  *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
66  *
67  * This table is optimised for space and requires special handling to access the content.
68  * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
69  * 
70  * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
71  * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
72  */
73
74 /*
75  * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
76  * file for a list of people on the GTK+ Team.  See the ChangeLog
77  * files for a list of changes.  These files are distributed with
78  * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
79  */
80
81 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
83
84 /* === These are the original comments of the file; we keep for historical purposes ===
85  *
86  * The following table was generated from the X compose tables include with
87  * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
88  * to obtain the relevant perl scripts.
89  *
90  * The following compose letter letter sequences confliced
91  *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
92  *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
93  *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
94  *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
95  *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
96  *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
97  *
98  * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
99  *   spanish. atilde and otilde are used at least for Portuguese ]
100  *
101  *   at and Aring; resolved to Aring                                          [ AA ]
102  *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
103  *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
104  *
105  * This probably should be resolved by first checking an additional set of compose tables
106  * that depend on the locale or selected input method.
107  */
108
109 static const guint16 gtk_compose_seqs_compact[] = {"""
110
111 headerfile_end = """};
112
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
114 """
115
def stringtohex(str):
        """Return the integer value of the hexadecimal string *str*.

        An optional ``0x`` prefix is accepted.  Raises ValueError when the
        text is not valid hexadecimal.
        """
        # int() is the documented replacement for the deprecated string.atoi()
        # and parses base-16 text identically.
        return int(str, 16)
117
def factorial(n):
        """Return n! (the product 1 * 2 * ... * n).

        Any n <= 1, including negative values, yields 1.  The product is
        accumulated in a loop rather than by recursion, so large arguments do
        not run into the interpreter's recursion limit.
        """
        result = 1
        while n > 1:
                result *= n
                n -= 1
        return result
123
def uniq(*args):
        """Merge the given lists and drop repeated elements.

        The lists are concatenated in argument order; the result keeps the
        first occurrence of every element.  Elements are compared with ``==``
        (they need not be hashable).
        """
        merged = [element for chunk in args for element in chunk]
        kept = []
        for candidate in merged:
                if candidate in kept:
                        continue
                kept.append(candidate)
        return kept
134
135
136
def all_permutations(seq):
        """Generate every permutation of *seq* (a list or a string).

        Borrowed from
        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

        Each permutation has the same type as *seq*.  A sequence of length
        zero or one yields itself once.
        """
        if len(seq) <= 1:
                yield seq
                return
        # Slicing (rather than indexing) keeps this working for both strings
        # and lists.
        head = seq[0:1]
        tail = seq[1:]
        for partial in all_permutations(tail):
                # Insert the head element at every possible position.
                for position in range(len(partial) + 1):
                        yield partial[:position] + head + partial[position:]
147
def usage():
        """Write the command-line help text to standard output."""
        help_text = """compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """
        # Same output as printing the text: the text followed by a newline.
        sys.stdout.write(help_text + "\n")
162
# Parse the command line.  Only a malformed command line leads to the usage
# text, so catch getopt's own error type instead of everything (a bare except
# would also hide unrelated failures).
try:
        opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
                "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
except getopt.GetoptError:
        usage()
        sys.exit(2)

# One flag per command-line switch; all are off unless requested.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False

for o, a in opts:
        if o in ("-h", "--help"):
                usage()
                sys.exit()
        # "--stats" is accepted by getopt above, so honour it as an alias
        # instead of silently ignoring it.
        elif o in ("-s", "--statistics", "--stats"):
                opt_statistics = True
        elif o in ("-a", "--algorithmic"):
                opt_algorithmic = True
        elif o in ("-g", "--gtk"):
                opt_gtk = True
        elif o in ("-u", "--unicodedatatxt"):
                opt_unicodedatatxt = True
        elif o in ("-v", "--verbose"):
                opt_verbose = True
        elif o in ("-p", "--plane1"):
                opt_plane1 = True
        elif o in ("-n", "--numeric"):
                opt_numeric = True
        elif o in ("-e", "--gtk-expanded"):
                opt_gtkexpanded = True

# With no output mode selected, fall back to showing statistics.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
        opt_statistics = True
202
203 def download_hook(blocks_transferred, block_size, file_size):
204         """ A download hook to provide some feedback when downloading """
205         if blocks_transferred == 0:
206                 if file_size > 0:
207                         if opt_verbose:
208                                 print "Downloading", file_size, "bytes: ",
209                 else:   
210                         if opt_verbose:
211                                 print "Downloading: ",
212         sys.stdout.write('#')
213         sys.stdout.flush()
214
215
216 def download_file(url):
217         """ Downloads a file provided a URL. Returns the filename. """
218         """ Borks on failure """
219         localfilename = url.split('/')[-1]
220         if not isfile(localfilename) or getsize(localfilename) <= 0:
221                 if opt_verbose:
222                         print "Downloading ", url, "..."
223                 try: 
224                         urlretrieve(url, localfilename, download_hook)
225                 except IOError, (errno, strerror):
226                         print "I/O error(%s): %s" % (errno, strerror)
227                         sys.exit(-1)
228                 except:
229                         print "Unexpected error: ", sys.exc_info()[0]
230                         sys.exit(-1)
231                 print " done."
232         else:
233                 if opt_verbose:
234                         print "Using cached file for ", url
235         return localfilename
236
237 def process_gdkkeysymsh():
238         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
239         """ Fills up keysymdb with contents """
240         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
241         try: 
242                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
243         except IOError, (errno, strerror):
244                 print "I/O error(%s): %s" % (errno, strerror)
245                 sys.exit(-1)
246         except:
247                 print "Unexpected error: ", sys.exc_info()[0]
248                 sys.exit(-1)
249
250         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
251         linenum_gdkkeysymsh = 0
252         keysymdb = {}
253         for line in gdkkeysymsh.readlines():
254                 linenum_gdkkeysymsh += 1
255                 line = line.strip()
256                 if line == "" or not match('^#define GDK_', line):
257                         continue
258                 components = split('\s+', line)
259                 if len(components) < 3:
260                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
261                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
262                         print "Was expecting 3 items in the line"
263                         sys.exit(-1)
264                 if not match('^GDK_', components[1]):
265                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
266                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
267                         print "Was expecting a keysym starting with GDK_"
268                         sys.exit(-1)
269                 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
270                         unival = atoi(components[2][2:], 16)
271                         if unival == 0:
272                                 continue
273                         keysymdb[components[1][4:]] = unival
274                 else:
275                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
276                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
277                         print "Was expecting a hexadecimal number at the end of the line"
278                         sys.exit(-1)
279         gdkkeysymsh.close()
280
281         """ Patch up the keysymdb with some of our own stuff """
282
283         """ This is for a missing keysym from the currently upstream file """
284         keysymdb['dead_stroke'] = 0x338
285
286         """ This is for a missing keysym from the currently upstream file """
287         ###keysymdb['dead_belowring'] = 0x323
288         ###keysymdb['dead_belowmacron'] = 0x331
289         ###keysymdb['dead_belowcircumflex'] = 0x32d
290         ###keysymdb['dead_belowtilde'] = 0x330
291         ###keysymdb['dead_belowbreve'] = 0x32e
292         ###keysymdb['dead_belowdiaeresis'] = 0x324
293
294         """ This is^Wwas preferential treatment for Greek """
295         # keysymdb['dead_tilde'] = 0x342                
296         """ This is^was preferential treatment for Greek """
297         #keysymdb['combining_tilde'] = 0x342    
298
299         """ Fixing VoidSymbol """
300         keysymdb['VoidSymbol'] = 0xFFFF
301
302         return keysymdb
303
304 def process_keysymstxt():
305         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
306         """ This file keeps a record between keysyms <-> unicode chars """
307         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
308         try: 
309                 keysymstxt = open(filename_keysymstxt, 'r')
310         except IOError, (errno, strerror):
311                 print "I/O error(%s): %s" % (errno, strerror)
312                 sys.exit(-1)
313         except:
314                 print "Unexpected error: ", sys.exc_info()[0]
315                 sys.exit(-1)
316
317         """ Parse the keysyms.txt file and place content in  keysymdb """
318         linenum_keysymstxt = 0
319         keysymdb = {}
320         for line in keysymstxt.readlines():
321                 linenum_keysymstxt += 1
322                 line = line.strip()
323                 if line == "" or match('^#', line):
324                         continue
325                 components = split('\s+', line)
326                 if len(components) < 5:
327                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
328                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
329                         print "Was expecting 5 items in the line"
330                         sys.exit(-1)
331                 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
332                         unival = atoi(components[1][1:], 16)
333                 if unival == 0:
334                         continue
335                 keysymdb[components[4]] = unival
336         keysymstxt.close()
337
338         """ Patch up the keysymdb with some of our own stuff """
339         """ This is for a missing keysym from the currently upstream file """
340         ###keysymdb['dead_belowring'] = 0x323
341         ###keysymdb['dead_belowmacron'] = 0x331
342         ###keysymdb['dead_belowcircumflex'] = 0x32d
343         ###keysymdb['dead_belowtilde'] = 0x330
344         ###keysymdb['dead_belowbreve'] = 0x32e
345         ###keysymdb['dead_belowdiaeresis'] = 0x324
346
347         """ This is preferential treatment for Greek """
348         """ => we get more savings if used for Greek """
349         # keysymdb['dead_tilde'] = 0x342                
350         """ This is preferential treatment for Greek """
351         # keysymdb['combining_tilde'] = 0x342   
352
353         """ This is for a missing keysym from Markus Kuhn's db """
354         keysymdb['dead_stroke'] = 0x338
355         """ This is for a missing keysym from Markus Kuhn's db """
356         keysymdb['Oslash'] = 0x0d8              
357
358         """ This is for a missing (recently added) keysym """
359         keysymdb['dead_psili'] = 0x313          
360         """ This is for a missing (recently added) keysym """
361         keysymdb['dead_dasia'] = 0x314          
362
363         """ Allows to import Multi_key sequences """
364         keysymdb['Multi_key'] = 0xff20
365
366         keysymdb['zerosubscript'] = 0x2080
367         keysymdb['onesubscript'] = 0x2081
368         keysymdb['twosubscript'] = 0x2082
369         keysymdb['threesubscript'] = 0x2083
370         keysymdb['foursubscript'] = 0x2084
371         keysymdb['fivesubscript'] = 0x2085
372         keysymdb['sixsubscript'] = 0x2086
373         keysymdb['sevensubscript'] = 0x2087
374         keysymdb['eightsubscript'] = 0x2088
375         keysymdb['ninesubscript'] = 0x2089
376
377         return keysymdb
378
def keysymvalue(keysym, file = "n/a", linenum = 0):
        """Return the numeric value of *keysym* according to keysymdatabase.

        Lookup order: the empty string gives 0; a name present in
        keysymdatabase gives its stored value; ``U<hex>`` and ``0x<hex>``
        spellings are converted directly.  Anything else gives -1, which
        callers treat as "unknown keysym".

        *file* and *linenum* are accepted for symmetry with
        keysymunicodevalue() but are not used here.
        """
        if keysym == "":
                return 0
        if keysym in keysymdatabase:
                return keysymdatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        # Unknown keysym: report it through the return value, do not abort.
        return -1
395
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
        """Return the Unicode value of *keysym* according to keysymunicodedatabase.

        Lookup order: the empty string gives 0; a name present in
        keysymunicodedatabase gives its stored value; ``U<hex>`` and
        ``0x<hex>`` spellings are converted directly.

        An unknown keysym is fatal: it is printed and the script exits with
        status -1.  When *linenum* is given, *file* and *linenum* are printed
        as well so the offending input line can be found.
        """
        if keysym == "":
                return 0
        if keysym in keysymunicodedatabase:
                return keysymunicodedatabase[keysym]
        if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
                return int(keysym[1:], 16)
        if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
                return int(keysym[2:], 16)
        print('UNKNOWN{%(keysym)s}' % { "keysym": keysym })
        if linenum:
                # Only meaningful when the caller told us where it came from.
                print('  referenced at line %(linenum)d of %(file)s' % { "linenum": linenum, "file": file })
        sys.exit(-1)
411
def rename_combining(seq):
        """Return a copy of *seq* with every leading 'combining_' turned into 'dead_'.

        Names that do not start with 'combining_' are copied unchanged; the
        input list is not modified.
        """
        # sub() returns its input untouched when the pattern does not match,
        # so no separate test for the prefix is needed.
        return [sub('^combining_', 'dead_', name) for name in seq]
420
421
422 keysymunicodedatabase = process_keysymstxt()
423 keysymdatabase = process_gdkkeysymsh()
424
425 """ Grab and open the compose file from upstream """
426 filename_compose = download_file(URL_COMPOSE)
427 try: 
428         composefile = open(filename_compose, 'r')
429 except IOError, (errno, strerror):
430         print "I/O error(%s): %s" % (errno, strerror)
431         sys.exit(-1)
432 except:
433         print "Unexpected error: ", sys.exc_info()[0]
434         sys.exit(-1)
435
436 """ Look if there is a lookaside (supplementary) compose file in the current
437     directory, and if so, open, then merge with upstream Compose file.
438 """
439 xorg_compose_sequences_raw = []
440 for seq in composefile.readlines():
441         xorg_compose_sequences_raw.append(seq)
442
443 try:
444         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
445         for seq in composefile_lookaside.readlines():
446                 xorg_compose_sequences_raw.append(seq)
447 except IOError, (errno, strerror):
448         if opt_verbose:
449                 print "I/O error(%s): %s" % (errno, strerror)
450                 print "Did not find lookaside compose file. Continuing..."
451 except:
452         print "Unexpected error: ", sys.exc_info()[0]
453         sys.exit(-1)
454
455 """ Parse the compose file in  xorg_compose_sequences"""
456 xorg_compose_sequences = []
457 xorg_compose_sequences_algorithmic = []
458 linenum_compose = 0
459 comment_nest_depth = 0
460 for line in xorg_compose_sequences_raw:
461         linenum_compose += 1
462         line = line.strip()
463         if match("^XCOMM", line) or match("^#", line):
464                 continue
465
466         line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
467
468         comment_start = line.find("/*")
469
470         if comment_start >= 0:
471                 if comment_nest_depth == 0:
472                         line = line[:comment_start]
473                 else:
474                         line = ""
475
476                 comment_nest_depth += 1
477         else:
478                 comment_end = line.find("*/")
479
480                 if comment_end >= 0:
481                         comment_nest_depth -= 1
482
483                 if comment_nest_depth < 0:
484                         print "Invalid comment %(linenum_compose)d in %(filename)s: \
485                         Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
486                         exit(-1)
487
488                 if comment_nest_depth > 0:
489                         line = ""
490                 else:
491                         line = line[comment_end + 2:]
492
493         if line is "":
494                 continue
495
496         #line = line[:-1]
497         components = split(':', line)
498         if len(components) != 2:
499                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
500                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
501                 exit(-1)
502         (seq, val ) = split(':', line)
503         seq = seq.strip()
504         val = val.strip()
505         raw_sequence = findall('\w+', seq)
506         values = split('\s+', val)
507         unichar_temp = split('"', values[0])
508         unichar = unichar_temp[1]
509         if len(values) == 1:
510                 continue
511         codepointstr = values[1]
512         if values[1] == '#':
513                 # No codepoints that are >1 characters yet.
514                 continue
515         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
516                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
517         if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
518                 codepoint = atoi(codepointstr[1:], 16)
519         elif keysymunicodedatabase.has_key(codepointstr):
520                 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
521                         print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
522                         print raw_sequence, codepointstr
523                 codepoint = keysymunicodedatabase[codepointstr]
524         else:
525                 print
526                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
527                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
528                 exit(-1)
529         sequence = rename_combining(raw_sequence)
530         reject_this = False
531         for i in sequence:
532                 if keysymvalue(i) > 0xFFFF:
533                         reject_this = True
534                         if opt_plane1:
535                                 print sequence
536                         break
537                 if keysymvalue(i) < 0:
538                         reject_this = True
539                         break
540         if reject_this:
541                 continue
542         if "U0342" in sequence or \
543                 "U0313" in sequence or \
544                 "U0314" in sequence or \
545                 "0x0313" in sequence or \
546                 "0x0342" in sequence or \
547                 "0x0314" in sequence:
548                 continue
549         if "dead_belowring" in sequence or\
550                 "dead_belowcomma" in sequence or\
551                 "dead_belowmacron" in sequence or\
552                 "dead_belowtilde" in sequence or\
553                 "dead_belowbreve" in sequence or\
554                 "dead_belowdiaeresis" in sequence or\
555                 "dead_belowcircumflex" in sequence:
556                 continue
557         #for i in range(len(sequence)):
558         #       if sequence[i] == "0x0342":
559         #               sequence[i] = "dead_tilde"
560         if "Multi_key" not in sequence:
561                 """ Ignore for now >0xFFFF keysyms """
562                 if codepoint < 0xFFFF:
563                         original_sequence = copy(sequence)
564                         stats_sequence = copy(sequence)
565                         base = sequence.pop()
566                         basechar = keysymvalue(base, filename_compose, linenum_compose)
567                         
568                         if basechar < 0xFFFF:
569                                 counter = 1
570                                 unisequence = []
571                                 not_normalised = True
572                                 skipping_this = False
573                                 for i in range(0, len(sequence)):
574                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically 
575                                             because of lack of dead_perispomeni (i.e. conflict)
576                                         """
577                                         bc = basechar
578                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
579                                                 skipping_this = True
580                                                 break
581                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
582                                                 skipping_this = True
583                                                 break
584                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
585                                                 skipping_this = True
586                                                 break
587                                         if sequence[-1] == "dead_psili":
588                                                 sequence[i] = "dead_horn"
589                                         if sequence[-1] == "dead_dasia":
590                                                 sequence[-1] = "dead_ogonek"
591                                         """
592                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
593                                         
594                                 if skipping_this:
595                                         unisequence = []
596                                 for perm in all_permutations(unisequence):
597                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
598                                         # print counter, map(unichr, perm)
599                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
600                                         if len(normalized) == 1:
601                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
602                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
603                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
604                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
605                                                 stats_sequence_data.append(normalized)
606                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
607                                                 not_normalised = False
608                                                 break;
609                                         counter += 1
610                                 if not_normalised:
611                                         original_sequence.append(codepoint)
612                                         xorg_compose_sequences.append(original_sequence)
613                                         """ print xorg_compose_sequences[-1] """
614                                         
615                         else:
616                                 print "Error in base char !?!"
617                                 exit(-2)
618                 else:
619                         print "OVER", sequence
620                         exit(-1)
621         else:
622                 sequence.append(codepoint)
623                 xorg_compose_sequences.append(sequence)
624                 """ print xorg_compose_sequences[-1] """
625
def sequence_cmp(x, y):
        """cmp-style ordering of two compose sequences by GDK keysym value.

        The first keys are compared, then the lengths (shorter first), then
        the keys at positions 1 to 4 in turn.  Position 1 is always compared;
        positions 2, 3 and 4 are only reached when *x* has at least 4, 5 and 6
        items respectively.  Returns -1, 0 or 1.
        """
        left, right = keysymvalue(x[0]), keysymvalue(y[0])
        if left > right:
                return 1
        if left < right:
                return -1
        if len(x) > len(y):
                return 1
        if len(x) < len(y):
                return -1
        for position in (1, 2, 3, 4):
                # Beyond position 1, stop as soon as the entry has no further
                # key (its last item is not a key).
                if position > 1 and len(x) < position + 2:
                        return 0
                left, right = keysymvalue(x[position]), keysymvalue(y[position])
                if left > right:
                        return 1
                if left < right:
                        return -1
        return 0
659
def sequence_unicode_cmp(x, y):
        """cmp-style ordering of two compose sequences by Unicode keysym value.

        Same scheme as sequence_cmp(), but the keys are valued with
        keysymunicodevalue(): first keys, then lengths (shorter first), then
        positions 1 to 4 in turn, where positions 2, 3 and 4 are only reached
        when *x* has at least 4, 5 and 6 items respectively.
        Returns -1, 0 or 1.
        """
        left, right = keysymunicodevalue(x[0]), keysymunicodevalue(y[0])
        if left > right:
                return 1
        if left < right:
                return -1
        if len(x) > len(y):
                return 1
        if len(x) < len(y):
                return -1
        for position in (1, 2, 3, 4):
                # Beyond position 1, stop as soon as the entry has no further
                # key (its last item is not a key).
                if position > 1 and len(x) < position + 2:
                        return 0
                left, right = keysymunicodevalue(x[position]), keysymunicodevalue(y[position])
                if left > right:
                        return 1
                if left < right:
                        return -1
        return 0
693
def sequence_algorithmic_cmp(x, y):
        """ Old-style cmp function: shorter sequences first, then element by element.

        Returns -1, 1 or 0.
        """
        length_delta = len(x) - len(y)
        if length_delta < 0:
                return -1
        if length_delta > 0:
                return 1
        # Same length: the first differing pair of elements decides.
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
706
707
# Order the sequences with sequence_cmp so that sequences that are equal
# under sequence_unicode_cmp can be collapsed by looking at neighbours.
xorg_compose_sequences.sort(sequence_cmp)

# Collapse each run of neighbouring sequences that compare equal under
# sequence_unicode_cmp, keeping the last sequence of every run.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
        if item is not None and sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# The loop above only emits a sequence once its successor is known to
# differ, so the final pending sequence has to be emitted here; without
# this the last entry of the sorted list would be silently dropped.
if item is not None:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
722
# Count the sequences that mention Multi_key.  The last element of a
# sequence is left out of the search; only the elements before it are
# joined and scanned.
counter_multikey = 0
for item in xorg_compose_sequences:
        keysym_names = "".join(item[:-1])
        if findall('Multi_key', keysym_names):
                counter_multikey += 1

# Sort the algorithmic sequences (shortest first, then element by element)
# and pass the sorted list through uniq().
xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
730
# Counters used by the table generator and by the statistics report.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
num_entries = 0
num_algorithmic_greek = 0

for sequence in xorg_compose_sequences:
        # A change in the value of the first keysym starts a new group.
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                num_first_keysyms += 1
                firstitem = sequence[0]
        num_entries += 1
        zeroes += 7 - len(sequence)

for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1])
        # Code points in 0x370-0x3ff and 0x1f00-0x1fff are counted as Greek.
        if 0x370 <= ch <= 0x3ff or 0x1f00 <= ch <= 0x1fff:
                num_algorithmic_greek += 1
747                 
748
749 if opt_algorithmic:
750         for sequence in xorg_compose_sequences_algorithmic_uniqued:
751                 letter = "".join(sequence[-1:])
752                 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
753                 for elem in sequence[:-2]:
754                         print "<0x%(keysym)04X>," % { 'keysym': elem },
755                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
756                 print "], recomposed as", letter, "verified"
757
def num_of_keysyms(seq):
        """ Return the length of seq without counting its final element """
        total = len(seq)
        return total - 1
760
def convert_UnotationToHex(arg):
        """ Turn a 'UXXXX' string (exactly four uppercase hex digits) into '0xXXXX'.

        Anything else, including values that are not of type str, is returned
        unchanged.
        """
        if not isinstance(arg, str):
                return arg
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg) is None:
                return arg
        return sub('^U', '0x', arg)
766
def addprefix_GDK(arg):
        """ Format arg as one table cell followed by ', '.

        Strings starting with '0x' are emitted as they are; every other string
        gets the 'GDK_' prefix.
        """
        if match('^0x', arg):
                template = '%(arg)s, '
        else:
                template = 'GDK_%(arg)s, '
        return template % { 'arg': arg }
772
773 if opt_gtk:
774         first_keysym = ""
775         sequence = []
776         compose_table = []
777         ct_second_part = []
778         ct_sequence_width = 2
779         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
780         we_finished = False
781         counter = 0
782
783         sequence_iterator = iter(xorg_compose_sequences)
784         sequence = sequence_iterator.next()
785         while True:
786                 first_keysym = sequence[0]                                      # Set the first keysym
787                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
788                 while sequence[0] == first_keysym:
789                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
790                         try:
791                                 sequence = sequence_iterator.next()
792                         except StopIteration:
793                                 we_finished = True
794                                 break
795                 if we_finished:
796                         break
797                 counter += 1
798
799         ct_index = start_offset
800         for line_num in range(len(compose_table)):
801                 for i in range(WIDTHOFCOMPOSETABLE):
802                         occurences = compose_table[line_num][i+1]
803                         compose_table[line_num][i+1] = ct_index
804                         ct_index += occurences * (i+2)
805
806         for sequence in xorg_compose_sequences:
807                 ct_second_part.append(map(convert_UnotationToHex, sequence))
808
809         print headerfile_start
810         for i in compose_table:
811                 if opt_gtkexpanded:
812                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
813                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
814                 elif not match('^0x', i[0]):
815                         print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
816                 else:
817                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
818         for i in ct_second_part:
819                 if opt_numeric:
820                         for ks in i[1:][:-1]:
821                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
822                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
823                         """
824                         for ks in i[:-1]:
825                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
826                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
827                         """
828                 elif opt_gtkexpanded:
829                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
830                 else:
831                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
832         print headerfile_end 
833
def redecompose(codepoint):
        """ Recursively decompose codepoint using the entries of unicodedatabase.

        Returns [codepoint] when the entry's decomposition starts with '' or
        '0'; otherwise returns a list holding the recursive result for every
        code point of the decomposition (a leading '<tag>' item is skipped).
        """
        decomposition = unicodedatabase[codepoint][1]
        leading = decomposition[0]
        if leading in ('', '0'):
                return [codepoint]
        if match('<\w+>', leading):
                parts = decomposition[1:]
        else:
                parts = decomposition
        return map(redecompose, map(stringtohex, parts))
843
844 def process_unicodedata_file(verbose = False):
845         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
846         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
847         try: 
848                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
849         except IOError, (errno, strerror):
850                 print "I/O error(%s): %s" % (errno, strerror)
851                 sys.exit(-1)
852         except:
853                 print "Unexpected error: ", sys.exc_info()[0]
854                 sys.exit(-1)
855         for line in unicodedatatxt.readlines():
856                 if line[0] == "" or line[0] == '#':
857                         continue
858                 line = line[:-1]
859                 uniproperties = split(';', line)
860                 codepoint = stringtohex(uniproperties[0])
861                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
862                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
863                         continue
864                 name = uniproperties[1]
865                 category = uniproperties[2]
866                 combiningclass = uniproperties[3]
867                 decomposition = uniproperties[5]
868                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
869         
870         counter_combinations = 0
871         counter_combinations_greek = 0
872         counter_entries = 0
873         counter_entries_greek = 0
874
875         for item in unicodedatabase.keys():
876                 (name, decomposition, combiningclass) = unicodedatabase[item]
877                 if decomposition[0] == '':
878                         continue
879                         print name, "is empty"
880                 elif match('<\w+>', decomposition[0]):
881                         continue
882                         print name, "has weird", decomposition[0]
883                 else:
884                         sequence = map(stringtohex, decomposition)
885                         chrsequence = map(unichr, sequence)
886                         normalized = normalize('NFC', "".join(chrsequence))
887                         
888                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
889                         decomposedsequence = []
890                         for subseq in map(redecompose, sequence):
891                                 for seqitem in subseq:
892                                         if isinstance(seqitem, list):
893                                                 for i in seqitem:
894                                                         if isinstance(i, list):
895                                                                 for j in i:
896                                                                         decomposedsequence.append(j)
897                                                         else:
898                                                                 decomposedsequence.append(i)
899                                         else:
900                                                 decomposedsequence.append(seqitem)
901                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
902                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
903                                 counter_entries += 1
904                                 counter_combinations += factorial(len(decomposedsequence)-1)
905                                 ch = item
906                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
907                                         counter_entries_greek += 1
908                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
909                                 if verbose:
910                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
911                                         print "[",
912                                         for elem in decomposedsequence:
913                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
914                                         print "], recomposed as", recomposedchar,
915                                         if unichr(item) == recomposedchar:
916                                                 print "verified"
917         
918         if verbose == False:
919                 print "Unicode statistics from UnicodeData.txt"
920                 print "Number of entries that can be algorithmically produced     :", counter_entries
921                 print "  of which are for Greek                                   :", counter_entries_greek
922                 print "Number of compose sequence combinations requiring          :", counter_combinations
923                 print "  of which are for Greek                                   :", counter_combinations_greek
924                 print "Note: We do not include partial compositions, "
925                 print "thus the slight discrepancy in the figures"
926                 print
927
if opt_unicodedatatxt:
        # Verbose run: prints the per-code-point lines instead of the summary.
        process_unicodedata_file(verbose = True)
930
931 if opt_statistics:
932         print
933         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
934         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
935         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences) 
936         print "    of which have Multi_key                                :", counter_multikey
937         print 
938         print "Algorithmic (stats for Xorg Compose file)"
939         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
940         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
941         print "  of which are for Greek                                   :", num_algorithmic_greek
942         print 
943         process_unicodedata_file()
944         print "Not algorithmic (stats from Xorg Compose file)"
945         print "Number of sequences                                        :", len(xorg_compose_sequences) 
946         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
947         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
948         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
949         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
950         print "Number of different first items                            :", num_first_keysyms
951         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
952         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
953         print 
954         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
955         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
956         print
957         print "Existing (old) implementation in GTK+"
958         print "Number of sequences in old gtkimcontextsimple.c            :", 691
959         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"