]> Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py
19db9a22bf13fbf961721645c17948b97e906de6
[~andy/gtk] / gtk / compose-parse.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # compose-parse.py, version 1.3
5 #
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
8 #
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
10 #
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
12
13 from re                 import findall, match, split, sub
14 from string             import atoi
15 from unicodedata        import normalize
16 from urllib             import urlretrieve
17 from os.path            import isfile, getsize
18 from copy               import copy
19
20 import sys
21 import getopt
22
# We grab files off the web, left and right.
# download_file() stores each download in the current directory under the
# last '/'-separated component of its URL and reuses a non-empty copy.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
# Optional local file; when it can be opened, its lines are appended to the
# lines read from the upstream Compose file before parsing.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# keysym name -> value; rebound at module load to the result of
# process_gdkkeysymsh().
keysymdatabase = {}
# keysym name -> Unicode code point; rebound at module load to the result of
# process_keysymstxt().
keysymunicodedatabase = {}
# NOTE(review): not referenced in the part of the script reviewed here;
# presumably filled from UnicodeData.txt further down -- confirm.
unicodedatabase = {}
# Opening text of a C header: licence and provenance comments, the include
# guard, and the start of the `gtk_compose_seqs_compact` array initialiser.
# The text is emitted verbatim, so it must not be re-wrapped or corrected here.
# NOTE(review): presumably written before the generated table rows by code
# further down the script -- confirm against the --gtk output path.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 * 
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Closing text of the same C header: ends the array initialiser and the
# include guard opened in headerfile_start.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
115
def stringtohex(str):
    """Return the integer value of *str* interpreted as a base-16 number.

    An optional ``0x``/``0X`` prefix is accepted.  Raises ``ValueError`` when
    *str* is not a valid hexadecimal literal.

    The parameter keeps its historical name (which shadows the builtin) so
    that keyword callers continue to work.
    """
    # int(s, 16) is what the deprecated string.atoi(s, 16) delegated to, and
    # unlike atoi it also exists on Python 3.
    return int(str, 16)
117
def factorial(n):
    """Return n! for an integer *n*; any *n* <= 1 (including negatives) gives 1.

    Computed iteratively so that large arguments do not exhaust the
    interpreter's recursion limit, which the previous recursive form did
    for n in the region of 1000.
    """
    product = 1
    while n > 1:
        product *= n
        n -= 1
    return product
123
def uniq(*args):
    """Merge the given iterables and drop repeated elements.

    Elements are kept in order of first appearance across all arguments.
    Membership is tested with ``in`` on a list, so elements only need to
    support equality (they may be unhashable, e.g. lists).
    """
    kept = []
    for collection in args:
        for candidate in collection:
            if candidate in kept:
                continue
            kept.append(candidate)
    return kept
134
135
136
def all_permutations(seq):
    """Generate every permutation of *seq* (a list or a string).

    Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178

    Works by inserting the first item at each possible position of every
    permutation of the remaining items.  A sequence of length 0 or 1 yields
    itself once.  Only slicing and ``+`` are used, so the results have the
    same type as *seq*.
    """
    if len(seq) > 1:
        # A one-element slice (rather than seq[0]) concatenates correctly for
        # both strings and lists.
        head = seq[0:1]
        for shorter in all_permutations(seq[1:]):
            for cut in range(len(shorter) + 1):
                yield shorter[:cut] + head + shorter[cut:]
    else:
        yield seq
147
def usage():
    """Print the command-line help text to standard output."""
    # A parenthesised single-argument print produces the same output on
    # Python 2 (print statement) and Python 3 (print function).
    print("""compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """)
162
# Parse the command line.  Only getopt's own error (unknown option, missing
# argument) triggers the usage text; anything else is a real bug and is
# allowed to propagate instead of being hidden by a bare except.
try:
    opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
        "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
except getopt.GetoptError:
    usage()
    sys.exit(2)

# One module-level flag per option; the rest of the script reads these.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False

for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    # "--stats" is declared to getopt above, so it must be honoured here as
    # an alias of --statistics rather than being silently dropped.
    if o in ("-s", "--statistics", "--stats"):
        opt_statistics = True
    if o in ("-a", "--algorithmic"):
        opt_algorithmic = True
    if o in ("-g", "--gtk"):
        opt_gtk = True
    if o in ("-u", "--unicodedatatxt"):
        opt_unicodedatatxt = True
    if o in ("-v", "--verbose"):
        opt_verbose = True
    if o in ("-p", "--plane1"):
        opt_plane1 = True
    if o in ("-n", "--numeric"):
        opt_numeric = True
    if o in ("-e", "--gtk-expanded"):
        opt_gtkexpanded = True

# With none of the three output modes selected, fall back to statistics.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
    opt_statistics = True
202
def download_hook(blocks_transferred, block_size, file_size):
    """Progress callback passed to urlretrieve() by download_file().

    On the first call (``blocks_transferred == 0``) and only in verbose mode,
    writes a "Downloading ..." prefix, including *file_size* when it is
    positive.  On every call, verbose or not, writes one '#' and flushes
    standard output.  *block_size* is accepted but not used.
    """
    if blocks_transferred == 0 and opt_verbose:
        # Written without a newline so the '#' marks continue on the same
        # line; sys.stdout.write replaces the Python-2-only trailing-comma
        # print and yields the same text.
        if file_size > 0:
            sys.stdout.write("Downloading %s bytes: " % (file_size,))
        else:
            sys.stdout.write("Downloading: ")
    sys.stdout.write('#')
    sys.stdout.flush()
214
215
def download_file(url):
    """Fetch *url* into the current directory unless a copy is already there.

    The local name is the text after the last '/' of *url*.  An existing,
    non-empty file of that name is reused without touching the network.

    Returns the local filename.  On a download failure a message is printed
    and the script terminates with ``sys.exit(-1)``.
    """
    localfilename = url.split('/')[-1]

    if isfile(localfilename) and getsize(localfilename) > 0:
        if opt_verbose:
            print("Using cached file for  %s" % (url,))
        return localfilename

    if opt_verbose:
        print("Downloading  %s ..." % (url,))
    try:
        urlretrieve(url, localfilename, download_hook)
    except IOError as err:
        # Not every IOError carries (errno, strerror); unpacking the
        # exception as a 2-tuple would itself raise for those.
        if err.strerror is not None:
            print("I/O error(%s): %s" % (err.errno, err.strerror))
        else:
            print("I/O error: %s" % (err,))
        sys.exit(-1)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit are
        # not reported as "unexpected errors".
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)
    # download_hook prints '#' marks regardless of verbosity, so the line is
    # always finished off here.
    print(" done.")
    return localfilename
236
def process_gdkkeysymsh():
    """Download gdkkeysyms.h and return its keysyms as a name -> value dict.

    Only lines starting with ``#define GDK_KEY_`` are considered.  Each must
    look like ``#define GDK_KEY_<name> 0x<hex>``; the ``GDK_KEY_`` prefix is
    stripped from the key and entries whose value is 0 are skipped.

    A line that does not have that shape, or a failure to open the file,
    prints a diagnostic and terminates the script with ``sys.exit(-1)``.

    After parsing, 'dead_stroke' (0x338) and 'VoidSymbol' (0xFFFF) are set to
    fixed values, overriding anything read from the file.
    """
    filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    try:
        gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    except IOError as err:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        sys.exit(-1)
    except Exception:
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)

    def reject(linenum, line, expectation):
        """Report a malformed line and stop the script."""
        print("Invalid line %(linenum)d in %(filename)s: %(line)s"
              % {'linenum': linenum, 'filename': filename_gdkkeysymsh, 'line': line})
        print(expectation)
        sys.exit(-1)

    # Parse the gdkkeysyms.h file and place contents in keysymdb.
    keysymdb = {}
    try:
        for linenum_gdkkeysymsh, line in enumerate(gdkkeysymsh, 1):
            line = line.strip()
            if not line.startswith('#define GDK_KEY_'):
                continue
            components = split(r'\s+', line)
            if len(components) < 3:
                reject(linenum_gdkkeysymsh, line,
                       "Was expecting 3 items in the line")
            if not match('^GDK_KEY_', components[1]):
                reject(linenum_gdkkeysymsh, line,
                       "Was expecting a keysym starting with GDK_KEY_")
            if not match('^0x[0-9a-fA-F]+$', components[2]):
                reject(linenum_gdkkeysymsh, line,
                       "Was expecting a hexadecimal number at the end of the line")
            # long() is kept (rather than int()) so the stored values have
            # the same type as before.
            unival = long(components[2][2:], 16)
            if unival == 0:
                continue
            # Key is the name with the 8-character "GDK_KEY_" prefix removed.
            keysymdb[components[1][8:]] = unival
    finally:
        # Also runs on the sys.exit() paths above, so the handle never leaks.
        gdkkeysymsh.close()

    # Patch up the keysymdb with some of our own stuff.

    # This is for a missing keysym from the currently upstream file.
    keysymdb['dead_stroke'] = 0x338

    # This is for a missing keysym from the currently upstream file.
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    # This was preferential treatment for Greek (now disabled).
    # keysymdb['dead_tilde'] = 0x342
    # keysymdb['combining_tilde'] = 0x342

    # Fixing VoidSymbol.
    keysymdb['VoidSymbol'] = 0xFFFF

    return keysymdb
303
def process_keysymstxt():
    """Download keysyms.txt and return a keysym-name -> code point dict.

    This is the file Markus Kuhn maintains; it records the relation between
    keysyms and Unicode characters.  Blank lines and lines starting with '#'
    are ignored.  Every other line must split into at least five
    whitespace-separated fields; field 2 is read as ``U<hex>`` (the code
    point) and field 5 as the keysym name.  Entries with code point 0 are
    skipped.

    A line with fewer than five fields, or a failure to open the file, prints
    a diagnostic and terminates the script with ``sys.exit(-1)``.  A line
    whose second field is not ``U<hex>`` is skipped with a warning on
    standard error.

    After parsing, a fixed set of additional names is added (overriding
    anything read from the file); see the end of the function.
    """
    filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    try:
        keysymstxt = open(filename_keysymstxt, 'r')
    except IOError as err:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        sys.exit(-1)
    except Exception:
        print("Unexpected error:  %s" % (sys.exc_info()[0],))
        sys.exit(-1)

    # Parse the keysyms.txt file and place content in keysymdb.
    keysymdb = {}
    try:
        for linenum_keysymstxt, line in enumerate(keysymstxt, 1):
            line = line.strip()
            if line == "" or line.startswith('#'):
                continue
            components = split(r'\s+', line)
            if len(components) < 5:
                print("Invalid line %(linenum)d in %(filename)s: %(line)s"
                      % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line})
                print("Was expecting 5 items in the line")
                sys.exit(-1)
            if not match('^U[0-9a-fA-F]+$', components[1]):
                # Previously such a line silently reused the code point of
                # the preceding line (or hit an unbound variable on the
                # first data line).  Skip it instead of storing a wrong value.
                sys.stderr.write("Skipping line %d in %s: no U<hex> code point: %s\n"
                                 % (linenum_keysymstxt, filename_keysymstxt, line))
                continue
            # long() is kept (rather than int()) so the stored values have
            # the same type as before.
            unival = long(components[1][1:], 16)
            if unival == 0:
                continue
            keysymdb[components[4]] = unival
    finally:
        # Also runs on the sys.exit() path above, so the handle never leaks.
        keysymstxt.close()

    # Patch up the keysymdb with some of our own stuff.

    # This is for a missing keysym from the currently upstream file.
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    # This is preferential treatment for Greek
    # => we get more savings if used for Greek (currently disabled).
    # keysymdb['dead_tilde'] = 0x342
    # keysymdb['combining_tilde'] = 0x342

    # These are for keysyms missing from Markus Kuhn's db.
    keysymdb['dead_stroke'] = 0x338
    keysymdb['Oslash'] = 0x0d8
    keysymdb['Ssharp'] = 0x1e9e

    # These are for missing (recently added) keysyms.
    keysymdb['dead_psili'] = 0x313
    keysymdb['dead_dasia'] = 0x314

    # Allows to import Multi_key sequences.
    keysymdb['Multi_key'] = 0xff20

    keysymdb['zerosubscript'] = 0x2080
    keysymdb['onesubscript'] = 0x2081
    keysymdb['twosubscript'] = 0x2082
    keysymdb['threesubscript'] = 0x2083
    keysymdb['foursubscript'] = 0x2084
    keysymdb['fivesubscript'] = 0x2085
    keysymdb['sixsubscript'] = 0x2086
    keysymdb['sevensubscript'] = 0x2087
    keysymdb['eightsubscript'] = 0x2088
    keysymdb['ninesubscript'] = 0x2089
    keysymdb['dead_doublegrave'] = 0x030F
    keysymdb['dead_invertedbreve'] = 0x0311

    return keysymdb
382
def keysymvalue(keysym, file="n/a", linenum=0):
    """Return the numeric value of *keysym* according to ``keysymdatabase``.

    Resolution order:
      1. the empty string maps to 0;
      2. a name present in ``keysymdatabase`` maps to its stored value;
      3. ``U<hex>`` maps to the hexadecimal number after the 'U';
      4. ``0x<hex>`` maps to the hexadecimal number after the prefix.

    Anything else prints ``keysymvalue: UNKNOWN{...}`` and terminates the
    script with ``sys.exit(-1)``.

    *file* and *linenum* are accepted for error reporting but are not
    currently used in the message.
    """
    if keysym == "":
        return 0
    if keysym in keysymdatabase:
        return keysymdatabase[keysym]
    if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return int(keysym[1:], 16)
    if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return int(keysym[2:], 16)
    print('keysymvalue: UNKNOWN{%(keysym)s}' % {"keysym": keysym})
    sys.exit(-1)
399
def keysymunicodevalue(keysym, file="n/a", linenum=0):
    """Return the code point of *keysym* according to ``keysymunicodedatabase``.

    Resolution order:
      1. the empty string maps to 0;
      2. a name present in ``keysymunicodedatabase`` maps to its stored value;
      3. ``U<hex>`` maps to the hexadecimal number after the 'U';
      4. ``0x<hex>`` maps to the hexadecimal number after the prefix.

    Anything else prints ``keysymunicodevalue: UNKNOWN{...}`` and terminates
    the script with ``sys.exit(-1)``.

    *file* and *linenum* are accepted for error reporting but are not
    currently used in the message.
    """
    if keysym == "":
        return 0
    if keysym in keysymunicodedatabase:
        return keysymunicodedatabase[keysym]
    if keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return int(keysym[1:], 16)
    if keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return int(keysym[2:], 16)
    print('keysymunicodevalue: UNKNOWN{%(keysym)s}' % {"keysym": keysym})
    sys.exit(-1)
415
def rename_combining(seq):
    """Return a copy of *seq* with ``combining_*`` keysyms renamed to ``dead_*``.

    After the prefix swap, the two underscore spellings
    ``dead_double_grave`` and ``dead_inverted_breve`` are rewritten to
    ``dead_doublegrave`` and ``dead_invertedbreve``.  The input list is not
    modified.
    """
    respelled = {
        'dead_double_grave': 'dead_doublegrave',
        'dead_inverted_breve': 'dead_invertedbreve',
    }
    # sub() leaves names without the anchored prefix untouched, so it can be
    # applied to every element unconditionally.
    with_dead_prefix = [sub('^combining_', 'dead_', name) for name in seq]
    return [respelled.get(name, name) for name in with_dead_prefix]
427
428
# Load both keysym tables; these rebind the module-level names that
# keysymvalue() and keysymunicodevalue() look up at call time.
keysymunicodedatabase = process_keysymstxt()
keysymdatabase = process_gdkkeysymsh()

# Grab and open the compose file from upstream.
filename_compose = download_file(URL_COMPOSE)
try:
    composefile = open(filename_compose, 'r')
except IOError as err:
    print("I/O error(%s): %s" % (err.errno, err.strerror))
    sys.exit(-1)
except Exception:
    print("Unexpected error:  %s" % (sys.exc_info()[0],))
    sys.exit(-1)

# Collect the raw lines of the upstream Compose file.
# NOTE(review): composefile (and composefile_lookaside below) are left open
# and bound at module level exactly as before, in case code further down the
# script still refers to them.
xorg_compose_sequences_raw = []
xorg_compose_sequences_raw.extend(composefile.readlines())

# Look if there is a lookaside (supplementary) compose file in the current
# directory, and if so, open, then merge with upstream Compose file.
try:
    composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
    xorg_compose_sequences_raw.extend(composefile_lookaside.readlines())
except IOError as err:
    # A missing lookaside file is expected; only mention it in verbose mode.
    if opt_verbose:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        print("Did not find lookaside compose file. Continuing...")
except Exception:
    print("Unexpected error:  %s" % (sys.exc_info()[0],))
    sys.exit(-1)
461
462 """ Parse the compose file in  xorg_compose_sequences"""
# Parse every raw Compose line into one of two result lists:
#   xorg_compose_sequences              keysym names followed by the code
#                                       point, for sequences kept in the table
#   xorg_compose_sequences_algorithmic  keysym code points followed by the
#                                       composed character, for sequences that
#                                       NFC normalisation reproduces
# Any malformed input prints a diagnostic and terminates the script.
xorg_compose_sequences = []
xorg_compose_sequences_algorithmic = []
linenum_compose = 0
comment_nest_depth = 0
for line in xorg_compose_sequences_raw:
    linenum_compose += 1
    line = line.strip()
    if match("^XCOMM", line) or match("^#", line):
        continue

    # Remove comments that open and close on this same line.
    line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)

    comment_start = line.find("/*")

    if comment_start >= 0:
        # A comment opens here and continues on later lines: keep only the
        # text before it, or nothing if we are already inside a comment.
        if comment_nest_depth == 0:
            line = line[:comment_start]
        else:
            line = ""

        comment_nest_depth += 1
    else:
        comment_end = line.find("*/")

        if comment_end >= 0:
            comment_nest_depth -= 1

        if comment_nest_depth < 0:
            print("Invalid comment %(linenum_compose)d in %(filename)s: Closing '*/' without opening '/*'"
                  % {"linenum_compose": linenum_compose, "filename": filename_compose})
            sys.exit(-1)

        if comment_nest_depth > 0:
            line = ""
        elif comment_end >= 0:
            # Only cut when a '*/' was actually found.  The unconditional
            # slice used before evaluated line[1:] for ordinary lines
            # (find() returned -1) and dropped their first character.
            line = line[comment_end + 2:]

    # Equality, not identity: "is" on a string literal only works by accident
    # of interning.
    if line == "":
        continue

    components = split(':', line)
    if len(components) != 2:
        print("Invalid line %(linenum_compose)d in %(filename)s: No sequence/value pair found"
              % {"linenum_compose": linenum_compose, "filename": filename_compose})
        sys.exit(-1)
    (seq, val) = components
    seq = seq.strip()
    val = val.strip()
    raw_sequence = findall(r'\w+', seq)
    values = split(r'\s+', val)
    unichar_temp = split('"', values[0])
    unichar = unichar_temp[1]
    if len(values) == 1:
        continue
    codepointstr = values[1]
    if values[1] == '#':
        # No codepoints that are >1 characters yet.
        continue
    # Normalise a leading U<hex> keysym to the 0x<hex> spelling.
    if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
        raw_sequence[0] = '0x' + raw_sequence[0][1:]
    if match('^U[0-9a-fA-F]+$', codepointstr):
        # long() kept so the stored code point has the same type as before.
        codepoint = long(codepointstr[1:], 16)
    elif codepointstr in keysymunicodedatabase:
        codepoint = keysymunicodedatabase[codepointstr]
    else:
        print("")
        print("Invalid codepoint at line %(linenum_compose)d in %(filename)s: %(line)s"
              % {"linenum_compose": linenum_compose, "filename": filename_compose, "line": line})
        sys.exit(-1)
    sequence = rename_combining(raw_sequence)

    # Reject sequences that use a keysym whose value does not fit in 16 bits.
    reject_this = False
    for i in sequence:
        keysym_code = keysymvalue(i)
        if keysym_code > 0xFFFF:
            reject_this = True
            if opt_plane1:
                print(sequence)
            break
        if keysym_code < 0:
            reject_this = True
            break
    if reject_this:
        continue
    if "U0342" in sequence or \
            "U0313" in sequence or \
            "U0314" in sequence or \
            "0x0313" in sequence or \
            "0x0342" in sequence or \
            "0x0314" in sequence:
        continue
    if "dead_belowring" in sequence or \
            "dead_currency" in sequence or \
            "dead_belowcomma" in sequence or \
            "dead_belowmacron" in sequence or \
            "dead_belowtilde" in sequence or \
            "dead_belowbreve" in sequence or \
            "dead_belowdiaeresis" in sequence or \
            "dead_belowcircumflex" in sequence:
        continue

    if "Multi_key" not in sequence:
        # Ignore for now >0xFFFF keysyms.
        if codepoint < 0xFFFF:
            original_sequence = copy(sequence)
            stats_sequence = copy(sequence)
            # The last keysym is the base character; everything before it is
            # treated as a combining mark.
            base = sequence.pop()
            basechar = keysymvalue(base, filename_compose, linenum_compose)

            if basechar < 0xFFFF:
                counter = 1
                unisequence = []
                not_normalised = True
                skipping_this = False
                for i in range(0, len(sequence)):
                    # A disabled block used to sit here: it skipped
                    # dead_tilde/dead_horn/dead_ogonek for Greek base
                    # characters (no dead_perispomeni, i.e. a conflict) and
                    # mapped dead_psili/dead_dasia onto dead_horn/dead_ogonek.
                    # "bc" is only referenced by that disabled code.
                    bc = basechar
                    unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))

                if skipping_this:
                    unisequence = []
                # If base + marks, in some order, normalises (NFC) to a single
                # character, the sequence is recorded as algorithmic.
                for perm in all_permutations(unisequence):
                    normalized = normalize('NFC', unichr(basechar) + "".join(perm))
                    if len(normalized) == 1:
                        # A list comprehension (not map) so that .append works
                        # on Python 3 as well.
                        stats_sequence_data = [keysymunicodevalue(name) for name in stats_sequence]
                        stats_sequence_data.append(normalized)
                        xorg_compose_sequences_algorithmic.append(stats_sequence_data)
                        not_normalised = False
                        break
                    counter += 1
                if not_normalised:
                    original_sequence.append(codepoint)
                    xorg_compose_sequences.append(original_sequence)

            else:
                print("Error in base char !?!")
                sys.exit(-2)
        else:
            print("OVER %s" % (sequence,))
            sys.exit(-1)
    else:
        # Multi_key sequences always go into the table as they are.
        sequence.append(codepoint)
        xorg_compose_sequences.append(sequence)
633
def sequence_cmp(x, y):
    """Three-way comparison of two compose sequences by keysym value.

    Returns 1, -1 or 0.  Order of tests:
      1. keysymvalue() of element 0;
      2. length (the shorter sequence sorts first);
      3. keysymvalue() of element 1;
      4. keysymvalue() of elements 2, 3 and 4 in turn, each only while
         ``len(x)`` is at least the index plus two -- so the last element of
         a sequence of up to six items is never compared.
    """
    def ordering(index):
        left = keysymvalue(x[index])
        right = keysymvalue(y[index])
        if left > right:
            return 1
        if left < right:
            return -1
        return 0

    verdict = ordering(0)
    if verdict:
        return verdict
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    verdict = ordering(1)
    if verdict:
        return verdict
    for index in (2, 3, 4):
        if len(x) < index + 2:
            return 0
        verdict = ordering(index)
        if verdict:
            return verdict
    return 0
667
def sequence_unicode_cmp(x, y):
    """Three-way comparison of two compose sequences by Unicode value.

    Returns 1, -1 or 0.  Order of tests:
      1. keysymunicodevalue() of element 0;
      2. length (the shorter sequence sorts first);
      3. keysymunicodevalue() of element 1;
      4. keysymunicodevalue() of elements 2, 3 and 4 in turn, each only
         while ``len(x)`` is at least the index plus two -- so the last
         element of a sequence of up to six items is never compared.
    """
    def ordering(index):
        left = keysymunicodevalue(x[index])
        right = keysymunicodevalue(y[index])
        if left > right:
            return 1
        if left < right:
            return -1
        return 0

    verdict = ordering(0)
    if verdict:
        return verdict
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    verdict = ordering(1)
    if verdict:
        return verdict
    for index in (2, 3, 4):
        if len(x) < index + 2:
            return 0
        verdict = ordering(index)
        if verdict:
            return verdict
    return 0
701
def sequence_algorithmic_cmp(x, y):
        """Order two sequences by length first, then element by element.

        Returns -1 if x sorts before y, 1 if it sorts after, and 0 when both
        have the same length and no element differs.
        """
        size_x, size_y = len(x), len(y)
        if size_x != size_y:
                return -1 if size_x < size_y else 1

        # Equal lengths: the first differing pair decides.
        for left, right in zip(x, y):
                if left < right:
                        return -1
                if left > right:
                        return 1
        return 0
714
715
# Order the table-driven sequences with sequence_cmp (first keysym value,
# then length, then the remaining keysym values).
xorg_compose_sequences.sort(sequence_cmp)

# Collapse each run of neighbouring sequences that sequence_unicode_cmp
# reports as equal, keeping the LAST member of every run.
xorg_compose_sequences_uniqued = []
item = None
for next_item in xorg_compose_sequences:
        if item is not None and sequence_unicode_cmp(item, next_item) != 0:
                xorg_compose_sequences_uniqued.append(item)
        item = next_item
# The loop only emits an item once its successor is known to differ, so the
# survivor of the final run has to be flushed here; without this the last
# sequence of the sorted list was silently dropped.
if item is not None:
        xorg_compose_sequences_uniqued.append(item)

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
730
# Count the sequences that mention Multi_key anywhere before their last entry.
counter_multikey = 0
for item in xorg_compose_sequences:
        if 'Multi_key' in "".join(item[:-1]):
                counter_multikey += 1

# Sort the algorithmic sequences and drop duplicates.
xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Walk the sequences once, counting the rows, the changes of first keysym
# (by keysymvalue) between consecutive rows, and the padding each row
# would need.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
num_entries = 0
for sequence in xorg_compose_sequences:
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                num_first_keysyms += 1
                firstitem = sequence[0]
        # Same quantity as "6 - len(sequence) + 1".
        zeroes += (6 + 1) - len(sequence)
        num_entries += 1

# Count algorithmic sequences whose last entry is a character in
# U+0370..U+03FF or U+1F00..U+1FFF.
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1])
        if 0x370 <= ch <= 0x3ff or 0x1f00 <= ch <= 0x1fff:
                num_algorithmic_greek += 1
755                 
756
757 if opt_algorithmic:
758         for sequence in xorg_compose_sequences_algorithmic_uniqued:
759                 letter = "".join(sequence[-1:])
760                 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
761                 for elem in sequence[:-2]:
762                         print "<0x%(keysym)04X>," % { 'keysym': elem },
763                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
764                 print "], recomposed as", letter.encode('utf-8'), "verified"
765
def num_of_keysyms(seq):
        """Return one less than the length of seq.

        Compose sequences in this script carry the resulting code point as
        their final entry, so this is the count of the entries before it.
        """
        total_entries = len(seq)
        return total_entries - 1
768
def convert_UnotationToHex(arg):
        """Rewrite a 'UXXXX' string (four upper-case hex digits) as '0xXXXX'.

        Anything else -- non-str values, or strings of a different shape --
        is handed back unchanged.
        """
        if not isinstance(arg, str):
                return arg
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg) is None:
                return arg
        # The pattern guarantees a leading 'U'; swap it for the '0x' prefix.
        return '0x' + arg[1:]
774
def addprefix_GDK(arg):
        """Format one table cell: the text of arg followed by ', '.

        Strings that start with '0x' are emitted as they are; every other
        string gets the 'GDK_KEY_' prefix.
        """
        if match('^0x', arg):
                template = '%(arg)s, '
        else:
                template = 'GDK_KEY_%(arg)s, '
        return template % { 'arg': arg }
780
# Emit the compose table between headerfile_start and headerfile_end.
#
# The output has two parts:
#   1. an index (compose_table): one row per run of sequences sharing the
#      same first keysym, holding that keysym plus WIDTHOFCOMPOSETABLE
#      offsets into the second part;
#   2. the sequence data (ct_second_part), one printed line per sequence.
#
# The grouping below walks xorg_compose_sequences in its current order, so
# it relies on sequences with the same first keysym being adjacent.
if opt_gtk:
        first_keysym = ""
        sequence = []
        compose_table = []
        ct_second_part = []
        # NOTE(review): ct_sequence_width is assigned but not read in this block.
        ct_sequence_width = 2
        # The sequence data starts right after the index rows
        # (num_first_keysyms rows of WIDTHOFCOMPOSETABLE+1 cells each).
        start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
        we_finished = False
        counter = 0

        # Pass 1: for every run of equal first keysyms create a row
        # [first_keysym, n, n, n, n, n] and count the sequences of the run.
        # A sequence with k keysyms increments cell k-1 of its row (cell 0
        # holds the keysym itself).
        # NOTE(review): the initial .next() raises StopIteration when
        # xorg_compose_sequences is empty -- confirm that cannot happen.
        sequence_iterator = iter(xorg_compose_sequences)
        sequence = sequence_iterator.next()
        while True:
                first_keysym = sequence[0]                                      # Set the first keysym
                compose_table.append([first_keysym, 0, 0, 0, 0, 0])
                while sequence[0] == first_keysym:
                        compose_table[counter][num_of_keysyms(sequence)-1] += 1
                        try:
                                sequence = sequence_iterator.next()
                        except StopIteration:
                                # Input exhausted: leave both loops.
                                we_finished = True
                                break
                if we_finished:
                        break
                counter += 1

        # Pass 2: replace each count with the running offset at which that
        # group starts.  Cell i+1 holds the sequences with i+2 keysyms, and
        # each of them advances the offset by i+2 cells.
        ct_index = start_offset
        for line_num in range(len(compose_table)):
                for i in range(WIDTHOFCOMPOSETABLE):
                        occurences = compose_table[line_num][i+1]
                        compose_table[line_num][i+1] = ct_index
                        ct_index += occurences * (i+2)

        # Sequence data: a copy of every sequence with 'UXXXX' entries turned
        # into '0xXXXX' by convert_UnotationToHex.
        for sequence in xorg_compose_sequences:
                ct_second_part.append(map(convert_UnotationToHex, sequence))

        print headerfile_start
        # Index part: one line per row.
        for i in compose_table:
                if opt_gtkexpanded:
                        # First keysym as a number, then the offsets.
                        print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
                elif not match('^0x', i[0]):
                        # Keysym name: prefix the whole line with GDK_KEY_.
                        print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
                else:
                        # First cell is already in 0x notation: print the row as is.
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        # Data part: one line per sequence, ending with the last entry
        # formatted as a hexadecimal number.
        for i in ct_second_part:
                if opt_numeric:
                        # Numeric keysym values, first keysym omitted.
                        for ks in i[1:][:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        # NOTE: the triple-quoted string below is an inert expression
                        # statement holding a variant of the loop above that keeps
                        # the first keysym; it has no effect at run time.
                        """
                        for ks in i[:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        """
                elif opt_gtkexpanded:
                        # Symbolic keysyms, first keysym included.
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
                else:
                        # Symbolic keysyms, first keysym omitted.
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
        print headerfile_end 
841
def redecompose(codepoint):
        """Recursively expand codepoint through the decompositions in unicodedatabase.

        A code point whose first decomposition field is '' or '0' is returned
        as the one-element list [codepoint].  Otherwise every field of the
        decomposition (minus a leading '<tag>' field, if there is one) is
        converted with stringtohex and expanded in turn, which yields a list
        of nested lists.
        """
        (_name, fields, _combiningclass) = unicodedatabase[codepoint]
        head = fields[0]
        if head in ('', '0'):
                return [codepoint]
        if match('<\w+>', head):
                # Drop the '<tag>' marker and decompose what follows it.
                fields = fields[1:]
        return map(redecompose, map(stringtohex, fields))
851
852 def process_unicodedata_file(verbose = False):
853         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
854         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
855         try: 
856                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
857         except IOError, (errno, strerror):
858                 print "I/O error(%s): %s" % (errno, strerror)
859                 sys.exit(-1)
860         except:
861                 print "Unexpected error: ", sys.exc_info()[0]
862                 sys.exit(-1)
863         for line in unicodedatatxt.readlines():
864                 if line[0] == "" or line[0] == '#':
865                         continue
866                 line = line[:-1]
867                 uniproperties = split(';', line)
868                 codepoint = stringtohex(uniproperties[0])
869                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
870                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 
871                         continue
872                 name = uniproperties[1]
873                 category = uniproperties[2]
874                 combiningclass = uniproperties[3]
875                 decomposition = uniproperties[5]
876                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
877         
878         counter_combinations = 0
879         counter_combinations_greek = 0
880         counter_entries = 0
881         counter_entries_greek = 0
882
883         for item in unicodedatabase.keys():
884                 (name, decomposition, combiningclass) = unicodedatabase[item]
885                 if decomposition[0] == '':
886                         continue
887                         print name, "is empty"
888                 elif match('<\w+>', decomposition[0]):
889                         continue
890                         print name, "has weird", decomposition[0]
891                 else:
892                         sequence = map(stringtohex, decomposition)
893                         chrsequence = map(unichr, sequence)
894                         normalized = normalize('NFC', "".join(chrsequence))
895                         
896                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
897                         decomposedsequence = []
898                         for subseq in map(redecompose, sequence):
899                                 for seqitem in subseq:
900                                         if isinstance(seqitem, list):
901                                                 for i in seqitem:
902                                                         if isinstance(i, list):
903                                                                 for j in i:
904                                                                         decomposedsequence.append(j)
905                                                         else:
906                                                                 decomposedsequence.append(i)
907                                         else:
908                                                 decomposedsequence.append(seqitem)
909                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
910                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
911                                 counter_entries += 1
912                                 counter_combinations += factorial(len(decomposedsequence)-1)
913                                 ch = item
914                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
915                                         counter_entries_greek += 1
916                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
917                                 if verbose:
918                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
919                                         print "[",
920                                         for elem in decomposedsequence:
921                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
922                                         print "], recomposed as", recomposedchar,
923                                         if unichr(item) == recomposedchar:
924                                                 print "verified"
925         
926         if verbose == False:
927                 print "Unicode statistics from UnicodeData.txt"
928                 print "Number of entries that can be algorithmically produced     :", counter_entries
929                 print "  of which are for Greek                                   :", counter_entries_greek
930                 print "Number of compose sequence combinations requiring          :", counter_combinations
931                 print "  of which are for Greek                                   :", counter_combinations_greek
932                 print "Note: We do not include partial compositions, "
933                 print "thus the slight discrepancy in the figures"
934                 print
935
# When opt_unicodedatatxt is set, process UnicodeData.txt in verbose mode
# (one output line per counted entry instead of the summary statistics).
if opt_unicodedatatxt:
        process_unicodedata_file(True)
938
# When opt_statistics is set, print a report built from the counters
# gathered earlier in the script.
# NOTE(review): the literals 6, 5 and 2 below presumably stand for
# WIDTHOFCOMPOSETABLE+1, WIDTHOFCOMPOSETABLE and SIZEOFINT -- confirm before
# changing either constant.
if opt_statistics:
        print
        print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
        print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
        print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
        print "    of which have Multi_key                                :", counter_multikey
        print
        print "Algorithmic (stats for Xorg Compose file)"
        print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
        print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
        print "  of which are for Greek                                   :", num_algorithmic_greek
        print
        # Prints the UnicodeData.txt summary (non-verbose mode).
        process_unicodedata_file()
        print "Not algorithmic (stats from Xorg Compose file)"
        print "Number of sequences                                        :", len(xorg_compose_sequences)
        print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
        print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
        print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
        # NOTE(review): the percentage divides by len(xorg_compose_sequences) * 6,
        # which raises ZeroDivisionError when there are no sequences.
        print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
        print "Number of different first items                            :", num_first_keysyms
        print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
        print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
        print
        print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
        print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
        print
        # Fixed figures for the old implementation (691 sequences).
        print "Existing (old) implementation in GTK+"
        print "Number of sequences in old gtkimcontextsimple.c            :", 691
        print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"