Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # compose-parse.py, version 1.3
   5 #
   6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
   7 # the script produces statistics and information about the whole process, run with --help for more.
   8 #
   9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
  10 #
  11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
  12
  13 from re                 import findall, match, split, sub
  14 from string             import atoi
  15 from unicodedata        import normalize
  16 from urllib             import urlretrieve
  17 from os.path            import isfile, getsize
  18 from copy               import copy
  19
  20 import sys
  21 import getopt
  22
# We grab files off the web, left and right.
# Each URL is handed to download_file(), which stores the file in the current
# directory under the text after the URL's last '/' and reuses it on later runs.
# X.org compose sequence definitions (the main input of this script).
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
# Markus Kuhn's keysym <-> Unicode mapping, parsed by process_keysymstxt().
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
# GDK keysym definitions, parsed by process_gdkkeysymsh().
URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
# NOTE(review): pinned to Unicode 5.0.0, whereas the generated header text
# cites the unversioned UNIDATA location -- confirm which one is intended.
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Lookup tables; the empty placeholders are replaced once the input files
# have been parsed.
# keysym name -> keysym value, filled from gdkkeysyms.h.
keysymdatabase = {}
# keysym name -> Unicode code point, filled from keysyms.txt.
keysymunicodedatabase = {}
# NOTE(review): not populated anywhere in the code visible here; presumably
# filled from UnicodeData.txt further down -- confirm.
unicodedatabase = {}
  39
# Literal text for the top of the generated C header: licence, provenance
# notes, the include guard and the opening of the gtk_compose_seqs_compact
# array.  The text is emitted verbatim, so it is kept exactly as-is.
# NOTE(review): presumably written by the --gtk output code, which is not
# visible in this part of the file -- confirm.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 *
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Literal text for the bottom of the generated C header: closes the array
# opened by headerfile_start and ends the include guard.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
 114
 115 def stringtohex(str): return atoi(str, 16)
 116
 117 def factorial(n):
 118         if n <= 1:
 119                 return 1
 120         else:
 121                 return n * factorial(n-1)
 122
 123 def uniq(*args) :
 124         """ Performs a uniq operation on a list or lists """
 125         theInputList = []
 126         for theList in args:
 127            theInputList += theList
 128         theFinalList = []
 129         for elem in theInputList:
 130                 if elem not in theFinalList:
 131                         theFinalList.append(elem)
 132         return theFinalList
 133
 134
 135
 136 def all_permutations(seq):
 137         """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
 138         """ Produces all permutations of the items of a list """
 139         if len(seq) <=1:
 140             yield seq
 141         else:
 142             for perm in all_permutations(seq[1:]):
 143                 for i in range(len(perm)+1):
 144                     #nb str[0:1] works in both string and list contexts
 145                         yield perm[:i] + seq[0:1] + perm[i:]
 146
 147 def usage():
 148         print """compose-parse available parameters:
 149         -h, --help              this craft
 150         -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
 151         -a, --algorithmic       show sequences saved with algorithmic optimisation
 152         -g, --gtk               show entries that go to GTK+
 153         -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
 154         -v, --verbose           show verbose output
 155         -p, --plane1            show plane1 compose sequences
 156         -n, --numeric           when used with --gtk, create file with numeric values only
 157         -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+
 158
 159         Default is to show statistics.
 160         """
 161
 162 try:
 163         opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
 164                 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
 165 except:
 166         usage()
 167         sys.exit(2)
 168
 169 opt_statistics = False
 170 opt_algorithmic = False
 171 opt_gtk = False
 172 opt_unicodedatatxt = False
 173 opt_verbose = False
 174 opt_plane1 = False
 175 opt_numeric = False
 176 opt_gtkexpanded = False
 177
 178 for o, a in opts:
 179         if o in ("-h", "--help"):
 180                 usage()
 181                 sys.exit()
 182         if o in ("-s", "--statistics"):
 183                 opt_statistics = True
 184         if o in ("-a", "--algorithmic"):
 185                 opt_algorithmic = True
 186         if o in ("-g", "--gtk"):
 187                 opt_gtk = True
 188         if o in ("-u", "--unicodedatatxt"):
 189                 opt_unicodedatatxt = True
 190         if o in ("-v", "--verbose"):
 191                 opt_verbose = True
 192         if o in ("-p", "--plane1"):
 193                 opt_plane1 = True
 194         if o in ("-n", "--numeric"):
 195                 opt_numeric = True
 196         if o in ("-e", "--gtk-expanded"):
 197                 opt_gtkexpanded = True
 198
 199 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
 200         opt_statistics = True
 201
 202 def download_hook(blocks_transferred, block_size, file_size):
 203         """ A download hook to provide some feedback when downloading """
 204         if blocks_transferred == 0:
 205                 if file_size > 0:
 206                         if opt_verbose:
 207                                 print "Downloading", file_size, "bytes: ",
 208                 else:
 209                         if opt_verbose:
 210                                 print "Downloading: ",
 211         sys.stdout.write('#')
 212         sys.stdout.flush()
 213
 214
 215 def download_file(url):
 216         """ Downloads a file provided a URL. Returns the filename. """
 217         """ Borks on failure """
 218         localfilename = url.split('/')[-1]
 219         if not isfile(localfilename) or getsize(localfilename) <= 0:
 220                 if opt_verbose:
 221                         print "Downloading ", url, "..."
 222                 try:
 223                         urlretrieve(url, localfilename, download_hook)
 224                 except IOError, (errno, strerror):
 225                         print "I/O error(%s): %s" % (errno, strerror)
 226                         sys.exit(-1)
 227                 except:
 228                         print "Unexpected error: ", sys.exc_info()[0]
 229                         sys.exit(-1)
 230                 print " done."
 231         else:
 232                 if opt_verbose:
 233                         print "Using cached file for ", url
 234         return localfilename
 235
 236 def process_gdkkeysymsh():
 237         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
 238         """ Fills up keysymdb with contents """
 239         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
 240         try:
 241                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
 242         except IOError, (errno, strerror):
 243                 print "I/O error(%s): %s" % (errno, strerror)
 244                 sys.exit(-1)
 245         except:
 246                 print "Unexpected error: ", sys.exc_info()[0]
 247                 sys.exit(-1)
 248
 249         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
 250         linenum_gdkkeysymsh = 0
 251         keysymdb = {}
 252         for line in gdkkeysymsh.readlines():
 253                 linenum_gdkkeysymsh += 1
 254                 line = line.strip()
 255                 if line == "" or not match('^#define GDK_', line):
 256                         continue
 257                 components = split('\s+', line)
 258                 if len(components) < 3:
 259                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 260                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 261                         print "Was expecting 3 items in the line"
 262                         sys.exit(-1)
 263                 if not match('^GDK_', components[1]):
 264                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 265                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 266                         print "Was expecting a keysym starting with GDK_"
 267                         sys.exit(-1)
 268                 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
 269                         unival = atoi(components[2][2:], 16)
 270                         if unival == 0:
 271                                 continue
 272                         keysymdb[components[1][4:]] = unival
 273                 else:
 274                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 275                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 276                         print "Was expecting a hexadecimal number at the end of the line"
 277                         sys.exit(-1)
 278         gdkkeysymsh.close()
 279
 280         """ Patch up the keysymdb with some of our own stuff """
 281
 282         """ This is for a missing keysym from the currently upstream file """
 283         keysymdb['dead_stroke'] = 0x338
 284
 285         """ This is for a missing keysym from the currently upstream file """
 286         ###keysymdb['dead_belowring'] = 0x323
 287         ###keysymdb['dead_belowmacron'] = 0x331
 288         ###keysymdb['dead_belowcircumflex'] = 0x32d
 289         ###keysymdb['dead_belowtilde'] = 0x330
 290         ###keysymdb['dead_belowbreve'] = 0x32e
 291         ###keysymdb['dead_belowdiaeresis'] = 0x324
 292
 293         """ This is^Wwas preferential treatment for Greek """
 294         # keysymdb['dead_tilde'] = 0x342
 295         """ This is^was preferential treatment for Greek """
 296         #keysymdb['combining_tilde'] = 0x342
 297
 298         """ Fixing VoidSymbol """
 299         keysymdb['VoidSymbol'] = 0xFFFF
 300
 301         return keysymdb
 302
 303 def process_keysymstxt():
 304         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
 305         """ This file keeps a record between keysyms <-> unicode chars """
 306         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
 307         try:
 308                 keysymstxt = open(filename_keysymstxt, 'r')
 309         except IOError, (errno, strerror):
 310                 print "I/O error(%s): %s" % (errno, strerror)
 311                 sys.exit(-1)
 312         except:
 313                 print "Unexpected error: ", sys.exc_info()[0]
 314                 sys.exit(-1)
 315
 316         """ Parse the keysyms.txt file and place content in  keysymdb """
 317         linenum_keysymstxt = 0
 318         keysymdb = {}
 319         for line in keysymstxt.readlines():
 320                 linenum_keysymstxt += 1
 321                 line = line.strip()
 322                 if line == "" or match('^#', line):
 323                         continue
 324                 components = split('\s+', line)
 325                 if len(components) < 5:
 326                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
 327                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
 328                         print "Was expecting 5 items in the line"
 329                         sys.exit(-1)
 330                 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
 331                         unival = atoi(components[1][1:], 16)
 332                 if unival == 0:
 333                         continue
 334                 keysymdb[components[4]] = unival
 335         keysymstxt.close()
 336
 337         """ Patch up the keysymdb with some of our own stuff """
 338         """ This is for a missing keysym from the currently upstream file """
 339         ###keysymdb['dead_belowring'] = 0x323
 340         ###keysymdb['dead_belowmacron'] = 0x331
 341         ###keysymdb['dead_belowcircumflex'] = 0x32d
 342         ###keysymdb['dead_belowtilde'] = 0x330
 343         ###keysymdb['dead_belowbreve'] = 0x32e
 344         ###keysymdb['dead_belowdiaeresis'] = 0x324
 345
 346         """ This is preferential treatment for Greek """
 347         """ => we get more savings if used for Greek """
 348         # keysymdb['dead_tilde'] = 0x342
 349         """ This is preferential treatment for Greek """
 350         # keysymdb['combining_tilde'] = 0x342
 351
 352         """ This is for a missing keysym from Markus Kuhn's db """
 353         keysymdb['dead_stroke'] = 0x338
 354         """ This is for a missing keysym from Markus Kuhn's db """
 355         keysymdb['Oslash'] = 0x0d8
 356
 357         """ This is for a missing (recently added) keysym """
 358         keysymdb['dead_psili'] = 0x313
 359         """ This is for a missing (recently added) keysym """
 360         keysymdb['dead_dasia'] = 0x314
 361
 362         """ Allows to import Multi_key sequences """
 363         keysymdb['Multi_key'] = 0xff20
 364
 365         return keysymdb
 366
 367 def keysymvalue(keysym, file = "n/a", linenum = 0):
 368         """ Extracts a value from the keysym """
 369         """ Find the value of keysym, using the data from keysyms """
 370         """ Use file and linenum to when reporting errors """
 371         if keysym == "":
 372                 return 0
 373         if keysymdatabase.has_key(keysym):
 374                 return keysymdatabase[keysym]
 375         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 376                 return atoi(keysym[1:], 16)
 377         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 378                 return atoi(keysym[2:], 16)
 379         else:
 380                 #print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 381                 return -1
 382                 #sys.exit(-1)
 383
 384 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
 385         """ Extracts a value from the keysym """
 386         """ Find the value of keysym, using the data from keysyms """
 387         """ Use file and linenum to when reporting errors """
 388         if keysym == "":
 389                 return 0
 390         if keysymunicodedatabase.has_key(keysym):
 391                 return keysymunicodedatabase[keysym]
 392         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 393                 return atoi(keysym[1:], 16)
 394         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 395                 return atoi(keysym[2:], 16)
 396         else:
 397                 print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 398                 sys.exit(-1)
 399
 400 def rename_combining(seq):
 401         filtered_sequence = []
 402         for ks in seq:
 403                 if findall('^combining_', ks):
 404                         filtered_sequence.append(sub('^combining_', 'dead_', ks))
 405                 else:
 406                         filtered_sequence.append(ks)
 407         return filtered_sequence
 408
 409
 410 keysymunicodedatabase = process_keysymstxt()
 411 keysymdatabase = process_gdkkeysymsh()
 412
 413 """ Grab and open the compose file from upstream """
 414 filename_compose = download_file(URL_COMPOSE)
 415 try:
 416         composefile = open(filename_compose, 'r')
 417 except IOError, (errno, strerror):
 418         print "I/O error(%s): %s" % (errno, strerror)
 419         sys.exit(-1)
 420 except:
 421         print "Unexpected error: ", sys.exc_info()[0]
 422         sys.exit(-1)
 423
 424 """ Parse the compose file in  xorg_compose_sequences"""
 425 xorg_compose_sequences = []
 426 xorg_compose_sequences_algorithmic = []
 427 linenum_compose = 0
 428 for line in composefile.readlines():
 429         linenum_compose += 1
 430         line = line.strip()
 431         if line is "" or match("^XCOMM", line) or match("^#", line):
 432                 continue
 433
 434         line = line[:-1]
 435         components = split(':', line)
 436         if len(components) != 2:
 437                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
 438                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 439                 exit(-1)
 440         (seq, val ) = split(':', line)
 441         seq = seq.strip()
 442         val = val.strip()
 443         raw_sequence = findall('\w+', seq)
 444         values = split('\s+', val)
 445         unichar_temp = split('"', values[0])
 446         unichar = unichar_temp[1]
 447         if len(values) == 1:
 448                 continue
 449         codepointstr = values[1]
 450         if values[1] == '#':
 451                 # No codepoints that are >1 characters yet.
 452                 continue
 453         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
 454                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
 455         if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
 456                 codepoint = atoi(codepointstr[1:], 16)
 457         elif keysymunicodedatabase.has_key(codepointstr):
 458                 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
 459                         print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
 460                         print raw_sequence, codepointstr
 461                 codepoint = keysymunicodedatabase[codepointstr]
 462         else:
 463                 print
 464                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
 465                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
 466                 exit(-1)
 467         sequence = rename_combining(raw_sequence)
 468         reject_this = False
 469         for i in sequence:
 470                 if keysymvalue(i) > 0xFFFF:
 471                         reject_this = True
 472                         if opt_plane1:
 473                                 print sequence
 474                         break
 475                 if keysymvalue(i) < 0:
 476                         reject_this = True
 477                         break
 478         if reject_this:
 479                 continue
 480         if "U0342" in sequence or \
 481                 "U0313" in sequence or \
 482                 "U0314" in sequence or \
 483                 "0x0313" in sequence or \
 484                 "0x0342" in sequence or \
 485                 "0x0314" in sequence:
 486                 continue
 487         #for i in range(len(sequence)):
 488         #       if sequence[i] == "0x0342":
 489         #               sequence[i] = "dead_tilde"
 490         if "Multi_key" not in sequence:
 491                 """ Ignore for now >0xFFFF keysyms """
 492                 if codepoint < 0xFFFF:
 493                         original_sequence = copy(sequence)
 494                         stats_sequence = copy(sequence)
 495                         base = sequence.pop()
 496                         basechar = keysymvalue(base, filename_compose, linenum_compose)
 497
 498                         if basechar < 0xFFFF:
 499                                 counter = 1
 500                                 unisequence = []
 501                                 not_normalised = True
 502                                 skipping_this = False
 503                                 for i in range(0, len(sequence)):
 504                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
 505                                             because of lack of dead_perispomeni (i.e. conflict)
 506                                         """
 507                                         bc = basechar
 508                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 509                                                 skipping_this = True
 510                                                 break
 511                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 512                                                 skipping_this = True
 513                                                 break
 514                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 515                                                 skipping_this = True
 516                                                 break
 517                                         if sequence[-1] == "dead_psili":
 518                                                 sequence[i] = "dead_horn"
 519                                         if sequence[-1] == "dead_dasia":
 520                                                 sequence[-1] = "dead_ogonek"
 521                                         """
 522                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
 523
 524                                 if skipping_this:
 525                                         unisequence = []
 526                                 for perm in all_permutations(unisequence):
 527                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
 528                                         # print counter, map(unichr, perm)
 529                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
 530                                         if len(normalized) == 1:
 531                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
 532                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
 533                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
 534                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
 535                                                 stats_sequence_data.append(normalized)
 536                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
 537                                                 not_normalised = False
 538                                                 break;
 539                                         counter += 1
 540                                 if not_normalised:
 541                                         original_sequence.append(codepoint)
 542                                         xorg_compose_sequences.append(original_sequence)
 543                                         """ print xorg_compose_sequences[-1] """
 544
 545                         else:
 546                                 print "Error in base char !?!"
 547                                 exit(-2)
 548                 else:
 549                         print "OVER", sequence
 550                         exit(-1)
 551         else:
 552                 sequence.append(codepoint)
 553                 xorg_compose_sequences.append(sequence)
 554                 """ print xorg_compose_sequences[-1] """
 555
 556 def sequence_cmp(x, y):
 557         if keysymvalue(x[0]) > keysymvalue(y[0]):
 558                 return 1
 559         elif keysymvalue(x[0]) < keysymvalue(y[0]):
 560                 return -1
 561         elif len(x) > len(y):
 562                 return 1
 563         elif len(x) < len(y):
 564                 return -1
 565         elif keysymvalue(x[1]) > keysymvalue(y[1]):
 566                 return 1
 567         elif keysymvalue(x[1]) < keysymvalue(y[1]):
 568                 return -1
 569         elif len(x) < 4:
 570                 return 0
 571         elif keysymvalue(x[2]) > keysymvalue(y[2]):
 572                 return 1
 573         elif keysymvalue(x[2]) < keysymvalue(y[2]):
 574                 return -1
 575         elif len(x) < 5:
 576                 return 0
 577         elif keysymvalue(x[3]) > keysymvalue(y[3]):
 578                 return 1
 579         elif keysymvalue(x[3]) < keysymvalue(y[3]):
 580                 return -1
 581         elif len(x) < 6:
 582                 return 0
 583         elif keysymvalue(x[4]) > keysymvalue(y[4]):
 584                 return 1
 585         elif keysymvalue(x[4]) < keysymvalue(y[4]):
 586                 return -1
 587         else:
 588                 return 0
 589
 590 def sequence_unicode_cmp(x, y):
 591         if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
 592                 return 1
 593         elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
 594                 return -1
 595         elif len(x) > len(y):
 596                 return 1
 597         elif len(x) < len(y):
 598                 return -1
 599         elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
 600                 return 1
 601         elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
 602                 return -1
 603         elif len(x) < 4:
 604                 return 0
 605         elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
 606                 return 1
 607         elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
 608                 return -1
 609         elif len(x) < 5:
 610                 return 0
 611         elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
 612                 return 1
 613         elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
 614                 return -1
 615         elif len(x) < 6:
 616                 return 0
 617         elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
 618                 return 1
 619         elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
 620                 return -1
 621         else:
 622                 return 0
 623
 624 def sequence_algorithmic_cmp(x, y):
 625         if len(x) < len(y):
 626                 return -1
 627         elif len(x) > len(y):
 628                 return 1
 629         else:
 630                 for i in range(len(x)):
 631                         if x[i] < y[i]:
 632                                 return -1
 633                         elif x[i] > y[i]:
 634                                 return 1
 635         return 0
 636
 637
 638 xorg_compose_sequences.sort(sequence_cmp)
 639
 640 xorg_compose_sequences_uniqued = []
 641 first_time = True
 642 item = None
 643 for next_item in xorg_compose_sequences:
 644         if first_time:
 645                 first_time = False
 646                 item = next_item
 647         if sequence_unicode_cmp(item, next_item) != 0:
 648                 xorg_compose_sequences_uniqued.append(item)
 649         item = next_item
 650
 651 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
 652
# Count the table sequences that involve Multi_key.  The last element of each
# item is the integer result, so only the keysym names before it are joined
# and searched.
counter_multikey = 0
for item in xorg_compose_sequences:
        if findall('Multi_key', "".join(item[:-1])) != []:
                counter_multikey += 1

# Sort the algorithmic sequences and drop exact duplicates.
xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Gather figures about the table sequences.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
num_entries = 0
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences:
        # A change in the value of the leading keysym starts a new group; the
        # list was sorted by that value earlier on.
        if keysymvalue(firstitem) != keysymvalue(sequence[0]):
                firstitem = sequence[0]
                num_first_keysyms += 1
        # NOTE(review): presumably the number of padding cells this row needs;
        # the literal 6 looks like WIDTHOFCOMPOSETABLE + 1 -- confirm.
        zeroes += 6 - len(sequence) + 1
        num_entries += 1

# Count the algorithmic sequences whose composed character (the last element)
# lies in U+0370..U+03FF or U+1F00..U+1FFF.
for sequence in xorg_compose_sequences_algorithmic_uniqued:
        ch = ord(sequence[-1:][0])
        if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
                num_algorithmic_greek += 1
 677
 678
# When opt_algorithmic is set, print one line per de-duplicated
# algorithmic sequence.
if opt_algorithmic:
        for sequence in xorg_compose_sequences_algorithmic_uniqued:
                # The last element of the sequence is the resulting character.
                letter = "".join(sequence[-1:])
                # Code point and character first, then the element just before
                # the result; the trailing comma keeps the output on one line.
                print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
                # Then every remaining element, in order.
                for elem in sequence[:-2]:
                        print "<0x%(keysym)04X>," % { 'keysym': elem },
                """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
                print "], recomposed as", letter, "verified"
 687
 688 def num_of_keysyms(seq):
 689         return len(seq) - 1
 690
 691 def convert_UnotationToHex(arg):
 692         if isinstance(arg, str):
 693                 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
 694                         return sub('^U', '0x', arg)
 695         return arg
 696
 697 def addprefix_GDK(arg):
 698         if match('^0x', arg):
 699                 return '%(arg)s, ' % { 'arg': arg }
 700         else:
 701                 return 'GDK_%(arg)s, ' % { 'arg': arg }
 702
# When opt_gtk is set, print the compose table between headerfile_start
# and headerfile_end: first an index with one row per first keysym, then
# the sequences themselves.
if opt_gtk:
        first_keysym = ""
        sequence = []
        compose_table = []
        ct_second_part = []
        ct_sequence_width = 2
        # The index has WIDTHOFCOMPOSETABLE+1 cells per first keysym; offsets
        # handed out below start counting right after that many cells.
        start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
        we_finished = False
        counter = 0

        # Pass 1: build one compose_table row per run of consecutive sequences
        # sharing the same first keysym.  Within a row, the cell at index
        # (number of keysyms - 1) counts the sequences of that length.
        sequence_iterator = iter(xorg_compose_sequences)
        # NOTE: raises StopIteration if xorg_compose_sequences is empty.
        sequence = sequence_iterator.next()
        while True:
                first_keysym = sequence[0]                                      # Set the first keysym
                compose_table.append([first_keysym, 0, 0, 0, 0, 0])
                while sequence[0] == first_keysym:
                        compose_table[counter][num_of_keysyms(sequence)-1] += 1
                        try:
                                sequence = sequence_iterator.next()
                        except StopIteration:
                                # Input exhausted: leave both loops.
                                we_finished = True
                                break
                if we_finished:
                        break
                counter += 1

        # Pass 2: replace every count with a running offset.  Cell i+1 holds
        # the count of sequences with i+2 keysyms, and each of those advances
        # the offset by i+2 (in the default output a sequence is printed as
        # its keysyms minus the first one, plus the code point).
        ct_index = start_offset
        for line_num in range(len(compose_table)):
                for i in range(WIDTHOFCOMPOSETABLE):
                        occurences = compose_table[line_num][i+1]
                        compose_table[line_num][i+1] = ct_index
                        ct_index += occurences * (i+2)

        # Second part of the table: the sequences, with UXXXX names rewritten
        # as 0xXXXX by convert_UnotationToHex.
        for sequence in xorg_compose_sequences:
                ct_second_part.append(map(convert_UnotationToHex, sequence))

        print headerfile_start
        # Index rows: first keysym followed by its offsets.
        for i in compose_table:
                if opt_gtkexpanded:
                        # Expanded form prints the first keysym as a number.
                        print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
                elif not match('^0x', i[0]):
                        # Named keysym: the whole row is printed with a GDK_ prefix.
                        print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
                else:
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        # Sequence rows; the last element of each row is printed as the code point.
        for i in ct_second_part:
                if opt_numeric:
                        # Numeric form: keysym values without the first keysym.
                        for ks in i[1:][:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        """
                        for ks in i[:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        """
                elif opt_gtkexpanded:
                        # Expanded form keeps the first keysym in every row.
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
                else:
                        # Default form omits the first keysym of every row.
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
        print headerfile_end
 763
 764 def redecompose(codepoint):
 765         (name, decomposition, combiningclass) = unicodedatabase[codepoint]
 766         if decomposition[0] == '' or decomposition[0] == '0':
 767                 return [codepoint]
 768         if match('<\w+>', decomposition[0]):
 769                 numdecomposition = map(stringtohex, decomposition[1:])
 770                 return map(redecompose, numdecomposition)
 771         numdecomposition = map(stringtohex, decomposition)
 772         return map(redecompose, numdecomposition)
 773
 774 def process_unicodedata_file(verbose = False):
 775         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
 776         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
 777         try:
 778                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
 779         except IOError, (errno, strerror):
 780                 print "I/O error(%s): %s" % (errno, strerror)
 781                 sys.exit(-1)
 782         except:
 783                 print "Unexpected error: ", sys.exc_info()[0]
 784                 sys.exit(-1)
 785         for line in unicodedatatxt.readlines():
 786                 if line[0] == "" or line[0] == '#':
 787                         continue
 788                 line = line[:-1]
 789                 uniproperties = split(';', line)
 790                 codepoint = stringtohex(uniproperties[0])
 791                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
 792                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
 793                         continue
 794                 name = uniproperties[1]
 795                 category = uniproperties[2]
 796                 combiningclass = uniproperties[3]
 797                 decomposition = uniproperties[5]
 798                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
 799
 800         counter_combinations = 0
 801         counter_combinations_greek = 0
 802         counter_entries = 0
 803         counter_entries_greek = 0
 804
 805         for item in unicodedatabase.keys():
 806                 (name, decomposition, combiningclass) = unicodedatabase[item]
 807                 if decomposition[0] == '':
 808                         continue
 809                         print name, "is empty"
 810                 elif match('<\w+>', decomposition[0]):
 811                         continue
 812                         print name, "has weird", decomposition[0]
 813                 else:
 814                         sequence = map(stringtohex, decomposition)
 815                         chrsequence = map(unichr, sequence)
 816                         normalized = normalize('NFC', "".join(chrsequence))
 817
 818                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
 819                         decomposedsequence = []
 820                         for subseq in map(redecompose, sequence):
 821                                 for seqitem in subseq:
 822                                         if isinstance(seqitem, list):
 823                                                 for i in seqitem:
 824                                                         if isinstance(i, list):
 825                                                                 for j in i:
 826                                                                         decomposedsequence.append(j)
 827                                                         else:
 828                                                                 decomposedsequence.append(i)
 829                                         else:
 830                                                 decomposedsequence.append(seqitem)
 831                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
 832                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
 833                                 counter_entries += 1
 834                                 counter_combinations += factorial(len(decomposedsequence)-1)
 835                                 ch = item
 836                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 837                                         counter_entries_greek += 1
 838                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
 839                                 if verbose:
 840                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
 841                                         print "[",
 842                                         for elem in decomposedsequence:
 843                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
 844                                         print "], recomposed as", recomposedchar,
 845                                         if unichr(item) == recomposedchar:
 846                                                 print "verified"
 847
 848         if verbose == False:
 849                 print "Unicode statistics from UnicodeData.txt"
 850                 print "Number of entries that can be algorithmically produced     :", counter_entries
 851                 print "  of which are for Greek                                   :", counter_entries_greek
 852                 print "Number of compose sequence combinations requiring          :", counter_combinations
 853                 print "  of which are for Greek                                   :", counter_combinations_greek
 854                 print "Note: We do not include partial compositions, "
 855                 print "thus the slight discrepancy in the figures"
 856                 print
 857
# When opt_unicodedatatxt is set, run the UnicodeData.txt pass in verbose
# mode (one output line per entry instead of the summary).
if opt_unicodedatatxt:
        process_unicodedata_file(True)

# When opt_statistics is set, print the statistics report built from the
# counters computed earlier in the script.
if opt_statistics:
        print
        print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
        print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
        print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
        print "    of which have Multi_key                                :", counter_multikey
        print
        print "Algorithmic (stats for Xorg Compose file)"
        print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
        print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
        print "  of which are for Greek                                   :", num_algorithmic_greek
        print
        # Non-verbose call: prints the UnicodeData.txt summary at this point
        # of the report.
        process_unicodedata_file()
        print "Not algorithmic (stats from Xorg Compose file)"
        print "Number of sequences                                        :", len(xorg_compose_sequences)
        print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
        print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
        print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
        # NOTE: the percentage divides by the number of sequences, so an
        # empty xorg_compose_sequences raises ZeroDivisionError here.
        print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
        print "Number of different first items                            :", num_first_keysyms
        print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
        print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
        print
        print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
        print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
        print
        # The figure 691 below is hard-coded in the report.
        print "Existing (old) implementation in GTK+"
        print "Number of sequences in old gtkimcontextsimple.c            :", 691
        print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"