Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # compose-parse.py, version 1.3
   5 #
   6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
   7 # the script produces statistics and information about the whole process, run with --help for more.
   8 #
   9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
  10 #
  11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
  12
  13 from re                 import findall, match, split, sub
  14 from string             import atoi
  15 from unicodedata        import normalize
  16 from urllib             import urlretrieve
  17 from os.path            import isfile, getsize
  18 from copy               import copy
  19
  20 import sys
  21 import getopt
  22
# We grab files off the web, left and right.
# Each URL is fetched by download_file(), which caches it in the current
# directory under the last '/'-separated component of the URL.
# Upstream X.Org compose table.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
# Markus Kuhn's table relating keysyms to Unicode characters.
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
# GDK keysym header; its '#define GDK_...' lines are parsed for keysym values.
URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h"
# NOTE(review): not referenced in the visible part of the script; presumably
# used by the --unicodedatatxt mode -- confirm.
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt'
# Optional local file of extra compose sequences; when it exists in the
# current directory its lines are appended to the upstream compose table.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Module-level lookup tables. The first two start empty and are replaced
# further down by the results of process_gdkkeysymsh() and
# process_keysymstxt() respectively.
keysymdatabase = {}
keysymunicodedatabase = {}
# NOTE(review): never filled in the visible part of the script.
unicodedatabase = {}
  40
# Text of the C header preamble: licence, provenance notes, include guard and
# the opening of the gtk_compose_seqs_compact[] array. headerfile_end (below)
# closes the array and the include guard.
# NOTE(review): the "Input" list in this text names the UNIDATA copy of
# UnicodeData.txt, while URL_UNICODEDATATXT points at the 5.0.0 release --
# confirm which one is intended.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 *
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Closes the array opened at the end of headerfile_start and the include guard.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
 115
 116 def stringtohex(str): return atoi(str, 16)
 117
 118 def factorial(n):
 119         if n <= 1:
 120                 return 1
 121         else:
 122                 return n * factorial(n-1)
 123
 124 def uniq(*args) :
 125         """ Performs a uniq operation on a list or lists """
 126         theInputList = []
 127         for theList in args:
 128            theInputList += theList
 129         theFinalList = []
 130         for elem in theInputList:
 131                 if elem not in theFinalList:
 132                         theFinalList.append(elem)
 133         return theFinalList
 134
 135
 136
 137 def all_permutations(seq):
 138         """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
 139         """ Produces all permutations of the items of a list """
 140         if len(seq) <=1:
 141             yield seq
 142         else:
 143             for perm in all_permutations(seq[1:]):
 144                 for i in range(len(perm)+1):
 145                     #nb str[0:1] works in both string and list contexts
 146                         yield perm[:i] + seq[0:1] + perm[i:]
 147
def usage():
        """Print the list of supported command-line options to standard output."""
        print """compose-parse available parameters:
        -h, --help              this craft
        -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
        -a, --algorithmic       show sequences saved with algorithmic optimisation
        -g, --gtk               show entries that go to GTK+
        -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
        -v, --verbose           show verbose output
        -p, --plane1            show plane1 compose sequences
        -n, --numeric           when used with --gtk, create file with numeric values only
        -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+

        Default is to show statistics.
        """
 162
 163 try:
 164         opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
 165                 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
 166 except:
 167         usage()
 168         sys.exit(2)
 169
 170 opt_statistics = False
 171 opt_algorithmic = False
 172 opt_gtk = False
 173 opt_unicodedatatxt = False
 174 opt_verbose = False
 175 opt_plane1 = False
 176 opt_numeric = False
 177 opt_gtkexpanded = False
 178
 179 for o, a in opts:
 180         if o in ("-h", "--help"):
 181                 usage()
 182                 sys.exit()
 183         if o in ("-s", "--statistics"):
 184                 opt_statistics = True
 185         if o in ("-a", "--algorithmic"):
 186                 opt_algorithmic = True
 187         if o in ("-g", "--gtk"):
 188                 opt_gtk = True
 189         if o in ("-u", "--unicodedatatxt"):
 190                 opt_unicodedatatxt = True
 191         if o in ("-v", "--verbose"):
 192                 opt_verbose = True
 193         if o in ("-p", "--plane1"):
 194                 opt_plane1 = True
 195         if o in ("-n", "--numeric"):
 196                 opt_numeric = True
 197         if o in ("-e", "--gtk-expanded"):
 198                 opt_gtkexpanded = True
 199
 200 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
 201         opt_statistics = True
 202
 203 def download_hook(blocks_transferred, block_size, file_size):
 204         """ A download hook to provide some feedback when downloading """
 205         if blocks_transferred == 0:
 206                 if file_size > 0:
 207                         if opt_verbose:
 208                                 print "Downloading", file_size, "bytes: ",
 209                 else:
 210                         if opt_verbose:
 211                                 print "Downloading: ",
 212         sys.stdout.write('#')
 213         sys.stdout.flush()
 214
 215
 216 def download_file(url):
 217         """ Downloads a file provided a URL. Returns the filename. """
 218         """ Borks on failure """
 219         localfilename = url.split('/')[-1]
 220         if not isfile(localfilename) or getsize(localfilename) <= 0:
 221                 if opt_verbose:
 222                         print "Downloading ", url, "..."
 223                 try:
 224                         urlretrieve(url, localfilename, download_hook)
 225                 except IOError, (errno, strerror):
 226                         print "I/O error(%s): %s" % (errno, strerror)
 227                         sys.exit(-1)
 228                 except:
 229                         print "Unexpected error: ", sys.exc_info()[0]
 230                         sys.exit(-1)
 231                 print " done."
 232         else:
 233                 if opt_verbose:
 234                         print "Using cached file for ", url
 235         return localfilename
 236
 237 def process_gdkkeysymsh():
 238         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
 239         """ Fills up keysymdb with contents """
 240         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
 241         try:
 242                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
 243         except IOError, (errno, strerror):
 244                 print "I/O error(%s): %s" % (errno, strerror)
 245                 sys.exit(-1)
 246         except:
 247                 print "Unexpected error: ", sys.exc_info()[0]
 248                 sys.exit(-1)
 249
 250         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
 251         linenum_gdkkeysymsh = 0
 252         keysymdb = {}
 253         for line in gdkkeysymsh.readlines():
 254                 linenum_gdkkeysymsh += 1
 255                 line = line.strip()
 256                 if line == "" or not match('^#define GDK_', line):
 257                         continue
 258                 components = split('\s+', line)
 259                 if len(components) < 3:
 260                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 261                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 262                         print "Was expecting 3 items in the line"
 263                         sys.exit(-1)
 264                 if not match('^GDK_', components[1]):
 265                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 266                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 267                         print "Was expecting a keysym starting with GDK_"
 268                         sys.exit(-1)
 269                 if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]):
 270                         unival = atoi(components[2][2:], 16)
 271                         if unival == 0:
 272                                 continue
 273                         keysymdb[components[1][4:]] = unival
 274                 else:
 275                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 276                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 277                         print "Was expecting a hexadecimal number at the end of the line"
 278                         sys.exit(-1)
 279         gdkkeysymsh.close()
 280
 281         """ Patch up the keysymdb with some of our own stuff """
 282
 283         """ This is for a missing keysym from the currently upstream file """
 284         keysymdb['dead_stroke'] = 0x338
 285
 286         """ This is for a missing keysym from the currently upstream file """
 287         ###keysymdb['dead_belowring'] = 0x323
 288         ###keysymdb['dead_belowmacron'] = 0x331
 289         ###keysymdb['dead_belowcircumflex'] = 0x32d
 290         ###keysymdb['dead_belowtilde'] = 0x330
 291         ###keysymdb['dead_belowbreve'] = 0x32e
 292         ###keysymdb['dead_belowdiaeresis'] = 0x324
 293
 294         """ This is^Wwas preferential treatment for Greek """
 295         # keysymdb['dead_tilde'] = 0x342
 296         """ This is^was preferential treatment for Greek """
 297         #keysymdb['combining_tilde'] = 0x342
 298
 299         """ Fixing VoidSymbol """
 300         keysymdb['VoidSymbol'] = 0xFFFF
 301
 302         return keysymdb
 303
 304 def process_keysymstxt():
 305         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
 306         """ This file keeps a record between keysyms <-> unicode chars """
 307         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
 308         try:
 309                 keysymstxt = open(filename_keysymstxt, 'r')
 310         except IOError, (errno, strerror):
 311                 print "I/O error(%s): %s" % (errno, strerror)
 312                 sys.exit(-1)
 313         except:
 314                 print "Unexpected error: ", sys.exc_info()[0]
 315                 sys.exit(-1)
 316
 317         """ Parse the keysyms.txt file and place content in  keysymdb """
 318         linenum_keysymstxt = 0
 319         keysymdb = {}
 320         for line in keysymstxt.readlines():
 321                 linenum_keysymstxt += 1
 322                 line = line.strip()
 323                 if line == "" or match('^#', line):
 324                         continue
 325                 components = split('\s+', line)
 326                 if len(components) < 5:
 327                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
 328                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
 329                         print "Was expecting 5 items in the line"
 330                         sys.exit(-1)
 331                 if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]):
 332                         unival = atoi(components[1][1:], 16)
 333                 if unival == 0:
 334                         continue
 335                 keysymdb[components[4]] = unival
 336         keysymstxt.close()
 337
 338         """ Patch up the keysymdb with some of our own stuff """
 339         """ This is for a missing keysym from the currently upstream file """
 340         ###keysymdb['dead_belowring'] = 0x323
 341         ###keysymdb['dead_belowmacron'] = 0x331
 342         ###keysymdb['dead_belowcircumflex'] = 0x32d
 343         ###keysymdb['dead_belowtilde'] = 0x330
 344         ###keysymdb['dead_belowbreve'] = 0x32e
 345         ###keysymdb['dead_belowdiaeresis'] = 0x324
 346
 347         """ This is preferential treatment for Greek """
 348         """ => we get more savings if used for Greek """
 349         # keysymdb['dead_tilde'] = 0x342
 350         """ This is preferential treatment for Greek """
 351         # keysymdb['combining_tilde'] = 0x342
 352
 353         """ This is for a missing keysym from Markus Kuhn's db """
 354         keysymdb['dead_stroke'] = 0x338
 355         """ This is for a missing keysym from Markus Kuhn's db """
 356         keysymdb['Oslash'] = 0x0d8
 357
 358         """ This is for a missing (recently added) keysym """
 359         keysymdb['dead_psili'] = 0x313
 360         """ This is for a missing (recently added) keysym """
 361         keysymdb['dead_dasia'] = 0x314
 362
 363         """ Allows to import Multi_key sequences """
 364         keysymdb['Multi_key'] = 0xff20
 365
 366         keysymdb['zerosubscript'] = 0x2080
 367         keysymdb['onesubscript'] = 0x2081
 368         keysymdb['twosubscript'] = 0x2082
 369         keysymdb['threesubscript'] = 0x2083
 370         keysymdb['foursubscript'] = 0x2084
 371         keysymdb['fivesubscript'] = 0x2085
 372         keysymdb['sixsubscript'] = 0x2086
 373         keysymdb['sevensubscript'] = 0x2087
 374         keysymdb['eightsubscript'] = 0x2088
 375         keysymdb['ninesubscript'] = 0x2089
 376
 377         return keysymdb
 378
 379 def keysymvalue(keysym, file = "n/a", linenum = 0):
 380         """ Extracts a value from the keysym """
 381         """ Find the value of keysym, using the data from keysyms """
 382         """ Use file and linenum to when reporting errors """
 383         if keysym == "":
 384                 return 0
 385         if keysymdatabase.has_key(keysym):
 386                 return keysymdatabase[keysym]
 387         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 388                 return atoi(keysym[1:], 16)
 389         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 390                 return atoi(keysym[2:], 16)
 391         else:
 392                 #print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 393                 return -1
 394                 #sys.exit(-1)
 395
 396 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
 397         """ Extracts a value from the keysym """
 398         """ Find the value of keysym, using the data from keysyms """
 399         """ Use file and linenum to when reporting errors """
 400         if keysym == "":
 401                 return 0
 402         if keysymunicodedatabase.has_key(keysym):
 403                 return keysymunicodedatabase[keysym]
 404         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 405                 return atoi(keysym[1:], 16)
 406         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 407                 return atoi(keysym[2:], 16)
 408         else:
 409                 print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 410                 sys.exit(-1)
 411
 412 def rename_combining(seq):
 413         filtered_sequence = []
 414         for ks in seq:
 415                 if findall('^combining_', ks):
 416                         filtered_sequence.append(sub('^combining_', 'dead_', ks))
 417                 else:
 418                         filtered_sequence.append(ks)
 419         return filtered_sequence
 420
 421
 422 keysymunicodedatabase = process_keysymstxt()
 423 keysymdatabase = process_gdkkeysymsh()
 424
 425 """ Grab and open the compose file from upstream """
 426 filename_compose = download_file(URL_COMPOSE)
 427 try:
 428         composefile = open(filename_compose, 'r')
 429 except IOError, (errno, strerror):
 430         print "I/O error(%s): %s" % (errno, strerror)
 431         sys.exit(-1)
 432 except:
 433         print "Unexpected error: ", sys.exc_info()[0]
 434         sys.exit(-1)
 435
 436 """ Look if there is a lookaside (supplementary) compose file in the current
 437     directory, and if so, open, then merge with upstream Compose file.
 438 """
 439 xorg_compose_sequences_raw = []
 440 for seq in composefile.readlines():
 441         xorg_compose_sequences_raw.append(seq)
 442
 443 try:
 444         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
 445         for seq in composefile_lookaside.readlines():
 446                 xorg_compose_sequences_raw.append(seq)
 447 except IOError, (errno, strerror):
 448         if opt_verbose:
 449                 print "I/O error(%s): %s" % (errno, strerror)
 450                 print "Did not find lookaside compose file. Continuing..."
 451 except:
 452         print "Unexpected error: ", sys.exc_info()[0]
 453         sys.exit(-1)
 454
 455 """ Parse the compose file in  xorg_compose_sequences"""
 456 xorg_compose_sequences = []
 457 xorg_compose_sequences_algorithmic = []
 458 linenum_compose = 0
 459 comment_nest_depth = 0
 460 for line in xorg_compose_sequences_raw:
 461         linenum_compose += 1
 462         line = line.strip()
 463         if match("^XCOMM", line) or match("^#", line):
 464                 continue
 465
 466         line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
 467
 468         comment_start = line.find("/*")
 469
 470         if comment_start >= 0:
 471                 if comment_nest_depth == 0:
 472                         line = line[:comment_start]
 473                 else:
 474                         line = ""
 475
 476                 comment_nest_depth += 1
 477         else:
 478                 comment_end = line.find("*/")
 479
 480                 if comment_end >= 0:
 481                         comment_nest_depth -= 1
 482
 483                 if comment_nest_depth < 0:
 484                         print "Invalid comment %(linenum_compose)d in %(filename)s: \
 485                         Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 486                         exit(-1)
 487
 488                 if comment_nest_depth > 0:
 489                         line = ""
 490                 else:
 491                         line = line[comment_end + 2:]
 492
 493         if line is "":
 494                 continue
 495
 496         #line = line[:-1]
 497         components = split(':', line)
 498         if len(components) != 2:
 499                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
 500                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 501                 exit(-1)
 502         (seq, val ) = split(':', line)
 503         seq = seq.strip()
 504         val = val.strip()
 505         raw_sequence = findall('\w+', seq)
 506         values = split('\s+', val)
 507         unichar_temp = split('"', values[0])
 508         unichar = unichar_temp[1]
 509         if len(values) == 1:
 510                 continue
 511         codepointstr = values[1]
 512         if values[1] == '#':
 513                 # No codepoints that are >1 characters yet.
 514                 continue
 515         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
 516                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
 517         if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]):
 518                 codepoint = atoi(codepointstr[1:], 16)
 519         elif keysymunicodedatabase.has_key(codepointstr):
 520                 if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
 521                         print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
 522                         print raw_sequence, codepointstr
 523                 codepoint = keysymunicodedatabase[codepointstr]
 524         else:
 525                 print
 526                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
 527                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
 528                 exit(-1)
 529         sequence = rename_combining(raw_sequence)
 530         reject_this = False
 531         for i in sequence:
 532                 if keysymvalue(i) > 0xFFFF:
 533                         reject_this = True
 534                         if opt_plane1:
 535                                 print sequence
 536                         break
 537                 if keysymvalue(i) < 0:
 538                         reject_this = True
 539                         break
 540         if reject_this:
 541                 continue
 542         if "U0342" in sequence or \
 543                 "U0313" in sequence or \
 544                 "U0314" in sequence or \
 545                 "0x0313" in sequence or \
 546                 "0x0342" in sequence or \
 547                 "0x0314" in sequence:
 548                 continue
 549         if "dead_belowring" in sequence or\
 550                 "dead_belowcomma" in sequence or\
 551                 "dead_belowmacron" in sequence or\
 552                 "dead_belowtilde" in sequence or\
 553                 "dead_belowbreve" in sequence or\
 554                 "dead_belowdiaeresis" in sequence or\
 555                 "dead_belowcircumflex" in sequence:
 556                 continue
 557         #for i in range(len(sequence)):
 558         #       if sequence[i] == "0x0342":
 559         #               sequence[i] = "dead_tilde"
 560         if "Multi_key" not in sequence:
 561                 """ Ignore for now >0xFFFF keysyms """
 562                 if codepoint < 0xFFFF:
 563                         original_sequence = copy(sequence)
 564                         stats_sequence = copy(sequence)
 565                         base = sequence.pop()
 566                         basechar = keysymvalue(base, filename_compose, linenum_compose)
 567
 568                         if basechar < 0xFFFF:
 569                                 counter = 1
 570                                 unisequence = []
 571                                 not_normalised = True
 572                                 skipping_this = False
 573                                 for i in range(0, len(sequence)):
 574                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
 575                                             because of lack of dead_perispomeni (i.e. conflict)
 576                                         """
 577                                         bc = basechar
 578                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 579                                                 skipping_this = True
 580                                                 break
 581                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 582                                                 skipping_this = True
 583                                                 break
 584                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 585                                                 skipping_this = True
 586                                                 break
 587                                         if sequence[-1] == "dead_psili":
 588                                                 sequence[i] = "dead_horn"
 589                                         if sequence[-1] == "dead_dasia":
 590                                                 sequence[-1] = "dead_ogonek"
 591                                         """
 592                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
 593
 594                                 if skipping_this:
 595                                         unisequence = []
 596                                 for perm in all_permutations(unisequence):
 597                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
 598                                         # print counter, map(unichr, perm)
 599                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
 600                                         if len(normalized) == 1:
 601                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
 602                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
 603                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
 604                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
 605                                                 stats_sequence_data.append(normalized)
 606                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
 607                                                 not_normalised = False
 608                                                 break;
 609                                         counter += 1
 610                                 if not_normalised:
 611                                         original_sequence.append(codepoint)
 612                                         xorg_compose_sequences.append(original_sequence)
 613                                         """ print xorg_compose_sequences[-1] """
 614
 615                         else:
 616                                 print "Error in base char !?!"
 617                                 exit(-2)
 618                 else:
 619                         print "OVER", sequence
 620                         exit(-1)
 621         else:
 622                 sequence.append(codepoint)
 623                 xorg_compose_sequences.append(sequence)
 624                 """ print xorg_compose_sequences[-1] """
 625
 626 def sequence_cmp(x, y):
 627         if keysymvalue(x[0]) > keysymvalue(y[0]):
 628                 return 1
 629         elif keysymvalue(x[0]) < keysymvalue(y[0]):
 630                 return -1
 631         elif len(x) > len(y):
 632                 return 1
 633         elif len(x) < len(y):
 634                 return -1
 635         elif keysymvalue(x[1]) > keysymvalue(y[1]):
 636                 return 1
 637         elif keysymvalue(x[1]) < keysymvalue(y[1]):
 638                 return -1
 639         elif len(x) < 4:
 640                 return 0
 641         elif keysymvalue(x[2]) > keysymvalue(y[2]):
 642                 return 1
 643         elif keysymvalue(x[2]) < keysymvalue(y[2]):
 644                 return -1
 645         elif len(x) < 5:
 646                 return 0
 647         elif keysymvalue(x[3]) > keysymvalue(y[3]):
 648                 return 1
 649         elif keysymvalue(x[3]) < keysymvalue(y[3]):
 650                 return -1
 651         elif len(x) < 6:
 652                 return 0
 653         elif keysymvalue(x[4]) > keysymvalue(y[4]):
 654                 return 1
 655         elif keysymvalue(x[4]) < keysymvalue(y[4]):
 656                 return -1
 657         else:
 658                 return 0
 659
 660 def sequence_unicode_cmp(x, y):
 661         if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
 662                 return 1
 663         elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
 664                 return -1
 665         elif len(x) > len(y):
 666                 return 1
 667         elif len(x) < len(y):
 668                 return -1
 669         elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
 670                 return 1
 671         elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
 672                 return -1
 673         elif len(x) < 4:
 674                 return 0
 675         elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
 676                 return 1
 677         elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
 678                 return -1
 679         elif len(x) < 5:
 680                 return 0
 681         elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
 682                 return 1
 683         elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
 684                 return -1
 685         elif len(x) < 6:
 686                 return 0
 687         elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
 688                 return 1
 689         elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
 690                 return -1
 691         else:
 692                 return 0
 693
 694 def sequence_algorithmic_cmp(x, y):
 695         if len(x) < len(y):
 696                 return -1
 697         elif len(x) > len(y):
 698                 return 1
 699         else:
 700                 for i in range(len(x)):
 701                         if x[i] < y[i]:
 702                                 return -1
 703                         elif x[i] > y[i]:
 704                                 return 1
 705         return 0
 706
 707
 708 xorg_compose_sequences.sort(sequence_cmp)
 709
 710 xorg_compose_sequences_uniqued = []
 711 first_time = True
 712 item = None
 713 for next_item in xorg_compose_sequences:
 714         if first_time:
 715                 first_time = False
 716                 item = next_item
 717         if sequence_unicode_cmp(item, next_item) != 0:
 718                 xorg_compose_sequences_uniqued.append(item)
 719         item = next_item
 720
 721 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
 722
 723 counter_multikey = 0
 724 for item in xorg_compose_sequences:
 725         if findall('Multi_key', "".join(item[:-1])) != []:
 726                 counter_multikey += 1
 727
 728 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
 729 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
 730
 731 firstitem = ""
 732 num_first_keysyms = 0
 733 zeroes = 0
 734 num_entries = 0
 735 num_algorithmic_greek = 0
 736 for sequence in xorg_compose_sequences:
 737         if keysymvalue(firstitem) != keysymvalue(sequence[0]):
 738                 firstitem = sequence[0]
 739                 num_first_keysyms += 1
 740         zeroes += 6 - len(sequence) + 1
 741         num_entries += 1
 742
 743 for sequence in xorg_compose_sequences_algorithmic_uniqued:
 744         ch = ord(sequence[-1:][0])
 745         if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 746                 num_algorithmic_greek += 1
 747
 748
 749 if opt_algorithmic:
 750         for sequence in xorg_compose_sequences_algorithmic_uniqued:
 751                 letter = "".join(sequence[-1:])
 752                 print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] },
 753                 for elem in sequence[:-2]:
 754                         print "<0x%(keysym)04X>," % { 'keysym': elem },
 755                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
 756                 print "], recomposed as", letter, "verified"
 757
 758 def num_of_keysyms(seq):
 759         return len(seq) - 1
 760
 761 def convert_UnotationToHex(arg):
 762         if isinstance(arg, str):
 763                 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
 764                         return sub('^U', '0x', arg)
 765         return arg
 766
 767 def addprefix_GDK(arg):
 768         if match('^0x', arg):
 769                 return '%(arg)s, ' % { 'arg': arg }
 770         else:
 771                 return 'GDK_%(arg)s, ' % { 'arg': arg }
 772
 773 if opt_gtk:
 774         first_keysym = ""
 775         sequence = []
 776         compose_table = []
 777         ct_second_part = []
 778         ct_sequence_width = 2
 779         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
 780         we_finished = False
 781         counter = 0
 782
 783         sequence_iterator = iter(xorg_compose_sequences)
 784         sequence = sequence_iterator.next()
 785         while True:
 786                 first_keysym = sequence[0]                                      # Set the first keysym
 787                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
 788                 while sequence[0] == first_keysym:
 789                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
 790                         try:
 791                                 sequence = sequence_iterator.next()
 792                         except StopIteration:
 793                                 we_finished = True
 794                                 break
 795                 if we_finished:
 796                         break
 797                 counter += 1
 798
 799         ct_index = start_offset
 800         for line_num in range(len(compose_table)):
 801                 for i in range(WIDTHOFCOMPOSETABLE):
 802                         occurences = compose_table[line_num][i+1]
 803                         compose_table[line_num][i+1] = ct_index
 804                         ct_index += occurences * (i+2)
 805
 806         for sequence in xorg_compose_sequences:
 807                 ct_second_part.append(map(convert_UnotationToHex, sequence))
 808
 809         print headerfile_start
 810         for i in compose_table:
 811                 if opt_gtkexpanded:
 812                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
 813                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
 814                 elif not match('^0x', i[0]):
 815                         print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
 816                 else:
 817                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
 818         for i in ct_second_part:
 819                 if opt_numeric:
 820                         for ks in i[1:][:-1]:
 821                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
 822                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
 823                         """
 824                         for ks in i[:-1]:
 825                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
 826                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
 827                         """
 828                 elif opt_gtkexpanded:
 829                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
 830                 else:
 831                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
 832         print headerfile_end
 833
 834 def redecompose(codepoint):
 835         (name, decomposition, combiningclass) = unicodedatabase[codepoint]
 836         if decomposition[0] == '' or decomposition[0] == '0':
 837                 return [codepoint]
 838         if match('<\w+>', decomposition[0]):
 839                 numdecomposition = map(stringtohex, decomposition[1:])
 840                 return map(redecompose, numdecomposition)
 841         numdecomposition = map(stringtohex, decomposition)
 842         return map(redecompose, numdecomposition)
 843
 844 def process_unicodedata_file(verbose = False):
 845         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
 846         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
 847         try:
 848                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
 849         except IOError, (errno, strerror):
 850                 print "I/O error(%s): %s" % (errno, strerror)
 851                 sys.exit(-1)
 852         except:
 853                 print "Unexpected error: ", sys.exc_info()[0]
 854                 sys.exit(-1)
 855         for line in unicodedatatxt.readlines():
 856                 if line[0] == "" or line[0] == '#':
 857                         continue
 858                 line = line[:-1]
 859                 uniproperties = split(';', line)
 860                 codepoint = stringtohex(uniproperties[0])
 861                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
 862                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
 863                         continue
 864                 name = uniproperties[1]
 865                 category = uniproperties[2]
 866                 combiningclass = uniproperties[3]
 867                 decomposition = uniproperties[5]
 868                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
 869
 870         counter_combinations = 0
 871         counter_combinations_greek = 0
 872         counter_entries = 0
 873         counter_entries_greek = 0
 874
 875         for item in unicodedatabase.keys():
 876                 (name, decomposition, combiningclass) = unicodedatabase[item]
 877                 if decomposition[0] == '':
 878                         continue
 879                         print name, "is empty"
 880                 elif match('<\w+>', decomposition[0]):
 881                         continue
 882                         print name, "has weird", decomposition[0]
 883                 else:
 884                         sequence = map(stringtohex, decomposition)
 885                         chrsequence = map(unichr, sequence)
 886                         normalized = normalize('NFC', "".join(chrsequence))
 887
 888                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
 889                         decomposedsequence = []
 890                         for subseq in map(redecompose, sequence):
 891                                 for seqitem in subseq:
 892                                         if isinstance(seqitem, list):
 893                                                 for i in seqitem:
 894                                                         if isinstance(i, list):
 895                                                                 for j in i:
 896                                                                         decomposedsequence.append(j)
 897                                                         else:
 898                                                                 decomposedsequence.append(i)
 899                                         else:
 900                                                 decomposedsequence.append(seqitem)
 901                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
 902                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
 903                                 counter_entries += 1
 904                                 counter_combinations += factorial(len(decomposedsequence)-1)
 905                                 ch = item
 906                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 907                                         counter_entries_greek += 1
 908                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
 909                                 if verbose:
 910                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
 911                                         print "[",
 912                                         for elem in decomposedsequence:
 913                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
 914                                         print "], recomposed as", recomposedchar,
 915                                         if unichr(item) == recomposedchar:
 916                                                 print "verified"
 917
 918         if verbose == False:
 919                 print "Unicode statistics from UnicodeData.txt"
 920                 print "Number of entries that can be algorithmically produced     :", counter_entries
 921                 print "  of which are for Greek                                   :", counter_entries_greek
 922                 print "Number of compose sequence combinations requiring          :", counter_combinations
 923                 print "  of which are for Greek                                   :", counter_combinations_greek
 924                 print "Note: We do not include partial compositions, "
 925                 print "thus the slight discrepancy in the figures"
 926                 print
 927
 928 if opt_unicodedatatxt:
 929         process_unicodedata_file(True)
 930
 931 if opt_statistics:
 932         print
 933         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
 934         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
 935         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
 936         print "    of which have Multi_key                                :", counter_multikey
 937         print
 938         print "Algorithmic (stats for Xorg Compose file)"
 939         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
 940         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
 941         print "  of which are for Greek                                   :", num_algorithmic_greek
 942         print
 943         process_unicodedata_file()
 944         print "Not algorithmic (stats from Xorg Compose file)"
 945         print "Number of sequences                                        :", len(xorg_compose_sequences)
 946         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
 947         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
 948         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
 949         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
 950         print "Number of different first items                            :", num_first_keysyms
 951         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
 952         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
 953         print
 954         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
 955         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
 956         print
 957         print "Existing (old) implementation in GTK+"
 958         print "Number of sequences in old gtkimcontextsimple.c            :", 691
 959         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"