Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # compose-parse.py, version 1.3
   5 #
   6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
   7 # the script produces statistics and information about the whole process, run with --help for more.
   8 #
   9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
  10 #
  11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
  12
  13 from re                 import findall, match, split, sub
  14 from string             import atoi
  15 from unicodedata        import normalize
  16 from urllib             import urlretrieve
  17 from os.path            import isfile, getsize
  18 from copy               import copy
  19
  20 import sys
  21 import getopt
  22
  23 # We grab files off the web, left and right.
  24 URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
  25 URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
  26 URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
  27 URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
  28 FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'
  29
  30 # We currently support keysyms of size 2; once upstream xorg gets sorted,
  31 # we might produce some tables with size 2 and some with size 4.
  32 SIZEOFINT = 2
  33
  34 # Current max compose sequence length; in case it gets increased.
  35 WIDTHOFCOMPOSETABLE = 5
  36
  37 keysymdatabase = {}
  38 keysymunicodedatabase = {}
  39 unicodedatabase = {}
  40
  41 headerfile_start = """/* GTK - The GIMP Tool Kit
  42  * Copyright (C) 2007, 2008 GNOME Foundation
  43  *
  44  * This library is free software; you can redistribute it and/or
  45  * modify it under the terms of the GNU Lesser General Public
  46  * License as published by the Free Software Foundation; either
  47  * version 2 of the License, or (at your option) any later version.
  48  *
  49  * This library is distributed in the hope that it will be useful,
  50  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  51  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  52  * Lesser General Public License for more details.
  53  *
  54  * You should have received a copy of the GNU Lesser General Public
  55  * License along with this library. If not, see see <http://www.gnu.org/licenses/>.
  56  */
  57
  58 /*
  59  * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
  60  * using the input files
  61  *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
  62  *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
  63  *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
  64  *
  65  * This table is optimised for space and requires special handling to access the content.
  66  * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
  67  *
  68  * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
  69  * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
  70  */
  71
  72 /*
  73  * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
  74  * file for a list of people on the GTK+ Team.  See the ChangeLog
  75  * files for a list of changes.  These files are distributed with
  76  * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
  77  */
  78
  79 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
  80 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
  81
  82 /* === These are the original comments of the file; we keep for historical purposes ===
  83  *
  84  * The following table was generated from the X compose tables include with
  85  * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
  86  * to obtain the relevant perl scripts.
  87  *
  88  * The following compose letter letter sequences confliced
  89  *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
  90  *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
  91  *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
  92  *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
  93  *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
  94  *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
  95  *
  96  * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
  97  *   spanish. atilde and otilde are used at least for Portuguese ]
  98  *
  99  *   at and Aring; resolved to Aring                                          [ AA ]
 100  *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 101  *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 102  *
 103  * This probably should be resolved by first checking an additional set of compose tables
 104  * that depend on the locale or selected input method.
 105  */
 106
 107 static const guint16 gtk_compose_seqs_compact[] = {"""
 108
 109 headerfile_end = """};
 110
 111 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
 112 """
 113
 114 def stringtohex(str): return atoi(str, 16)
 115
 116 def factorial(n):
 117         if n <= 1:
 118                 return 1
 119         else:
 120                 return n * factorial(n-1)
 121
 122 def uniq(*args) :
 123         """ Performs a uniq operation on a list or lists """
 124         theInputList = []
 125         for theList in args:
 126            theInputList += theList
 127         theFinalList = []
 128         for elem in theInputList:
 129                 if elem not in theFinalList:
 130                         theFinalList.append(elem)
 131         return theFinalList
 132
 133
 134
 135 def all_permutations(seq):
 136         """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
 137         """ Produces all permutations of the items of a list """
 138         if len(seq) <=1:
 139             yield seq
 140         else:
 141             for perm in all_permutations(seq[1:]):
 142                 for i in range(len(perm)+1):
 143                     #nb str[0:1] works in both string and list contexts
 144                         yield perm[:i] + seq[0:1] + perm[i:]
 145
 146 def usage():
 147         print """compose-parse available parameters:
 148         -h, --help              this craft
 149         -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
 150         -a, --algorithmic       show sequences saved with algorithmic optimisation
 151         -g, --gtk               show entries that go to GTK+
 152         -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
 153         -v, --verbose           show verbose output
 154         -p, --plane1            show plane1 compose sequences
 155         -n, --numeric           when used with --gtk, create file with numeric values only
 156         -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+
 157
 158         Default is to show statistics.
 159         """
 160
 161 try:
 162         opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
 163                 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
 164 except:
 165         usage()
 166         sys.exit(2)
 167
 168 opt_statistics = False
 169 opt_algorithmic = False
 170 opt_gtk = False
 171 opt_unicodedatatxt = False
 172 opt_verbose = False
 173 opt_plane1 = False
 174 opt_numeric = False
 175 opt_gtkexpanded = False
 176
 177 for o, a in opts:
 178         if o in ("-h", "--help"):
 179                 usage()
 180                 sys.exit()
 181         if o in ("-s", "--statistics"):
 182                 opt_statistics = True
 183         if o in ("-a", "--algorithmic"):
 184                 opt_algorithmic = True
 185         if o in ("-g", "--gtk"):
 186                 opt_gtk = True
 187         if o in ("-u", "--unicodedatatxt"):
 188                 opt_unicodedatatxt = True
 189         if o in ("-v", "--verbose"):
 190                 opt_verbose = True
 191         if o in ("-p", "--plane1"):
 192                 opt_plane1 = True
 193         if o in ("-n", "--numeric"):
 194                 opt_numeric = True
 195         if o in ("-e", "--gtk-expanded"):
 196                 opt_gtkexpanded = True
 197
 198 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
 199         opt_statistics = True
 200
 201 def download_hook(blocks_transferred, block_size, file_size):
 202         """ A download hook to provide some feedback when downloading """
 203         if blocks_transferred == 0:
 204                 if file_size > 0:
 205                         if opt_verbose:
 206                                 print "Downloading", file_size, "bytes: ",
 207                 else:
 208                         if opt_verbose:
 209                                 print "Downloading: ",
 210         sys.stdout.write('#')
 211         sys.stdout.flush()
 212
 213
 214 def download_file(url):
 215         """ Downloads a file provided a URL. Returns the filename. """
 216         """ Borks on failure """
 217         localfilename = url.split('/')[-1]
 218         if not isfile(localfilename) or getsize(localfilename) <= 0:
 219                 if opt_verbose:
 220                         print "Downloading ", url, "..."
 221                 try:
 222                         urlretrieve(url, localfilename, download_hook)
 223                 except IOError, (errno, strerror):
 224                         print "I/O error(%s): %s" % (errno, strerror)
 225                         sys.exit(-1)
 226                 except:
 227                         print "Unexpected error: ", sys.exc_info()[0]
 228                         sys.exit(-1)
 229                 print " done."
 230         else:
 231                 if opt_verbose:
 232                         print "Using cached file for ", url
 233         return localfilename
 234
 235 def process_gdkkeysymsh():
 236         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
 237         """ Fills up keysymdb with contents """
 238         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
 239         try:
 240                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
 241         except IOError, (errno, strerror):
 242                 print "I/O error(%s): %s" % (errno, strerror)
 243                 sys.exit(-1)
 244         except:
 245                 print "Unexpected error: ", sys.exc_info()[0]
 246                 sys.exit(-1)
 247
 248         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
 249         linenum_gdkkeysymsh = 0
 250         keysymdb = {}
 251         for line in gdkkeysymsh.readlines():
 252                 linenum_gdkkeysymsh += 1
 253                 line = line.strip()
 254                 if line == "" or not match('^#define GDK_KEY_', line):
 255                         continue
 256                 components = split('\s+', line)
 257                 if len(components) < 3:
 258                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 259                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 260                         print "Was expecting 3 items in the line"
 261                         sys.exit(-1)
 262                 if not match('^GDK_KEY_', components[1]):
 263                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 264                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 265                         print "Was expecting a keysym starting with GDK_KEY_"
 266                         sys.exit(-1)
 267                 if match('^0x[0-9a-fA-F]+$', components[2]):
 268                         unival = long(components[2][2:], 16)
 269                         if unival == 0:
 270                                 continue
 271                         keysymdb[components[1][8:]] = unival
 272                 else:
 273                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 274                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 275                         print "Was expecting a hexadecimal number at the end of the line"
 276                         sys.exit(-1)
 277         gdkkeysymsh.close()
 278
 279         """ Patch up the keysymdb with some of our own stuff """
 280
 281         """ This is for a missing keysym from the currently upstream file """
 282         keysymdb['dead_stroke'] = 0x338
 283
 284         """ This is for a missing keysym from the currently upstream file """
 285         ###keysymdb['dead_belowring'] = 0x323
 286         ###keysymdb['dead_belowmacron'] = 0x331
 287         ###keysymdb['dead_belowcircumflex'] = 0x32d
 288         ###keysymdb['dead_belowtilde'] = 0x330
 289         ###keysymdb['dead_belowbreve'] = 0x32e
 290         ###keysymdb['dead_belowdiaeresis'] = 0x324
 291
 292         """ This is^Wwas preferential treatment for Greek """
 293         # keysymdb['dead_tilde'] = 0x342
 294         """ This is^was preferential treatment for Greek """
 295         #keysymdb['combining_tilde'] = 0x342
 296
 297         """ Fixing VoidSymbol """
 298         keysymdb['VoidSymbol'] = 0xFFFF
 299
 300         return keysymdb
 301
 302 def process_keysymstxt():
 303         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
 304         """ This file keeps a record between keysyms <-> unicode chars """
 305         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
 306         try:
 307                 keysymstxt = open(filename_keysymstxt, 'r')
 308         except IOError, (errno, strerror):
 309                 print "I/O error(%s): %s" % (errno, strerror)
 310                 sys.exit(-1)
 311         except:
 312                 print "Unexpected error: ", sys.exc_info()[0]
 313                 sys.exit(-1)
 314
 315         """ Parse the keysyms.txt file and place content in  keysymdb """
 316         linenum_keysymstxt = 0
 317         keysymdb = {}
 318         for line in keysymstxt.readlines():
 319                 linenum_keysymstxt += 1
 320                 line = line.strip()
 321                 if line == "" or match('^#', line):
 322                         continue
 323                 components = split('\s+', line)
 324                 if len(components) < 5:
 325                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
 326                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
 327                         print "Was expecting 5 items in the line"
 328                         sys.exit(-1)
 329                 if match('^U[0-9a-fA-F]+$', components[1]):
 330                         unival = long(components[1][1:], 16)
 331                 if unival == 0:
 332                         continue
 333                 keysymdb[components[4]] = unival
 334         keysymstxt.close()
 335
 336         """ Patch up the keysymdb with some of our own stuff """
 337         """ This is for a missing keysym from the currently upstream file """
 338         ###keysymdb['dead_belowring'] = 0x323
 339         ###keysymdb['dead_belowmacron'] = 0x331
 340         ###keysymdb['dead_belowcircumflex'] = 0x32d
 341         ###keysymdb['dead_belowtilde'] = 0x330
 342         ###keysymdb['dead_belowbreve'] = 0x32e
 343         ###keysymdb['dead_belowdiaeresis'] = 0x324
 344
 345         """ This is preferential treatment for Greek """
 346         """ => we get more savings if used for Greek """
 347         # keysymdb['dead_tilde'] = 0x342
 348         """ This is preferential treatment for Greek """
 349         # keysymdb['combining_tilde'] = 0x342
 350
 351         """ This is for a missing keysym from Markus Kuhn's db """
 352         keysymdb['dead_stroke'] = 0x338
 353         """ This is for a missing keysym from Markus Kuhn's db """
 354         keysymdb['Oslash'] = 0x0d8
 355         """ This is for a missing keysym from Markus Kuhn's db """
 356         keysymdb['Ssharp'] = 0x1e9e
 357
 358         """ This is for a missing (recently added) keysym """
 359         keysymdb['dead_psili'] = 0x313
 360         """ This is for a missing (recently added) keysym """
 361         keysymdb['dead_dasia'] = 0x314
 362
 363         """ Allows to import Multi_key sequences """
 364         keysymdb['Multi_key'] = 0xff20
 365
 366         keysymdb['zerosubscript'] = 0x2080
 367         keysymdb['onesubscript'] = 0x2081
 368         keysymdb['twosubscript'] = 0x2082
 369         keysymdb['threesubscript'] = 0x2083
 370         keysymdb['foursubscript'] = 0x2084
 371         keysymdb['fivesubscript'] = 0x2085
 372         keysymdb['sixsubscript'] = 0x2086
 373         keysymdb['sevensubscript'] = 0x2087
 374         keysymdb['eightsubscript'] = 0x2088
 375         keysymdb['ninesubscript'] = 0x2089
 376         keysymdb['dead_doublegrave'] = 0x030F
 377         keysymdb['dead_invertedbreve'] = 0x0311
 378
 379         return keysymdb
 380
 381 def keysymvalue(keysym, file = "n/a", linenum = 0):
 382         """ Extracts a value from the keysym """
 383         """ Find the value of keysym, using the data from keysyms """
 384         """ Use file and linenum to when reporting errors """
 385         if keysym == "":
 386                 return 0
 387         if keysymdatabase.has_key(keysym):
 388                 return keysymdatabase[keysym]
 389         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 390                 return atoi(keysym[1:], 16)
 391         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 392                 return atoi(keysym[2:], 16)
 393         else:
 394                 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 395                 #return -1
 396                 sys.exit(-1)
 397
 398 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
 399         """ Extracts a value from the keysym """
 400         """ Find the value of keysym, using the data from keysyms """
 401         """ Use file and linenum to when reporting errors """
 402         if keysym == "":
 403                 return 0
 404         if keysymunicodedatabase.has_key(keysym):
 405                 return keysymunicodedatabase[keysym]
 406         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 407                 return atoi(keysym[1:], 16)
 408         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 409                 return atoi(keysym[2:], 16)
 410         else:
 411                 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 412                 sys.exit(-1)
 413
 414 def rename_combining(seq):
 415         filtered_sequence = []
 416         for ks in seq:
 417                 if findall('^combining_', ks):
 418                         ks = sub('^combining_', 'dead_', ks)
 419                 if ks == 'dead_double_grave':
 420                         ks = 'dead_doublegrave'
 421                 if ks == 'dead_inverted_breve':
 422                         ks = 'dead_invertedbreve'
 423                 filtered_sequence.append(ks)
 424         return filtered_sequence
 425
 426
 427 keysymunicodedatabase = process_keysymstxt()
 428 keysymdatabase = process_gdkkeysymsh()
 429
 430 """ Grab and open the compose file from upstream """
 431 filename_compose = download_file(URL_COMPOSE)
 432 try:
 433         composefile = open(filename_compose, 'r')
 434 except IOError, (errno, strerror):
 435         print "I/O error(%s): %s" % (errno, strerror)
 436         sys.exit(-1)
 437 except:
 438         print "Unexpected error: ", sys.exc_info()[0]
 439         sys.exit(-1)
 440
 441 """ Look if there is a lookaside (supplementary) compose file in the current
 442     directory, and if so, open, then merge with upstream Compose file.
 443 """
 444 xorg_compose_sequences_raw = []
 445 for seq in composefile.readlines():
 446         xorg_compose_sequences_raw.append(seq)
 447
 448 try:
 449         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
 450         for seq in composefile_lookaside.readlines():
 451                 xorg_compose_sequences_raw.append(seq)
 452 except IOError, (errno, strerror):
 453         if opt_verbose:
 454                 print "I/O error(%s): %s" % (errno, strerror)
 455                 print "Did not find lookaside compose file. Continuing..."
 456 except:
 457         print "Unexpected error: ", sys.exc_info()[0]
 458         sys.exit(-1)
 459
 460 """ Parse the compose file in  xorg_compose_sequences"""
 461 xorg_compose_sequences = []
 462 xorg_compose_sequences_algorithmic = []
 463 linenum_compose = 0
 464 comment_nest_depth = 0
 465 for line in xorg_compose_sequences_raw:
 466         linenum_compose += 1
 467         line = line.strip()
 468         if match("^XCOMM", line) or match("^#", line):
 469                 continue
 470
 471         line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
 472
 473         comment_start = line.find("/*")
 474
 475         if comment_start >= 0:
 476                 if comment_nest_depth == 0:
 477                         line = line[:comment_start]
 478                 else:
 479                         line = ""
 480
 481                 comment_nest_depth += 1
 482         else:
 483                 comment_end = line.find("*/")
 484
 485                 if comment_end >= 0:
 486                         comment_nest_depth -= 1
 487
 488                 if comment_nest_depth < 0:
 489                         print "Invalid comment %(linenum_compose)d in %(filename)s: \
 490                         Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 491                         exit(-1)
 492
 493                 if comment_nest_depth > 0:
 494                         line = ""
 495                 else:
 496                         line = line[comment_end + 2:]
 497
 498         if line is "":
 499                 continue
 500
 501         #line = line[:-1]
 502         components = split(':', line)
 503         if len(components) != 2:
 504                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
 505                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 506                 exit(-1)
 507         (seq, val ) = split(':', line)
 508         seq = seq.strip()
 509         val = val.strip()
 510         raw_sequence = findall('\w+', seq)
 511         values = split('\s+', val)
 512         unichar_temp = split('"', values[0])
 513         unichar = unichar_temp[1]
 514         if len(values) == 1:
 515                 continue
 516         codepointstr = values[1]
 517         if values[1] == '#':
 518                 # No codepoints that are >1 characters yet.
 519                 continue
 520         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
 521                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
 522         if  match('^U[0-9a-fA-F]+$', codepointstr):
 523                 codepoint = long(codepointstr[1:], 16)
 524         elif keysymunicodedatabase.has_key(codepointstr):
 525                 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
 526                         #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
 527                         #print raw_sequence, codepointstr
 528                 codepoint = keysymunicodedatabase[codepointstr]
 529         else:
 530                 print
 531                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
 532                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
 533                 exit(-1)
 534         sequence = rename_combining(raw_sequence)
 535         reject_this = False
 536         for i in sequence:
 537                 if keysymvalue(i) > 0xFFFF:
 538                         reject_this = True
 539                         if opt_plane1:
 540                                 print sequence
 541                         break
 542                 if keysymvalue(i) < 0:
 543                         reject_this = True
 544                         break
 545         if reject_this:
 546                 continue
 547         if "U0342" in sequence or \
 548                 "U0313" in sequence or \
 549                 "U0314" in sequence or \
 550                 "0x0313" in sequence or \
 551                 "0x0342" in sequence or \
 552                 "0x0314" in sequence:
 553                 continue
 554         if "dead_belowring" in sequence or\
 555                 "dead_currency" in sequence or\
 556                 "dead_belowcomma" in sequence or\
 557                 "dead_belowmacron" in sequence or\
 558                 "dead_belowtilde" in sequence or\
 559                 "dead_belowbreve" in sequence or\
 560                 "dead_belowdiaeresis" in sequence or\
 561                 "dead_belowcircumflex" in sequence:
 562                 continue
 563         #for i in range(len(sequence)):
 564         #       if sequence[i] == "0x0342":
 565         #               sequence[i] = "dead_tilde"
 566         if "Multi_key" not in sequence:
 567                 """ Ignore for now >0xFFFF keysyms """
 568                 if codepoint < 0xFFFF:
 569                         original_sequence = copy(sequence)
 570                         stats_sequence = copy(sequence)
 571                         base = sequence.pop()
 572                         basechar = keysymvalue(base, filename_compose, linenum_compose)
 573
 574                         if basechar < 0xFFFF:
 575                                 counter = 1
 576                                 unisequence = []
 577                                 not_normalised = True
 578                                 skipping_this = False
 579                                 for i in range(0, len(sequence)):
 580                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
 581                                             because of lack of dead_perispomeni (i.e. conflict)
 582                                         """
 583                                         bc = basechar
 584                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 585                                                 skipping_this = True
 586                                                 break
 587                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 588                                                 skipping_this = True
 589                                                 break
 590                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 591                                                 skipping_this = True
 592                                                 break
 593                                         if sequence[-1] == "dead_psili":
 594                                                 sequence[i] = "dead_horn"
 595                                         if sequence[-1] == "dead_dasia":
 596                                                 sequence[-1] = "dead_ogonek"
 597                                         """
 598                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
 599
 600                                 if skipping_this:
 601                                         unisequence = []
 602                                 for perm in all_permutations(unisequence):
 603                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
 604                                         # print counter, map(unichr, perm)
 605                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
 606                                         if len(normalized) == 1:
 607                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
 608                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
 609                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
 610                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
 611                                                 stats_sequence_data.append(normalized)
 612                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
 613                                                 not_normalised = False
 614                                                 break;
 615                                         counter += 1
 616                                 if not_normalised:
 617                                         original_sequence.append(codepoint)
 618                                         xorg_compose_sequences.append(original_sequence)
 619                                         """ print xorg_compose_sequences[-1] """
 620
 621                         else:
 622                                 print "Error in base char !?!"
 623                                 exit(-2)
 624                 else:
 625                         print "OVER", sequence
 626                         exit(-1)
 627         else:
 628                 sequence.append(codepoint)
 629                 xorg_compose_sequences.append(sequence)
 630                 """ print xorg_compose_sequences[-1] """
 631
 632 def sequence_cmp(x, y):
 633         if keysymvalue(x[0]) > keysymvalue(y[0]):
 634                 return 1
 635         elif keysymvalue(x[0]) < keysymvalue(y[0]):
 636                 return -1
 637         elif len(x) > len(y):
 638                 return 1
 639         elif len(x) < len(y):
 640                 return -1
 641         elif keysymvalue(x[1]) > keysymvalue(y[1]):
 642                 return 1
 643         elif keysymvalue(x[1]) < keysymvalue(y[1]):
 644                 return -1
 645         elif len(x) < 4:
 646                 return 0
 647         elif keysymvalue(x[2]) > keysymvalue(y[2]):
 648                 return 1
 649         elif keysymvalue(x[2]) < keysymvalue(y[2]):
 650                 return -1
 651         elif len(x) < 5:
 652                 return 0
 653         elif keysymvalue(x[3]) > keysymvalue(y[3]):
 654                 return 1
 655         elif keysymvalue(x[3]) < keysymvalue(y[3]):
 656                 return -1
 657         elif len(x) < 6:
 658                 return 0
 659         elif keysymvalue(x[4]) > keysymvalue(y[4]):
 660                 return 1
 661         elif keysymvalue(x[4]) < keysymvalue(y[4]):
 662                 return -1
 663         else:
 664                 return 0
 665
 666 def sequence_unicode_cmp(x, y):
 667         if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
 668                 return 1
 669         elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
 670                 return -1
 671         elif len(x) > len(y):
 672                 return 1
 673         elif len(x) < len(y):
 674                 return -1
 675         elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
 676                 return 1
 677         elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
 678                 return -1
 679         elif len(x) < 4:
 680                 return 0
 681         elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
 682                 return 1
 683         elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
 684                 return -1
 685         elif len(x) < 5:
 686                 return 0
 687         elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
 688                 return 1
 689         elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
 690                 return -1
 691         elif len(x) < 6:
 692                 return 0
 693         elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
 694                 return 1
 695         elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
 696                 return -1
 697         else:
 698                 return 0
 699
 700 def sequence_algorithmic_cmp(x, y):
 701         if len(x) < len(y):
 702                 return -1
 703         elif len(x) > len(y):
 704                 return 1
 705         else:
 706                 for i in range(len(x)):
 707                         if x[i] < y[i]:
 708                                 return -1
 709                         elif x[i] > y[i]:
 710                                 return 1
 711         return 0
 712
 713
 714 xorg_compose_sequences.sort(sequence_cmp)
 715
 716 xorg_compose_sequences_uniqued = []
 717 first_time = True
 718 item = None
 719 for next_item in xorg_compose_sequences:
 720         if first_time:
 721                 first_time = False
 722                 item = next_item
 723         if sequence_unicode_cmp(item, next_item) != 0:
 724                 xorg_compose_sequences_uniqued.append(item)
 725         item = next_item
 726
 727 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
 728
 729 counter_multikey = 0
 730 for item in xorg_compose_sequences:
 731         if findall('Multi_key', "".join(item[:-1])) != []:
 732                 counter_multikey += 1
 733
 734 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
 735 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
 736
 737 firstitem = ""
 738 num_first_keysyms = 0
 739 zeroes = 0
 740 num_entries = 0
 741 num_algorithmic_greek = 0
 742 for sequence in xorg_compose_sequences:
 743         if keysymvalue(firstitem) != keysymvalue(sequence[0]):
 744                 firstitem = sequence[0]
 745                 num_first_keysyms += 1
 746         zeroes += 6 - len(sequence) + 1
 747         num_entries += 1
 748
 749 for sequence in xorg_compose_sequences_algorithmic_uniqued:
 750         ch = ord(sequence[-1:][0])
 751         if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 752                 num_algorithmic_greek += 1
 753
 754
 755 if opt_algorithmic:
 756         for sequence in xorg_compose_sequences_algorithmic_uniqued:
 757                 letter = "".join(sequence[-1:])
 758                 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
 759                 for elem in sequence[:-2]:
 760                         print "<0x%(keysym)04X>," % { 'keysym': elem },
 761                 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
 762                 print "], recomposed as", letter.encode('utf-8'), "verified"
 763
 764 def num_of_keysyms(seq):
 765         return len(seq) - 1
 766
 767 def convert_UnotationToHex(arg):
 768         if isinstance(arg, str):
 769                 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
 770                         return sub('^U', '0x', arg)
 771         return arg
 772
 773 def addprefix_GDK(arg):
 774         if match('^0x', arg):
 775                 return '%(arg)s, ' % { 'arg': arg }
 776         else:
 777                 return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
 778
 779 if opt_gtk:
 780         first_keysym = ""
 781         sequence = []
 782         compose_table = []
 783         ct_second_part = []
 784         ct_sequence_width = 2
 785         start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
 786         we_finished = False
 787         counter = 0
 788
 789         sequence_iterator = iter(xorg_compose_sequences)
 790         sequence = sequence_iterator.next()
 791         while True:
 792                 first_keysym = sequence[0]                                      # Set the first keysym
 793                 compose_table.append([first_keysym, 0, 0, 0, 0, 0])
 794                 while sequence[0] == first_keysym:
 795                         compose_table[counter][num_of_keysyms(sequence)-1] += 1
 796                         try:
 797                                 sequence = sequence_iterator.next()
 798                         except StopIteration:
 799                                 we_finished = True
 800                                 break
 801                 if we_finished:
 802                         break
 803                 counter += 1
 804
 805         ct_index = start_offset
 806         for line_num in range(len(compose_table)):
 807                 for i in range(WIDTHOFCOMPOSETABLE):
 808                         occurences = compose_table[line_num][i+1]
 809                         compose_table[line_num][i+1] = ct_index
 810                         ct_index += occurences * (i+2)
 811
 812         for sequence in xorg_compose_sequences:
 813                 ct_second_part.append(map(convert_UnotationToHex, sequence))
 814
 815         print headerfile_start
 816         for i in compose_table:
 817                 if opt_gtkexpanded:
 818                         print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
 819                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
 820                 elif not match('^0x', i[0]):
 821                         print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
 822                 else:
 823                         print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
 824         for i in ct_second_part:
 825                 if opt_numeric:
 826                         for ks in i[1:][:-1]:
 827                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
 828                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
 829                         """
 830                         for ks in i[:-1]:
 831                                 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
 832                         print '0x%(cp)04X, ' % { 'cp':i[-1] }
 833                         """
 834                 elif opt_gtkexpanded:
 835                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
 836                 else:
 837                         print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
 838         print headerfile_end
 839
 840 def redecompose(codepoint):
 841         (name, decomposition, combiningclass) = unicodedatabase[codepoint]
 842         if decomposition[0] == '' or decomposition[0] == '0':
 843                 return [codepoint]
 844         if match('<\w+>', decomposition[0]):
 845                 numdecomposition = map(stringtohex, decomposition[1:])
 846                 return map(redecompose, numdecomposition)
 847         numdecomposition = map(stringtohex, decomposition)
 848         return map(redecompose, numdecomposition)
 849
 850 def process_unicodedata_file(verbose = False):
 851         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
 852         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
 853         try:
 854                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
 855         except IOError, (errno, strerror):
 856                 print "I/O error(%s): %s" % (errno, strerror)
 857                 sys.exit(-1)
 858         except:
 859                 print "Unexpected error: ", sys.exc_info()[0]
 860                 sys.exit(-1)
 861         for line in unicodedatatxt.readlines():
 862                 if line[0] == "" or line[0] == '#':
 863                         continue
 864                 line = line[:-1]
 865                 uniproperties = split(';', line)
 866                 codepoint = stringtohex(uniproperties[0])
 867                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
 868                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
 869                         continue
 870                 name = uniproperties[1]
 871                 category = uniproperties[2]
 872                 combiningclass = uniproperties[3]
 873                 decomposition = uniproperties[5]
 874                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
 875
 876         counter_combinations = 0
 877         counter_combinations_greek = 0
 878         counter_entries = 0
 879         counter_entries_greek = 0
 880
 881         for item in unicodedatabase.keys():
 882                 (name, decomposition, combiningclass) = unicodedatabase[item]
 883                 if decomposition[0] == '':
 884                         continue
 885                         print name, "is empty"
 886                 elif match('<\w+>', decomposition[0]):
 887                         continue
 888                         print name, "has weird", decomposition[0]
 889                 else:
 890                         sequence = map(stringtohex, decomposition)
 891                         chrsequence = map(unichr, sequence)
 892                         normalized = normalize('NFC', "".join(chrsequence))
 893
 894                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
 895                         decomposedsequence = []
 896                         for subseq in map(redecompose, sequence):
 897                                 for seqitem in subseq:
 898                                         if isinstance(seqitem, list):
 899                                                 for i in seqitem:
 900                                                         if isinstance(i, list):
 901                                                                 for j in i:
 902                                                                         decomposedsequence.append(j)
 903                                                         else:
 904                                                                 decomposedsequence.append(i)
 905                                         else:
 906                                                 decomposedsequence.append(seqitem)
 907                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
 908                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
 909                                 counter_entries += 1
 910                                 counter_combinations += factorial(len(decomposedsequence)-1)
 911                                 ch = item
 912                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 913                                         counter_entries_greek += 1
 914                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
 915                                 if verbose:
 916                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
 917                                         print "[",
 918                                         for elem in decomposedsequence:
 919                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
 920                                         print "], recomposed as", recomposedchar,
 921                                         if unichr(item) == recomposedchar:
 922                                                 print "verified"
 923
 924         if verbose == False:
 925                 print "Unicode statistics from UnicodeData.txt"
 926                 print "Number of entries that can be algorithmically produced     :", counter_entries
 927                 print "  of which are for Greek                                   :", counter_entries_greek
 928                 print "Number of compose sequence combinations requiring          :", counter_combinations
 929                 print "  of which are for Greek                                   :", counter_combinations_greek
 930                 print "Note: We do not include partial compositions, "
 931                 print "thus the slight discrepancy in the figures"
 932                 print
 933
 934 if opt_unicodedatatxt:
 935         process_unicodedata_file(True)
 936
 937 if opt_statistics:
 938         print
 939         print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
 940         print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
 941         print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
 942         print "    of which have Multi_key                                :", counter_multikey
 943         print
 944         print "Algorithmic (stats for Xorg Compose file)"
 945         print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
 946         print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
 947         print "  of which are for Greek                                   :", num_algorithmic_greek
 948         print
 949         process_unicodedata_file()
 950         print "Not algorithmic (stats from Xorg Compose file)"
 951         print "Number of sequences                                        :", len(xorg_compose_sequences)
 952         print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
 953         print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
 954         print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
 955         print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
 956         print "Number of different first items                            :", num_first_keysyms
 957         print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
 958         print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
 959         print
 960         print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
 961         print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
 962         print
 963         print "Existing (old) implementation in GTK+"
 964         print "Number of sequences in old gtkimcontextsimple.c            :", 691
 965         print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"