Pileus Git - ~andy/gtk/blob - gtk/compose-parse.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # compose-parse.py, version 1.3
   5 #
   6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
   7 # the script produces statistics and information about the whole process, run with --help for more.
   8 #
   9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
  10 #
  11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
  12
  13 from re                 import findall, match, split, sub
  14 from string             import atoi
  15 from unicodedata        import normalize
  16 from urllib             import urlretrieve
  17 from os.path            import isfile, getsize
  18 from copy               import copy
  19
  20 import sys
  21 import getopt
  22
# We grab files off the web, left and right.
# download_file() stores each download in the current directory under the
# last '/'-separated component of its URL and reuses a non-empty local copy.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
# Optional local file; when it can be opened its lines are appended to the
# lines read from the upstream Compose file.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# Keysym name -> keysym value; replaced by the result of process_gdkkeysymsh().
keysymdatabase = {}
# Keysym name -> Unicode code point; replaced by the result of process_keysymstxt().
keysymunicodedatabase = {}
# NOTE(review): not filled in the part of the script shown here; presumably
# populated later from UnicodeData.txt -- confirm.
unicodedatabase = {}
  40
# Literal C text for the top of the generated header: licence notice,
# provenance comment, include guard, the historical comment block and the
# opening of the gtk_compose_seqs_compact[] array initialiser.
# NOTE(review): the text below names UNIDATA/UnicodeData.txt as an input while
# URL_UNICODEDATATXT points at the 6.0.0 directory -- confirm which is meant.
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 *
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *                                ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine                 [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde                    [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine                     [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde                    [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring                                          [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft                       [ << ]
 *   ogonek and cedilla; resolved to cedilla                                  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

# Literal C text that closes the array initialiser and the include guard
# opened by headerfile_start.
headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
 115
 116 def stringtohex(str): return atoi(str, 16)
 117
 118 def factorial(n):
 119         if n <= 1:
 120                 return 1
 121         else:
 122                 return n * factorial(n-1)
 123
 124 def uniq(*args) :
 125         """ Performs a uniq operation on a list or lists """
 126         theInputList = []
 127         for theList in args:
 128            theInputList += theList
 129         theFinalList = []
 130         for elem in theInputList:
 131                 if elem not in theFinalList:
 132                         theFinalList.append(elem)
 133         return theFinalList
 134
 135
 136
 137 def all_permutations(seq):
 138         """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
 139         """ Produces all permutations of the items of a list """
 140         if len(seq) <=1:
 141             yield seq
 142         else:
 143             for perm in all_permutations(seq[1:]):
 144                 for i in range(len(perm)+1):
 145                     #nb str[0:1] works in both string and list contexts
 146                         yield perm[:i] + seq[0:1] + perm[i:]
 147
 148 def usage():
 149         print """compose-parse available parameters:
 150         -h, --help              this craft
 151         -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
 152         -a, --algorithmic       show sequences saved with algorithmic optimisation
 153         -g, --gtk               show entries that go to GTK+
 154         -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
 155         -v, --verbose           show verbose output
 156         -p, --plane1            show plane1 compose sequences
 157         -n, --numeric           when used with --gtk, create file with numeric values only
 158         -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+
 159
 160         Default is to show statistics.
 161         """
 162
 163 try:
 164         opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
 165                 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"])
 166 except:
 167         usage()
 168         sys.exit(2)
 169
 170 opt_statistics = False
 171 opt_algorithmic = False
 172 opt_gtk = False
 173 opt_unicodedatatxt = False
 174 opt_verbose = False
 175 opt_plane1 = False
 176 opt_numeric = False
 177 opt_gtkexpanded = False
 178
 179 for o, a in opts:
 180         if o in ("-h", "--help"):
 181                 usage()
 182                 sys.exit()
 183         if o in ("-s", "--statistics"):
 184                 opt_statistics = True
 185         if o in ("-a", "--algorithmic"):
 186                 opt_algorithmic = True
 187         if o in ("-g", "--gtk"):
 188                 opt_gtk = True
 189         if o in ("-u", "--unicodedatatxt"):
 190                 opt_unicodedatatxt = True
 191         if o in ("-v", "--verbose"):
 192                 opt_verbose = True
 193         if o in ("-p", "--plane1"):
 194                 opt_plane1 = True
 195         if o in ("-n", "--numeric"):
 196                 opt_numeric = True
 197         if o in ("-e", "--gtk-expanded"):
 198                 opt_gtkexpanded = True
 199
 200 if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
 201         opt_statistics = True
 202
 203 def download_hook(blocks_transferred, block_size, file_size):
 204         """ A download hook to provide some feedback when downloading """
 205         if blocks_transferred == 0:
 206                 if file_size > 0:
 207                         if opt_verbose:
 208                                 print "Downloading", file_size, "bytes: ",
 209                 else:
 210                         if opt_verbose:
 211                                 print "Downloading: ",
 212         sys.stdout.write('#')
 213         sys.stdout.flush()
 214
 215
 216 def download_file(url):
 217         """ Downloads a file provided a URL. Returns the filename. """
 218         """ Borks on failure """
 219         localfilename = url.split('/')[-1]
 220         if not isfile(localfilename) or getsize(localfilename) <= 0:
 221                 if opt_verbose:
 222                         print "Downloading ", url, "..."
 223                 try:
 224                         urlretrieve(url, localfilename, download_hook)
 225                 except IOError, (errno, strerror):
 226                         print "I/O error(%s): %s" % (errno, strerror)
 227                         sys.exit(-1)
 228                 except:
 229                         print "Unexpected error: ", sys.exc_info()[0]
 230                         sys.exit(-1)
 231                 print " done."
 232         else:
 233                 if opt_verbose:
 234                         print "Using cached file for ", url
 235         return localfilename
 236
 237 def process_gdkkeysymsh():
 238         """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
 239         """ Fills up keysymdb with contents """
 240         filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
 241         try:
 242                 gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
 243         except IOError, (errno, strerror):
 244                 print "I/O error(%s): %s" % (errno, strerror)
 245                 sys.exit(-1)
 246         except:
 247                 print "Unexpected error: ", sys.exc_info()[0]
 248                 sys.exit(-1)
 249
 250         """ Parse the gdkkeysyms.h file and place contents in  keysymdb """
 251         linenum_gdkkeysymsh = 0
 252         keysymdb = {}
 253         for line in gdkkeysymsh.readlines():
 254                 linenum_gdkkeysymsh += 1
 255                 line = line.strip()
 256                 if line == "" or not match('^#define GDK_KEY_', line):
 257                         continue
 258                 components = split('\s+', line)
 259                 if len(components) < 3:
 260                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 261                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 262                         print "Was expecting 3 items in the line"
 263                         sys.exit(-1)
 264                 if not match('^GDK_KEY_', components[1]):
 265                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 266                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 267                         print "Was expecting a keysym starting with GDK_KEY_"
 268                         sys.exit(-1)
 269                 if match('^0x[0-9a-fA-F]+$', components[2]):
 270                         unival = long(components[2][2:], 16)
 271                         if unival == 0:
 272                                 continue
 273                         keysymdb[components[1][8:]] = unival
 274                 else:
 275                         print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
 276                         % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
 277                         print "Was expecting a hexadecimal number at the end of the line"
 278                         sys.exit(-1)
 279         gdkkeysymsh.close()
 280
 281         """ Patch up the keysymdb with some of our own stuff """
 282
 283         """ This is for a missing keysym from the currently upstream file """
 284         keysymdb['dead_stroke'] = 0x338
 285
 286         """ This is for a missing keysym from the currently upstream file """
 287         ###keysymdb['dead_belowring'] = 0x323
 288         ###keysymdb['dead_belowmacron'] = 0x331
 289         ###keysymdb['dead_belowcircumflex'] = 0x32d
 290         ###keysymdb['dead_belowtilde'] = 0x330
 291         ###keysymdb['dead_belowbreve'] = 0x32e
 292         ###keysymdb['dead_belowdiaeresis'] = 0x324
 293
 294         """ This is^Wwas preferential treatment for Greek """
 295         # keysymdb['dead_tilde'] = 0x342
 296         """ This is^was preferential treatment for Greek """
 297         #keysymdb['combining_tilde'] = 0x342
 298
 299         """ Fixing VoidSymbol """
 300         keysymdb['VoidSymbol'] = 0xFFFF
 301
 302         return keysymdb
 303
 304 def process_keysymstxt():
 305         """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
 306         """ This file keeps a record between keysyms <-> unicode chars """
 307         filename_keysymstxt = download_file(URL_KEYSYMSTXT)
 308         try:
 309                 keysymstxt = open(filename_keysymstxt, 'r')
 310         except IOError, (errno, strerror):
 311                 print "I/O error(%s): %s" % (errno, strerror)
 312                 sys.exit(-1)
 313         except:
 314                 print "Unexpected error: ", sys.exc_info()[0]
 315                 sys.exit(-1)
 316
 317         """ Parse the keysyms.txt file and place content in  keysymdb """
 318         linenum_keysymstxt = 0
 319         keysymdb = {}
 320         for line in keysymstxt.readlines():
 321                 linenum_keysymstxt += 1
 322                 line = line.strip()
 323                 if line == "" or match('^#', line):
 324                         continue
 325                 components = split('\s+', line)
 326                 if len(components) < 5:
 327                         print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
 328                         % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
 329                         print "Was expecting 5 items in the line"
 330                         sys.exit(-1)
 331                 if match('^U[0-9a-fA-F]+$', components[1]):
 332                         unival = long(components[1][1:], 16)
 333                 if unival == 0:
 334                         continue
 335                 keysymdb[components[4]] = unival
 336         keysymstxt.close()
 337
 338         """ Patch up the keysymdb with some of our own stuff """
 339         """ This is for a missing keysym from the currently upstream file """
 340         ###keysymdb['dead_belowring'] = 0x323
 341         ###keysymdb['dead_belowmacron'] = 0x331
 342         ###keysymdb['dead_belowcircumflex'] = 0x32d
 343         ###keysymdb['dead_belowtilde'] = 0x330
 344         ###keysymdb['dead_belowbreve'] = 0x32e
 345         ###keysymdb['dead_belowdiaeresis'] = 0x324
 346
 347         """ This is preferential treatment for Greek """
 348         """ => we get more savings if used for Greek """
 349         # keysymdb['dead_tilde'] = 0x342
 350         """ This is preferential treatment for Greek """
 351         # keysymdb['combining_tilde'] = 0x342
 352
 353         """ This is for a missing keysym from Markus Kuhn's db """
 354         keysymdb['dead_stroke'] = 0x338
 355         """ This is for a missing keysym from Markus Kuhn's db """
 356         keysymdb['Oslash'] = 0x0d8
 357         """ This is for a missing keysym from Markus Kuhn's db """
 358         keysymdb['Ssharp'] = 0x1e9e
 359
 360         """ This is for a missing (recently added) keysym """
 361         keysymdb['dead_psili'] = 0x313
 362         """ This is for a missing (recently added) keysym """
 363         keysymdb['dead_dasia'] = 0x314
 364
 365         """ Allows to import Multi_key sequences """
 366         keysymdb['Multi_key'] = 0xff20
 367
 368         keysymdb['zerosubscript'] = 0x2080
 369         keysymdb['onesubscript'] = 0x2081
 370         keysymdb['twosubscript'] = 0x2082
 371         keysymdb['threesubscript'] = 0x2083
 372         keysymdb['foursubscript'] = 0x2084
 373         keysymdb['fivesubscript'] = 0x2085
 374         keysymdb['sixsubscript'] = 0x2086
 375         keysymdb['sevensubscript'] = 0x2087
 376         keysymdb['eightsubscript'] = 0x2088
 377         keysymdb['ninesubscript'] = 0x2089
 378         keysymdb['dead_doublegrave'] = 0x030F
 379         keysymdb['dead_invertedbreve'] = 0x0311
 380
 381         return keysymdb
 382
 383 def keysymvalue(keysym, file = "n/a", linenum = 0):
 384         """ Extracts a value from the keysym """
 385         """ Find the value of keysym, using the data from keysyms """
 386         """ Use file and linenum to when reporting errors """
 387         if keysym == "":
 388                 return 0
 389         if keysymdatabase.has_key(keysym):
 390                 return keysymdatabase[keysym]
 391         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 392                 return atoi(keysym[1:], 16)
 393         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 394                 return atoi(keysym[2:], 16)
 395         else:
 396                 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 397                 #return -1
 398                 sys.exit(-1)
 399
 400 def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
 401         """ Extracts a value from the keysym """
 402         """ Find the value of keysym, using the data from keysyms """
 403         """ Use file and linenum to when reporting errors """
 404         if keysym == "":
 405                 return 0
 406         if keysymunicodedatabase.has_key(keysym):
 407                 return keysymunicodedatabase[keysym]
 408         elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
 409                 return atoi(keysym[1:], 16)
 410         elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
 411                 return atoi(keysym[2:], 16)
 412         else:
 413                 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
 414                 sys.exit(-1)
 415
 416 def rename_combining(seq):
 417         filtered_sequence = []
 418         for ks in seq:
 419                 if findall('^combining_', ks):
 420                         ks = sub('^combining_', 'dead_', ks)
 421                 if ks == 'dead_double_grave':
 422                         ks = 'dead_doublegrave'
 423                 if ks == 'dead_inverted_breve':
 424                         ks = 'dead_invertedbreve'
 425                 filtered_sequence.append(ks)
 426         return filtered_sequence
 427
 428
 429 keysymunicodedatabase = process_keysymstxt()
 430 keysymdatabase = process_gdkkeysymsh()
 431
 432 """ Grab and open the compose file from upstream """
 433 filename_compose = download_file(URL_COMPOSE)
 434 try:
 435         composefile = open(filename_compose, 'r')
 436 except IOError, (errno, strerror):
 437         print "I/O error(%s): %s" % (errno, strerror)
 438         sys.exit(-1)
 439 except:
 440         print "Unexpected error: ", sys.exc_info()[0]
 441         sys.exit(-1)
 442
 443 """ Look if there is a lookaside (supplementary) compose file in the current
 444     directory, and if so, open, then merge with upstream Compose file.
 445 """
 446 xorg_compose_sequences_raw = []
 447 for seq in composefile.readlines():
 448         xorg_compose_sequences_raw.append(seq)
 449
 450 try:
 451         composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
 452         for seq in composefile_lookaside.readlines():
 453                 xorg_compose_sequences_raw.append(seq)
 454 except IOError, (errno, strerror):
 455         if opt_verbose:
 456                 print "I/O error(%s): %s" % (errno, strerror)
 457                 print "Did not find lookaside compose file. Continuing..."
 458 except:
 459         print "Unexpected error: ", sys.exc_info()[0]
 460         sys.exit(-1)
 461
 462 """ Parse the compose file in  xorg_compose_sequences"""
 463 xorg_compose_sequences = []
 464 xorg_compose_sequences_algorithmic = []
 465 linenum_compose = 0
 466 comment_nest_depth = 0
 467 for line in xorg_compose_sequences_raw:
 468         linenum_compose += 1
 469         line = line.strip()
 470         if match("^XCOMM", line) or match("^#", line):
 471                 continue
 472
 473         line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)
 474
 475         comment_start = line.find("/*")
 476
 477         if comment_start >= 0:
 478                 if comment_nest_depth == 0:
 479                         line = line[:comment_start]
 480                 else:
 481                         line = ""
 482
 483                 comment_nest_depth += 1
 484         else:
 485                 comment_end = line.find("*/")
 486
 487                 if comment_end >= 0:
 488                         comment_nest_depth -= 1
 489
 490                 if comment_nest_depth < 0:
 491                         print "Invalid comment %(linenum_compose)d in %(filename)s: \
 492                         Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 493                         exit(-1)
 494
 495                 if comment_nest_depth > 0:
 496                         line = ""
 497                 else:
 498                         line = line[comment_end + 2:]
 499
 500         if line is "":
 501                 continue
 502
 503         #line = line[:-1]
 504         components = split(':', line)
 505         if len(components) != 2:
 506                 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
 507                 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
 508                 exit(-1)
 509         (seq, val ) = split(':', line)
 510         seq = seq.strip()
 511         val = val.strip()
 512         raw_sequence = findall('\w+', seq)
 513         values = split('\s+', val)
 514         unichar_temp = split('"', values[0])
 515         unichar = unichar_temp[1]
 516         if len(values) == 1:
 517                 continue
 518         codepointstr = values[1]
 519         if values[1] == '#':
 520                 # No codepoints that are >1 characters yet.
 521                 continue
 522         if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
 523                 raw_sequence[0] = '0x' + raw_sequence[0][1:]
 524         if  match('^U[0-9a-fA-F]+$', codepointstr):
 525                 codepoint = long(codepointstr[1:], 16)
 526         elif keysymunicodedatabase.has_key(codepointstr):
 527                 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
 528                         #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
 529                         #print raw_sequence, codepointstr
 530                 codepoint = keysymunicodedatabase[codepointstr]
 531         else:
 532                 print
 533                 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
 534                  %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
 535                 exit(-1)
 536         sequence = rename_combining(raw_sequence)
 537         reject_this = False
 538         for i in sequence:
 539                 if keysymvalue(i) > 0xFFFF:
 540                         reject_this = True
 541                         if opt_plane1:
 542                                 print sequence
 543                         break
 544                 if keysymvalue(i) < 0:
 545                         reject_this = True
 546                         break
 547         if reject_this:
 548                 continue
 549         if "U0342" in sequence or \
 550                 "U0313" in sequence or \
 551                 "U0314" in sequence or \
 552                 "0x0313" in sequence or \
 553                 "0x0342" in sequence or \
 554                 "0x0314" in sequence:
 555                 continue
 556         if "dead_belowring" in sequence or\
 557                 "dead_currency" in sequence or\
 558                 "dead_belowcomma" in sequence or\
 559                 "dead_belowmacron" in sequence or\
 560                 "dead_belowtilde" in sequence or\
 561                 "dead_belowbreve" in sequence or\
 562                 "dead_belowdiaeresis" in sequence or\
 563                 "dead_belowcircumflex" in sequence:
 564                 continue
 565         #for i in range(len(sequence)):
 566         #       if sequence[i] == "0x0342":
 567         #               sequence[i] = "dead_tilde"
 568         if "Multi_key" not in sequence:
 569                 """ Ignore for now >0xFFFF keysyms """
 570                 if codepoint < 0xFFFF:
 571                         original_sequence = copy(sequence)
 572                         stats_sequence = copy(sequence)
 573                         base = sequence.pop()
 574                         basechar = keysymvalue(base, filename_compose, linenum_compose)
 575
 576                         if basechar < 0xFFFF:
 577                                 counter = 1
 578                                 unisequence = []
 579                                 not_normalised = True
 580                                 skipping_this = False
 581                                 for i in range(0, len(sequence)):
 582                                         """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
 583                                             because of lack of dead_perispomeni (i.e. conflict)
 584                                         """
 585                                         bc = basechar
 586                                         """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 587                                                 skipping_this = True
 588                                                 break
 589                                         if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 590                                                 skipping_this = True
 591                                                 break
 592                                         if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
 593                                                 skipping_this = True
 594                                                 break
 595                                         if sequence[-1] == "dead_psili":
 596                                                 sequence[i] = "dead_horn"
 597                                         if sequence[-1] == "dead_dasia":
 598                                                 sequence[-1] = "dead_ogonek"
 599                                         """
 600                                         unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))
 601
 602                                 if skipping_this:
 603                                         unisequence = []
 604                                 for perm in all_permutations(unisequence):
 605                                         # print counter, original_sequence, unichr(basechar) + "".join(perm)
 606                                         # print counter, map(unichr, perm)
 607                                         normalized = normalize('NFC', unichr(basechar) + "".join(perm))
 608                                         if len(normalized) == 1:
 609                                                 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
 610                                                 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
 611                                                 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
 612                                                 stats_sequence_data = map(keysymunicodevalue, stats_sequence)
 613                                                 stats_sequence_data.append(normalized)
 614                                                 xorg_compose_sequences_algorithmic.append(stats_sequence_data)
 615                                                 not_normalised = False
 616                                                 break;
 617                                         counter += 1
 618                                 if not_normalised:
 619                                         original_sequence.append(codepoint)
 620                                         xorg_compose_sequences.append(original_sequence)
 621                                         """ print xorg_compose_sequences[-1] """
 622
 623                         else:
 624                                 print "Error in base char !?!"
 625                                 exit(-2)
 626                 else:
 627                         print "OVER", sequence
 628                         exit(-1)
 629         else:
 630                 sequence.append(codepoint)
 631                 xorg_compose_sequences.append(sequence)
 632                 """ print xorg_compose_sequences[-1] """
 633
 634 def sequence_cmp(x, y):
 635         if keysymvalue(x[0]) > keysymvalue(y[0]):
 636                 return 1
 637         elif keysymvalue(x[0]) < keysymvalue(y[0]):
 638                 return -1
 639         elif len(x) > len(y):
 640                 return 1
 641         elif len(x) < len(y):
 642                 return -1
 643         elif keysymvalue(x[1]) > keysymvalue(y[1]):
 644                 return 1
 645         elif keysymvalue(x[1]) < keysymvalue(y[1]):
 646                 return -1
 647         elif len(x) < 4:
 648                 return 0
 649         elif keysymvalue(x[2]) > keysymvalue(y[2]):
 650                 return 1
 651         elif keysymvalue(x[2]) < keysymvalue(y[2]):
 652                 return -1
 653         elif len(x) < 5:
 654                 return 0
 655         elif keysymvalue(x[3]) > keysymvalue(y[3]):
 656                 return 1
 657         elif keysymvalue(x[3]) < keysymvalue(y[3]):
 658                 return -1
 659         elif len(x) < 6:
 660                 return 0
 661         elif keysymvalue(x[4]) > keysymvalue(y[4]):
 662                 return 1
 663         elif keysymvalue(x[4]) < keysymvalue(y[4]):
 664                 return -1
 665         else:
 666                 return 0
 667
 668 def sequence_unicode_cmp(x, y):
 669         if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]):
 670                 return 1
 671         elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]):
 672                 return -1
 673         elif len(x) > len(y):
 674                 return 1
 675         elif len(x) < len(y):
 676                 return -1
 677         elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]):
 678                 return 1
 679         elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]):
 680                 return -1
 681         elif len(x) < 4:
 682                 return 0
 683         elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]):
 684                 return 1
 685         elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]):
 686                 return -1
 687         elif len(x) < 5:
 688                 return 0
 689         elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]):
 690                 return 1
 691         elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]):
 692                 return -1
 693         elif len(x) < 6:
 694                 return 0
 695         elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]):
 696                 return 1
 697         elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]):
 698                 return -1
 699         else:
 700                 return 0
 701
 702 def sequence_algorithmic_cmp(x, y):
 703         if len(x) < len(y):
 704                 return -1
 705         elif len(x) > len(y):
 706                 return 1
 707         else:
 708                 for i in range(len(x)):
 709                         if x[i] < y[i]:
 710                                 return -1
 711                         elif x[i] > y[i]:
 712                                 return 1
 713         return 0
 714
 715
 716 xorg_compose_sequences.sort(sequence_cmp)
 717
 718 xorg_compose_sequences_uniqued = []
 719 first_time = True
 720 item = None
 721 for next_item in xorg_compose_sequences:
 722         if first_time:
 723                 first_time = False
 724                 item = next_item
 725         if sequence_unicode_cmp(item, next_item) != 0:
 726                 xorg_compose_sequences_uniqued.append(item)
 727         item = next_item
 728
 729 xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)
 730
 731 counter_multikey = 0
 732 for item in xorg_compose_sequences:
 733         if findall('Multi_key', "".join(item[:-1])) != []:
 734                 counter_multikey += 1
 735
 736 xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
 737 xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)
 738
 739 firstitem = ""
 740 num_first_keysyms = 0
 741 zeroes = 0
 742 num_entries = 0
 743 num_algorithmic_greek = 0
 744 for sequence in xorg_compose_sequences:
 745         if keysymvalue(firstitem) != keysymvalue(sequence[0]):
 746                 firstitem = sequence[0]
 747                 num_first_keysyms += 1
 748         zeroes += 6 - len(sequence) + 1
 749         num_entries += 1
 750
 751 for sequence in xorg_compose_sequences_algorithmic_uniqued:
 752         ch = ord(sequence[-1:][0])
 753         if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 754                 num_algorithmic_greek += 1
 755
 756
# When opt_algorithmic is set, list every de-duplicated algorithmic sequence,
# one output line per sequence.
if opt_algorithmic:
        for sequence in xorg_compose_sequences_algorithmic_uniqued:
                # The last element of the sequence is the resulting character.
                letter = "".join(sequence[-1:])
                # Start of the line: code point, the character itself, and the
                # second-to-last element as the first item inside the brackets.
                # The trailing comma keeps the following prints on the same line.
                print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
                # Then every remaining element, in order.
                for elem in sequence[:-2]:
                        print "<0x%(keysym)04X>," % { 'keysym': elem },
                """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
                # "verified" is printed unconditionally here; no check is performed.
                print "], recomposed as", letter.encode('utf-8'), "verified"
 765
 766 def num_of_keysyms(seq):
 767         return len(seq) - 1
 768
 769 def convert_UnotationToHex(arg):
 770         if isinstance(arg, str):
 771                 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
 772                         return sub('^U', '0x', arg)
 773         return arg
 774
 775 def addprefix_GDK(arg):
 776         if match('^0x', arg):
 777                 return '%(arg)s, ' % { 'arg': arg }
 778         else:
 779                 return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
 780
# When opt_gtk is set, print the compose table between headerfile_start and
# headerfile_end.  The table has two parts: an index with one row per distinct
# first keysym, followed by the sequences themselves.
if opt_gtk:
        first_keysym = ""
        sequence = []
        compose_table = []
        ct_second_part = []
        # NOTE(review): ct_sequence_width is assigned but not read in this block.
        ct_sequence_width = 2
        # The index occupies WIDTHOFCOMPOSETABLE+1 cells per distinct first
        # keysym, so the sequence data starts right after that many cells.
        start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
        we_finished = False
        counter = 0

        # Pass 1: walk the sequences and, for each run sharing the same first
        # keysym, build one index row [first_keysym, n, n, n, n, n] in which
        # each counter is bumped at position num_of_keysyms(sequence)-1.
        # NOTE(review): the initial .next() raises StopIteration if
        # xorg_compose_sequences is empty.
        sequence_iterator = iter(xorg_compose_sequences)
        sequence = sequence_iterator.next()
        while True:
                first_keysym = sequence[0]                                      # Set the first keysym
                compose_table.append([first_keysym, 0, 0, 0, 0, 0])
                while sequence[0] == first_keysym:
                        compose_table[counter][num_of_keysyms(sequence)-1] += 1
                        try:
                                sequence = sequence_iterator.next()
                        except StopIteration:
                                # Input exhausted: leave both loops.
                                we_finished = True
                                break
                if we_finished:
                        break
                counter += 1

        # Pass 2: turn the per-column counts into running offsets.  Each
        # sequence counted in column i+1 advances the offset by i+2 cells.
        ct_index = start_offset
        for line_num in range(len(compose_table)):
                for i in range(WIDTHOFCOMPOSETABLE):
                        occurences = compose_table[line_num][i+1]
                        compose_table[line_num][i+1] = ct_index
                        ct_index += occurences * (i+2)

        # Second part: the sequences, with 'UXXXX' names rewritten to '0xXXXX'.
        for sequence in xorg_compose_sequences:
                ct_second_part.append(map(convert_UnotationToHex, sequence))

        print headerfile_start
        # Index rows.  With opt_gtkexpanded the first keysym is printed as a
        # number; otherwise it gets a GDK_KEY_ prefix unless it already
        # starts with '0x'.
        for i in compose_table:
                if opt_gtkexpanded:
                        print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
                elif not match('^0x', i[0]):
                        print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
                else:
                        print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        # Sequence rows.  The final element of each row is printed as a hex
        # number.  Only the opt_gtkexpanded form includes the first keysym;
        # the other two forms skip it.
        for i in ct_second_part:
                if opt_numeric:
                        for ks in i[1:][:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        """
                        for ks in i[:-1]:
                                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
                        print '0x%(cp)04X, ' % { 'cp':i[-1] }
                        """
                elif opt_gtkexpanded:
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
                else:
                        print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
        print headerfile_end
 841
 842 def redecompose(codepoint):
 843         (name, decomposition, combiningclass) = unicodedatabase[codepoint]
 844         if decomposition[0] == '' or decomposition[0] == '0':
 845                 return [codepoint]
 846         if match('<\w+>', decomposition[0]):
 847                 numdecomposition = map(stringtohex, decomposition[1:])
 848                 return map(redecompose, numdecomposition)
 849         numdecomposition = map(stringtohex, decomposition)
 850         return map(redecompose, numdecomposition)
 851
 852 def process_unicodedata_file(verbose = False):
 853         """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
 854         filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
 855         try:
 856                 unicodedatatxt = open(filename_unicodedatatxt, 'r')
 857         except IOError, (errno, strerror):
 858                 print "I/O error(%s): %s" % (errno, strerror)
 859                 sys.exit(-1)
 860         except:
 861                 print "Unexpected error: ", sys.exc_info()[0]
 862                 sys.exit(-1)
 863         for line in unicodedatatxt.readlines():
 864                 if line[0] == "" or line[0] == '#':
 865                         continue
 866                 line = line[:-1]
 867                 uniproperties = split(';', line)
 868                 codepoint = stringtohex(uniproperties[0])
 869                 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
 870                 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
 871                         continue
 872                 name = uniproperties[1]
 873                 category = uniproperties[2]
 874                 combiningclass = uniproperties[3]
 875                 decomposition = uniproperties[5]
 876                 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]
 877
 878         counter_combinations = 0
 879         counter_combinations_greek = 0
 880         counter_entries = 0
 881         counter_entries_greek = 0
 882
 883         for item in unicodedatabase.keys():
 884                 (name, decomposition, combiningclass) = unicodedatabase[item]
 885                 if decomposition[0] == '':
 886                         continue
 887                         print name, "is empty"
 888                 elif match('<\w+>', decomposition[0]):
 889                         continue
 890                         print name, "has weird", decomposition[0]
 891                 else:
 892                         sequence = map(stringtohex, decomposition)
 893                         chrsequence = map(unichr, sequence)
 894                         normalized = normalize('NFC', "".join(chrsequence))
 895
 896                         """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized),  """
 897                         decomposedsequence = []
 898                         for subseq in map(redecompose, sequence):
 899                                 for seqitem in subseq:
 900                                         if isinstance(seqitem, list):
 901                                                 for i in seqitem:
 902                                                         if isinstance(i, list):
 903                                                                 for j in i:
 904                                                                         decomposedsequence.append(j)
 905                                                         else:
 906                                                                 decomposedsequence.append(i)
 907                                         else:
 908                                                 decomposedsequence.append(seqitem)
 909                         recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
 910                         if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
 911                                 counter_entries += 1
 912                                 counter_combinations += factorial(len(decomposedsequence)-1)
 913                                 ch = item
 914                                 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
 915                                         counter_entries_greek += 1
 916                                         counter_combinations_greek += factorial(len(decomposedsequence)-1)
 917                                 if verbose:
 918                                         print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
 919                                         print "[",
 920                                         for elem in decomposedsequence:
 921                                                 print '<0x%(hex)04X>,' % { 'hex': elem },
 922                                         print "], recomposed as", recomposedchar,
 923                                         if unichr(item) == recomposedchar:
 924                                                 print "verified"
 925
 926         if verbose == False:
 927                 print "Unicode statistics from UnicodeData.txt"
 928                 print "Number of entries that can be algorithmically produced     :", counter_entries
 929                 print "  of which are for Greek                                   :", counter_entries_greek
 930                 print "Number of compose sequence combinations requiring          :", counter_combinations
 931                 print "  of which are for Greek                                   :", counter_combinations_greek
 932                 print "Note: We do not include partial compositions, "
 933                 print "thus the slight discrepancy in the figures"
 934                 print
 935
# With opt_unicodedatatxt, run the UnicodeData.txt pass in verbose mode
# (the argument is the function's 'verbose' parameter).
if opt_unicodedatatxt:
        process_unicodedata_file(True)

# With opt_statistics, print a report built from the counters gathered
# earlier in the script.
if opt_statistics:
        print
        print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
        print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
        print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
        print "    of which have Multi_key                                :", counter_multikey
        print
        print "Algorithmic (stats for Xorg Compose file)"
        print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
        print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
        print "  of which are for Greek                                   :", num_algorithmic_greek
        print
        # Non-verbose call: this prints the UnicodeData.txt summary at this
        # point of the report.
        process_unicodedata_file()
        print "Not algorithmic (stats from Xorg Compose file)"
        print "Number of sequences                                        :", len(xorg_compose_sequences)
        print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
        print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
        print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
        # NOTE(review): the percentage divides by len(xorg_compose_sequences) * 6,
        # which raises ZeroDivisionError when there are no sequences.
        print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
        print "Number of different first items                            :", num_first_keysyms
        print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
        print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
        print
        print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
        print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
        print
        # The figures for the old implementation are literal constants, not computed.
        print "Existing (old) implementation in GTK+"
        print "Number of sequences in old gtkimcontextsimple.c            :", 691
        print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"