Code:Findbaddiacriticals.py

See Letter Distribution for the use of this script.

"""
findbaddiacriticals.py

Find misformatted diacriticals, "manually" formatted diacriticals, and
wrongly encoded diacriticals.

Manually formatted diacriticals are four-character markers such as [:e].
Each marker is replaced by the base letter followed by the matching
combining character, and the pair is then normalised to NFC.  When a
precomposed code point exists the result is a single character; otherwise
the base letter followed by the combining character is kept.

Mark before the letter (diacritical above):

    [`e] → e + combining grave (U+0300)          [`e] → è
    ['e] → e + combining acute (U+0301)          ['e] → é
    [^i] → i + combining circumflex (U+0302)     [^i] → î ; [^o] → ô
    [~n] → n + combining tilde (U+0303)          [~n] → ñ ; [~i] → ĩ
    [=a] → a + combining macron (U+0304)         [=e] → ē ; [=i] → ī ; [=o] → ō
    [)i] → i + combining breve (U+0306)          [)a] → ă ; [)u] → ŭ
                                                 [)y] has no precomposed form
    [.u] → u + combining dot above (U+0307)      [.g] → ġ
                                                 [.u] has no precomposed form
    [:a] → a + combining diaeresis (U+0308)      [:a] → ä ; [:e] → ë ; [:i] → ï
    [vr] → r + combining caron (U+030C)          [vs] → š ; [vr] → ř

Mark after the letter (diacritical below):

    [a.] → a + combining dot below (U+0323)        [i.] → ị ; [a.] → ạ
    [a:] → a + combining diaeresis below (U+0324)  [a:] has no precomposed form
    [i~] → i + combining tilde below (U+0330)      [i~] → ḭ
    [e=] → e + combining macron below (U+0331)     [e=], [s=], [x=] have no
                                                   precomposed form

Other formatting such as [rho], [phi], [pi], [inf], _{max} for subscript
and ^{2} for superscript is left intact, as it typically applies to whole
words, and we can treat it as regular punctuation.

This script can not detect all misformatted diacriticals.  In particular,
it can not detect missing diacriticals (e.g. Dvorak instead of Dvořák).

    [)oo], [=oo]            (not matched; unclear what character is
                             referred to)
    Dr. Us[s]her            (undetected)
    particular[ly]          (undetected)
    ɑ-Centauri → α-Centauri (undetected)
    Dvorak → Dvořák         (undetected)
    f_te → fête             (undetected)
    f^ete → fête            (undetected)
"""

import sys
import os
import codecs
import unicodedata
import tempfile
import shutil
import pprint
import re

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a Unicode character

# Characters that should never occur in a plain-text file: the C0 controls
# except TAB (9), LF (10) and CR (13), plus DEL and the C1 controls.
# The first range ends at 9 so that backspace (8) is included.
controlchars = [
    _unichr(codepoint)
    for codepoint in (list(range(0, 9)) + [11, 12]
                      + list(range(14, 32)) + list(range(127, 160)))
]

# Regular expressions to detect 'manual' diacritical markup.
# manualdiac1re: mark first, letter second (diacritical above), e.g. [:e]
# manualdiac2re: letter first, mark second (diacritical below), e.g. [a.]
manualdiac1re = re.compile(r'(\[[`\'=^~\)\.:v][a-zA-Z]\])')
manualdiac2re = re.compile(r'(\[[a-zA-Z][\.:~=]\])')

_LETTERS = u"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Markup symbol -> combining character placed above the letter.
_MARKS_ABOVE = {
    '`': u"\u0300",
    "'": u"\u0301",
    '^': u"\u0302",
    '~': u"\u0303",
    '=': u"\u0304",
    ')': u"\u0306",
    '.': u"\u0307",
    ':': u"\u0308",
    'v': u"\u030C",
}

# Markup symbol -> combining character placed below the letter.
_MARKS_BELOW = {
    '.': u"\u0323",
    ':': u"\u0324",
    '~': u"\u0330",
    '=': u"\u0331",
}


def combinedchar(markupstr):
    """Change manual markup like [:e] to ë.

    markupstr is a four-character marker: an opening bracket, two payload
    characters and a closing bracket.  If the second payload character is
    an ASCII letter, the first one names a diacritical above it; otherwise
    the first payload character is the letter and the second names a
    diacritical below it.

    Returns the NFC-normalised combination.  This is a single character
    when a precomposed code point exists, and the letter followed by the
    combining character otherwise.

    Raises ValueError if markupstr is not a valid diacritical marker.
    """
    if len(markupstr) != 4:
        raise ValueError("%s: Not a valid diacritical formatting string" % markupstr)
    first, second = markupstr[1], markupstr[2]
    if second in _LETTERS:
        letter = second
        combine = _MARKS_ABOVE.get(first)
    else:
        letter = first
        # The base must be a letter; "[..]" is not a diacritical marker.
        combine = _MARKS_BELOW.get(second) if letter in _LETTERS else None
    if combine is None:
        raise ValueError("%s: Not a valid diacritical formatting string" % markupstr)
    # Use the first character of the result only, if combining characters
    # are never allowed in the output.
    return unicodedata.normalize('NFC', letter + combine)


def process(filename):
    """Fix manual diacritical markup in one UTF-8 file and report problems.

    Every marker found by manualdiac1re / manualdiac2re is replaced by the
    result of combinedchar() and reported on stdout.  Lines containing one
    of the characters in controlchars are reported but not modified.

    If at least one marker was replaced, the corrected text is written to
    filename + ".2"; the source file itself is left untouched.
    """
    changes = False
    tmpfile = tempfile.TemporaryFile()
    try:
        dst = codecs.getwriter("utf-8")(tmpfile)
        src = codecs.open(filename, "rb", "utf-8")
        try:
            for line in src:
                ms = manualdiac1re.findall(line) + manualdiac2re.findall(line)
                for m in ms:
                    print("%s %s" % (m, line.rstrip()))
                    # findall() lists repeated markers once per occurrence,
                    # so replacing the first remaining one each time
                    # eventually fixes all of them.
                    line = line.replace(m, combinedchar(m), 1)
                    print(line)
                    changes = True
                for c in controlchars:
                    if c in line:
                        try:
                            name = unicodedata.name(c)
                        except ValueError:
                            name = "unknown codepoint"
                        print("Illegal control character %4X (%s) %s"
                              % (ord(c), name, line.rstrip()))
                dst.write(line)
        finally:
            src.close()
        if changes:
            # Write the corrected copy next to the source file.
            target = filename + ".2"
            print("Writing changes to %s" % target)
            tmpfile.seek(0)
            with open(target, "wb") as out:
                shutil.copyfileobj(tmpfile, out)
    finally:
        tmpfile.close()


def dirfiles(dir):
    """Return the paths of the non-hidden files directly inside dir.

    Subdirectories are not visited, and names starting with a dot are
    skipped.  An empty list is returned if dir cannot be walked.
    """
    # Only the first os.walk() result (dir itself) is needed, so no loop.
    root, dirs, files = next(os.walk(dir), (dir, [], []))
    return [os.path.join(dir, f) for f in files if not f.startswith('.')]


if __name__ == '__main__':
    for filename in dirfiles('en-plaintext'):
        size = os.stat(filename).st_size
        print("%s ( %d kB )" % (filename, size // 1000))
        process(filename)
#!/usr/bin/env python
# encoding: utf-8
# illegal characters (one file contained "f\x08te", where the second character there is the backspace character). Spiffy.
# regular expression to detect 'manual' diacritical characters
# modname = globals()['__name__']     # __main__ if it is called directly
# module = sys.modules[modname]       # this module