Code:Findbaddiacriticals.py

From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
findbaddiacriticals.py
Find misformatted diacriticals, "manually" formatted diacriticals, and wrongly encoded diacriticals.
Examples of manually formatted diacriticals:
[`e] → e + combining grave (\u0300)
        [`e] → è
['e] → e + combining acute accent (\u0301)
        ['e] → é
[^i] → i + combining circumflex accent (\u0302)
        [^i] → î ; [^o] → ô
[~n] → n + combining tilde (\u0303)
        [~n] → ñ ; [~i] → ĩ
[=a] → a + combining macron (\u0304)
        [=e] → ē ; [=i] → ī ; [=o] → ō
[)i] → i + combining breve (\u0306)
        [)a] → ă ; [)u] → ŭ ; [)y] → y (no precomposed codepoint exists for y)
[.u] → u + combining dot above (\u0307)
        [.u] → u ; [.g] → g (diacriticals are removed; no precomposed codepoints exist)
[:a] → a + combining umlaut (\u0308)
        [:a] → ä ; [:e] → ë ; [:i] → ï
[vr] → r + combining caron (\u030C)
        [vs] → š ; [vr] → ř
[a.] → a + combining dot below (\u0323)
        [i.] → ị ; [a.] → ạ
[a:] → a + combining diaeresis below (\u0324)
        [a:] → a (diacriticals are removed; no precomposed codepoints exist)
[i~] → i + combining tilde below (\u0330)
        [i~] → ḭ
[e=] → e + combining macron below (\u0331)
        [e=] → e ; [s=] → s ; [x=] → x (diacriticals are removed; no precomposed codepoints exist)

Other formatting such as [rho], [phi], [pi], [inf], _{max}  for subscript and ^{2}  for superscript is left intact, as it typically applies to whole words, and we can treat it as regular punctuation.

This script can not detect all misformatted diacriticals. In particular, it can not detect missing diacriticals (e.g. Dvorak instead of Dvořák).
[)oo] → oo              (unclear what character is referred to)
[=oo] → oo              (unclear what character is referred to)
Dr. Us[s]her            (undetected)
particular[ly]          (undetected)
ɑ-Centauri → α-Centauri (undetected)
Dvorak → Dvořák         (undetected)
f_te → fête             (undetected)
f^ete → fête            (undetected)
"""

import sys
import os
import codecs
import unicodedata
import tempfile
import shutil
import pprint
import re

# Illegal characters: the C0 control characters except TAB (9), LF (10) and
# CR (13), plus DEL and the C1 controls (127-159).  Backspace (8) is included
# on purpose: one file contained "f<BS>te", with a backspace character where
# the accented letter should have been.
try:
    _unichr = unichr    # Python 2: chr() would return a byte string
except NameError:
    _unichr = chr       # Python 3: chr() already returns text
controlchars = [
    _unichr(codepoint)
    for codepoint in (list(range(0, 9)) + [11, 12] +
                      list(range(14, 32)) + list(range(127, 160)))
]

# Regular expressions to detect 'manual' diacritical markup.
# manualdiac1re: the mark is written before the letter (it goes above it),
# e.g. [`e] ['e] [^i] [~n] [=a] [)i] [.u] [:a] [vr]
manualdiac1re = re.compile(r'(\[[`\'=^~\)\.:v][a-zA-Z]\])')
# manualdiac2re: the mark is written after the letter (it goes below it),
# e.g. [a.] [a:] [i~] [e=].  '~' is part of the set so that the tilde-below
# form [i~], which combinedchar() knows how to convert, is actually detected.
manualdiac2re = re.compile(r'(\[[a-zA-Z][\.:=~]\])')

# modname = globals()['__name__']     # __main__ if it is called directly
# module = sys.modules[modname]       # this module

def combinedchar(markupstr):
    """Convert one manual diacritical markup such as [:e] into the real character.

    markupstr must be exactly four characters long: an opening bracket, two
    payload characters and a closing bracket (only the length is checked, the
    brackets themselves are not inspected).

    If the second payload character is an ASCII letter, the first payload
    character names a mark placed above it ([`e], [vr], ...).  Otherwise the
    first payload character is taken as the letter and the second one names a
    mark placed below it ([a.], [i~], ...).

    Returns the NFC normalisation of letter + combining mark: a single
    precomposed character when Unicode has one, otherwise the letter followed
    by the combining character.

    Raises ValueError when the mark character is not a known one.
    """
    assert len(markupstr) == 4
    ascii_letters = u"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Marks written before the letter.
    marks_before = {
        '`': u"\u0300",   # grave
        "'": u"\u0301",   # acute
        '^': u"\u0302",   # circumflex
        '~': u"\u0303",   # tilde
        '=': u"\u0304",   # macron
        ')': u"\u0306",   # breve
        '.': u"\u0307",   # dot above
        ':': u"\u0308",   # diaeresis
        'v': u"\u030C",   # caron
    }
    # Marks written after the letter.
    marks_after = {
        '.': u"\u0323",   # dot below
        ':': u"\u0324",   # diaeresis below
        '~': u"\u0330",   # tilde below
        '=': u"\u0331",   # macron below
    }
    first, second = markupstr[1], markupstr[2]
    if second in ascii_letters:
        letter, mark = second, marks_before.get(first)
    else:
        letter, mark = first, marks_after.get(second)
    if mark is None:
        raise ValueError("%s: Not a valid diacritical formatting string" % markupstr)
    # To forbid combining characters entirely, return only the first
    # character of the normalised string instead.
    return unicodedata.normalize('NFC', letter + mark)


def process(filename):
    """Report and convert manual diacritical markup in one UTF-8 text file.

    The file is read line by line.  Every match of manualdiac1re or
    manualdiac2re is printed together with its line, replaced by the result
    of combinedchar(), and the updated line is printed again.  Lines that
    contain one of the characters in controlchars are reported as well, but
    those characters are left in place.

    The source file itself is never modified.  If at least one markup was
    converted, the converted text is written to a new file named
    filename + ".2"; otherwise nothing is written.
    """
    changes = False
    # The converted text is collected in an anonymous temporary file and only
    # copied to disk once we know that something actually changed.
    with tempfile.TemporaryFile() as tmpfile:
        dst = codecs.getwriter("utf-8")(tmpfile)
        with codecs.open(filename, "rb", "utf-8") as src:
            for line in src:
                ms = manualdiac1re.findall(line) + manualdiac2re.findall(line)
                for m in ms:
                    print("%s %s" % (m, line.rstrip()))
                    # Replace only the first occurrence: a markup that occurs
                    # several times is listed once per occurrence in ms.
                    line = line.replace(m, combinedchar(m), 1)
                    print(line)
                    changes = True
                for c in controlchars:
                    if c in line:
                        try:
                            name = unicodedata.name(c)
                        except ValueError:
                            name = "unknown codepoint"
                        print("Illegal control character %4X (%s) %s"
                              % (ord(c), name, line.rstrip()))
                dst.write(line)
        if changes:
            # Write next to the source; the ".2" suffix is appended exactly
            # once so the announced name is the name actually written.
            outname = filename + ".2"
            print("Writing changes to %s" % outname)
            tmpfile.seek(0)
            with open(outname, "wb") as out:
                shutil.copyfileobj(tmpfile, out)


def dirfiles(dir):
    """Return the paths of the non-hidden files directly inside dir.

    Subdirectories are not visited.  Names starting with '.' are skipped.
    Each returned path is dir joined with the file name.  If os.walk()
    produces nothing for dir (for instance because it does not exist), an
    empty list is returned.
    """
    # Only the first os.walk() entry, the top directory itself, is needed.
    # The builtin next() works on Python 2.6+ and 3, unlike the generator's
    # .next() method, and its default avoids leaking a bare StopIteration.
    top = next(os.walk(dir), None)
    if top is None:
        return []
    files = top[2]
    return [os.path.join(dir, f) for f in files if not f.startswith('.')]


if __name__ == '__main__':
    for filename in dirfiles('en-plaintext'):
        size = os.stat(filename).st_size
        print filename, "(",size/1000,"kB )"
        process(filename)
    #printletterfreq(letters)