From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
Convert a UTF-8 text file to a list of words.

All accent and quote characters are removed (don't becomes dont). Diacritical marks are kept or removed depending on keepdiacriticals (ä either remains ä or becomes a). Ligatures are expanded (thus ĳ becomes ij). Punctuation is removed (a.m. becomes am). Double spaces are removed (so the final text will never contain two or more spaces in a row). All spaces are converted into newlines, so that the final text contains a list of words. All text is changed to upper case.
"""

import sys
import os
import codecs
import unicodedata
import tempfile
import shutil
import pprint

# Specify, by Unicode category, which characters are kept, removed, or turned
# into a space (see below for an overview of the categories).
keepcats    = ['Ll', 'Lu', 'Lt']
removecats  = ['Cf', 'Lm', 'Lo', 'Mn', 'Nd', 'No', 'Pc', 'Pe', 'Pf', 'Pi', 'Ps', 'Sk']
spacecats   = ['Cc', 'Pd', 'Po', 'Sc', 'Sm', 'So', 'Zl', 'Zp', 'Zs']
# Note: category Lo is removed, which drops letters that are neither Latin nor
# Greek (such as Arabic, Hebrew or Chinese characters).

# Override specific characters. These strings are checked before the category
# lists; reduceChar() also appends every character it classifies to them, so
# they double as a lookup cache and grow while the script runs.
keepchars   = u""
removechars = u"\"'′″."  # in Po, but removed instead of turned into a space
spacechars  = u"-"

keepdiacriticals  = False   # should diacritical marks (such as umlauts) be kept or removed?
keepligatures     = False   # False expands ligatures (ĳ becomes ij) and formatting (½ becomes 1⁄2, ² becomes 2)
makeuppercase     = True    # convert characters to upper case?
removedoublespace = True    # a series of spaces is converted into a single space.
# It is tempting to retain double spaces at the end of a sentence and remove
# the others. That would allow statistics about the first letter of a sentence
# (as opposed to just the first letter of a word). In practice, however, it is
# not feasible to find sentence borders correctly: a dot can mark an
# abbreviation (s.v.p., a.m., Mr. Jones, L.T. Smith, etc.) and some sentences
# do not end in a dot (such as titles). Ideally there would be a separate
# abbreviation marker, but since that does not exist (and is not used), this
# would be a fruitless attempt.

# Unicode categories:           (space/remove/keep)
# Cc - Other, control                   s    tab, return
# Cf - Other, format                    r    line separator, pop directional formatting, right-to-left override
# Cn - Other, not assigned              ?    
# Co - Other, private use               ?    
# Cs - Other, surrogate                 ?    
# Ll - Letter, lowercase                k    abcdefřéîæäçȳøαβåάέγἡὡῶºɑ
# Lm - Letter, modifier                 r    
# Lo - Letter, other                    r    Arabic, Hebrew letter
# Lt - Letter, titlecase                k    
# Lu - Letter, uppercase                k    ABCDEFÀÃÆÇÈÏÑÖÛÝĀĪŒŪƷΑΓΔΕΗΙΚΛΜΝΟΠΡΣΤΥΦΩἸ
# Mc - Mark, spacing combining          ?    
# Me - Mark, enclosing                  ?    
# Mn - Mark, non-spacing                r    Latin, Arabic, Hebrew diacritical marks (umlaut, etc.)
# Nd - Number, decimal digit            r    0123456789
# Nl - Number, letter                   ?    
# No - Number other                     r   ₂¹²³¼½¾⅓⅔⅙⅛
# Pc - Punctuation, connector           r    _
# Pd - Punctuation, dash                s/r  –-
# Pe - Punctuation, close               r    )]}
# Pf - Punctuation, final quote         r    ’”»
# Pi - Punctuation, initial quote       r    ‘“«
# Po - Punctuation, other               s/r  !"#%&'*,./:;?@\¡·¿′″, Hebrew punctuation
# Ps - Punctuation, open                r    ([{
# Sc - Symbol, currency                 s    $¢£¤
# Sk - Symbol, modifier                 r    ^`¯´
# Sm - Symbol, math                     s    +<=>|~±×÷
# So - Symbol, other                    s    §©°℥♀♂�
# Zl - Separator, line                  s    
# Zp - Separator, paragraph             s    
# Zs - Separator, space                 s    space, no-break space (A0)

# modname = globals()['__name__']     # __main__ if it is called directly
# module = sys.modules[modname]       # this module

def getNormalization():
    """Return the Unicode normalization form matching the module settings.

    The form is selected by the truthiness of the module-level flags
    keepdiacriticals and keepligatures:

        keepdiacriticals  keepligatures  ->  form
        true              true               'NFC'
        true              false              'NFKC'
        false             true               'NFD'
        false             false              'NFKD'
    """
    # Keyed by (keep diacriticals?, keep ligatures?).
    forms = {
        (True, True): 'NFC',
        (True, False): 'NFKC',
        (False, True): 'NFD',
        (False, False): 'NFKD',
    }
    return forms[(bool(keepdiacriticals), bool(keepligatures))]

def reduceChar(char, space=u"\n"):
    """Classify a single character as kept, replaced by a space, or removed.

    Returns `char` itself when it is kept, `space` when it acts as a word
    separator, and the empty string when it is removed.

    The per-character override strings (keepchars, removechars, spacechars)
    are consulted first, in that order.  Otherwise the Unicode category of
    `char` is looked up in keepcats, removecats and spacecats, and the
    character is appended to the matching override string so that the next
    lookup of the same character is answered without a category lookup.

    Raises ValueError when the category is in none of the three lists.
    """
    global keepchars, removechars, spacechars

    # Explicit overrides and previously cached decisions.
    if char in keepchars:
        return char
    elif char in removechars:
        return u""
    elif char in spacechars:
        return space

    # Not seen before: decide by Unicode category and remember the outcome.
    category = unicodedata.category(char)
    if category in keepcats:
        keepchars = keepchars + char
        outcome = char
    elif category in removecats:
        removechars = removechars + char
        outcome = u""
    elif category in spacecats:
        spacechars = spacechars + char
        outcome = space
    else:
        raise ValueError("Do not know how to handle char %s (u%04X, category %s)" % (repr(char), ord(char), category))
    return outcome

def process(filename, destdir=None):
    """Convert the UTF-8 text file `filename` into a list of words.

    Every line is normalized with the form returned by getNormalization(),
    upper-cased when makeuppercase is set, and then filtered character by
    character through reduceChar().  Word separators are written as newlines,
    so the result holds one word per line; with removedoublespace set,
    consecutive separators collapse into a single newline (also across line
    boundaries of the input).

    The converted text is first collected in a temporary file and only then
    copied to its destination, so the source can safely be overwritten.

    filename -- path of the UTF-8 encoded source file.
    destdir  -- directory that receives the result under the base name of
                `filename`; when empty or None the source file itself is
                overwritten.

    Raises ValueError (from reduceChar) for characters of an unhandled
    Unicode category; the destination is not touched in that case.
    """
    normfactor = getNormalization()
    spaceChar = u"\n"
    lastspace = True  # last character was a space; used to suppress double spaces
    with tempfile.TemporaryFile() as tmpfile:
        dst = codecs.getwriter("utf-8")(tmpfile)
        with codecs.open(filename, "rb", "utf-8") as src:
            for line in src:
                pieces = []
                line = unicodedata.normalize(normfactor, line)
                if makeuppercase:
                    line = line.upper()
                for c in line:
                    cn = reduceChar(c, space=spaceChar)
                    if cn == spaceChar:
                        if (not lastspace) or (not removedoublespace):
                            pieces.append(cn)
                        lastspace = True
                    elif cn != u"":  # regular char
                        pieces.append(cn)
                        lastspace = False
                dst.write(u"".join(pieces))
        # The source is closed by now, so it can be overwritten if needed.
        if destdir:
            filename = os.path.join(destdir, os.path.basename(filename))
        # Rewind, otherwise the copy would start at the end of the temp file.
        tmpfile.seek(0)
        with open(filename, "wb") as out:
            shutil.copyfileobj(tmpfile, out)

def printletterfreq(letters):
    """Print one line per entry of `letters`: repr of the key, its code point, its value.

    `letters` is a mapping whose keys are single characters (ord() is applied
    to each key); the three fields are separated by single spaces.
    """
    for entry in letters.items():
        char, count = entry
        fields = (repr(char), str(ord(char)), str(count))
        print(" ".join(fields))

def dirfiles(dir):
    """Return the paths of the non-hidden files directly inside `dir`.

    Only the first entry produced by os.walk() is used, so subdirectories are
    not visited.  Files whose name starts with a dot are skipped.  Each
    returned path is the file name joined onto the walked directory, so it
    can be passed straight to os.stat() or open().

    Returns an empty list when os.walk() yields nothing for `dir` (for
    example because the directory does not exist).
    """
    # next() with a default replaces the Python 2 only `.next()` call and
    # avoids a bare StopIteration when there is nothing to walk.
    root, dirs, files = next(os.walk(dir), (dir, [], []))
    return [os.path.join(root, f) for f in files if not f.startswith('.')]

if __name__ == '__main__':
    # Convert every file found in 'en-plaintext' and store the resulting word
    # list under the same base name in 'en-words', reporting each file's size.
    for path in dirfiles('en-plaintext'):
        kilobytes = os.stat(path).st_size // 1000
        print(" ".join([path, "(", str(kilobytes), "kB )"]))
        process(path, 'en-words')