Code:LetterFreq.py

From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
letterfreq.py
Read all files in a directory and print a frequency distribution of all characters
"""

import sys
import os
import pprint
import unicodedata

def letterfreq(fileobj, letters=None):
    """Count how often each character occurs in the lines of fileobj.

    fileobj -- any iterable of lines (e.g. an open file). Byte-string lines
               are decoded as UTF-8; lines that are already unicode text are
               counted as they are.
    letters -- optional dict mapping character -> count. It is updated in
               place, so counts can be accumulated over several files.
               A new dict is created when it is omitted.

    Returns the dict of counts (the same object as letters when one was
    passed in). Raises UnicodeDecodeError if a byte line is not valid UTF-8.
    """
    if letters is None:
        letters = {}
    for line in fileobj:
        # Only byte strings need decoding. Calling .decode() on text that is
        # already unicode fails (AttributeError on Python 3, an implicit
        # ASCII round-trip on Python 2).
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        for c in line:
            letters[c] = letters.get(c, 0) + 1
    return letters

def charname(char):
    """Return a short, human-readable name for a single character.

    The short name is the last word of the character's Unicode name with
    only its first letter in upper case (e.g. "SPACE" -> "Space"). When that
    lookup raises ValueError or IndexError, the result is the text
    "unnamed character " followed by repr(char).
    """
    try:
        full_name = unicodedata.name(char)
        # Splitting once from the right yields the same final word as a
        # full whitespace split.
        last_word = full_name.rsplit(None, 1)[-1]
    except (ValueError, IndexError):
        return "unnamed character " + repr(char)
    return last_word.capitalize()

def printletterfreq(letters):
    """Print a table with one row per character in letters.

    letters -- dict mapping a single character to its number of occurrences.

    Rows are sorted by character. Each row shows the repr of the character,
    its code point in hex, its count, the count as a percentage of the sum
    of all counts, and the name returned by charname().
    """
    total = sum(letters.values())
    # Every print() call receives exactly one pre-formatted argument, so the
    # output is the same whether print is a statement (Python 2) or a
    # function (Python 3).
    print("Character   Codepoint       # Occurances   Name")
    for c, v in sorted(letters.items()):
        print("%-10s  u%04x  %12d (%5.2f%%)   %s"
              % (repr(c), ord(c), v, 100.0 * v / total, charname(c)))


def dirfiles(dir):
    """Return the paths of the non-hidden files directly inside dir.

    dir -- path of the directory to list.

    Subdirectories are not visited and names starting with '.' are skipped.
    Each returned path is dir joined with the file name. An empty list is
    returned when dir cannot be listed (e.g. it does not exist).
    """
    # Only the first os.walk() entry (the directory itself) is needed, since
    # subdirectories are not visited. os.walk() yields nothing for a path it
    # cannot list, hence the empty default.
    root, dirs, files = next(os.walk(dir), (dir, [], []))
    return [os.path.join(dir, f) for f in files if not f.startswith('.')]


if __name__ == '__main__':
    # Accumulate character counts over every file in the word-list directory,
    # then print the combined distribution.
    letters = {}
    for filename in dirfiles('en-words-nodiac'):
        size = os.stat(filename).st_size
        # Single pre-formatted argument: same output on Python 2 and 3.
        print("%s ( %d kB )" % (filename, size // 1000))
        # Read raw bytes and let letterfreq() do the UTF-8 decoding; the
        # with-block closes the file even if counting fails.
        with open(filename, "rb") as f:
            letters = letterfreq(f, letters)
    if u"\n" in letters:
        # Report line breaks as spaces, adding to (not overwriting) any
        # spaces that were counted in the files themselves.
        letters[u" "] = letters.get(u" ", 0) + letters.pop(u"\n")
    printletterfreq(letters)