Code:LetterFreq.py

See Letter Distribution for the use of this script.

"""letterfreq.py

Read all files in a directory and print a frequency distribution of all
characters.
"""
import sys
import os
import pprint
import unicodedata


def letterfreq(fileobj, letters=None):
    """Count every character produced by iterating over *fileobj*.

    Args:
        fileobj: Iterable of lines. Byte-string lines are decoded as UTF-8;
            lines that are already text are counted as-is, so both binary
            and text-mode file objects are accepted.
        letters: Optional dict mapping character -> count. It is updated in
            place, which lets one tally accumulate over several files.

    Returns:
        The dict of counts (the same object as *letters* when one was given).

    Raises:
        UnicodeDecodeError: If a byte-string line is not valid UTF-8.
    """
    if letters is None:
        letters = {}
    for line in fileobj:
        # Decode only raw bytes; on Python 2 ``bytes`` is ``str``, so lines
        # read from a plain file are decoded exactly as before.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        for c in line:
            letters[c] = letters.get(c, 0) + 1
    return letters


def charname(char):
    """Return a short label for *char* taken from its Unicode name.

    The label is the last word of the Unicode character name, capitalized
    (e.g. "LATIN SMALL LETTER A" gives "A"). Characters without a Unicode
    name, such as control characters, give "unnamed character " followed by
    the repr of the character.
    """
    try:
        return unicodedata.name(char).split()[-1].capitalize()
    except (ValueError, IndexError):
        # unicodedata.name raises ValueError when no name is defined.
        return "unnamed character " + repr(char)


def printletterfreq(letters):
    """Print a table of the counts in *letters*, sorted by character.

    Each row shows the character's repr, its code point in hex, the number
    of occurrences, its share of the total as a percentage, and the label
    returned by charname(). Every key must be a single character.
    """
    total = sum(letters.values())
    print("Character  Codepoint       # Occurances   Name")
    for c, v in sorted(letters.items()):
        print("%-10s u%04x  %12d (%5.2f%%)   %s"
              % (repr(c), ord(c), v, 100.0 * v / total, charname(c)))


def dirfiles(dir):
    """Return the paths of the non-hidden files directly inside *dir*.

    Subdirectories are not visited, and names starting with '.' are
    skipped. The paths are *dir* joined with each file name, in sorted
    order. A directory that does not exist (or cannot be listed) yields an
    empty list.
    """
    # Only the first os.walk() result is needed because subdirectories are
    # not visited; the default covers os.walk() yielding nothing at all.
    _root, _dirs, files = next(os.walk(dir), (dir, [], []))
    return [os.path.join(dir, f) for f in sorted(files)
            if not f.startswith('.')]


def main(argv=None):
    """Tally the characters of every file in a directory and print a table.

    The directory is the first command-line argument when one is given,
    otherwise 'en-words-nodiac'.
    """
    if argv is None:
        argv = sys.argv
    directory = argv[1] if len(argv) > 1 else 'en-words-nodiac'

    letters = {}
    for filename in dirfiles(directory):
        size = os.stat(filename).st_size
        print("%s ( %d kB )" % (filename, size // 1000))
        # Binary mode: letterfreq() does the UTF-8 decoding itself.
        with open(filename, "rb") as f:
            letters = letterfreq(f, letters)

    # Report line breaks as spaces, adding to any spaces already counted
    # rather than replacing their count.
    if u"\n" in letters:
        letters[u" "] = letters.get(u" ", 0) + letters.pop(u"\n")
    printletterfreq(letters)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# encoding: utf-8