Code:Firslastletter.py

See Letter Distribution for the use of this script.

"""
printletterstats.py

Derive first-letter and last-letter distributions from letter-pair statistics.

When run as a script, the letter-pair statistics in
``en-stats/letterpairs.txt`` are read, the pairs that start or end with a
space are extracted (a letter preceded by a space starts a word, a letter
followed by a space ends one), missing letters of the alphabet are added with
zero frequency, the values are normalized so that they sum to one, and the
results are written to ``en-stats/startletters.txt`` and
``en-stats/endletters.txt``.
"""

import codecs
import io
import math
import os
import pickle
import pprint
import sys
import unicodedata


class Stats(object):
    """Container for the summary statistics of one entry (e.g. one letter).

    The values are supplied by the caller; nothing is computed from raw
    measurements here except ``sum`` (``average * length``).

    Attributes:
        mean, average, stddev: summary values of the measurements.
        minimum, maximum: extreme values, or None when unknown (they are
            cleared by ``normalize``).
        occurrence: the non-zero count of the measurements.
        length: the total number of measurements.
        entry: the character(s) the statistics belong to.
        entryname: a human readable name for ``entry``.
    """

    def __init__(self, mean, average, stddev, minimum, maximum, occurrence,
                 length, entry, entryname=None):
        self.values = None
        self.entry = entry
        self.entryname = entryname
        self.length = length
        self.occurrence = occurrence
        self.minimum = minimum
        self.maximum = maximum
        self.mean = mean
        self.average = average
        self.stddev = stddev
        self.sum = average * length

    def stats(self):
        """Return all fields as a tuple, in the column order of statline()."""
        return (self.mean, self.average, self.stddev, self.minimum,
                self.maximum, self.occurrence, self.length, self.entry,
                self.entryname)

    def __str__(self):
        return ("%s: mean %.17f; average %.17f; standard deviation %.17f; "
                "occurrence: %d/%d" % (self.entryname, self.mean, self.average,
                                       self.stddev, self.occurrence,
                                       self.length))

    def statline(self):
        """Return one tab-separated, UTF-8 encoded line describing this entry.

        The minimum and maximum columns are left empty when they are unknown.
        """
        if self.minimum is not None and self.maximum is not None:
            return (u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n"
                    % self.stats()).encode('utf-8')
        stats = (self.mean, self.average, self.stddev, self.occurrence,
                 self.length, self.entry, self.entryname)
        return (u"%.17f\t%.17f\t%.17f\t\t\t%3d\t%3d\t%s\t%s\n"
                % stats).encode('utf-8')

    @staticmethod
    def titleline(title=None):
        """Return the UTF-8 encoded header line; `title` names the last column."""
        if title is None:
            title = ""
        # The spelling "Mimimum" is kept byte-for-byte: it is part of the
        # output that earlier runs of this script produced.
        return (u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n"
                % ("Mean", "Average", "Std dev", "Mimimum", "Maximum",
                   "# Occurrences", "# Books", title)).encode('utf-8')


def _optional_float(text):
    """Convert a column to float; an empty column (unknown value) gives None."""
    text = text.strip()
    return float(text) if text else None


def readstats(filename):
    """Read a statistics file as written by writefreqdist().

    The first line is always treated as the title line and skipped.  Entries
    consisting of a single space and entries with a mean of zero are left out.

    Returns:
        A list of Stats instances, in file order.

    Raises:
        ValueError: if a line does not have exactly nine tab-separated
            columns or a numeric column cannot be parsed.
    """
    stat = []
    with io.open(filename, "r", encoding="utf-8") as f:
        f.readline()  # skip the title line
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            (mean, average, stddev, minimum, maximum, occurrence, length,
             entry, entryname) = line.split(u"\t")
            if entry == u' ':
                continue  # Do not include space
            mean = float(mean)
            average = float(average)
            stddev = float(stddev)
            # Normalized files have empty minimum/maximum columns.
            minimum = _optional_float(minimum)
            maximum = _optional_float(maximum)
            occurrence = int(occurrence)
            length = int(length)
            entryname = entryname.rstrip()  # drop the trailing newline
            if mean == 0.0:
                continue  # Only include if mean > 0
            stat.append(Stats(mean, average, stddev, minimum, maximum,
                              occurrence, length, entry, entryname))
    return stat


def charname(char):
    """Return the capitalized last word of the Unicode name of `char`.

    For example "LATIN CAPITAL LETTER A" gives "A".  Characters without a
    Unicode name give "unnamed character " followed by the character.
    """
    try:
        return unicodedata.name(char).split()[-1].capitalize()
    except (ValueError, IndexError, TypeError):
        return u"unnamed character " + char


def extractfirslastletters(stats):
    """Split letter-pair statistics into word-start and word-end statistics.

    A pair whose first character is a space describes a letter starting a
    word; a pair whose second character is a space describes a letter ending
    a word.  All other pairs are ignored.  The selected Stats instances are
    modified in place: `entry` becomes the single letter and `entryname` its
    character name.

    Returns:
        A tuple (startstats, endstats) of lists of Stats instances.
    """
    startstats = []
    endstats = []
    for s in stats:
        assert len(s.entry) == 2
        if s.entry[0] == u' ':
            s.entry = s.entry[1]
            s.entryname = charname(s.entry)
            startstats.append(s)
        elif s.entry[1] == u' ':
            s.entry = s.entry[0]
            s.entryname = charname(s.entry)
            endstats.append(s)
    return startstats, endstats


def completeAlphabet(stat):
    """Append an all-zero Stats for every letter A-Z missing from `stat`.

    The list is extended in place and also returned.  The `length` of the
    added entries is copied from the first entry (0 if `stat` is empty).

    Raises:
        ValueError: if an entry is not an upper case letter A-Z, or occurs
            more than once.
    """
    alphabet = list(u"ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    length = stat[0].length if stat else 0
    for s in stat:
        assert len(s.entry) == 1
        alphabet.remove(s.entry)
    for ch in alphabet:
        stat.append(Stats(0.0, 0.0, 0.0, 0.0, 0.0, 0, length, ch, ch))
    return stat


def normalize(stats):
    """Scale the given Stats in place so means and averages each sum to one.

    The standard deviation is scaled by the same factor as the average.  The
    minimum and maximum are cleared, as they are no longer meaningful after
    scaling.  If a sum is zero the corresponding values are left unchanged
    instead of dividing by zero.
    """
    meansum = sum((s.mean for s in stats), 0.0)
    averagesum = sum((s.average for s in stats), 0.0)
    for s in stats:
        if meansum:
            s.mean = s.mean / meansum
        if averagesum:
            s.average = s.average / averagesum
            s.stddev = s.stddev / averagesum
        s.minimum = None
        s.maximum = None


def writefreqdist(freqstats, filename, title=None):
    """Write a sequence of Stats instances to `filename`, in the given order.

    If `title` is given (and not empty), a header line is written first.
    """
    # statline() and titleline() return UTF-8 encoded bytes, so the file has
    # to be opened in binary mode.
    with open(filename, "wb") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline())


def main():
    """Create the starting and ending letter distributions for English."""
    stats = readstats('en-stats/letterpairs.txt')
    startstats, endstats = extractfirslastletters(stats)
    startstats = completeAlphabet(startstats)
    endstats = completeAlphabet(endstats)
    normalize(startstats)
    normalize(endstats)
    # write results
    filename = "en-stats/startletters.txt"
    writefreqdist(startstats, filename, "Starting Letter")
    filename = "en-stats/endletters.txt"
    writefreqdist(endstats, filename, "Ending Letter")


if __name__ == '__main__':
    main()
#!/usr/bin/env python
# encoding: utf-8