Code:NormalizeStats.py

See Letter Distribution for the use of this script.

"""
printletterstats.py

Print normalized letter stats without space, or characters that only
occur in less than half of the books.

The script reads a tab-separated statistics file, drops every entry that
contains a space or whose mean is zero, rescales the remaining means and
averages so that each column sums to 1, and writes the result back to disk.
"""
import sys
import os
import pprint
import unicodedata
import codecs
import pickle
import math


class Stats(object):
    """Hold pre-computed statistics for one entry (e.g. a letter pair).

    The constructor stores the values as given; nothing is calculated from
    raw measurements here except ``sum``, which is ``average * length``.

    Attributes:
        entry: the item the statistics describe.
        entryname: optional human readable name of the entry.
        length: number of measurements (written in the "# Books" column).
        occurrence: written in the "# Occurrences" column.
        minimum, maximum: extremes, or None when not available.
        mean, average, stddev: the summary values.
        sum: ``average * length``.
        values: always None in this script.
    """

    def __init__(self, mean, average, stddev, minimum, maximum, occurrence,
                 length, entry, entryname=None):
        self.values = None
        self.entry = entry
        self.entryname = entryname
        self.length = length
        self.occurrence = occurrence
        self.minimum = minimum
        self.maximum = maximum
        self.mean = mean
        self.average = average
        self.stddev = stddev
        self.sum = average * length

    def stats(self):
        """Return all fields as a 9-tuple in the column order of statline()."""
        return (self.mean, self.average, self.stddev, self.minimum,
                self.maximum, self.occurrence, self.length, self.entry,
                self.entryname)

    def __str__(self):
        return ("%s: mean %.17f; average %.17f; standard deviation %.17f; "
                "occurrence: %d/%d" % (self.entryname, self.mean,
                                       self.average, self.stddev,
                                       self.occurrence, self.length))

    def statline(self):
        """Return one UTF-8 encoded, tab-separated line ending in a newline.

        When ``minimum`` is None the minimum and maximum columns are left
        empty, so the line always has nine columns.
        """
        if self.minimum is not None:
            # stats() must be *called*: formatting with the bound method
            # itself raises TypeError.
            return (u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n"
                    % self.stats()).encode('utf-8')
        stats = (self.mean, self.average, self.stddev, self.occurrence,
                 self.length, self.entry, self.entryname)
        return (u"%.17f\t%.17f\t%.17f\t\t\t%3d\t%3d\t%s\t%s\n"
                % stats).encode('utf-8')

    @staticmethod
    def titleline(title=None):
        """Return the UTF-8 encoded header line; *title* labels the last column."""
        if title is None:
            title = ""
        return (u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n"
                % ("Mean", "Average", "Std dev", "Mimimum", "Maximum",
                   "# Occurrences", "# Books", title)).encode('utf-8')


def _parse_optional_float(text):
    """Return float(text), or None for an empty/blank column."""
    text = text.strip()
    if not text:
        return None
    return float(text)


def readstats(filename):
    """Read a tab-separated statistics file into a list of Stats instances.

    The first line is always treated as a header and skipped. Every other
    non-blank line must hold nine columns: mean, average, stddev, minimum,
    maximum, occurrence, length, entry, entryname. Empty minimum/maximum
    columns (as written by Stats.statline() when minimum is None) are read
    back as None.

    Entries containing a space and entries whose mean is 0.0 are left out.

    Raises:
        ValueError: if a line has the wrong number of columns or a numeric
            column cannot be parsed.
    """
    stat = []
    with codecs.open(filename, "rb", "utf-8") as f:
        f.readline()  # skip the header line
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            (mean, average, stddev, minimum, maximum, occurrence, length,
             entry, entryname) = line.split("\t")
            if u' ' in entry:
                continue  # Do not include space
            mean = float(mean)
            if mean == 0.0:
                continue  # Only include if mean > 0
            stat.append(Stats(mean,
                              float(average),
                              float(stddev),
                              _parse_optional_float(minimum),
                              _parse_optional_float(maximum),
                              int(occurrence),
                              int(length),
                              entry,
                              entryname.rstrip()))
    return stat


def writefreqdist(freqstats, filename, title=None):
    """Given an iterable of Stats instances, write them to disk in order.

    A header line is written first only when *title* is truthy. Note that
    readstats() always skips the first line, so a file written without a
    title loses its first record when read back.
    """
    # statline()/titleline() return encoded bytes, hence binary mode.
    with open(filename, "wb") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline())


def normalize(stats):
    """Rescale the given Stats in place so means and averages each sum to 1.

    Each mean is divided by the sum of all means; each average and each
    standard deviation is divided by the sum of all averages. minimum and
    maximum are reset to None. An empty sequence is left untouched.

    Raises:
        ZeroDivisionError: if the sum of means or averages is zero.
    """
    meansum = 0.0
    averagesum = 0.0
    for s in stats:
        meansum += s.mean
        averagesum += s.average
    for s in stats:
        s.mean = s.mean / meansum
        s.average = s.average / averagesum
        s.stddev = s.stddev / averagesum
        s.minimum = None
        s.maximum = None


if __name__ == '__main__':
    stats = readstats('en-stats/letterpairs.txt')
    normalize(stats)
    # write results
    filename = "en-stats/normalizedletterpairs.txt"
    writefreqdist(stats, filename, "Letters")
#!/usr/bin/env python
# encoding: utf-8