Code:NormalizeStats.py

From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
printletterstats.py
Print normalized letter stats, leaving out the space character and characters that occur in less than half of the books
"""

import sys
import os
import pprint
import unicodedata
import codecs
import pickle
import math


class Stats(object):
    """Summary statistics of a series of measurements for a single entry.

    The values (mean, average, standard deviation, minimum, maximum,
    occurrence count and length) are passed ready-made to the constructor;
    this class only stores them and formats them as tab-separated text.
    """

    def __init__(self, mean, average, stddev, minimum, maximum, occurrence, length, entry, entryname=None):
        """Store the given summary values.

        minimum may be None, in which case statline() leaves the minimum
        and maximum columns empty.
        """
        self.values = None  # always None here: no raw measurements are kept
        self.entry = entry
        self.entryname = entryname
        self.length = length
        self.occurrence = occurrence
        self.minimum = minimum
        self.maximum = maximum
        self.mean = mean
        self.average = average
        self.stddev = stddev
        # Derived once at construction time; it is not refreshed when
        # average or length are changed afterwards.
        self.sum = average * length

    def stats(self):
        """Return the nine stored values as a tuple, in constructor argument order."""
        return (self.mean, self.average, self.stddev, self.minimum, self.maximum,
                self.occurrence, self.length, self.entry, self.entryname)

    def __str__(self):
        return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurrence: %d/%d" % (
            self.entryname, self.mean, self.average, self.stddev, self.occurrence, self.length)

    def statline(self):
        """Return one newline-terminated, tab-separated line, UTF-8 encoded.

        The line always has nine columns.  Whether the minimum and maximum
        columns are filled in is decided by self.minimum alone.
        """
        if self.minimum is not None:
            return (u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n" % self.stats()).encode('utf-8')
        # No minimum available: keep the two columns, but leave them empty.
        stats = (self.mean, self.average, self.stddev, self.occurrence, self.length, self.entry, self.entryname)
        return (u"%.17f\t%.17f\t%.17f\t\t\t%3d\t%3d\t%s\t%s\n" % stats).encode('utf-8')

    @staticmethod
    def titleline(title=None):
        """Return the UTF-8 encoded header line.

        title is used as the heading of the last column; None gives an
        empty heading.
        """
        if title is None:
            title = ""
        return (u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n" % (
            "Mean", "Average", "Std dev", "Minimum", "Maximum", "# Occurrences", "# Books", title)).encode('utf-8')


def readstats(filename):
    """Read a tab-separated statistics file into a list of Stats instances.

    The file is decoded as UTF-8 and its first line (the column titles) is
    skipped.  Every other non-blank line must hold nine tab-separated
    fields: mean, average, standard deviation, minimum, maximum, occurrence
    count, length, entry and entry name.

    Lines whose entry contains a space and lines whose mean is exactly
    zero are left out.  Empty minimum/maximum fields (which
    Stats.statline() writes when no minimum is set) are read back as None.

    Raises ValueError when a non-blank line does not have exactly nine
    fields or a numeric field cannot be parsed.
    """
    stat = []
    # The context manager closes the file even when a line fails to parse.
    with codecs.open(filename, "rb", "utf-8") as f:
        f.readline()  # skip the title line
        for line in f:
            if not line.strip(u"\r\n"):
                continue  # tolerate blank lines, e.g. at the end of the file
            (mean, average, stddev, minimum, maximum,
             occurrence, length, entry, entryname) = line.split("\t")
            if u' ' in entry:
                continue  # Do not include space
            mean = float(mean)
            average = float(average)
            stddev = float(stddev)
            # float('') would raise, so map empty columns to None instead.
            minimum = float(minimum) if minimum.strip() else None
            maximum = float(maximum) if maximum.strip() else None
            occurrence = int(occurrence)
            length = int(length)
            entryname = entryname.rstrip()
            if mean == 0.0:
                continue  # Only include entries with a non-zero mean
            stat.append(Stats(mean, average, stddev, minimum, maximum,
                              occurrence, length, entry, entryname))
    return stat

def writefreqdist(freqstats, filename, title=None):
    """Write Stats instances to a file, one line each, in the order given.

    freqstats -- iterable of objects whose statline() returns an encoded
                 line (such as Stats instances)
    filename  -- path of the file to create or overwrite
    title     -- when given and non-empty, a header line produced by
                 Stats.titleline(title) is written first
    """
    # statline() and titleline() return UTF-8 encoded byte strings, so the
    # file is opened in binary mode: the bytes are written unchanged (no
    # newline translation) and the call also works where text-mode files
    # refuse byte strings.  The with-block closes the file on errors too.
    with open(filename, "wb") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline())

def normalize(stats):
    """Rescale the given entries in place so they describe relative shares.

    Each mean is divided by the total of all means, and each average by
    the total of all averages.  The standard deviation is divided by the
    total of the averages as well.  minimum and maximum are reset to None
    on every entry.  Nothing is returned.
    """
    total_mean = total_average = 0.0
    # First pass: add up the totals that serve as divisors below.
    for entry in stats:
        total_mean += entry.mean
        total_average += entry.average
    # Second pass: rescale every entry by those totals.
    for entry in stats:
        entry.mean /= total_mean
        entry.average /= total_average
        entry.stddev /= total_average
        entry.minimum = entry.maximum = None


if __name__ == '__main__':
    # Read the letter-pair statistics, rescale them in place and store the
    # result in a second file in the same directory.
    source_path = 'en-stats/letterpairs.txt'
    target_path = "en-stats/normalizedletterpairs.txt"
    pair_stats = readstats(source_path)
    normalize(pair_stats)
    writefreqdist(pair_stats, target_path, "Letters")