# From Exterior Memory
# Jump to: navigation, search
#
# See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""Read a statistics file, and print as table (with HTML or other formatting)"""

import sys
import os
import pprint
import unicodedata
import codecs
import pickle
import math

class Stats(object):
    """Hold precomputed summary statistics for a single entry.

    The constructor only stores the values it is given (mean, average,
    standard deviation, minimum, maximum, occurrence count and length);
    nothing is calculated here apart from ``sum``, which is derived as
    ``average * length``.
    """

    def __init__(self, mean, average, stddev, minimum, maximum, occurrence, length, entry, entryname=None):
        # No raw measurements are passed in, so ``values`` is always None here.
        self.values = None
        self.entry = entry
        self.entryname = entryname
        self.length = length
        self.occurrence = occurrence
        # minimum/maximum may be None when they are not known.
        self.minimum = minimum
        self.maximum = maximum
        self.mean = mean
        self.average = average
        self.stddev = stddev
        self.sum = average * length

    def stats(self):
        """Return all stored values as a 9-tuple, in constructor argument order."""
        return (self.mean, self.average, self.stddev, self.minimum, self.maximum,
                self.occurrence, self.length, self.entry, self.entryname)

    def __str__(self):
        return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurrence: %d/%d" % (
            self.entryname, self.mean, self.average, self.stddev, self.occurrence, self.length)

    def statline(self):
        """Return the statistics as one tab-separated, newline-terminated line.

        The line always has nine columns; when ``minimum`` is None the
        minimum and maximum columns are left empty.
        """
        if self.minimum is not None:
            return u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n" % self.stats()
        # No minimum/maximum available: keep the column count, leave both blank.
        fields = (self.mean, self.average, self.stddev, self.occurrence,
                  self.length, self.entry, self.entryname)
        return u"%.17f\t%.17f\t%.17f\t\t\t%3d\t%3d\t%s\t%s\n" % fields

    @staticmethod
    def titleline(title=None):
        """Return a tab-separated header line; ``title`` labels the last column.

        Declared static because it uses no instance state, so it can be
        called as ``Stats.titleline(...)`` or on an instance.
        """
        if title is None:
            title = ""
        return u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n" % (
            "Mean", "Average", "Std dev", "Minimum", "Maximum", "# Occurrences", "# Books", title)

def charname(char):
    """Return the Unicode name of a single character.

    Falls back to ``u"unnamed character " + char`` when the lookup fails
    (for example for control characters, which have no name, or when
    ``char`` is not exactly one character long).
    """
    # NOTE(review): the body of the ``try`` was missing from this copy of the
    # script; ``unicodedata.name`` is the presumed original lookup -- confirm
    # against the original source.
    try:
        return unicodedata.name(char)
    except (ValueError, IndexError, TypeError):
        return u"unnamed character " + char

def charseqname(charseq):
    """Return a display label for a short character sequence.

    Sequences longer than three characters are returned unchanged.  For
    shorter ones each character is looked up with ``charname``; if the
    names are no longer than the characters themselves the sequence is
    returned as is, otherwise the names are appended in parentheses.
    """
    if len(charseq) > 3:
        return charseq
    names = [charname(ch) for ch in charseq]
    if sum(len(name) for name in names) == len(charseq):
        # Every "name" is a single character, so naming adds nothing.
        return charseq
    # NOTE(review): this copy of the script had two further return statements
    # (names only, and sequence plus names) whose selecting conditions were
    # lost, leaving the function returning None here.  The combined form is
    # used because it keeps both pieces of information; confirm against the
    # original source when a names-only label is needed.
    return charseq + u" (" + u" ".join(names) + u")"

def readstats(filename, maxlen=None):
    """Read a UTF-8, tab-separated statistics file into a list of Stats.

    Each non-blank line must have 8 or 9 columns in the order
    mean, average, stddev, minimum, maximum, occurrence, length, entry
    and optionally entryname (when absent, the entry doubles as its name).
    Empty minimum/maximum columns are stored as None.

    filename -- path of the file to read.
    maxlen   -- when a positive number, stop after that many entries.
                When exactly 0, entries whose mean is 0.0 are left out.

    Returns the list of Stats objects in file order.
    Raises ValueError for a line with the wrong number of columns or a
    non-numeric value in a numeric column.
    """
    stat = []
    # ``with`` makes sure the file is closed, also when a line fails to parse.
    with codecs.open(filename, "rb", "utf-8") as f:
        for line in f:
            # Stop once maxlen entries are collected (the earlier ``i > maxlen``
            # check let one entry too many through).
            if maxlen and len(stat) >= maxlen:
                break
            if not line.strip():
                continue  # tolerate blank lines
            statline = line.split(u"\t")
            if len(statline) == 8:
                (mean, average, stddev, minimum, maximum, occurrence, length, entry) = statline
                entryname = entry
            elif len(statline) == 9:
                (mean, average, stddev, minimum, maximum, occurrence, length, entry, entryname) = statline
            else:
                raise ValueError("expected 8 or 9 tab-separated columns, got %d: %r"
                                 % (len(statline), line))
            mean = float(mean)
            average = float(average)
            stddev = float(stddev)
            if minimum.strip() and maximum.strip():
                minimum = float(minimum)
                maximum = float(maximum)
            else:
                # Columns left empty by the writer: no minimum/maximum known.
                minimum = None
                maximum = None
            occurrence = int(occurrence)
            length = int(length)
            entry = entry.rstrip(u"\r\n")
            entryname = entryname.rstrip()
            # NOTE(review): the statement guarded by this condition was missing
            # from this copy of the script; skipping the entry is assumed
            # (stopping the read is the other plausible reading) -- confirm.
            if (maxlen == 0) and (mean == 0.0):
                continue
            stat.append(Stats(mean, average, stddev, minimum, maximum,
                              occurrence, length, entry, entryname))
    return stat

def formattext(stats, title=u"", maxlen=None, useperc=True, prepend=u"", sep=u"\t", append=u"\n" ,tprepend=u"", tsep=u"\t", tappend=u"\n", head=u"", foot=u""):
    """Format a sequence of Stats-like objects as a table and return the text.

    stats    -- sequence of objects with ``entryname``, ``mean``, ``average``,
                ``stddev``, ``occurrence`` and ``length`` attributes.
    title    -- heading of the first (name) column.
    maxlen   -- when truthy, only the first ``maxlen`` entries are shown.
    useperc  -- show mean/average/stddev as percentages (value * 100 plus a
                ``%`` sign) instead of raw fractions.
    prepend, sep, append    -- text before, between and after the cells of a
                               data row.
    tprepend, tsep, tappend -- the same for the header row.
    head, foot              -- text emitted before and after the whole table.

    The number of decimals is derived from the last non-zero average in the
    list, with a minimum of two significant decimals for fractions.
    An empty ``stats`` yields just head, header row and foot.
    """
    if maxlen:
        stats = stats[:maxlen]

    # Walk back from the end to the last entry with a non-zero average;
    # fall back to the default precision when there is none (empty input or
    # all-zero averages), instead of running off the front of the list.
    smallest = next((s.average for s in reversed(stats) if s.average != 0.0), None)
    if smallest is not None and smallest > 0:
        # log10 is exact at powers of ten, unlike math.log(x, 10).
        precision = max(2, int(2 - math.log10(smallest)))
    else:
        precision = 2

    # Width of the "/<length>" suffix of the occurrence column.
    books = stats[0].length if stats else 0
    lenprecision = 2 + int(math.log10(books)) if books > 0 else 2

    if useperc:
        precision -= 2  # multiplying by 100 moves two decimals before the point
    collen = precision + 3

    fformat = u"%" + str(collen) + u"." + str(precision) + u"f"
    if useperc:
        fformat += u"%%"
    dformat = u"%" + str(collen - lenprecision) + u"d"
    tformat = u"%" + str(collen + 1) + u"s"
    sformat = u"%-" + str(6) + u"s"
    lineformat = prepend + sformat + sep + fformat + sep + fformat + sep + fformat + sep + dformat + u"/%d" + append
    headformat = tprepend + sformat + tsep + tformat + tsep + tformat + tsep + tformat + tsep + tformat + tappend

    scale = 100 if useperc else 1
    parts = [head,
             headformat % (title, u"Mean", u"Average", u"Std. deviation", u"Book occurrence")]
    # Exactly one row per entry, in either mode.
    for s in stats:
        parts.append(lineformat % (s.entryname, scale * s.mean, scale * s.average,
                                   scale * s.stddev, s.occurrence, s.length))
    parts.append(foot)
    return u"".join(parts)

def formattab(stats, title=u"", maxlen=None, useperc=True):
    """Render the statistics as plain tab-separated text, one row per line."""
    layout = dict(
        prepend=u"", sep=u"\t", append=u"\n",
        tprepend=u"", tsep=u"\t", tappend=u"\n",
        head=u"", foot=u"",
    )
    return formattext(stats, title, maxlen, useperc, **layout)

def formathtml(stats, title=u"", maxlen=None, useperc=True):
    """Render the statistics as an HTML <table> (header cells in <th>, data in <td>)."""
    layout = dict(
        prepend=u"<tr><td>", sep=u"</td><td>", append=u"</td></tr>\n",
        tprepend=u"<tr><th>", tsep=u"</th><th>", tappend=u"</th></tr>\n",
        head=u"<table>\n", foot=u"</table>",
    )
    return formattext(stats, title, maxlen, useperc, **layout)

def formatmediawiki(stats, title=u"", maxlen=None, useperc=True):
    """Render the statistics as a MediaWiki table ({| ... |} markup)."""
    layout = dict(
        prepend=u"|-\n| ", sep=u" || ", append=u"\n",
        tprepend=u"! ", tsep=u" !! ", tappend=u"\n",
        head=u"{|\n", foot=u"|}\n",
    )
    return formattext(stats, title, maxlen, useperc, **layout)

if __name__ == '__main__':
    # Print the letter statistics as a MediaWiki table on standard output.
    stats = readstats('en-stats/normalizedletters.txt')
    # A parenthesised single argument prints the same text whether ``print``
    # is a statement (Python 2) or a function (Python 3).
    print(formatmediawiki(stats, title=u"Letter"))
    # Alternative run: the 25 most frequent words instead of letters.
    # stats = readstats('en-stats/wordstats.txt', maxlen=25)
    # print(formatmediawiki(stats, title=u"Word"))