Code:StatsToTable.py

See Letter Distribution for the use of this script.

""" statsToTable.py Read a statistics file, and print as table (with HTML or other formatting) """ import sys import os import pprint import unicodedata import codecs import pickle import math class Stats(object): """Given a number of measurements, calculate the length, non-zero count, mimimum, maximum, mean, average and standard deviation""" def __init__(self, mean, average, stddev, minimum, maximum, occurrence, length, entry, entryname=None): self.values = None self.entry = entry self.entryname = entryname self.length = length self.occurrence = occurrence self.minimum = minimum self.maximum = maximum self.mean = mean self.average = average self.stddev = stddev self.sum = average*length def stats(self): return (self.mean,self.average,self.stddev,self.minimum,self.maximum,self.occurrence,self.length,self.entry,self.entryname) def __str__(self): return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurrence: %d/%d" % (self.entryname, self.mean, self.average, self.stddev, self.occurrence, self.length) def statline(self): if self.minimum != None: return u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n" % self.stats else: stats = (self.mean,self.average,self.stddev,self.occurrence,self.length,self.entry,self.entryname) return u"%.17f\t%.17f\t%.17f\t\t\t%3d\t%3d\t%s\t%s\n" % stats @staticmethod def titleline(title=None): if title == None: title = "" return u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n" % ("Mean", "Average", "Std dev", "Mimimum", "Maximum", "# Occurrences", "# Books", title) def charname(char): try: return unicodedata.name(char).split[-1].capitalize except (ValueError, IndexError, TypeError): return u"unnamed character "+char def charseqname(charseq): if len(charseq) &gt; 3: return charseq names = [charname(ch) for ch in charseq] if sum(len(ch) for ch in names) == len(charseq): return charseq else: return u" ".join(names) return charseq+u" ("+u" ".join(names)+u")" def readstats(filename, maxlen=None): f = codecs.open(filename, "rb", 
"utf-8") f.readline stat = [] i = 0 for line in f:        if maxlen and (i &gt; maxlen): break i +=1 statline = line.split("\t") if len(statline) == 8: (mean, average, stddev, minimum, maximum, occurance, length, entry) = statline entryname = entry else: (mean, average, stddev, minimum, maximum, occurance, length, entry, entryname) = statline mean     = float(mean) average  = float(average) stddev   = float(stddev) if maximum: minimum  = float(minimum) maximum  = float(maximum) else: minimum  = None maximum  = None occurance = int(occurance) length   = int(length) entry    = entry.rstrip("\n") entryname = entryname.rstrip if (maxlen == 0) and (mean == 0.0): break stat.append(Stats(mean,average,stddev,minimum,maximum,occurance,length,entry,entryname)) return stat def formattext(stats, title=u"", maxlen=None, useperc=True, prepend=u"", sep=u"\t", append=u"\n" ,tprepend=u"", tsep=u"\t", tappend=u"\n", head=u"", foot=u""): """Givens an array of Stats instances, sort write them to disk""" if maxlen: stats = stats[:maxlen] minimum = 0.0 i = 0 while (minimum == 0.0): i -= 1 minimum = stats[i].average precision = math.log(minimum,10) precision = max(2,int(2 - precision)) lenprecision = 2+int(math.log(stats[0].length,10)) if useperc: precision -= 2 collen = precision+3 fformat = u"%"+str(collen)+u"."+str(precision)+u"f" if useperc: fformat += u"%%" dformat = u"%"+str(collen-lenprecision)+u"d" tformat = u"%"+str(collen+1)+u"s" sformat = u"%-"+str(6)+u"s" lineformat = prepend+sformat+ sep+fformat+ sep+fformat+ sep+fformat+ sep+dformat+u"/%d"+append headformat = tprepend+sformat+tsep+tformat+tsep+tformat+tsep+tformat+tsep+tformat+tappend text = u"" text += head text += headformat % (title, u"Mean", u"Average", u"Std. 
deviation", u"Book occurrence") if useperc: for s in stats: name = s.entryname text += lineformat % (name, 100*s.mean, 100*s.average, 100*s.stddev, s.occurrence,s.length) else: for s in stats: text += lineformat % (name, s.mean, s.average, s.stddev, s.occurrence,s.length) text += foot return text def formattab(stats, title=u"", maxlen=None, useperc=True): return formattext(stats, title, maxlen, useperc, prepend=u"", sep=u"\t", append=u"\n", tprepend=u"", tsep=u"\t", tappend=u"\n", head=u"", foot=u"") def formathtml(stats, title=u"", maxlen=None, useperc=True): return formattext(stats, title, maxlen, useperc, prepend=u"&lt;tr&gt;&lt;td&gt;", sep=u"&lt;/td&gt;&lt;td&gt;", append=u"&lt;/td&gt;&lt;/tr&gt;\n", tprepend=u"&lt;tr&gt;&lt;th&gt;", tsep=u"&lt;/th&gt;&lt;th&gt;", tappend=u"&lt;/th&gt;&lt;/tr&gt;\n", head=u"&lt;table&gt;\n", foot=u"&lt;/table&gt;") def formatmediawiki(stats, title=u"", maxlen=None, useperc=True): return formattext(stats, title, maxlen, useperc, prepend=u"|-\n| ", sep=u" || ", append=u"\n", tprepend=u"! ", tsep=u" !! ", tappend=u"\n", head=u"{|\n", foot=u"|}\n") if __name__ == '__main__': stats = readstats('en-stats/normalizedletters.txt') print formatmediawiki(stats, title=u"Letter") # stats = readstats('en-stats/wordstats.txt', maxlen=25) # print formatmediawiki(stats, title=u"Word")
#!/usr/bin/env python
# encoding: utf-8