Code:GenerateWordStats.py

See Letter Distribution for the use of this script.

""" generatewordstats.py Read all files in a directory, which contain exactly one word per line. Generate per-file word counts, normalize these numbers (divide by the total number of words in the file). Merge these per-file stats by calculating for every occuring word: the average, standard deviation, mean, minimum, maximum, # files with the word, the word itself. Write these words-stats, sorted by mean and average. """ import sys import os import unicodedata import codecs import math class Stats(object): """Given a number of measurements, calculate the length, non-zero count, mimimum, maximum, mean, average and standard deviation""" def __init__(self, values, entry=None): self.type = type(values[0]) # int or float self.values = sorted(values) self.entry = entry self.recalculate def recalculate(self): self.length = len(self.values) self.occurance = len(filter(lambda x: x > 0.0, self.values)) self.minimum = self.values[0] self.maximum = self.values[-1] self.mean = self.values[len(self.values)//2] self.sum = sum(self.values) self.average = float(self.sum)/self.length self.stddev = math.sqrt(sum((x-self.average)**2 for x in self.values) / self.length) def stats(self): return (self.mean,self.average,self.stddev,self.minimum,self.maximum,self.occurance,self.length,self.entry) def __str__(self): return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurance: %d/%d" % (self.entry, self.mean, self.average, self.stddev, self.occurance, self.length) def statline(self): if self.type == float: return u"%19.17f\t%19.17f\t%19.17f\t%19.17f\t%19.17f\t%3d\t%3d\t%s\n" % self.stats elif self.type == int: return u"%19d\t%19.17f\t%.17f\t%19d\t%19d\t%3d\t%3d\t%s\n" % self.stats else: print "Unkown type %s" % (self.type) return u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\n" % self.stats @staticmethod def titleline(title=None): if title == None: title = "" return (u"%-19s\t%-19s\t%-19s\t%-19s\t%-19s\t%s\t%s\t%s\n" % ("Mean", "Average", "Std dev", "Mimimum", "Maximum", "# 
Occurances", "# Books", title)).encode('utf-8') def wordfilestats(filename, seqlen=2): words   = {} doubles = {} src = codecs.open(filename, "rb", "utf-8") for line in src: word = line.rstrip if word not in words: words[word] = 0 words[word] += 1 src.close return words def printstats(letters, title="Letter", total=None): if total == None: total = totalcount(letters) print "%-20s %14s" % (title, "# Occurances") for c,v in sorted(letters.items): print "%-16s %9d (%5.2f%%)" % (c,v,100.0*v/total) def totalcount(occurancedict): """Given a occurance dictionary (entry: occurance count), return the total number of occurances""" return sum(occurancedict[c] for c in occurancedict) def normalize(occurancedict): """normalize a occurance dictionary (entry: occurance count), so that the total number of occurances is 1.0""" total = totalcount(occurancedict) for ch, v in occurancedict.items: occurancedict[ch] = float(occurancedict[ch])/total return occurancedict def merge(occurancedicts): """Given an array of (per-file) occurance dictionaries, find all entries and calculate the statistics of each entry. Return an array with Stats instances.""" zero = type(occurancedicts[0].values[0])(0) entries = set for od in occurancedicts: entries = entries.union(set(od.keys)) freqstats = [] for entry in entries: entryvalues = [] for od in occurancedicts: if entry in od: entryvalues.append(od[entry]) else: entryvalues.append(zero) freqstats.append(Stats(entryvalues, entry)) return freqstats def writefreqdist(freqstats, filename, title=None): """Givens an array of Stats instances, sort write them to disk""" f = open(filename, "w") if title: f.write(Stats.titleline(title)) for s in freqstats: f.write(s.statline.encode('utf-8')) f.close def dirfiles(dir, maxlen=None): # we do not visit subdirectories, so this is not a loop. 
# print os.walk(dir) root, dirs, files = os.walk(dir).next if maxlen: files = files[:maxlen+1] filenames = [] for f in files: if f.startswith('.'): continue filenames.append(os.path.join(dir,f)) return filenames if __name__ == '__main__': stats = [] totalcounts    = [] differentcounts = [] print "Reading and normalizing per-file statistics" for filename in dirfiles('en-words'): size = os.stat(filename).st_size print filename, "(",size/1000,"kB )" wordoccurance = wordfilestats(filename) totalwords = totalcount(wordoccurance) differentwords = len(wordoccurance) print "   %d different words; %d total words" % (differentwords, totalwords) stats.append(wordoccurance) totalcounts.append(totalwords) differentcounts.append(differentwords) print "Summary for all files:" st = Stats(totalcounts) sd = Stats(differentcounts) print "                   Mean     Average     Std Dev     Minimum     Maximum     # Books" print "Different Words %8d %11.2f %11.2f %11d %11d %9d" % (sd.mean, sd.average, sd.stddev, sd.minimum, sd.maximum, sd.occurance) print "Total Words    %8d %11.2f %11.2f %11d %11d %9d" % (st.mean, st.average, st.stddev, st.minimum, st.maximum, st.occurance) print "Merging per-file statistics ..." absstats = merge(stats) for k,v in enumerate(stats): stats[k] = normalize(v) stats = merge(stats) print u"# Words in all books                    %10d" % (st.sum) # sum(s.sum for s in absstats) print u"# Different words                       %10d" % (sd.sum) # len(stats) print u"# Words occurring at least twice        %10d" % (len(filter(lambda s: s.sum > 1, absstats))) print u"# Words occurring in at least 2 books   %10d" % (len(filter(lambda s: s.occurance > 1, absstats))) print u"# Words occurring in ≥ 50%% of the books %10d" % (len(filter(lambda s: s.mean > 0.0, absstats))) print u"Sorting statistics ..." 
stats = sorted(stats, key=lambda x:(-x.mean,-x.average,x.entry)) absstats = sorted(absstats, key=lambda x:(-x.mean,-x.average,x.entry)) print u"Ten most occuring words:" print u"                   Mean     Average     Std Dev     Minimum     Maximum     # Books" for i,s in enumerate(stats): print u"%-12s %10.3f%% %10.3f%% %10.3f%% %10.3f%% %10.3f%% %9d" % (s.entry, 100.0*s.mean, 100.0*s.average, 100.0*s.stddev, 100.0*s.minimum, 100.0*s.maximum, s.occurance) if i > 10: break print "Writing statistics ..." writefreqdist(stats, "en-stat/wordstats.txt", "Words") writefreqdist(absstats, "en-stats/wordabsolutestats.txt", "Words")
#!/usr/bin/env python
# encoding: utf-8