Code:GenerateWordStats.py

From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
generatewordstats.py
Read all files in a directory, which contain exactly one word per line.
Generate per-file word counts, normalize these numbers (divide by the total number of words in the file).
Merge these per-file stats by calculating for every occurring word:
the average, standard deviation, mean (the middle value of the sorted per-file numbers), minimum, maximum, # files with the word, the word itself.
Write these word stats, sorted by mean and average.
"""

import sys
import os
import unicodedata
import codecs
import math

class Stats(object):
    """Summary statistics over a non-empty list of numeric measurements.

    Attributes set by the constructor:
        type      -- type of the first measurement passed in (int or float);
                     selects the number format used by statline().
        values    -- the measurements, sorted ascending.
        entry     -- optional label (e.g. the word) these measurements belong to.

    Attributes set by recalculate():
        length    -- number of measurements.
        occurance -- number of measurements greater than zero.
        minimum   -- smallest measurement.
        maximum   -- largest measurement.
        mean      -- middle element of the sorted measurements, i.e. the median
                     (the upper of the two middle elements for an even count).
                     The historical attribute name is kept for callers.
        sum       -- sum of all measurements.
        average   -- arithmetic mean (sum / length).
        stddev    -- population standard deviation (divides by length).
    """

    def __init__(self, values, entry=None):
        """Store a sorted copy of `values` and compute all statistics.

        Raises ValueError when `values` is empty, since none of the
        statistics is defined for zero measurements.
        """
        if len(values) == 0:
            raise ValueError("Stats requires at least one measurement")
        self.type = type(values[0])  # int or float
        self.values = sorted(values)
        self.entry = entry
        self.recalculate()

    def recalculate(self):
        """Recompute every derived statistic from self.values."""
        # minimum/maximum/mean are read by position, so restore the ordering
        # in case self.values was modified since the last calculation.
        self.values.sort()
        values = self.values
        self.length = len(values)
        self.occurance = sum(1 for x in values if x > 0.0)
        self.minimum = values[0]
        self.maximum = values[-1]
        self.mean = values[self.length // 2]
        self.sum = sum(values)
        self.average = float(self.sum) / self.length
        squared_deviations = sum((x - self.average) ** 2 for x in values)
        self.stddev = math.sqrt(squared_deviations / self.length)

    def stats(self):
        """Return all statistics as a tuple, in the column order of statline()."""
        return (self.mean, self.average, self.stddev, self.minimum,
                self.maximum, self.occurance, self.length, self.entry)

    def __str__(self):
        return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurance: %d/%d" % (self.entry, self.mean, self.average, self.stddev, self.occurance, self.length)

    def statline(self):
        """Return one tab-separated, newline-terminated text line with all statistics.

        Integer measurements are written as integers, float measurements
        with 17 decimals. Any other type is reported on standard output and
        formatted like a float.
        """
        if issubclass(self.type, float):
            return u"%19.17f\t%19.17f\t%19.17f\t%19.17f\t%19.17f\t%3d\t%3d\t%s\n" % self.stats()
        elif issubclass(self.type, int):
            return u"%19d\t%19.17f\t%.17f\t%19d\t%19d\t%3d\t%3d\t%s\n" % self.stats()
        else:
            print("Unkown type %s" % (self.type,))
            return u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\n" % self.stats()

    @staticmethod
    def titleline(title=None):
        """Return the UTF-8 encoded header line matching the columns of statline().

        `title` is the heading of the last (entry) column; it defaults to an
        empty heading.
        """
        if title is None:
            title = ""
        return (u"%-19s\t%-19s\t%-19s\t%-19s\t%-19s\t%s\t%s\t%s\n" % ("Mean", "Average", "Std dev", "Mimimum", "Maximum", "# Occurances", "# Books", title)).encode('utf-8')

def wordfilestats(filename, seqlen=2):
    """Count how often each word occurs in a UTF-8 file with one word per line.

    Trailing whitespace (including the line terminator) is stripped from
    every line; the remaining text is the word. Returns a dictionary mapping
    each word to its number of occurrences in the file.

    `seqlen` is accepted for backward compatibility but is not used.
    """
    words = {}
    # The with-statement closes the file even when decoding a line fails.
    with codecs.open(filename, "rb", "utf-8") as src:
        for line in src:
            word = line.rstrip()
            words[word] = words.get(word, 0) + 1
    return words


def printstats(letters, title="Letter", total=None):
    """Print a table of entries with their count and share of the total.

    letters -- dictionary mapping an entry to its occurrence count.
    title   -- heading for the entry column.
    total   -- value the percentages are relative to; defaults to the sum
               of all counts in `letters`.

    Rows are printed in sorted entry order. When `total` is zero the
    percentage is shown as 0.00 instead of dividing by zero.
    """
    if total is None:
        total = totalcount(letters)
    print("%-20s  %14s" % (title, "# Occurances"))
    for c, v in sorted(letters.items()):
        percentage = 100.0 * v / total if total else 0.0
        print("%-16s  %9d (%5.2f%%)" % (c, v, percentage))


def totalcount(occurancedict):
    """Return the sum of all counts in an occurrence dictionary (entry -> count)."""
    return sum(occurancedict.values())


def normalize(occurancedict):
    """Scale an occurrence dictionary (entry -> count) in place so its values add up to 1.0.

    Every count is replaced by its float share of the total count. The same
    (modified) dictionary is returned.
    """
    grand_total = totalcount(occurancedict)
    # Iterate over a snapshot of the keys; only the values are rewritten.
    for key in list(occurancedict):
        occurancedict[key] = float(occurancedict[key]) / grand_total
    return occurancedict


def merge(occurancedicts):
    """Combine per-file occurrence dictionaries into per-entry statistics.

    occurancedicts -- sequence of dictionaries (entry -> count or frequency),
                      one per file.

    For every entry found in at least one dictionary, a Stats instance is
    built from that entry's value in each dictionary, using zero where the
    entry is absent. Returns the list of Stats instances in no particular
    order; returns an empty list when no dictionary contains any entry.
    """
    # Take the numeric type (int or float) from the first value available,
    # so that the zeros filled in for absent entries have the same type.
    # Looking past empty dictionaries keeps an empty first file from
    # breaking the merge.
    zero = None
    for od in occurancedicts:
        if od:
            zero = type(next(iter(od.values())))(0)
            break
    if zero is None:
        return []
    entries = set()
    for od in occurancedicts:
        entries.update(od)
    return [Stats([od.get(entry, zero) for od in occurancedicts], entry)
            for entry in entries]

def writefreqdist(freqstats, filename, title=None):
    """Write a sequence of Stats instances to `filename`, one line per instance.

    freqstats -- iterable of objects with a statline() method returning text;
                 they are written in the order given (no sorting is done here).
    filename  -- path of the file to create or overwrite.
    title     -- when non-empty, a header line with this heading for the
                 entry column is written first.

    The output is UTF-8 encoded.
    """
    # Everything written is already UTF-8 encoded bytes, so the file is
    # opened in binary mode; the with-statement closes it on errors too.
    with open(filename, "wb") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline().encode('utf-8'))

def dirfiles(dir, maxlen=None):
    """Return the paths of the regular, non-hidden files directly inside `dir`.

    dir    -- directory to list; subdirectories are not visited.
    maxlen -- when given (and non-zero), at most this many paths are returned.

    Files whose name starts with '.' are skipped. The paths are returned in
    sorted file-name order, so the result (and the selection made by
    `maxlen`) is the same on every run. A directory that cannot be read
    yields an empty list.
    """
    # Only the first os.walk() result (the top directory) is needed; the
    # default covers the case where os.walk() yields nothing at all.
    root, dirs, files = next(os.walk(dir), (dir, [], []))
    filenames = [os.path.join(dir, f)
                 for f in sorted(files)
                 if not f.startswith('.')]
    # Apply the limit after hidden files are dropped, so it counts exactly
    # the files that are returned.
    if maxlen:
        filenames = filenames[:maxlen]
    return filenames


def main():
    """Read the word files in 'en-words', print a summary and write the word statistics.

    Two files are written to the 'en-stats' directory: 'wordstats.txt' with
    statistics over the normalized (relative) word frequencies, and
    'wordabsolutestats.txt' with statistics over the absolute word counts.
    """
    stats = []
    totalcounts = []
    differentcounts = []
    print("Reading and normalizing per-file statistics")
    for filename in dirfiles('en-words'):
        size = os.stat(filename).st_size
        print("%s ( %d kB )" % (filename, size // 1000))
        wordoccurance = wordfilestats(filename)
        totalwords = totalcount(wordoccurance)
        differentwords = len(wordoccurance)
        print("    %d different words; %d total words" % (differentwords, totalwords))
        stats.append(wordoccurance)
        totalcounts.append(totalwords)
        differentcounts.append(differentwords)
    if not stats:
        # Without any input file there is nothing to summarize.
        print("No input files found in 'en-words'; nothing to do.")
        return
    print("Summary for all files:")
    st = Stats(totalcounts)
    sd = Stats(differentcounts)
    print("                    Mean     Average     Std Dev     Minimum     Maximum     # Books")
    print("Different Words %8d %11.2f %11.2f %11d %11d %9d" % (sd.mean, sd.average, sd.stddev, sd.minimum, sd.maximum, sd.occurance))
    print("Total Words     %8d %11.2f %11.2f %11d %11d %9d" % (st.mean, st.average, st.stddev, st.minimum, st.maximum, st.occurance))
    print("Merging per-file statistics ...")
    # Merge the absolute counts first: normalize() rewrites the per-file
    # dictionaries in place.
    absstats = merge(stats)
    stats = merge([normalize(wordoccurance) for wordoccurance in stats])
    print(u"# Words in all books                     %10d" % (st.sum))
    # One merged entry exists per distinct word. Summing the per-file
    # distinct-word counts would count a word once for every book it is in.
    print(u"# Different words                        %10d" % (len(absstats)))
    print(u"# Words occurring at least twice         %10d" % (sum(1 for s in absstats if s.sum > 1)))
    print(u"# Words occurring in at least 2 books    %10d" % (sum(1 for s in absstats if s.occurance > 1)))
    # The middle value of the sorted per-book counts is non-zero exactly
    # when at least half of the books contain the word.
    print(u"# Words occurring in ≥ 50%% of the books  %10d" % (sum(1 for s in absstats if s.mean > 0.0)))
    print(u"Sorting statistics ...")
    sortkey = lambda x: (-x.mean, -x.average, x.entry)
    stats = sorted(stats, key=sortkey)
    absstats = sorted(absstats, key=sortkey)
    print(u"Ten most occuring words:")
    print(u"                    Mean     Average     Std Dev     Minimum     Maximum     # Books")
    for s in stats[:10]:
        print(u"%-12s %10.3f%% %10.3f%% %10.3f%% %10.3f%% %10.3f%% %9d" % (s.entry, 100.0*s.mean, 100.0*s.average, 100.0*s.stddev, 100.0*s.minimum, 100.0*s.maximum, s.occurance))
    print("Writing statistics ...")
    # Both result files go to the same directory.
    outdir = "en-stats"
    writefreqdist(stats, os.path.join(outdir, "wordstats.txt"), "Words")
    writefreqdist(absstats, os.path.join(outdir, "wordabsolutestats.txt"), "Words")


if __name__ == '__main__':
    main()