Code:GenerateLetterStats.py

From Exterior Memory
Jump to: navigation, search

See Letter Distribution for the use of this script.

#!/usr/bin/env python
# encoding: utf-8
"""
generateletterstats.py
Read all files in a directory; each file should contain only letters and spaces (incl. carriage returns).
Calculate the frequency distribution of letters, letter pairs and letter triplets in each file.
Normalize these numbers (divide by the total number of counted sequences in the file).
Merge these per-file stats by calculating for every occurring letter, letter pair and letter triplet:
the average, standard deviation, mean (computed as the median), minimum, maximum, # files with the entry, and the letter, letter pair or letter triplet itself.
Write these letter-stats, sorted by mean and average.
"""

import sys
import os
import unicodedata
import codecs
import math

class Stats(object):
    """Summary statistics over a number of measurements of one entry.

    Given a number of measurements, calculate the length, non-zero count,
    minimum, maximum, mean, average and standard deviation.

    Naming caveat: the attribute ``mean`` holds the *median* (the middle
    element of the sorted values; for an even number of values the upper of
    the two middle elements), while ``average`` holds the arithmetic mean.
    ``stddev`` divides by the number of values (population standard deviation).
    """

    def __init__(self, values, entry=None):
        """Keep a sorted copy of `values` and compute all statistics.

        values -- iterable of numbers; must not be empty (IndexError otherwise)
        entry  -- the item (e.g. a letter sequence) the measurements belong to
        """
        self.values = sorted(values)
        self.entry = entry
        self.recalculate()

    def recalculate(self):
        """(Re)compute every derived attribute from self.values.

        self.values is expected to be sorted ascending: minimum, maximum and
        mean are read by position.
        """
        values = self.values
        self.length = len(values)
        # Count with a generator instead of len(filter(...)): filter() only
        # returns a list (which has a len) on Python 2.
        self.occurrence = sum(1 for x in values if x > 0.0)
        self.minimum = values[0]
        self.maximum = values[-1]
        self.mean = values[len(values)//2]  # median, see class docstring
        self.sum = sum(values)
        self.average = float(self.sum)/self.length
        self.stddev = math.sqrt(sum((x-self.average)**2 for x in values) / self.length)

    def stats(self):
        """Return all statistics as a tuple, in the column order used by statline()."""
        return (self.mean, self.average, self.stddev, self.minimum, self.maximum,
                self.occurrence, self.length, self.entry, charseqname(self.entry))

    def __str__(self):
        return "%s: mean %.17f; average %.17f; standard deviation %.17f; occurrence: %d/%d" % (charseqname(self.entry), self.mean, self.average, self.stddev, self.occurrence, self.length)

    def statline(self):
        """Return one tab-separated, newline-terminated, UTF-8 encoded line with the statistics."""
        return (u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n" % self.stats()).encode('utf-8')

    @staticmethod
    def titleline(title=None):
        """Return the UTF-8 encoded header line; `title` labels the entry column."""
        if title is None:
            title = ""
        # NOTE: the "Mimimum" spelling is kept as-is so that the generated
        # files stay byte-identical to earlier runs.
        return (u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n" % ("Mean", "Average", "Std dev", "Mimimum", "Maximum", "# Occurrences", "# Books", title)).encode('utf-8')

# def wordletterfilestats(filename, seqlen=2):
#     words    = {}
#     doubles  = {}
#     src = codecs.open(filename, "rb", "utf-8")
#     CHUNKSIZE=1024
#     prevline = u" "  # we prepend the file with an additional space, to also count the first letter of the first word
#     for line in src:
#         word = line.rstrip()
#         if word not in words:
#             words[word] = 0
#         words[word] += 1
#         line = prevline + line.replace(u"\n",u" ")
#         for i in xrange(len(line)-seqlen+1):
#             letterseq = line[i:i+seqlen]
#             if letterseq not in doubles:
#                 doubles[letterseq] = 0
#             doubles[letterseq] += 1
#             prevline = line[-seqlen+1:]
#     src.close()
#     return words, doubles, prevline

def letterfilestats(filename, seqlen=2):
    """Count every sequence of `seqlen` consecutive characters in a UTF-8 file.

    The text is treated as if it started with one extra space (so the first
    letter of the first word is counted as the start of a word) and every
    newline is replaced by a space. The file is read in chunks; the last
    seqlen-1 characters of each chunk are carried over so that sequences
    spanning a chunk boundary are counted exactly once.

    filename -- path of the UTF-8 encoded text file
    seqlen   -- length of the character sequences to count

    Returns a tuple (letterseqs, prevchunk): letterseqs maps each sequence to
    its number of occurrences, prevchunk holds the trailing (at most seqlen-1)
    characters of the text, which seqreduce() needs as `finalseq`.
    """
    letterseqs = {}
    CHUNKSIZE = 1024
    carry = seqlen - 1  # number of characters shared with the next chunk
    prevchunk = u" "  # we prepend the file with an additional space, to also count the first letter of the first word
    # 'with' guarantees the file is closed, also when decoding fails halfway.
    with codecs.open(filename, "rb", "utf-8") as src:
        while True:
            chunk = src.read(CHUNKSIZE)
            if len(chunk) == 0:
                break
            chunk = prevchunk + chunk.replace(u"\n", u" ")
            for i in range(len(chunk) - seqlen + 1):
                letterseq = chunk[i:i+seqlen]
                letterseqs[letterseq] = letterseqs.get(letterseq, 0) + 1
            # Update the carry-over once per chunk, also when the chunk was
            # too short to contain a full sequence (otherwise its characters
            # would be lost). A plain chunk[-carry:] with carry == 0 would
            # select the whole chunk, hence the explicit empty case.
            prevchunk = chunk[-carry:] if carry > 0 else u""
    return letterseqs, prevchunk

def seqreduce(stats3, finalseq):
    """Derive the (n-1)-letter distribution from an n-letter distribution.

    Every n-letter sequence contributes its count to the sequence made of its
    first n-1 letters. That covers every shorter sequence except the very last
    one in the text, which is passed in as `finalseq` and counted once extra.

    Returns (reduced distribution, finalseq without its first character); the
    second item is the `finalseq` for the next reduction step.
    """
    reduced = {}
    for longseq, count in stats3.items():
        prefix = longseq[:-1]
        reduced[prefix] = reduced.get(prefix, 0) + count
    # the trailing sequence never shows up as a prefix, so add it by hand
    if finalseq:
        reduced[finalseq] = reduced.get(finalseq, 0) + 1
    return reduced, finalseq[1:]


def charname(char):
    """Return a short readable name for a single character.

    The name is the capitalized last word of the character's Unicode name
    (u"a" -> "A", u" " -> "Space"). Characters for which no name can be
    looked up yield u"unnamed character " followed by the character itself.
    """
    try:
        lastword = unicodedata.name(char).split()[-1]
    except (ValueError, IndexError, TypeError):
        return u"unnamed character "+char
    return lastword.capitalize()

def charseqname(charseq):
    """Return a printable representation of a character sequence.

    Sequences longer than three characters are returned unchanged. Shorter
    ones are returned unchanged as well when every character has a
    one-character name according to charname(); otherwise the names of the
    individual characters are joined with spaces.
    """
    if len(charseq) > 3:
        return charseq
    names = []
    namelength = 0
    for ch in charseq:
        name = charname(ch)
        names.append(name)
        namelength += len(name)
    if namelength != len(charseq):
        return u" ".join(names)
    return charseq

def printstats(letters, title="Letter", total=None):
    """Print an occurrence table (count and percentage per entry), sorted by entry.

    letters -- occurrence dictionary (entry: occurrence count)
    title   -- heading of the entry column
    total   -- number the percentages are relative to; defaults to the sum of
               all counts in `letters`
    """
    if total is None:
        total = totalcount(letters)
    # print(...) with one pre-formatted string gives the same output under
    # both the print statement (Python 2) and the print function (Python 3).
    print("%-20s  %14s" % (title, "# Occurrences"))
    for c, v in sorted(letters.items()):
        print("%-16s  %9d (%5.2f%%)" % (c, v, 100.0*v/total))


def totalcount(occurrencedict):
    """Return the sum of all counts in an occurrence dictionary (entry: occurrence count)."""
    return sum(occurrencedict.values())


def normalize(occurrencedict):
    """Scale an occurrence dictionary (entry: occurrence count) in place so its values sum to 1.0.

    Every count is replaced by count / total, where total is the sum of all
    counts before scaling. The same (modified) dictionary is returned.
    """
    total = totalcount(occurrencedict)
    for entry in list(occurrencedict):
        occurrencedict[entry] = float(occurrencedict[entry])/total
    return occurrencedict


def merge(occurrencedicts):
    """Combine per-file occurrence dictionaries into one Stats instance per entry.

    occurrencedicts -- list of occurrence dictionaries, one per file

    For every entry found in at least one dictionary, the value from each
    dictionary is collected (0.0 where the entry is absent) and wrapped in a
    Stats instance. Returns the list of Stats instances, in no particular order.
    """
    entries = set()
    for od in occurrencedicts:
        entries.update(od.keys())
    freqstats = []
    for entry in entries:
        # one measurement per file; files without the entry count as 0.0
        perfile = [od[entry] if entry in od else 0.0 for od in occurrencedicts]
        freqstats.append(Stats(perfile, entry))
    return freqstats

def writefreqdist(freqstats, filename, title=None):
    """Given an array of Stats instances, write them to disk in the given order.

    freqstats -- iterable of objects with a statline() method (Stats instances)
    filename  -- path of the file to (over)write
    title     -- when given (and non-empty), a header line from
                 Stats.titleline(title) is written first
    """
    # 'with' closes the file even when a statline() call or a write fails.
    with open(filename, "w") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline())

def dirfiles(dir, maxlen=None):
    """Return the paths of the non-hidden files directly inside directory `dir`.

    dir    -- directory to list; subdirectories are not visited
    maxlen -- maximum number of paths to return; None (or 0) means no limit

    Files whose name starts with a dot are skipped and do not count towards
    `maxlen`. Raises StopIteration when os.walk() yields nothing for `dir`
    (e.g. the directory does not exist).
    """
    # we do not visit subdirectories, so only the first os.walk() result is used.
    root, dirs, files = next(os.walk(dir))
    filenames = [os.path.join(dir, f) for f in files if not f.startswith('.')]
    if maxlen:
        # Truncate after filtering, and to exactly maxlen entries.
        filenames = filenames[:maxlen]
    return filenames


def seqname(seqlen, title="letter"):
    """Return the name used for sequences of `seqlen` items.

    With the default title: "letters", "letterpairs" and "lettertriplets"
    for lengths 1, 2 and 3, and "letterseq<N>" for any other length N.
    """
    for length, suffix in ((1, "s"), (2, "pairs"), (3, "triplets")):
        if seqlen == length:
            return title+suffix
    return title+"seq"+str(seqlen)


if __name__ == '__main__':
    # Script entry point: read every file in 'en-words', compute normalized
    # per-file distributions for sequences of 1..maxseqlen letters, merge them
    # into per-entry statistics and write one file per sequence length to
    # 'en-stats'.
    # Each print() gets a single pre-formatted string, so the output is the
    # same under the Python 2 print statement and the Python 3 print function.
    print("Reading and normalizing per-file statistics")
    maxseqlen = 3
    letteroccurrence = {}
    stats = {}
    for seqlen in range(1, maxseqlen+1):
        stats[seqlen] = []
    for filename in dirfiles('en-words'):
        size = os.stat(filename).st_size
        print("%s ( %d kB )" % (filename, size // 1000))
        # Only the longest sequences are counted from the file itself; the
        # shorter distributions are derived from them, one step at a time.
        letteroccurrence[maxseqlen], lastseq = letterfilestats(filename, seqlen=maxseqlen)
        for seqlen in range(maxseqlen, 1, -1):
            letteroccurrence[seqlen-1], lastseq = seqreduce(letteroccurrence[seqlen], lastseq)
        for seqlen in list(letteroccurrence):
            # normalize() scales the dictionary in place and returns it
            stats[seqlen].append(normalize(letteroccurrence[seqlen]))
        letterstats = letteroccurrence[1]
        print("    " + " ".join([charname(ch) for ch in sorted(letterstats)]))
    print("Merging and sorting per-file statistics ...")
    for seqlen in list(stats):
        stats[seqlen] = merge(stats[seqlen])
        stats[seqlen] = sorted(stats[seqlen], key=lambda x: (-x.mean, -x.average, x.entry))
    print("Writing statistics ...")
    # Create the output directory when needed, so that the run does not fail
    # on the very last step after all statistics have been computed.
    if not os.path.isdir("en-stats"):
        os.makedirs("en-stats")
    for seqlen in stats:
        filename = "en-stats/%s.txt" % seqname(seqlen)
        writefreqdist(stats[seqlen], filename, seqname(seqlen).capitalize())