Code: GenerateLetterStats.py

See Letter Distribution for the use of this script.

"""generateletterstats.py

Read all files in a directory, which contain exactly only letters and spaces
(incl carriage returns).  Calculate the frequency distribution for letters,
letter pairs and letter triplets in each file.  Normalize these numbers
(divide by the total count in the file).  Merge these per-file stats by
calculating, for every occurring letter, letter pair and letter triplet:
the average, standard deviation, mean, minimum, maximum, the number of
files containing it, and the entry itself.  Write these letter-stats,
sorted by mean and average.
"""
import os
import math
import codecs
import unicodedata


class Stats(object):
    """Given a number of measurements, calculate the length, non-zero count,
    minimum, maximum, mean (median of the sorted values), average and
    standard deviation."""

    def __init__(self, values, entry=None):
        self.values = sorted(values)
        self.entry = entry
        # BUG FIX: the original said `self.recalculate` without calling it,
        # so none of the derived attributes were ever computed.
        self.recalculate()

    def recalculate(self):
        """(Re)compute all derived statistics from self.values."""
        self.length = len(self.values)
        # number of measurements in which the entry actually occurred
        # (list comprehension instead of len(filter(...)), which breaks
        # on Python 3 where filter returns an iterator)
        self.occurrence = len([x for x in self.values if x > 0.0])
        self.minimum = self.values[0]
        self.maximum = self.values[-1]
        # self.values is sorted, so this is the median
        self.mean = self.values[self.length // 2]
        self.sum = sum(self.values)
        self.average = float(self.sum) / self.length
        self.stddev = math.sqrt(
            sum((x - self.average) ** 2 for x in self.values) / self.length)

    def stats(self):
        """Return all statistics as one tuple (used by statline)."""
        return (self.mean, self.average, self.stddev, self.minimum,
                self.maximum, self.occurrence, self.length, self.entry,
                charseqname(self.entry))

    def __str__(self):
        return ("%s: mean %.17f; average %.17f; standard deviation %.17f; "
                "occurrence: %d/%d"
                % (charseqname(self.entry), self.mean, self.average,
                   self.stddev, self.occurrence, self.length))

    def statline(self):
        """One tab-separated output line for this entry.

        BUG FIX: the original formatted `% self.stats` (the bound method)
        instead of calling it.  Returns text; encoding to UTF-8 is now done
        by writefreqdist, which opens its file with an explicit encoding."""
        return (u"%.17f\t%.17f\t%.17f\t%.17f\t%.17f\t%3d\t%3d\t%s\t%s\n"
                % self.stats())

    @staticmethod
    def titleline(title=None):
        """Column-header line matching statline's layout.

        BUG FIX: output typo "Mimimum" corrected to "Minimum"."""
        if title is None:
            title = ""
        return (u"%17s\t%17s\t%17s\t%17s\t%17s\t%s\t%s\t%s\n"
                % ("Mean", "Average", "Std dev", "Minimum", "Maximum",
                   "# Occurrences", "# Books", title))


def letterfilestats(filename, seqlen=2):
    """Count every `seqlen`-character sequence in `filename`.

    Reads the UTF-8 file in chunks; newlines are treated as spaces.
    Returns (occurrence dict, trailing seqlen-1 characters) — the trailing
    fragment is needed by seqreduce to correct the shorter distributions.
    """
    letterseqs = {}
    CHUNKSIZE = 1024
    # we prepend the file with an additional space, to also count the
    # first letter of the first word
    prevchunk = u" "
    # BUG FIX: `with` guarantees the file is closed (original had
    # `src.close` without the call, so the handle leaked)
    with codecs.open(filename, "rb", "utf-8") as src:
        while True:
            chunk = src.read(CHUNKSIZE)
            if len(chunk) == 0:
                break
            chunk = prevchunk + chunk.replace(u"\n", u" ")
            for i in range(len(chunk) - seqlen + 1):
                seq = chunk[i:i + seqlen]
                letterseqs[seq] = letterseqs.get(seq, 0) + 1
            # keep the last seqlen-1 chars so sequences spanning a chunk
            # boundary are still counted.  BUG FIX: guard seqlen == 1,
            # where chunk[-0:] would wrongly keep the whole chunk.
            prevchunk = chunk[-(seqlen - 1):] if seqlen > 1 else u""
    return letterseqs, prevchunk


def seqreduce(stats3, finalseq):
    """Reduce a 3-letter sequence to a 2-letter sequence distribution, or in
    general an n-letter distribution to an (n-1)-letter distribution."""
    stats2 = {}
    for char3, v in stats3.items():
        char2 = char3[:-1]
        stats2[char2] = stats2.get(char2, 0) + v
    # correct for the last sequence, since we could not count that
    # at the longer length
    if finalseq:
        stats2[finalseq] = stats2.get(finalseq, 0) + 1
    return stats2, finalseq[1:]


def charname(char):
    """Short human-readable name for one character, e.g. 'A' or 'Space'.

    BUG FIX: `.split` and `.capitalize` were uncalled in the original, so it
    always fell into the TypeError fallback."""
    try:
        return unicodedata.name(char).split()[-1].capitalize()
    except (ValueError, IndexError, TypeError):
        return u"unnamed character " + char


def charseqname(charseq):
    """Readable name for a short character sequence; sequences consisting of
    single-character names (plain letters) are returned verbatim."""
    if len(charseq) > 3:
        return charseq
    names = [charname(ch) for ch in charseq]
    if sum(len(name) for name in names) == len(charseq):
        return charseq
    return u" ".join(names)


def printstats(letters, title="Letter", total=None):
    """Print an occurrence dictionary as a percentage table."""
    if total is None:
        total = totalcount(letters)
    print("%-20s %14s" % (title, "# Occurrences"))
    # BUG FIX: `.items` was uncalled in the original
    for c, v in sorted(letters.items()):
        print("%-16s %9d (%5.2f%%)" % (c, v, 100.0 * v / total))


def totalcount(occurrencedict):
    """Given an occurrence dictionary (entry: occurrence count), return the
    total number of occurrences"""
    return sum(occurrencedict.values())


def normalize(occurrencedict):
    """Normalize an occurrence dictionary (entry: occurrence count) in place,
    so that the total of all occurrences is 1.0"""
    total = totalcount(occurrencedict)
    for ch in occurrencedict:
        occurrencedict[ch] = float(occurrencedict[ch]) / total
    return occurrencedict


def merge(occurrencedicts):
    """Given an array of (per-file) occurrence dictionaries, find all entries
    and calculate the statistics of each entry.  Entries missing from a file
    count as 0.0 there.  Return an array with Stats instances."""
    entries = set()
    for od in occurrencedicts:
        entries |= set(od.keys())
    freqstats = []
    for entry in entries:
        entryvalues = [od.get(entry, 0.0) for od in occurrencedicts]
        freqstats.append(Stats(entryvalues, entry))
    return freqstats


def writefreqdist(freqstats, filename, title=None):
    """Given an array of Stats instances, write them to disk as UTF-8.

    BUG FIX: `s.statline` and `f.close` were uncalled in the original; the
    file is now opened with an explicit UTF-8 encoding (statline returns
    text) and closed via `with`."""
    with codecs.open(filename, "w", "utf-8") as f:
        if title:
            f.write(Stats.titleline(title))
        for s in freqstats:
            f.write(s.statline())


def dirfiles(dir, maxlen=None):
    """Return the non-hidden files directly inside `dir` (full paths).

    We do not visit subdirectories, so this is not a loop over os.walk.
    BUG FIX: `.next` was uncalled; replaced by the builtin next()."""
    root, dirs, files = next(os.walk(dir))
    if maxlen:
        # NOTE(review): this keeps maxlen+1 entries, not maxlen — looks like
        # an off-by-one, preserved as-is; confirm intent before changing.
        files = files[:maxlen + 1]
    return [os.path.join(dir, f) for f in files if not f.startswith('.')]


def seqname(seqlen, title="letter"):
    """Human-readable name for a sequence length, e.g. 'letterpairs'."""
    if seqlen == 1:
        return title + "s"
    elif seqlen == 2:
        return title + "pairs"
    elif seqlen == 3:
        return title + "triplets"
    return title + "seq" + str(seqlen)


if __name__ == '__main__':
    print("Reading and normalizing per-file statistics")
    maxseqlen = 3
    letteroccurrence = {}
    stats = {}
    for seqlen in range(1, maxseqlen + 1):
        stats[seqlen] = []
    for filename in dirfiles('en-words'):
        size = os.stat(filename).st_size
        print(filename, "(", size // 1000, "kB )")
        # count the longest sequences once, then derive the shorter
        # distributions from them
        letteroccurrence[maxseqlen], lastseq = letterfilestats(
            filename, seqlen=maxseqlen)
        for seqlen in range(maxseqlen, 1, -1):
            letteroccurrence[seqlen - 1], lastseq = seqreduce(
                letteroccurrence[seqlen], lastseq)
        for seqlen in letteroccurrence.keys():
            letteroccurrence[seqlen] = normalize(letteroccurrence[seqlen])
            stats[seqlen].append(letteroccurrence[seqlen])
        letterstats = letteroccurrence[1]
        print("   " + " ".join(charname(ch) for ch in sorted(letterstats)))
    print("Merging and sorting per-file statistics ...")
    for seqlen in stats.keys():
        stats[seqlen] = merge(stats[seqlen])
        # sort by descending mean, then descending average, then entry
        stats[seqlen] = sorted(stats[seqlen],
                               key=lambda s: (-s.mean, -s.average, s.entry))
    print("Writing statistics ...")
    for seqlen in stats.keys():
        # BUG FIX: `.capitalize` was passed uncalled in the original
        writefreqdist(stats[seqlen], "en-stats/%s.txt" % seqname(seqlen),
                      seqname(seqlen).capitalize())
#!/usr/bin/env python
# encoding: utf-8
 * 1) def wordletterfilestats(filename, seqlen=2):
 * 2)     words    = {}
 * 3)     doubles  = {}
 * 4)     src = codecs.open(filename, "rb", "utf-8")
 * 5)     CHUNKSIZE=1024
 * 6)     prevline = u" "  # we prepend the file with an additional space, to also count the first letter of the first word
 * 7)     for line in src:
 * 8)         word = line.rstrip
 * 9)         if word not in words:
 * 10)             words[word] = 0
 * 11)         words[word] += 1
 * 12)         line = prevline + line.replace(u"\n",u" ")
 * 13)         for i in xrange(len(line)-seqlen+1):
 * 14)             letterseq = line[i:i+seqlen]
 * 15)             if letterseq not in doubles:
 * 16)                 doubles[letterseq] = 0
 * 17)             doubles[letterseq] += 1
 * 18)             prevline = line[-seqlen+1:]
 * 19)     src.close
 * 20)     return words, doubles, prevline