import nltk, csv

def austats(corpusName):
    """Compute ranked relative word frequencies for a Gutenberg corpus text.

    The raw text is split on whitespace, each token is normalised with
    ``cleanWord``, and the occurrences of every resulting word are counted.

    Args:
        corpusName: File id passed to ``nltk.corpus.gutenberg.raw``.

    Returns:
        A list of ``(word, frequency, rank)`` tuples ordered from most to
        least frequent.  ``frequency`` is the word's count divided by the
        total number of counted words, and ``rank`` starts at 1.  Words with
        equal counts are ordered in descending lexicographic order.
    """
    from collections import Counter

    corpus = nltk.corpus.gutenberg.raw(corpusName)

    counts = Counter()
    # str.split() with no argument splits on any whitespace, newlines
    # included, so a separate per-line pass is unnecessary.
    for token in corpus.split():
        word = cleanWord(token)
        # Tokens made up solely of punctuation clean to '' -- they are not
        # words and must not take part in the ranking or the total.
        if word:
            counts[word] += 1

    # float() keeps the division below a true division on any interpreter.
    totalWords = float(sum(counts.values()))

    # Sorting (count, word) pairs in reverse puts the highest counts first.
    ranked = sorted(((cnt, wd) for wd, cnt in counts.items()), reverse=True)
    return [(wd, cnt / totalWords, rank)
            for rank, (cnt, wd) in enumerate(ranked, 1)]

def cleanWord(word):
    """Return *word* lower-cased with leading/trailing punctuation removed.

    Only characters at either end of the token are stripped; punctuation
    inside the word (for example the apostrophe in ``don't``) is kept.
    """
    edge_punctuation = "',.;:\"/\\[]{}!?#$%^&*()"
    lowered = word.lower()
    return lowered.strip(edge_punctuation)

def writeStats(fname, stats):
    """Write word statistics to a CSV file.

    A header row ``Word,Frequency,Rank`` is written first, followed by one
    row per entry of *stats*.

    Args:
        fname: Path of the file to create or overwrite.
        stats: Iterable of row sequences, e.g. the ``(word, frequency, rank)``
            tuples returned by ``austats``.
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # rows end in '\r\r\n' on Windows); the with-block guarantees the file
    # is closed even if writing a row raises.
    with open(fname, 'w', newline='') as f:
        wri = csv.writer(f)
        wri.writerow(['Word', 'Frequency', 'Rank'])
        wri.writerows(stats)

def otheraustats(words=None):
    """Rank the items of *words* by how often they occur.

    Unlike ``austats`` this reports raw counts rather than relative
    frequencies and performs no cleaning of the items.

    Args:
        words: Iterable of hashable, mutually comparable items (typically
            word strings).  When omitted, a module-level variable named
            ``words`` is used, which is what the function historically
            relied on.

    Returns:
        A list of ``(word, count, rank)`` tuples ordered from most to least
        frequent, with ``rank`` starting at 1.  Items with equal counts are
        ordered in descending order.

    Raises:
        NameError: If *words* is omitted and no module-level ``words`` exists.
    """
    from collections import Counter

    if words is None:
        # Backward compatibility: the body used to read a free variable
        # called `words`, so fall back to the module global of that name.
        module_scope = globals()
        if 'words' not in module_scope:
            raise NameError("name 'words' is not defined; "
                            "pass the tokens as the `words` argument")
        words = module_scope['words']

    counts = Counter(words)
    # Sorting (count, word) pairs in reverse puts the highest counts first.
    ranked = sorted(((cnt, wd) for wd, cnt in counts.items()), reverse=True)
    return [(wd, cnt, rank) for rank, (cnt, wd) in enumerate(ranked, 1)]
    
from pylab import *

def graphRankVsFreq(corpus):
    """Plot frequency against rank for the 100 top-ranked words of *corpus*.

    The statistics come from ``austats``; the figure is labelled, given a
    grid and then displayed with ``show()``.
    """
    top_entries = austats(corpus)[:100]
    x_ranks = [position for (_word, _share, position) in top_entries]
    y_freqs = [share for (_word, share, _position) in top_entries]
    plot(x_ranks, y_freqs, linewidth=1.0)

    xlabel('Rank')
    ylabel('Frequency')
    title('Rank vs Frequency')
    grid(True)
    show()

def barChart(corpus):
    """Draw a bar chart of the (up to) ten most frequent words of *corpus*.

    Bars show the frequencies reported by ``austats`` and the x-axis ticks
    are labelled with the corresponding words.  The chart is only drawn on
    the current figure; this function does not call ``show()`` itself.
    """
    stats = austats(corpus)[:10]
    words = [w for w, _f, _r in stats]
    freqs = [f for _w, f, _r in stats]

    # Size the x positions from the data rather than assuming ten entries,
    # so a corpus with fewer than ten distinct words still plots.
    positions = range(len(stats))
    bar(positions, freqs)
    xticks(positions, words)
