#########################################################
##  CS 2500 (Fall 2011), Assignment #7 (mod)           ##
##   Script File Name: PyText3.py                      ##
##       Student Name: Todd Wareham                    ##
##         Login Name: harold                          ##
##              MUN #: 8008765                         ##
#########################################################
"""
Given the name of a textfile of commands (one per line) for a text
 comparison system, execute these commands and print results
 or error messages to standard output as appropriate.
 
For a full description of this system, please see the on-line description
 in Assignment #7.

This script is a variant of PyText2.py (answer to Assignment #7) which places
 the interpretation-loop from the main program in a function interpret(). 
 This is done to allow profiling of the runtimes of the various functions in
 this script via script profile_stats5.py.
"""

import sys
import os.path


def createCharSet(T):
    """Return the sorted list of distinct non-whitespace characters in T.

    T is split on whitespace, so spaces, tabs and newlines never appear
    in the result.
    """
    distinct = set()
    for token in T.split():
        distinct.update(token)
    ordered = list(distinct)
    ordered.sort()
    return ordered


def createWordSet(T):
    """Return the sorted list of distinct whitespace-separated words in T."""
    uniqueWords = list(set(T.split()))
    uniqueWords.sort()
    return uniqueWords


def createDigramMatrix(T):
    """Return the character-digram count matrix of text T.

    Rows and columns are indexed by the sorted distinct non-whitespace
    characters of T (the same ordering createCharSet() produces).  Entry
    [i][j] is the number of times character i is immediately followed by
    character j inside a word; pairs spanning a word boundary are not
    counted.  An empty or all-whitespace text yields an empty list.
    """
    chars = sorted(set("".join(T.split())))

    # Map each character to its row/column once instead of scanning the
    # sorted list with chars.index() for every digram.
    position = {}
    for index, char in enumerate(chars):
        position[char] = index

    size = len(chars)
    DM = [[0] * size for _ in range(size)]

    for word in T.split():
        # zip(word, word[1:]) yields each adjacent character pair in word.
        for first, second in zip(word, word[1:]):
            DM[position[first]][position[second]] += 1
    return DM


def printCharSet(text, textName):
    """Print the stored character set of text textName, one per line.

    text is the dictionary of stored texts; the characters are read from
    text[textName]["chars"].  Output goes to standard output under a
    "  Characters:" heading.
    """
    # Single-argument print(...) is valid both as a Python 2 print
    # statement and as a Python 3 function call, with identical output.
    print("  Characters:")
    for char in text[textName]["chars"]:
        print(char)
    return


def printWordSet(text, textName):
    """Print the stored word set of text textName, one word per line.

    text is the dictionary of stored texts; the words are read from
    text[textName]["words"].  Output goes to standard output under a
    "  Words:" heading.
    """
    # Single-argument print(...) is valid both as a Python 2 print
    # statement and as a Python 3 function call, with identical output.
    print("  Words:")
    for word in text[textName]["words"]:
        print(word)
    return


def printDigramMatrix(text, textName):
    """Print the stored digram matrix of text textName as a labelled table.

    The column and row labels are text[textName]["chars"]; the counts are
    read from text[textName]["2gmM"], indexed [row][column] in the same
    character order.  Each label and each count is printed right-aligned
    in a field of width 3, with fields separated by a single space.
    """
    chars = text[textName]["chars"]
    matrix = text[textName]["2gmM"]

    print("  2-gram Matrix:")

    # Each table line is assembled as a whole and printed once; this gives
    # the same bytes as a series of trailing-comma print statements while
    # staying valid on both Python 2 and Python 3.
    header = ["       "] + ["%3s" % char for char in chars]
    print(" ".join(header))

    for i in range(len(chars)):
        cells = ["    %3s" % chars[i]]
        for j in range(len(chars)):
            cells.append("%3d" % matrix[i][j])
        print(" ".join(cells))
    return


def computeNgramVector(text, textName, n):
    """Return the relative-frequency vector of character n-grams in a text.

    The text is read from text[textName]["text"] and split on whitespace;
    every length-n substring of every word is counted (words shorter than
    n contribute nothing).  The result maps each n-gram to its count
    divided by the total number of n-grams counted.  If no n-gram is
    found, an empty dictionary is returned.
    """
    counts = {}
    total = 0
    for word in text[textName]["text"].split():
        # The last valid start index is len(word) - n; range() is empty
        # when the word is shorter than n.
        for start in range(len(word) - n + 1):
            ngram = word[start:start + n]
            counts[ngram] = counts.get(ngram, 0) + 1
            total += 1

    # float() forces true division on Python 2 as well.  When total is 0
    # counts is empty, so no division is ever performed.
    scale = float(total)
    ngV = {}
    for ngram in counts:
        ngV[ngram] = counts[ngram] / scale
    return ngV


def computeNgramSim(ngV1, ngV2):
    """Return the similarity of two n-gram frequency vectors.

    The distance is the sum of absolute differences over n-grams present
    in both vectors plus the raw frequencies of n-grams present in only
    one of them; the result is 1.0 minus half of that distance.
    """
    keys1 = set(ngV1.keys())
    keys2 = set(ngV2.keys())
    shared = keys1 & keys2

    distance = 0.0

    # N-grams occurring in both vectors contribute their frequency gap.
    for ngram in shared:
        distance += abs(ngV1[ngram] - ngV2[ngram])

    # N-grams occurring in only one vector contribute their frequency.
    for vector, exclusive in ((ngV1, keys1 - shared), (ngV2, keys2 - shared)):
        for ngram in exclusive:
            distance += vector[ngram]

    return 1.0 - (distance / 2.0)


def _printHelp():
    """Print the list of supported commands, framed by blank lines."""
    print("")
    print("  Commands:")
    print("    help")
    print("    list texts")
    print("    load <file> as <text-name>")
    print("    describe <text-name> as {text, chars, words, digrams}")
    print("    compare <text-name> with {all, <text-name-list>} by {words, ngram=x}")
    print("    exit")
    print("")


def _listTexts(text):
    """Print the names of all stored texts in sorted order."""
    if not text:
        print("  No available texts")
        return
    print("  Available texts:")
    for name in sorted(text):
        print("     " + name)


def _loadText(text, fileName, textName):
    """Read fileName and store it, with its derived data, as textName."""
    if not os.path.exists(fileName):
        print("  Error: Specified file does not exist")
        return

    with open(fileName, "r") as tf:
        contents = tf.read()

    text[textName] = {
        "text": contents,
        "chars": createCharSet(contents),
        "words": createWordSet(contents),
        "2gmM": createDigramMatrix(contents),
    }


def _describeText(text, textName, descType):
    """Print the descType view (text, chars, words, digrams) of a text."""
    if textName not in text:
        print("Error: Specified text not stored")
    elif descType == "text":
        body = text[textName]["text"]
        sys.stdout.write(body)
        # Make sure the next prompt starts on its own line even when the
        # stored text does not end with a newline.
        if body and not body.endswith("\n"):
            sys.stdout.write("\n")
    elif descType == "chars":
        printCharSet(text, textName)
    elif descType == "words":
        printWordSet(text, textName)
    elif descType == "digrams":
        printDigramMatrix(text, textName)
    else:
        print("  Error: Invalid description-type")


def _wordSetSimilarity(baseWords, otherWords):
    """Return 1 - |symmetric difference| / (|base| + |other|)."""
    combined = len(baseWords) + len(otherWords)
    if combined == 0:
        # Two texts without any words are identical; avoid dividing by 0.
        return 1.0
    return 1.0 - (len(baseWords ^ otherWords) * 1.0) / combined


def _reportSimilarities(baseTextName, comparisonTextNames, scoreOf, indent):
    """Print one similarity line per compared text and the best match.

    scoreOf maps a text name to its similarity with the base text.  The
    "most similar" summary is printed only when more than one text was
    compared; ties go to the text that was compared first.
    """
    bestScore = -1.0
    bestName = ""
    for name in comparisonTextNames:
        score = scoreOf(name)
        print("%s>>> Sim(\"%s\",\"%s\") = %5.3f" % (indent, baseTextName, name, score))
        if score > bestScore:
            bestScore = score
            bestName = name

    if len(comparisonTextNames) > 1:
        print("%sText \"%s\" is most similar to text \"%s\"" %
              (indent, bestName, baseTextName))


def _compareTexts(text, command):
    """Execute an already-recognised "compare ... with ... by ..." command."""
    baseTextName = command[1]
    comparisonType = command[-1]

    if len(command) == 6 and command[3] == "all":
        # Sorted so the output order does not depend on set hashing.
        comparisonTextNames = sorted(set(text) - set([baseTextName]))
    else:
        comparisonTextNames = command[3:-2]

    for name in comparisonTextNames + [baseTextName]:
        if name not in text:
            print("  Error: Specified text(s) not stored")
            return

    if comparisonType == "words":
        baseWords = set(text[baseTextName]["words"])
        _reportSimilarities(
            baseTextName, comparisonTextNames,
            lambda name: _wordSetSimilarity(baseWords, set(text[name]["words"])),
            "  ")

    elif len(comparisonType) >= 7 and comparisonType[0:6] == "ngram=":
        try:
            n = int(comparisonType.split("=")[1])
        except ValueError:
            n = 0
        if n < 1:
            # Non-numeric or non-positive n-gram sizes are rejected instead
            # of crashing or producing meaningless vectors.
            print("Error: Invalid comparison-type")
            return

        baseVector = computeNgramVector(text, baseTextName, n)
        _reportSimilarities(
            baseTextName, comparisonTextNames,
            lambda name: computeNgramSim(
                baseVector, computeNgramVector(text, name, n)),
            "")

    else:
        print("Error: Invalid comparison-type")


def interpret(commandFile):
    """Execute the commands in the file commandFile, one per line.

    Each line is echoed after a ">>> " prompt and then executed; results
    and error messages are printed to standard output.  Processing stops
    at the first "exit" command or at the end of the file, whichever
    comes first.  Stored texts live in a dictionary local to this call.

    Supported commands: help, list texts, load <file> as <name>,
    describe <name> as {text, chars, words, digrams},
    compare <name> with {all, <names>} by {words, ngram=x}, exit.
    Any other line (including a blank one) is reported as invalid.
    """
    print("Welcome to PyText1 v2.0")

    text = {}

    # The with-block closes the command file even if a command raises;
    # iterating the file ends the loop at EOF when there is no "exit" line.
    with open(commandFile, "r") as cf:
        for line in cf:
            print(">>> " + line.rstrip())

            command = line.split()
            size = len(command)

            if size == 1 and command[0] == "exit":
                break

            elif size == 1 and command[0] == "help":
                _printHelp()

            elif size == 2 and command[0] == "list" and command[1] == "texts":
                _listTexts(text)

            elif size == 4 and command[0] == "load" and command[2] == "as":
                _loadText(text, command[1], command[3])

            elif size == 4 and command[0] == "describe" and command[2] == "as":
                _describeText(text, command[1], command[3])

            elif (size >= 6 and command[0] == "compare" and
                  command[2] == "with" and command[-2] == "by"):
                _compareTexts(text, command)

            else:
                print("  Error: Invalid command")

    return



if __name__ == "__main__":

    # Exactly one argument is expected: the path of the command file.
    if len(sys.argv) != 2:
        # Single-argument print(...) is valid on both Python 2 and
        # Python 3; the spacing reproduces the original usage line.
        print("usage:  %s  {commandfile}" % sys.argv[0])
        sys.exit(1)

    interpret(sys.argv[1])
