__author__  = "Hugo Liu <hugo@media.mit.edu>"
__version__ = "1.3"
import sys,string,time
import Tokenizer, LexiconEfficient, LexicalRuleParser, ContextualRuleParser, LexiconFast
class MontyTagger:
    """A Brill-style part-of-speech tagger: looks up candidate tags in a
    lexicon, guesses tags for unknown words, and refines them with lexical
    and contextual rules."""
    theTokenizer = None
    theLexicon = None
    theLRP = None
    theCRP = None
    trace_p = 0
    def __init__(self,trace_p=0):
        """Load the tokenizer, the lexicon (fast variant, falling back to the
        memory-efficient one), and the lexical and contextual rule parsers."""
        self.trace_p = trace_p
        self.theTokenizer = Tokenizer.Tokenizer()
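        # if LexiconFast signals a problem by appending to notify, fall back
        # to LexiconEfficient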
        notify = []
        self.theLexicon = LexiconFast.LexiconFast(notify)
        if len(notify) > 0:
            self.theLexicon = LexiconEfficient.LexiconEfficient()
        self.theLRP = LexicalRuleParser.LexicalRuleParser(self.theLexicon)
        self.theCRP = ContextualRuleParser.ContextualRuleParser()
    def tag(self,text,expand_contractions_p=0,all_pos_p=0):
        """Tokenize raw text (optionally expanding contractions) and return it
        tagged as 'word/POS word/POS ...'."""
        tokenized = self.theTokenizer.tokenize(text,expand_contractions_p)
        output = self.tag_tokenized(tokenized,all_pos_p)
        return output
    def tag_tokenized(self,text,all_pos_p=0):
        """Tag text that has already been whitespace-tokenized; with all_pos_p,
        list every candidate tag for each word."""
        text_arr = []
        toks = string.split(text)
        for word in toks:
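            # a token already written as word/TAG (tag in upper case) keeps
            # that a priori tag; otherwise ask the lexicon for candidates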
            if '/' in word and word[word.index('/'):].upper()==word[word.index('/'):]:
                slash = word.rindex('/')
                word,apriori_pos = word[:slash],word[slash+1:]
                all_pos = [apriori_pos]
            else:
                all_pos = self.theLexicon.all_pos(word)
            if all_pos == []:
                pos = 'UNK'
                all_pos.append('UNK')
            else:
                pos = all_pos[0]
            text_arr.append({'word':word,'pos':pos,'all_pos':all_pos})
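        # pad the sentence with STAART boundary tokens so the contextual
        # rules can safely look one word past either end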
        boundary_token = {'word':'S-T-A-R-T','pos':'STAART','all_pos':[]}
        text_arr.insert(0,boundary_token.copy())
        text_arr.append(boundary_token.copy())
        if self.trace_p:
            print "TRACE: [output after lexicon lookup]:\n  ",self.form_output(text_arr)
        for i in range(len(text_arr)):
            word_dict = text_arr[i]
            if word_dict['pos'] != 'UNK':
                continue
            if (word_dict['word'][0] in string.uppercase):
                text_arr[i]['pos'] = 'NNP'
            else:
                text_arr[i]['pos'] = 'NN'
            self.theLRP.apply_all_rules(text_arr,i)
            text_arr[i]['all_pos'] = ['UNK',text_arr[i]['pos']]
        if self.trace_p:
            print "TRACE: [output after lexical rules were applied]:\n  ",self.form_output(text_arr)
        self.theCRP.apply_rules_to_all_words_brill(text_arr)
        return self.form_output(text_arr,all_pos_p)
    def form_output(self,text_arr,all_pos_p=0):
        """Render text_arr (minus the boundary tokens) as 'word/POS' pairs;
        with all_pos_p, append the other candidate tags after the chosen one."""
        output = ''
        for word_dict in text_arr[1:-1]:
            word = word_dict['word']
            thepos = word_dict['pos']
            if all_pos_p:
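                # chosen tag first, then the remaining candidates from the lexicon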
                all_pos = word_dict['all_pos']
                filtered = []
                for p in all_pos:
                    if p!=thepos:
                        filtered.append(p)
                all_pos = filtered
                pos_arr = [thepos] + all_pos
                output += word + '/' + string.join(pos_arr,'/') + ' '
            else:
                output += word + '/' + thepos + ' '
        output = string.strip(output)
        return output
    def verify_and_repair(self,tagged):
        """Re-check text already tagged as 'word/POS ...' by rerunning the
        contextual rules over it, and return the repaired tagging."""
        text_arr = []
        toks = string.split(tagged)
        for word_pos in toks:
            word_pos_split = string.split(word_pos,'/')
            word = word_pos_split[0]
            pos = word_pos_split[1]
            all_pos = self.theLexicon.all_pos(word)
            if all_pos == []:
                all_pos.append('UNK')
            text_arr.append({'word':word,'pos':pos,'all_pos':all_pos})
        boundary_token = {'word':'S-T-A-R-T','pos':'STAART','all_pos':[]}
        text_arr.insert(0,boundary_token.copy())
        text_arr.append(boundary_token.copy())
        if self.trace_p:
            print "TRACE: [inputted as]:\n  ",self.form_output(text_arr)
        self.theCRP.apply_rules_to_all_words_brill(text_arr)
        return self.form_output(text_arr)
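# Minimal programmatic usage sketch (the example output below is illustrative
# only; the actual tags depend on the lexicon and rule files shipped with
# MontyTagger):
#
#   from MontyTagger import MontyTagger
#   m = MontyTagger()
#   print m.tag("The quick brown fox jumped over the lazy dog")
#   # e.g. The/DT quick/JJ brown/JJ fox/NN jumped/VBD over/IN the/DT lazy/JJ dog/NN
#   print m.tag("The quick brown fox", 0, 1)   # all_pos_p=1: show all candidate tags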
if __name__ == "__main__":
    if '/?' in sys.argv or '-?' in sys.argv:
        print """
        USAGE: >> python MontyTagger.py [-trace] [-allpos] [-repair]
        -trace   shows intermediary steps and debug messages
        -allpos  displays all plausible POS tags, ranked
        -repair  in repair mode, enter tagged text at the
                 prompt, monty will attempt to fix the tags
    """
        sys.exit(0)
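    # -noverbose: batch mode; read sentences from stdin and print tagged
    # output with no banner or prompt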
    if '-noverbose' in sys.argv:
        m = MontyTagger(0)
        while 1:
            sentence = sys.stdin.readline()
            if not sentence:
                break
            print '\n' + string.strip(m.tag(sentence))
            print '--\n\n'
        sys.exit(0)
    if '-trace' in sys.argv:
        trace_p = 1
    else:
        trace_p = 0
    if '-allpos' in sys.argv:
        all_pos_p = 1
    else:
        all_pos_p = 0
    if '-repair' in sys.argv:
        repair_p = 1
    else:
        repair_p = 0
    print '\n***** INITIALIZING ******'
    if trace_p: print 'TRACE is on!'
    if all_pos_p: print 'ALL POS is on!'
    if repair_p: print 'REPAIR MODE is on!'
    m = MontyTagger(trace_p)
    print '*************************\n'
    print 'MontyTagger v' + __version__
    print '--send bug reports to hugo@media.mit.edu--'
    print '\n'
    try:
        while 1:
            sentence = raw_input('> ')
            time1 = time.time()
            if repair_p:
                print '\nREPAIRED: ' + m.verify_and_repair(sentence)
            else:
                print '\n' + m.tag(sentence,0,all_pos_p)
            time2 = time.time()
            print "-- monty took",str(round(time2-time1,2)),'seconds. --\n'
    except KeyboardInterrupt:
        print "\n-- monty says goodbye! --"
        sys.exit(0)