import sys,string,os,re
from types import *
class Tokenizer:
    """Rule-based English tokenizer.

    ``tokenize`` returns the input sentence as a single string in which
    tokens are separated by exactly one space.  Punctuation is split off
    into separate tokens, except inside known abbreviations, dotted
    upper-case acronyms and small dollar amounts, and contractions are
    either separated (``do n't``) or expanded (``do not``).

    The three class-level tables (``contractions_separated``,
    ``contractions_unwound`` and ``common_abbrev_and_acro``) drive the
    behaviour and are looked up with lower-cased text.
    """

    # Characters that are always split off as their own token.
    _PUNCTUATION = ('`', '^', '*', '=', '+', '|', '\\', '[', ']', '}', '{',
                    ',', '!', '?', '#', '&', '(', ')', '"', '>', '<', '~',
                    ';')
    # Characters that are split off unless the token is an abbreviation,
    # an acronym or a dollar amount.
    _SPECIAL_PUNCTUATION = ('.', '@', '/', '-', ':')
    # Dotted upper-case acronym such as "U.S.A."
    _ACRONYM_RE = re.compile(r'^([A-Z][.])+$')
    # Dollar amount such as "$12.50", optionally followed by a period.
    _MONEY_RE = re.compile(
        r'^[$][0-9]{1,3}[.][0-9][0-9](?P<period>[.]?)$')
    # A stand-alone lower-case "i" token.
    _LONE_I_RE = re.compile(r'(?<= )i(?= )')

    def __init__(self):
        pass

    def tokenize(self, sentence, expand_contractions_p=0):
        """Return ``sentence`` as a string of space-separated tokens.

        sentence -- the text to tokenize.
        expand_contractions_p -- when true, contractions are expanded
            using ``contractions_unwound`` ("don't" -> "do not"), "'ll"
            becomes "will" and a stand-alone "i" is capitalised;
            otherwise contractions are only split using
            ``contractions_separated`` ("don't" -> "do n't").

        The result has no leading or trailing whitespace and exactly
        one space between tokens.
        """
        for mark in self._PUNCTUATION:
            sentence = sentence.replace(mark, ' ' + mark + ' ')
        tokens = [self._split_special_punctuation(tok)
                  for tok in sentence.split()]

        # Pad with spaces so that the space-delimited rules below also
        # apply to the first and last token of the sentence.
        sentence = ' ' + ' '.join(tokens) + ' '

        if expand_contractions_p:
            contractions = self.contractions_unwound
        else:
            contractions = self.contractions_separated
        sentence = self._rewrite_contractions(sentence, contractions)

        # Split off clitics that are not covered by the tables
        # (e.g. the possessive in "John's").
        sentence = sentence.replace("'s ", " 's ")
        sentence = sentence.replace("'d ", " 'd ")
        if expand_contractions_p:
            sentence = sentence.replace("'ll ", " will ")
            # Lookarounds keep the delimiting spaces unconsumed, so
            # adjacent "i" tokens are all capitalised.
            sentence = self._LONE_I_RE.sub('I', sentence)
        else:
            sentence = sentence.replace("'ll ", " 'll ")

        # Drop the padding and collapse any doubled spaces left behind
        # by the replacements above.
        return ' '.join(sentence.split())

    def _split_special_punctuation(self, tok):
        """Return ``tok`` with special punctuation split off by spaces.

        Known abbreviations and dotted acronyms are returned unchanged;
        a dollar amount only has a trailing period split off.
        """
        if tok.lower() in self.common_abbrev_and_acro:
            return tok
        if self._ACRONYM_RE.search(tok) is not None:
            return tok
        money = self._MONEY_RE.search(tok)
        if money is not None:
            if money.group('period') == '.':
                return tok[:-1] + ' .'
            return tok
        for mark in self._SPECIAL_PUNCTUATION:
            tok = tok.replace(mark, ' ' + mark + ' ')
        return tok.strip()

    def _rewrite_contractions(self, sentence, contractions):
        """Replace every space-delimited key of ``contractions``.

        Matching ignores case.  The replacement's first letter is
        upper-cased when the matched word starts with an upper-case
        letter and lower-cased otherwise.  ``sentence`` must be padded
        with a space on both sides.
        """
        if not contractions:
            return sentence
        alternatives = '|'.join(re.escape(word) for word in contractions)
        # Lookarounds leave the delimiting spaces unconsumed so that two
        # adjacent contractions are both rewritten in a single pass.
        pattern = re.compile(
            '(?<= )(?P<word>' + alternatives + ')(?= )', re.IGNORECASE)

        def substitute(match):
            word = match.group('word')
            replacement = contractions.get(word.lower())
            if replacement is None:
                # Case-insensitive match whose lower-cased form is not
                # a key; leave the text untouched.
                return word
            if word[0].isupper():
                return replacement[:1].upper() + replacement[1:]
            return replacement[:1].lower() + replacement[1:]

        return pattern.sub(substitute, sentence)

    # Contraction -> the same words split into separate tokens.
    contractions_separated = {
        "ain't":"ai n't",
        "aren't":"are n't",
        "isn't":"is n't",
        "wasn't":"was n't",
        "weren't":"were n't",
        "didn't":"did n't",
        "doesn't":"does n't",
        "don't":"do n't",
        "hadn't":"had n't",
        "hasn't":"has n't",
        "haven't":"have n't",
        "can't":"ca n't",
        "couldn't":"could n't",
        "needn't":"need n't",
        "shouldn't":"should n't",
        "shan't":"sha n't",
        "won't":"wo n't",
        "wouldn't":"would n't",
        "i'm":"i 'm",
        "you're":"you 're",
        "he's":"he 's",
        "she's":"she 's",
        "it's":"it 's",
        "we're":"we 're",
        "they're":"they 're",
        "i've":"i 've",
        "you've":"you 've",
        "we've":"we 've",
        "they've":"they 've",
        "who've":"who 've",
        "what've":"what 've",
        "when've":"when 've",
        "where've":"where 've",
        "why've":"why 've",
        "how've":"how 've",
        "i'd":"i 'd",
        "you'd":"you 'd",
        "he'd":"he 'd",
        "she'd":"she 'd",
        "we'd":"we 'd",
        "they'd":"they 'd",
        "i'll":"i 'll",
        "you'll":"you 'll",
        "he'll":"he 'll",
        "she'll":"she 'll",
        "we'll":"we 'll",
        "they'll":"they 'll"
        }
    # Contraction or colloquialism -> fully expanded wording.
    contractions_unwound = {
        "ain't":"ai not", # copula
        "aren't":"are not",
        "isn't":"is not",
        "wasn't":"was not",
        "weren't":"were not",
        "didn't":"did not", # do
        "doesn't":"does not",
        "don't":"do not",
        "hadn't":"had not", # have
        "hafta":"have to",
        "hasn't":"has not",
        "haven't":"have not",
        "can't":"can not", # modal
        "couldn't":"could not",
        "needn't":"need not",
        "shouldn't":"should not",
        "shan't":"shall not",
        "won't":"will not",
        "wouldn't":"would not",
        "i'm":"I am", # to be
        "you're":"you are",
        "u're":"you are",
        "he's":"he is",
        "she's":"she is",
        "it's":"it is",
        "we're":"we are",
        "they're":"they are",
        "y'all're":"you all are",
        "i've":"I have", # to have
        "you've":"you have",
        "u've":"you have",
        "we've":"we have",
        "they've":"they have",
        "y'all've":"you all have",
        "who've":"who have",
        "what've":"what have",
        "when've":"when have",
        "where've":"where have",
        "why've":"why have",
        "how've":"how have",
        "i'd":"I would", # would
        "you'd":"you would",
        "u'd":"you would",
        "he'd":"he would",
        "she'd":"she would",
        "we'd":"we would",
        "they'd":"they would",
        "y'all'd":"you all would",
        "i'll":"I will",
        "you'll":"you will",
        "u'll":"you will",
        "he'll":"he will",
        "she'll":"she will",
        "we'll":"we will",
        "they'll":"they will",
        "y'all'll":"you all will",
        "'tis":"it is", # special cases
        "'twas":"it was",
        "'twere":"they were",
        "'twould":"it would",
        "y'all":"you all",
        "i'd've":"I would have",
        "aint":"am not",
        "aintcha":"are you not",
        "c'mon":"come on",
        "cannot":"can not",
        "dunno":"do not know",
        "gimme":"give me",
        "gonna":"going to",
        "gotta":"got to",
        "oughta":"ought to",
        "wanna":"want to",
        "ya":"you",
        "ur":"your"
        }
    # Lower-cased tokens whose periods must not be split off.
    common_abbrev_and_acro = [
        'mr.',
        'mrs.',
        'ms.',
        'sr.',
        'esq.',
        'jr.',
        'dr.',
        's.b.',
        'ph.d.',
        'm.d.',
        'm.eng.',
        'm.f.a.',
        'd.d.s.',
        'sc.d.',
        'b.s.',
        'b.sc.',
        'b.a.',
        'a.b.',
        'm.a.',
        'c.p.a.',
        'prof.',
        'capt.',
        'col.',
        'gen.',
        'sgt.',
        'lt.',
        'priv.',
        'ft.',
        'nav.',
        'a.f.',
        'u.s.a.f.',
        'a.f.b.',
        'i.e.',
        'etc.',
        'e.g.',
        'c.f.',
        'p.s.',
        'q.e.d.',
        'i.',
        'ii.',
        'iii.',
        'iv.',
        'v.',
        'vi.',
        'vii.',
        'viii.',
        'ix.',
        'x.',
        'a.m.',
        'p.m.',
        'morn.',
        'eve.',
        'corp.',
        'inc.',
        'co.',
        'ltd.',
        'reg.',
        'u.p.s.',
        'u.s.p.s.',
        'fedex.',
        'i.b.m.',
        'a.o.l.',
        'jan.',
        'feb.',
        'febr.',
        'mar.',
        'apr.',
        'may.',
        'jun.',
        'jul.',
        'aug.',
        'sep.',
        'sept.',
        'oct.',
        'nov.',
        'dec.',
        'ala.',
        'ariz.',
        'ark.',
        'calif.',
        'colo.',
        'conn.',
        'del.',
        'd.c.',
        'fla.',
        'ga.',
        'ill.',
        'ind.',
        'kans.',
        'ky.',
        'la.',
        'md.',
        'mass.',
        'mich.',
        'minn.',
        'miss.',
        'mo.',
        'nebr.',
        'nev.',
        'n.h.',
        'n.j.',
        'n.m.',
        'n.y.',
        'n.c.',
        'n.d.',
        'okla.',
        'ore.',
        'pa.',
        'p.r.',
        'r.i.',
        's.c.',
        's.d.',
        'tenn.',
        'tex.',
        'vt.',
        'va.',
        'v.i.',
        'wash.',
        'w.va.',
        'wis.',
        'wyo.',
        'v.c.r.',
        'v.h.s.',
        'd.v.d.',
        'v.c.d.',
        'c.d.',
        'tele.',
        'tv.',
        't.v.',
        'p.c.',
        'd.s.l.',
        'a.s.a.p.',
        'r.s.v.p.',
        'n.y.c.',
        'c.o.d.'
        ]