import sys, re
from spark import GenericScanner
import error

# ___________________________________________________________________________
# Scanner
#
# We define some methods in the MultiScanner class itself, but most are
# defined outside from tables etc.  Note that the method names are
# sorted alphabetically, and those that come earlier in the alphabet
# have priority.  We have defined the tokens in that order as well,
# so those that appear earlier have higher or equal priority to those
# that come later.

class FileLoc(object):
    """A source position: a file name plus a 1-based line number."""

    def __init__(self, file_name, line_number):
        self.file_name = file_name
        self.line_number = line_number

    def __repr__(self):
        return "%s:%d" % (self.file_name, self.line_number)

    def with_incd_line(self, amount):
        """Return a location `amount` lines further down.

        Returns self unchanged when amount is 0, avoiding an allocation.
        """
        if amount:
            return FileLoc(self.file_name, self.line_number+amount)
        return self

    def __eq__(self, loc):
        assert isinstance(loc, FileLoc)
        return (self.file_name == loc.file_name
                and self.line_number == loc.line_number)

    def __ne__(self, loc):
        # Python 2 does not derive __ne__ from __eq__; define it so that
        # != always agrees with ==.
        return not self.__eq__(loc)

    def __hash__(self):
        # Keep hashing consistent with __eq__ (the default identity hash
        # would make equal locations hash differently in sets/dicts).
        return hash((self.file_name, self.line_number))

    def __cmp__(self, loc):
        "Locations within the same file are ordered by line number."
        assert isinstance(loc, FileLoc)
        if self.file_name == loc.file_name:
            return cmp(self.line_number, loc.line_number)
        return 0
    
# Sentinel location used when no real source position is available.
DUMMY_LOC = FileLoc("Unknown", 0)

class Token(object):
    """A lexical token: a type tag, an optional value, and its source location."""

    def __init__(self, type, val, loc):
        self.type = type
        self.val = val
        self.loc = loc

    def __str__(self):
        # Value-less tokens print as just their (quoted) type.
        if self.val is None:
            return repr(self.type)
        return "%s %s" % (self.type, repr(self.val))

    def __repr__(self):
        if self.val is None:
            return "T(%r)@%r" % (self.type, self.loc)
        return "T(%r,%r)@%r" % (self.type, self.val, self.loc)

    def is_whitespace(self):
        "True if this token represents whitespace"
        return self.type in ("nl", "indent", "dedent")

class MultiScanner(GenericScanner):
    # NOTE: spark's GenericScanner uses each t_* method's docstring as the
    # regular expression for that token, so the docstrings below are part
    # of the program's behavior — do not edit them as documentation.
    # Method-name alphabetical order determines match priority (see the
    # file-header comment), hence the t_a_/t_b_/... prefixes.

    def tokenize(self, filenm, input):
        # Scan the text `input` of file `filenm`; returns the list of Tokens.
        self.rv = []                   # accumulated Token objects
        self.indents = [0]             # stack of active indentation widths
        self.loc = FileLoc(filenm, 1)  # current source location (1-based)
        GenericScanner.tokenize(self, input)
        return self.rv

    def token(self, type, val):
        # Emit a token of the given type/value at the current location.
        self.rv.append(Token(type, val, self.loc))
        
    def t_a_comment(self, s):
        r'\#[^\n]*'
        # Comments are discarded: no token is emitted.
        pass

    def t_a_whitespace(self, s):
        r'\s+'

        # Tabs are rejected outright to keep indentation unambiguous.
        if '\t' in s:
            raise error.NoTabsError(self.loc)

        # track line number:
        nl_count = s.count('\n')
        self.loc = self.loc.with_incd_line(nl_count)

        # if there are any newlines, generate a NL character
        if nl_count:
            # only generate newlines if there have been tokens since
            # the last one
            if self.rv and not self.rv[-1].is_whitespace():
                self.token("nl", None)

            # whitespace after a newline might generate an indent character
            # (only the run after the LAST newline counts as indentation).
            _, _, after = s.rpartition('\n')
            indent = len(after)
            if indent > self.indents[-1]:
                self.token("indent", None)
                self.indents.append(indent)
            else:
                # Emit one dedent per closed indentation level; afterwards
                # the width must exactly match an enclosing level.
                while indent < self.indents[-1]:
                    self.token("dedent", None)
                    self.indents.pop()
                if indent != self.indents[-1]:
                    raise error.InvalidIndentationError(self.loc)

# "Key chars": characters that cannot be part of an operator
keychars = r'{}[]().|,'
esc_keychars = re.escape(keychars)

def _make_keychar_method(ch, position):
    # Build a scanner method for one key char; its docstring (the
    # backslash-escaped character) is the spark regex it matches, and the
    # emitted token's type is the matched character itself.
    def mthd(self, s):
        self.token(s, None)
    mthd.__doc__ = "\\" + ch
    mthd.__name__ = "t_a_keychar_%d" % (position,)
    return mthd

for idx, key in enumerate(keychars):
    m = _make_keychar_method(key, idx)
    setattr(MultiScanner, m.__name__, m)

# Operators: Any sequence of non-whitespace, non-alpha-numeric
# characters excluding the "key chars" listed above and '"'.  Some of
# them are specially recognized because they have a distinguished
# place in the grammar.  Note that, in expressions, the nonterminal
# AnyOp includes some of these operators.
keyops = [ "=", "/", "^", ":", ":=" ]
def t_b_op(self, s):
    # Ordinary operators become a generic "op" token carrying their text;
    # the distinguished ones become their own token type with no value.
    if s not in keyops:
        return self.token("op", s)
    return self.token(s, None)
t_b_op.__doc__ = r'[^\s\w%s"]+' % (esc_keychars)
MultiScanner.t_b_op = t_b_op

# Literals.  Each (regex, token-name) pair becomes a scanner method whose
# docstring is the regex (spark's convention).  Note the t_c_ method names
# are what determine priority: "float" sorts before "pos_int", so the
# float pattern wins over the integer prefix it shares.
token_table = [
    (r'\d[\d_]*', 'pos_int'),
    (r'\d[\d_]*\.[\d_]*', 'float'),
    (r'"[^"]*"', 'string'),
    ]
# NOTE: the loop variable was previously named `re`, shadowing the `re`
# module imported at the top of the file and leaving `re` bound to a plain
# string after the loop — a latent trap for any later module-level use.
for pattern, nm in token_table:
    def make_mthd(nm):
        # Bind nm per-iteration so each method emits its own token type
        # (avoids the classic late-binding-closure bug).
        def mthd(self, s):
            self.token(nm, s)
        return mthd
    mthd = make_mthd(nm)
    mthd.__doc__ = pattern
    mthd.__name__ = "t_c_%s" % nm
    setattr(MultiScanner, mthd.__name__, mthd)

# Identifiers: Alphanumeric, underscores, and an optional ? or ! terminator.
keywords = ["import", "def", "unit", "var", "data",
            "return", "if", "else", "while"]
def t_z_id(self, s):
    # Non-keywords are generic identifiers; keywords keep their own token
    # type, carrying the keyword text as the value.
    if s not in keywords:
        return self.token("id", s)
    return self.token(s, s)
t_z_id.__doc__ = r'[^%s\s0-9#][\w?!]*' % (esc_keychars)
MultiScanner.t_z_id = t_z_id

def tokenize(filenm, input=None):
    """Tokenize `input`, reading it from the file `filenm` when not supplied.

    Returns the list of Tokens produced by MultiScanner.
    """
    if not input:
        # Use `with` so the file handle is closed promptly even if the
        # read fails (the previous open(...).read() leaked the handle).
        with open(filenm) as f:
            input = f.read()
    return MultiScanner().tokenize(filenm, input)

if __name__ == "__main__":
    # Command-line driver: dump the token stream of each file named on
    # the command line, one token repr per line.
    for filenm in sys.argv[1:]:
        for tok in tokenize(filenm):
            sys.stdout.write(repr(tok) + "\n")

