import re

import sys



# Reason the last stmt is continued onto the next line (C_NONE when
# it is not continued at all).
C_NONE = 0                  # not a continuation
C_BACKSLASH = 1             # text ends with a backslash-newline
C_STRING_FIRST_LINE = 2     # in an unterminated string, on its first line
C_STRING_NEXT_LINES = 3     # in an unterminated string, past its first line
C_BRACKET = 4               # inside an unclosed bracket structure



if False:   # flip to True for throwaway debugging output
    def dump(*stuff):
        # Write the str() of every argument, space-separated and
        # newline-terminated, to the interpreter's original stdout.
        pieces = [str(item) for item in stuff]
        sys.__stdout__.write(" ".join(pieces) + "\n")



# Find what looks like the start of a popular stmt: at the start of a
# line (re.MULTILINE), optional blanks/tabs followed by one of the
# keywords listed below as a whole word.  _synchre is the bound
# .search method of the compiled pattern, so it is called as
# _synchre(string[, pos[, endpos]]) and returns a match object or None;
# m.start() is the start of the line's leading whitespace.

_synchre = re.compile(r"""

    ^

    [ \t]*

    (?: while

    |   else

    |   def

    |   return

    |   assert

    |   break

    |   class

    |   continue

    |   elif

    |   try

    |   except

    |   raise

    |   import

    |   yield

    )

    \b

""", re.VERBOSE | re.MULTILINE).search



# Match blank line or non-indenting comment line: optional blanks/tabs,
# then optionally a "#" immediately followed by a non-whitespace char
# and the rest of the line, then the newline.  _junkre is the bound
# .match method, so it is anchored at the position it is given.

_junkre = re.compile(r"""

    [ \t]*

    (?: \# \S .* )?

    \n

""", re.VERBOSE).match



# Match any flavor of string; the terminating quote is optional
# so that we're robust in the face of incomplete program text.
#
# The four alternatives are tried in this order: triple-double-quoted,
# double-quoted, triple-single-quoted, single-quoted.  The one-quote
# forms stop at a bare newline; because of re.DOTALL, a backslash
# escapes any following char, newline included.  _match_stringre is
# the bound .match method of the compiled pattern.

_match_stringre = re.compile(r"""

    \""" [^"\\]* (?:

                     (?: \\. | "(?!"") )

                     [^"\\]*

                 )*

    (?: \""" )?



|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?



|   ''' [^'\\]* (?:

                   (?: \\. | '(?!'') )

                   [^'\\]*

                )*

    (?: ''' )?



|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?

""", re.VERBOSE | re.DOTALL).match



# Match a line that starts with something interesting;
# used to find the first item of a bracket structure.
# "Interesting" means the first char after optional blanks/tabs is
# neither whitespace, "#", nor a backslash.  _itemre is the bound
# .match method of the compiled pattern.

_itemre = re.compile(r"""

    [ \t]*

    [^\s#\\]    # if we match, m.end()-1 is the interesting char

""", re.VERBOSE).match



# Match start of stmts that should be followed by a dedent: optional
# whitespace, then return, break, continue, raise or pass as a whole
# word.  _closere is the bound .match method of the compiled pattern.

_closere = re.compile(r"""

    \s*

    (?: return

    |   break

    |   continue

    |   raise

    |   pass

    )

    \b

""", re.VERBOSE).match



# Chew up non-special chars as quickly as possible.  If match is
# successful, m.end() less 1 is the index of the last boring char
# matched.  If match is unsuccessful, the string starts with an
# interesting char.
#
# The special (interesting) chars are the brackets ()[]{}, the quotes
# ' and ", the comment char # and the backslash.  _chew_ordinaryre is
# the bound .match method of the compiled pattern.

_chew_ordinaryre = re.compile(r"""

    [^[\](){}#'"\\]+

""", re.VERBOSE).match



# Build the 256-entry translation table used to squash program text:
# open brackets map to "(", close brackets map to ")", the quote,
# backslash, newline and "#" chars map to themselves, and every other
# char maps to "x" (uninteresting).
#
# _tran temporarily holds the {special char: replacement} dict and is
# then rebound to the final 256-char string, so no helper name is left
# behind in the module namespace.
_tran = {}
_tran.update(dict.fromkeys("({[", '('))
_tran.update(dict.fromkeys(")}]", ')'))
_tran.update(zip("\"'\\\n#", "\"'\\\n#"))
_tran = ''.join(map(lambda code, classes=_tran: classes.get(chr(code), 'x'),
                    range(256)))



# UnicodeType is the type of unicode strings on interpreters that have
# a `unicode` builtin, and None where that name does not exist.
try:
    unicode
except NameError:
    UnicodeType = None
else:
    UnicodeType = type(unicode(""))



class Parser:



    def __init__(self, indentwidth, tabwidth):

        self.indentwidth = indentwidth

        self.tabwidth = tabwidth



    def set_str(self, str):
        """Install the program text to analyze and reset the study level.

        str must be empty or end with a newline.  If it is a unicode
        string (its type is UnicodeType), it is first converted to a
        plain string in which every character with ordinal >= 127 is
        replaced by "x".
        """
        assert len(str) == 0 or str[-1] == '\n'
        if type(str) is UnicodeType:
            # The parse functions have no idea what to do with Unicode,
            # so every non-ASCII character becomes "x".  That is "safe"
            # as long as the only characters germane to parsing the
            # structure of Python are 7-bit ASCII, and it is
            # *necessary* because Unicode strings don't have a
            # .translate() method that supports deletechars.
            squashed = []
            for codepoint in map(ord, str):
                if codepoint < 127:
                    squashed.append(chr(codepoint))
                else:
                    squashed.append("x")
            str = "".join(squashed)
        self.str = str
        self.study_level = 0



    # Return index of a good place to begin parsing, as close to the

    # end of the string as possible.  This will be the start of some

    # popular stmt like "while" or "def".  Return None if none found:

    # the caller should pass more prior context then, if possible, or

    # if not (the entire program text up until the point of interest

    # has already been tried) pass 0 to set_lo.

    #

    # This will be reliable iff given a reliable is_char_in_string

    # function, meaning that when it says "no", it's absolutely

    # guaranteed that the char is not in a string.



    def find_good_parse_start(self, is_char_in_string=None,

                              _synchre=_synchre):

        str, pos = self.str, None



        if not is_char_in_string:

            # no clue -- make the caller pass everything

            return None



        # Peek back from the end for a good place to start,

        # but don't try too often; pos will be left None, or

        # bumped to a legitimate synch point.

        limit = len(str)

        for tries in range(5):

            i = str.rfind(":\n", 0, limit)

            if i < 0:

                break

            i = str.rfind('\n', 0, i) + 1  # start of colon line

            m = _synchre(str, i, limit)

            if m and not is_char_in_string(m.start()):

                pos = m.start()

                break

            limit = i

        if pos is None:

            # Nothing looks like a block-opener, or stuff does

            # but is_char_in_string keeps returning true; most likely

            # we're in or near a giant string, the colorizer hasn't

            # caught up enough to be helpful, or there simply *aren't*

            # any interesting stmts.  In any of these cases we're

            # going to have to parse the whole thing to be sure, so

            # give it one last try from the start, but stop wasting

            # time here regardless of the outcome.

            m = _synchre(str)

            if m and not is_char_in_string(m.start()):

                pos = m.start()

            return pos



        # Peeking back worked; look forward until _synchre no longer

        # matches.

        i = pos + 1

        while 1:

            m = _synchre(str, i)

            if m:

                s, i = m.span()

                if not is_char_in_string(s):

                    pos = s

            else:

                break

        return pos



    # Throw away the start of the string.  Intended to be called with

    # find_good_parse_start's result.



    def set_lo(self, lo):

        assert lo == 0 or self.str[lo-1] == '\n'

        if lo > 0:

            self.str = self.str[lo:]



    # As quickly as humanly possible <wink>, find the line numbers (0-

    # based) of the non-continuation lines.

    # Creates self.{goodlines, continuation}.



    def _study1(self):
        """Find the non-continuation lines and the continuation status.

        Does nothing if the study level is already >= 1.  Otherwise
        sets self.goodlines (0-based line numbers of the lines that
        start a stmt, plus the final line number as a sentinel) and
        self.continuation (one of the C_* codes).
        """
        if self.study_level >= 1:
            return
        self.study_level = 1

        # Map all uninteresting characters to "x", all open brackets
        # to "(", all close brackets to ")", then collapse runs of
        # uninteresting characters.  This can cut the number of chars
        # by a factor of 10-40, and so greatly speed the following loop.
        text = self.str.translate(_tran)
        for run in ('xxxxxxxx', 'xxxx', 'xx', 'xx'):
            text = text.replace(run, 'x')
        text = text.replace('\nx', '\n')
        # note that replacing x\n with \n would be incorrect, because
        # x may be preceded by a backslash

        # March over the squashed version of the program, accumulating
        # the line numbers of non-continued stmts, and determining
        # whether & why the last stmt is a continuation.
        continuation = C_NONE
        depth = 0       # bracket nesting level
        lineno = 0      # current 0-based line number
        self.goodlines = goodlines = [0]
        pos, end = 0, len(text)
        while pos < end:
            ch = text[pos]
            pos += 1

            # cases are checked in decreasing order of frequency
            if ch == 'x':
                pass

            elif ch == '\n':
                lineno += 1
                if depth == 0:
                    goodlines.append(lineno)
                # else we're in an unclosed bracket structure

            elif ch == '(':
                depth += 1

            elif ch == ')':
                if depth:
                    depth -= 1
                # else the program is invalid, but we can't complain

            elif ch in ('"', "'"):
                # consume the string
                quote = ch
                if text[pos-1:pos+2] == quote * 3:
                    quote = quote * 3
                firstlineno = lineno
                extra = len(quote) - 1  # quote chars beyond the first
                pos += extra
                while pos < end:
                    ch = text[pos]
                    pos += 1

                    if ch == 'x':
                        continue

                    if text[pos-1:pos+extra] == quote:
                        # closing quote: the string is complete
                        pos += extra
                        break

                    if ch == '\n':
                        lineno += 1
                        if extra == 0:
                            # unterminated single-quoted string
                            if depth == 0:
                                goodlines.append(lineno)
                            break
                    elif ch == '\\':
                        assert pos < end
                        if text[pos] == '\n':
                            lineno += 1
                        pos += 1
                    # else comment char or paren inside string

                else:
                    # didn't break out of the loop, so we're still
                    # inside a string
                    if (lineno - 1) == firstlineno:
                        # before the previous \n in text, we were in
                        # the first line of the string
                        continuation = C_STRING_FIRST_LINE
                    else:
                        continuation = C_STRING_NEXT_LINES

            elif ch == '#':
                # consume the comment, stopping on its newline
                pos = text.find('\n', pos)
                assert pos >= 0

            else:
                assert ch == '\\'
                assert pos < end
                if text[pos] == '\n':
                    lineno += 1
                    if pos + 1 == end:
                        continuation = C_BACKSLASH
                pos += 1

        # The last stmt may be continued for all 3 reasons.
        # String continuation takes precedence over bracket
        # continuation, which beats backslash continuation.
        if (continuation not in (C_STRING_FIRST_LINE, C_STRING_NEXT_LINES)
                and depth > 0):
            continuation = C_BRACKET
        self.continuation = continuation

        # Push the final line number as a sentinel value, regardless of
        # whether it's continued.
        assert (continuation == C_NONE) == (goodlines[-1] == lineno)
        if goodlines[-1] != lineno:
            goodlines.append(lineno)



    def get_continuation_type(self):

        """Return the C_* code saying whether and why the last stmt is continued."""

        # _study1 is a no-op when the text has already been studied.

        self._study1()

        return self.continuation



    # study1 was sufficient to determine the continuation status,

    # but doing more requires looking at every character.  study2

    # does this for the last interesting statement in the block.

    # Creates:

    #     self.stmt_start, stmt_end

    #         slice indices of last interesting stmt

    #     self.stmt_bracketing

    #         the bracketing structure of the last interesting stmt;

    #         for example, for the statement "say(boo) or die", stmt_bracketing

    #         will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are

    #         treated as brackets, for that matter.

    #     self.lastch

    #         last non-whitespace character before optional trailing

    #         comment

    #     self.lastopenbracketpos

    #         if continuation is C_BRACKET, index of last open bracket



    def _study2(self):
        """Analyze the last interesting stmt, character by character.

        Does nothing if the study level is already >= 2; otherwise runs
        _study1 first and then sets:

        self.stmt_start, self.stmt_end
            slice indices in self.str of the last interesting stmt
            (the last stmt that is not a blank line or a non-indenting
            comment line); both are 0 if there is nothing but junk.
        self.stmt_bracketing
            tuple of (index, nesting level) pairs describing the
            bracket structure of that stmt; strings and comments are
            recorded as one extra nesting level.
        self.lastch
            last non-whitespace character before the optional trailing
            comment ("" if there is none).
        self.lastopenbracketpos
            index of the last bracket still open at the end of the
            stmt, or None when no bracket is open.
        """
        if self.study_level >= 2:
            return
        self._study1()
        self.study_level = 2

        # Set p and q to slice indices of last interesting stmt.
        text, goodlines = self.str, self.goodlines
        i = len(goodlines) - 1
        p = len(text)    # index of newest line
        while i:
            assert p
            # p is the index of the stmt at line number goodlines[i].
            # Move p back to the stmt at line number goodlines[i-1].
            q = p
            for nothing in range(goodlines[i-1], goodlines[i]):
                # tricky: sets p to 0 if no preceding newline
                p = text.rfind('\n', 0, p-1) + 1
            # The stmt text[p:q] isn't a continuation, but may be blank
            # or a non-indenting comment line.
            if _junkre(text, p):
                i = i-1
            else:
                break
        if i == 0:
            # nothing but junk!
            assert p == 0
            q = p
        self.stmt_start, self.stmt_end = p, q

        # Analyze this stmt, to find the last open bracket (if any)
        # and last interesting character (if any).
        lastch = ""
        stack = []  # stack of open bracket indices
        push_stack = stack.append
        bracketing = [(p, 0)]
        while p < q:
            # suck up all except ()[]{}'"#\\
            m = _chew_ordinaryre(text, p, q)
            if m:
                # we skipped at least one boring char
                newp = m.end()
                # back up over totally boring whitespace
                i = newp - 1    # index of last boring char
                while i >= p and text[i] in " \t\n":
                    i = i-1
                if i >= p:
                    lastch = text[i]
                p = newp
                if p >= q:
                    break

            ch = text[p]

            if ch in "([{":
                push_stack(p)
                bracketing.append((p, len(stack)))
                lastch = ch
                p = p+1
                continue

            if ch in ")]}":
                if stack:
                    del stack[-1]
                # else the bracket is unmatched; it is simply ignored
                lastch = ch
                p = p+1
                bracketing.append((p, len(stack)))
                continue

            if ch == '"' or ch == "'":
                # consume string
                # Note that study1 did this with a Python loop, but
                # we use a regexp here; the reason is speed in both
                # cases; the string may be huge, but study1 pre-squashed
                # strings to a couple of characters per line.  study1
                # also needed to keep track of newlines, and we don't
                # have to.
                bracketing.append((p, len(stack)+1))
                lastch = ch
                p = _match_stringre(text, p, q).end()
                bracketing.append((p, len(stack)))
                continue

            if ch == '#':
                # consume comment and trailing newline
                bracketing.append((p, len(stack)+1))
                p = text.find('\n', p, q) + 1
                assert p > 0
                bracketing.append((p, len(stack)))
                continue

            assert ch == '\\'
            p = p+1     # beyond backslash
            assert p < q
            if text[p] != '\n':
                # the program is invalid, but can't complain
                lastch = ch + text[p]
            p = p+1     # beyond escaped char

        # end while p < q:

        self.lastch = lastch
        if stack:
            self.lastopenbracketpos = stack[-1]
        else:
            # Always assign, so that a Parser reused for new text does
            # not keep reporting the open bracket found in the text it
            # studied before.
            self.lastopenbracketpos = None
        self.stmt_bracketing = tuple(bracketing)



    # Assuming continuation is C_BRACKET, return the number

    # of spaces the next line should be indented.



    def compute_bracket_indent(self):
        """Return the indent, in columns, for a line continuing a bracket.

        Requires the continuation type to be C_BRACKET.  If some item
        follows the last open bracket, the result is the column of the
        first such item; otherwise it is the indentation of the
        bracket's line plus self.indentwidth.  Tabs are expanded using
        self.tabwidth.
        """
        self._study2()
        assert self.continuation == C_BRACKET
        bracket = self.lastopenbracketpos
        text = self.str
        size = len(text)
        bracket_line = text.rfind('\n', 0, bracket) + 1
        line_start = bracket_line
        scan = bracket + 1     # one beyond open bracket

        # find first list item; line_start tracks the start of its line
        item = None
        while scan < size:
            item = _itemre(text, scan)
            if item:
                break
            # this line is junk; advance to next line
            line_start = scan = text.find('\n', scan) + 1

        if item:
            # item.end() - 1 is the index of the first interesting char
            first_item = item.end() - 1
            return len(text[line_start:first_item].expandtabs(self.tabwidth))

        # nothing interesting follows the bracket;
        # reproduce the bracket line's indentation + a level
        indent_end = bracket_line
        while text[indent_end] in " \t":
            indent_end += 1
        width = len(text[bracket_line:indent_end].expandtabs(self.tabwidth))
        return width + self.indentwidth



    # Return number of physical lines in last stmt (whether or not

    # it's an interesting stmt!  this is intended to be called when

    # continuation is C_BACKSLASH).



    def get_num_lines_in_stmt(self):

        self._study1()

        goodlines = self.goodlines

        return goodlines[-1] - goodlines[-2]



    # Assuming continuation is C_BACKSLASH, return the number of spaces

    # the next line should be indented.  Also assuming the new line is

    # the first one following the initial line of the stmt.



    def compute_backslash_indent(self):
        """Return the indent, in columns, for a backslash-continued line.

        Requires the continuation type to be C_BACKSLASH.  If the
        stmt's first line has an assignment "=" outside brackets and
        strings, with something other than the backslash after it, the
        result lines up one column past the "=".  Otherwise it lines up
        one column past the first chunk of non-whitespace chars.  Tabs
        are expanded using self.tabwidth.
        """
        self._study2()
        assert self.continuation == C_BACKSLASH
        text = self.str
        first = self.stmt_start
        while text[first] in " \t":
            first += 1

        # See whether the initial line starts an assignment stmt; i.e.,
        # look for an = operator
        line_end = text.find('\n', first) + 1
        depth = 0           # bracket nesting level
        equals_at = None    # index of the assignment "=", if any
        pos = first
        while pos < line_end:
            ch = text[pos]
            if ch in "([{":
                depth += 1
            elif ch in ")]}":
                if depth:
                    depth -= 1
            elif ch in ('"', "'"):
                # jump over the whole string
                pos = _match_stringre(text, pos, line_end).end()
                continue
            elif ch == '#':
                break
            elif (depth == 0 and ch == '='
                  and (pos == 0 or text[pos-1] not in "=<>!")
                  and text[pos+1] != '='):
                equals_at = pos
                break
            pos += 1

        target = None
        if equals_at is not None:
            # found a legit =, but it may be the last interesting
            # thing on the line
            after = equals_at + 1   # move beyond the =
            if re.match(r"\s*\\", text[after:line_end]) is None:
                target = after

        if target is None:
            # oh well ... settle for moving beyond the first chunk
            # of non-whitespace chars
            target = first
            while text[target] not in " \t\n":
                target += 1

        prefix = text[self.stmt_start:target]
        return len(prefix.expandtabs(self.tabwidth)) + 1



    # Return the leading whitespace on the initial line of the last

    # interesting stmt.



    def get_base_indent_string(self):

        self._study2()

        i, n = self.stmt_start, self.stmt_end

        j = i

        str = self.str

        while j < n and str[j] in " \t":

            j = j + 1

        return str[i:j]



    # Did the last interesting stmt open a block?



    def is_block_opener(self):

        """Return True if the last interesting stmt ends with a colon."""

        # self.lastch is the stmt's last non-whitespace char before any

        # trailing comment, as recorded by _study2.

        self._study2()

        return self.lastch == ':'



    # Did the last interesting stmt close a block?



    def is_block_closer(self):

        """Return True if the last interesting stmt starts with a block-closing keyword."""

        # _closere accepts optional whitespace followed by return, break,

        # continue, raise or pass; it is anchored at the stmt's start.

        self._study2()

        return _closere(self.str, self.stmt_start) is not None



    # index of last open bracket ({[, or None if none

    lastopenbracketpos = None



    def get_last_open_bracket_pos(self):

        """Return the index in self.str of the last open bracket, or None."""

        # _study2 records the position of the last bracket still open in

        # the last interesting stmt; the class-level default is None.

        self._study2()

        return self.lastopenbracketpos



    # the structure of the bracketing of the last interesting statement,

    # in the format defined in _study2, or None if the text didn't contain

    # anything

    stmt_bracketing = None



    def get_last_stmt_bracketing(self):

        """Return the bracketing structure of the last interesting stmt.

        The value is the tuple of (index, nesting level) pairs built
        by _study2.
        """

        self._study2()

        return self.stmt_bracketing

