""" robotparser.py



    Copyright (C) 2000  Bastian Kleineidam



    You can choose between two licenses when using this package:

    1) GNU GPLv2

    2) PSF license for Python 2.2



    The robots.txt Exclusion Protocol is implemented as specified in

    http://www.robotstxt.org/norobots-rfc.txt

"""

import urlparse

import urllib



__all__ = ["RobotFileParser"]





class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []           # Entry objects for specific user agents
        self.default_entry = None   # the "User-agent: *" entry, consulted last
        self.disallow_all = False   # fetch got 401/403: everything forbidden
        self.allow_all = False      # fetch got another error >= 400: no limits
        self.set_url(url)
        self.last_checked = 0       # time of the last fetch, 0 = never fetched

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # keep the network location and path components for later use
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # authorization required: act as if everything is disallowed
            self.disallow_all = True
        elif self.errcode >= 400:
            # no robots.txt (or server error): no restrictions apply
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        """Register a parsed entry, routing the catch-all one specially."""
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                # a blank line terminates the current record
                if state == 1:
                    # user-agent lines without any rules are discarded
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # rules ended implicitly; start a fresh record
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # Bug fix: route the final entry through _add_entry() so that a
            # trailing "User-agent: *" record (not followed by a blank line)
            # becomes the default entry instead of a regular one.
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])





class RuleLine:
    """One Allow/Disallow directive: a (quoted) URL path prefix plus a
    flag that is True for "Allow" and False for "Disallow"."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty "Disallow:" value means allow everything
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches any path; otherwise match on the path prefix
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        if self.allowance:
            verb = "Allow"
        else:
            verb = "Disallow"
        return verb + ": " + self.path





class Entry:
    """A robots.txt record: one or more user-agent names together with
    zero or more Allow/Disallow rule lines that apply to them."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        pieces = []
        for name in self.useragents:
            pieces.append("User-agent: %s\n" % name)
        for rule in self.rulelines:
            pieces.append("%s\n" % rule)
        return ''.join(pieces)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # compare only the product token, case-insensitively
        token = useragent.split("/")[0].lower()
        for name in self.useragents:
            if name == '*':
                # the catch-all agent matches everyone
                return True
            if name.lower() in token:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # the first matching rule wins; no matching rule means allowed
        for rule in self.rulelines:
            if rule.applies_to(filename):
                return rule.allowance
        return True



class URLopener(urllib.FancyURLopener):
    """FancyURLopener subclass that records the last HTTP error code
    (200 when no error handler fired)."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200  # assume success until an error handler runs

    def prompt_user_passwd(self, host, realm):
        """Never prompt: a robots.txt reachable only with a password is
        treated as if the file wasn't there."""
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Remember the error code, then defer to the base implementation."""
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(
            self, url, fp, errcode, errmsg, headers)

