#!/usr/bin/env python
# $Id: crawl.py,v 1.2 2000/12/31 03:15:06 wesc Exp $
#
# crawl.py -- crawls websites
#
# created by wesc 00/07/24

### TO-DO:
# - add thread support

import urllib
from sys        import argv # , stderr.write
from os         import makedirs, unlink, sep
from os.path    import isdir, exists, dirname, splitext
from string     import replace, find, lower
from htmllib    import HTMLParser
from urlparse   import urlparse, urljoin
from formatter  import DumbWriter, AbstractFormatter
from cStringIO  import StringIO

class My404UrlOpener(urllib.FancyURLopener):
    '''My404UrlOpener -- subclass of urllib.FancyURLopener that retries a
        request once, with a trailing slash appended, when the server answers
        404 Not Found (urlopener class to be used by urllib.urlretrieve())
    '''

    once = 0        # retry-in-progress flag: 1 only while the '/' retry runs

    # http_error_404() --> file-like response object
    def http_error_404(self, url, fp, errcode, errmsg, headers, data=None):
        '''http_error_404() -- called when server cannot find requested link;
            we change it here because we want to retry links without trailing
            slashes *with* the slash in because sometimes it makes a difference!

            url is the address without its leading "http:" (it is put back
            when the response object is built); fp and headers belong to the
            404 response; data is passed through to the retry unchanged.

            Returns whatever the retry returns, or, when no retry is made,
            the 404 response itself wrapped in urllib.addinfourl.
        '''
        # retry only once per request, and only when a slash can be added
        if not My404UrlOpener.once and url[-1:] != '/':
            My404UrlOpener.once = 1
            try:
                fp.close()        # first response is being abandoned
                return self.open_http(url + '/', data)        # retry
            finally:
                # clear the flag so that the NEXT request gets its own retry;
                # without this only the first 404 of the whole run is retried
                My404UrlOpener.once = 0

        # already retried (or nothing to retry), so just return 404 response
        return urllib.addinfourl(fp, headers, "http:" + url)

# Module-level side effect: install an instance of our opener as urllib's
# default, so the urllib.urlretrieve() call in Retriever.download() uses it.
urllib._urlopener = My404UrlOpener()        # tell urllib to use MY URLopener


class LinkImageParser(HTMLParser):
    '''LinkImageParser -- subclass of htmllib.HTMLParser which additionally
        records the source of every image it is handed, in the order seen,
        in the list self.imgsrclist
    '''

    # __init__() --> None
    def __init__(self, fmtr, v=0):
        '''__init__() -- initialize the base parser with formatter fmtr and
        flag v, then start out with an empty image list'''
        HTMLParser.__init__(self, fmtr, v)        # base class does the real setup
        self.imgsrclist = []                      # image sources collected so far

    # handle_image() --> None
    def handle_image(self, src, alt, *args):
        '''handle_image() -- pass the alt text on as ordinary data and
        remember src; any further positional arguments are ignored'''

        self.handle_data(alt)
        self.imgsrclist.append(src)


class Retriever:
    '''LinkImageParser -- subclass of htmllib.HTMLParser to implement a retry
        method with an additional trailing slash for 404 Not Found errors
        (urlopener class to be used by urllib.urlretrieve())
    '''

    # __init__() --> None
    def __init__(self, url):
        '__init__() -- constructor for Retriever class, saves url and filename'
        self.url = url
        self.file = self.filename(url)

    # filename() --> string
    def filename(self, url, deffile='index.html'):
        'filename() -- takes URL and creates a valid local filename'

        parsedurl = urlparse(url, '', 0)
        host = parsedurl[1]	# www.python.org
        path = parsedurl[2]	# /doc/FAQ.html
	
	# handles input of "xxx.yyy.zzz" (no protocol) because path
	# is set to domain and host is blank so swap them back
	if host == '' and path != '':
	    host, path = path, host

	# add trailing slash (so .com) is not seen as file "extension"
	if path == '':
	    path = '/'

	# directory path is hostname plus file path
        dirpath = host + path

	# pull out extension; if none there, it is an index file
        ext = splitext(dirpath)
        if ext[1] == '':
            if dirpath[-1] == '/':
                dirpath = dirpath + deffile
            else:
                dirpath = '%s/%s' % (dirpath, deffile)

	# change path sep if non-UNIX
	if sep != '/':
	    dirpath = replace(dirpath, '/', sep)

	# pull out directory name only
        dir = dirname(dirpath)

	# does it exist and is it a directory?  if not, create it
        if not isdir(dir):
            if exists(dir):
                unlink(dir)
            makedirs(dir)	# as of 1.5.2
        return dirpath

    # download() --> tuple
    def download(self):
        'download() -- call urllib.urlretrieve() to get data'

        try:
            retval = urllib.urlretrieve(self.url, self.file)
            self.parse()
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    # parse() --> None
    def parse(self):
        'parse() -- parse retrieved HTML page, adding links/images to object'
        self.parser = LinkImageParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()

    # getPages() --> list
    def getPages(self):
        'getPages() -- returns list of links'
        return self.parser.anchorlist

    # getImages() --> list
    def getImages(self):
        'getImages() -- returns list of images'
        return self.parser.imgsrclist


class Crawler:
    '''Crawler -- this is the main class for our crawler/robot/spider application;
        it processes each link in the Q, creating a Retriever object for each link to
        download (we can add threads to this later), saving a list of already seen
        links in a separate list.
    '''

    count = 0        # number of objects downloaded

    # __init__() --> None
    def __init__(self, url):
        '__init__() -- constructor for our Crawler, sets first link in Q and domain'
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    # getPage() --> None
    def getPage(self, url):
        'getPage() -- create Retriever to process a page, adding new links to Q'

        # create Retriever and download link
        r = Retriever(url)
        retval = r.download()

        # error
        if retval[0] == '*':
            print retval, '... skipping parse'
            return

        # otherwise downloaded ok, display link downloaded and saved location
        Crawler.count = Crawler.count + 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        # add unvisited links and images to Q
        links = r.getPages() + r.getImages()
        for eachLink in links:

            # check for missing http:// but not ftp://, gopher, etc.;
            # use urlparse.urljoin() to get the top part of path
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            # skip "mailto:" links
            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            # process potential new links
            if eachLink not in self.seen:

                # check if within same domain
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:

                    # check if already in Q
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'

            # already visited link
            else:
                    print '... discarded, already processed'

    # go() --> None
    def go(self):
        'go() -- process all links in Q (processing may add new items to Q)'
        while self.q:
            url = self.q.pop()
            self.getPage(url)


# main() --> None
def main():
    '''main() -- picks the starting link (first command-line argument, or a
    prompt when none was given) and runs a Crawler on it; does nothing when
    no link was supplied'''

    url = ''        # stays empty if the prompt is interrupted

    if len(argv) > 1:
        # starting link given on the command-line
        url = argv[1]
    else:
        # ask for it; Ctrl-C or end-of-file counts as "no link"
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            pass

    # crawl only when there is a first link
    if url:
        s = Crawler(url)
        s.go()


# launch only if invoked directly (run as a script); importing this module
# does not start a crawl
if __name__ == '__main__':
    main()

