#!/usr/bin/env python

"""
Spider.py   $Id: Spider.py,v 1.49 2001/10/18 19:13:18 janssen Exp $

Recursively gets documents and thus collects a document set.


Copyright 1999, 2000 by Holger Duerer <holly@starship.python.net>

Distributable under the GNU General Public License Version 2 or newer.
"""

if __name__ == '__main__':
    ## When started as a script, make sure the PyPlucker package can be
    ## imported.  If it cannot, the parent of the directory holding this
    ## script (after following symbolic links) is put in front of the
    ## module search path and the import is tried once more.
    try:
        import PyPlucker
    except ImportError:
        import os, sys

        script = sys.argv[0]
        # The script may have been started through a chain of symlinks
        while os.path.islink (script):
            script = os.readlink (script)
        package_parent = os.path.split (os.path.dirname (script))[0]
        sys.path = [package_parent] + sys.path

        try:
            import PyPlucker
        except ImportError:
            print ("Cannot find where module PyPlucker is located!")
            sys.exit (1)

        # Drop the temporary names again
        del script, package_parent, os
    del PyPlucker
    ##
    ## From here on PyPlucker things should generally be importable
    ##



import PyPlucker
from PyPlucker import Parser, ConfigFiles
from PyPlucker.Url import URL
from PyPlucker.AliasList import AliasList
import string, sys


    
class SpiderLink:
    """Per-link spidering state.

    Bundles the attributes that decide whether and how a link is
    followed while spidering (MAXDEPTH, STAYONHOST, STAYBELOW, NOIMAGES,
    BPP, MAXWIDTH, MAXHEIGHT and POST data).

    Some attributes are handled differently depending on whether the
    link has already been taken: a MAXDEPTH found on a link that has not
    been taken yet is parked in _new_max_depth until link_taken() is
    called, and STAYONHOST / STAYBELOW are only evaluated once the link
    has been taken."""

    def __init__ (self, url, dict=None):
        """Create the link state for 'url'.

        'url' must offer get_host(), get_protocol() and
        as_string(with_fragment=...).  'dict' optionally holds link
        attributes; they are applied as if the link had already been
        taken.  Any 'post' entry in 'dict' is discarded again, so POST
        data only ever comes from link_taken() or set_post()."""
        self._url = url
        self._dict = {}
        self._current_depth = None
        self._max_depth = None
        self._new_max_depth = None
        self._stay_on_host = None
        self._stay_below = None
        self._maxwidth = None
        self._maxheight = None
        self._bpp = 1
        if dict is not None:
            self._update_from_dict (dict, after_taken=1)
        self._from_image = 0
        self.set_post (None)


    def __str__ (self):
        res = "<SpiderLink"
        res = res + (" Depth: %s/%s" % (repr(self._current_depth), repr(self._max_depth)))
        res = res + " MAXWIDTH=%s" % self._maxwidth
        res = res + " MAXHEIGHT=%s" % self._maxheight
        res = res + " BPP=%d" % self._bpp
        if self._stay_on_host:
            res = res + " STAYONHOST"
        if self._stay_below:
            res = res + (" STAYBELOW=\"%s\"" % self._stay_below)
        res = res + " " + repr (self._dict)
        res = res + ">"
        return res


    def _update_from_dict (self, dict, after_taken):
        """Update private values from the link attributes in 'dict'.

        If 'after_taken' is true, _max_depth is altered immediately
        (and the current depth restarts at 1); otherwise the new value
        is only remembered in _new_max_depth.  STAYONHOST and STAYBELOW
        are evaluated only when 'after_taken' is true.

        'dict' itself is never modified; a STAYBELOW value that had to
        be resolved to this link's URL is stored in the private copy
        only."""

        # POST processing
        if dict.has_key ('post'):
            self.set_post (dict['post'])

        # MAXWIDTH processing
        if dict.has_key ('maxwidth'):
            self.set_maxwidth (dict['maxwidth'])

        # MAXHEIGHT processing
        if dict.has_key ('maxheight'):
            self.set_maxheight (dict['maxheight'])

        # BPP processing: a value that is not a number is ignored and
        # the current setting stays in effect
        if dict.has_key ('bpp'):
            try:
                bpp = int (dict['bpp'])
            except (TypeError, ValueError, OverflowError):
                pass
            else:
                self.set_bpp (bpp)

        # MAXDEPTH processing
        if dict.has_key ('maxdepth'):
            if after_taken:
                self.set_max_depth (string.atoi (dict['maxdepth']))
                self.set_current_depth (1)
            else:
                self._new_max_depth = string.atoi (dict['maxdepth'])

        # NOIMAGES processing (done after BPP, so NOIMAGES wins)
        if dict.has_key ('noimages'):
            self.set_bpp (0)

        # STAYONHOST processing
        # This attribute is only evaluated *after* the link has been taken
        if after_taken and dict.has_key ('stayonhost'):
            self.set_stay_on_host (self._url.get_host ())

        # STAYBELOW processing
        # This attribute is only evaluated *after* the link has been taken
        staybelow_resolved = 0
        if after_taken and dict.has_key ('staybelow'):
            staybelow = dict['staybelow']
            # An empty value, or the bare attribute name, means
            # "stay below the URL of this very link"
            if staybelow == '' or string.lower (staybelow) == 'staybelow':
                staybelow = self._url.as_string (with_fragment=0)
                staybelow_resolved = 1
            self.set_stay_below (staybelow)

        # Finally remember all attributes; links created from this one
        # start out with them.
        self._dict.update (dict)
        if staybelow_resolved:
            self._dict['staybelow'] = staybelow


    def make_child_attributes (self, url, dict, inline):
        """Generate a new SpiderLink object that represents a link
        found in the document that self leads to.

        'url' is the target of the new link and 'dict' contains the
        attributes specified for it.  If 'inline' is true the new link
        stays at the current depth, otherwise it is one level deeper.
        The new link is not marked as taken and carries no POST data."""

        new = SpiderLink (url, self._dict)
        if self._current_depth is not None:
            if inline:
                new.set_current_depth (self._current_depth)
            else:
                new.set_current_depth (self._current_depth + 1)
        if self._max_depth is not None:
            new.set_max_depth (self._max_depth)
        if self._stay_on_host is not None:
            new.set_stay_on_host (self._stay_on_host)
        if self._stay_below is not None:
            new.set_stay_below (self._stay_below)
        if self._maxwidth is not None:
            new.set_maxwidth (self._maxwidth)
        if self._maxheight is not None:
            new.set_maxheight (self._maxheight)

        new.set_bpp (self._bpp)

        new._update_from_dict (dict, after_taken=0)
        new.set_post (None)

        return new


    def link_taken (self, dict_of_attributes):
        """Note that this link has been taken.  The _max_depth is
        now changed to the new value (if there was one) and the
        attributes are applied again in their 'taken' meaning."""
        if self._new_max_depth is not None:
            self._max_depth = self._new_max_depth
            self._current_depth = 1
        self._update_from_dict (dict_of_attributes, after_taken=1)


    def get_post (self):
        """Return the data for the post operation (as a string) or
        None if none was specified"""
        if self._post_data is not None:
            return str (self._post_data)
        else:
            return None


    def set_post (self, post_data):
        """Set the data for a post operation"""
        self._post_data = post_data


    def get_maxwidth (self):
        return self._maxwidth


    def set_maxwidth (self, value):
        self._maxwidth = value


    def get_maxheight (self):
        return self._maxheight


    def set_maxheight (self, value):
        self._maxheight = value


    def get_bpp (self):
        return self._bpp


    def set_bpp (self, value):
        self._bpp = value


    def set_current_depth (self, n):
        self._current_depth = n


    def set_max_depth (self, n):
        self._max_depth = n


    def set_stay_on_host (self, host):
        self._stay_on_host = host


    def set_stay_below (self, urlpart):
        self._stay_below = urlpart


    def check_fetch (self, as_image):
        """Check whether this link should be taken.  If 'as_image' is
        true, this link points to an inline image.  Returns 1 to fetch
        and 0 to skip."""

        # mailto: gets always included
        if (self._url.get_protocol() == 'mailto'):
            return 1

        if self._max_depth is not None and self._current_depth is not None:
            if self._current_depth > self._max_depth:
                # Depth exceeded
                return 0
        if as_image and self._bpp == 0:
            # No images wanted
            return 0

        if self._stay_on_host is not None:
            if self._stay_on_host != self._url.get_host():
                # Got to another host
                return 0

        if self._stay_below is not None:
            # The target must start with the STAYBELOW prefix
            target_url = self._url.as_string (with_fragment=0)
            target_part = target_url[:len (self._stay_below)]
            if self._stay_below != target_part:
                return 0

        # default: fetch
        return 1

    def get_from_image(self):
        return self._from_image

    def set_from_image(self, n):
        self._from_image = n


class Spider:
    """A class to collect web pages by spidering from the home document."""

    def __init__ (self, retriever,
                  parser,
                  collection,
                  exclusion_list,
                  config,
                  alias_list):
        """Call with a retriever and a parser function.

        'retriever' gets called as retriever (url, alias_list=...,
        post_data=...) with a Url.URL and should return a
        (header-dict, data) tuple.  In header-dict, the values 'error
        code' and 'URL' *must* be defined ('error text' is used for
        reporting when present).  If the data is valid, 'error code'
        must be zero and 'content-type' must be defined as well.

        'parser' gets called with the url text, the header-dict and the
        body plus the keyword arguments config, maxwidth, maxheight and
        image_bits_per_pixel.  Its result is unpacked as
        (document, scaled); an exception raised by it counts as a
        failed parse.

        'collection' may be None (start with an empty collection), as
        may 'exclusion_list' (nothing is excluded) and 'alias_list' (a
        fresh AliasList is used).

        Implementation:
        We have one queue and dictionaries in which we store what has
        been collected, with which link attributes it was taken, and
        what has failed."""

        self._retriever = retriever
        self._parser = parser
        self._config = config
        if collection is None:
            self._collected = {}
        else:
            self._collected = collection
        self._taken = {}
        self._failed = {}
        self._queue = []
        self._exclusion_list = exclusion_list
        if alias_list is None:
            self._alias_list = AliasList ()
        else:
            self._alias_list = alias_list
        self._fatal_error = 0

        # Now initialize the first things we *do* want in the
        # collection of documents
        url = URL (config.get_string ('home_url', 'plucker:/home.html'))
        self._home_url = url
        bpp = config.get_int ('bpp', 1)
        attributes = {'maxdepth': "%d" % config.get_int ('home_maxdepth', 2),
                      'bpp': "%d" % bpp}
        if config.get_bool ('home_stayonhost', 0):
            attributes['stayonhost'] = 1
        tmp = config.get_string ('home_staybelow')
        if tmp is not None:
            attributes['staybelow'] = tmp
        self.add_queue (url,
                        SpiderLink (url, attributes),
                        force=1)


    def _needs_processing (self, url):
        """Return 1 unless 'url' is already collected, has already
        failed or is already waiting in the queue."""
        if self._collected.has_key (url):
            return 0
        if self._failed.has_key (str (url)):
            return 0
        for (other_url, attr) in self._queue:
            if other_url == url:
                return 0
        return 1

    def add_queue (self, url, attr, force=0):
        """Maybe add url to the queue.

        The url is first mapped through the alias list.  mailto: links
        are always queued.  Otherwise the url is queued if 'force' is
        true, or if it passes the exclusion list (when there is one)
        and still needs processing."""
        url = self._alias_list.get (url)
        # mailto: gets always included
        if (url[:7] == 'mailto:'):
            self._queue.append ((url, attr))
            return
        if not force and self._exclusion_list:
            if not self._exclusion_list.check (url):
                #print "Excluding '%s'\n" % url
                return
        if force or self._needs_processing (url):
            self._queue.append ((url, attr))


    def done (self):
        """Check whether something rests to be done in the queue"""
        return len (self._queue) == 0


    def process_all (self, verbose=0):
        """Process until all done"""
        while not self.done ():
            self.process (verbose)
            sys.stdout.flush ()


    def _parse (self, url, header, document, maxwidth, maxheight, bpp):
        """Run the parser and return its (document, scaled) result.

        Any failure -- an exception from the parser or a result that
        cannot be unpacked into two values -- yields (None, 0).  A
        user interrupt or a request to exit is passed on."""
        try:
            (pluckerdoc, scaled) = self._parser (url,
                                                 header,
                                                 document,
                                                 config=self._config,
                                                 maxwidth=maxwidth,
                                                 maxheight=maxheight,
                                                 image_bits_per_pixel=bpp)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            pluckerdoc = None
            scaled = 0
        return (pluckerdoc, scaled)


    def _queue_references (self, pluckerdoc, attributes, verbose):
        """Queue the links and inline images referenced by the text
        document 'pluckerdoc' which was reached with 'attributes'."""
        (hrefs, imagerefs) = pluckerdoc.get_external_references ()
        for (suburltext, link_attrs) in hrefs:
            suburl = URL (suburltext)
            suburl.remove_fragment ()
            if suburl.as_string(with_fragment=0)[:17] != "plucker:/~parts~/":
                # Subparts are not needed for fetching
                new_attr = attributes.make_child_attributes (suburl, link_attrs, inline=0)
                if new_attr.check_fetch (as_image = 0):
                    new_attr.link_taken (link_attrs)
                    self.add_queue (suburl, new_attr)
        for (suburltext, link_attrs) in imagerefs:
            suburl = URL (suburltext)
            suburl.remove_fragment ()
            new_attr = attributes.make_child_attributes (suburl, link_attrs, inline=1)
            new_attr.set_from_image(1)
            if new_attr.check_fetch (as_image = 1):
                new_attr.link_taken (link_attrs)
                self.add_queue (suburl, new_attr)
            else:
                if verbose>1:
                    print ("Not fetching image %s" % str (suburl))


    def _retrieval_failed (self, urltext, urltext_key, header, verbose):
        """Record a failed retrieval.  If it was the home document
        that failed, everything is aborted and the fatal error flag is
        set."""
        self._failed[urltext_key] = header
        if verbose:
            if header.has_key ('error code'):
                code = header['error code']
            else:
                code = "No error code"
            if header.has_key ('error text'):
                text = header['error text']
            else:
                text = "No error text"
            print ("  Retrieved failed: %s -- %s" % (code, text))
        if urltext == self._alias_list.get (self._home_url):
            print ("Fetching the home document failed.  Aborting all!")
            self._queue = []
            self._collected = {}
            self._fatal_error = 1


    def process (self, verbose=0):
        """Process the next thing in the queue.

        Takes the first queue entry, retrieves and parses it, stores
        the result in the collection and queues whatever the document
        references.  Does nothing if the queue is empty."""

        if not self._queue:
            return

        # de-queue top of queue
        (urltext, attributes) = self._queue[0]
        url = URL (urltext)
        del self._queue[0]
        if verbose:
            urltext = str (url)
            if len (urltext) > 60:
                urltext = urltext[:40] + "....." + urltext[-15:]
            print ("Processing %s.\n           %d collected, %d still to do" % \
                   (urltext, len (self._collected.keys ()), len (self._queue)))

        # Documents fetched with POST data are kept apart from the
        # plain document by appending the data to the key
        post_data = attributes.get_post ()
        urltext = url.as_string (with_fragment=0)
        urltext_key = urltext
        if post_data is not None:
            urltext_key = urltext_key + post_data
        if self._collected.has_key (urltext_key):
            # already collected
            if verbose:
                print ("  Already done")
            return

        if self._failed.has_key (urltext_key):
            # already tried, but failed
            if verbose:
                print ("  Already tried, but failed")
            return

        (header, document) = self._retriever (url, alias_list=self._alias_list, \
                                              post_data=post_data)
        assert header.has_key ('error code'), "Headers from retriever has no error code"
        assert header.has_key ('URL'), "Headers from retriever has no URL"
        if header['error code'] != 0:
            # retrieving has failed.
            self._retrieval_failed (urltext, urltext_key, header, verbose)
            return

        # Fetched OK
        new_url = URL (header['URL']).as_string (with_fragment=0)
        ####################################################################
        # header['URL'] is file:C:\path\file.ext on Python 1.5.2 and       #
        # C:\path\file.ext on Python 2.0. Also the case of the drive       #
        # letter may change. Without this a move would be detected here.   #
        ####################################################################
        if sys.platform == 'win32' and (string.lower(urltext[0:5]) == 'file:'):
            if new_url[0:5] != 'file:':
                new_url = 'file:' + new_url
                new_url = URL (new_url).as_string (with_fragment=0)
        new_url_key = new_url
        if post_data is not None:
            new_url_key = new_url_key + post_data
        moved = (urltext != new_url)
        if moved:
            if verbose:
                print ("  Moved from '%s' to '%s'" % (urltext, new_url))
            if self._alias_list.get (urltext) == urltext:
                # The move was not recognized by the Retriever!
                # This can be caused by specifying a file: URL
                # without the 'file:' part
                self._alias_list.add (urltext, new_url)
        if verbose:
            print ("  Retrieved ok")

        if moved:
            if self._collected.has_key (new_url_key):
                if verbose:
                    print ("  Already done")
                return
            # There may be no exclusion list at all (see add_queue)
            if self._exclusion_list and not self._exclusion_list.check (new_url):
                if verbose:
                    print ("  Is excluded")
                return

        assert header.has_key ('content-type'), \
               "Headers from retriever has no content-type (%s)" % repr (header)

        # Use the configuration this spider was created with; a global
        # named 'config' only exists when this file runs as a script.
        alt_maxwidth = self._config.get_string('alt_maxwidth', None)
        alt_maxheight = self._config.get_string('alt_maxheight', None)
        if attributes.get_from_image():
            # process IMG SRC tags here
            (pluckerdoc, scaled) = self._parse (new_url, header, document,
                                                attributes.get_maxwidth (),
                                                attributes.get_maxheight (),
                                                attributes.get_bpp ())
        else:
            # not from an IMG SRC (though may be an image document)
            (pluckerdoc, scaled) = self._parse (new_url, header, document,
                                                (alt_maxwidth or attributes.get_maxwidth()),
                                                (alt_maxheight or attributes.get_maxheight()),
                                                attributes.get_bpp ())
            scaled = 0

        if pluckerdoc is None:
            headers = {'error code': -1,
                       'error text': "parsing failed"}
            self._failed[new_url_key] = headers
            if verbose:
                print ("  Parsing failed!")
            return

        self._collected[new_url_key] = pluckerdoc
        self._taken[new_url_key] = attributes
        if moved:
            self._taken[urltext_key] = attributes

        if pluckerdoc.is_text_document ():
            self._queue_references (pluckerdoc, attributes, verbose)
        elif pluckerdoc.is_image_document() and scaled and (alt_maxheight or alt_maxwidth):
            # An inline image that got scaled down: also keep a second
            # version limited by the alternative size, named <url>_BIG
            (big_pluckerdoc, scaled) = self._parse (new_url + "_BIG", header, document,
                                                    alt_maxwidth,
                                                    alt_maxheight,
                                                    attributes.get_bpp ())
            if big_pluckerdoc is not None:
                self._collected[new_url + "_BIG"] = big_pluckerdoc
                self._taken[new_url_key + "_BIG"] = attributes
            else:
                if verbose:
                    print ("  Parsing failed!")


    def get_collected (self):
        """Return the dictionary of collected documents"""
        return self._collected


    def encountered_fatal_error (self):
        """Return true if fetching the home document has failed"""
        return self._fatal_error



def execute_commands (item_name, config):
    """Run the shell commands configured under 'item_name'.

    The config strings item_name, item_name1, ... item_name9 are looked
    up in that order and every non-empty one is handed to os.system ().
    A command that returns a non-zero status (or cannot be started) is
    reported on stdout; the remaining commands are still run.  With a
    non-zero 'verbosity' setting (default 1) each command is announced
    before it runs."""
    # Imported here because at module level os is only available while
    # this file runs as a script.
    import os

    verbosity = config.get_int ('verbosity', 1)
    affixes = ['']
    for n in range (1, 10):
        affixes.append (str (n))

    for affix in affixes:
        name = item_name + affix
        command = config.get_string (name, "")
        if not command:
            continue
        if verbosity:
            print ("Executing '%s': %s" % (name, command))
        try:
            failed = os.system (command)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            failed = 1
        if failed:
            print ("Error during execution of '%s': %s" % (name, command))


def main (config, excl_lists=None):
    """Spider from the configured home URL and write out what was found.

    'config' supplies all settings ('pluckerdir', 'PLUCKERHOME',
    'verbosity', 'home_url', 'use_cache', 'doc_file', 'doc_name', ...).
    'excl_lists' is an optional sequence of names of additional
    exclusion list files.

    The collected documents are written to the cache directory if
    'use_cache' is set and to a document file otherwise.

    Returns 0 on success and 1 on an error.  sys.exit (1) is called if
    the plucker home or the plucker dir is not an existing directory."""
    import os, sys
    from PyPlucker.Parser import default_parser
    from PyPlucker.Retriever import SimpleRetriever
    from PyPlucker.Writer import CacheWriter, PDBWriter, SimpleMapping, make_document_resolver
    from PyPlucker.ExclusionList import ExclusionList
    from PyPlucker.Profiling import CodeTimerOn, StartInterval, StopInterval, PrintTable
    import PyPlucker.PluckerDocs

    def note_deprecated (oldname, newname):
        # Kept local: main () may be called from another module, where
        # the helper that exists only for script use is not defined.
        print ("NOTE: %s is a deprecated option. Please use the %s option instead." % (oldname, newname))

    if excl_lists is None:
        excl_lists = []

    CodeTimerOn()

    StartInterval("Configuration", "CodeTimer")
    pluckerdir = config.get_string ('pluckerdir')
    assert pluckerdir is not None
    pluckerhome = config.get_string ('PLUCKERHOME')
    assert pluckerhome is not None

    if not os.path.exists (pluckerhome) or not os.path.isdir (pluckerhome):
        sys.stderr.write ("Pluckerhome (%s) does not exist or isn't a directory\n" % pluckerhome)
        sys.exit (1)
    if not os.path.exists (pluckerdir) or not os.path.isdir (pluckerdir):
        sys.stderr.write ("Pluckerdir (%s) does not exist or isn't a directory\n" % pluckerdir)
        sys.exit (1)

    verbosity = config.get_int ('verbosity', 1)
    if verbosity:
        print ("Working for pluckerdir %s" % pluckerdir)

    if verbosity and os.environ.has_key ('HTTP_PROXY'):
        if os.environ.has_key ('HTTP_PROXY_USER') and os.environ.has_key ('HTTP_PROXY_PASS'):
            print ("Using proxy '%s' with authentification for user '%s'" % (os.environ['HTTP_PROXY'],os.environ['HTTP_PROXY_USER']))
        else:
            print ("Using proxy '%s'" % os.environ['HTTP_PROXY'])

    PyPlucker.PluckerDocs._DOC_HEADER_SIZE = 8
    PyPlucker.PluckerDocs._PARA_HEADER_SIZE = 4

    alias_list = AliasList ()

    if config.get_bool ('zlib_compression', 0):
        try:
            import zlib
        except ImportError:
            print ("Your python installation does not support ZLib compression.")
            print ("We fall back do DOC compression.")
            config.set ('zlib_compression', 'false')
    # Asked again on purpose: the setting may just have been switched off
    if config.get_bool ('zlib_compression', 0):
        PyPlucker.PluckerDocs.UseZLibCompression ()
        if verbosity > 1:
            print ("ZLib compression turned on")

    #
    #  Load the exclusion lists..
    #
    exclusion_list = ExclusionList (include_by_default=1)

    # The standard lists: first the one in the plucker home, then the
    # one in the plucker dir
    for directory in (pluckerhome, pluckerdir):
        excl_file = os.path.join (directory, 'exclusionlist.txt')
        if os.path.exists (excl_file):
            if verbosity > 1:
                print ("Using exclusion list  %s" % excl_file)
            exclusion_list.load_file (excl_file)

    # Extra lists: first those named in the configuration (relative
    # names are taken relative to the plucker dir), then those passed in
    extra_lists = []
    config_excl_list = config.get_string ('exclusion_lists')
    if config_excl_list is not None:
        for excl_file in string.split (config_excl_list, os.pathsep):
            if not os.path.isabs (excl_file):
                excl_file = os.path.join (pluckerdir, excl_file)
            extra_lists.append (excl_file)
    for excl_file in excl_lists:
        extra_lists.append (excl_file)

    for excl_file in extra_lists:
        if os.path.exists (excl_file) and \
           (os.path.isfile (excl_file) or os.path.islink (excl_file)):
            if verbosity > 1:
                print ("Adding extra exclusion list  %s" % excl_file)
            exclusion_list.load_file (excl_file)
    #
    # finished loading exclusion lists
    #

    home_url = config.get_string ('home_url', 'plucker:/home.html')
    if home_url != 'plucker:/home.html':
        alias_list.add ('plucker:/home.html', home_url)

    retriever = SimpleRetriever (pluckerdir, pluckerhome)

    use_cache = config.get_bool ('use_cache')
    assert use_cache is not None
    if use_cache:
        cachedir = os.path.join (pluckerdir, config.get_string ('cache_dir_name', 'cache'))
        if not (os.path.exists(cachedir) and os.path.isdir(cachedir)):
            sys.stderr.write("Error:  cache directory does not exist:  " + cachedir + "\n")
            return 1
    else:
        if not (os.path.exists(pluckerdir) and os.path.isdir(pluckerdir)):
            sys.stderr.write("Error:  Plucker directory does not exist:  " + pluckerdir + "\n")
            return 1
        doc_file = config.get_string ('doc_file')
        if not doc_file:
            doc_file = config.get_string ('db_file')
            if doc_file:
                note_deprecated ("db_file", "doc_file")
        if doc_file is None:
            # Without a name there is nothing the output file name
            # could be built from
            sys.stderr.write("Error:  no output file specified (doc_file is not set)\n")
            return 1
        if (doc_file == '<stdout>'):
            filename = doc_file
        else:
            filename = os.path.join (pluckerdir, doc_file+".pdb")
        doc_name = config.get_string ('doc_name')
        if not doc_name:
            doc_name = config.get_string ('db_name')
            if doc_name:
                note_deprecated ("db_name", "doc_name")
            elif (doc_file != '<stdout>'):
                # use basename in case only file name is given
                doc_name = os.path.basename (doc_file)
            else:
                # generate name based on home URL, using at most 31
                # characters
                if len(home_url) > 31:
                    doc_name = "..." + home_url[-28:]
                else:
                    doc_name = home_url

    StopInterval("Configuration", "CodeTimer")

    StartInterval("GatherPages", "CodeTimer")
    spider = Spider (retriever.retrieve,
                     default_parser,
                     collection=None,
                     exclusion_list=exclusion_list,
                     config=config,
                     alias_list=alias_list)
    spider.process_all(verbose=verbosity)
    StopInterval("GatherPages", "CodeTimer")

    if spider.encountered_fatal_error ():
        sys.stderr.write("Fatal error while processing.  Nothing written.")
        return 1

    if verbosity:
        print ("\nWriting out collected data...")
    collection = spider.get_collected ()

    StartInterval("WritingDB", "CodeTimer")
    if use_cache:
        writer = CacheWriter (collection, config, cachedir)
        if verbosity:
            print ("Writing to cache dir %s" % cachedir)
    else:
        writer = PDBWriter (collection, config, name=doc_name, version=1, filename=filename)
        if verbosity:
            print ("Writing document '%s' to file %s" % (doc_name, filename))

    mapping = SimpleMapping (make_document_resolver (collection), alias_list=alias_list)
    mapping = writer.write (verbose=verbosity, mapping=mapping, alias_list=alias_list)
    StopInterval("WritingDB", "CodeTimer")

    if verbosity > 2:
        # List the mapping the other way round, sorted by what the
        # entries were mapped to
        print ("\nMapping: ")
        mapped = mapping.get_mapping ()
        rev = {}
        for i in mapped.keys ():
            rev[mapped[i]] = i
        keys = rev.keys ()
        keys.sort ()
        for id in keys:
            print ("%s => %s" % (id, rev[id]))

    if verbosity > 1:
        items = Parser.unknown_things.keys ()
        if items:
            print ("Unknown items encountered:")
            items.sort ()
            for item in items:
                print ("  %s: %s" % (item, Parser.unknown_things[item]))

    if verbosity > 1:
        PrintTable(sys.stdout, "CodeTimer")

    if verbosity:
        print ("Done!")

    return 0

if __name__ == '__main__':
    import getopt, os

    if os.environ.has_key ('PLUCKERHOME'):
        pluckerhome = os.environ['PLUCKERHOME']
    else:
        pluckerhome = os.path.expanduser ("~/.plucker")

    if os.environ.has_key ('PLUCKERDIR'):
        pluckerdir = os.environ['PLUCKERDIR']
    else:
        pluckerdir = None

    def deprecated ( oldname, newname ):
        """Print a note that option 'oldname' is deprecated and that
        'newname' should be used instead."""
        template = "NOTE: %s is a deprecated option. Please use the %s option instead."
        print (template % ( oldname, newname ))

    def usage (reason=None, pluckerhome=pluckerhome):
        """Print the short usage message, preceded by 'reason' as an
        error message if one is given, and exit with status 1."""
        if reason:
            print ('Error:  ' + reason)
        program = sys.argv[0]
        for line in ("Usage: %s [OPTIONS] [HOMEURL]",
                     "(Type '%s --help' for more information.)"):
            print (line % program)
        sys.exit (1)

    def display_help (pluckerhome=pluckerhome):
        """Print the complete command line help text to stdout.

        `pluckerhome` is the directory shown as the default plucker home
        in the description of the -P/--pluckerhome option.

        The function only prints; the caller decides whether to exit.
        """
        help_lines = (
            "Usage: %s [OPTIONS] [HOMEURL]" % sys.argv[0],
            "  where HOMEURL is a 'file:' or 'http:' URL (which can alternatively",
            "  be specified with --home-url=<homeurl>) and OPTIONS are:",
            "    -c, --update-cache:",
            "                   Write a traditional cache directory in the <plucker dir>",
            "    -f <name prefix>, --doc-file=<name prefix>",
            "                   Specify the name of the output file (see also --pluckerdir).",
            "                   If not specified, the Plucker doc will be written",
            "                   to stdout, if stdout is not a tty.",
            "    -h, --help:    Print this help",
            "    -N <name>, --doc-name <name>",
            "                   Specify the name of the document (NOT the filename).",
            "                   Defaults to -f's argument.",
            "    -q, --quiet:   Be quiet, i.e. set verbosity level 0",
            "    -v:            Set verbosity level 1 (which is the default)",
            "    -V <n>, --verbosity=<n>:",
            "                   Set verbosity level <n>",
            "                     Verbosity level 0 is silent except for errors",
            "                     Verbosity level 1 gives progress status (default)",
            "                     Verbosity level 2 is used for debugging",
            "    -P<dir>, --pluckerhome=<dir>:",
            "                   Use <dir> as plucker home instead of the default ",
            ("                   %s" % pluckerhome) + " (~/.plucker/ or $PLUCKERHOME)",
            "    -p<dir>, --pluckerdir=<dir>:",
            "                   Use <dir> as plucker dir instead of the default ",
            "                   Defaults to same as plucker home.",
            "    --bpp=<num>:   Bits per pixel for images; defaults to 1",
            "                     <num> =  0, 1, 2, 4, 8 or 16 (16 not on Windows)",
            "    --noimages:    Do not include images (same as --bpp=0)",
            "    -H <homeurl>, --home-url=<homeurl>:  Use <homeurl> as the root document.",
            "                     Defaults to plucker:/home.html (i.e. home.html in",
            "                     the plucker dir)",
            "    -M <n>, --maxdepth=<n>:",
            "                   Use MAXDEPTH=<n> on the home document.  Defaults to 2",
            "    -E <filename>, --exclusion-list <filename>: ",
            "                   Add <filename> to list of files searched for exclusion lists",
            "    -s <secname>, --extra-section=<secname>:",
            "                   Add <secname> to the list of searched section in the config files",
            "    --zlib-compression, --doc-compression:",
            "                   Specify which compression method to use. (For expert's use)",
            "    --compression=<compression-type>:",
            "                   Use <compression-type> as the compression format",
            "                   for the database.  Allowable options are 'doc', for",
            "                   Palm DOC compression, or 'zlib', for zlib compression.",
            "                   Zlib compression is typically better than DOC compression.",
            "    --no-urlinfo:  Do not include info about the URLs",
            "    --category=<category-name1>[;<category-name2>;..;<category-name16>]:",
            "                   Put <category-name> in the database as the default",
            "                   viewer category for the database.",
            "                   It is possible to assign several categories separated by ';'",
            "    --stayonhost:  Do not follow external URLs",
            "    --staybelow=<url-prefix>:",
            "                   Automatically exclude any URL that doesn't begin with <url-prefix>.",
            "    --maxheight=<n>:",
            "                   Set maximum height of images to <n> pixels.",
            "    --maxwidth=<n>:",
            "                   Set maximum width of images to <n> pixels.",
            "    --alt-maxheight=<n>:",
            "                   Set alternative maximum height of images to <n> pixels.  This value",
            "                   is used for 'big' versions of inline images that had to be scaled",
            "                   down in size to obey the MAXWIDTH and MAXHEIGHT parameters.",
            "    --alt-maxwidth=<n>:",
            "                   Set alternative maximum width of images to <n> pixels.  This value",
            "                   is used for 'big' versions of inline images that had to be scaled",
            "                   down in size to obey the MAXWIDTH and MAXHEIGHT parameters.",
            "    --launchable, --not-launchable:",
            "                   Set (or unset) the launchable bit in the output file.",
            "    --backup, --no-backup:",
            "                   Set or clear the backup bit in the output file.",
            "    --beamable, --not-beamable:",
            "                   Set or clear the beamable bit in the output file.",
            # --icon and --charset are accepted by the option parser but
            # were missing from the help text.
            "    --icon=<filename>:",
            "                   Use <filename> as the icon file (sets the 'big_icon' option).",
            "    --charset=<charset>:",
            "                   Use <charset> as the default charset.  <charset> is either",
            "                   a charset name or a decimal MIBenum value.",
            "",
            "Note that you must specify either -f or specify HOMEURL as an argument,",
            " or specify -c to update a cache.",
            "",
            )
        for line in help_lines:
            print (line)

    # Command line state.  None (or an empty list) means "not given on the
    # command line"; the code further down only overrides the config files
    # for values that were actually supplied.
    home_url = None
    verbosity = None
    bpp = None
    max_depth = None
    use_cache = None
    use_file = None
    doc_name = None
    exclusion_lists = []
    extra_sections = []
    zlib_compression = None
    no_url_info = None
    stayonhost = None
    staybelow = None
    category = None
    maxwidth = None
    maxheight = None
    alt_maxwidth = None
    alt_maxheight = None
    launchable = None
    backup = None
    copy_protect = None
    iconfile = None
    default_charset = None

    try:
        (opts, args) = getopt.getopt (sys.argv[1:], "f:chqvV:p:P:H:E:M:N:s:",
                                      ["db-file=", "doc-file=", "help", "quiet",
                                       "pluckerdir=", "pluckerhome=",
                                       "bpp=", "noimages", "exclusion-list=",
                                       "maxdepth=", "db-name=", "doc-name=",
                                       "extra-section=", "verbosity=",
                                       "zlib-compression", "doc-compression",
                                       "no-urlinfo", "stayonhost", "staybelow=", "category=",
                                       "maxheight=", "maxwidth=", "alt-maxheight=", "alt-maxwidth=",
                                       "compression=", "home-url=", "update-cache", "launchable",
                                       "not-launchable", "backup", "no-backup", "beamable", "not-beamable",
                                       "icon=", "charset="])
    except getopt.error:
        # Pass the message as a string: usage() concatenates it to its
        # "Error:" prefix.  sys.exc_info() avoids the version-specific
        # syntax for binding the exception instance.
        usage (str (sys.exc_info ()[1]))

    if args:
        # A single positional argument names the root document.
        if len (args) > 1:
            # [:1] instead of [0]: an empty-string argument must not raise
            # IndexError.
            if args[1][:1] == '-':
                usage ("All options (such as '" + string.join (args[1:]) + "') must be specified before the argument.")
            else:
                usage ("Only one 'root' document should be specified as an argument.")
        root = args[0]
        root_scheme = string.lower (root[:5])
        if len (root) > 5 and (root_scheme == 'http:' or root_scheme == 'file:'):
            home_url = root
        elif os.path.exists (root):
            # a plain path to an existing file is turned into a file: URL
            home_url = 'file:' + root
        else:
            usage ("Can't locate " + root)

    for (opt, arg) in opts:
        if opt == "-h" or opt == "--help":
            display_help ()
            sys.exit (0)
        elif opt == "-c" or opt == "--update-cache":
            use_cache = 1
        elif opt == "-f" or opt == "--doc-file":
            use_file = arg
        elif opt == "--db-file":
            deprecated ( "db-file", "doc-file" )
            use_file = arg
        elif opt == "--bpp":
            try:
                bpp = string.atoi (arg)
            except ValueError:
                # not a number at all: rejected by the range check below
                bpp = None
            if bpp not in (0, 1, 2, 4, 8, 16):
                usage ("Only 0, 1, 2, 4, 8 or 16 allowed for --bpp")
        elif opt == "--noimages":
            bpp = 0
        elif opt == "-H" or opt == "--home-url":
            # may conflict with a root document given as positional argument
            if home_url and (home_url != arg):
                usage ("Two different root URLs specified:  " + home_url + " and " + arg)
            home_url = arg
        elif opt == "-E" or opt == "--exclusion-list":
            exclusion_lists.append (arg)
        elif opt == "-q" or opt == "--quiet":
            verbosity = 0
        elif opt == "-v":
            verbosity = 1
        elif opt == "-V" or opt == "--verbosity":
            try:
                verbosity = string.atoi (arg)
            except ValueError:
                usage ("Verbosity level must be an integer, not '%s'" % arg)
        elif opt == "-M" or opt == "--maxdepth":
            try:
                max_depth = string.atoi (arg)
            except ValueError:
                usage ("Maximum depth must be an integer, not '%s'" % arg)
        elif opt == "-N" or opt == "--doc-name":
            doc_name = arg
        elif opt == "--db-name":
            deprecated ( "db-name", "doc-name" )
            doc_name = arg
        elif opt == "--pluckerdir" or opt == "-p":
            pluckerdir = arg
        elif opt == "--pluckerhome" or opt == "-P":
            pluckerhome = arg
        elif opt == "-s" or opt == "--extra-section":
            extra_sections.append (arg)
        elif opt == "--zlib-compression":
            zlib_compression = 'true'
        elif opt == "--doc-compression":
            zlib_compression = 'false'
        elif opt == "--compression":
            if arg == "doc":
                zlib_compression = 'false'
            elif arg == "zlib":
                zlib_compression = 'true'
            else:
                # report the bad value instead of falling through to the
                # misleading "Unknown option" error
                usage ("Unknown compression type '%s'; use 'doc' or 'zlib'" % arg)
        elif opt == "--no-urlinfo":
            no_url_info = 'true'
        elif opt == "--stayonhost":
            stayonhost = 'true'
        elif opt == "--staybelow":
            staybelow = arg
        elif opt == "--category":
            category = arg
        elif opt == "--maxheight":
            maxheight = arg
        elif opt == "--maxwidth":
            maxwidth = arg
        elif opt == "--alt-maxheight":
            alt_maxheight = arg
        elif opt == "--alt-maxwidth":
            alt_maxwidth = arg
        elif opt == "--launchable":
            launchable = 1
        elif opt == "--not-launchable":
            launchable = 0
        elif opt == "--backup":
            backup = 1
        elif opt == "--no-backup":
            backup = 0
        elif opt == "--beamable":
            # beamable is the inverse of copy protection
            copy_protect = 0
        elif opt == "--not-beamable":
            copy_protect = 1
        elif opt == "--icon":
            iconfile = arg
        elif opt == "--charset":
            default_charset = arg
        else:
            # usage() adds the "Error:" prefix itself
            usage ("Unknown option '%s'" % opt)

    def error_logger (message):
        """Write `message` to stderr, terminated by a newline."""
        line = message + "\n"
        sys.stderr.write (line)
        
    # Load the configuration; problems are reported through error_logger.
    config = ConfigFiles.Configuration (pluckerhome,
                                        pluckerdir,
                                        extra_sections=extra_sections,
                                        error_logger=error_logger)

    # Copy any proxy settings found in the config files into the
    # environment.
    for _cfg_key, _env_name in (('http_proxy', 'HTTP_PROXY'),
                                ('http_proxy_user', 'HTTP_PROXY_USER'),
                                ('http_proxy_pass', 'HTTP_PROXY_PASS')):
        _cfg_value = config.get_string (_cfg_key)
        if _cfg_value is not None:
            os.environ[_env_name] = _cfg_value

    # The plucker dir comes from the command line, else from the config
    # files, else it defaults to the plucker home (and that default is
    # recorded in the config).
    if pluckerdir is None:
        pluckerdir = config.get_string ('pluckerdir')
    if pluckerdir is None:
        pluckerdir = pluckerhome
        config.set ('pluckerdir', pluckerdir)

    
    if use_file is None and use_cache is None:
        # Neither -f nor -c was given: the config files have to decide
        # where the output goes, and they must not ask for both.
        if config.get_string ('db_file') is not None and config.get_bool ('use_cache'):
            usage ("Config files specify both a 'db_file' and a 'use_cache=1'.\nYou must decide by specifiying an argument!")
        if config.get_string ('doc_file') is not None and config.get_bool ('use_cache'):
            usage ("Config files specify both a 'doc_file' and a 'use_cache=1'.\nYou must decide by specifiying an argument!")
        _file_configured = not (config.get_string ('db_file') is None
                                and config.get_string ('doc_file') is None)
        if _file_configured:
            config.set ('use_cache', 0)
        elif config.get_string ('use_cache') is None:
            # Nothing configured anywhere: fall back to writing to stdout,
            # but only when stdout is not a terminal.
            if not sys.stdout.isatty ():
                use_file = '<stdout>'
                # presumably silenced so that progress output cannot mix
                # with the document on stdout -- TODO confirm
                verbosity = 0
            else:
                usage ("No output filename specified, and stdout is a terminal!")

    # -f and -c exclude each other
    if use_file and use_cache:
        usage ("You must not specify both -f and -c!")

    # a document name is rejected together with -c
    if doc_name and use_cache:
        usage ("Specify -N/--doc-name only with -f!")

    # No compression option on the command line: take the 'compression'
    # setting of the config files, if it names a known method.
    if zlib_compression is None:
        _configured_compression = config.get_string ('compression')
        if _configured_compression == "zlib":
            zlib_compression = 'true'
        elif _configured_compression == "doc":
            zlib_compression = 'false'
        
    # Resolve the default charset to its MIBenum value.  `mibenum` stays
    # None when no default charset was given anywhere.
    mibenum = None
    # if not specified on command line, look in the config files
    if default_charset is None:
        default_charset = config.get_string ("default_charset")
    # if we have one, validate it
    if default_charset is not None:
        from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum, charset_known_names
        # NOTE(review): neither module is used in this block; the import is
        # kept in case later code relies on the names -- confirm and remove.
        import string, re
        mibenum = charset_name_to_mibenum (default_charset)
        if mibenum:
            config.set ('default_charset', mibenum)
        else:
            # usage() adds the "Error:  " prefix itself; the second line is
            # indented to line up with the text after that prefix.
            usage ("Unsupported charset '" + default_charset + "' specified as default charset.\n"
                   "        Charset must be either a decimal MIBenum value, or one of " + str (charset_known_names ()))

    # Copy the command line options into the config.  Only values that
    # were actually given override what the config files say.
    if use_file is not None:
        config.set ('doc_file', use_file)
        config.set ('use_cache', 0)
    if doc_name is not None:
        config.set ('doc_name', doc_name)
    if use_cache is not None:
        config.set ('use_cache', 1)
    if use_file:
        config.set ('use_cache', 0)
    if category is not None:
        # ';' separates the categories, so at most 15 separators are allowed
        if string.count (category, ";") >= 16:
            usage ("Max number of categories is 16!")
        else:
            config.set ('category', category)

    # options that are set whenever they were given at all
    for _key, _value in (('bpp', bpp),
                         ('home_maxdepth', max_depth),
                         ('verbosity', verbosity),
                         ('home_url', home_url)):
        if _value is not None:
            config.set (_key, _value)

    # options that are only set when they hold a true value
    for _key, _value in (('zlib_compression', zlib_compression),
                         ('no_url_info', no_url_info),
                         ('home_stayonhost', stayonhost),
                         ('home_staybelow', staybelow)):
        if _value:
            config.set (_key, _value)

    # image size limits
    for _key, _value in (('maxheight', maxheight),
                         ('maxwidth', maxwidth),
                         ('alt_maxheight', alt_maxheight),
                         ('alt_maxwidth', alt_maxwidth)):
        if _value is not None:
            config.set (_key, _value)

    # attribute bits of the output file; None leaves the config untouched
    if launchable == 1:
        config.set ('launchable_bit', 1)
        # requesting the launchable bit also turns on the 'icon' option
        config.set ('icon', 1)
    elif launchable == 0:
        config.set ('launchable_bit', 0)
    for _key, _value in (('backup_bit', backup),
                         ('copyprevention_bit', copy_protect)):
        if _value == 1:
            config.set (_key, 1)
        elif _value == 0:
            config.set (_key, 0)

    if iconfile is not None:
        config.set ('icon', 1)
        config.set ('big_icon', iconfile)
    if mibenum is not None:
        config.set ('default_charset', mibenum)

    # Prefix every exclusion list name with the plucker dir (the list is
    # updated in place).
    _resolved_lists = []
    for _list_name in exclusion_lists:
        _resolved_lists.append (os.path.join (pluckerdir, _list_name))
    exclusion_lists[:] = _resolved_lists

    # Run the spider between the configured before/after commands and
    # exit with main()'s return value.
    execute_commands ("before_command", config)
    retval = main (config, exclusion_lists)
    execute_commands ("after_command", config)

    sys.exit (retval)
