#!/usr/bin/python

import os
import sys
import string
import re
import time

# Recognizes one "key = value" line of the config file.  Group 1 is the
# key and group 2 the value; whitespace on either side of '=' is dropped.
cfg_assign = re.compile(r'^(.*?)\s*=\s*(.*)$')

# Built-in defaults for every setting.
# The Perl version used the one name "config" for both the file name and
# the hash.  A Python name refers to a single object whatever its type,
# so the dictionary gets its own name here: "configuration".
configuration = dict(
    hometitle="Eric's Home Page",
    indextitle="Map of Eric's Home Web",
    fullname="Eric S. Raymond",
    mailaddr="esr@thyrsus.com",
    homepage="http://www.tuxedo.org/~esr/",
    exclude=[
        "test",
        "netbuilder",
        "jargon",
        "java",
        "worlds",
        "keeper",
        "mersenne",
        "netscape",
    ],
    language="english",
    icondirs="",
    icontext="",
    body="",
)


# Visitor for os.path.walk, which plays the role of Perl's File::Find.
# os.path.walk(top, makelist, xtra) calls makelist(xtra, dir, files) once
# for every directory at or below top; dir is the directory being visited
# and files is the list of names found in it.

# The patterns are compiled once, at import time, instead of once per
# directory visited.  Python's re module uses Perl's regular expression
# syntax.
_DESC_RE = re.compile(
    r'<META\s*NAME\s?=\s?"DESCRIPTION"\s*CONTENT\s?=\s?"([^"]*)"',
    re.IGNORECASE)
_TITLE_RE = re.compile(r'<TITLE>([^<]*)</TITLE>', re.IGNORECASE)

# File name endings that mark a page as HTML.
_HTML_SUFFIXES = ('.html', '.shtml', '.htm')


def makelist(xtra, dir, files):
    """Record title and description of the HTML pages in one directory.

    xtra  -- two-element sequence.  xtra[0] is the list of exclusion
             strings; xtra[1] is the list that receives one
             (path, title, description) tuple per indexed page.
    dir   -- the directory being visited (the name shadows the builtin,
             but it is part of the established signature).
    files -- names of the entries in dir (bare names, not full paths).

    A page is indexed only if its name ends in .html, .shtml or .htm, its
    full path contains none of the exclusion strings, and it carries a
    'description' META tag.  A page without a TITLE element is listed as
    "No title".  Nothing is returned; results are appended to xtra[1].
    """
    excluded = xtra[0]
    pages = xtra[1]
    # Paths produced by walking '.' start with './' (os.curdir + os.sep).
    current_dir_prefix = os.curdir + os.sep

    for name in files:
        # name is just the file name, not the whole path.
        fullpath = os.path.join(dir, name)
        if not fullpath.endswith(_HTML_SUFFIXES):
            continue

        # The original sitemap searched for the excluded strings anywhere
        # in the path of each file.  That behaviour is preserved here with
        # a plain substring test instead of a regular expression.
        if any(entry in fullpath for entry in excluded):
            continue

        # files also lists subdirectories; one named like a page cannot
        # be opened and read, so skip anything that is not a regular file.
        if not os.path.isfile(fullpath):
            continue

        # Slurp the entire file; the with-statement closes it even if the
        # read fails.
        with open(fullpath, 'r') as page:
            text = page.read()

        # Replace newlines with spaces so that titles and descriptions
        # spanning several lines come out on one line.  str.replace
        # returns a new string, so the result has to be kept.
        text = text.replace('\n', ' ')

        # Don't index files without a 'description' META tag.
        desc_mo = _DESC_RE.search(text)
        if desc_mo is None:
            continue

        # Just in case someone forgot a title.
        title_mo = _TITLE_RE.search(text)
        if title_mo is not None:
            title = title_mo.group(1)
        else:
            title = "No title"

        # Drop the leading './' only when it is really there, so a walk
        # started somewhere other than '.' keeps its paths intact.
        if fullpath.startswith(current_dir_prefix):
            shown_path = fullpath[len(current_dir_prefix):]
        else:
            shown_path = fullpath

        # Real tuples are used here; the Perl version fakes them with
        # strings joined by the '|' character.
        pages.append((shown_path, title, desc_mo.group(1)))

# Comparison function used after the directory walk to order the pages
# for which we are making a sitemap.
# x and y are page tuples; their first elements are the file paths.

def _indsort_key(path):
    """Return the (depth, name) pair by which a page path is ordered."""
    # An index.html page is ordered under the name of its directory, which
    # places it among the entries of the parent directory.  Comparing the
    # base name (rather than the last ten characters) keeps files such as
    # 'zindex.html' from being mistaken for an index page.
    if os.path.basename(path) == 'index.html':
        path = os.path.dirname(path)
    # The depth is kept as a number: as a string prefix, depth 10 would
    # sort ahead of depth 2.  os.sep keeps this platform independent.
    return (path.count(os.sep), path)


def indsort(x, y):
    """Compare two page tuples for sorting.

    Pages are grouped by subdirectory depth first and ordered by path
    within the same depth.  Returns -1, 0 or 1 in the manner of the old
    builtin cmp(), so it can be passed to list.sort() as a comparison
    function.
    """
    first = _indsort_key(x[0])
    second = _indsort_key(y[0])
    # Equivalent to cmp(first, second) without relying on the builtin.
    return (first > second) - (first < second)
   
# Locate the user's home directory.  os.path.expanduser('~') uses $HOME
# when it is set and otherwise falls back to the platform's own lookup
# (the password database on UNIX), so a missing HOME variable does not
# raise KeyError the way os.environ['HOME'] would.
home = os.path.expanduser('~')

# The config file may be named as the first command-line argument.
# Reading sys.argv[1] when it doesn't exist would raise an IndexError, so
# the length is checked first.  Python is almost never quiet about
# errors, so they have to be dealt with more explicitly than in Perl.
if len(sys.argv) > 1:
    config = sys.argv[1]
else:
    # Default: .sitemaprc in the home directory, joined with the
    # platform's own path separator.
    config = os.path.join(home, '.sitemaprc')

# A triple-quoted string permits multi-line text with unescaped quotes.
# This is an HTML fragment: the angle brackets shown around the address
# are written as entities, and the link uses a mailto: URL so that it
# points at the mailbox rather than at a relative page.
sitemappy_author = """Tom Bryan
                     &lt;<a href="mailto:tbryan@python.net">tbryan@python.net</a>&gt;"""

# Load the user's settings from the config file, if there is one.
# Each meaningful line has the form "key = value".  Blank lines and lines
# whose first non-blank character is '#' are skipped.  Keys are folded to
# lower case.  Every 'exclude' line adds one entry to the exclusion list;
# any other key simply overrides the value in the configuration
# dictionary.
#
# Code that is expected to raise errors goes inside a try statement; the
# except clause below says how to handle them.
try:
    config_file = open(config)
    try:
        # A config file replaces the built-in exclusion list.  All other
        # defaults are kept, so a file that sets only some of the keys
        # still leaves every setting defined.
        configuration['exclude'] = []
        for line in config_file:
            # Stripping both ends makes blank and comment lines easy to
            # spot and keeps trailing whitespace out of the values.
            line = line.strip()
            if not line or line[0] == '#':
                continue
            # m_obj is None when the line is not an assignment.
            m_obj = cfg_assign.search(line)
            if m_obj is None:
                sys.stderr.write("Unrecognized config line %s\n" % line)
                continue
            key = m_obj.group(1).lower()
            value = m_obj.group(2)
            if key == 'exclude':
                configuration['exclude'].append(value)
            else:
                configuration[key] = value
    finally:
        # Close the file whether or not parsing succeeded.
        config_file.close()
# Perhaps the config file doesn't exist or can't be read.  That is not an
# error for this script: the settings gathered so far are used as they are.
except IOError:
    pass

# Break the current local time into its components.
(year, month, mday, hour, minute, sec, wday, day, isdst) = time.localtime()

# Language of the generated page; English when none is configured.
language = configuration.get('language', 'english')

# Set up the multi-language support as in the Perl version.
if language == 'norwegian':
    months = ('Jan','Feb','Mar','Apr','Mai','Jun','Jul','Aug','Sep','Okt','Nov','Des')
else:
    months = ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec')

# Localized text fragments for the page, plus the formatted date.
# The fragments are inserted into HTML, so accented letters are written
# as character entities: that keeps this source file pure ASCII and makes
# the output independent of the page's character set.  Long messages are
# built from adjacent string literals so that no source indentation ends
# up inside the text.
message = {}
if language == 'french':
    message['sitemap'] = 'Carte du site'
    message['back_to'] = 'Retour '
    message['autogen'] = (
        'Cet index a &eacute;t&eacute; g&eacute;n&eacute;r&eacute; '
        'automatiquement &agrave; partir de balises META '
        'pr&eacute;sentes dans chaque page. Les pages de plus haut niveau '
        'sont disponibles au d&eacute;but de cette carte du site.')
    message['toolgen'] = (
        'L\'utilitaire de g&eacute;n&eacute;ration automatique est '
        '"sitemap.py", &eacute;crit par %s') % sitemappy_author
    date = '%d %s %04d,  %d h %d mn' % (mday, months[month-1], year, hour, minute)
elif language == 'swedish':
    message['sitemap'] = 'Sajtkarta'
    message['back_to'] = 'Tillbaka till'
    message['autogen'] = (
        'Denna indexsida &auml;r automatiskt genererad fr&aring;n '
        'meta-taggar p&aring; varje sida. '
        'Toppniv&aring;sidor &auml;r listade f&ouml;rst.')
    message['toolgen'] = (
        'Den h&auml;r sidan &auml;r genererad av "sitemap.py", '
        'skapad av %s.') % sitemappy_author
    date = '%04d-%02d-%02d %02d:%02d' % (year, month, mday, hour, minute)
elif language == 'german':
    message['sitemap'] = 'Site Map'
    message['back_to'] = 'Zur&uuml;ck zu'
    message['autogen'] = (
        'Dieser Index wurde automatisch generiert aus Meta Tags '
        'aller Seiten. Top-Level-Seiten werden zuerst gelistet.')
    message['toolgen'] = (
        'Diese Seite wurde generiert von "sitemap.py", '
        'geschrieben von %s.') % sitemappy_author
    date = '%d %s %04d %02d:%02d' % (mday, months[month-1], year, hour, minute)
elif language == 'norwegian':
    message['sitemap'] = 'Nettkart'
    message['back_to'] = 'Tilbake til'
    message['autogen'] = (
        'Dette er en indeks generert automatisk ut fra '
        'meta-tagger p&aring; hver side. '
        'Toppniv&aring;-sider er listet f&oslash;rst.')
    message['toolgen'] = (
        'Denne siden er generert av "sitemap.py", '
        'skrevet av %s.') % sitemappy_author
    date = '%d %s %04d, at %d:%02d' % (mday, months[month-1], year, hour, minute)
else:
    # English is the fallback for any language without a translation.
    message['sitemap'] = 'Site Map'
    message['back_to'] = 'Back to'
    message['autogen'] = (
        'This is an index automatically generated from meta tags '
        'present in each of the pages.  Top-level pages are listed first.')
    message['toolgen'] = (
        'This page generated by "sitemap.py", written by %s.<BR> '
        '"sitemap.py" is based on "sitemap" by Eric S. Raymond.') % sitemappy_author
    date = '%d %s %04d, at %d:%02d' % (mday, months[month-1], year, hour, minute)
# Optional icons shown in the listing.  Each setting holds the URL of an
# image; an empty or missing setting means "no icon", in which case the
# corresponding variable is the empty string.
icondirs_url = configuration.get('icondirs', '')
if icondirs_url != '':
    icondirs = '<img src="%s" alt="Dir">' % icondirs_url
else:
    icondirs = ''

icontext_url = configuration.get('icontext', '')
if icontext_url != '':
    icontext = '<img src="%s" alt="Text">' % icontext_url
else:
    icontext = ''

# Extra attributes for the BODY tag; a dictionary retrieval with a
# default value covers the case where the setting is absent.
body_attr = configuration.get('body', '')
body = '<BODY %s>' % body_attr

# Emit the top of the page.  The triple-quoted template stands in for
# Perl's heredoc; the values are collected separately, in the order in
# which the %s placeholders appear in the template.
page_header = '''
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
  <TITLE>%s</TITLE>
  <META NAME="KEYWORDS" CONTENT="%s"> 
  <LINK REV=MADE HREF="mailto:%s">
</HEAD>
%s
<TABLE WIDTH="100%%" CELLPADDING=0><TR>
<TD WIDTH="50%%">%s <A HREF="%s">%s</A>
<TD WIDTH="50%%" ALIGN=RIGHT>%s
</TR></TABLE>
<HR><P>
<H1 ALIGN=CENTER>%s</H1>

<P>%s

<DL>
'''
header_values = (
    configuration['indextitle'],    # TITLE
    message['sitemap'],             # KEYWORDS
    configuration['mailaddr'],      # LINK REV=MADE
    body,                           # opening BODY tag
    message['back_to'],             # navigation cell: label ...
    configuration['homepage'],      # ... link target ...
    configuration['hometitle'],     # ... link text
    date,                           # right-hand cell
    message['sitemap'],             # H1 heading
    message['autogen'],             # introductory paragraph
)
print(page_header % header_values)

# pages collects one (path, title, description) tuple per indexed HTML
# page.  makelist fills it in during the walk of the current directory,
# and indsort then puts the entries into listing order.
pages = []
walk_args = (configuration['exclude'], pages)
os.path.walk('.', makelist, walk_args)
pages.sort(indsort)

# Write one entry per page, with a horizontal rule whenever the directory
# part of the path changes.  The stem of the previous entry has to exist
# before the first comparison, because Python complains about names that
# are read before anything has been assigned to them.
oldstem = None
for (page_path, page_title, page_desc) in pages:
    # The original took the part of the path before the first slash; it
    # seems the stem should be everything but the file name, which
    # os.path.dirname provides in a platform-independent way.
    newstem = os.path.dirname(page_path)
    # NOTE: the separator still doesn't quite work, because of the way
    # index.html files are treated.
    if newstem != oldstem:
        print('<DT><P ALIGN=RIGHT><HR WIDTH="80%%">\n%s<BR>' % icondirs)
    entry_values = (icontext, page_path, page_path, page_title, page_desc)
    print('<DT>%s\n<a href="%s">%s</a>: <B>%s</B><DD>\n\t%s\n' % entry_values)
    oldstem = newstem


# Emit the bottom of the page: the credit for the generating tool, a
# navigation row matching the one at the top (back-link on the left, date
# on the right), and an ADDRESS line with the maintainer's name and mail
# address.  The values below are listed in placeholder order: the date
# belongs to the right-hand cell and the full name to the ADDRESS line.
# The angle brackets shown around the mail address are written as
# entities; a literal '<addr>' would be read by browsers as an unknown
# tag and not displayed.
print('''
</DL>
<P>
<HR>
%s
<HR>
<TABLE WIDTH="100%%" CELLPADDING=0><TR>
<TD WIDTH="50%%">%s <A HREF="%s">%s</A>
<TD WIDTH="50%%" ALIGN=RIGHT>%s
</TR><TR>
<TD COLSPAN=2><ADDRESS>%s <A HREF="mailto:%s">&lt;%s&gt;</A></ADDRESS>
</TR></TABLE>

</BODY>
</HTML>
''' % (message['toolgen'], message['back_to'], configuration['homepage'],
       configuration['hometitle'], date, configuration['fullname'],
       configuration['mailaddr'], configuration['mailaddr']))

# But this isn't very Pythonish...