#!/usr/bin/python
import os
import sys
import string
import re
import time
# Pattern for one "key = value" line of the configuration file:
# group 1 is the key, group 2 is everything after the '=' sign.
cfg_assign = re.compile(r'^(.*?)\s*=\s*(.*)$')

# Built-in default settings.
# The name "config" is used further down for the configuration *file
# name*, and a Python name can refer to only one object at a time
# (there is no per-type namespace as in Perl), so the settings
# dictionary is called "configuration" instead.
configuration = dict(
    hometitle="Eric's Home Page",
    indextitle="Map of Eric's Home Web",
    fullname="Eric S. Raymond",
    mailaddr="esr@thyrsus.com",
    homepage="http://www.tuxedo.org/~esr/",
    exclude=[
        "test",
        "netbuilder",
        "jargon",
        "java",
        "worlds",
        "keeper",
        "mersenne",
        "netscape",
    ],
    language="english",
    icondirs="",
    icontext="",
    body="",
)
# makelist has the visitor signature used by os.path.walk, a function
# similar to Perl's File::Find. os.path.walk(top, makelist, xtra) calls
# makelist(xtra, dir, files) once for each directory, starting at top,
# where dir is the directory being visited and files is the list of
# names found in that directory.
# The patterns are compiled once, when the module is loaded, instead of
# on every call. Python's re module uses Perl's regular expression syntax.
_DESC_RE = re.compile(
    r'<META\s*NAME\s?=\s?"DESCRIPTION"\s*CONTENT\s?=\s?"([^"]*)"',
    re.IGNORECASE)
_TITLE_RE = re.compile(r'<TITLE>([^<]*)</TITLE>', re.IGNORECASE)

# File name suffixes that mark a file as an HTML page.
_HTML_SUFFIXES = ('.html', '.htm', '.shtml')


def makelist(xtra, dir, files):
    """Collect title and description of the HTML pages in one directory.

    Parameters:
        xtra  -- two-element sequence. xtra[0] is an iterable of strings;
                 a file is skipped when any of them occurs anywhere in its
                 path. xtra[1] is a list to which one
                 (path, title, description) tuple is appended per page.
        dir   -- the directory being visited.
        files -- names (not full paths) of the entries in that directory.

    Only names ending in .html, .htm or .shtml are opened, and only pages
    that carry a DESCRIPTION META tag are recorded. A page without a
    TITLE element is recorded with the title "No title".

    Returns None; results are delivered through xtra[1]. An error raised
    while opening or reading a page is not caught here.
    """
    excluded = xtra[0]
    pages = xtra[1]
    for name in files:
        # name is just the file name, not the whole path.
        fullpath = os.path.join(dir, name)
        if not fullpath.endswith(_HTML_SUFFIXES):
            continue
        # Exclusion is a plain substring test against the whole path,
        # as in the original sitemap.
        if any(entry in fullpath for entry in excluded):
            continue
        # Slurp the entire file; the handle is closed even if read() fails.
        with open(fullpath, 'r') as page:
            text = page.read()
        # Replace newlines with spaces so that tags and captured text that
        # span several lines come out on one line. Strings are immutable,
        # so the result has to be assigned back.
        text = text.replace('\n', ' ')
        desc_mo = _DESC_RE.search(text)
        if desc_mo is None:
            # Don't index files without a 'description' META tag.
            continue
        title_mo = _TITLE_RE.search(text)
        if title_mo is not None:
            title = title_mo.group(1)
        else:
            # Just in case someone forgot a title.
            title = "No title"
        # Drop the first two characters of the path -- presumably the
        # leading './' produced by walking from '.'; confirm if the walk
        # ever starts somewhere else.
        pages.append((fullpath[2:], title, desc_mo.group(1)))
# This sort function is called after os.path.walk to order
# the pages for which we are making a sitemap.
# x and y are our nested lists. Their first elements are the filenames.
def _indsort_key(path):
    """Return the (depth, path) pair by which a page path is ordered."""
    head, tail = os.path.split(path)
    # An index.html page sorts as its directory name, i.e. as an entry
    # of the parent directory. Only a file literally named index.html
    # qualifies; a mere suffix match such as "myindex.html" does not.
    if tail == 'index.html':
        path = head
    # The separator count is kept as a number so that depth 10 sorts
    # after depth 2 instead of before it.
    return (path.count(os.sep), path)


def indsort(x, y):
    """Comparison function ordering page records for the site map.

    x and y are sequences whose first element is the page's path.
    Pages are grouped by subdirectory depth (shallowest first) and
    ordered by path within one depth.

    Returns -1, 0 or 1, like cmp().
    """
    first = _indsort_key(x[0])
    second = _indsort_key(y[0])
    # Equivalent to cmp(first, second), without needing the cmp builtin.
    return (first > second) - (first < second)
# Locate the user's home directory. os.path.expanduser is used rather
# than os.environ['HOME'] so that an unset HOME does not raise KeyError
# before the command line has even been looked at.
home = os.path.expanduser('~')
# Indexing sys.argv[1] when it does not exist would raise IndexError,
# so the length is checked first. The first command-line argument, if
# present, names the configuration file; otherwise the default is
# .sitemaprc in the home directory.
if len(sys.argv) > 1:
    config = sys.argv[1]
else:
    # Python interpolation of a variable into a string
    config = "%s/.sitemaprc" % home
# A triple-quoted string permits multi-line text with unescaped quotes.
# This text is interpolated into the generated HTML, so the angle
# brackets shown around the address are written as entities and the
# link uses a mailto: URL.
sitemappy_author = """Tom Bryan
&lt;<a href="mailto:tbryan@python.net">tbryan@python.net</a>&gt;"""
# Read the configuration file, overriding the built-in defaults.
# Each non-blank, non-comment line has the form "key = value". Keys are
# case-insensitive. "exclude" may appear several times; every occurrence
# adds one entry to the exclude list.
try:
    config_file = open(config)
    try:
        # As before, the exclude list of a configuration file that can
        # be opened replaces the built-in one. All other defaults are
        # kept, so settings the file does not mention (for example
        # 'language') still exist when they are looked up later.
        configuration['exclude'] = []
        for line in config_file:
            # Strip surrounding whitespace (including the line ending),
            # then skip blank lines and lines starting with '#'.
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m_obj = cfg_assign.search(line)
            # m_obj is None when the line is not an assignment
            if m_obj is None:
                sys.stderr.write("Unrecognized config line %s\n" % line)
                continue
            # group is a method: it must be called, not subscripted.
            key = m_obj.group(1).lower()
            value = m_obj.group(2)
            if key == 'exclude':
                configuration['exclude'].append(value)
            else:
                configuration[key] = value
    finally:
        # Close the file even if reading or parsing fails.
        config_file.close()
except IOError:
    # Perhaps the config file doesn't exist or can't be read.
    # That is deliberately not an error: the settings gathered so far
    # (the defaults, if nothing was read) are used.
    pass
# Get the local time, broken down into its fields.
(year, month, mday, hour, minute, sec, wday, day, isdst) = time.localtime(time.time())

# Optional settings are read with .get() so that a configuration
# without them falls back to the default instead of raising KeyError.
language = configuration.get('language', 'english')

# Set up the multi-language support as in the Perl version.
if language == 'norwegian':
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'Mai', 'Jun',
              'Jul', 'Aug', 'Sep', 'Okt', 'Nov', 'Des')
else:
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Localized page texts and the formatted timestamp.
# All of these strings end up in the generated HTML, so accented
# characters are written as HTML entities: the source stays pure ASCII
# and the output does not depend on a character encoding. Long texts
# use adjacent string literals with explicit spaces, so the wording
# does not depend on how the source lines are indented.
message = {}
if language == 'french':
    message['sitemap'] = 'Carte du site'
    message['back_to'] = 'Retour &agrave;'
    message['autogen'] = (
        'Cet index a &eacute;t&eacute; g&eacute;n&eacute;r&eacute; '
        'automatiquement &agrave; partir de balises META '
        'pr&eacute;sentes dans chaque page. Les pages de plus haut '
        'niveau sont disponibles au d&eacute;but de cette carte du site.')
    message['toolgen'] = (
        'L\'utilitaire de g&eacute;n&eacute;ration automatique est '
        '"sitemap.py", &eacute;crit par %s' % sitemappy_author)
    date = '%d %s %04d, &agrave; %d h %d mn' % (
        mday, months[month - 1], year, hour, minute)
elif language == 'swedish':
    message['sitemap'] = 'Sajtkarta'
    message['back_to'] = 'Tillbaka till'
    message['autogen'] = (
        'Denna indexsida &auml;r automatiskt genererad fr&aring;n '
        'meta-taggar p&aring; varje sida. Toppniv&aring;sidor &auml;r '
        'listade f&ouml;rst.')
    message['toolgen'] = (
        'Den h&auml;r sidan &auml;r genererad av "sitemap.py", '
        'skapad av %s.' % sitemappy_author)
    date = '%04d-%02d-%02d %02d:%02d' % (year, month, mday, hour, minute)
elif language == 'german':
    message['sitemap'] = 'Site Map'
    message['back_to'] = 'Zur&uuml;ck zu'
    message['autogen'] = (
        'Dieser Index wurde automatisch generiert aus Meta Tags '
        'aller Seiten. Top-Level-Seiten werden zuerst gelistet.')
    message['toolgen'] = (
        'Diese Seite wurde generiert von "sitemap.py", '
        'geschrieben von %s.' % sitemappy_author)
    date = '%d %s %04d %02d:%02d' % (
        mday, months[month - 1], year, hour, minute)
elif language == 'norwegian':
    message['sitemap'] = 'Nettkart'
    message['back_to'] = 'Tilbake til'
    message['autogen'] = (
        'Dette er en indeks generert automatisk ut fra '
        'meta-tagger p&aring; hver side. Toppniv&aring;-sider er '
        'listet f&oslash;rst.')
    message['toolgen'] = (
        'Denne siden er generert av "sitemap.py", '
        'skrevet av %s.' % sitemappy_author)
    date = '%d %s %04d, at %d:%02d' % (
        mday, months[month - 1], year, hour, minute)
else:
    message['sitemap'] = 'Site Map'
    message['back_to'] = 'Back to'
    message['autogen'] = (
        'This is an index automatically generated from meta tags '
        'present in each of the pages. Top-level pages are listed first.')
    message['toolgen'] = (
        'This page generated by "sitemap.py", written by %s.<BR> '
        '"sitemap.py" is based on "sitemap" by Eric S. Raymond.'
        % sitemappy_author)
    date = '%d %s %04d, at %d:%02d' % (
        mday, months[month - 1], year, hour, minute)

# Optional icons shown in front of directory separators and page
# entries. Both names are always defined, because the listing code
# uses them unconditionally.
icondirs = ''
if configuration.get('icondirs', '') != '':
    icondirs = '<img src="%s" alt="Dir">' % configuration['icondirs']
icontext = ''
if configuration.get('icontext', '') != '':
    icontext = '<img src="%s" alt="Text">' % configuration['icontext']

# a dictionary retrieval with a default value
body_attr = configuration.get('body', '')
body = '<BODY %s>' % body_attr
# Page header. The triple-quoted template plays the role of Perl's
# heredoc; its %s slots are filled, in order, from header_values.
# A literal percent sign in the HTML is written as %%.
header_template = '''
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<TITLE>%s</TITLE>
<META NAME="KEYWORDS" CONTENT="%s">
<LINK REV=MADE HREF="mailto:%s">
</HEAD>
%s
<TABLE WIDTH="100%%" CELLPADDING=0><TR>
<TD WIDTH="50%%">%s <A HREF="%s">%s</A>
<TD WIDTH="50%%" ALIGN=RIGHT>%s
</TR></TABLE>
<HR><P>
<H1 ALIGN=CENTER>%s</H1>
<P>%s
<DL>
'''
header_values = (
    configuration['indextitle'],   # <TITLE>
    message['sitemap'],            # KEYWORDS
    configuration['mailaddr'],     # LINK REV=MADE
    body,                          # opening BODY tag
    message['back_to'],            # "Back to" label
    configuration['homepage'],     # home page URL
    configuration['hometitle'],    # home page link text
    date,                          # right-hand table cell
    message['sitemap'],            # <H1> heading
    message['autogen'],            # introductory paragraph
)
print(header_template % header_values)
# cmp_to_key adapts the comparison function indsort to the key= form
# of list.sort(), the only form Python 3 accepts.
from functools import cmp_to_key

# pages will be a list of tuples.
# Each element holds the name, title, and description of one HTML page.
pages = []
walk_args = (configuration['exclude'], pages)
# os.walk replaces os.path.walk, which was removed in Python 3.
# makelist is still called once per directory with that directory's
# path and the names of the files in it.
for (dirpath, dirnames, filenames) in os.walk('.'):
    makelist(walk_args, dirpath, filenames)
pages.sort(key=cmp_to_key(indsort))

# oldstem must be bound before the loop compares against it.
oldstem = None
for (path, title, desc) in pages:
    # The original got the part of the path before the first slash.
    # It seems that stem should be everything but the filename.
    # os.path.split implements this with some platform independence.
    newstem = os.path.split(path)[0]
    # Print a HR separator whenever the directory changes.
    # This still doesn't quite work because of the way index.html
    # files are treated.
    if oldstem != newstem:
        print('<DT><P ALIGN=RIGHT><HR WIDTH="80%%">\n%s<BR>' % icondirs)
    print('<DT>%s\n<a href="%s">%s</a>: <B>%s</B><DD>\n\t%s\n' %
          (icontext, path, path, title, desc))
    oldstem = newstem
# Page footer. The values are listed in the order of the %s slots in
# the template: tool credit, "back to" link, date in the right-hand
# cell, then the maintainer's name and mail address inside <ADDRESS>.
# The angle brackets shown around the address are written as entities;
# a literal <...> would be read by browsers as an unknown tag and the
# address would not be displayed.
footer_template = '''
</DL>
<P>
<HR>
%s
<HR>
<TABLE WIDTH="100%%" CELLPADDING=0><TR>
<TD WIDTH="50%%">%s <A HREF="%s">%s</A>
<TD WIDTH="50%%" ALIGN=RIGHT>%s
</TR><TR>
<TD COLSPAN=2><ADDRESS>%s <A HREF="mailto:%s">&lt;%s&gt;</A></ADDRESS>
</TR></TABLE>
</BODY>
</HTML>
'''
footer_values = (
    message['toolgen'],
    message['back_to'],
    configuration['homepage'],
    configuration['hometitle'],
    date,                          # right-hand cell, as in the header
    configuration['fullname'],     # name shown inside <ADDRESS>
    configuration['mailaddr'],     # mailto: target
    configuration['mailaddr'],     # visible address text
)
print(footer_template % footer_values)
# But this isn't very Pythonish...