#!/usr/bin/perl
#
# sitemap -- synthesize a site map from meta-description data
#
# A silly little Perl script I wrote to automatically generate a map of
# my website. Indexes all pages under the current directory and writes
# the map page to standard output. POD documentation is included at the
# end of this file.
#
# By Eric S. Raymond, Copyright 1997. Use and redistribute freely.
# 1.2 changes by Dave Pearson <davep@hagbard.demon.co.uk>.
# 1.3 changes by Jean-Philippe Argaud <jp.argaud@iname.com>.
# 1.4 changes by ESR.
# 1.5 fix suggested by Imre Simon.
# 1.6 Corrected month array.
# 1.7 Jean-Philippe Argaud's change to support separator icons.
# Erik Rossen <rossen@planet.ch> fixed a bug with wrapped meta tags.
# Swedish-language support added.
# 1.8 German-language support by Michael Wiedmann. Recognize .htm files.
# 1.9 Norwegian-language support added, national month names added by
# Erik I. Bolsø <eriki@himolde.no>
#
use File::Find;
# Built-in defaults.  Every key holds a plain string except "exclude",
# which holds a reference to a list of patterns; makelist() matches each
# one as a regular expression against a page's relative path.
%config = (
    hometitle  => "Eric's Home Page",
    indextitle => "Map of Eric's Home Web",
    fullname   => "Eric S. Raymond",
    mailaddr   => "esr\@thyrsus.com",
    homepage   => "http://www.tuxedo.org/~esr/",
    exclude    => [ "test", "netbuilder", "jargon", "java",
                    "worlds", "keeper", "mersenne", "netscape" ],
    language   => "english",
    icondirs   => "",
    icontext   => "",
    body       => "",
);

# The configuration file is the first command-line argument, or
# ~/.sitemaprc when no argument is given.
$home   = $ENV{HOME} || ( getpwuid($<) )[7];
$config = $ARGV[0] || "$home/.sitemaprc";

# Credit line for the tool itself; it is interpolated into the localized
# "generated by" message.  The angle brackets around the mail link are
# written as entities so that the emitted markup is well-formed.
$sitemapauthor = "Eric S. Raymond\n"
    . "&lt;<A HREF=\"mailto:esr\@thyrsus.com\">esr\@thyrsus.com</A>&gt;";

# Three-argument open: the file name comes from the command line and must
# never be interpreted as an open mode or a pipe.
if ( open( my $rc, '<', $config ) ) {
    # Once a config file is read, its exclude lines replace the built-in
    # exclusion list rather than extending it.
    $config{exclude} = [];
    while ( my $line = <$rc> ) {
        chomp $line;

        # Ignore comments and blank lines.
        next if $line =~ /^\s*$/;
        next if $line =~ /^\s*\#/;

        # "key = value"; keys are case-insensitive, and whitespace around
        # either side (including a trailing CR) is not part of the value.
        if ( $line =~ /^\s*(.*?)\s*=\s*(.*?)\s*$/ ) {
            my ( $key, $value ) = ( lc($1), $2 );
            if ( $key eq "exclude" ) {
                push( @{ $config{exclude} }, $value );
            } else {
                $config{$key} = $value;
            }
        } else {
            # Diagnostics go to STDERR: STDOUT is the generated HTML page.
            warn "Unrecognised config line '$line'\n";
        }
    }
    close($rc);
} elsif ( $ARGV[0] ) {
    # A missing default ~/.sitemaprc is normal; a config file that was
    # named explicitly but cannot be read deserves a diagnostic.
    warn "sitemap: cannot open config file '$config': $!\n";
}
# Break the current local time into fields.  localtime() counts months
# from zero and years from 1900, so normalise both before formatting.
my ( $min, $hour, $mday, $mon, $year ) = ( localtime(time) )[ 1 .. 5 ];
$mon  += 1;
$year += 1900;

# Match the configured language case-insensitively, so "French" and
# "french" select the same texts.
my $language = lc( $config{language} || "" );

# Abbreviated month names.  Declared at file scope (not inside a
# conditional block) so the date formats below can actually see the list.
my @month = $language eq 'norwegian'
    ? qw(Jan Feb Mar Apr Mai Jun Jul Aug Sep Okt Nov Des)
    : qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# Numeric timestamp shared by the Swedish, Norwegian and German pages.
my $isodate = sprintf( "%04d-%02d-%02d %02d:%02d",
                       $year, $mon, $mday, $hour, $min );

# Localized page texts (%message) and the formatted timestamp ($date).
# Any language that is not listed falls back to English.
if ( $language eq 'french' ) {
    $message{sitemap} = "Carte du site";
    $message{back_to} = "Retour à";
    $message{autogen} = "Cet index a été généré automatiquement à partir de\n"
        . "balises META présentes dans chaque page. Les pages\n"
        . "de plus haut niveau sont disponibles au début de cette\n"
        . "carte du site.";
    $message{toolgen} = "L'utilitaire de génération automatique est \"sitemap\",\n"
        . "écrit par $sitemapauthor.";
    $date = sprintf( "%d %s %d, à %d h %02d mn",
                     $mday, $month[ $mon - 1 ], $year, $hour, $min );
} elsif ( $language eq 'swedish' ) {
    $message{sitemap} = "Sajtkarta";
    $message{back_to} = "Tillbaka till";
    $message{autogen} = "Denna indexsida är automatiskt genererad från "
        . "meta-taggar på varje sida. Toppnivåsidor är listade först.";
    $message{toolgen} = "Den här sidan är genererad av \"sitemap\", "
        . "skapad av $sitemapauthor.";
    $date = $isodate;
} elsif ( $language eq 'norwegian' ) {
    $message{sitemap} = "Nettkart";
    $message{back_to} = "Tilbake til";
    $message{autogen} = "Dette er en indeks generert automatisk ut fra "
        . "meta-tagger på hver side. Toppnivå-sider er listet først.";
    $message{toolgen} = "Denne siden er generert av \"sitemap\", "
        . "skrevet av $sitemapauthor.";
    $date = $isodate;
} elsif ( $language eq 'german' ) {
    $message{sitemap} = "Site Map";
    $message{back_to} = "Zurück zu";
    $message{autogen} = "Dieser Index wurde automatisch generiert aus Meta Tags\n"
        . "aller Seiten. Top-Level-Seiten werden zuerst gelistet.";
    $message{toolgen} = "Diese Seite wurde generiert von \"sitemap\",\n"
        . "geschrieben von $sitemapauthor.";
    $date = $isodate;
} else {
    $message{sitemap} = "Site Map";
    $message{back_to} = "Back to";
    $message{autogen} = "This is an index automatically generated from meta tags\n"
        . "present in each of the pages. Top-level pages are listed\n"
        . "first.";
    $message{toolgen} = "This page generated by \"sitemap\",\n"
        . "written by $sitemapauthor.";
    $date = sprintf( "%d %s %d, at %d:%02d",
                     $mday, $month[ $mon - 1 ], $year, $hour, $min );
}
# Optional icons: $icondirs marks each change of directory in the listing,
# $icontext marks each page entry.  Either one is the empty string when no
# icon path has been configured.
$icondirs = $config{icondirs} ne ""
    ? qq(<IMG SRC="$config{icondirs}" ALT="Dir">)
    : "";
$icontext = $config{icontext} ne ""
    ? qq(<IMG SRC="$config{icontext}" ALT="Text">)
    : "";

# Opening BODY tag, carrying the configured attributes when there are any.
my $body;
if ( $config{body} ) {
    $body = "<BODY $config{body}>";
} else {
    $body = "<BODY>";
}

# Page head, navigation bar, heading and the start of the definition list
# that receives one entry per indexed page.
print <<EOF;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<TITLE>$config{indextitle}</TITLE>
<META NAME="KEYWORDS" CONTENT="$message{sitemap}">
<LINK REV=MADE HREF="mailto:$config{mailaddr}">
</HEAD>
$body
<TABLE WIDTH="100%" CELLPADDING=0><TR>
<TD WIDTH="50%">$message{back_to} <A HREF="$config{homepage}">$config{hometitle}</A>
<TD WIDTH="50%" ALIGN=RIGHT>$date
</TR></TABLE>
<HR><P>
<H1 ALIGN=CENTER>$message{sitemap}</H1>
<P>$message{autogen}
<DL>
EOF
# Walk the tree below the current directory.  makelist() appends one
# "file|title|description" record to @pages for every page it accepts;
# indsort() then orders the records for display.
finddepth( \&makelist, "." );
@pages = sort indsort @pages;

# Emit one <DT>/<DD> entry per page, preceded by a separator (and the
# directory icon, if configured) whenever the first path component differs
# from that of the previous entry.
my $oldstem = "";
for my $item (@pages) {
    # Limit the split to three fields so that a '|' inside the description
    # cannot truncate it.
    my ( $file, $title, $desc ) = split( /\|/, $item, 3 );
    my ($newstem) = split( '/', $file );
    print "<DT><P ALIGN=RIGHT><HR WIDTH=\"80\%\">\n$icondirs\n<BR>"
        if $oldstem ne $newstem;
    print "<DT>$icontext\n<A HREF=\"$file\">$file</A>: <B>$title</B><DD>\n\t$desc\n\n";
    $oldstem = $newstem;
}

# Close the list and write the footer.  The mail address is shown between
# &lt; and &gt; entities: raw angle brackets would be parsed as an unknown
# tag and the address would not be displayed at all.
print <<EOF;
</DL>
<P>
<HR>
$message{toolgen}
<HR>
<TABLE WIDTH="100%" CELLPADDING=0><TR>
<TD WIDTH="50%">$message{back_to} <A HREF="$config{homepage}">$config{hometitle}</A>
<TD WIDTH="50%" ALIGN=RIGHT>$date
</TR><TR>
<TD COLSPAN=2><ADDRESS>$config{fullname} <A HREF="mailto:$config{mailaddr}">&lt;$config{mailaddr}&gt;</A></ADDRESS>
</TR></TABLE>
</BODY>
</HTML>
EOF
# File::Find callback: decide whether the file currently being visited
# belongs in the site map and, if so, append a "file|title|description"
# record to @pages.
#
# Reads $_ (the name of the file being visited) and $File::Find::dir (the
# directory containing it), as set up by File::Find.  A page is indexed only
# when its name ends in .html/.htm/.shtml/.shtm, it matches none of the
# patterns in $config{exclude}, it is not a symbolic link, and it carries a
# non-empty META DESCRIPTION.
#
# Returns 0 once a candidate file has been examined, and an empty value
# when the file is rejected up front.
sub makelist
{
    # Keep our own copy of the name so that File::Find's $_ is never
    # modified behind its back.
    my $name = $_;

    # Path relative to the starting directory, without the leading "./".
    my $file = substr( "$File::Find::dir/$name", 2 );

    return unless $file =~ /\.s?html?$/;

    # Exclude entries are regular expressions.  Empty entries are skipped:
    # an empty pattern would silently reuse the last successful match.
    return if grep( length($_) && $file =~ /$_/, @{ $config{"exclude"} } );
    return if -l $name;

    # Three-argument open, so an odd file name can never be taken for an
    # open mode or a pipe.
    my $fh;
    unless ( open( $fh, '<', $name ) ) {
        warn "sitemap: cannot read $file: $!\n";
        return 0;
    }

    # Slurp the whole file; "local" restores the record separator on exit.
    my $content = do { local $/; <$fh> };
    close($fh);
    $content = "" unless defined $content;

    # Fold every newline into a space so tags wrapped across lines match
    # and the captured texts stay on one line.
    $content =~ tr/\n/ /;

    # Take the first description and the first title found in the page.
    my ( $title, $desc ) = ( "", "" );
    $desc  = $1 if $content =~ /<META\s*NAME="DESCRIPTION"\s*CONTENT="([^"]*)">/i;
    $title = $1 if $content =~ /<TITLE>([^<]*)<\/TITLE>/i;

    if ($desc) {
        # '|' separates the fields of a record, so write any literal bar
        # as its HTML entity; it renders identically in the final page.
        $title =~ s/\|/&#124;/g;
        $desc  =~ s/\|/&#124;/g;
        push( @pages, "$file|$title|$desc" );
    }
    return 0;
}
# sort() comparator for "file|title|description" records; it reads the two
# records from the package variables $a and $b that sort() sets.
#
# The clever part: a directory index page is keyed as its directory
# ("docs/index.html" becomes "docs", the top-level index becomes ""), so it
# is listed one level up, ahead of the other pages of that directory.
# Records are ordered by the number of '/' in the key (compared as a
# number, so depth 10 sorts after depth 2), then by the key itself, then by
# the whole record to keep the order deterministic.
sub indsort
{
    my @key;
    for my $record ( $a, $b ) {
        my ($path) = split( /\|/, $record );

        # Strip a trailing index page, but only when "index.*" is the
        # complete file name; every extension makelist() accepts counts.
        $path =~ s{(?:^|/)index\.s?html?$}{};

        my $slashes = ( $path =~ tr{/}{} );
        push( @key, [ $slashes, $path ] );
    }
    return $key[0][0] <=> $key[1][0]
        || $key[0][1] cmp $key[1][1]
        || $a cmp $b;
}
# Return the number of '/' characters in $name, i.e. how many directory
# levels the path descends.  Every character is examined, including the
# first one, and no package variable is used as a loop counter.
sub depth
{
    my ($name) = @_;

    # tr with an empty replacement list only counts; $name is not changed.
    return ( $name =~ tr{/}{} );
}
__END__
=head1 NAME
sitemap - make a site map from meta tags in an HTML tree
=head1 SYNOPSIS
sitemap [config-file]
=head1 DESCRIPTION
B<sitemap> indexes all pages under the current directory and writes an
HTML map page to standard output. The code looks for description
information for each page in a META DESCRIPTION header; if it doesn't
find one, the page is omitted from the index.
B<sitemap> is a Perl script. To configure the strings used in the index
page header and footer, you can create a configuration file in your home
directory called .sitemaprc. Options in the config file are:
=head2 hometitle=<title>
The title of your homepage.
=head2 homepage=<URL>
The URL of your homepage.
=head2 indextitle=<title>
The title for the index page.
=head2 fullname=<name>
Your full name.
=head2 mailaddr=<address>
Your email address.
=head2 language=<language>
The language for the generated text: english (the default), french, german,
norwegian or swedish. Any other value produces English text.
=head2 icondirs=<icon path>
The path (relative or URL) of the icon for directory changes.
=head2 icontext=<icon path>
The path (relative or URL) of the icon indicating files.
=head2 body=<attributes>
Attributes to insert into the BODY tag of the generated page.
=head2 exclude=<file>
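Pages to ignore. Use one exclude line per pattern. Each value is used as a
Perl regular expression and matched against the page's path relative to the
current directory. When a config file is read, its exclude lines replace the
built-in exclusion list.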
=head1 OPTIONS
B<config-file> is the optional name of a config file. If not supplied the
name ~/.sitemaprc will be used.
=head1 AUTHOR
Eric S. Raymond <esr@thyrsus.com>