#!/usr/bin/perl
#
# sitemap -- synthesize a site map from meta-description data
#
# A silly little Perl script I wrote to automatically generate a map of
# my website.  Indexes all pages under the current directory and writes
# the map page to standard output.  POD documentation is included at the
# end of this file.
#
# By Eric S. Raymond, Copyright 1997.  Use and redistribute freely.
# 1.2 changes by Dave Pearson <davep@hagbard.demon.co.uk>.
# 1.3 changes by Jean-Philippe Argaud <jp.argaud@iname.com>.
# 1.4 changes by ESR.
# 1.5 fix suggested by Imre Simon.
# 1.6 Corrected month array.
# 1.7 Jean-Philippe Argaud's change to support separator icons.
#     Erik Rossen <rossen@planet.ch> fixed a bug with wrapped meta tags.
#     Swedish-language support added.
# 1.8 German-language support by Michael Wiedmann.  Recognize .htm files.
# 1.9 Norwegian-language support added, national month names added by
#     Erik I. Bols <eriki@himolde.no>
#
use File::Find;

# Built-in defaults for every setting.  A configuration file, when one can
# be read, overrides them key by key.
%config = ( hometitle  => "Eric's Home Page",
            indextitle => "Map of Eric's Home Web",
            fullname   => "Eric S. Raymond",
            mailaddr   => "esr\@thyrsus.com",
            homepage   => "http://www.tuxedo.org/~esr/",
            exclude    => [ "test", "netbuilder", "jargon", "java",
                            "worlds", "keeper", "mersenne", "netscape" ],
            language   => "english",
            icondirs   => "",
            icontext   => "",
            body       => ""
            );

# The scalar $config holds the configuration file NAME; it is unrelated to
# the %config hash above.  The first command-line argument wins, otherwise
# the file .sitemaprc in the user's home directory is used.
$home   = $ENV{ 'HOME' } || ( getpwuid( $< ) )[ 7 ] ;
$config =  $ARGV[ 0 ] || "$home/.sitemaprc" ;

# Credit line interpolated into the "generated by" footer message.
$sitemapauthor = "Eric S. Raymond\
                 <<A HREF=\"mailto:esr\@thyrsus.com\">esr\@thyrsus.com</A>>" ;

# Three-argument open: the name may come straight from the command line, so
# it must never be interpreted as an open mode or a pipe command.
if ( open( my $config_fh, '<', $config ) ) {

    # Exclusions listed in the file replace the built-in list rather than
    # extending it.
    $config{ "exclude" } = [];

    while ( my $line = <$config_fh> ) {

        # Drop the line terminator (a DOS carriage return included) so it
        # ends up neither in a value nor in a diagnostic.
        $line =~ s/\r?\n\z//;

        # Ignore comments and blank lines.
        next if $line =~ /^\s*$/;
        next if $line =~ /^\s*\#/;

        # "key = value"; the key is case-insensitive and surrounding
        # whitespace is not part of either side.
        if ( $line =~ /^\s*(.*?)\s*=\s*(.*?)\s*$/ ) {
            my ( $key, $value ) = ( lc( $1 ), $2 );
            if ( $key eq "exclude" ) {
                # An empty exclusion would later be used as an empty
                # regular expression, so it is skipped.
                push( @{ $config{ "exclude" } }, $value ) if length( $value );
            } else {
                $config{ $key } = $value;
            }
        } else {
            # STDOUT carries the generated page, so diagnostics go to STDERR.
            warn "Unrecognised config line '$line'\n";
        }
    }

    close( $config_fh );
} elsif ( $ARGV[ 0 ] ) {
    # A missing ~/.sitemaprc is normal, but a file named explicitly on the
    # command line that cannot be read deserves a diagnostic.
    warn "sitemap: cannot read config file '$config': $!\n";
}

# Work out the page timestamp and the language-specific boilerplate.
#
# localtime() reports the month as 0..11 and the year as an offset from
# 1900; both are normalised once here so every branch formats the same
# values.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
$mon = $mon + 1;
my $full_year = $year + 1900;

# Language names are compared case-insensitively, so "French" and "french"
# select the same texts.
my $language = lc( defined $config{language} ? $config{language} : '' );

# Abbreviated month names.  Declared at this scope (not inside a
# conditional block) so the date formatting below can actually see them.
# Only the French and English branches print a month name at present.
my @month = $language eq 'norwegian'
    ? ('Jan','Feb','Mar','Apr','Mai','Jun','Jul','Aug','Sep','Okt','Nov','Des')
    : ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');

# Numeric timestamp shared by the Swedish, Norwegian and German pages.
my $iso_date = sprintf("%04d-%02d-%02d %02d:%02d", $full_year, $mon, $mday,
                       $hour, $min);

# Accented letters are written as HTML character entities because the
# generated page declares no character set.
if ( $language eq 'french' ) {
  $message{sitemap} = "Carte du site" ;
  $message{back_to} = "Retour &agrave;" ;
  $message{autogen} = "Cet index a &eacute;t&eacute; g&eacute;n&eacute;r&eacute; automatiquement &agrave; partir de\
                       balises META pr&eacute;sentes dans chaque page. Les pages\
                       de plus haut niveau sont disponibles au d&eacute;but de cette\
                       carte du site." ;
  $message{toolgen} = "L'utilitaire de g&eacute;n&eacute;ration automatique est \"sitemap\",\
                       &eacute;crit par $sitemapauthor." ;
  $date = sprintf("%d %s %d, &agrave; %d h %02d mn",
                  $mday, $month[$mon-1], $full_year, $hour, $min);
} elsif ( $language eq 'swedish' ) {
  $message{sitemap} = "Sajtkarta" ;
  $message{back_to} = "Tillbaka till" ;
  $message{autogen} = "Denna indexsida &auml;r automatiskt genererad fr&aring;n " .
      "meta-taggar p&aring; varje sida. Toppniv&aring;sidor &auml;r listade f&ouml;rst.";
  $message{toolgen} = "Den h&auml;r sidan &auml;r genererad av \"sitemap\", " .
      "skapad av $sitemapauthor.";
  $date = $iso_date;
} elsif ( $language eq 'norwegian' ) {
  $message{sitemap} = "Nettkart" ;
  $message{back_to} = "Tilbake til" ;
  $message{autogen} = "Dette er en indeks generert automatisk ut fra " .
      "meta-tagger p&aring; hver side. Toppniv&aring;-sider er listet f&oslash;rst.";
  $message{toolgen} = "Denne siden er generert av \"sitemap\", " .
      "skrevet av $sitemapauthor.";
  $date = $iso_date;
} elsif ( $language eq 'german' ) {
  $message{sitemap} = "Site Map" ;
  $message{back_to} = "Zur&uuml;ck zu" ;
  $message{autogen} = "Dieser Index wurde automatisch generiert aus Meta Tags\
                     aller Seiten. Top-Level-Seiten werden zuerst gelistet.";
  $message{toolgen} = "Diese Seite wurde generiert von \"sitemap\",\
                       geschrieben von $sitemapauthor." ;
  $date = $iso_date;
} else {
  # English is the fallback for any unrecognised language name.
  $message{sitemap} = "Site Map" ;
  $message{back_to} = "Back to" ;
  $message{autogen} = "This is an index automatically generated from meta tags\
                       present in each of the pages. Top-level pages are listed\
                       first." ;
  $message{toolgen} = "This page generated by \"sitemap\",\
                       written by $sitemapauthor." ;
  $date = sprintf("%d %s %d, at %02d:%02d",
                  $mday, $month[$mon-1], $full_year, $hour, $min);
}

# Optional separator icons: each becomes an IMG tag when its path is
# configured and an empty string otherwise.
$icondirs = "$config{icondirs}" eq ""
    ? ""
    : qq{<IMG SRC="$config{icondirs}" ALT="Dir">};

$icontext = "$config{icontext}" eq ""
    ? ""
    : qq{<IMG SRC="$config{icontext}" ALT="Text">};

# Opening BODY tag, carrying any extra attributes from the configuration.
my $body = "<BODY>";
$body = "<BODY $config{body}>" if $config{body};

# Page header, one list element per output line: document type, HEAD
# section, navigation bar with the timestamp, heading, explanatory
# paragraph, and the start of the definition list that holds the entries.
my @page_head = (
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">',
    '<HTML>',
    '<HEAD>',
    qq{  <TITLE>$config{indextitle}</TITLE>},
    qq{  <META NAME="KEYWORDS" CONTENT="$message{sitemap}"> },
    qq{  <LINK REV=MADE HREF="mailto:$config{mailaddr}">},
    '</HEAD>',
    $body,
    '<TABLE WIDTH="100%" CELLPADDING=0><TR>',
    qq{<TD WIDTH="50%">$message{back_to} <A HREF="$config{homepage}">$config{hometitle}</A>},
    qq{<TD WIDTH="50%" ALIGN=RIGHT>$date},
    '</TR></TABLE>',
    '<HR><P>',
    qq{<H1 ALIGN=CENTER>$message{sitemap}</H1>},
    '',
    qq{<P>$message{autogen}},
    '',
    '<DL>',
);

print map { "$_\n" } @page_head;

# Walk the tree below the current directory; makelist() appends one
# "file|title|description" record to @pages for every page it accepts.
finddepth(\&makelist, ".");

# Order the records with indsort().
@pages = sort indsort @pages;

# Emit one definition-list entry per page, preceded by a ruled separator
# (and the directory icon, if any) whenever the first path component
# differs from that of the previous entry.
my $previous_stem;

foreach my $entry (@pages) {
    my ($path, $heading, $summary) = split(/\|/, $entry);
    my ($stem) = split('/', $path);

    if ($previous_stem ne $stem) {
        print "<DT><P ALIGN=RIGHT><HR WIDTH=\"80\%\">\n$icondirs\n<BR>";
    }

    print "<DT>$icontext\n<A HREF=\"$path\">$path</A>: <B>$heading</B><DD>\n\t$summary\n\n";

    $previous_stem = $stem;
}


# Close the listing and emit the page footer: the "generated by" credit, a
# second navigation bar with the timestamp, and the maintainer's address.
# The angle brackets shown around the mail address are written as &lt; and
# &gt;; emitted literally they would be parsed as an unknown tag and the
# link would have no visible text.
print <<EOF;
</DL>
<P>
<HR>
$message{toolgen}
<HR>
<TABLE WIDTH="100%" CELLPADDING=0><TR>
<TD WIDTH="50%">$message{back_to} <A HREF="$config{ "homepage" }">$config{ "hometitle" }</A>
<TD WIDTH="50%" ALIGN=RIGHT>$date
</TR><TR>
<TD COLSPAN=2><ADDRESS>$config{ "fullname" } <A HREF="mailto:$config{ "mailaddr" }">&lt;$config{ "mailaddr" }&gt;</A></ADDRESS>
</TR></TABLE>

</BODY>
</HTML>
EOF

# makelist -- File::Find callback that records one indexable page.
#
# Called with $_ set to the current file name and $File::Find::dir set to
# its directory.  A file is recorded only if its name ends in .html, .htm,
# .shtml or .shtm, its relative path matches none of the patterns in
# $config{exclude}, it is not a symbolic link, and it carries a
# META DESCRIPTION tag.  Accepted pages are appended to the global @pages
# as "file|title|description".
#
# Returns 0 after examining a candidate file, and an empty return for
# files rejected by name, exclusion or symlink test.
sub makelist
{
    # Work on a copy: File::Find owns $_ and it must not be overwritten.
    my($name) = $_;

    # Strip the leading "./" so paths are relative to the starting directory.
    my($file) = substr("$File::Find::dir/$name", 2);
    my($title, $desc);

    return unless $file =~ /\.s?html?$/;

    # Each exclusion is a regular expression.  Empty ones are skipped,
    # because an empty pattern would re-use the last successful match.
    return if grep { length($_) && $file =~ /$_/ } @{ $config{ "exclude" } };
    return if (-l $name);

    my $fh;
    unless (open($fh, '<', $name)) {
        warn "sitemap: cannot open $file: $!\n";
        return 0;
    }

    # Slurp the whole file; localising $/ restores the record separator
    # automatically, whatever happens next.
    my $html = do { local $/; <$fh> };
    close($fh);
    $html = '' unless defined $html;

    # Turn EVERY newline into a space so that tags, titles and
    # descriptions wrapped over several lines come out as one line.
    $html =~ tr/\n/ /;

    # The first META DESCRIPTION and the first TITLE in the page are used.
    $desc  = $1 if $html =~ /\<META\s*NAME=\"DESCRIPTION\"\s*CONTENT=\"([^"]*)\"\>/i;
    $title = $1 if $html =~ /\<TITLE\>([^<]*)\<\/TITLE\>/i;

    # Pages without a description are left out of the map.
    if ($desc) {
       push(@pages, "$file|$title|$desc");
    }

    return 0;
}

# indsort -- sort comparator for the "file|title|description" records.
#
# The clever part; make sure each directory index sorts first.  An index
# page (index.html, index.htm, index.shtml, ...) is compared under the name
# of the directory that contains it.  Records are ordered by directory
# depth first, compared numerically so that depth 10 follows depth 9, and
# by path within the same depth.  Reads the records from the package
# variables $a and $b, as sort() requires.
sub indsort
{
    my ($first)  = split(/\|/, $a);
    my ($second) = split(/\|/, $b);

    for my $path ($first, $second) {
        $path = '' unless defined $path;

        # Only a file actually NAMED index.* counts: the name must start
        # the path or follow a '/', and the dot is a literal dot.
        $path =~ s{(?:^|/)index\.s?html?$}{};
    }

    return depth($first) <=> depth($second)
        || $first cmp $second
        || $a cmp $b;    # stable order when two names reduce to the same key
}

# depth -- return the number of '/' separators in a path.
#
# Takes one argument, the path string, and returns how many '/' characters
# it contains (0 for an empty or undefined path).  Every character is
# examined, including the first one.
sub depth
{
    my($name) = @_;

    return 0 unless defined $name;

    # tr/// with an empty replacement list changes nothing; it only counts
    # the matching characters.
    return ($name =~ tr{/}{});
}

__END__

=head1 NAME

sitemap - make a site map from meta tags in an HTML tree

=head1 SYNOPSIS

sitemap [config-file]

=head1 DESCRIPTION

B<sitemap> indexes all pages under the current directory and writes an
HTML map page to standard output.  The code looks for description
information for each page in a META DESCRIPTION header; if it doesn't
find one, the page is omitted from the index.

B<sitemap> is a Perl script.  To configure the strings used in the index
page header and footer, you can create a configuration file in your home
directory called .sitemaprc. Options in the config file are:

=head2 hometitle=<title>

The title of your homepage.

=head2 homepage=<URL>

The URL of your homepage.

=head2 indextitle=<title>

The title for the index page.

=head2 fullname=<name>

Your full name.

=head2 mailaddr=<address>

Your email address.

=head2 language=<language>

The language for the generated text: english (the default), french, german,
norwegian or swedish.  Write the name in lower case.  Any other value
selects English.

=head2 icondirs=<icon path>

The path (relative or URL) of the icon for directory changes.

=head2 icontext=<icon path>

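The path (relative or URL) of the icon indicating files.

=head2 body=<attributes>

Attributes to place inside the opening BODY tag of the map page.  If this
is not set, a plain BODY tag is written.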

=head2 exclude=<file>

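Pages to ignore.  Each value is used as a Perl regular expression and
matched against the path of every page relative to the current directory;
a page whose path matches is left out of the map.  Use one exclude line per
pattern.  When a config file is read, its exclude lines replace the
built-in exclusion list.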

=head1 OPTIONS

B<config-file> is the optional name of a config file. If not supplied the
name ~/.sitemaprc will be used.

=head1 AUTHOR

Eric S. Raymond <esr@thyrsus.com>