#!/usr/bin/perl 

#
## Google Sitemap Generator
##
## FC van Westrenen
## umantec
## 7 June 2006
## 4 feb 2008 : upgrade to sitemaps 0.9
##
## http://www.umantec.nl/varia/
##
## based on a Perl script by Tony Lawrence
## http://aplawrence.com/Words2005/2005_06_09.html
##
## using Sitemap Protocol 0.9
## http://www.sitemaps.org/protocol.php
#
# the script searches all directories on your site, up to 3 levels deep
# any file type can be included
# directories can be selected to be ignored
# check the priority calculation and frequency, you may like to change it
#
# more information on Google Sitemaps:
# https://www.google.com/webmasters/sitemaps/docs/en/about.html
# https://www.google.com/webmasters/tools/docs/en/about.html
# http://www.google.com/support/webmasters/bin/answer.py?answer=40318
# http://www.sitemaps.org/
#
# set the next 4 definitions for your situation
# 

# Filesystem directory of the web root; the script changes into it, scans it
# and writes sitemap.xml there.
my $sitepath   = '/var/www/mywebsite/';
# Host name placed in front of every path to form the page URLs.
my $website    = 'www.mywebsite.com';
# File name endings that are listed in the sitemap.
my @extentions = ('.html', '.pl');
# Files whose path matches one of these entries are left out of the sitemap.
my @ignoredirs = ('closed/', 'hidden/');

#
# no need to change anything below this point
#

use LWP::UserAgent;
use HTTP::Request;

# Work from the site root: find is run on "." further down, and sitemap.xml is
# written into the current directory. Stop immediately if the root is wrong,
# otherwise a different directory tree would be scanned.
chdir($sitepath)
  or die "Cannot change directory to $sitepath: $!\n";

# Three-argument open, checked: without a writable sitemap.xml nothing below
# can succeed.
open(OUT, '>', 'sitemap.xml')
  or die "Cannot open sitemap.xml for writing in $sitepath: $!\n";

# XML prologue and opening tag of a Sitemap Protocol 0.9 document.
print OUT <<"EOF1";
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
EOF1

# Collect the pages one directory level at a time (1 to 3 levels below the
# site root); the level decides the priority written for each page.
for my $depth (1 .. 3) {
  my @allfiles;
  foreach my $ext (@extentions) {
    # List-form pipe open runs find without a shell, so the extension is
    # handed to -name literally instead of being re-parsed as shell syntax.
    open(my $find, '-|', 'find', '-L', '.',
         '-mindepth', $depth, '-maxdepth', $depth,
         '-type', 'f', '-name', "*$ext")
      or die "Cannot run find: $!\n";
    while (my $path = <$find>) {
      chomp $path;
      push(@allfiles, $path) unless _is_ignored_path($path, @ignoredirs);
    }
    close($find)
      or warn "find did not finish cleanly (depth $depth, pattern *$ext)\n";
  }

  foreach my $found (sort { $a cmp $b } @allfiles) {
    # Report names that contain anything besides - _ . / letters and digits.
    # They are still listed, but are percent-encoded in the sitemap.
    (my $plain = $found) =~ tr/-_.\/a-zA-Z0-9//cd;
    print "Unusual characters in file name: $found\n" if $plain ne $found;

    # find reports every path as "./..."; drop that prefix.
    (my $relpath = $found) =~ s{\A\./}{};

    # A file that cannot be stat'ed would otherwise be written with a
    # last-modified date of 1970-01-01, so it is skipped.
    my $mtime = (stat("$sitepath/$relpath"))[9];
    unless (defined $mtime) {
      warn "Skipping $relpath: cannot stat: $!\n";
      next;
    }
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime($mtime);
    $year += 1900;
    $mon++;
    my $lastmod;
    #
    # uncomment one of the next two lines to set the format of last-modified (long or short)
    # (the long form labels the local time as +00:00, so it is only exact
    # when the server's time zone is UTC)
    #
    #$lastmod = sprintf("%04d-%02d-%02dT%02d:%02d:%02d+00:00", $year, $mon, $mday, $hour, $min, $sec);
    $lastmod = sprintf("%04d-%02d-%02d", $year, $mon, $mday);

    # Only a file that is really named index.html counts as a directory
    # index; it changes more often and ranks higher than ordinary pages.
    my $is_index = ($relpath =~ m{(?:\A|/)index\.html\z}) ? 1 : 0;
    my $freq     = $is_index ? "monthly" : "yearly";
    my $priority = _page_priority($depth, $is_index);
    #
    # uncomment the next line if you prefer directory entries instead of index.html
    #
    #$relpath =~ s/index\.html$//;

    # The Sitemap protocol requires URL-escaped, entity-safe locations.
    my $loc = _escape_url_path($relpath);

    print OUT
      " <url> \n",
      "  <loc>http://$website/$loc</loc> \n",
      "  <lastmod>$lastmod</lastmod> \n",
      "  <changefreq>$freq</changefreq> \n",
      "  <priority>$priority</priority> \n",
      " </url> \n";
  }

}

# Return 1 when $path contains one of the @dirs entries starting at a path
# component boundary, 0 otherwise. Entries are matched literally (regex
# metacharacters are quoted), so "closed/" does not also drop "unclosed/".
sub _is_ignored_path {
  my ($path, @dirs) = @_;
  foreach my $dir (@dirs) {
    next if !defined $dir || $dir eq '';
    return 1 if $path =~ m{(?:\A|/)\Q$dir\E};
  }
  return 0;
}

# Percent-encode every byte of $path that is not a letter, digit, "-", "_",
# ".", "~" or "/". This also removes &, <, >, ' and ", so the result can be
# placed in the XML without further entity escaping.
sub _escape_url_path {
  my ($path) = @_;
  $path =~ s{([^-_.~/A-Za-z0-9])}{sprintf('%%%02X', ord($1))}ge;
  return $path;
}

# Priority string ("0.0" .. "1.0") for a page: 0.9 minus 0.2 per directory
# level, plus 0.3 when the page is a directory index.
sub _page_priority {
  my ($depth, $is_index) = @_;
  my $priority = 0.9 - 0.2 * $depth;
  $priority += 0.3 if $is_index;
  return sprintf("%1.1f", $priority);
}

# Close the <urlset> element opened at the top of the document.
print OUT <<"EOF3";
</urlset>
EOF3

# Buffered write errors (for example a full disk) only surface at close, so a
# failure here means sitemap.xml is incomplete and must not be published.
close(OUT)
  or die "Cannot finish writing sitemap.xml: $!\n";

# Remove the previous archive so gzip does not refuse to overwrite it; it is
# fine if there is none yet, hence no check.
unlink("sitemap.xml.gz");

# List form runs gzip without a shell. Stop on failure so that a stale or
# missing sitemap.xml.gz is not announced afterwards.
system("gzip", "sitemap.xml") == 0
  or die "gzip sitemap.xml failed (status $?)\n";

# Announce the compressed sitemap to Google with a single GET request and
# report the outcome on standard output.
# NOTE(review): Google announced in 2023 that this sitemap ping endpoint is
# deprecated - confirm whether this request still has any effect.
my $ping_url = "http://www.google.com/webmasters/sitemaps/ping?sitemap=http%3A%2F%2F"
             . $website
             . "%2Fsitemap.xml.gz";
my $agent    = LWP::UserAgent->new();
my $response = $agent->request(HTTP::Request->new(GET => $ping_url));

if ($response->is_error()) {
  # Error status from the request: show the status line.
  printf "Failed to connect to Google: %s\n", $response->status_line;
}
else {
  my $body = $response->content;
  # Either the confirmation text or an empty body is treated as success.
  if ($body =~ /Sitemap Notification Received/ || $body eq "") {
    print "Sitemap submitted to Google\n";
  }
  else {
    printf "Error submitting to Google: %s\n", $body;
  }
}
