#!/usr/bin/python
# Python script for generating sitemap.xml and sitemapindex.xml files.
#
# Based on protocol specification here:
# http://www.sitemaps.org/protocol.php
#
# Directory walking classes adapted from recipe at ActiveState.com
# http://code.activestate.com/recipes/528904/
#
# Hugh Brown, 14 Jan 2009
# http://www.iwebthereforeiam.com/iwebthereforeiam/
import os, os.path
class Walker(object):
    """Walk a directory tree and hand every accepted file to a hook.

    Parameters:
        dir         -- root directory passed to os.walk().
        executeHook -- optional callable invoked as executeHook(walker, path, args)
                       for each file accepted by isValidFile().
        excludeDirs -- optional list of directory names that are pruned from the
                       walk. Defaults to a new empty list for each instance.
    """

    def __init__(self, dir, executeHook=None, excludeDirs=None):
        self.dir = dir
        self.executeHook = executeHook
        # A literal [] default would be one list shared by every Walker built
        # without an explicit argument, so exclusions added to one instance
        # would leak into all the others.
        self.excludeDirs = [] if excludeDirs is None else excludeDirs

    def isValidFile(self, fileName):
        """Accept every file; subclasses override this to filter."""
        return True

    def isValidDir(self, dir):
        """Return True when no '/'-separated component of dir is excluded.

        The walker's own root prefix is removed first, so exclusions never
        apply to the directory the walk was started from. execute() does not
        call this method; it prunes excluded directories itself.
        """
        # Remove self.dir prefix (exclusions don't apply to the supplied root dir for walking)
        if dir.startswith(self.dir):
            dir = dir[len(self.dir):]
        # Check all sub-directories in the path of the file
        subdirs = dir.split('/')
        return not any(subdir in self.excludeDirs for subdir in subdirs)

    def executeFile(self, path, args):
        """Call the hook for path if a hook is set and the file is accepted."""
        if self.executeHook and self.isValidFile(path):
            self.executeHook(self, path, args)
        # else subclass Walker and override executeFile

    def execute(self, args):
        """Walk self.dir and run executeFile(path, args) on every file found.

        args is passed through untouched to executeFile() and the hook.
        """
        for root, dirs, fileNames in os.walk(self.dir):
            # Assigning through the slice edits the list os.walk() owns, which
            # is what stops it from descending into excluded directories.
            # Sorting makes the traversal order reproducible between runs.
            dirs[:] = sorted(d for d in dirs if d not in self.excludeDirs)
            for fileName in sorted(fileNames):
                self.executeFile(os.path.join(root, fileName), args)
class ReWalker(Walker):
    """Walker that only accepts files whose path matches a regular expression.

    Parameters:
        prefix      -- stored as self.prefix for use by the hook.
        dir         -- root directory to walk.
        fileMatchRe -- regular expression source; compiled once and applied with
                       match(), i.e. anchored at the start of the string.
        executeHook -- optional callable invoked as executeHook(walker, path, args).
        excludeDirs -- optional list of directory names to prune. Defaults to a
                       new empty list for each instance.
    """

    def __init__(self, prefix, dir, fileMatchRe, executeHook=None, excludeDirs=None):
        import re
        # Build the list here rather than in the signature so instances never
        # share one mutable default, and so the base class always gets a list.
        if excludeDirs is None:
            excludeDirs = []
        Walker.__init__(self, dir, executeHook, excludeDirs)
        self.fileMatchPat = re.compile(fileMatchRe)
        self.prefix = prefix

    def isValidFile(self, fileName):
        """Return the match object (truthy) if fileName matches, else None."""
        return self.fileMatchPat.match(fileName)
# Utility functions
def urlQuote(path):
    """Return path percent-encoded for use in a URL.

    Uses the standard library's quote() with its default safe set, so '/' is
    left as-is and other reserved characters (including ':') are encoded.
    """
    # quote() lives in urllib.parse on Python 3 and directly in urllib on
    # Python 2; try the newer location first so the script runs on both.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote
    return quote(path)
def lastModTime(path):
    """Return the modification time of path as a UTC timestamp string.

    The result has the form YYYY-MM-DDTHH:MM:SS+00:00.
    """
    import time
    modified = time.gmtime(os.path.getmtime(path))
    return time.strftime("%Y-%m-%dT%H:%M:%S+00:00", modified)
def generateUrl(walker, path, args):
    """Write one sitemap <url> entry for the file at path.

    Parameters:
        walker -- the walker running the hook; its dir and prefix attributes
                  give the filesystem root and the matching URL prefix.
        path   -- filesystem path of the file, expected to start with walker.dir.
        args   -- object with a write() method that receives the entry.
    """
    import posixpath
    from xml.sax.saxutils import escape

    fileOffset = path[len(walker.dir):].replace("\\", "/")
    # Quote only the file part: quoting the whole URL would turn the "http:"
    # of the prefix into "http%3A". posixpath.join always joins with '/', and
    # stripping the leading '/' stops join() from discarding the prefix when
    # walker.dir was given without a trailing slash.
    htmlUrl = posixpath.join(walker.prefix, urlQuote(fileOffset.lstrip("/")))
    s = '\t<url>\n'
    # The sitemap protocol requires entity-escaped URLs (e.g. '&' in the prefix).
    s += "\t\t<loc>%s</loc>\n" % escape(htmlUrl)
    s += "\t\t<lastmod>%s</lastmod>\n" % lastModTime(path)
    # The optional <changefreq> and <priority> elements are deliberately omitted.
    s += '\t</url>\n'
    args.write(s)
class SiteMapper(object):
    """Generate a sitemap.xml for one directory tree.

    Parameters:
        wwwPrefix    -- URL prefix that corresponds to startDir.
        startDir     -- directory to walk; sitemap.xml is written into it.
        includeFiles -- regular expression selecting the files to list.

    Attributes:
        sitemapHttp -- URL of the generated sitemap (wwwPrefix + "sitemap.xml").
        sitemapPath -- filesystem path of the generated sitemap.
    """

    def __init__(self, wwwPrefix, startDir, includeFiles):
        import posixpath
        self.wwwPrefix = wwwPrefix
        self.startDir = startDir
        self.includeFiles = includeFiles
        # URLs are always '/'-separated, so join with posixpath rather than
        # os.path, which would use '\\' on Windows.
        self.sitemapHttp = posixpath.join(self.wwwPrefix, "sitemap.xml")
        self.sitemapPath = os.path.join(self.startDir, "sitemap.xml")
        excludeDirs = []
        self.walker = ReWalker(wwwPrefix, startDir, includeFiles, generateUrl, excludeDirs)

    def generateMap(self):
        """Write the sitemap to self.sitemapPath, one entry per matching file."""
        # 'with' closes the file even if walking the tree raises part-way.
        with open(self.sitemapPath, "w") as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
            f.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
            self.walker.execute(f)
            f.write('</urlset>\n')
class SiteMapIndex(object):
    """Generate every configured sitemap plus a sitemapindex.xml listing them.

    Parameters:
        root -- directory that receives sitemapindex.xml.
        maps -- iterable of dicts, each with the keys "wwwPrefix", "startDir"
                and "includeFiles" (the SiteMapper constructor arguments).
    """

    def __init__(self, root, maps):
        self.root, self.maps = root, maps
        self.siteMapIndex = os.path.join(self.root, "sitemapindex.xml")

    def generateIndex(self):
        """Write each sitemap, then write the index that references them."""
        from xml.sax.saxutils import escape

        parts = [
            '<?xml version="1.0" encoding="UTF-8"?>\n',
            '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
        ]
        # Iterate the instance's own maps; reading a module-level 'maps' only
        # worked when the file was run as a script that happened to define it.
        for siteMap in self.maps:
            sm = SiteMapper(siteMap["wwwPrefix"], siteMap["startDir"], siteMap["includeFiles"])
            sm.generateMap()
            parts.append("\t<sitemap>\n")
            parts.append("\t\t<loc>%s</loc>\n" % escape(sm.sitemapHttp))
            # The optional <lastmod> element is deliberately omitted.
            parts.append("\t</sitemap>\n")
        parts.append('</sitemapindex>\n')
        # Open the index only once all sitemaps were generated, so a failure
        # above does not leave a truncated index or an unclosed handle behind.
        with open(self.siteMapIndex, "w") as f:
            f.write("".join(parts))
if __name__ == "__main__":
    # Directory that receives sitemapindex.xml.
    root = "/vservers/hughdbrown/htdocs/"
    # One entry per site: URL prefix, directory to walk, and file filter.
    iwebthereforeiam = {
        "wwwPrefix": "http://www.iwebthereforeiam.com/iwebthereforeiam/",
        "startDir": "/vservers/hughdbrown/htdocs/iwebthereforeiam/",
        "includeFiles": r".*\.html$",
    }
    maps = [iwebthereforeiam]
    smi = SiteMapIndex(root, maps)
    smi.generateIndex()