#!/usr/bin/python
# Python script for generating sitemap.xml and sitemapindex.xml files.
#
# Based on protocol specification here:
# http://www.sitemaps.org/protocol.php
#
# Directory walking classes adapted from recipe at ActiveState.com
# http://code.activestate.com/recipes/528904/
#
# Hugh Brown, 14 Jan 2009
# http://www.iwebthereforeiam.com/iwebthereforeiam/

import os, os.path
import re
import time
from xml.sax.saxutils import escape as _xmlEscape

try:
    from urllib.parse import quote as _quote  # Python 3
except ImportError:
    from urllib import quote as _quote  # Python 2


_XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>\n'
_SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"


def _joinUrl(prefix, tail):
    """Join a URL prefix and a relative tail with exactly one "/" between them.

    os.path.join is not suitable for URLs: it drops the prefix when the tail
    starts with "/" and uses the platform separator.
    """
    if not prefix:
        return tail
    return prefix.rstrip("/") + "/" + tail.lstrip("/")


class Walker(object):
    """Walk a directory tree and call a hook for every accepted file."""

    def __init__(self, dir, executeHook=None, excludeDirs=None):
        """Store the walk configuration.

        dir         -- root directory to walk.
        executeHook -- optional callable invoked as hook(walker, path, args).
        excludeDirs -- directory names to skip; defaults to no exclusions.
        """
        self.dir = dir
        self.executeHook = executeHook
        # A fresh list per instance: a literal [] default would be shared by
        # every Walker created without an explicit excludeDirs.
        self.excludeDirs = [] if excludeDirs is None else excludeDirs

    def isValidFile(self, fileName):
        """Return True if the file should be processed (always, by default)."""
        return True

    def isValidDir(self, dir):
        """Return True unless a component of dir is listed in excludeDirs."""
        # Remove self.dir prefix (exclusions don't apply to the supplied
        # root dir for walking)
        if dir.startswith(self.dir):
            dir = dir[len(self.dir):]
        # Check all sub-directories in the path; normalise the platform
        # separator first so the split also works where os.sep is not "/".
        subdirs = dir.replace(os.sep, "/").split("/")
        return not any(subdir in self.excludeDirs for subdir in subdirs)

    def executeFile(self, path, args):
        """Run the hook on path if a hook is set and the file is accepted."""
        if self.executeHook and self.isValidFile(path):
            self.executeHook(self, path, args)
        # else subclass Walker and override executeFile

    def execute(self, args):
        """Walk self.dir and call executeFile(path, args) for every file."""
        for root, dirs, fileNames in os.walk(self.dir):
            # Prune in place so os.walk does not descend into excluded dirs.
            dirs[:] = [d for d in dirs if d not in self.excludeDirs]
            for fileName in fileNames:
                self.executeFile(os.path.join(root, fileName), args)


class ReWalker(Walker):
    """Walker that only accepts files whose path matches a regular expression."""

    def __init__(self, prefix, dir, fileMatchRe, executeHook=None, excludeDirs=None):
        """Store the walk configuration.

        prefix      -- URL prefix that corresponds to dir.
        dir         -- root directory to walk.
        fileMatchRe -- regular expression a path must match to be accepted.
        executeHook -- optional callable invoked as hook(walker, path, args).
        excludeDirs -- directory names to skip; defaults to no exclusions.
        """
        Walker.__init__(self, dir, executeHook, excludeDirs)
        self.fileMatchPat = re.compile(fileMatchRe)
        self.prefix = prefix

    def isValidFile(self, fileName):
        """Return a match object if fileName matches the pattern, else None.

        executeFile passes the joined path here, and re.match anchors at the
        start of that string.
        """
        return self.fileMatchPat.match(fileName)


# Utility functions

def urlQuote(path):
    """Percent-encode path for use in a URL ("/" is left as is)."""
    return _quote(path)


def lastModTime(path):
    """Return the modification time of path as a UTC W3C datetime string."""
    mtime = os.path.getmtime(path)
    return time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime(mtime))


def generateUrl(walker, path, args):
    """Write one <url> sitemap entry for path to the file-like object args.

    walker -- a walker exposing .dir and .prefix (e.g. ReWalker).
    path   -- file path, expected to start with walker.dir.
    args   -- object with a write(str) method.
    """
    fileOffset = path[len(walker.dir):].replace("\\", "/")
    # Quote only the file part: quoting the whole URL would turn the scheme's
    # ":" into "%3A". The prefix is taken as given.
    htmlPath = _joinUrl(walker.prefix, urlQuote(fileOffset))
    # Optional <changefreq> and <priority> elements are not emitted.
    entry = (
        "\t<url>\n"
        "\t\t<loc>%s</loc>\n"
        "\t\t<lastmod>%s</lastmod>\n"
        "\t</url>\n"
    ) % (_xmlEscape(htmlPath), lastModTime(path))
    args.write(entry)


class SiteMapper(object):
    """Generate a sitemap.xml for one directory tree."""

    def __init__(self, wwwPrefix, startDir, includeFiles):
        """Store the configuration.

        wwwPrefix    -- URL prefix that corresponds to startDir.
        startDir     -- directory to walk; sitemap.xml is written into it.
        includeFiles -- regular expression selecting the files to list.
        """
        self.wwwPrefix = wwwPrefix
        self.startDir = startDir
        self.includeFiles = includeFiles
        self.sitemapHttp = _joinUrl(self.wwwPrefix, "sitemap.xml")
        self.sitemapPath = os.path.join(self.startDir, "sitemap.xml")
        excludeDirs = []
        self.walker = ReWalker(wwwPrefix, startDir, includeFiles, generateUrl, excludeDirs)

    def generateMap(self):
        """Write sitemap.xml with one <url> entry per matching file."""
        # "with" closes the file even if the walk raises.
        with open(self.sitemapPath, "w") as f:
            f.write(_XML_DECLARATION)
            f.write('<urlset xmlns="%s">\n' % _SITEMAP_NS)
            self.walker.execute(f)
            f.write("</urlset>\n")


class SiteMapIndex(object):
    """Generate sitemapindex.xml plus the sitemap.xml of every listed map."""

    def __init__(self, root, maps):
        """Store the configuration.

        root -- directory that receives sitemapindex.xml.
        maps -- iterable of dicts with the keys "wwwPrefix", "startDir"
                and "includeFiles".
        """
        self.root = root
        self.maps = maps
        self.siteMapIndex = os.path.join(self.root, "sitemapindex.xml")

    def generateIndex(self):
        """Generate every sitemap, then write the index that references them."""
        parts = [_XML_DECLARATION, '<sitemapindex xmlns="%s">\n' % _SITEMAP_NS]
        # Iterate the instance's own maps, not a module-level global.
        for siteMap in self.maps:
            sm = SiteMapper(siteMap["wwwPrefix"], siteMap["startDir"], siteMap["includeFiles"])
            sm.generateMap()
            # The optional <lastmod> element is not emitted.
            parts.append("\t<sitemap>\n")
            parts.append("\t\t<loc>%s</loc>\n" % _xmlEscape(sm.sitemapHttp))
            parts.append("\t</sitemap>\n")
        parts.append("</sitemapindex>\n")
        with open(self.siteMapIndex, "w") as f:
            f.write("".join(parts))


if __name__ == "__main__":
    root = "/vservers/hughdbrown/htdocs/"
    iwebthereforeiam = {
        "wwwPrefix": "http://www.iwebthereforeiam.com/iwebthereforeiam/",
        "startDir": '/vservers/hughdbrown/htdocs/iwebthereforeiam/',
        "includeFiles": r".*\.html$",
    }
    maps = [iwebthereforeiam]
    smi = SiteMapIndex(root, maps)
    smi.generateIndex()