# SWIG Mirror Script, v. 0.0
# July 6, 1997
# Dave Beazley (beazley@cs.utah.edu)
#
# This Python script mirrors the SWIG WWW site.  All links
# from the master pages are copied to the local host relative
# to the current working directory (i.e., this script should be
# run in the same directory where you want the web pages to go).
#
# How to run :
# ------------
#
#      1.   Edit the settings at the top of this file.  Most
#           importantly, set 'newbase' to the full URL where
#           your SWIG mirror will be located.
#
#      2.   The variables 'base' and 'ftpbase' point to the
#           SWIG master WWW and FTP sites you are going to mirror.
#
#      3.   Edit the hostinfo string to describe your local
#           site.   This text is automatically inserted into
#           the mirrored SWIG pages.
#
# What does this script do?
# -------------------------
#
#      1.   All active links from the SWIG main page are copied
#           to the current working directory.  Subdirectories
#           will be created as needed. Inline images are also
#           grabbed from the pages.
#
#      2.   All non-relative links involving the 'base' variable
#           below are replaced with the value of 'newbase' you
#           provide (see the example after this list).
#
#      3.   All links to files on the SWIG FTP server (ftpbase)
#           are copied into a subdirectory named FTP.  This
#           is *not* a full mirror of the SWIG FTP server, but
#           it is a mirror of all files that are linked directly
#           from the WWW pages.  Links to these files are also
#           updated automatically.
#
#      4.   File permissions are set during the mirror process
#           (mode 644 for files, 755 for directories).
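#
#      For example, with the settings below, a link written in a
#      master page as
#
#              <a href="http://www.cs.utah.edu/~beazley/SWIG/xxx.html">
#
#      (where xxx.html stands for any page) would be rewritten in
#      the mirrored copy as
#
#              <a href="http://bifrost.lanl.gov/~dmb/SWIG/xxx.html">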
#
# Notes:
# ------
#      1.   This script is a hack
#      2.   Mirroring the entire site takes about 10-20 minutes
#           depending on how busy the University of Utah
#           servers are.   The lowest traffic period is between
#           04:00-06:00 US mountain standard time (this is about
#           12:00 in Europe). 
#      3.   The SWIG web page is not updated daily.  A weekly
#           mirror is more than sufficient (see the example cron
#           entry after these notes).
#      4.   The script downloads everything.  It might be
#           more intelligent to check to see if files have
#           changed, but I haven't worked that out yet.
#      5.   Sometimes the script terminates with a weird
#           exception in .__del__.  Don't worry about this.
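#
#      Example: for a weekly mirror run during the low-traffic
#      window, a cron entry along these lines could be used (the
#      path and script name are placeholders; adjust them, and
#      the hour, for your own setup and time zone):
#
#          0 4 * * 0   cd /home/you/swig-mirror && python mirror.py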
#
# Please report problems to beazley@cs.utah.edu
#

newbase  = "http://bifrost.lanl.gov/~dmb/SWIG/"
base    = "http://www.cs.utah.edu/~beazley/SWIG/"
ftpbase = "ftp://ftp.cs.utah.edu/pub/beazley/SWIG/"
hostinfo = """
<p>
This mirror is hosted by the <a href="http://bifrost.lanl.gov">
Condensed Matter and Statistical Physics</a> group at 
<a href="http://www.lanl.gov">Los Alamos National Laboratory.</a>
"""

# Don't touch anything below here

import urllib
import htmllib
import formatter
import posix
import string
import urlparse
import regsub

# Only URLs that begin with this prefix are mirrored
filter  = base

# HTML parser that collects inline image sources in addition to the
# anchors that htmllib.HTMLParser already records in self.anchorlist
class MyParser(htmllib.HTMLParser):
	def __init__(self, formatter):
		htmllib.HTMLParser.__init__(self,formatter)
		self.images = []

	def handle_image(self,src,alt,ismap=None, align=None, width=None, height=None):
		self.images.append(src)

# Replace occurrences of non-relative SWIG page URLs with the
# new location

def replace_links(str):
	b = '"'+base
	nb = '"'+newbase
	if string.find(str,b) >= 0:
		print "Replacing link ", base
	
	str =  regsub.gsub(b,nb,str)
	
	# Replace any FTP links

	b = '"'+ftpbase
	nb = '"'+newbase+"FTP/"
	str = regsub.gsub(b,nb,str)

	return regsub.gsub("<!host>",hostinfo,str)

# Fetch a page, rewrite its links, save it to 'dest', and return the
# lists of anchors and inline images found on the page.
def grab_url(url,dest):
	print "Getting ", url, " --> ", dest
	try:
		u = urllib.urlopen(url)
		str = u.read(10000000)
		u.close()
		h = MyParser(formatter.NullFormatter())
		h.feed(str)
		str = replace_links(str)
		mkdirs(dest)
		f = open(dest,"wb")
		f.write(str)
		f.close()
		posix.system("chmod 644 "+dest)
		return (h.anchorlist, h.images)
	except:
		print "Unable to open ", url
		return None
		

# Fetch an inline image and save it to 'dest'.
def grab_image(url,dest):
	print "Getting image ", url, " --> ", dest
	try:
		u = urllib.urlopen(url)
		str = u.read(10000000)
		u.close()
		mkdirs(dest)
		f = open(dest,"wb")
		f.write(str)
		f.close()
		posix.system("chmod 644 "+dest)
	except:
		print "Unable to get ", url

# Code for managing subdirectories under the current directory

# Directories that have already been created
subdirs = []

# Split a file name into components and try to make subdirectories

def mkdirs(file):
	dirs = string.split(file,"/")
	str = "."
	for d in dirs[:-1]:
		str = str + "/" + d
		if str in subdirs:
			pass
		else:
			print "Making ", str
			try:
				posix.system("mkdir "+str)
				posix.system("chmod 755 "+str)
			except:
				pass
			subdirs.append(str)
		


urls = []

# Gather all URLs and make a local copy

def gather(url):
	if url in urls:
		return
	urls.append(url)

	# Check to see if the URL is a SWIG FTP access

	if (url[0:len(ftpbase)] == ftpbase):
		# Yep.  This is a file on the SWIG FTP server, so it
		# gets mirrored into the local FTP/ subdirectory.

		s = url[len(ftpbase):]
		if string.find(s,".") >= 0:
			# Only grab things that look like files (the
			# name contains a '.')
			dest = "FTP/" + url[len(ftpbase):]
			grab_url(url,dest)
			return
		else:
			return

	# Check to make sure the full URL complies with our filter

	if (url[0:len(filter)] != filter):
		return

	# Build the destination directory (from our base)

	dest = url[len(base):]

	# Try to grab the first URL
	d = grab_url(url,dest)

	# Now, we're going to go through and grab links and images
	try:
		links = d[0]
		imgs = d[1]
	except:
		links = []
		imgs = []

	for l in links:
		newu = urlparse.urljoin(url,l,0)
		if string.find(newu,'#') < 0:
			gather(newu)
	for i in imgs:
		newu = urlparse.urljoin(url,i,0)
		dest = newu[len(base):]
		if newu in urls:
			pass
		else:
			grab_image(newu,dest)
			urls.append(newu)
	
	
# Start the mirror from the SWIG main page
gather(base+"index.html")





