#!/bin/rc
# Usage: crawler http://mysite.link/sitemap.xml
# pulls files out of a sitemap.xml and crawls them

# Require exactly one argument: the URL of the sitemap to read.
if(! ~ $#* 1){
	echo 'usage: crawler http://mysite.link/sitemap.xml' >[1=2]
	exit usage
}

# Build the list of page URLs named in the sitemap.
#  - sed turns every XML tag into a space, so entries stay separated even
#    when several <loc> elements share one line (minified sitemaps);
#  - awk keeps only the words that are http(s) URLs, one per output line.
# The whole pipeline must sit on one logical line: rc ends a command at a
# newline, so breaking the line after `awk' would run awk with no program.
files=`{hget $1 | sed 's/<[^>]*>/ /g' | awk '{for(n=1;n<=NF;n++) if($n ~ /^https?:\/\/./) print $n}'}

# Fetch every URL in $files into a local tree that mirrors the URL:
#   http(s)://host/dir/page.html  ->  host/dir/page.html
for(i in $files){
	# host/path part of the URL with the scheme removed (http and https)
	loc=`{echo $i | sed 's,^https?://,,'}
	# nothing left after the scheme: there is no host to fetch from
	if(! ~ $#loc 0){
		# a bare host (no slash at all) stands for the site root
		if(! ~ $loc */*) loc=$loc^/
		# directory = everything up to and including the last slash;
		# file name = whatever follows it.  Splitting on the last slash
		# avoids pasting the file name into a sed regex, where `.' would
		# act as a wildcard and repeated names would be stripped too.
		fpath=`{echo $loc | sed 's,[^/]*$,,'}
		fname=`{echo $loc | sed 's,.*/,,'}
		# URLs ending in `/' carry no file name; store them as index.html
		if(~ $#fname 0) fname=index.html
		mkdir -p $fpath
		# download in a subshell that has its own copy of the name space
		@{rfork n; hget $i > $fpath^$fname}
	}
	# a courtesy: pause between requests so the server is not hammered
	sleep 0.1
}