import os
import urllib
import urllib2

# pad the page number with leading zeros when necessary, e.g. "7" -> "007"
def fix(numstr):
    desired = 3
    while len(numstr) < desired:
        numstr = "0" + numstr
    return numstr

# defining what to download
prefix = "http://quod.lib.umich.edu/cache/a/g/e/age2118.0001.001/00000"
suffix = ".tif100.gif"
prefix2 = "http://quod.lib.umich.edu/g/genpub/AGE2118.0001.001/"
suffix2 = "?rgn=full+text;view=image"

for page in range(398):
    # build the zero-padded page number used in the URLs and filenames
    pagestr = fix(str(page + 1))
    filename = "page" + pagestr + ".gif"
    # try to download the picture; skip to the next page if anything fails
    try:
        print "trying page " + pagestr
        # a file smaller than 3000 bytes is treated as a failed/placeholder download;
        # if the file does not exist yet, use size 0 so the download loop runs
        size = os.stat(filename).st_size if os.path.exists(filename) else 0
        while size < 3000:
            # visit the corresponding viewer page before requesting the image
            response = urllib2.urlopen(prefix2 + pagestr + suffix2)
            page_source = response.read()
            #file0 = open("page"+pagestr+".html",'w')
            #file0.write(page_source)
            print "read page " + pagestr
            # download the picture as a gif
            urllib.urlretrieve(prefix + pagestr + suffix, filename)
            size = os.stat(filename).st_size
            if size < 3000:
                # fall back to the jp2-derived jpeg if the gif is still a placeholder
                specialurl = "http://quod.lib.umich.edu/cache//a/g/e/age2118.0001.001/p0000" + pagestr + ".jp2.100.jpg"
                print specialurl
                urllib.urlretrieve(specialurl, filename)
            print "downloaded picture " + pagestr
            size = os.stat(filename).st_size
    except (IOError, OSError):
        continue
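
# A minimal Python 3 sketch of the same idea, since urllib and urllib2 were
# merged into urllib.request in Python 3. This is an untested sketch that
# assumes the same URL patterns and the same "< 3000 bytes means placeholder"
# heuristic as the script above; the jp2 fallback URL could be added the same way.
import os
import urllib.error
import urllib.request

prefix = "http://quod.lib.umich.edu/cache/a/g/e/age2118.0001.001/00000"
suffix = ".tif100.gif"
prefix2 = "http://quod.lib.umich.edu/g/genpub/AGE2118.0001.001/"
suffix2 = "?rgn=full+text;view=image"

for page in range(1, 399):
    pagestr = str(page).zfill(3)          # zero-pad, e.g. 7 -> "007"
    filename = "page" + pagestr + ".gif"
    try:
        # visit the viewer page first, then fetch the cached image
        with urllib.request.urlopen(prefix2 + pagestr + suffix2) as response:
            response.read()
        urllib.request.urlretrieve(prefix + pagestr + suffix, filename)
        if os.stat(filename).st_size < 3000:
            print("page " + pagestr + " looks like a placeholder")
    except (urllib.error.URLError, OSError):
        continue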