Python: how to reset the URL when I define the URL in a loop
There is a problem when I want to open a URL with urllib2 and cookielib. It works fine when I define the URL alone, but when I define it in a loop it doesn't work and I get an error that the URL couldn't be found. Here is the code. I want a solution to reset the URL, or some other way to solve this. At the front of the code I do some work to get a cookie with a password, and I think that may cause the problem. Should I wipe the cache or reset the URL?
import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

parser = MyHTMLParser()

# Define a function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) and dat.find("hdf") != -1 and dat.find("xml") == -1:
            print "downloading: ", dat
            JobRequest = urllib2.Request(url + dat)
            JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into an additional HTTP request
            JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

            # Request the resource at the modified redirect url
            Request = urllib2.Request(JobRedirect_url)
            Response = urllib2.urlopen(Request)
            f = open(dat, 'wb')
            f.write(Response.read())
            f.close()
            Response.close()
        else:
            continue
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))

# The user credentials that will be used to authenticate access to the data
username = ""
password = ""

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file-based cookie jar to preserve cookies between runs and
# make this more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    # urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    # urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.

# ===============================================================================
# Open a request to grab the filenames within a directory. Printing is optional.
# ===============================================================================
# The FULL url of the directory which contains the files to bulk download
for x in range(1, 31):
    if x < 10:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.0' + str(x) + '/'
    else:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.' + str(x).lstrip("0") + '/'

    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    if x == 1:
        DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    DirBody = urllib2.urlopen(DirRequest).read()
    # DirBody = DirResponse.read(DirResponse)

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser.feed(DirBody)
    Files = parser.dataList

    # Display the contents of the python list declared in the HTMLParser class
    # print Files   # Uncomment to print a list of the files

    # ===============================================================================
    # Call the function to download all files in url
    # ===============================================================================
    BatchJob(Files, cookie_jar)  # Comment out to prevent downloading to your working directory
When the code runs at x=2, this error occurs:
Traceback (most recent call last):
  File "f:/ist/nsidc_parse_html_batchdl.py", line 136, in <module>
    DirBody = urllib2.urlopen(DirRequest).read()
  File "d:\software\python2.7\lib\urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "d:\software\python2.7\lib\urllib2.py", line 435, in open
    response = meth(req, response)
  File "d:\software\python2.7\lib\urllib2.py", line 548, in http_response
    'http', request, response, code, msg, hdrs)
  File "d:\software\python2.7\lib\urllib2.py", line 473, in error
    return self._call_chain(*args)
  File "d:\software\python2.7\lib\urllib2.py", line 407, in _call_chain
    result = func(*args)
  File "d:\software\python2.7\lib\urllib2.py", line 556, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not Found
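(As the comments in the script itself note, urlopen can raise HTTPError and URLError, and these should be caught and handled. A minimal sketch of such a wrapper around the directory request at line 136, an illustrative addition rather than part of the original script, makes the URL that actually failed visible:

try:
    DirBody = urllib2.urlopen(DirRequest).read()
except urllib2.HTTPError as e:
    # Print the request URL and the HTTP status (404 here) before re-raising,
    # so the offending request can be inspected.
    print "request failed:", DirRequest.get_full_url(), "->", e.code
    raise

This does not fix anything by itself; it only surfaces which request the 404 comes from.)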
I found the bug. The solution is:
import urllib2
import os
from cookielib import CookieJar
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

# Define a function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) and dat.find("hdf") != -1 and dat.find("xml") == -1:
            print "downloading: ", url + dat
            JobRequest = urllib2.Request(url + dat)
            JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into an additional HTTP request
            JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

            # Request the resource at the modified redirect url
            Request = urllib2.Request(JobRedirect_url)
            Response = urllib2.urlopen(Request)
            f = open(dat, 'wb')
            f.write(Response.read())
            f.close()
            Response.close()
        else:
            continue
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))

# The user credentials that will be used to authenticate access to the data
username = ""
password = ""

# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file-based cookie jar to preserve cookies between runs and
# make this more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    # urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    # urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

# Create and submit the requests. There are a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.

# ===============================================================================
# Open a request to grab the filenames within a directory. Printing is optional.
# ===============================================================================
# The FULL url of the directory which contains the files to bulk download
for x in range(1, 31):
    if x < 10:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.0' + str(x) + '/'
    else:
        url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD29P1N.006/2015.11.' + str(x).lstrip("0") + '/'

    DirRequest = urllib2.Request(url)
    DirResponse = urllib2.urlopen(DirRequest)

    # Get the redirect url and append 'app_type=401'
    # to do basic http auth
    DirRedirect_url = DirResponse.geturl()
    if x == 1:
        DirRedirect_url += '&app_type=401'

    # Request the resource at the modified redirect url
    DirRequest = urllib2.Request(DirRedirect_url)
    DirBody = urllib2.urlopen(DirRequest).read()
    # DirBody = DirResponse.read(DirResponse)

    # Uses the HTML parser defined above to print the content of the directory containing data
    parser = MyHTMLParser()
    parser.feed(DirBody)
    Files = parser.dataList

    # Display the contents of the python list declared in the HTMLParser class
    print Files  # Uncomment to print a list of the files

    # ===============================================================================
    # Call the function to download all files in url
    # ===============================================================================
    BatchJob(Files, cookie_jar)
Please notice where I instantiate MyHTMLParser. If it is instantiated in the original place, once outside the loop, the data fed to the parser remains there between iterations, meaning the file names from x=1 are still present in Files on the next pass, which leads to the 404.
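To see the effect in isolation, here is a small self-contained sketch (with made-up anchor tags, not the real directory listings) showing how an HTMLParser subclass accumulates results in its own attribute across feed() calls. Note that HTMLParser.reset() only clears the parser's internal state, not attributes you define yourself, so the options are a fresh instance per directory, as in the fixed code, or clearing the list by hand:

from HTMLParser import HTMLParser

class ListParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.dataList = []

    def handle_data(self, data):
        # Collect every non-empty text node
        if data.strip():
            self.dataList.append(data.strip())

p = ListParser()
p.feed('<a>day01.hdf</a>')   # stand-in for the x=1 directory listing
p.feed('<a>day02.hdf</a>')   # stand-in for the x=2 directory listing
print p.dataList             # ['day01.hdf', 'day02.hdf'] -- day 1 leaks into day 2

p = ListParser()             # fix: re-instantiate for every directory
p.feed('<a>day02.hdf</a>')
print p.dataList             # ['day02.hdf']

Clearing the list in place (parser.dataList = [] before each feed) would work too; re-instantiating just makes the per-iteration state explicit.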