Python: how to reset the URL when I define the URL inside a loop -


I have a question about opening a URL with urllib2 and cookielib. It works fine when I define the URL on its own, but when I define it inside a loop it doesn't work, and I get an error saying the URL couldn't be found.

Here is the code. I want a way to reset the URL, or some other way to solve this problem.

Earlier in the code I did some work to get the cookie using the password. I think that may be causing the problem. Should I wipe the cache or reset the URL?

import urllib2 import os cookielib import cookiejar htmlparser import htmlparser class myhtmlparser(htmlparser):     def __init__(self):         htmlparser.__init__(self)         self.inlink = false         self.datalist = []         self.directory = '/'         self.indexcol = ';'         self.counter = 0      def handle_starttag(self, tag, attrs):         self.inlink = false         if tag == 'table':             self.counter += 1         if tag == 'a':             name, value in attrs:                 if name == 'href':                     if self.directory in value or self.indexcol in value:                         break                     else:                         self.inlink = true                         self.lasttag = tag      def handle_endtag(self, tag):             if tag == 'table':                 self.counter +=1      def handle_data(self, data):         if self.counter == 1:             if self.lasttag == 'a' , self.inlink , data.strip():                 self.datalist.append(data)  parser = myhtmlparser()    # define function batch downloading def batchjob(files, cookie_jar):     dat in files:         if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) , dat.find("hdf") != -1 , dat.find("xml") == -1:             print "downloading: ", dat             jobrequest = urllib2.request(url + dat)             jobrequest.add_header('cookie', cookie_jar)  # pass saved cookie additional http request             jobredirect_url = urllib2.urlopen(jobrequest).geturl() + '&app_type=401'              # request resource @ modified redirect url             request = urllib2.request(jobredirect_url)             response = urllib2.urlopen(request)             f = open(dat, 'wb')             f.write(response.read())             f.close()             response.close()         else:             continue     print "files downloaded to: ", os.path.dirname(os.path.realpath(__file__))         # user credentials used authenticate access data # user credentials used 
authenticate access data username = "" password = ""  # create password manager deal 401 reponse returned # earthdata login password_manager = urllib2.httppasswordmgrwithdefaultrealm() password_manager.add_password(none, "https://urs.earthdata.nasa.gov", username, password)  # create cookie jar storing cookies. used store , return # session cookie given use data server (otherwise # keep sending earthdata login authenticate).  ideally, # should use file based cookie jar preserve cookies between runs. # make more efficient.  cookie_jar = cookiejar()  # install handlers. opener = urllib2.build_opener(     urllib2.httpbasicauthhandler(password_manager),     # urllib2.httphandler(debuglevel=1),    # uncomment these 2 lines see     # urllib2.httpshandler(debuglevel=1),   # details of requests/responses     urllib2.httpcookieprocessor(cookie_jar)) urllib2.install_opener(opener)  # create , submit requests. there wide range of exceptions # can thrown here, including httperror , urlerror. these should # caught , handled.  # =============================================================================== # open requeset grab filenames within directory. print optional # =============================================================================== # full url of directory contains files bulk download  x in range(1, 31):     if x < 10:         url = 'https://n5eil01u.ecs.nsidc.org/most/mod29p1n.006/2015.11.0' + str(x) + '/'     else:         url = 'https://n5eil01u.ecs.nsidc.org/most/mod29p1n.006/2015.11.' 
+ str(x).lstrip("0") + '/'     dirrequest = urllib2.request(url)     dirresponse = urllib2.urlopen(dirrequest)      # redirect url , append 'app_type=401'     # basic http auth     dirredirect_url = dirresponse.geturl()     if x == 1:         dirredirect_url += '&app_type=401'      # request resource @ modified redirect url     dirrequest = urllib2.request(dirredirect_url)     dirbody = urllib2.urlopen(dirrequest).read()      # dirbody = dirresponse.read(dirresponse)     # uses html parser defined above pring content of directory containing data     parser.feed(dirbody)     files = parser.datalist      # display contents of python list declared in htmlparser class     # print files #uncomment print list of files      # ===============================================================================     # call function download files in url     # ===============================================================================     batchjob(files, cookie_jar)  # comment out prevent downloading working directory 

When the code runs with x=2, the following error occurs:

traceback (most recent call last):   file "f:/ist/nsidc_parse_html_batchdl.py", line 136, in <module>     dirbody = urllib2.urlopen(dirrequest).read()   file "d:\software\python2.7\lib\urllib2.py", line 154, in urlopen     return opener.open(url, data, timeout)   file "d:\software\python2.7\lib\urllib2.py", line 435, in open     response = meth(req, response)   file "d:\software\python2.7\lib\urllib2.py", line 548, in http_response     'http', request, response, code, msg, hdrs)   file "d:\software\python2.7\lib\urllib2.py", line 473, in error     return self._call_chain(*args)   file "d:\software\python2.7\lib\urllib2.py", line 407, in _call_chain     result = func(*args)   file "d:\software\python2.7\lib\urllib2.py", line 556, in http_error_default     raise httperror(req.get_full_url(), code, msg, hdrs, fp) urllib2.httperror: http error 404: not found 

I found the bug. The solution is:

import urllib2 import os cookielib import cookiejar htmlparser import htmlparser class myhtmlparser(htmlparser):     def __init__(self):         htmlparser.__init__(self)         self.inlink = false         self.datalist = []         self.directory = '/'         self.indexcol = ';'         self.counter = 0      def handle_starttag(self, tag, attrs):         self.inlink = false         if tag == 'table':             self.counter += 1         if tag == 'a':             name, value in attrs:                 if name == 'href':                     if self.directory in value or self.indexcol in value:                         break                     else:                         self.inlink = true                         self.lasttag = tag      def handle_endtag(self, tag):         if tag == 'table':             self.counter +=1      def handle_data(self, data):         if self.counter == 1:             if self.lasttag == 'a' , self.inlink , data.strip():                 self.datalist.append(data)     # define function batch downloading def batchjob(files, cookie_jar):     dat in files:         if (dat.find("h11v28") != -1 or dat.find("h12v28") != -1) , dat.find("hdf") != -1 , dat.find("xml") == -1:             print "downloading: ", url + dat             jobrequest = urllib2.request(url + dat)             jobrequest.add_header('cookie', cookie_jar)  # pass saved cookie additional http request             jobredirect_url = urllib2.urlopen(jobrequest).geturl() + '&app_type=401'              # request resource @ modified redirect url             request = urllib2.request(jobredirect_url)             response = urllib2.urlopen(request)             f = open(dat, 'wb')             f.write(response.read())             f.close()             response.close()         else:             continue     print "files downloaded to: ", os.path.dirname(os.path.realpath(__file__))  # user credentials used authenticate access data username = "" password = ""  # create password manager deal 
401 reponse returned # earthdata login password_manager = urllib2.httppasswordmgrwithdefaultrealm() password_manager.add_password(none, "https://urs.earthdata.nasa.gov", username, password)  # create cookie jar storing cookies. used store , return # session cookie given use data server (otherwise # keep sending earthdata login authenticate).  ideally, # should use file based cookie jar preserve cookies between runs. # make more efficient.  cookie_jar = cookiejar()  # install handlers. opener = urllib2.build_opener(     urllib2.httpbasicauthhandler(password_manager),     #urllib2.httphandler(debuglevel=1),    # uncomment these 2 lines see     #urllib2.httpshandler(debuglevel=1),   # details of requests/responses     urllib2.httpcookieprocessor(cookie_jar)) urllib2.install_opener(opener)  # create , submit requests. there wide range of exceptions # can thrown here, including httperror , urlerror. these should # caught , handled.  # =============================================================================== # open requeset grab filenames within directory. print optional # =============================================================================== # full url of directory contains files bulk download  x in range(1, 31):     if x < 10:         url = 'https://n5eil01u.ecs.nsidc.org/most/mod29p1n.006/2015.11.0' + str(x) + '/'     else:         url = 'https://n5eil01u.ecs.nsidc.org/most/mod29p1n.006/2015.11.' 
+ str(x).lstrip("0") + '/'     dirrequest = urllib2.request(url)     dirresponse = urllib2.urlopen(dirrequest)      # redirect url , append 'app_type=401'     # basic http auth     dirredirect_url = dirresponse.geturl()     if x == 1:         dirredirect_url += '&app_type=401'      # request resource @ modified redirect url     dirrequest = urllib2.request(dirredirect_url)     dirbody = urllib2.urlopen(dirrequest).read()      # dirbody = dirresponse.read(dirresponse)     # uses html parser defined above pring content of directory containing data     parser = myhtmlparser()     parser.feed(dirbody)     files = parser.datalist      # display contents of python list declared in htmlparser class     print files #uncomment print list of files      # ===============================================================================     # call function download files in url     # ===============================================================================     batchjob(files, cookie_jar) 

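Please notice where I instantiate myhtmlparser: it is now created inside the loop, once per directory. If you instantiate it in the original place (once, before the loop), the data fed to the parser remains there, meaning the file names from x=1 are still present in files when x=2 is processed, which leads to the 404.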


Comments

Popular posts from this blog

Is there a better way to structure post methods in Class Based Views -

performance - Why is XCHG reg, reg a 3 micro-op instruction on modern Intel architectures? -

c# - Asp.net web api : redirect unauthorized requst to forbidden page -