Dealing with XHR requests in Python HTML scraping
I need to scrape the entire HTML from journal_url, which for the purpose of this example will be http://onlinelibrary.wiley.com/journal/10.1111/(issn)1467-6281/issues . I have followed the requests examples displayed on a few questions on this site, but I am not getting the correct HTML returned with either the .text or .json() methods of requests.get. My goal is to display the whole HTML, including the ordered list underneath each year and volume pull-down.
import requests
import pandas as pd
import http.cookiejar

# Fetch the "/issues" page of every journal listed in `df` and print the raw
# response body.
# NOTE(review): `df` is not defined in this snippet; it is presumably a pandas
# DataFrame built earlier with the columns read below -- confirm.
# NOTE(review): the string literals below are reproduced exactly as they appear
# in this copy of the snippet. Their case looks like it was lowercased in
# transit (column names, header name/value, user-agent); verify them against
# the real data and headers before relying on them.
for i in range(len(df)):
    journal_name = df.loc[i, "journal full title"]
    journal_url = df.loc[i, "url"] + "/issues"
    access_start = df.loc[i, "content start date"]
    access_end = df.loc[i, "content end date"]

    # Earlier cookie-based attempt, left disabled:
    # cj = http.cookiejar.CookieJar()
    # opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

    # Headers sent with the request to mark it as an XHR call from a browser.
    headers = {
        "x-requested-with": "xmlhttprequest",
        "user-agent": "mozilla/5.0 (x11; linux i686) applewebkit/537.36 (khtml, gecko) chrome/36.0.1985.125 safari/537.36",
    }
    r = requests.get(journal_url, headers=headers)
    response = r.text
    print(response)
If your ultimate goal is to parse the content mentioned above from that page, then here it is:
import requests
from bs4 import BeautifulSoup

# NOTE(review): the URL and CSS selectors below are reproduced exactly as they
# appear in this copy of the snippet. Their case looks like it was lowercased
# in transit (e.g. "(issn)", "a.issuesinyear"); verify against the live page.
base_link = "http://onlinelibrary.wiley.com"
main_link = "http://onlinelibrary.wiley.com/journal/10.1111/(issn)1467-6281/issues"


def abacus_scraper(main_link):
    """Fetch the issues index page and process every year/volume link on it.

    For each anchor matching ``a.issuesinyear``, the text of its first
    ``<span>`` and its ``href`` are passed to :func:`main_content`.

    Args:
        main_link: URL of the journal's issues index page.
    """
    soup = BeautifulSoup(requests.get(main_link).text, "html.parser")
    for titles in soup.select("a.issuesinyear"):
        title = titles.select("span")[0].text
        title_link = titles.get("href")
        main_content(title, title_link)


def main_content(item, link):
    """Fetch one year/volume page and print the issue link texts found on it.

    Args:
        item: Label printed alongside the results (the year/volume title).
        link: Path appended to ``base_link`` to build the page URL.
    """
    broth = BeautifulSoup(requests.get(base_link + link).text, "html.parser")
    elems = [issue.text for issue in broth.select("div.issue a")]
    print(item, elems)


abacus_scraper(main_link)
Comments
Post a Comment