

Crawling Dynamic Web Page Data with Scrapy + PhantomJS


Installing PhantomJS

Download the package from http://phantomjs.org/ ; Windows, Mac OS and Linux builds are all available. Pick the one for your platform, download it, and unpack it (for convenience, you can add the phantomjs binary to your PATH). The archive includes an examples folder with plenty of ready-made scripts you can borrow from. The rest of this article assumes PhantomJS is installed and on your PATH.
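Before wiring PhantomJS into Scrapy, it is worth checking that Selenium can actually launch the binary. Here is a minimal sanity check (this assumes the selenium package is installed and phantomjs is on your PATH; the executable_path argument and the URL are just examples):

# -*- coding: utf-8 -*-
# Sanity check: can Selenium drive the PhantomJS binary?
from selenium import webdriver

driver = webdriver.PhantomJS()  # or webdriver.PhantomJS(executable_path='spiders/phantomjs.exe')
driver.get('http://phantomjs.org/')
print(driver.title)  # prints the page title if the install works
driver.quit()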

Configuring the Scrapy settings file

# Path to the PhantomJS binary; here I copied it into the spiders folder
JS_BIN = "spiders//phantomjs.exe"

LOGIN_TYPE = "myCrawl"

# Anti-crawling countermeasure: ignore robots.txt
ROBOTSTXT_OBEY = False

# Disable cookies
COOKIES_ENABLED = False

# Set a User-Agent value. Open any page in a browser, press F12 -> Network -> F5,
# click any request, and you will see a User-Agent header; copy its contents here.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS = 100

# Disable the default user-agent middleware and register our own middleware.
# The key is the import path of the middleware class, the value is its order.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,  # turn off the default middleware
    'javaScriptMiddleware.JavaScriptMiddleware': 543,
}
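Two things to watch in this configuration. First, the key in DOWNLOADER_MIDDLEWARES must exactly match the module and class name of the middleware written below (here the file is assumed to be javaScriptMiddleware.py at the project root). Second, a single PhantomJS instance renders pages one at a time, so a high CONCURRENT_REQUESTS value will not actually give you 100 parallel renders.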

Writing the middleware

What is a middleware? In Scrapy, a downloader middleware is a hook that sits between the engine and the downloader: its process_request method sees every outgoing request, and whatever response it returns is handed to the spider as if it had come from the downloader. That is exactly where we can slot in PhantomJS to render the JavaScript before the spider parses the page.
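Before reading the full implementation, the contract is easiest to see in a bare-bones sketch (the class name here is made up for illustration): if process_request returns None, Scrapy downloads the page as usual; if it returns a Response, the downloader is skipped and that response goes straight to the spider.

# -*- coding: utf-8 -*-
# Bare-bones downloader-middleware sketch (illustration only)
from scrapy.http import HtmlResponse

class RenderSketchMiddleware(object):
    def process_request(self, request, spider):
        # Returning an HtmlResponse here short-circuits Scrapy's own
        # downloader and hands this body straight to the spider's parse().
        body = '<html><body>rendered elsewhere</body></html>'
        return HtmlResponse(request.url, body=body, encoding='utf-8')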

# -*- coding: utf-8 -*-
from selenium import webdriver
from scrapy.conf import settings
# from scrapy.http.response import Response
from scrapy.http import HtmlResponse
import time
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class JavaScriptMiddleware(object):

    def __init__(self):
        if settings['LOGIN_TYPE'] == 'myCrawl':
            '''
            self.simulation = weibo_login(settings['USERNAME'], settings['PWD'], settings['COOKIE_FILE'])
            cookie_file = settings['COOKIE_FILE']
            cookie_jar = cookielib.LWPCookieJar(cookie_file)
            cookie_jar.load(ignore_discard=True, ignore_expires=True)
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            for c in cookie_jar:
                self.driver.add_cookie({'name': c.name, 'value': c.value, 'path': '/', 'domain': c.domain})
            '''
            # simulate the user login process
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
            # Log in:
            # self.driver.get('http://login.sina.com.cn/')
            # uid = self.driver.find_element_by_id('username')
            # upw = self.driver.find_element_by_id('password')
            # loginBtn = self.driver.find_element_by_class_name('smb_btn')
            # time.sleep(1)
            # uid.send_keys(settings['USERNAME'])
            # upw.send_keys(settings['PWD'])
            # loginBtn.click()
            # time.sleep(1)
        elif settings['LOGIN_TYPE'] == 'other':
            print('add login code')
        else:
            self.driver = webdriver.PhantomJS(executable_path=settings['JS_BIN'])
        # Close the browser when the spider closes
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_request(self, request, spider):
        self.driver.get(request.url)
        print("Rendering page... scrolling down automatically")
        # Scroll down in 1000px steps until the bottom of the page is reached,
        # so that lazily loaded content gets a chance to render
        indexPage = 1000
        while indexPage < self.driver.execute_script("return document.body.offsetHeight"):
            self.driver.execute_script("scroll(0," + str(indexPage) + ")")
            indexPage = indexPage + 1000
            print(indexPage)
            time.sleep(1)
        rendered_body = self.driver.page_source
        # Handle the encoding
        if r'charset="GBK"' in rendered_body or r'charset=gbk' in rendered_body:
            coding = 'gbk'
        else:
            coding = 'utf-8'
        return HtmlResponse(request.url, body=rendered_body, encoding=coding)

    # Shut down the browser (quit() also terminates the phantomjs process)
    def spider_closed(self, spider, reason):
        print('close driver......')
        self.driver.quit()

Finally, let the spider loose on the site

# -*- coding: utf-8 -*-
import scrapy


class DmozSpider(scrapy.Spider):
    name = "crawl007"
    # redis_key = 'blog.csdn.net'  # only needed when running on scrapy-redis
    start_urls = ["http://blog.csdn.net/u010085423/article/details/54943875"]

    def parse(self, response):
        # XPath of the article title: //*[@id="article_details"]/div[1]/h1/span/a
        content = response.xpath("//*[@id='article_details']/div[1]/h1/span/a/text()").extract()
        if content:
            print(content[0])
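With the middleware registered in settings.py, the spider itself needs no PhantomJS-specific code at all: by the time parse() runs, the response already holds the rendered HTML. Run it from the project root with the standard command:

scrapy crawl crawl007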