Python proxy scraping and validation with multithreading
This doesn't use a queue or a thread pool; I'm still learning, so it's plain multithreading.
The code is as follows:
#coding:utf8
import urllib2, sys, re
import threading, os
import time, datetime
'''
No queue is used here; work is simply fanned out across threads. That is
fine while the proxy list is small, but performance degrades badly once
there are hundreds or thousands of proxies.
'''
def get_proxy_page(url):
    '''Parse the proxy page and extract all proxy addresses.'''
    proxy_list = []
    p = re.compile(r'''<div>(.+?)<span class="Apple-tab-span" style="white-space:pre">.*?</span>(.+?)<span class="Apple-tab-span" style="white-space:pre">.+?</span>(.+?)(<span.+?)?</div>''')
    try:
        res = urllib2.urlopen(url)
    except urllib2.URLError:
        print 'url Error'
        sys.exit(1)
    pageinfo = res.read()
    res = p.findall(pageinfo)  # pull out every match
    # assemble the matches into a list of (ip, port, addr) tuples
    for i in res:
        ip = i[0]
        port = i[1]
        addr = i[2]
        l = (ip, port, addr)
        proxy_list.append(l)
    return proxy_list

# synchronization-lock decorator
lock = threading.Lock()
def synchronous(f):
    def call(*args, **kw):
        lock.acquire()
        try:
            return f(*args, **kw)
        finally:
            lock.release()
    return call

# timing decorator
def sumtime(f):
    def call(*args, **kw):
        t1 = time.time()
        try:
            return f(*args, **kw)
        finally:
            print u'total time: %s' % (time.time() - t1)
    return call

proxylist = []
reslist = []

# pop a single proxy from the shared list
@synchronous
def getoneproxy():
    global proxylist
    if len(proxylist) > 0:
        return proxylist.pop()
    else:
        return ''

# record a proxy that passed validation
@synchronous
def getreslist(proxy):
    global reslist
    if not (proxy in reslist):
        reslist.append(proxy)

def handle():
    timeout = 10
    test_url = r'http://www.baidu.com'
    test_str = '030173'  # fragment of Baidu's ICP licence number, present in the genuine page
    while 1:
        proxy = getoneproxy()
        # an empty value means the list is exhausted
        if not proxy:
            return
        print u'validating: %s' % proxy[0]
        # step 1: enable cookies
        cookies = urllib2.HTTPCookieProcessor()
        proxy_server = r'http://%s:%s' % (proxy[0], proxy[1])
        # step 2: load the proxy
        proxy_handler = urllib2.ProxyHandler({"http": proxy_server})
        # step 3: assemble the opener
        try:
            opener = urllib2.build_opener(cookies, proxy_handler)
        except urllib2.URLError:
            print u'url setup error'
            continue
        # configure the request headers
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1')]
        # send the request
        urllib2.install_opener(opener)
        t1 = time.time()
        try:
            req = urllib2.urlopen(test_url, timeout=timeout)
            result = req.read()
            pos = result.find(test_str)
            timeused = time.time() - t1
            if pos > 1:
                # keep the proxy together with its response time
                getreslist((proxy[0], proxy[1], proxy[2], timeused))
                print u'collected', proxy[0], timeused
            else:
                continue
        except Exception:
            print u'failed %s: timeout' % proxy[0]
            continue
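
As posted, the listing defines handle() but never actually starts any threads. A minimal driver for it might look like the sketch below; main, the worker count of 10, and proxy_page_url are all hypothetical, since the original article shows neither a proxy-list URL nor an entry point:

@sumtime
def main():
    global proxylist
    # hypothetical placeholder URL: substitute a page whose markup
    # matches the regex in get_proxy_page()
    proxy_page_url = 'http://example.com/proxies.html'
    proxylist = get_proxy_page(proxy_page_url)
    print u'%d proxies fetched' % len(proxylist)
    threads = []
    for i in range(10):  # arbitrary worker count
        t = threading.Thread(target=handle)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print u'%d proxies validated' % len(reslist)

if __name__ == '__main__':
    main()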
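
For the scaling problem the header comment mentions, the standard-library Queue module is the usual fix: Queue.Queue is thread-safe, so the hand-rolled lock decorator and shared lists become unnecessary. The sketch below is a minimal illustration of that approach rather than the article's code; validate, worker, run, and NUM_WORKERS are illustrative names:

#coding:utf8
import Queue
import threading
import urllib2
import time

NUM_WORKERS = 20
task_queue = Queue.Queue()   # proxies waiting to be checked
results = Queue.Queue()      # validated proxies; also thread-safe

def validate(proxy, test_url='http://www.baidu.com', timeout=10):
    '''Return the response time through the proxy, or None on failure.'''
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': 'http://%s:%s' % (proxy[0], proxy[1])}))
    t1 = time.time()
    try:
        opener.open(test_url, timeout=timeout).read()
        return time.time() - t1
    except Exception:
        return None

def worker():
    while True:
        try:
            proxy = task_queue.get_nowait()  # empty queue means we are done
        except Queue.Empty:
            return
        timeused = validate(proxy)
        if timeused is not None:
            results.put((proxy[0], proxy[1], proxy[2], timeused))
        task_queue.task_done()

def run(proxies):
    for p in proxies:
        task_queue.put(p)
    threads = [threading.Thread(target=worker) for _ in range(NUM_WORKERS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return [results.get() for _ in range(results.qsize())]

Because the queue does its own locking, scaling up is just a matter of raising NUM_WORKERS; on Python 3 the same structure carries over with queue.Queue and urllib.request.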