之前想研究一下怎麼抓取網頁數據，於是就有了下面的練習。
如有 BUG 也純屬正常——這只是練習，請勿投入產品使用。


#!/usr/bin/python# -*- coding: utf-8 -*-#Filenaem: tqjinyan.pyimport os,sys,time,urllib2,reimport cookielibimport multiprocessingimport datetime,time#定義存儲完整的數據字典#keyWord: 具體短鏈接地址,發布時間,標題,平均每天瀏覽量,更新時間,總瀏覽量,鏈接地址userJinYanAll={}#提取用戶總共有多少經驗def tiquNumber(url): regex = re.compile(r'&pn=(/d{1,10})"') web=urllib2.urlopen(url).read() num= regex.findall(web) if not len(num): num.append(1) num=map(int,num) num.sort() return num[-1]#拼接每一頁的鏈接,返回鏈接的列表def retJinYanYe(url,num): # 分頁頁面...' yesNumLianjie=[] for i in range(0,num+1,7): yesNumLianjie.append(url+"&pn="+str(i)) return yesNumLianjie#返回分頁經驗def retNumTitle(jylist): numjisu=0 for url in jylist: numjisu+=1 #定義正則,鏈接,發布時間,標題 regex_href = re.compile(r'<p class="tit"><a href="(.{1,200})" title="') regex_time=re.compile('<span class="exp-time">(.{1,12})</span>') regex_title=re.compile('" title="(.{1,80})" target="_blank">') #定義字典關鍵詞 regex_keyword=re.compile('e/(.{1,50}).html') #獲取web分頁中的數據 web=urllib2.urlopen(url).read() #獲取鏈接,發布時間,標題 href=regex_href.findall(web) exp_time=regex_time.findall(web) title=regex_title.findall(web) #進行循環添加至列表的字典中 # print url for i in range(0,len(title)): #定義一個空列表,用于添加至列表字典中 userlist=[] keyword = regex_keyword.findall(href[i]) # print keyword userlist.append(href[i]) userlist.append(exp_time[i]) userlist.append(title[i]) # print keyword userJinYanAll[keyword[0]]=userlist # printstdout('/r正在獲取第 %i 頁的經驗信息...' 
% numjisu) # print userJinYanAll#根據地址,使用cookie瀏覽具體頁面,返回瀏覽量,更新時間def retLiuLanNum(keyword,url,i): loginUrl='http://jingyan.baidu.com'+url #以cookie來訪問具體的網頁 # cj = cookielib.CookieJar() # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) # urllib2.install_opener(opener) # resp = urllib2.urlopen(loginUrl) req=urllib2.Request(loginUrl,data="") f=urllib2.urlopen(req).read() regex_liulannum = re.compile(r'<span class="views">(/d{1,10})</span>') regex_updateTime=re.compile(r'<time>(.{1,20})</time>') viewsNum=regex_liulannum.findall(f) updateTime=regex_updateTime.findall(f) #平均流量 if int(viewsNum[0])!=0: jianGeDay=pingJunNum(keyword,updateTime[0],viewsNum[0]) pjNum=int(viewsNum[0])/int(jianGeDay)/1.00 if pjNum<1: userJinYanAll[keyword].append('-') else: userJinYanAll[keyword].append(str(pjNum)) # print pjNum else: userJinYanAll[keyword].append('-') # print pingJunNum(keyword,updateTime,viewsNum) # sys.exit() # print viewsNum,updateTime userJinYanAll[keyword].append(updateTime[0]) userJinYanAll[keyword].append(viewsNum[0]) userJinYanAll[keyword].append(loginUrl) # print userJinYanAll # sys.exit()
#下面這句.因為格式需要<換成兩行,如需測試.自行優化下格式.
print str(i)+"/t/t"+userJinYanAll[keyword][1]+"/t"+userJinYanAll[keyword][5]+
"/t"+userJinYanAll[keyword][3]+"/t"+userJinYanAll[keyword][2]+"/t"+userJinYanAll[keyword][6]def getcookie(): loginUrl='http://jingyan.baidu.com/article/ed2a5d1f1938f909f7be174f.html' cj = cookielib.CookieJar() opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) resp=urllib2.urlopen(loginUrl) f = urllib2.urlopen(url= loginUrl)def menu(url): try: #獲取用戶姝經驗分頁 # print '提取經驗總數量...' numYe=tiquNumber(url) #根據分頁鏈接獲取每頁上面的鏈接 jylist=retJinYanYe(url,numYe) # print '總共有:%s頁經驗.'%len(jylist) # print '根據分頁信息獲取具體鏈接..' # print jylist retNumTitle(jylist) # for t in jylist: #根據鏈接生成字典數據 # retNumTitle(t) # print '共有:%s篇經驗.'%len(userJinYanAll) # print '獲取Cookie信息中...' getcookie() # print '獲取每篇經驗具體的經驗信息,此時間比較久,請耐心等待...' # print '----------輸出具體的經驗列表----------' # print '序列/t發布時間/t總瀏量/t平均每天的瀏覽量/t/t標題/t/t/t具體鏈接' i=0 for k,v in userJinYanAll.items(): i+=1 retLiuLanNum(k,v[0],i) # print "%s:%s"%(k,v) # for k,v in userJinYanAll.items(): # print k,v # print v[4]+"/t"+v[2]+"/t"+v[1]+"/t"+v[3]+"/t"+v[5] # print '-'*50 # print userJinYanAll return userJinYanAll # userjianyanpaixu=sorted(userJinYanAll.iteritems(),key=lambda asd:asd[1],reverse=True) # for k,v in userjianyanpaixu.items(): # i+=1 # print str(i)+"/t/t"+userjianyanpaixu[1]+"/t"+userjianyanpaixu[5]+"/t"+userjianyanpaixu[3]+
# "/t"+userjianyanpaixu[2]+"/t"+userjianyanpaixu[6] # # print userJinYanAll except KeyboardInterrupt,e: return e # print "QUIT"def printstdout(printname): sys.stdout.write("/r%s"%printname) sys.stdout.flush()def pingJunNum(keyword,update,num): # print keyword,update,num updatetime=datetime.datetime.strptime(update,'%Y-%m-%d %H:%M') newde=datetime.datetime.now() chadate= newde-updatetime return str(chadate).split(' ')[0]def sys_input(): url_baidu='http://jingyan.baidu.com/user/npublic/expList?un=' raw_str=urllib2.quote(raw_input('請輸入用戶百度經驗ID: ')) url=url_baidu+raw_str menu(url) # userjianyanpaixu=sorted(userJinYanAll.iteritems(),key=lambda asd:asd[1],reverse=True) # print userjianyanpaixu # for i in userjianyanpaixu: # print i[1]def sys_input_wap(baidujyid): url_baidu='http://jingyan.baidu.com/user/npublic/expList?un=' # raw_str=urllib2.quote(baidujyid) url=url_baidu+baidujyid returntest(url) # return urldef returntest(url): web=urllib2.urlopen(url).read() return webif __name__=="__main__": sys_input()
=======================================================
|| 好吧. 代碼如上. 或許有BUG.并且沒有進行重構.優化. ||
|| 已知問題.百度會有相關IP訪問頻率屏蔽.所以不適合商用.僅測試. ||
=======================================================
版權聲明:本文為博主原創文章,未經博主允許不得轉載。
|
新聞熱點
疑難解答