python解析html提取數據，并生成word文檔實例解析

2020-01-04 16:03:33

字體：大中小

來源：轉載

供稿：網友

簡介

今天試著用python做了一個抓取網頁內容，并生成word文檔的功能，功能很簡單，做一下記錄以備以后用到。

生成word用到了第三方組件python-docx，所以先進行第三方組件的安裝。由于windows下安裝的python默認不帶setuptools這個模塊，所以要先安裝setuptools這個模塊。

安裝

1、在python官網上找到　https://bootstrap.pypa.io/ez_setup.py　　，把代碼保存到本地并執行:　 python ez_setup.py

2、下載python-docx 　(https://pypi.python.org/pypi/python-docx/0.7.4)，下載完成后解壓并進入到　　XXX/python-docx-0.7.4　安裝python-docx :　python setup.py install

這樣python-docx就安裝成功了，可以用它來操作word文檔了。word文檔的生成參考的是這里：https://python-docx.readthedocs.org/en/latest/index.html

html解析用到的是sgmllib里的SGMLParser，url內容的獲取用到的是urllib、urllib2。

實現代碼

# -*- coding: cp936 -*-
"""Scrape a listing page and write the collected profiles to a Word file.

Flow of the script (Python 2 only: it relies on ``sgmllib`` and ``urllib2``):

1. Download ``urlSource`` and collect the links inside the listing <div>.
2. Download every linked page and extract, per <dl> entry, the image URL
   followed by the text of its <p> elements.
3. Write one picture, one heading and a bullet list per entry into
   ``xxx.docx`` in the current working directory.
"""
from sgmllib import SGMLParser
import os
import sys
import urllib
import urllib2
from contextlib import closing
from docx import Document
from docx.shared import Inches
import time


def _fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    with closing(urllib2.urlopen(url)) as response:
        return response.read()


def _add_picture(doc, imgUrl, imgDir):
    """Download *imgUrl*, embed it in *doc* and delete the temporary file.

    The temporary file is named after the second-to-last path segment of
    the URL plus the URL's extension. This is best effort: any failure is
    printed and the picture is skipped so the rest of the document is
    still produced.
    """
    try:
        suffix = imgUrl.split('.')[-1]
        preffix = imgUrl.split('/')[-2]
        imgFile = imgDir + "//" + preffix + "." + suffix
        if not os.path.exists(imgDir):
            os.mkdir(imgDir)
        imgData = _fetch(imgUrl)
        with open(imgFile, 'wb') as f:
            f.write(imgData)
        try:
            doc.add_picture(imgFile, width=Inches(1.25))
        finally:
            # Remove the temporary image even when embedding fails.
            os.remove(imgFile)
    except Exception as err:
        print(err)


## Collects the urls that have to be parsed
class GetUrl(SGMLParser):
    """Collect the ``href`` of every <a> inside the listing <div>."""

    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False  # True while inside the listing <div>
        self.urlArr = []    # collected link targets, in document order

    def start_div(self, attr):
        for name, value in attr:
            if value == "ChairmanCont Bureau":  # fixed value used by the page
                self.start = True

    def end_div(self):
        # NOTE(review): any closing </div>, including a nested one, stops
        # the collection. Kept as in the original; confirm against the page.
        self.start = False

    def start_a(self, attr):
        if self.start:
            for name, value in attr:
                # Only the link target is a url; other attribute values
                # (class, target, ...) must not be fetched later on.
                if name == "href":
                    self.urlArr.append(value)

    def getUrlArr(self):
        """Return the list of collected urls."""
        return self.urlArr


## Parses the urls collected above and extracts the useful data
class getManInfo(SGMLParser):
    """Extract one record per <dl> found inside the ``SpeakerInfo`` <div>.

    A record is a list: the ``src`` of each <img> and the utf-8 decoded
    text of each <p>, in document order.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.start = False  # True while inside the SpeakerInfo <div>
        self.p = False      # True while inside a <p> of a tracked <dl>
        self.dl = False     # True while inside a tracked <dl>
        self.manInfo = []   # finished records
        self.subInfo = []   # record currently being built

    def start_div(self, attr):
        for name, value in attr:
            if value == "SpeakerInfo":  # fixed value used by the page
                self.start = True

    def end_div(self):
        self.start = False

    def start_p(self, attr):
        if self.dl:
            self.p = True

    def end_p(self):
        self.p = False

    def start_img(self, attr):
        if self.dl:
            for name, value in attr:
                # Only the image url belongs in the record; alt/width/...
                # would otherwise shift the positions of the text fields.
                if name == "src":
                    self.subInfo.append(value)

    def handle_data(self, data):
        if self.p:
            self.subInfo.append(data.decode('utf-8'))

    def start_dl(self, attr):
        if self.start:
            self.dl = True

    def end_dl(self):
        # Unrelated <dl> elements produce no data; do not record them as
        # empty entries.
        if self.subInfo:
            self.manInfo.append(self.subInfo)
        self.subInfo = []
        self.dl = False

    def getManInfo(self):
        """Return the list of extracted records."""
        return self.manInfo


urlSource = "http://www.XXX"
sourceData = _fetch(urlSource)

startTime = time.clock()
## get urls
getUrl = GetUrl()
getUrl.feed(sourceData)
urlArr = getUrl.getUrlArr()
getUrl.close()
print("get url use:" + str((time.clock() - startTime)))
startTime = time.clock()

## get maninfos
manInfos = getManInfo()
for url in urlArr:  # one url one person
    data = _fetch(url)
    manInfos.feed(data)
infos = manInfos.getManInfo()
manInfos.close()
print("get maninfos use:" + str((time.clock() - startTime)))
startTime = time.clock()

# word
saveFile = os.getcwd() + "//xxx.docx"
imgDir = os.getcwd() + "//imgs"
doc = Document()
## word title
doc.add_heading("HEAD".decode('gbk'), 0)
p = doc.add_paragraph("HEADCONTENT:".decode('gbk'))

## write info
for infoArr in infos:
    for i, info in enumerate(infoArr):
        if i == 0:  # img url
            _add_picture(doc, info, imgDir)
        elif i == 1:
            doc.add_heading(info + ":", level=1)
        else:
            doc.add_paragraph(info, style='ListBullet')

# Save once, after every entry has been written.
doc.save(saveFile)
print("word use:" + str((time.clock() - startTime)))