python批量獲取html內body內容的實例

2020-02-16 00:26:55

字體：大中小

來源：轉載

供稿：網友

現在有一批完整的 HTML 頁面，內容是關於城市美食、景點等的介紹，需要將每個頁面裡面 body 的內容提取出來。

方法：利用 Python 的第三方庫 BeautifulSoup 獲取 HTML 中 body 標籤的內容，並批量處理。

# -*- coding:utf8 -*-
"""Batch-extract the <body> content of numbered HTML pages into .txt files.

For every non-hidden sub-directory of ``rootdir`` the script looks for pages
named ``1.html`` .. ``10.html``, parses each with BeautifulSoup and writes
the children of ``<body>`` to ``savepath/<sub-directory>/<n>.txt``.

The code runs on both Python 2 and Python 3.  The Python 2-only
``reload(sys); sys.setdefaultencoding('utf8')`` hack is not needed here:
parsed nodes are converted to text explicitly and written through
``io.open`` with a UTF-8 encoding.
"""
import io
import os
import os.path
import sys

# Source tree (one sub-directory per city) and destination tree.
rootdir = '../zips2'
savepath = "../testC"

# Number of sub-directories handled so far; updated by getAndInsert().
file_num = 0

# Only pages with these exact names are processed.
_PAGE_NAMES = frozenset(str(n) + '.html' for n in range(1, 11))

# Parser is named explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed; html.parser ships with Python.
_HTML_PARSER = 'html.parser'

# ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u'')


def printPath(level, path):
    """Split the entries directly under *path* into directories and files.

    Args:
        level: Depth label of this directory; stored as ``str(level)`` in
            the first slot of the returned directory list.
        path: Directory to list.

    Returns:
        A ``(dirList, fileList)`` tuple.  ``dirList`` is ``[str(level)]``
        followed by the names of the sub-directories whose name does not
        start with ``.``; ``fileList`` holds the names of regular files.
        Names appear in ``os.listdir`` order.

    Raises:
        OSError: If *path* cannot be listed.
    """
    # Slot 0 carries the level marker; real directory names follow it.
    dirList = [str(level)]
    fileList = []
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if os.path.isdir(full):
            # Hidden directories are left out because there are too many.
            if not name.startswith('.'):
                dirList.append(name)
        elif os.path.isfile(full):
            fileList.append(name)
    return (dirList, fileList)


def getAndInsert(rootdir, savepath, path):
    """Extract the body of each numbered page in one sub-directory.

    Every file in ``rootdir/path`` named ``1.html`` .. ``10.html`` is parsed
    and the children of its ``<body>`` are written to
    ``savepath/path/<n>.txt`` (UTF-8).  Pages without a ``<body>`` are
    reported on stderr and skipped.  The module-level ``file_num`` counter
    is incremented once per call.

    Args:
        rootdir: Root of the source tree.
        savepath: Root of the destination tree; created on demand.
        path: Name of the sub-directory (relative to both roots).

    Raises:
        OSError: If the source directory cannot be listed or a file cannot
            be read or written.
        ImportError: If the ``bs4`` package is not installed.
    """
    global file_num
    # Imported here so the directory helpers stay usable without bs4.
    from bs4 import BeautifulSoup

    src_dir = os.path.join(rootdir, path)
    out_dir = os.path.join(savepath, path)
    for name in os.listdir(src_dir):
        if name not in _PAGE_NAMES:
            continue

        src = os.path.join(src_dir, name)
        # Binary mode: let BeautifulSoup detect the page encoding itself.
        with open(src, 'rb') as page:
            soup = BeautifulSoup(page, _HTML_PARSER)
        if soup.body is None:
            sys.stderr.write(src + ' has no <body>, skipped\n')
            continue

        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        stem = os.path.splitext(name)[0]
        out_path = os.path.join(out_dir, stem + '.txt')
        with io.open(out_path, 'w', encoding='utf-8') as out:
            for index, node in enumerate(soup.body):
                # The second child of <body> is deliberately dropped, as in
                # the original script.  NOTE(review): presumably a
                # page-specific element that is not wanted -- confirm.
                if index != 1:
                    out.write(_TEXT_TYPE(node))
        print(path + '/' + name + ' is running')
    file_num = file_num + 1


def main():
    """Process every non-hidden sub-directory of ``rootdir``."""
    dirList, fileList = printPath(1, rootdir)
    # Skip slot 0 (the level marker) by position, so that a real directory
    # that happens to be named '1' is still processed.
    for fn in dirList[1:]:
        getAndInsert(rootdir, savepath, fn)
        print(fn + ' is ending')
    print('一共完成' + str(file_num) + '個城市的提取')


if __name__ == "__main__":
    main()

以上這篇python批量獲取html內body內容的實例就是小編分享給大家的全部內容了，希望能給大家一個參考，也希望大家多多支持武林站長站。

上一篇：pyshp創建shp點文件的方法

下一篇：python requests.post帶head和body的實例