国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 編程 > Python > 正文

Python 實現爬取千萬淘寶商品的方法

2019-11-25 17:14:42
字體:
來源:轉載
供稿:網友

本文以實例講述如何用 Python 爬取千萬級的淘寶商品資料,供大家參考。具體實現方法如下:

"""Crawl Taobao mobile-search results into two LevelDB stores.

``./taobao-cate`` maps a search keyword (category name) to one of the states
``b'waiting'``, ``b'crawling'`` or ``b'OK'``.  ``./taobao-item`` maps an item
id to the JSON-encoded item record.  One producer thread feeds unfinished
categories into a queue; several worker threads crawl them page by page.
"""

import itertools
import json
import re
import sys
import time
from queue import Queue
from threading import Thread
from urllib.parse import quote_plus

import leveldb
import requests

URL_BASE = 'http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'

# Headers sent with every request.
_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'DNT': '1',
    'User-Agent': 'Mozilla/12.0 (compatible; MSIE 8.0; Windows NT)',
}

# How many times a single result page is requested before giving up.
_FETCH_ATTEMPTS = 5


def url_get(url):
    """Return the body of a GET request to *url* as text (5 second timeout)."""
    return requests.get(url, timeout=5, headers=_REQUEST_HEADERS).text


def _fetch_page(url):
    """Fetch *url* and decode it as JSON, retrying up to ``_FETCH_ATTEMPTS`` times.

    The exception from the final failed attempt is re-raised unchanged.
    KeyboardInterrupt is deliberately not caught here so that it reaches the
    caller's own handler.
    """
    for attempt in range(_FETCH_ATTEMPTS):
        try:
            return json.loads(url_get(url))
        except Exception:
            if attempt == _FETCH_ATTEMPTS - 1:
                raise


def _put_if_absent(db, key, value):
    """Store ``key -> value`` in *db* only when *key* is not present yet."""
    try:
        db.Get(key)
    except KeyError:
        # py-leveldb signals a missing key with KeyError.
        db.Put(key, value)


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: crawl every result page of each category taken from the queue.

    Args:
        cate_queue: queue of category names (str) to crawl.
        db_cate: LevelDB handle holding the per-category state.
        db_item: LevelDB handle receiving one JSON record per item.

    Categories already marked ``b'OK'`` are skipped.  Any other exception
    raised while crawling a category is printed and the loop moves on to the
    next queue entry; KeyboardInterrupt ends the loop.
    """
    while True:
        try:
            cate = cate_queue.get()
            cate_key = cate.encode('utf-8')

            try:
                finished = db_cate.Get(cate_key) == b'OK'
            except KeyError:
                finished = False
            if finished:
                print('cate-{} already exists ... Ignore'.format(cate))
                continue

            db_cate.Put(cate_key, b'crawling')
            for item_page in itertools.count(1):
                url = URL_BASE.format(quote_plus(cate), item_page)
                items_obj = _fetch_page(url)
                items = items_obj['listItem']
                if not items:
                    # An empty page marks the end of the result list.
                    break

                for item in items:
                    item_obj = dict(
                        _id=int(item['itemNumId']),
                        name=item['name'],
                        price=float(item['price']),
                        query=cate,
                        category=int(item['category']) if item['category'] != '' else 0,
                        nick=item['nick'],
                        area=item['area'],
                    )
                    db_item.Put(
                        str(item_obj['_id']).encode('utf-8'),
                        json.dumps(item_obj, ensure_ascii=False).encode('utf-8'),
                    )
                print('Get {} items from {}: {}'.format(len(items), cate, item_page))

                # Related categories offered by the response become new crawl targets.
                if 'nav' in items_obj:
                    for na in items_obj['nav']['navCatList']:
                        _put_if_absent(db_cate, na['name'].encode('utf-8'), b'waiting')

            db_cate.Put(cate_key, b'OK')
            print(cate, 'OK')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('An {} exception occured'.format(e))


def cate_thread(cate_queue, db_cate):
    """Producer loop: every 10 seconds enqueue each category not yet ``b'OK'``.

    Args:
        cate_queue: queue consumed by the ``item_thread`` workers.
        db_cate: LevelDB handle holding the per-category state.

    NOTE(review): categories in state ``b'crawling'`` are enqueued again on
    every pass, so one category can be handed to several workers; confirm
    whether that is intended before changing it.
    """
    while True:
        try:
            for key, value in db_cate.RangeIter():
                if value != b'OK':
                    name = key.decode('utf-8')
                    print('CateThread: put {} into queue'.format(name))
                    cate_queue.put(name)
            time.sleep(10)
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('CateThread: {}'.format(e))


def main():
    """Open both databases, seed the first category and start all threads."""
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    orig_cate = '正裝'
    _put_if_absent(db_cate, orig_cate.encode('utf-8'), b'waiting')

    cate_queue = Queue(maxsize=1000)
    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate))
    cate_th.start()

    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item))
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    cate_th.join()


if __name__ == '__main__':
    main()

希望本文所述對大家的Python程序設計有所幫助。

發表評論 共有條評論
用戶名: 密碼:
驗證碼: 匿名發表
主站蜘蛛池模板: 许昌县| 宽甸| 亳州市| 都昌县| 四平市| 临潭县| 涪陵区| 扎鲁特旗| 茶陵县| 玉龙| 吴川市| 尼玛县| 淳安县| 汾阳市| 保山市| 武陟县| 呼伦贝尔市| 驻马店市| 西峡县| 土默特左旗| 共和县| 天等县| 历史| 五寨县| 郎溪县| 铜鼓县| 新宾| 会同县| 始兴县| 江阴市| 方山县| 石门县| 青田县| 依安县| 白银市| 双峰县| 阜阳市| 甘德县| 黄石市| 肃南| 阳城县|