A while ago I helped a colleague import CSV data into MySQL: two very large CSV files, one 3 GB with 21 million records and one 7 GB with 35 million records. At that scale a naive single-process/single-thread import takes far too long, so in the end I did it with multiple processes. I won't walk through the whole process; here are the key points:
- Insert in batches rather than one row at a time.
- To speed up the inserts, don't build the indexes first (see the sketch right after this list).
- Use a producer/consumer model: the main process reads the file and several worker processes perform the inserts.
- Keep the number of workers under control to avoid putting too much pressure on MySQL.
- Handle the exceptions caused by dirty data.
- The source data is GBK-encoded, so it also has to be converted to UTF-8.
- Wrap it all as a command-line tool with click.
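On the index point: the script below assumes the target table already exists and that its secondary indexes are created only after the load finishes. A minimal sketch of that post-load step, in the same legacy SQLAlchemy engine.execute style as the script; the table name example and the column uid are placeholders, not names from the original task:

# Build secondary indexes only after the bulk load, so MySQL does not have
# to maintain them for every inserted row. Table/column names are placeholders.
import sqlalchemy

engine = sqlalchemy.create_engine('mysql://root@localhost:3306/example?charset=utf8')
engine.execute('ALTER TABLE `example` ADD INDEX `idx_uid` (`uid`)')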
The full implementation is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import logging
import multiprocessing
import os
import warnings

import click
import MySQLdb
import sqlalchemy

warnings.filterwarnings('ignore', category=MySQLdb.Warning)

# Number of records per batch insert
BATCH = 5000

DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'

engine = sqlalchemy.create_engine(DB_URI)


def get_table_cols(table):
    sql = 'SELECT * FROM `{table}` LIMIT 0'.format(table=table)
    res = engine.execute(sql)
    return res.keys()


def insert_many(table, cols, rows, cursor):
    sql = 'INSERT INTO `{table}` ({cols}) VALUES ({marks})'.format(
        table=table,
        cols=', '.join(cols),
        marks=', '.join(['%s'] * len(cols)))
    # With the legacy SQLAlchemy API, passing multiple parameter tuples
    # results in an executemany-style batched insert
    cursor.execute(sql, *rows)
    logging.info('process %s inserted %s rows into table %s',
                 os.getpid(), len(rows), table)


def insert_worker(table, cols, queue):
    rows = []
    # Each child process creates its own engine object
    cursor = sqlalchemy.create_engine(DB_URI)
    while True:
        row = queue.get()
        if row is None:
            if rows:
                insert_many(table, cols, rows, cursor)
            break

        rows.append(row)
        if len(rows) == BATCH:
            insert_many(table, cols, rows, cursor)
            rows = []


def insert_parallel(table, reader, w=10):
    cols = get_table_cols(table)

    # Data queue: the main process reads the file and puts rows in,
    # worker processes take rows out. Cap the queue size so that slow
    # consumers don't let rows pile up and eat too much memory.
    queue = multiprocessing.Queue(maxsize=w * BATCH * 2)
    workers = []
    for i in range(w):
        p = multiprocessing.Process(target=insert_worker,
                                    args=(table, cols, queue))
        p.start()
        workers.append(p)
        logging.info('starting # %s worker process, pid: %s...', i + 1, p.pid)

    dirty_data_file = './{}_dirty_rows.csv'.format(table)
    xf = open(dirty_data_file, 'w')
    writer = csv.writer(xf, delimiter=reader.dialect.delimiter)

    for line in reader:
        # Record and skip dirty rows whose field count doesn't match the columns
        if len(line) != len(cols):
            writer.writerow(line)
            continue

        # Replace 'NULL' strings with None
        clean_line = [None if x == 'NULL' else x for x in line]

        # Put the row into the queue
        queue.put(tuple(clean_line))
        if reader.line_num % 500000 == 0:
            logging.info('put %s tasks into queue.', reader.line_num)

    xf.close()

    # Send each worker the signal that there is no more work
    logging.info('send close signal to worker processes')
    for i in range(w):
        queue.put(None)

    for p in workers:
        p.join()


def convert_file_to_utf8(f, rv_file=None):
    if not rv_file:
        name, ext = os.path.splitext(f)
        if isinstance(name, unicode):
            name = name.encode('utf8')
        rv_file = '{}_utf8{}'.format(name, ext)
    logging.info('start to process file %s', f)
    with open(f) as infd:
        with open(rv_file, 'w') as outfd:
            lines = []
            loop = 0
            chunk = 200000
            first_line = infd.readline().strip(codecs.BOM_UTF8).strip() + '\n'
            lines.append(first_line)
            for line in infd:
                clean_line = line.decode('gb18030').encode('utf8')
                clean_line = clean_line.rstrip() + '\n'
                lines.append(clean_line)
                if len(lines) == chunk:
                    outfd.writelines(lines)
                    lines = []
                    loop += 1
                    logging.info('processed %s lines.', loop * chunk)

            outfd.writelines(lines)
            logging.info('processed %s lines.', loop * chunk + len(lines))


@click.group()
def cli():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


@cli.command('gbk_to_utf8')
@click.argument('f')
def convert_gbk_to_utf8(f):
    convert_file_to_utf8(f)


@cli.command('load')
@click.option('-t', '--table', required=True, help='table name')
@click.option('-i', '--filename', required=True, help='input file')
@click.option('-w', '--workers', default=10, help='number of workers, default 10')
def load_fac_day_pro_nos_sal_table(table, filename, workers):
    with open(filename) as fd:
        fd.readline()  # skip header
        reader = csv.reader(fd)
        insert_parallel(table, reader, w=workers)
if __name__ == '__main__':
    cli()
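For reference, assuming the script above is saved as csv2mysql.py (the filename is arbitrary), a complete run would first convert the encoding and then load the converted file, along the lines of:

python csv2mysql.py gbk_to_utf8 data.csv
python csv2mysql.py load -t example -i data_utf8.csv -w 10

Here example is a placeholder table name; the table has to exist in the database configured in DB_URI before loading.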