前段時間幫同事處理了一個把 CSV 數據導入到 MySQL 的需求。兩個很大的 CSV 文件, 分別有 3GB、2100 萬條記錄和 7GB、3500 萬條記錄。對于這個量級的數據,用簡單的單進程/單線程導入 會耗時很久,最終用了多進程的方式來實現。具體過程不贅述,記錄一下幾個要點:
具體的代碼實現如下:
#!/usr/bin/env python# -*- coding: utf-8 -*-import codecsimport csvimport loggingimport multiprocessingimport osimport warningsimport clickimport MySQLdbimport sqlalchemywarnings.filterwarnings('ignore', category=MySQLdb.Warning)# 批量插入的記錄數量BATCH = 5000DB_URI = 'mysql://root@localhost:3306/example?charset=utf8'engine = sqlalchemy.create_engine(DB_URI)def get_table_cols(table): sql = 'SELECT * FROM `{table}` LIMIT 0'.format(table=table) res = engine.execute(sql) return res.keys()def insert_many(table, cols, rows, cursor): sql = 'INSERT INTO `{table}` ({cols}) VALUES ({marks})'.format( table=table, cols=', '.join(cols), marks=', '.join(['%s'] * len(cols))) cursor.execute(sql, *rows) logging.info('process %s inserted %s rows into table %s', os.getpid(), len(rows), table)def insert_worker(table, cols, queue): rows = [] # 每個子進程創建自己的 engine 對象 cursor = sqlalchemy.create_engine(DB_URI) while True: row = queue.get() if row is None: if rows: insert_many(table, cols, rows, cursor) break rows.append(row) if len(rows) == BATCH: insert_many(table, cols, rows, cursor) rows = []def insert_parallel(table, reader, w=10): cols = get_table_cols(table) # 數據隊列,主進程讀文件并往里寫數據,worker 進程從隊列讀數據 # 注意一下控制隊列的大小,避免消費太慢導致堆積太多數據,占用過多內存 queue = multiprocessing.Queue(maxsize=w*BATCH*2) workers = [] for i in range(w): p = multiprocessing.Process(target=insert_worker, args=(table, cols, queue)) p.start() workers.append(p) logging.info('starting # %s worker process, pid: %s...', i + 1, p.pid) dirty_data_file = './{}_dirty_rows.csv'.format(table) xf = open(dirty_data_file, 'w') writer = csv.writer(xf, delimiter=reader.dialect.delimiter) for line in reader: # 記錄并跳過臟數據: 鍵值數量不一致 if len(line) != len(cols): writer.writerow(line) continue # 把 None 值替換為 'NULL' clean_line = [None if x == 'NULL' else x for x in line] # 往隊列里寫數據 queue.put(tuple(clean_line)) if reader.line_num % 500000 == 0: logging.info('put %s tasks into queue.', reader.line_num) xf.close() # 給每個 worker 發送任務結束的信號 logging.info('send close signal to worker processes') for i in range(w): queue.put(None) for p in workers: p.join()def convert_file_to_utf8(f, rv_file=None): if not rv_file: name, ext = os.path.splitext(f) if isinstance(name, unicode): name = name.encode('utf8') rv_file = '{}_utf8{}'.format(name, ext) logging.info('start to process file %s', f) with open(f) as infd: with open(rv_file, 'w') as outfd: lines = [] loop = 0 chunck = 200000 first_line = infd.readline().strip(codecs.BOM_UTF8).strip() + '/n' lines.append(first_line) for line in infd: clean_line = line.decode('gb18030').encode('utf8') clean_line = clean_line.rstrip() + '/n' lines.append(clean_line) if len(lines) == chunck: outfd.writelines(lines) lines = [] loop += 1 logging.info('processed %s lines.', loop * chunck) outfd.writelines(lines) logging.info('processed %s lines.', loop * chunck + len(lines))@click.group()def cli(): logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')@cli.command('gbk_to_utf8')@click.argument('f')def convert_gbk_to_utf8(f): convert_file_to_utf8(f)@cli.command('load')@click.option('-t', '--table', required=True, help='表名')@click.option('-i', '--filename', required=True, help='輸入文件')@click.option('-w', '--workers', default=10, help='worker 數量,默認 10')def load_fac_day_pro_nos_sal_table(table, filename, workers): with open(filename) as fd: fd.readline() # skip header reader = csv.reader(fd) insert_parallel(table, reader, w=workers)if __name__ == '__main__': cli()以上就是本文給大家分享的全部沒人了,希望大家能夠喜歡
新聞熱點
疑難解答
圖片精選