xlrdが一番速い、openpyxlでセルはちゃんと動いているのか心配になるぐらい遅い
ws.valuesとiter_colsとiter_rowsはあまりかわらない
列が2列だけだと思ってiter_colsを2つにすると約倍遅い、全体を取る方がいい
xlrd < ws.values = iter_cols = iter_rows < Pandas < cell
4.72 < 10.9 = 11 = 11.3 < 32.3 < 1h 7min 12s
# セル cell CPU times: user 1h 7min 7s, sys: 5.18 s, total: 1h 7min 12s Wall time: 1h 7min 15s # 列 iter_cols CPU times: user 10.9 s, sys: 100 ms, total: 11 s Wall time: 11 s # 行 iter_rows CPU times: user 11.1 s, sys: 106 ms, total: 11.3 s Wall time: 11.3 s # 行 ws.values CPU times: user 10.8 s, sys: 98 ms, total: 10.9 s Wall time: 10.9 s # xlrd CPU times: user 4.7 s, sys: 15 ms, total: 4.72 s Wall time: 4.73 s # Pandas CPU times: user 32.2 s, sys: 53 ms, total: 32.3 s Wall time: 32.3 s
import csv import openpyxl
セルひとつずつ
def cell_run(): wb = openpyxl.load_workbook("data.xlsx", read_only=True, data_only=True) result = [] sheets = [n for n in wb.sheetnames if n.startswith("生命表")] for sheet in sheets: ws = wb[sheet] for i in range(1, ws.max_row, 49): v = ws.cell(row=i + 2, column=7).value if v == "平均余命": city = ws.cell(row=i + 1, column=2).value male = ws.cell(row=i + 5, column=7).value female = ws.cell(row=i + 27, column=7).value result.append([city, male, female]) else: print(i) with open("result_cell.csv", "w") as fw: writer = csv.writer(fw, dialect="excel", lineterminator="\n") writer.writerows(result) %time cell_run()
列 iter_cols
def col_run(): wb = openpyxl.load_workbook("data.xlsx", data_only=True) result = [] # sheets = [n for n in wb.sheetnames if n.startswith("生命表")] sheets = wb.sheetnames[1:] for sheet in sheets: ws = wb[sheet] col = [ i for i in ws.iter_cols( min_row=1, max_row=ws.max_row, min_col=1, max_col=7, values_only=True ) ] for i in range(0, ws.max_row, 49): v = col[6][i + 2] if v == "平均余命": city = col[1][i + 1] male = col[6][i + 5] female = col[6][i + 27] result.append([city, male, female]) else: print(i) with open("result_col.csv", "w") as fw: writer = csv.writer(fw, dialect="excel", lineterminator="\n") writer.writerows(result) %time col_run()
行 iter_rows
def row_run(): wb = openpyxl.load_workbook("data.xlsx", data_only=True) result = [] # sheets = [n for n in wb.sheetnames if n.startswith("生命表")] sheets = wb.sheetnames[1:] for sheet in sheets: ws = wb[sheet] row = [ i for i in ws.iter_rows( min_row=1, max_row=ws.max_row, min_col=1, max_col=7, values_only=True ) ] for i in range(0, ws.max_row, 49): v = row[i + 2][6] if v == "平均余命": city = row[i + 1][1] male = row[i + 5][6] female = row[i + 27][6] result.append([city, male, female]) else: print(i) with open("result_row.csv", "w") as fw: writer = csv.writer(fw, dialect="excel", lineterminator="\n") writer.writerows(result) %time row_run()
行 ws.values
def row2_run(): wb = openpyxl.load_workbook("data.xlsx", data_only=True) result = [] # sheets = [n for n in wb.sheetnames if n.startswith("生命表")] sheets = wb.sheetnames[1:] for sheet in sheets: ws = wb[sheet] row = [row for row in ws.values] for i in range(0, ws.max_row, 49): v = row[i + 2][6] if v == "平均余命": city = row[i + 1][1] male = row[i + 5][6] female = row[i + 27][6] result.append([city, male, female]) else: print(i) with open("result_row2.csv", "w") as fw: writer = csv.writer(fw, dialect="excel", lineterminator="\n") writer.writerows(result) %time row2_run()
xlrd
import xlrd def xlrd_run(): wb = xlrd.open_workbook("data.xlsx") sheet_names = wb.sheet_names()[1:] result = [] for sheet_name in sheet_names: sheet = wb.sheet_by_name(sheet_name) n = sheet.nrows data = [sheet.row_values(row) for row in range(sheet.nrows)] for i in range(0, n, 49): v = data[i + 2][6] if v == "平均余命": city = data[i + 1][1] male = data[i + 5][6] female = data[i + 27][6] result.append([male, female]) else: print(i) with open("result_xlrd.csv", "w") as fw: writer = csv.writer(fw, dialect="excel", lineterminator="\n") writer.writerows(result) %time xlrd_run()
Pandas
import pandas as pd def panda_run(): data = [] for i in range(1, 8): df1_tmp = pd.read_excel("data.xlsx", sheet_name=f"生命表{i}", header=None) row, _ = df1_tmp.shape for n in range(0, row, 49): city = df1_tmp.iat[n + 1, 1] male = df1_tmp.iat[n + 5, 6] female = df1_tmp.iat[n + 27, 6] data.append([city, male, female]) df = pd.DataFrame(data) df.to_csv("result_panda.csv") %time panda_run()