Excelをデーターラングリングスピード比較(openpyxl・xlrd・Pandas)

xlrdが一番速い、openpyxlでセルはちゃんと動いているのか心配になるぐらい遅い

ws.valuesとiter_colsとiter_rowsはあまりかわらない

列が2列だけだと思ってiter_colsを2つにすると約倍遅い、全体を取る方がいい

xlrd < ws.values = iter_cols = iter_rows < Pandas < cell

4.72 < 10.9 = 11 = 11.3 < 32.3 < 1h 7min 12s

# セル cell
CPU times: user 1h 7min 7s, sys: 5.18 s, total: 1h 7min 12s
Wall time: 1h 7min 15s

# 列 iter_cols
CPU times: user 10.9 s, sys: 100 ms, total: 11 s
Wall time: 11 s

# 行 iter_rows
CPU times: user 11.1 s, sys: 106 ms, total: 11.3 s
Wall time: 11.3 s

# 行 ws.values
CPU times: user 10.8 s, sys: 98 ms, total: 10.9 s
Wall time: 10.9 s

# xlrd
CPU times: user 4.7 s, sys: 15 ms, total: 4.72 s
Wall time: 4.73 s

# Pandas
CPU times: user 32.2 s, sys: 53 ms, total: 32.3 s
Wall time: 32.3 s
import csv
import openpyxl

セルひとつずつ

def cell_run():

    wb = openpyxl.load_workbook("data.xlsx", read_only=True, data_only=True)

    result = []

    sheets = [n for n in wb.sheetnames if n.startswith("生命表")]

    for sheet in sheets:

        ws = wb[sheet]

        for i in range(1, ws.max_row, 49):

            v = ws.cell(row=i + 2, column=7).value

            if v == "平均余命":

                city = ws.cell(row=i + 1, column=2).value
                male = ws.cell(row=i + 5, column=7).value
                female = ws.cell(row=i + 27, column=7).value

                result.append([city, male, female])

            else:
                print(i)

    with open("result_cell.csv", "w") as fw:
        writer = csv.writer(fw, dialect="excel", lineterminator="\n")
        writer.writerows(result)

%time cell_run()

iter_cols

def col_run():

    wb = openpyxl.load_workbook("data.xlsx", data_only=True)

    result = []

    # sheets = [n for n in wb.sheetnames if n.startswith("生命表")]
    sheets = wb.sheetnames[1:]

    for sheet in sheets:

        ws = wb[sheet]

        col = [
            i
            for i in ws.iter_cols(
                min_row=1, max_row=ws.max_row, min_col=1, max_col=7, values_only=True
            )
        ]

        for i in range(0, ws.max_row, 49):

            v = col[6][i + 2]

            if v == "平均余命":

                city = col[1][i + 1]
                male = col[6][i + 5]
                female = col[6][i + 27]

                result.append([city, male, female])

            else:
                print(i)

    with open("result_col.csv", "w") as fw:
        writer = csv.writer(fw, dialect="excel", lineterminator="\n")

        writer.writerows(result)

%time col_run()

iter_rows

def row_run():

    wb = openpyxl.load_workbook("data.xlsx", data_only=True)

    result = []

    # sheets = [n for n in wb.sheetnames if n.startswith("生命表")]
    sheets = wb.sheetnames[1:]

    for sheet in sheets:

        ws = wb[sheet]

        row = [
            i
            for i in ws.iter_rows(
                min_row=1, max_row=ws.max_row, min_col=1, max_col=7, values_only=True
            )
        ]

        for i in range(0, ws.max_row, 49):

            v = row[i + 2][6]

            if v == "平均余命":

                city = row[i + 1][1]
                male = row[i + 5][6]
                female = row[i + 27][6]

                result.append([city, male, female])

            else:
                print(i)

    with open("result_row.csv", "w") as fw:
        writer = csv.writer(fw, dialect="excel", lineterminator="\n")

        writer.writerows(result)

%time row_run()

行 ws.values

def row2_run():

    wb = openpyxl.load_workbook("data.xlsx", data_only=True)

    result = []

    # sheets = [n for n in wb.sheetnames if n.startswith("生命表")]
    sheets = wb.sheetnames[1:]

    for sheet in sheets:

        ws = wb[sheet]

        row = [row for row in ws.values]

        for i in range(0, ws.max_row, 49):

            v = row[i + 2][6]

            if v == "平均余命":

                city = row[i + 1][1]
                male = row[i + 5][6]
                female = row[i + 27][6]

                result.append([city, male, female])

            else:
                print(i)

    with open("result_row2.csv", "w") as fw:
        writer = csv.writer(fw, dialect="excel", lineterminator="\n")

        writer.writerows(result)

%time row2_run()

xlrd

import xlrd

def xlrd_run():

    wb = xlrd.open_workbook("data.xlsx")
    sheet_names = wb.sheet_names()[1:]

    result = []

    for sheet_name in sheet_names:
        sheet = wb.sheet_by_name(sheet_name)

        n = sheet.nrows

        data = [sheet.row_values(row) for row in range(sheet.nrows)]

        for i in range(0, n, 49):

            v = data[i + 2][6]

            if v == "平均余命":

                city = data[i + 1][1]
                male = data[i + 5][6]
                female = data[i + 27][6]

                result.append([male, female])

            else:
                print(i)

        with open("result_xlrd.csv", "w") as fw:
            writer = csv.writer(fw, dialect="excel", lineterminator="\n")

            writer.writerows(result)

%time xlrd_run()

Pandas

import pandas as pd

def panda_run():

    data = []

    for i in range(1, 8):

        df1_tmp = pd.read_excel("data.xlsx", sheet_name=f"生命表{i}", header=None)

        row, _ = df1_tmp.shape

        for n in range(0, row, 49):

            city = df1_tmp.iat[n + 1, 1]
            male = df1_tmp.iat[n + 5, 6]
            female = df1_tmp.iat[n + 27, 6]

            data.append([city, male, female])

    df = pd.DataFrame(data)

    df.to_csv("result_panda.csv")


%time panda_run()