平均寿命の箱ひげ図をPandasでデータラングリング2

imabari.hateblo.jp

別のExcelファイルからデータラングリングしてみた

確かに都道府県が入っているから楽でした

!wget "https://www.e-stat.go.jp/stat-search/file-download?statInfId=000031693271&fileKind=0" -O data.xlsx
import pandas as pd

data = []

for i in range(1, 8):

    df1_tmp = pd.read_excel(
        "data.xlsx", sheet_name=f"生命表{i}", header=None, na_values="…"
    )

    row, _ = df1_tmp.shape

    for n in range(0, row, 49):

        city = df1_tmp.iat[n + 1, 1]
        male = df1_tmp.iat[n + 5, 6]
        female = df1_tmp.iat[n + 27, 6]

        data.append([city, male, female])

df = pd.DataFrame(data)

df.columns = ["市区町村", "男", "女"]

df_city = df["市区町村"].str.split(r"\s", expand=True)

df_city.columns = ["都道府県", "市郡", "区町村"]

df1 = pd.concat([df_city, df], axis=1).drop("市区町村", axis=1)

df1["集計"] = 1

df1.loc[df1["市郡"].isnull() & df1["区町村"].isnull(), "集計"] = 0

df1.loc[
    (df1.duplicated(subset=["都道府県", "市郡"], keep=False)) & (df1["区町村"].isnull()), "集計"
] = -1

df1.loc[df1["都道府県"] == "全国", "集計"] = -1

df1.loc[df1["市郡"] == "東京都区部", "集計"] = -1

df1[df1["集計"] == -1]

df2 = df1[df1["集計"] > 0].copy().drop("集計", axis=1).reset_index(drop=True)

df2 = df2.loc[:, ["都道府県", "市郡", "区町村", "男", "女"]]

df2.to_csv("data.csv")