Colaboratoryで公開
PDFダウンロードからTSV変換まで
colab.research.google.com
import pandas as pd
from tabula import read_pdf
# Parse pages 3 through max_page of the PDF into a single DataFrame.
max_page = 6360
pages = f"3-{max_page}"

# multiple_tables=False keeps the whole page range as one table and makes
# tabula-py hand pandas_options (the column names) to pandas.read_csv.
# It was the default in tabula-py 1.x; from 2.0 the default is True, under
# which "names" is not applied this way.
df = read_pdf(
    "kameiten_touroku_list.pdf",
    lattice=True,
    pages=pages,
    multiple_tables=False,
    java_options=["-Xmx2G"],  # JVM flag: allow up to 2 GB of heap
    pandas_options={
        "names": [
            "No.", "都道府県", "市区町村", "事業所名(屋号)", "業種", "区分", "還元率"
        ],
    },
)

# tabula-py 2.x wraps the result in a list, whereas 1.x returned the
# DataFrame itself. Normalise to one DataFrame so the rest of the script
# works with either version.
if isinstance(df, list):
    df = pd.concat(df, ignore_index=True)

print(df)
# Discard rows whose "No." cell is the literal header text "No."
# (presumably the table header repeated inside the parsed data).
body_rows = df["No."].ne("No.")
df = df.loc[body_rows]

# Every row whose "No." is the string "1" starts a new section: the running
# count of such rows gives each row its section id (1, 2, 3, ...).
section_id = df["No."].eq("1").cumsum()
grouped_df = df.groupby(section_id)
# Section 1: keep every column except the running number, then renumber
# the rows starting from 1.
df1 = grouped_df.get_group(1).drop(columns="No.").reset_index(drop=True)
df1.index = df1.index + 1
print(df1)
def _ec_section(groups, group_no, site_label):
    """Return section *group_no* reshaped to the EC-listing column layout.

    For these sections only the values parsed into the "都道府県" and
    "市区町村" columns are kept; they are relabelled as "事業所名(屋号)" and
    "還元率" respectively. "業種" is set to the fixed value "EC・通信販売" and
    "区分" to *site_label*. Rows are renumbered starting from 1.

    Args:
        groups: The groupby object keyed by section id.
        group_no: Section id to extract.
        site_label: Value written into the "区分" column.

    Returns:
        A new DataFrame with columns
        "事業所名(屋号)", "還元率", "業種", "区分".
    """
    section = groups.get_group(group_no).drop(
        columns=["No.", "事業所名(屋号)", "業種", "区分", "還元率"]
    )
    section = section.rename(
        columns={"都道府県": "事業所名(屋号)", "市区町村": "還元率"}
    )
    section["業種"] = "EC・通信販売"
    section["区分"] = site_label
    section = section.reset_index(drop=True)
    section.index += 1  # 1-based row numbers within the section
    return section


# Sections 2-4 share one layout and differ only in the "区分" label.
df2 = _ec_section(grouped_df, 2, "楽天市場")
print(df2)

df3 = _ec_section(grouped_df, 3, "Yahoo!ショッピング")
print(df3)

df4 = _ec_section(grouped_df, 4, "その他ECサイト")
print(df4)
# Stack the four sections, keeping the column order of their first
# appearance (sort=False) and each section's own 1-based row numbers.
sections = [df1, df2, df3, df4]
df_all = pd.concat(sections, sort=False)

# Turn the "還元率" strings into floats by removing any trailing "%".
rate_text = df_all["還元率"].str.rstrip("%")
df_all["還元率"] = rate_text.astype(float)
print(df_all)

# Export as tab-separated text; the row index is written as the first column.
df_all.to_csv("kameiten_touroku_list.tsv", sep="\t")