# camelot does not work as-is, so install this pinned pdfminer.six version first:
#   pip install pdfminer.six==20200517
"""Download the My Number card issuance PDFs and convert their tables to CSV.

The script reads the index page at soumu.go.jp, follows every PDF link whose
text starts with "マイナンバーカード交付状況", saves each PDF under ``OUT_DIR``
and writes the tables extracted by camelot into a directory named after the
publication date (ISO format).
"""
import datetime
import pathlib
import re
import time
from urllib.parse import urljoin

import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
from japanera import EraDate, EraDateTime, Japanera
from japanmap import pref_code
from tqdm.notebook import tqdm

OUT_DIR = "download"

# Era marker (H/R/平成/令和), era year ("元" allowed), month, day.
_WAREKI_PATTERN = re.compile(
    r"(H|R|平成|令和)([0-9元]{1,2})[.年]([0-9]{1,2})[.月]([0-9]{1,2})日?"
)

# Characters that still differ from the ordinary kanji after NFKC
# normalization and are replaced explicitly.
# A wider mapping that was kept around as an alternative:
#   str.maketrans("⺟⺠⻁⻄⻑⻘⻤⻨⻩⻫⻭⻯⻲戶黑", "母民虎西長青鬼麦黄斉歯竜亀戸黒")
_CJK_TABLE = str.maketrans("⻲⻑黑戶⻯⻄⻘⻤", "亀長黒戸竜西青鬼")


def mynumber_pdf(tag):
    """Return True for ``<a>`` tags that link to an issuance-status PDF.

    Intended as a filter for ``BeautifulSoup.find_all``: the link text must
    start with "マイナンバーカード交付状況" and the ``href`` must end in ".pdf".
    Anchors without an ``href`` are rejected instead of raising.
    """
    if tag.name != "a":
        return False
    if not tag.get_text(strip=True).startswith("マイナンバーカード交付状況"):
        return False
    href = tag.get("href")
    return isinstance(href, str) and href.endswith(".pdf")


def wareki2date(s):
    """Convert the first Japanese-era date found in *s* to a date.

    Accepts the era as "H", "R", "平成" or "令和", with "." or 年/月/日 as
    separators; "元" is read as 1.

    Returns:
        The value of ``.date()`` on the first result of ``Japanera.strptime``.

    Raises:
        ValueError: if *s* contains no recognizable era date.
    """
    m = _WAREKI_PATTERN.search(s)
    if m is None:
        raise ValueError(f"no Japanese-era date found in {s!r}")

    year, month, day = [1 if i == "元" else int(i) for i in m.group(2, 3, 4)]

    janera = Japanera()
    wareki = m.group(1)
    if wareki in ("H", "R"):
        # Resolve the one-letter abbreviation to the era's kanji name; the
        # last of the sorted matches is used.
        matches = janera.era_match(
            wareki, lambda x: x.english_head, lambda x, y: x == y
        )
        wareki = sorted(matches)[-1].kanji

    ws = f"{wareki}{year:02}年{month:02}月{day:02}日"
    dt = janera.strptime(ws, r"%-E%-o年%m月%d日")[0]
    return dt.date()


def get_pdf(url, file_name):
    """Download *url* to ``OUT_DIR/<file_name>.pdf`` and return that path.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never stored as a PDF.
    """
    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(OUT_DIR, file_name + ".pdf")
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_bytes(r.content)
    return p


def _to_int(series):
    """Drop thousands separators from a string column and cast to int."""
    return series.str.replace(",", "").astype(int)


def _to_rate(series):
    """Drop trailing percent signs from a string column and cast to float."""
    return series.str.rstrip("%").astype(float)


def _fix_names(series):
    """NFKC-normalize a name column, then apply the explicit kanji fixes."""
    return series.str.normalize("NFKC").apply(lambda s: s.translate(_CJK_TABLE))


def _stamp_dates(df, dt_now, dt_jinkou, dt_koufu):
    """Add the population, issuance and publication date columns in place."""
    df["人口算出基準日"] = dt_jinkou
    df["交付件数基準日"] = dt_koufu
    df["公開日"] = dt_now


def _export_summary(tables, dt_now, out_dir):
    """Write the by-organization-type summary (first table) to CSV."""
    raw = tables[0].df
    dt_jinkou = wareki2date(raw.iat[0, 2])
    dt_koufu = wareki2date(raw.iat[0, 3])

    df = raw.iloc[1:].copy()
    df.columns = ["区分", "", "人口", "交付枚数", "人口に対する交付枚数率"]
    df["人口"] = _to_int(df["人口"])
    df["交付枚数"] = _to_int(df["交付枚数"])
    df["人口に対する交付枚数率"] = _to_rate(df["人口に対する交付枚数率"])
    _stamp_dates(df, dt_now, dt_jinkou, dt_koufu)

    df = df.reindex(
        columns=["公開日", "区分", "", "人口", "交付枚数", "人口に対する交付枚数率", "人口算出基準日", "交付件数基準日"]
    )
    df.to_csv(str(out_dir / "summary_by_types.csv"), index=False)


def _export_prefectures(tables, dt_now, out_dir):
    """Write the prefecture list (tables 3 and 4) to CSV, ordered by code."""
    dt_jinkou = wareki2date(tables[3].df.iat[0, 1])
    dt_koufu = wareki2date(tables[3].df.iat[0, 2])

    df = pd.concat([table.df.loc[1:] for table in tables[3:5]])
    df.columns = ["都道府県名", "総数(人口)", "交付枚数", "交付率"]
    df["都道府県名"] = _fix_names(df["都道府県名"])
    df["総数(人口)"] = _to_int(df["総数(人口)"])
    df["交付枚数"] = _to_int(df["交付枚数"])
    df["交付率"] = _to_rate(df["交付率"])
    _stamp_dates(df, dt_now, dt_jinkou, dt_koufu)

    df = df.reindex(
        columns=["公開日", "都道府県名", "総数(人口)", "交付枚数", "交付率", "人口算出基準日", "交付件数基準日"]
    )
    # The prefecture code is used only for ordering; it is the index and the
    # CSV is written with index=False, so it does not appear in the output.
    df["コード"] = df["都道府県名"].apply(pref_code)
    df = df.set_index("コード").sort_index()
    df.to_csv(str(out_dir / "all_prefectures.csv"), index=False)


def _export_demographics(tables, dt_now, out_dir):
    """Write the sex/age breakdown (table 5) to CSV; values stay strings."""
    raw = tables[5].df
    dt_jinkou = wareki2date(raw.iat[0, 1])
    dt_koufu = wareki2date(raw.iat[0, 4])

    df = raw.iloc[2:].copy()
    df.columns = [
        "年齢",
        "人口(男)",
        "人口(女)",
        "人口(計)",
        "交付件数(男)",
        "交付件数(女)",
        "交付件数(計)",
        "交付率(男)",
        "交付率(女)",
        "交付率(計)",
        "全体に対する交付件数割合(男)",
        "全体に対する交付件数割合(女)",
        "全体に対する交付件数割合(計)",
    ]
    # Only strip the formatting characters; no numeric cast is applied here.
    df = df.applymap(lambda s: s.rstrip("%").replace(",", ""))
    _stamp_dates(df, dt_now, dt_jinkou, dt_koufu)

    df = df.reindex(
        columns=[
            "公開日",
            "年齢",
            "人口(男)",
            "人口(女)",
            "人口(計)",
            "交付件数(男)",
            "交付件数(女)",
            "交付件数(計)",
            "交付率(男)",
            "交付率(女)",
            "交付率(計)",
            "全体に対する交付件数割合(男)",
            "全体に対する交付件数割合(女)",
            "全体に対する交付件数割合(計)",
            "人口算出基準日",
            "交付件数基準日",
        ]
    )
    df.to_csv(str(out_dir / "demographics.csv"), index=False)


def _export_localgovs(tables, first, dt_now, out_dir):
    """Write the municipality list (tables from index *first* on) to CSV."""
    dt_jinkou = wareki2date(tables[first].df.iat[0, 2])
    dt_koufu = wareki2date(tables[first].df.iat[0, 3])

    df = pd.concat([table.df.iloc[1:] for table in tables[first:]])
    df.columns = ["都道府県名", "市区町村名", "総数(人口)", "交付枚数", "交付率"]

    # Drop the nationwide total rows; copy so the assignments below operate
    # on an independent frame rather than on a filtered view.
    df = df[df["都道府県名"] != "全国"].copy()

    df["都道府県名"] = _fix_names(df["都道府県名"])
    df["市区町村名"] = _fix_names(df["市区町村名"])
    df["総数(人口)"] = _to_int(df["総数(人口)"])
    df["交付枚数"] = _to_int(df["交付枚数"])
    df["交付率"] = _to_rate(df["交付率"])
    _stamp_dates(df, dt_now, dt_jinkou, dt_koufu)

    df = df.reindex(
        columns=[
            "公開日",
            "都道府県名",
            "市区町村名",
            "総数(人口)",
            "交付枚数",
            "交付率",
            "人口算出基準日",
            "交付件数基準日",
        ]
    )
    df.to_csv(str(out_dir / "all_localgovs.csv"), index=False)


def main():
    """Fetch the index page and convert every linked issuance-status PDF."""
    url = "https://www.soumu.go.jp/kojinbango_card/"

    # Download the index page and collect the PDF links.
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    tags = soup.find_all(mynumber_pdf)

    for tag in tqdm(tags):
        link = urljoin(url, tag.get("href"))
        # The publication date is taken from the link text.
        dt_now = wareki2date(tag.get_text(strip=True))
        pdf = get_pdf(link, dt_now.isoformat())

        tables = camelot.read_pdf(
            str(pdf), pages="all", split_text=True, strip_text="\n", line_scale=40
        )

        out_dir = pathlib.Path(dt_now.isoformat())
        out_dir.mkdir(parents=True, exist_ok=True)

        _export_summary(tables, dt_now, out_dir)
        _export_prefectures(tables, dt_now, out_dir)

        # PDFs published after 2017-03-08 are treated as having a sex/age
        # table at index 5, which pushes the municipality tables to index 6.
        # NOTE(review): the demographics export is tied to that condition
        # because otherwise tables[5] would be parsed both as the 13-column
        # demographics table and as the first municipality table — confirm
        # against an older PDF.
        n = 5
        if dt_now > datetime.date(2017, 3, 8):
            n = 6
            _export_demographics(tables, dt_now, out_dir)

        _export_localgovs(tables, n, dt_now, out_dir)

        # Pause between downloads.
        time.sleep(3)


if __name__ == "__main__":
    main()