www.j-magazine.or.jp
一般社団法人日本雑誌協会より印刷部数公表をスクレイピング
https://www.j-magazine.or.jp/user/printed/index/XX/YY
XX:期間、YY:雑誌種類
少年向けコミック誌
https://www.j-magazine.or.jp/user/printed/index/XX/14
女性週刊誌
https://www.j-magazine.or.jp/user/printed/index/XX/16
プログラム
!pip install -U pandas
!pip install japanize-matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import japanize_matplotlib
from tqdm import tqdm_notebook
import requests
from bs4 import BeautifulSoup
import time
import datetime
import re
# Index page of the print-run disclosure; the period <select> on this page
# is what the scraping loop further down iterates over.
url = "https://www.j-magazine.or.jp/user/printed/index"

# Browser-like User-Agent sent with the index request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# requests never times out on its own, so set an explicit limit: a stalled
# connection then raises instead of hanging the notebook forever.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

# Parse the raw bytes so the parser performs its own encoding detection.
soup = BeautifulSoup(r.content, "html5lib")
# One DataFrame per reporting period, concatenated after the loop.
dfs = []

# Period labels look like "2008年4月~2008年6月". The separator is accepted as
# an ASCII tilde, a fullwidth tilde or a wave dash, so the label still parses
# whichever of these the page happens to use.
p = re.compile(r"(\d{4})年(\d{1,2})月[~～〜](\d{4})年(\d{1,2})月")

for option in tqdm_notebook(soup.select("select#period_cd_top > option")):
    v = option.get("value")
    t = option.get_text(strip=True)

    m = p.match(t)
    if m is None or not v:
        # Placeholder or unexpected <option>: there is no period to request,
        # and calling m.groups() / v[0] on it would raise.
        print(f"skipped option: {t!r}")
        continue

    s_year, s_month, e_year, e_month = (int(g) for g in m.groups())

    # Category 14 is the boys' comic magazine listing (see the notes at the
    # top of this file).
    # NOTE(review): only the FIRST CHARACTER of the option value is used as
    # the period code. If the values are multi-digit numbers this requests
    # the wrong period -- confirm against the page and use `v` if so.
    df_tmp = pd.concat(
        pd.read_html(
            f"https://www.j-magazine.or.jp/user/printed/index/{v[0]}/14",
            na_values="※該当するデータがありません。",
        )
    )

    # Tag every row with the period label and the first day of its start month.
    df_tmp["期間"] = t
    df_tmp["date"] = datetime.date(s_year, s_month, 1)
    dfs.append(df_tmp)

    # Pause between requests to keep the load on the server low.
    time.sleep(3)

df = pd.concat(dfs)

# Rows containing any missing value (including cells that matched the
# "no data" marker passed as na_values) are dropped.
df.dropna(inplace=True)
df
# Reshape the long table into wide form: one column per magazine title,
# each cell holding the value of the print-run column for that row key.
value_col = "印刷証明付き発行部数"
title_col = "雑誌名"

# Rows keyed by the period label text.
pv1 = df.pivot(values=value_col, columns=title_col, index="期間")
pv1
pv1.to_markdown()

# Rows keyed by the period start date; this is the table that gets plotted.
pv2 = df.pivot(values=value_col, columns=title_col, index="date")
pv2
# Raise the default figure resolution before anything is drawn.
mpl.rc("figure", dpi=200)

# One line per magazine title, plotted against the period start date.
ax = pv2.plot()

# Anchor the legend just outside the right edge of the axes so it does not
# cover the lines.
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)

# bbox_inches="tight" keeps the outside legend inside the saved image.
ax.get_figure().savefig("01.png", dpi=200, bbox_inches="tight")
plt.show()