3/10からExcelファイルに変更されました
!pip install python-docx
import re
from urllib.parse import urljoin

import jaconv
import requests
from bs4 import BeautifulSoup

# Locate the .docx report linked from the Osaka prefecture COVID-19 page and
# expose its absolute URL as `link` for the download step that follows.

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

url = "http://www.pref.osaka.lg.jp/iryo/osakakansensho/corona.html"

# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# The report link is searched for after this exact section heading.
h3 = soup.find("h3", string="新型コロナウイルスに関連した患者の発生等について")
if h3 is None:
    raise RuntimeError(f"Section heading not found on {url}; the page layout may have changed")

# First anchor after the heading whose text starts with the report title and
# whose href ends in ".docx".  Raw strings keep the regex escapes valid.
report_anchor = h3.find_next(
    "a",
    string=re.compile(r"^大阪府における新型コロナウイルス感染症患者の発生状況"),
    href=re.compile(r"\.docx$"),
)
if report_anchor is None:
    raise RuntimeError(f"No .docx report link found after the section heading on {url}")

href = report_anchor.get("href")

# The href may be relative; resolve it against the page URL.
link = urljoin(url, href)
!wget $link -O osaka.docx
import csv

import docx

# Flatten every table in the downloaded Word document into rows of cell text
# and dump them to data.csv for the pandas step.
doc = docx.Document("osaka.docx")

data = []

for table in doc.tables:
    # Each cell's text is trimmed and passed through jaconv.z2h (with
    # digit=True) to normalise full-width characters to half-width.
    # `jaconv` is imported by the scraping cell.
    data.extend(
        [jaconv.z2h(cell.text.strip(), digit=True) for cell in row.cells]
        for row in table.rows
    )

# newline="" is required by the csv module: without it, newlines embedded in
# cell text can be mangled and extra blank rows appear on Windows.
# utf_8_sig writes a BOM at the start of the file.
with open("data.csv", "w", encoding="utf_8_sig", newline="") as fw:
    writer = csv.writer(fw)
    writer.writerows(data)
import datetime

import pandas as pd


def parse_month_day(values, year=2020):
    """Parse "M月D日" strings into timestamps in the given year.

    The source strings carry no year.  Parsing them with a year-less format
    makes the parser assume 1900, which is not a leap year, so "2月29日" is
    rejected and coerced to NaT before the year can be corrected afterwards.
    The year is therefore prepended to the text before parsing.

    Args:
        values: Series of date strings such as "3月1日"; missing or
            unparseable entries are allowed.
        year: Calendar year to attach to every date.

    Returns:
        A datetime Series aligned with ``values``; entries that cannot be
        parsed become NaT.
    """
    # astype(str) turns missing values into text that fails to parse, so they
    # still end up as NaT via errors="coerce".
    dated = f"{year}年" + values.astype(str)
    return pd.to_datetime(dated, format="%Y年%m月%d日", errors="coerce")


# A notebook cell or a script executes as "__main__", so the cell behaves as
# before; the guard only lets the helper above be imported without data.csv.
if __name__ == "__main__":
    # "―" marks an empty field in the source tables; read it as NaN.
    df_tmp = pd.read_csv("data.csv", header=0, na_values="―")

    # Drop rows that merely repeat the header (value "番号" in column "番号").
    df1 = df_tmp[df_tmp["番号"] != "番号"].copy()

    # "居住地" holds two lines; split them into columns "県" and "市".
    df2 = df1["居住地"].str.split("\n", expand=True)
    df2.rename(columns={0: "県", 1: "市"}, inplace=True)
    # Remove surrounding parentheses from the second line.
    # NOTE(review): the strip set repeats "()" - if the data uses full-width
    # parentheses they are not covered; confirm against the real file.
    df2["市"] = df2["市"].str.strip("()()")

    # From "備 考", capture the status keyword following "・" and, optionally,
    # the remainder of the line (kept as the raw "退院日" text).
    df3 = df1["備 考"].str.extract(r"・(退院|入院中|入院調整中|入院予定)(.+$)?", expand=True)
    df3.rename(columns={0: "状況", 1: "退院日"}, inplace=True)
    df3["退院日"] = df3["退院日"].str.strip("()()")

    # Remove line breaks inside the "年代" values.
    df1["年代"] = df1["年代"].str.replace("\n", "", regex=False)

    # The two raw columns are superseded by the derived ones in df2 / df3.
    df1.drop(["居住地", "備 考"], axis=1, inplace=True)

    df = pd.concat([df1, df2, df3], axis=1)

    # Convert the date columns to datetime (year fixed to 2020).
    for date_column in ("報道提供日", "発症日", "退院日"):
        df[date_column] = parse_month_day(df[date_column], year=2020)

    df.to_csv("osaka.csv", encoding="utf_8_sig")
!pip install japanize-matplotlib
import matplotlib.pyplot as plt
import japanize_matplotlib  # imported for its side effect only (presumably Japanese font support - confirm)

# For each column, draw a horizontal bar chart of its value counts, save it
# to the paired PNG file, then display it.  `df` comes from the cleanup cell.
charts = (
    ("市", "01.png"),
    ("性別", "02.png"),
    ("年代", "03.png"),
    ("状況", "04.png"),
)

for column, filename in charts:
    df[column].value_counts().plot.barh()

    # Save the chart
    plt.savefig(filename, dpi=200, bbox_inches="tight")
    plt.show()