!pip install python-docx
import csv
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every request: a desktop-browser User-Agent so the
# prefecture site serves the normal HTML page rather than a bot-blocked one.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}
def get_link():
    """Return the URL of the latest patient-announcement page.

    Scrapes the Osaka prefecture COVID-19 index page, finds the h3 heading
    for patient announcements, and takes the first following link whose
    text starts with the announcement prefix.

    Returns:
        str: absolute URL of the announcement page.

    Raises:
        RuntimeError: if the heading or link is missing (layout changed).
        requests.HTTPError: if the index page cannot be fetched.
    """
    url = "http://www.pref.osaka.lg.jp/iryo/osakakansensho/corona.html"
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    h3 = soup.find("h3", string="新型コロナウイルスに関連した患者の発生等について")
    if h3 is None:
        raise RuntimeError("announcement heading not found; page layout may have changed")
    a = h3.find_next("a", string=re.compile("^新型コロナウイルス感染症患者"))
    if a is None:
        raise RuntimeError("announcement link not found; page layout may have changed")
    # hrefs on this site may be server-relative; resolve against the index URL
    # so callers always receive a fetchable absolute URL.
    return urljoin(url, a.get("href"))
# Fetch the announcement page and locate the "別紙" (appendix) link pointing
# at the .docx file that contains the patient table.
link = get_link()
r = requests.get(link, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
a = soup.find("a", string="別紙", href=re.compile(r"\.docx$"))
if a is None:
    raise RuntimeError("appendix .docx link not found; page layout may have changed")
# The href may be server-relative; resolve it against the page URL so the
# download step below always gets an absolute URL.
url = urljoin(link, a.get("href"))
# Wordファイルをダウンロード (download the Word file)
# Download the Word file with requests instead of the IPython "!wget" shell
# magic: the magic is a SyntaxError outside a notebook and interpolates $url
# straight into a shell command.  requests and headers are already in scope.
r = requests.get(url, headers=headers)
r.raise_for_status()
with open("osaka.docx", "wb") as f:
    f.write(r.content)
# Wordファイルから表を抽出 (extract the tables from the Word file)
import docx
import csv
# Walk every table row in the .docx and flatten it to one CSV row:
# the first six cells are kept as-is, the remaining cells are merged into a
# single remarks ("備考") column.
doc = docx.Document("osaka.docx")
data = []
for table in doc.tables:
    for row in table.rows:
        values = [cell.text for cell in row.cells]
        # Merged cells in the source table repeat the same text, so
        # de-duplicate the remark cells while preserving first-seen order
        # (dict.fromkeys keeps insertion order; replaces the quadratic
        # sorted(set(...), key=list.index) idiom).
        bikou = "\n".join(dict.fromkeys(values[6:]))
        data.append(values[:6] + [bikou])

# newline="" is required by the csv module (otherwise blank rows appear on
# Windows); write UTF-8 explicitly instead of relying on the locale default.
with open("data.csv", "w", newline="", encoding="utf-8") as fw:
    writer = csv.writer(fw)
    writer.writerows(data)
# データラングリング (data wrangling)
import pandas as pd
# Load the flattened table and collapse duplicate rows (produced by merged
# docx cells) into one row per patient, joining differing remark values
# with newlines.
df = pd.read_csv("data.csv", header=0)
df.head()
df1 = (
    df.groupby(["番号", "年代", "性別", "居住地", "症状", "発症日"])
    # Empty CSV cells are read back as float NaN, which would make
    # "\n".join raise TypeError — drop them and coerce the rest to str.
    .agg(lambda s: "\n".join(s.dropna().astype(str)))
    .reset_index()
)
# index=False: the integer index is an artifact, not data.
# utf_8_sig adds a BOM so Excel opens the Japanese text correctly.
df1.to_csv("osaka.csv", index=False, encoding="utf_8_sig")