drive.google.com
Date corrections
Row numbers below are 0-based positions in each file (a sketch for applying these fixes follows the second table).
syugin147.csv
| Row | Session-No. | Issue | Correction |
|------|---------|-------------------------------------|------------------------|
| 267 | 7-120 | Text does not end with 「提出」 | 昭和二十五年四月六日 |
| 1207 | 75-27 | Text does not end with 「提出」 | 昭和五十年七月四日 |
| 1399 | 84-63 | 「年」 is garbled as 「有」 | 昭和五十三年六月十六日 |
| 1436 | 87-14 | 「昭和」 is garbled as 「昭利」 | 昭和五十四年三月二十二日 |
| 2345 | 136-18 | Withdrawn (撤回) | |
| 2573 | 145-18 | 「平成」 is garbled as 「平灰」 | 平成十一年三月十六日 |
syugin200.csv
| Row | Session-No. | Issue | Correction |
|------|---------|------------------|---|
| 5511 | 176-144 | Withdrawn (撤回) | |
| 5738 | 177-129 | Withdrawn (撤回) | |
| 5740 | 177-131 | Withdrawn (撤回) | |
| 5745 | 177-136 | Withdrawn (撤回) | |
| 5746 | 177-137 | Withdrawn (撤回) | |
| 5748 | 177-139 | Withdrawn (撤回) | |
| 5750 | 177-141 | Withdrawn (撤回) | |
| 5751 | 177-142 | Withdrawn (撤回) | |
| 8038 | 190-222 | Withdrawn (撤回) | |
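One way to apply these fixes, as referenced above: a minimal sketch assuming pandas is available, that the 0-based row numbers index the data rows (header excluded, so `df.iloc` applies), and that the correction simply overwrites the `date` column; the `_fixed` output filenames are my own.

```python
# Sketch: patch the date column of the scraped CSVs using the tables above.
import pandas as pd

# Row -> corrected date; None marks a withdrawn question with no submission date.
# (Whether to keep a trailing 「提出」 on the stored value is left to taste.)
fixes_147 = {
    267: "昭和二十五年四月六日",
    1207: "昭和五十年七月四日",
    1399: "昭和五十三年六月十六日",
    1436: "昭和五十四年三月二十二日",
    2345: None,
    2573: "平成十一年三月十六日",
}
fixes_200 = {r: None for r in (5511, 5738, 5740, 5745, 5746, 5748, 5750, 5751, 8038)}

for path, fixes in [("syugin147.csv", fixes_147), ("syugin200.csv", fixes_200)]:
    df = pd.read_csv(path)
    date_col = df.columns.get_loc("date")
    for row, date in fixes.items():
        df.iloc[row, date_col] = date
    df.to_csv(path.replace(".csv", "_fixed.csv"), index=False)
```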
Error locations
www.shugiin.go.jp
- Confirmed garbled characters in the answer document for 154-082 (the circled numbers ①〜⑤ are garbled)
- Ran a text diff on everything else, but could not pin down any other error locations; a scan like the sketch below may help narrow the search
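A quick way to look for decode damage directly; this is a sketch, assuming the column layout of the CSVs written by the script below, and it only catches U+FFFD (REPLACEMENT CHARACTER), not other kinds of mojibake:

```python
# Sketch: scan a saved CSV for U+FFFD in the question/answer texts and
# report the row, session, and question number where it appears.
import csv

csv.field_size_limit(10**7)  # question/answer texts can be very long

with open("syugin200.csv", newline="", encoding="utf-8") as f:
    for row_no, row in enumerate(csv.DictReader(f)):
        for col in ("stext", "ttext"):
            if row.get(col) and "\ufffd" in row[col]:
                print(row_no, row["kaiji"], row["number"], col)
```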
Program
```python
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm  # replaces the deprecated tqdm_notebook

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}


def get_info(url):
    """Fetch a question/answer page and return its visible text, one line per block."""
    time.sleep(3)  # be polite to the server
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    info = soup.find("div", {"id": "mainlayout"})
    # Remove breadcrumb navigation and the link to the PDF version.
    breadcrumb = info.find("div", {"id": "breadcrumb"})
    if breadcrumb:
        breadcrumb.decompose()
    for a in info.find_all("a", {"title": "質問本文(PDF)へ"}):
        a.parent.decompose()
    for script in info(["script", "style"]):
        script.decompose()
    # Strip every line and drop the blank ones.
    return "\n".join(line for line in map(str.strip, info.text.splitlines()) if line)


if __name__ == "__main__":
    result = []
    for i in tqdm(range(1, 201)):  # Diet sessions 1-200
        print(i)
        # The index-page path changed at session 148
        # (itdb_shitsumona.nsf -> itdb_shitsumon.nsf).
        if i < 148:
            link = f"http://www.shugiin.go.jp/internet/itdb_shitsumona.nsf/html/shitsumon/kaiji{i:03}_l.htm"
        else:
            link = f"http://www.shugiin.go.jp/internet/itdb_shitsumon.nsf/html/shitsumon/kaiji{i:03}_l.htm"
        time.sleep(3)
        r = requests.get(link, headers=headers)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html.parser")
        trs = soup.select("table#shitsumontable tr")
        for tr in tqdm(trs):
            tds = tr.select("td")
            if not tds:
                continue  # header row has no td cells
            data = {
                "kaiji": i,
                "number": int(tds[0].get_text(strip=True)),
                "kenmei": tds[1].get_text(strip=True),
                "status": tds[2].get_text(strip=True),
                # Remaining columns: links to the question/answer pages and PDFs.
                "klink": urljoin(link, tds[3].a.get("href")) if tds[3].a else None,
                "slink": urljoin(link, tds[4].a.get("href")) if tds[4].a else None,
                "slink_pdf": urljoin(link, tds[5].a.get("href")) if tds[5].a else None,
                "tlink": urljoin(link, tds[6].a.get("href")) if tds[6].a else None,
                "tlink_pdf": urljoin(link, tds[7].a.get("href")) if tds[7].a else None,
            }
            data["stext"] = get_info(data["slink"]) if data["slink"] else None
            data["date"] = None
            data["name"] = None
            print(data["kenmei"])
            if data["stext"]:
                # The question text ends with a submission line such as
                # 「昭和二十五年四月六日提出」 followed by 「提出者 …」.
                for s in data["stext"].splitlines():
                    if s.endswith("提出"):
                        data["date"] = s
                    if s.startswith("提出者"):
                        data["name"] = s.replace("提出者", "").strip()
                    if data["date"] and data["name"]:
                        break
            data["ttext"] = get_info(data["tlink"]) if data["tlink"] else None
            result.append(data)
        # Cumulative checkpoint after each session: syugin147.csv and
        # syugin200.csv above are the checkpoints for sessions 1-147 and 1-200.
        with open(f"syugin{i:03}.csv", "w", newline="", encoding="utf-8") as fw:
            fieldnames = [
                "kaiji", "number", "date", "kenmei", "name", "status",
                "stext", "ttext", "klink", "slink", "tlink",
                "slink_pdf", "tlink_pdf",
            ]
            writer = csv.DictWriter(fw, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(result)
```
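The first two rows of the syugin147.csv table above follow directly from this extraction rule: when no line of the question text ends with 「提出」, `date` stays `None` and has to be filled in by hand. A toy run of the same loop, with made-up sample text:

```python
# Made-up sample in the shape the script expects (the name is fictional).
stext = "○○に関する質問主意書\n昭和二十五年四月六日提出\n提出者 山田太郎"
date = name = None
for s in stext.splitlines():
    if s.endswith("提出"):
        date = s
    if s.startswith("提出者"):
        name = s.replace("提出者", "").strip()
print(date)  # 昭和二十五年四月六日提出
print(name)  # 山田太郎
```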