# (stray "github.com" line from the notebook export — commented out so the script runs)
import datetime
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import japanize_matplotlib
import matplotlib.pyplot as plt
from janome.tokenizer import Tokenizer
from wordcloud import WordCloud
# Fetch the suspicious-person report tables published by the Ehime
# Prefectural Police for 2017-2019 and stack them into a single frame.
tables_2019 = pd.read_html("https://www.police.pref.ehime.jp/fushinsha.htm", header=0)
df_2019 = tables_2019[2]
tables_2018 = pd.read_html("https://www.police.pref.ehime.jp/fushinsha30.htm", header=0)
df_2018 = tables_2018[1]
tables_2017 = pd.read_html("https://www.police.pref.ehime.jp/fushinsha29.htm", header=0)
df_2017 = tables_2017[1]
df = pd.concat([df_2019, df_2018, df_2017])
df.head(10)
# Split the summary column ("概 要") into its structured pieces:
# "(station)…◆type:…◆datetime:…◆place:…◆description:…"
df1 = df["概 要"].str.extract("[((](.+)[))].*◆.+:(.+)◆.+:(.+)◆.+:(.+)◆.+:(.+)")
df1.isnull().sum()
# Inspect rows where the pattern failed entirely.
df[df1.isnull().all(axis=1)]
# Trim surrounding whitespace, then fold full-width characters to ASCII.
df1 = df1.apply(lambda col: col.str.strip()).apply(lambda col: col.str.normalize("NFKC"))
df1.rename(columns={0: "管轄署", 1: "種別", 2: "日時", 3: "場所", 4: "状況"}, inplace=True)
# Normalize noisy variants produced by the source text.
df1["日時"] = df1["日時"].str.replace("ころの", "").str.replace("分分", "分")
df1["種別"] = df1["種別"].str.replace("等", "")
df1["管轄署"] = df1["管轄署"].str.replace("管内", "署").str.replace("署署", "署")
df1.head()
# Parse the free-text "日時" field into era / date / time components.
# NOTE: the pattern is now a raw string — "\d" and "\(" are invalid escape
# sequences in a plain string literal (SyntaxWarning on Python 3.12+).
df2 = df1["日時"].str.extract(
    r"(平成|令和)(.+)年(\d{1,2})月([上中下]旬)?((\d{1,2})日\((.)\))?(午前|午後|昼|夕方)?((\d{1,2})時)?((\d{1,2})分)?(ころ)?"
)
df2.rename(
    columns={
        0: "和暦",
        1: "年",
        2: "月",
        3: "旬",
        5: "日",
        6: "曜日",
        7: "時間帯",
        9: "時",
        11: "分",
    },
    inplace=True,
)
df_temp = df2.loc[:, ["和暦", "年", "月", "日", "曜日", "時間帯", "時", "分"]]
df_temp
df_temp.isnull().sum()
df_temp[df_temp.isnull().any(axis=1)]
# Explicit column assignment instead of Series.mask(..., inplace=True):
# in-place masking of a column selection is a chained assignment, which is
# deprecated in pandas 2.x and a no-op under copy-on-write.
# Era base years: adding the era year gives the Gregorian year
# (令和1 -> 2019, 平成29 -> 2017).
df2["和暦"] = df2["和暦"].mask(df2["和暦"] == "令和", 2018)
df2["和暦"] = df2["和暦"].mask(df2["和暦"] == "平成", 1988)
df2["年"] = df2["年"].mask(df2["年"] == "元", 1)  # "元年" = year 1
# Daypart words become an hour offset, added to the parsed hour later.
df2["時間帯"] = df2["時間帯"].mask(df2["時間帯"] == "午前", 0)
df2["時間帯"] = df2["時間帯"].mask(df2["時間帯"] == "午後", 12)
df2["時間帯"] = df2["時間帯"].mask(df2["時間帯"] == "昼", 12)
df2["時間帯"] = df2["時間帯"].mask(df2["時間帯"] == "夕方", 18)
# Ten-day periods ("旬") are approximated by their midpoints.
df2["日"] = df2["日"].mask(df2["旬"] == "上旬", 5)
df2["日"] = df2["日"].mask(df2["旬"] == "中旬", 15)
df2["日"] = df2["日"].mask(df2["旬"] == "下旬", 25)
# Missing day/hour/minute default to the start of the period.
df2["日"] = df2["日"].fillna(1)
df2["時"] = df2["時"].fillna(0)
df2["分"] = df2["分"].fillna(0)
df2.isnull().sum()
# Cast the parsed components to integers, then rebuild absolute datetimes.
df3 = df2.astype(
    {"和暦": int, "年": int, "月": int, "日": int, "時間帯": int, "時": int, "分": int}
)
df3.dtypes
df1["datetime"] = df3.apply(
    lambda row: datetime.datetime(
        row["和暦"] + row["年"], row["月"], row["日"], row["時間帯"] + row["時"], row["分"]
    ),
    axis=1,
)
# Gregorian year = era base + era year; 24h hour = daypart offset + hour.
df1["年"] = df3["和暦"] + df3["年"]
df1["時"] = df3["時間帯"] + df3["時"]
df1["月"] = df3["月"]
df1["曜日"] = df3["曜日"]
df1.head(10)
df1
# --- Aggregate views --------------------------------------------------
# Each pivot counts reports per incident type ("種別") against another
# dimension; "場所" is only used as a non-null column for aggfunc="count".
by_station = df1.pivot_table(
    values="場所", columns="管轄署", index="種別", aggfunc="count", fill_value=0
)
by_station
by_station.plot.barh(title="不審者情報(種類別)", stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)
plt.savefig("01.png", dpi=300, bbox_inches="tight")
by_station.T.plot.barh(title="不審者情報(管轄署別)", stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)
plt.savefig("02.png", dpi=300, bbox_inches="tight")
by_weekday = df1.pivot_table(
    values="場所", columns="曜日", index="種別", aggfunc="count", fill_value=0
)
# Reorder columns Monday through Sunday.
by_weekday = by_weekday.loc[:, ["月", "火", "水", "木", "金", "土", "日"]]
by_weekday.T.plot.bar(subplots=True, layout=(6, 4), figsize=(10, 10), cmap="tab20")
plt.savefig("03.png", dpi=300, bbox_inches="tight")
by_weekday.T.plot.barh(stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)
plt.savefig("04.png", dpi=300, bbox_inches="tight")
by_hour = df1.pivot_table(
    values="場所", columns="時", index="種別", aggfunc="count", fill_value=0
)
by_hour
by_hour.T.plot.bar(subplots=True, layout=(5, 5), figsize=(20, 15), cmap="tab20")
plt.savefig("05.png", dpi=300, bbox_inches="tight")
by_month = df1.pivot_table(
    values="場所", columns="月", index="種別", aggfunc="count", fill_value=0
)
by_month
by_month.T.plot.bar(subplots=True, layout=(5, 5), figsize=(20, 15), cmap="tab20")
plt.savefig("06.png", dpi=300, bbox_inches="tight")
by_year = df1.pivot_table(
    values="場所", columns="年", index="種別", aggfunc="count", fill_value=0
)
by_year
by_year.plot.barh(figsize=(10, 15))
df_type = df1


def counter(texts):
    """Tokenize each text with janome and collect noun base forms.

    Parameters
    ----------
    texts : iterable of str
        Free-text incident descriptions.

    Returns
    -------
    (Counter, list[str])
        Per-noun frequency counts and the flat list of nouns in order
        of appearance.
    """
    t = Tokenizer()  # construct once and reuse for every text
    # Counter is already imported at the top of the file; it replaces the
    # hand-rolled defaultdict(int) and stays a dict subclass for callers.
    words_count = Counter()
    words = []
    for text in texts:
        for token in t.tokenize(text):
            # part_of_speech is a comma-joined string; field 0 is the POS class
            if token.part_of_speech.split(",")[0] == "名詞":  # nouns only
                words_count[token.base_form] += 1
                words.append(token.base_form)
    return words_count, words
# Count nouns across all incident descriptions and rank by frequency.
words_count, words = counter(df_type["状況"])
text = " ".join(words)
s = pd.Series(words_count).sort_values(ascending=False)
s.head(50)
# Persist the noun stream for later inspection.  The original code wrote
# the text out and immediately read the identical string back from disk;
# the redundant re-read is dropped (the file is still written).
with open("analysis_text.txt", "w", encoding="utf-8") as fw:
    fw.write(text)
# Japanese-capable font; required so the word cloud can render kanji/kana.
fpath = "/usr/share/fonts/opentype/ipafont-gothic/ipagp.ttf"
# Boilerplate words (height, clothing, etc.) that dominate every report.
stop_words = ["身長", "所持", "着用", "容姿", "センチ", "もの"]
wordcloud = WordCloud(
    background_color="white",
    font_path=fpath,
    collocations=False,  # keep single words; no bigram merging
    stopwords=set(stop_words),
    width=900,
    height=500,
).generate(text)
plt.figure(figsize=(15, 12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()