愛媛県警の不審者情報をテキストマイニング

github.com
import datetime
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

import japanize_matplotlib
import matplotlib.pyplot as plt

from janome.tokenizer import Tokenizer
from wordcloud import WordCloud

dfs = pd.read_html("https://www.police.pref.ehime.jp/fushinsha.htm", header=0)

df_2019 = dfs[2]

dfs = pd.read_html("https://www.police.pref.ehime.jp/fushinsha30.htm", header=0)

df_2018 = dfs[1]

dfs = pd.read_html("https://www.police.pref.ehime.jp/fushinsha29.htm", header=0)

df_2017 = dfs[1]

df = pd.concat([df_2019, df_2018, df_2017])


df.head(10)

# 概要から抽出
df1 = df["概　要"].str.extract("[(（](.+)[）)].*◆.+：(.+)◆.+：(.+)◆.+：(.+)◆.+：(.+)")

# 欠損値の個数確認
df1.isnull().sum()

# 欠損値のデータ確認
df[df1.isnull().all(axis=1)]

# 前後の空白文字を削除
df1 = df1.apply(lambda x: x.str.strip())

# 正規化
df1 = df1.apply(lambda x: x.str.normalize("NFKC"))

# 列名
df1.rename(columns={0: "管轄署", 1: "種別", 2: "日時", 3: "場所", 4: "状況"}, inplace=True)

# 平成29年度のみ
df1["日時"] = df1["日時"].str.replace("ころの", "")
df1["日時"] = df1["日時"].str.replace("分分", "分")

# 種別の等を除去
df1["種別"] = df1["種別"].str.replace("等", "")

# 管内
df1["管轄署"] = df1["管轄署"].str.replace("管内", "署")
df1["管轄署"] = df1["管轄署"].str.replace("署署", "署")

# データ確認
df1.head()


# 日時から日付と時間を抽出
df2 = df1["日時"].str.extract(
    "(平成|令和)(.+)年(\d{1,2})月([上中下]旬)?((\d{1,2})日\((.)\))?(午前|午後|昼|夕方)?((\d{1,2})時)?((\d{1,2})分)?(ころ)?"
)

# 列名変更
df2.rename(
    columns={
        0: "和暦",
        1: "年",
        2: "月",
        3: "旬",
        5: "日",
        6: "曜日",
        7: "時間帯",
        9: "時",
        11: "分",
    },
    inplace=True,
)

# 確認用に抽出
df_temp = df2.loc[:, ["和暦", "年", "月", "日", "曜日", "時間帯", "時", "分"]]
df_temp

# 欠損値の個数確認
df_temp.isnull().sum()

# 欠損値のデータ確認
df_temp[df_temp.isnull().any(axis=1)]


# 和暦を西暦に変換
df2["和暦"].mask((df2["和暦"] == "令和"), 2018, inplace=True)
df2["和暦"].mask((df2["和暦"] == "平成"), 1988, inplace=True)

# 元年を1年に変換
df2["年"].mask((df2["年"] == "元"), 1, inplace=True)

# 午前は0時
df2["時間帯"].mask((df2["時間帯"] == "午前"), 0, inplace=True)

# 午後は12時
df2["時間帯"].mask((df2["時間帯"] == "午後"), 12, inplace=True)

# 昼は12時
df2["時間帯"].mask((df2["時間帯"] == "昼"), 12, inplace=True)

# 夕方は18時
df2["時間帯"].mask((df2["時間帯"] == "夕方"), 18, inplace=True)

# 上旬は5日
df2["日"].mask((df2["旬"] == "上旬"), 5, inplace=True)

# 中旬は15日
df2["日"].mask((df2["旬"] == "中旬"), 15, inplace=True)

# 下旬は25日
df2["日"].mask((df2["旬"] == "下旬"), 25, inplace=True)

# 欠損値は最初に設定
df2["日"].fillna(1, inplace=True)
df2["時"].fillna(0, inplace=True)
df2["分"].fillna(0, inplace=True)

# 欠損値の個数確認
df2.isnull().sum()

# 整数型に変換
df3 = df2.astype(
    {"和暦": int, "年": int, "月": int, "日": int, "時間帯": int, "時": int, "分": int}
)

# 型を確認
df3.dtypes

# datetimeに変換
df1["datetime"] = df3.apply(
    lambda x: datetime.datetime(
        x["和暦"] + x["年"], x["月"], x["日"], x["時間帯"] + x["時"], x["分"]
    ),
    axis=1,
)

# 年に変換
df1["年"] = df3.apply(lambda x: x["和暦"] + x["年"], axis=1)

# 時間に変換
df1["時"] = df3.apply(lambda x: x["時間帯"] + x["時"], axis=1)

df1["月"] = df3["月"]
df1["曜日"] = df3["曜日"]

df1.head(10)


df1

# 不審者情報（種類別）
df4 = df1.pivot_table(
    values="場所", columns="管轄署", index="種別", aggfunc="count", fill_value=0
)
df4

df4.plot.barh(title="不審者情報（種類別）", stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)

# グラフを保存
plt.savefig("01.png", dpi=300, bbox_inches="tight")

# 不審者情報（管轄署別）
df4.T.plot.barh(title="不審者情報（管轄署別）", stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)

# グラフを保存
plt.savefig("02.png", dpi=300, bbox_inches="tight")

# 不審者情報（曜日別）
df5 = df1.pivot_table(
    values="場所", columns="曜日", index="種別", aggfunc="count", fill_value=0
)
df5 = df5.loc[:, ["月", "火", "水", "木", "金", "土", "日"]]

df5.T.plot.bar(subplots=True, layout=(6, 4), figsize=(10, 10), cmap="tab20")

# グラフを保存
plt.savefig("03.png", dpi=300, bbox_inches="tight")

df5.T.plot.barh(stacked=True, cmap="tab20")
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0, fontsize=8)

# グラフを保存
plt.savefig("04.png", dpi=300, bbox_inches="tight")

# 不審者情報（時間別）
df6 = df1.pivot_table(
    values="場所", columns="時", index="種別", aggfunc="count", fill_value=0
)
df6

df6.T.plot.bar(subplots=True, layout=(5, 5), figsize=(20, 15), cmap="tab20")

# グラフを保存
plt.savefig("05.png", dpi=300, bbox_inches="tight")

# 不審者情報（月別）
df7 = df1.pivot_table(
    values="場所", columns="月", index="種別", aggfunc="count", fill_value=0
)
df7

df7.T.plot.bar(subplots=True, layout=(5, 5), figsize=(20, 15), cmap="tab20")

# グラフを保存
plt.savefig("06.png", dpi=300, bbox_inches="tight")

# 不審者情報（年別）
df8 = df1.pivot_table(
    values="場所", columns="年", index="種別", aggfunc="count", fill_value=0
)
df8

df8.plot.barh(figsize=(10, 15))

# ワードクラウド

df_type = df1
# df_type = df1[df1["種別"] == "写真撮影"]


def counter(texts):

    t = Tokenizer()
    words_count = defaultdict(int)
    words = []

    for text in texts:
        tokens = t.tokenize(text)

        for token in tokens:
            # 品詞から名詞だけ抽出
            pos = token.part_of_speech.split(",")[0]

            if pos == "名詞":
                words_count[token.base_form] += 1
                words.append(token.base_form)

    return words_count, words


words_count, words = counter(df_type["状況"])
text = " ".join(words)

s = pd.Series(words_count)
s.sort_values(ascending = False, inplace = True)
s.head(50)

with open("analysis_text.txt", "w", encoding="utf-8") as fw:
    fw.write(text)

with open("analysis_text.txt", "r", encoding="utf-8") as fr:
    text = fr.read()

fpath = "/usr/share/fonts/opentype/ipafont-gothic/ipagp.ttf"

stop_words = ["身長", "所持", "着用", "容姿", "センチ", "もの"]

wordcloud = WordCloud(
    background_color="white",
    font_path=fpath,
    collocations=False,
    stopwords=set(stop_words),
    width=900,
    height=500,
).generate(text)

plt.figure(figsize=(15, 12))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()