漱石書簡から人物登場頻度と地名登場頻度を抽出する

!curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19000908.xml'
!curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19001008.xml'
!curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19001022.xml'
import pathlib

from lxml import etree

# TEI namespace mapping used by every XPath query below.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

for xml_path in pathlib.Path(".").glob("soseki_letter_*.xml"):

    root = etree.parse(xml_path).getroot()

    # The <back> section declares the entities; collect their xml:id values.
    person_ids = root.xpath("//tei:back/tei:listPerson/tei:person/@xml:id", namespaces=ns)
    place_ids = root.xpath("//tei:back/tei:listPlace/tei:place/@xml:id", namespaces=ns)

    # Person names: count <body> elements whose @corresp points at each id.

    for pid in person_ids:

        hits = root.xpath(f"//tei:body//tei:*[@corresp='#{pid}']", namespaces=ns)

        print(f"{pid}: 言及{len(hits)}回")

        for hit in hits:

            # Surrounding context: all non-empty text under the hit's parent.
            fragments = (t.strip() for t in hit.xpath("./..//text()"))
            context = " ".join(t for t in fragments if t)

            print(" -", context)

    print("-" * 20)

    # Place names: same counting, without the context printout.

    for plid in place_ids:

        hits = root.xpath(f"//tei:body//tei:*[@corresp='#{plid}']", namespaces=ns)

        print(f"{plid}: 言及{len(hits)}回")

    print("=" * 20)

現代日本語訳仏典の人名と地名の登場回数

www.dhii.jp

!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT0353b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2046b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2047b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2063b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2661.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2887.xml"
import pathlib

import pandas as pd
from lxml import etree

# Count person-name and place-name occurrences across the downloaded
# TEI/XML Buddhist-canon files (JT*.xml).
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

se_names = []
se_places = []

for xml_path in pathlib.Path(".").glob("JT*.xml"):

    root = etree.parse(xml_path).getroot()

    # Direct text content of every <persName>/<placeName> inside <body>.
    names = root.xpath("//tei:body//tei:persName/text()", namespaces=ns)
    places = root.xpath("//tei:body//tei:placeName/text()", namespaces=ns)

    se_names.append(pd.Series(names))
    se_places.append(pd.Series(places))

# pd.concat([]) raises ValueError, so skip the summary when no file matched.
# print() both tallies: as a plain script the bare expressions were discarded
# (only the last line of a notebook cell is auto-displayed).
if se_names:
    print(pd.concat(se_names).value_counts())
    print(pd.concat(se_places).value_counts())

TEI/XMLファイルから抜き出した地理情報を地図上にマッピング(lxmlで抽出)

digitalnagasaki.hatenablog.com

!curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19000908.xml"
!curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19001008.xml"
!curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19001022.xml"
import pathlib

import folium
import pandas as pd
from lxml import etree

# Extract every <location> (label + coordinates) from the Soseki letters
# and plot the de-duplicated points on a folium map.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

data = []

for xml_path in pathlib.Path(".").glob("soseki_letter*.xml"):

    root = etree.parse(xml_path).getroot()

    for loc in root.xpath("//tei:location", namespaces=ns):

        name = loc.xpath("./tei:placeName/text()", namespaces=ns)
        region = loc.xpath("./tei:address/tei:region/text()", namespaces=ns)
        geo = loc.xpath("./tei:geo/text()", namespaces=ns)

        # Prefer <placeName>, fall back to <address>/<region>.
        title = name or region

        # Skip locations missing a label or coordinates instead of
        # crashing with IndexError on [0].
        if not (title and geo):
            continue

        data.append({"title": title[0], "geo": geo[0]})


# Fix the columns so downstream code works even when no file matched.
df = pd.DataFrame(data, columns=["title", "geo"])

# Drop duplicate coordinates
df = df.drop_duplicates(subset="geo")

# Split the "lat lon" string into two numeric columns.
if not df.empty:
    df[["lat", "lon"]] = df["geo"].str.split(expand=True).astype(float)

# print() so the table is visible when run as a script, not just in a notebook.
print(df)

# Named letter_map to avoid shadowing the builtin map().
letter_map = folium.Map(
    location=[35.5, 138.5],
    zoom_start=2,
)

for _, row in df.iterrows():

    folium.Marker(
        location=[row.lat, row.lon],
        popup=folium.Popup(f"<p>{row.title}</p>", max_width=300),
    ).add_to(letter_map)

# Last expression: rendered inline when run in a notebook.
letter_map