漱石書簡から人物登場頻度と地名登場頻度を集計する
# Download the three Soseki letter TEI/XML samples.
# NOTE(fix): the commands were fused onto one line; run that way, curl
# treats the literal word "curl" as an extra URL. One command per line.
curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19000908.xml'
curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19001008.xml'
curl -O 'https://www.dhii.jp/dh/tei/soseki_letter_19001022.xml'
import pathlib
from lxml import etree

# TEI namespace prefix used by every XPath query below.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# For each letter file: count how often each person/place id declared in
# the back matter is referenced (via @corresp) in the body, and print the
# passage surrounding each person mention.
for letter in pathlib.Path(".").glob("soseki_letter_*.xml"):
    doc = etree.parse(letter).getroot()
    person_ids = doc.xpath(
        "//tei:back/tei:listPerson/tei:person/@xml:id", namespaces=ns
    )
    place_ids = doc.xpath(
        "//tei:back/tei:listPlace/tei:place/@xml:id", namespaces=ns
    )

    # Person names: mention count plus the text of each mentioning passage.
    for pid in person_ids:
        hits = doc.xpath(f"//tei:body//tei:*[@corresp='#{pid}']", namespaces=ns)
        print(f"{pid}: 言及{len(hits)}回")
        for hit in hits:
            # Collect all non-blank text under the mention's parent element.
            passage = " ".join(
                t.strip() for t in hit.xpath("./..//text()") if t.strip()
            )
            print(" -", passage)
        print("-" * 20)

    # Place names: mention count only.
    for pid in place_ids:
        hits = doc.xpath(f"//tei:body//tei:*[@corresp='#{pid}']", namespaces=ns)
        print(f"{pid}: 言及{len(hits)}回")
    print("=" * 20)
現代日本語訳仏典の人名と地名の登場回数
# Download six modern-Japanese Buddhist-canon TEI/XML files (SAT 2018).
# NOTE(fix): the six IPython shell commands were fused onto one line,
# which would make curl misparse "!curl" / "-O" as URLs and options of a
# single invocation. One command per line.
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT0353b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2046b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2047b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2063b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2661.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2887.xml"
import pathlib

import pandas as pd
from lxml import etree

# TEI namespace prefix used by the XPath queries below.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# Gather every tagged person/place name string from the body of each text.
se_names = []
se_places = []
for p in pathlib.Path(".").glob("JT*.xml"):
    root = etree.parse(p).getroot()
    names = root.xpath("//tei:body//tei:persName/text()", namespaces=ns)
    places = root.xpath("//tei:body//tei:placeName/text()", namespaces=ns)
    se_names.append(pd.Series(names))
    se_places.append(pd.Series(places))

# pd.concat raises ValueError on an empty list, so guard against the case
# where no JT*.xml files were downloaded.
if se_names:
    # FIX: the original left both value_counts() as bare expressions, so
    # the person-name tally was always discarded (at best only the last
    # expression displays in a notebook). Print both explicitly.
    print(pd.concat(se_names).value_counts())
    print(pd.concat(se_places).value_counts())
else:
    print("No JT*.xml files found")
TEI/XMLファイルから抜き出した地理情報を地図上にマッピング(lxmlで抽出)
digitalnagasaki.hatenablog.com
# Download the three Soseki letter TEI/XML samples (same files as above).
# NOTE(fix): the commands were fused onto one line; run that way, curl
# treats the literal word "curl" as an extra URL. One command per line.
curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19000908.xml"
curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19001008.xml"
curl -O "https://www.dhii.jp/dh/tei/soseki_letter_19001022.xml"
import pathlib

import folium
import pandas as pd
from lxml import etree

# TEI namespace prefix used by the XPath queries below.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# Extract one {"title", "geo"} record per geolocated tei:location element.
records = []
for p in pathlib.Path(".").glob("soseki_letter*.xml"):
    root = etree.parse(p).getroot()
    for loc in root.xpath("//tei:location", namespaces=ns):
        name = loc.xpath("./tei:placeName/text()", namespaces=ns)
        region = loc.xpath("./tei:address/tei:region/text()", namespaces=ns)
        geo = loc.xpath("./tei:geo/text()", namespaces=ns)
        title = name or region  # prefer placeName, fall back to region
        # FIX: the original indexed [0] unconditionally and raised
        # IndexError on any location lacking a label or coordinates.
        if not (title and geo):
            continue
        records.append({"title": title[0], "geo": geo[0]})

df = pd.DataFrame(records)
# One marker per distinct coordinate pair.
df.drop_duplicates(subset="geo", inplace=True)
# tei:geo holds "lat lon" as a single whitespace-separated string.
df[["lat", "lon"]] = df["geo"].str.split(expand=True).astype(float)

# FIX: renamed from `map`, which shadowed the builtin.
fmap = folium.Map(
    location=[35.5, 138.5],
    zoom_start=2,
)
for _, row in df.iterrows():
    folium.Marker(
        location=[row.lat, row.lon],
        popup=folium.Popup(f"<p>{row.title}</p>", max_width=300),
    ).add_to(fmap)
fmap