# www.dhii.jp
# IPython shell escapes: the leading "!" hands each line to the system shell,
# so these lines only work under IPython/Jupyter, not the plain interpreter.
# Each command fetches one XML file; curl's -O option writes it to the current
# working directory under its remote file name (e.g. JT0353b.xml), which is
# where the "JT*.xml" glob later in this file looks for its input.
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT0353b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2046b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2047b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2063b.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2661.xml"
!curl -O "https://21dzk.l.u-tokyo.ac.jp/SAT2018/JT2887.xml"
import pathlib
import pandas as pd
from lxml import etree
# Prefix-to-URI map so the XPath expressions below can address elements in
# the TEI namespace as "tei:...".
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# One pandas Series per parsed file, holding the text nodes that sit directly
# inside the matching elements of that file's <body>.
se_names = []
se_places = []

# sorted() makes the processing order reproducible; glob order is arbitrary.
for p in sorted(pathlib.Path(".").glob("JT*.xml")):
    tree = etree.parse(p)
    root = tree.getroot()
    names = root.xpath("//tei:body//tei:persName/text()", namespaces=ns)
    places = root.xpath("//tei:body//tei:placeName/text()", namespaces=ns)
    se_names.append(pd.Series(names))
    se_places.append(pd.Series(places))


def _count_values(series_list):
    """Return how often each value occurs across all Series in *series_list*.

    The result is ordered from most to least frequent (the default of
    ``Series.value_counts``).  ``pd.concat`` raises ``ValueError`` when it is
    given an empty list, so the "no input files were found" case is answered
    with an empty result instead of crashing.
    """
    if not series_list:
        return pd.Series(dtype="int64")
    return pd.concat(series_list).value_counts()


# Bind the results to names and print them explicitly: a bare expression is
# only echoed when it is the last statement of a notebook cell, and is
# silently discarded when this file is run as a script.
name_counts = _count_values(se_names)
place_counts = _count_values(se_places)
print(name_counts)
print(place_counts)