TEI XML

yuranhiko.hatenablog.com

blog.imind.jp

orangain.hatenablog.com

lxml.de

from lxml import etree
import pathlib

# Prefix -> namespace URI mapping used by every XPath query below.
# XPath has no notion of a default namespace, so the `None` key that
# root.nsmap typically reports cannot be used directly; an explicit prefix
# is required.
# NOTE(review): this is the standard TEI P5 namespace URI -- confirm it
# matches the root.nsmap output printed for your files.
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# recover=True lets lxml keep parsing past malformed markup instead of raising.
parser = etree.XMLParser(recover=True)

# Walk every XML file in the directory; sorted() keeps the output order stable.
for p in sorted(pathlib.Path("cam_jp_xml").glob("*.xml")):
    print(str(p))

    # str() keeps this working on lxml versions that do not accept Path objects.
    tree = etree.parse(str(p), parser)
    root = tree.getroot()

    # Check the namespace mapping declared on the root element.
    print(root.nsmap)

    # Check the tags by dumping the whole document.
    print(etree.tostring(root, pretty_print=True, encoding="utf-8").decode())

    # Extraction: text of the paragraphs under each physDesc's bindingDesc.
    for i in root.xpath("//tei:physDesc", namespaces=ns):
        # Every step needs the tei: prefix; an unprefixed `p` would only match
        # elements in no namespace and return an empty list.
        print(i.xpath(".//tei:bindingDesc/tei:p/text()", namespaces=ns))