# Note: BeautifulSoup is still easier here, since everything fits in a single file.
"""Scrape emergency bulletins from the Uwajima city site and save each one
as a Markdown post with YAML front matter."""

import datetime
import os
import re
import string
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Desktop IE11 user agent; some municipal sites serve different markup to bots.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# string.Template source for the generated post ($title/$date/$url/$body).
markdown = '''---
title: $title
date: $date 12:00:00
category:
  - 宇和島市
tag:
  - 緊急速報
---

宇和島市で公開された情報です。(オリジナルは[こちら]($url))

$body'''


def scraping(url):
    """Fetch one article page and extract its metadata and body.

    Args:
        url: absolute URL of the article page.

    Returns:
        dict with keys 'url', 'title', 'body' and 'date'
        (a ``datetime.date`` parsed from the 掲載日 line).

    Raises:
        requests.HTTPError: on a non-OK HTTP response.
        ValueError: when the publication-date text does not match the
            expected format.
    """
    time.sleep(1)  # be polite: throttle successive article requests
    r = requests.get(url, headers=headers)
    # Fail loudly instead of returning a half-filled dict: the original
    # returned {} on a bad status, which crashed the caller with KeyError.
    r.raise_for_status()

    soup = BeautifulSoup(r.content, 'html.parser')
    res = {'url': url}
    # Article title
    res['title'] = soup.select_one('#main_header > h1').get_text(strip=True)
    # Body text; blank line between extracted fragments
    res['body'] = soup.select_one('#main_body > div.detail_free').get_text(
        '\n\n', strip=True)

    # Publication date, e.g. "掲載日:2020年4月1日更新"
    s = soup.select_one('#content_date').get_text(strip=True)
    m = re.match(r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新', s)
    if m is None:
        # The original dereferenced m unconditionally -> AttributeError
        # whenever the site changes this label; raise something meaningful.
        raise ValueError('unexpected date format: {!r}'.format(s))
    res['date'] = datetime.date(*[int(i) for i in m.groups()])
    return res


def _save_post(item):
    """Render one scraped item through the Markdown template and write it
    to ``YYYY-MM-DD-uwajima-<page>.md`` in the current directory."""
    template = string.Template(markdown)
    md_post = template.safe_substitute(item)
    # Use the article page's file name (without extension) to keep slugs unique.
    root, _ = os.path.splitext(os.path.basename(item['url']))
    filename = '{}-uwajima-{}.md'.format(
        item['date'].strftime('%Y-%m-%d'), root)
    with open(filename, 'w', encoding='utf-8') as fw:
        fw.write(md_post)


if __name__ == '__main__':
    url = 'https://www.city.uwajima.ehime.jp/soshiki/list3-1.html'
    r = requests.get(url, headers=headers)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, 'html.parser')
        # Each listed bulletin link -> scrape the article and save a post.
        for i in soup.select(
                '#news_wrap > div.list_ccc > ul > li > span.span_b > a'):
            item = scraping(urljoin(url, i.get('href')))
            _save_post(item)