BeautifulSoupでスクレイピング

BeautifulSoupの方がひとつのファイルで済むのでやはり手軽

import datetime
import os
import re
import string
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every request. Uses an IE11-style desktop
# User-Agent string instead of the default python-requests one.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64; '
                   'Trident/7.0; rv:11.0) like Gecko'),
}

# Front-matter template for the generated Markdown post.
# $title, $date, $url and $body are placeholders filled in via
# string.Template.safe_substitute() from the dict that scraping() returns.
markdown = '''---
title: $title
date: $date 12:00:00
category:
  - 宇和島市
tag:
  - 緊急速報
---
宇和島市で公開された情報です。(オリジナルは[こちら]($url))

$body'''


def scraping(url):
    """Fetch one Uwajima city news page and extract its metadata.

    Args:
        url: Absolute URL of the article page.

    Returns:
        dict with keys 'url', 'title', 'body' and 'date'
        (a ``datetime.date``); an empty dict when the HTTP
        request does not return 200 OK.

    Raises:
        ValueError: if the page's publication-date text does not
            match the expected "掲載日:YYYY年M月D日更新" format.
    """

    res = {}

    # Be polite to the server: at most one request per second.
    time.sleep(1)

    r = requests.get(url, headers=headers)

    if r.status_code == requests.codes.ok:

        soup = BeautifulSoup(r.content, 'html.parser')

        # Original article URL
        res['url'] = url

        # Article title
        res['title'] = soup.select_one('#main_header > h1').get_text(
            strip=True)

        # Article body; paragraphs separated by blank lines
        res['body'] = soup.select_one('#main_body > div.detail_free').get_text(
            '\n\n', strip=True)

        # Publication date, e.g. "掲載日:2020年4月1日更新"
        s = soup.select_one('#content_date').get_text(strip=True)
        m = re.match(r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新', s)
        if m is None:
            # Fail with a clear message instead of the original's
            # opaque AttributeError on m.groups() when the site's
            # date format changes.
            raise ValueError('unexpected date format: {!r}'.format(s))

        res['date'] = datetime.date(*[int(i) for i in m.groups()])

    return res


if __name__ == '__main__':

    # Listing page of emergency bulletins from Uwajima city.
    url = 'https://www.city.uwajima.ehime.jp/soshiki/list3-1.html'

    r = requests.get(url, headers=headers)

    if r.status_code == requests.codes.ok:

        soup = BeautifulSoup(r.content, 'html.parser')

        # The template text never changes, so compile it once
        # instead of once per article (loop-invariant hoist).
        template = string.Template(markdown)

        for a in soup.select(
                '#news_wrap > div.list_ccc > ul > li > span.span_b > a'):

            # Resolve the (possibly relative) article link, then scrape it.
            item = scraping(urljoin(url, a.get('href')))

            md_post = template.safe_substitute(item)

            # Output filename: "<YYYY-MM-DD>-uwajima-<page-basename>.md"
            root, _ = os.path.splitext(os.path.basename(item['url']))

            filename = '{}-uwajima-{}.md'.format(
                item['date'].strftime('%Y-%m-%d'), root)

            with open(filename, 'w', encoding='utf-8') as fw:
                fw.write(md_post)