import datetime
import re
from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_refuge(url):
    """Fetch a disaster-info page and extract its headline data.

    Parameters:
        url: absolute URL of an advisory/shelter detail page.

    Returns:
        (title, description, url, pubdate) where pubdate is a
        datetime parsed from the Japanese date in the title, or
        None when the title carries no recognizable date.
    """
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): this selector matches '#main_container > h1' OR any
    # bare 'h3' on the page (CSS comma scopes the whole selector) —
    # confirm that is the intended behavior.
    title = soup.select_one('#main_container > h1, h3').get_text(strip=True)
    # Raw string: '\d' in a plain literal is a DeprecationWarning in
    # modern Python; the pattern value itself is unchanged.
    date_pattern = re.compile(
        r'(\d{4})年(\d{1,2})月(\d{1,2})日[ ](\d{1,2})時(\d{1,2})分')
    result = date_pattern.search(title)
    # Bug fix: the original only assigned `pubdate` inside the match
    # branch, raising UnboundLocalError at `return` for undated titles.
    pubdate = None
    if result:
        pubdate = datetime.datetime(*map(int, result.groups()))
    description = soup.select_one('#main_container > p').get_text(strip=True)
    return title, description, url, pubdate
def scraping(url, css_select):
    """Scrape every link matched by *css_select* on *url*.

    Each matched anchor's href is resolved against *url* and passed to
    get_refuge(); the list of its result tuples is returned.
    """
    page = urlopen(url).read()
    soup = BeautifulSoup(page, 'html.parser')
    collected = []
    for anchor in soup.select(css_select):
        target = urljoin(url, anchor.get('href'))
        collected.append(get_refuge(target))
    return collected
if __name__ == '__main__':
    # Evacuation advisories page: print title/description/url of each entry.
    urge = scraping('http://www.city.imabari.ehime.jp/bousai/kankoku/',
                    '#main_container > p > a')
    for entry in urge:
        print('\n'.join(entry[:3]))
        print('-' * 30)

    # Evacuation shelters page: same report format.
    shelter = scraping('http://www.city.imabari.ehime.jp/bousai/hinanjo/',
                       '#main_container > div > p > a')
    for entry in shelter:
        print('\n'.join(entry[:3]))
        print('-' * 30)