普段Beautifulsoupしか使ってないのでScrapyで作成してみた
github.com
をベースに作成しました。
note.nkmk.me
https://doc.scrapy.org/en/latest/topics/commands.html
pip install scrapy
scrapy startproject cheerup_ehime
cd cheerup_ehime
scrapy genspider -t crawl uwajima www.city.uwajima.ehime.jp
cheerup_ehime
├── scrapy.cfg
└── cheerup_ehime
├── __init__.py
├── __pycache__
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
├── uwajima.py
└── __pycache__
/cheerup_ehime/items.py
import scrapy
class CheerupEhimeItem(scrapy.Item):
    """Container for one scraped news article from the Uwajima city site."""
    title = scrapy.Field()  # article headline text
    date = scrapy.Field()   # publication date parsed by the spider
    url = scrapy.Field()    # URL of the source page
    body = scrapy.Field()   # full article body text
/cheerup_ehime/spiders/uwajima.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime
class UwajimaSpider(CrawlSpider):
    """Crawl the Uwajima city news index and scrape each linked article."""

    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['https://www.city.uwajima.ehime.jp/soshiki/list3-1.html']

    # Follow only the links inside the news list block; parse each target page.
    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="news_wrap"]/div[@class="list_ccc"]/ul'),
            callback='parse_item',
            follow=True),
    )

    def parse_item(self, response):
        """Extract title, publication date, body text, and URL from one article page."""
        title = response.xpath(
            '//*[@id="main_header"]/h1/text()').extract_first()
        # The date element reads like '掲載日:2018年7月5日更新'; capture Y/M/D.
        ymd = response.xpath('//*[@id="content_date"]/text()').re(
            r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新')
        published = datetime.date(*map(int, ymd))
        # Collect every non-blank text fragment of the article body and
        # separate the fragments with blank lines.
        fragments = response.xpath(
            '//*[@id="main_body"]/div[@class="detail_free"]//text()').extract()
        body = '\n\n'.join(t.strip() for t in fragments if t.strip())
        return {
            'title': title,
            'date': published,
            'body': body,
            'url': response.url,
        }
shinyorke.hatenablog.com
/cheerup_ehime/settings.py
# Scrapy project settings for the cheerup_ehime crawler.
BOT_NAME = 'cheerup_ehime'
SPIDER_MODULES = ['cheerup_ehime.spiders']
NEWSPIDER_MODULE = 'cheerup_ehime.spiders'
# Respect the target site's robots.txt.
ROBOTSTXT_OBEY = True
# Keep concurrency low (2 requests) to be polite to the municipal server.
CONCURRENT_REQUESTS = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 2
# 0 disables the per-IP limit, so the per-domain limit above applies.
CONCURRENT_REQUESTS_PER_IP = 0
# Route every scraped item through the Markdown-writing pipeline.
ITEM_PIPELINES = {
    'cheerup_ehime.pipelines.CheerupEhimePipeline': 100,
}
# Cache fetched pages locally for one day to avoid re-downloading during development.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'
/cheerup_ehime/pipelines.py
import string
import os
class CheerupEhimePipeline(object):
    """Render each scraped item into a Markdown post file via template.md."""

    def open_spider(self, spider):
        # Read the template once per crawl. Use a context manager so the
        # handle is closed (the original leaked it), and pin the encoding so
        # the Japanese template text decodes the same on every platform.
        with open('template.md', encoding='utf-8') as f:
            self.markdown = f.read()

    def process_item(self, item, spider):
        """Substitute item fields into the template and write one .md file.

        Returns the item unchanged so later pipelines can process it.
        """
        template = string.Template(self.markdown)
        # safe_substitute leaves unknown $placeholders intact instead of raising.
        md_post = template.safe_substitute(item)
        # Use the source page's file name (without extension) as the post slug.
        root, _ = os.path.splitext(os.path.basename(item['url']))
        filename = '{}-uwajima-{}.md'.format(item['date'].strftime('%Y-%m-%d'),
                                             root)
        # Explicit UTF-8: the body contains Japanese text, and the platform
        # default encoding (e.g. cp1252 on Windows) would raise or corrupt it.
        with open(filename, 'w', encoding='utf-8') as fw:
            fw.write(md_post)
        return item
---
title: $title
date: $date 12:00:00
category:
- 宇和島市
tag:
- 緊急速報
---
宇和島市で公開された情報です。(オリジナルは[こちら]($url))
$body
実行
scrapy crawl uwajima