I normally only use BeautifulSoup, so I tried building a scraper with Scrapy
This was put together based on the Scrapy command-line documentation:
https://doc.scrapy.org/en/latest/topics/commands.html
```shell
# Install
pip install scrapy

# Create the project
scrapy startproject cheerup_ehime
cd cheerup_ehime

# Generate a spider skeleton from the "crawl" template
scrapy genspider -t crawl uwajima www.city.uwajima.ehime.jp
```
```text
cheerup_ehime
├── scrapy.cfg
└── cheerup_ehime
    ├── __init__.py
    ├── __pycache__
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── uwajima.py
        └── __pycache__
```
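For reference, `genspider -t crawl` fills `spiders/uwajima.py` with a skeleton roughly like the following (the exact placeholder content differs between Scrapy versions); the files below flesh it out:

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class UwajimaSpider(CrawlSpider):
    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['http://www.city.uwajima.ehime.jp/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        return i
```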
/cheerup_ehime/items.py
```python
import scrapy


class CheerupEhimeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    url = scrapy.Field()
    body = scrapy.Field()
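```

An Item works like a dict whose keys are fixed to the declared fields, which catches field-name typos early. A quick illustrative check (the values are made up):

```python
>>> from cheerup_ehime.items import CheerupEhimeItem
>>> item = CheerupEhimeItem(title='Sample')   # made-up value
>>> item['title']
'Sample'
>>> item['author'] = 'x'                      # undeclared field
KeyError: 'CheerupEhimeItem does not support field: author'
```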
/cheerup_ehime/spiders/uwajima.py
```python
# -*- coding: utf-8 -*-
import datetime

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from cheerup_ehime.items import CheerupEhimeItem


class UwajimaSpider(CrawlSpider):
    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['https://www.city.uwajima.ehime.jp/soshiki/list3-1.html']

    # Only follow links inside the news list on the start page
    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="news_wrap"]/div[@class="list_ccc"]/ul'),
            callback='parse_item',
            follow=True),
    )

    def parse_item(self, response):
        i = CheerupEhimeItem()
        # Title
        i['title'] = response.xpath(
            '//*[@id="main_header"]/h1/text()').extract_first()
        # Publication date, e.g. "掲載日:2018年7月10日更新"
        i['date'] = datetime.date(*[
            int(n) for n in response.xpath('//*[@id="content_date"]/text()')
            .re(r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新')
        ])
        # Body: join the non-empty text nodes with blank lines
        i['body'] = '\n\n'.join([
            j.strip() for j in response.xpath(
                '//*[@id="main_body"]/div[@class="detail_free"]//text()')
            .extract() if j.strip()
        ])
        # URL
        i['url'] = response.url
        return i
```
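The XPath expressions are easiest to verify interactively before a full crawl. A sketch using `scrapy shell`, reusing the selectors from the spider (the detail-page URL is a placeholder to fill in from the first command's output):

```shell
$ scrapy shell 'https://www.city.uwajima.ehime.jp/soshiki/list3-1.html'
>>> # links the Rule's LinkExtractor should pick up
>>> response.xpath('//*[@id="news_wrap"]/div[@class="list_ccc"]/ul//a/@href').extract()
>>> # fetch one of the listed detail pages and test parse_item's selectors
>>> fetch(response.urljoin('<one of the hrefs above>'))
>>> response.xpath('//*[@id="main_header"]/h1/text()').extract_first()
>>> response.xpath('//*[@id="content_date"]/text()').re(r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新')
```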
/cheerup_ehime/settings.py
```python
BOT_NAME = 'cheerup_ehime'

SPIDER_MODULES = ['cheerup_ehime.spiders']
NEWSPIDER_MODULE = 'cheerup_ehime.spiders'

# Respect robots.txt and keep the request rate low
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 2
CONCURRENT_REQUESTS_PER_IP = 0  # 0 disables the per-IP limit; the per-domain limit applies

ITEM_PIPELINES = {
    'cheerup_ehime.pipelines.CheerupEhimePipeline': 100,
}

# Cache responses for a day so repeated runs don't re-fetch pages
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'
```
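The effective values can be confirmed from inside the project with the `settings` command covered in the documentation linked above, for example:

```shell
$ scrapy settings --get BOT_NAME
cheerup_ehime
$ scrapy settings --get CONCURRENT_REQUESTS
2
```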
/cheerup_ehime/pipelines.py
```python
import os
import string


class CheerupEhimePipeline(object):

    def open_spider(self, spider):
        # Load the markdown template once per crawl
        self.markdown = open('template.md').read()

    def process_item(self, item, spider):
        # Fill the template's $-placeholders with the item's fields
        template = string.Template(self.markdown)
        md_post = template.safe_substitute(item)
        # Name the output file after the date and the page's basename
        root, _ = os.path.splitext(os.path.basename(item['url']))
        filename = '{}-uwajima-{}.md'.format(
            item['date'].strftime('%Y-%m-%d'), root)
        with open(filename, 'w') as fw:
            fw.write(md_post)
        return item
```
/template.md (read by the pipeline; place it in the directory the crawl is run from)

```markdown
---
title: $title
date: $date 12:00:00
category:
  - 宇和島市
tag:
  - 緊急速報
---

宇和島市で公開された情報です。(オリジナルは[こちら]($url))

$body
```
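The pipeline can be exercised without crawling anything; a minimal sketch with made-up item values, run from the directory containing template.md (the spider argument is unused here, so None is enough):

```python
import datetime

from cheerup_ehime.pipelines import CheerupEhimePipeline

pipeline = CheerupEhimePipeline()
pipeline.open_spider(None)  # reads template.md from the current directory

# Made-up item values for illustration only
item = {
    'title': 'Sample notice',
    'date': datetime.date(2018, 7, 10),
    'url': 'https://www.city.uwajima.ehime.jp/soshiki/sample.html',
    'body': 'Body text goes here.',
}
pipeline.process_item(item, None)
# -> writes 2018-07-10-uwajima-sample.md in the current directory
```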
Run the crawl:

```shell
scrapy crawl uwajima
```
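To inspect the extracted fields without reading the generated markdown files, the items can also be dumped with Scrapy's feed export; since the pipeline stays enabled in settings.py, both outputs are produced:

```shell
scrapy crawl uwajima -o items.json
```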