Scraping with Scrapy

I usually only use BeautifulSoup, so I tried building a scraper with Scrapy.

github.com

I based it on the repository above.

References:

note.nkmk.me

https://doc.scrapy.org/en/latest/topics/commands.html

# Install
pip install scrapy

# Create a project
scrapy startproject cheerup_ehime

cd cheerup_ehime
# Generate a spider skeleton from the crawl template
scrapy genspider -t crawl uwajima www.city.uwajima.ehime.jp
cheerup_ehime
├── scrapy.cfg
└── cheerup_ehime
    ├── __init__.py
    ├── __pycache__
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── uwajima.py
        └── __pycache__
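
For reference, genspider -t crawl produces roughly the following skeleton in spiders/uwajima.py; the Items/ pattern and the empty parse_item are stock placeholders from Scrapy's crawl template, not part of this project, and the files below replace them:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class UwajimaSpider(CrawlSpider):
    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['http://www.city.uwajima.ehime.jp/']

    rules = (
        # Placeholder rule from the template; replaced with a real LinkExtractor below
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        return i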

/cheerup_ehime/items.py

import scrapy


class CheerupEhimeItem(scrapy.Item):
    # Fields extracted from each news article page
    title = scrapy.Field()  # article title
    date = scrapy.Field()   # publication date (a datetime.date)
    url = scrapy.Field()    # article URL
    body = scrapy.Field()   # article body text
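
A quick sketch of how a scrapy.Item differs from a plain dict (the spider below actually returns a dict, which Scrapy also accepts): only declared fields can be assigned, so typos fail fast.

from cheerup_ehime.items import CheerupEhimeItem

item = CheerupEhimeItem()
item['title'] = 'Example title'  # OK: declared field
item['titel'] = 'typo'           # raises KeyError: undeclared field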

/cheerup_ehime/spiders/uwajima.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime


class UwajimaSpider(CrawlSpider):
    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['https://www.city.uwajima.ehime.jp/soshiki/list3-1.html']

    # Limit link extraction to the news list area
    rules = (Rule(
        LinkExtractor(
            restrict_xpaths='//*[@id="news_wrap"]/div[@class="list_ccc"]/ul'),
        callback='parse_item',
        follow=True), )

    def parse_item(self, response):
        i = {}

        # Title
        i['title'] = response.xpath(
            '//*[@id="main_header"]/h1/text()').extract_first()

        # Publication date: parse '掲載日:YYYY年M月D日更新' into a datetime.date
        i['date'] = datetime.date(*[
            int(n) for n in response.xpath('//*[@id="content_date"]/text()')
            .re(r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新')
        ])

        # Body: join the non-empty text nodes, separated by blank lines
        i['body'] = '\n\n'.join([
            j.strip() for j in response.xpath(
                '//*[@id="main_body"]/div[@class="detail_free"]//text()')
            .extract() if j.strip()
        ])

        # URL
        i['url'] = response.url

        return i
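
The XPath expressions above can be tried out interactively with scrapy shell before committing them to parse_item; a minimal session sketch (commands only, output omitted):

scrapy shell 'https://www.city.uwajima.ehime.jp/soshiki/list3-1.html'
>>> # links the LinkExtractor should pick up
>>> response.xpath('//*[@id="news_wrap"]/div[@class="list_ccc"]/ul//a/@href').extract()
>>> # fetch one article and test the title XPath
>>> fetch(response.urljoin(response.xpath('//*[@id="news_wrap"]/div[@class="list_ccc"]/ul//a/@href').extract_first()))
>>> response.xpath('//*[@id="main_header"]/h1/text()').extract_first()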

Reference: shinyorke.hatenablog.com

/cheerup_ehime/settings.py

BOT_NAME = 'cheerup_ehime'
SPIDER_MODULES = ['cheerup_ehime.spiders']
NEWSPIDER_MODULE = 'cheerup_ehime.spiders'

# Respect robots.txt
ROBOTSTXT_OBEY = True

# Keep concurrency low to avoid hammering the municipal site
CONCURRENT_REQUESTS = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 2
CONCURRENT_REQUESTS_PER_IP = 0

ITEM_PIPELINES = {
    'cheerup_ehime.pipelines.CheerupEhimePipeline': 100,
}

# Cache responses for 24 hours so repeated runs don't re-fetch pages
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'
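
To be even gentler on the server, a fixed delay and automatic throttling could be added as well; a minimal sketch (these values are arbitrary, not from the original settings):

DOWNLOAD_DELAY = 1           # wait at least 1 second between requests
AUTOTHROTTLE_ENABLED = True  # let Scrapy adjust the delay from response latency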

/cheerup_ehime/pipelines.py

import string
import os


class CheerupEhimePipeline(object):
    def open_spider(self, spider):
        # Load the Markdown template once when the spider starts
        self.markdown = open('template.md', encoding='utf-8').read()

    def process_item(self, item, spider):
        # Substitute the scraped fields into the template
        template = string.Template(self.markdown)
        md_post = template.safe_substitute(item)

        # Use the source HTML file name (minus extension) in the output name
        root, _ = os.path.splitext(os.path.basename(item['url']))

        filename = '{}-uwajima-{}.md'.format(item['date'].strftime('%Y-%m-%d'),
                                             root)

        # Write UTF-8 explicitly so Japanese text survives any locale
        with open(filename, 'w', encoding='utf-8') as fw:
            fw.write(md_post)

        return item
template.md (read from the directory scrapy is run in, typically the project root)

---
title: $title
date: $date 12:00:00
category:
  - 宇和島市
tag:
  - 緊急速報
---
宇和島市で公開された情報です。(オリジナルは[こちら]($url))

$body
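
As a hypothetical example, an article page at .../12345.html published on 2018-07-10 would come out as a file named

2018-07-10-uwajima-12345.md

with $title, $date, $url, and $body replaced by the scraped values.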

Run

scrapy crawl uwajima
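
If you just want the raw items instead of Markdown files, Scrapy's built-in feed exports can write them out directly, no pipeline needed:

scrapy crawl uwajima -o items.json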