読者です 読者をやめる 読者になる 読者になる

Scrapy

Scrapy Tutorial — Scrapy 1.3.0 documentation

Scrapy 1.2 ドキュメント — Scrapy 1.2.2 ドキュメント

data.gunosy.io

speakerdeck.com

# インストール
pip install scrapy
conda install -c conda-forge scrapy=1.3.0

# プロジェクト作成
scrapy startproject ehime_np

cd ehime_np

# items.py編集

import scrapy


class EhimeNpItem(scrapy.Item):
    """Scrapy item with two declared fields: ``title`` and ``url``."""

    # Title of the scraped entry.
    title = scrapy.Field()
    # URL of the scraped entry.
    url = scrapy.Field()
# ひながた作成 
scrapy genspider ehime_news www.ehime-np.co.jp

# テスト
scrapy shell https://www.ehime-np.co.jp/online/news/ehime/list/
response.css('#js_contents > section > main > article > dl > dd > a > span.imgtextlink_rightimg__text--tit::text').extract_first()
# -*- coding: utf-8 -*-
import scrapy
from ehime_np.items import EhimeNpItem

class EhimeNewsSpider(scrapy.Spider):
    """Spider that extracts headline titles and links from the Ehime news list page.

    Crawling starts at the single URL in ``start_urls``; each ``dd`` entry of
    the article list is turned into one ``EhimeNpItem``.
    """

    name = "ehime_news"
    allowed_domains = ["www.ehime-np.co.jp"]
    start_urls = (
        'https://www.ehime-np.co.jp/online/news/ehime/list/',
    )

    def parse(self, response):
        """Yield one ``EhimeNpItem`` per linked entry found in the response.

        Args:
            response: The downloaded list page.

        Yields:
            EhimeNpItem: ``title`` holds the text of the entry's title span
            (``None`` when that span is absent) and ``url`` holds the entry's
            link resolved against the response URL.
        """
        for entry in response.css("#js_contents > section > main > article > dl > dd"):
            href = entry.css("a::attr('href')").extract_first()
            if not href:
                # Without an href, urljoin() would return the list page's own
                # URL, producing an item that points at nothing useful.
                continue
            yield EhimeNpItem(
                title=entry.css("a > span.imgtextlink_rightimg__text--tit::text").extract_first(),
                url=response.urljoin(href),
            )
scrapy crawl ehime_news -o test.csv

scrapy parse --spider=ehime_news http://www.ehime-np.co.jp/online/news/ehime/list