Scraping Test

Scrapy

First, the Scrapy version. Create the project, generate a CrawlSpider skeleton for www.ehime-np.co.jp, and run it with CSV export:

scrapy startproject ehime_news
cd ehime_news
scrapy genspider -t crawl ehime_np www.ehime-np.co.jp
scrapy crawl ehime_np -o test.csv

ehime_news/items.py defines the fields to collect: title, link and description.

import scrapy


class EhimeNewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    description = scrapy.Field()
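
For reference, this is how the EhimeNewsItem would be populated from a spider callback. It is only a sketch and not part of the original post; the spider further down returns a plain dict, which Scrapy also accepts as an item.

from ehime_news.items import EhimeNewsItem


# Drop-in replacement for parse_item in the spider shown later.
def parse_item(self, response):
    item = EhimeNewsItem()
    item['title'] = response.css('h2.article_tit--tit::text').extract_first()
    item['link'] = response.url
    item['description'] = response.css(
        'div.article_detail__body > p::text').extract()
    yield item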

ehime_news/settings.py: the important parts are being polite (obey robots.txt, low concurrency, a 3-second download delay) and HTTP caching so repeated test runs don't hit the site again.

# -*- coding: utf-8 -*-

# Scrapy settings for ehime_news project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ehime_news'

SPIDER_MODULES = ['ehime_news.spiders']
NEWSPIDER_MODULE = 'ehime_news.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ehime_news (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 2

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 2
CONCURRENT_REQUESTS_PER_IP = 0

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ehime_news.middlewares.EhimeNewsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ehime_news.middlewares.EhimeNewsDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'ehime_news.pipelines.EhimeNewsPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Only follow links one level deep from the start (list) pages
DEPTH_LIMIT = 1
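
settings.py enables ehime_news.pipelines.EhimeNewsPipeline at priority 100, but the pipeline code itself is not shown in this post. Here is a minimal sketch of what ehime_news/pipelines.py might look like; the drop-items-without-a-title rule is an assumption, not the original logic.

from scrapy.exceptions import DropItem


class EhimeNewsPipeline(object):
    def process_item(self, item, spider):
        # Assumed behaviour: discard items that have no title,
        # pass everything else through unchanged.
        if not item.get('title'):
            raise DropItem('missing title in %s' % item)
        return item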

ehime_news/spiders/ehime_np.py is the spider: it POSTs to the paginated news list and follows each article link.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class EhimeNpSpider(CrawlSpider):
    name = 'ehime_np'
    allowed_domains = ['www.ehime-np.co.jp']

    # Follow article links (/article/news) found inside the list page's
    # article_box <dl> blocks and hand each one to parse_item.
    rules = (Rule(
        LinkExtractor(
            allow=r'/article/news',
            restrict_css=
            '#js_contents > section > main.main_box > article.article_box > dl'
        ),
        callback='parse_item',
        follow=True), )

    def start_requests(self):
        # The news list is paginated via a POST form field; request pages 0-2.
        for i in range(3):
            yield scrapy.FormRequest(
                'https://www.ehime-np.co.jp/online/news/ehime/list/',
                formdata={
                    'imgtextlink_rightimg_box_A@ehime_list_616[page]': str(i)
                })

    def parse_item(self, response):
        # Extract the headline, the article URL and the body paragraphs.
        i = {}
        i['title'] = response.css('h2.article_tit--tit::text').extract_first()
        i['link'] = response.url
        i['description'] = response.css(
            'div.article_detail__body > p::text').extract()
        return i
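
The spider can also be started from a plain Python script instead of the scrapy CLI, using Scrapy's CrawlerProcess. A minimal sketch (the filename run_ehime_np.py is an assumption; run it from the project root so get_project_settings() can find scrapy.cfg):

# run_ehime_np.py (sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from ehime_news.spiders.ehime_np import EhimeNpSpider

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(EhimeNpSpider)  # same as: scrapy crawl ehime_np
    process.start()  # blocks until the crawl finishes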

BeautifulSoup

The same site scraped with requests + BeautifulSoup, this time against the heavy-rain news list:

import datetime
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Identify as a regular desktop browser (IE11 user-agent string).
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}


def get_url(url, page):
    """POST to the paginated list and collect the linked article URLs."""

    result = []

    for i in range(page):
        # The heavy-rain list paginates through a POST form field.
        payload = {
            'imgtextlink_rightimg_box_A@heavyrain_list_3730[page]': str(i)
        }

        r = requests.post(url, headers=headers, data=payload)

        if r.status_code == requests.codes.ok:

            soup = BeautifulSoup(r.content, 'html5lib')

            for link in soup.select(
                    'dl.imgtextlink_rightimg_box > dd.imgtextlink_rightimg__text > a'
            ):
                result.append(
                    urljoin('https://www.ehime-np.co.jp/', link.get('href')))

            time.sleep(1)

    return result


def scraping(url):
    """Extract the title and subtitle from a single article page."""

    result = {}

    r = requests.get(url, headers=headers)

    if r.status_code == requests.codes.ok:

        soup = BeautifulSoup(r.content, 'html5lib')

        base = soup.select_one('article.article_detail_box')

        title = base.select_one('h2.article_tit--tit').get_text(strip=True)

        # Some articles may not have a subtitle, so guard against None here.
        subtitle_tag = base.select_one('h3.article_tit--subtit')
        subtitle = subtitle_tag.get_text(strip=True) if subtitle_tag else ''

        result = {'title': title, 'subtitle': subtitle, 'url': url}

    print(result)

    return result


# Main: collect the article URLs from the list pages, then scrape each
# article, pausing between requests to stay polite.
if __name__ == '__main__':

    urls = get_url('https://www.ehime-np.co.jp/online/life/heavyrain/list/', 3)

    result = []
    for url in urls:
        result.append(scraping(url))
        time.sleep(1)
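
    # Sketch, not in the original script: write the collected dicts to CSV,
    # mirroring the Scrapy run's "-o test.csv" (the filename is an assumption).
    import csv

    rows = [r for r in result if r]  # skip empty dicts from failed requests
    with open('heavyrain.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'subtitle', 'url'])
        writer.writeheader()
        writer.writerows(rows)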