Scrapy
scrapy startproject ehime_news
cd ehime_news
scrapy genspider -t crawl ehime_np www.ehime-np.co.jp
scrapy crawl ehime_np -o test.csv
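The first three commands scaffold the project; the crawl command is run last, once the files below are edited. For orientation, scrapy startproject generates roughly this layout (standard Scrapy output):

ehime_news/
    scrapy.cfg
    ehime_news/
        items.py
        pipelines.py
        settings.py
        spiders/
            ehime_np.py   # created by genspider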
# items.py
import scrapy


class EhimeNewsItem(scrapy.Item):
    # One Field per value scraped from an article page.
    title = scrapy.Field()
    link = scrapy.Field()
    description = scrapy.Field()
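A Field just declares a key; the item itself behaves like a dict. A quick check with hypothetical values:

item = EhimeNewsItem(title='Example headline', link='https://example.com/a/1')
item['title']   # 'Example headline'
item.get('description')  # None until the spider fills it in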
# settings.py (excerpt)
BOT_NAME = 'ehime_news'

SPIDER_MODULES = ['ehime_news.spiders']
NEWSPIDER_MODULE = 'ehime_news.spiders'

# Politeness: honour robots.txt and throttle the crawl.
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS_PER_DOMAIN = 2
CONCURRENT_REQUESTS_PER_IP = 0

ITEM_PIPELINES = {
    'ehime_news.pipelines.EhimeNewsPipeline': 100,
}

# Cache responses for a day so repeated test runs don't re-hit the site.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'

# Only follow links one hop away from the list pages.
DEPTH_LIMIT = 1
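ITEM_PIPELINES registers ehime_news.pipelines.EhimeNewsPipeline, but that file is not shown here. A minimal sketch of what it might contain; dropping items without a title is an assumption for illustration, not logic from the original post:

# pipelines.py (hypothetical sketch)
from scrapy.exceptions import DropItem


class EhimeNewsPipeline:
    def process_item(self, item, spider):
        # Assumed filter: discard articles whose title failed to extract.
        if not item.get('title'):
            raise DropItem('missing title')
        return item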
# spiders/ehime_np.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class EhimeNpSpider(CrawlSpider):
    name = 'ehime_np'
    allowed_domains = ['www.ehime-np.co.jp']

    # Follow only article links found inside the list page's <dl> blocks.
    rules = (
        Rule(
            LinkExtractor(
                allow=r'/article/news',
                restrict_css='#js_contents > section > main.main_box > article.article_box > dl',
            ),
            callback='parse_item',
            follow=True,
        ),
    )

    def start_requests(self):
        # The list page is paginated through a POST parameter,
        # so request pages 0-2 explicitly instead of using start_urls.
        for i in range(3):
            yield scrapy.FormRequest(
                'https://www.ehime-np.co.jp/online/news/ehime/list/',
                formdata={
                    # Site-specific form field that selects the page number.
                    'imgtextlink_rightimg_box_A@ehime_list_616[page]': str(i)
                })

    def parse_item(self, response):
        i = {}
        i['title'] = response.css('h2.article_tit--tit::text').extract_first()
        i['link'] = response.url
        i['description'] = response.css(
            'div.article_detail__body > p::text').extract()
        return i
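Note that parse_item builds a plain dict, so the EhimeNewsItem defined in items.py is never actually used. Wiring it in is a small change; a sketch, assuming `from ehime_news.items import EhimeNewsItem` is added at the top of the spider:

    def parse_item(self, response):
        # Same selectors as above, but yielding the declared Item type
        # so the pipeline and feed exporter see named fields.
        item = EhimeNewsItem()
        item['title'] = response.css('h2.article_tit--tit::text').extract_first()
        item['link'] = response.url
        item['description'] = response.css(
            'div.article_detail__body > p::text').extract()
        return item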
BeautifulSoup
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Identify the client; this UA string mimics IE11.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
def get_url(url, page):
    """Collect article URLs from the first `page` pages of a list page."""
    result = []
    for i in range(page):
        # The list is paginated through a site-specific POST parameter.
        payload = {
            'imgtextlink_rightimg_box_A@heavyrain_list_3730[page]': str(i)
        }
        r = requests.post(url, headers=headers, data=payload)
        if r.status_code == requests.codes.ok:
            soup = BeautifulSoup(r.content, 'html5lib')
            for link in soup.select(
                    'dl.imgtextlink_rightimg_box > dd.imgtextlink_rightimg__text > a'
            ):
                # Hrefs are relative; resolve them against the site root.
                result.append(
                    urljoin('https://www.ehime-np.co.jp/', link.get('href')))
        time.sleep(1)  # be polite between requests
    return result
def scraping(url):
    """Fetch one article page and extract its title and subtitle."""
    result = {}
    r = requests.get(url, headers=headers)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, 'html5lib')
        base = soup.select_one('article.article_detail_box')
        title = base.select_one('h2.article_tit--tit').get_text(strip=True)
        # Not every article has a subtitle; guard against select_one()
        # returning None, which would otherwise raise AttributeError.
        subtit = base.select_one('h3.article_tit--subtit')
        subtitle = subtit.get_text(strip=True) if subtit else ''
        result = {'title': title, 'subtitle': subtitle, 'url': url}
        print(result)
    return result
if __name__ == '__main__':
    urls = get_url('https://www.ehime-np.co.jp/online/life/heavyrain/list/', 3)
    result = [scraping(url) for url in urls]
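The Scrapy version wrote its output with -o test.csv; to get the same from this script, the collected dicts can be dumped with the standard csv module. A sketch, with field names matching what scraping() returns:

import csv

rows = [r for r in result if r]  # skip empty dicts from failed requests
with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'subtitle', 'url'])
    writer.writeheader()
    writer.writerows(rows)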