普段Beautifulsoupしか使ってないのでScrapyで作成してみた
github.com
をベースに作成しました。
note.nkmk.me
https://doc.scrapy.org/en/latest/topics/commands.html
pip install scrapy
scrapy startproject cheerup_ehime
cd cheerup_ehime
scrapy genspider -t crawl uwajima www.city.uwajima.ehime.jp
cheerup_ehime
├── scrapy.cfg
└── cheerup_ehime
├── __init__.py
├── __pycache__
├── items.py
├── middlewares.py
├── pipelines.py
├── settings.py
└── spiders
├── __init__.py
├── uwajima.py
└── __pycache__
/cheerup_ehime/items.py
import scrapy
class CheerupEhimeItem(scrapy.Item):
    """Container for one scraped news article from the Uwajima city site."""
    title = scrapy.Field()  # article headline text
    date = scrapy.Field()   # publication date parsed by the spider
    url = scrapy.Field()    # URL of the source page
    body = scrapy.Field()   # full article body text
/cheerup_ehime/spiders/uwajima.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import datetime
class UwajimaSpider(CrawlSpider):
    """Crawl the Uwajima city news index and scrape each linked article."""

    name = 'uwajima'
    allowed_domains = ['www.city.uwajima.ehime.jp']
    start_urls = ['https://www.city.uwajima.ehime.jp/soshiki/list3-1.html']

    # Follow only the links inside the news list block; parse each target page.
    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="news_wrap"]/div[@class="list_ccc"]/ul'),
            callback='parse_item',
            follow=True),
    )

    def parse_item(self, response):
        """Extract title, publication date, body text, and URL from one article page."""
        title = response.xpath(
            '//*[@id="main_header"]/h1/text()').extract_first()
        # The date element reads like '掲載日:2018年7月5日更新'; capture Y/M/D.
        ymd = response.xpath('//*[@id="content_date"]/text()').re(
            r'掲載日:(\d{4})年(\d{1,2})月(\d{1,2})日更新')
        published = datetime.date(*map(int, ymd))
        # Collect every non-blank text fragment of the article body and
        # separate the fragments with blank lines.
        fragments = response.xpath(
            '//*[@id="main_body"]/div[@class="detail_free"]//text()').extract()
        body = '\n\n'.join(t.strip() for t in fragments if t.strip())
        return {
            'title': title,
            'date': published,
            'body': body,
            'url': response.url,
        }
shinyorke.hatenablog.com
/cheerup_ehime/settings.py
# Scrapy project settings for the cheerup_ehime crawler.
BOT_NAME = 'cheerup_ehime'
SPIDER_MODULES = ['cheerup_ehime.spiders']
NEWSPIDER_MODULE = 'cheerup_ehime.spiders'
# Respect the target site's robots.txt.
ROBOTSTXT_OBEY = True
# Keep concurrency low (2 requests) to be polite to the municipal server.
CONCURRENT_REQUESTS = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 2
# 0 disables the per-IP limit, so the per-domain limit above applies.
CONCURRENT_REQUESTS_PER_IP = 0
# Route every scraped item through the Markdown-writing pipeline.
ITEM_PIPELINES = {
    'cheerup_ehime.pipelines.CheerupEhimePipeline': 100,
}
# Cache fetched pages locally for one day to avoid re-downloading during development.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24
HTTPCACHE_DIR = 'httpcache'
/cheerup_ehime/pipelines.py
import string
import os
class CheerupEhimePipeline(object):
    """Render each scraped item into a Markdown post file via template.md."""

    def open_spider(self, spider):
        # Read the template once per crawl. Use a context manager so the
        # handle is closed (the original leaked it), and pin the encoding so
        # the Japanese template text decodes the same on every platform.
        with open('template.md', encoding='utf-8') as f:
            self.markdown = f.read()

    def process_item(self, item, spider):
        """Substitute item fields into the template and write one .md file.

        Returns the item unchanged so later pipelines can process it.
        """
        template = string.Template(self.markdown)
        # safe_substitute leaves unknown $placeholders intact instead of raising.
        md_post = template.safe_substitute(item)
        # Use the source page's file name (without extension) as the post slug.
        root, _ = os.path.splitext(os.path.basename(item['url']))
        filename = '{}-uwajima-{}.md'.format(item['date'].strftime('%Y-%m-%d'),
                                             root)
        # Explicit UTF-8: the body contains Japanese text, and the platform
        # default encoding (e.g. cp1252 on Windows) would raise or corrupt it.
        with open(filename, 'w', encoding='utf-8') as fw:
            fw.write(md_post)
        return item
---
title: $title
date: $date 12:00:00
category:
- 宇和島市
tag:
- 緊急速報
---
宇和島市で公開された情報です。(オリジナルは[こちら]($url))
$body
実行
scrapy crawl uwajima