# Reference: kanji.hatenablog.jp
# Reference: github.com
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from extractcontent3 import ExtractContent
# Impersonate IE11 so sites that reject unknown clients still respond.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
    ),
}
def scraping(url):
    """Fetch *url*, extract the main article text with ExtractContent,
    and print the title, body text, and a separator line.

    Does nothing when *url* is falsy or the HTTP response is not 200 OK.
    """
    if not url:
        # Guard: callers pass anchor hrefs, which may be missing/None.
        return
    # timeout keeps the crawler from hanging forever on an unresponsive host
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code == requests.codes.ok:
        # apparent_encoding sniffs the charset from the payload, which is
        # more reliable than the HTTP header for many Japanese sites
        r.encoding = r.apparent_encoding
        extractor = ExtractContent()
        extractor.analyse(r.text)
        text, title = extractor.as_text()
        print(title)
        print(text)
        print('-' * 20)
if __name__ == '__main__':
    # Crawl the notice links on the Imabari city top page and print
    # the extracted article text for each one.
    url = 'https://www.city.imabari.ehime.jp/'
    r = requests.get(url, headers=headers, timeout=10)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, 'html5lib')
        for a in soup.select('div#top_osirse > dl > dd > a'):
            href = a.get('href')
            if not href:
                # Skip anchors with no href attribute.
                continue
            # Hrefs may be relative; resolve them against the top page.
            scraping(urljoin(url, href))
            # Be polite to the server: pause between requests
            # (this is what the `time` import is for).
            time.sleep(1)