import csv
import time
from urllib.parse import parse_qs, urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every request; a desktop-browser User-Agent
# (IE11 on Windows 10) so the site serves its normal HTML pages instead
# of blocking/altering responses for script clients.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
def scraping(url):
    """Scrape one clinic detail page.

    Fetches *url*, extracts the coordinates from the embedded Google Maps
    iframe plus the text of every cell in the ``#hor-zebra`` table.

    Returns:
        list[str]: ``[coord1, coord2] + table_cell_texts``, where the two
        coordinates are the comma-separated pair from the map URL's
        ``q=loc:...`` parameter (Google Maps puts latitude first —
        the original code labeled these lon/lat, which looked swapped;
        the value order is preserved here).
        ``None`` when the HTTP request fails or no map iframe is found.
    """
    # timeout so a stalled server cannot hang the whole crawl
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code != requests.codes.ok:
        return None
    soup = BeautifulSoup(r.content, 'html5lib')
    result = [
        td.get_text(strip=True)
        for td in soup.select('table#hor-zebra > tbody > tr > td')
    ]
    iframe = soup.select_one(
        'table#jyouhou-table > tbody > tr > td > div > iframe')
    if iframe is None:
        # Original code would raise AttributeError here; fail soft instead.
        return None
    gmap = iframe.get('src')
    query = parse_qs(urlparse(gmap).query)
    # 'q' looks like "loc:<lat>,<lon>"; keep the original output order.
    lat, lon = query['q'][0].replace('loc:', '').split(',')
    return [lat, lon] + result
if __name__ == '__main__':
    # Listing page: dental clinics in Ehime, 5 results per page.
    url = 'https://www.ehimeda.or.jp/doctor/sdental_u/resultlist.php?selADD1=02&SelCount=5'
    r = requests.get(url, headers=headers, timeout=30)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, 'html5lib')
        # Pin UTF-8: the site content is Japanese and the platform default
        # encoding (e.g. cp932 on Windows) can raise UnicodeEncodeError.
        with open('result.csv', 'w', encoding='utf-8') as fw:
            writer = csv.writer(fw, dialect='excel', lineterminator='\n')
            # [1:] skips the table's header row.
            for tr in soup.select('table#list-table > tbody > tr')[1:]:
                row = [td.get_text(strip=True) for td in tr.select('td')]
                link = urljoin(url, tr.select_one('a').get('href'))
                detail = scraping(link)
                # scraping() returns None on a failed fetch; the original
                # `result + temp` would raise TypeError in that case.
                writer.writerow(row + (detail if detail is not None else []))
                # Be polite: one detail-page request per second.
                time.sleep(1)