首页 » 技术分享 » Python爬取好大夫网站

Python爬取好大夫网站

 

本次实验只是简单地爬取好大夫官网的医疗团队信息:

import urllib.request
import re
import random
# Pool of desktop-browser User-Agent strings; one is chosen at random for
# each page request so that consecutive requests do not share one UA.
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
]

# Pool of proxy endpoints in "host:port" form; one is chosen at random for
# each page request.
ippools = [
    # NOTE(review): this entry previously read '1183.129.207.80:12520', which
    # is not a valid IPv4 address (first octet > 255). The leading '1' is
    # presumed to be a typo - confirm against the original proxy list.
    '183.129.207.80:12520',
    '115.218.212.239:9000',
    '121.232.194.111:9999',
    '117.91.254.235:9999',
    '118.24.156.214:8118',
]
def ip(uapools, ippools):
    """Install a global urllib opener with a random User-Agent and proxy.

    Picks one entry from each pool, prints both choices, and installs an
    opener built from them via ``urllib.request.install_opener`` so that
    later ``urllib.request.urlopen`` calls go through it.

    Args:
        uapools: Non-empty sequence of User-Agent header strings.
        ippools: Non-empty sequence of proxy endpoints ("host:port").

    Raises:
        IndexError: If either pool is empty (from ``random.choice``).
    """
    thisheaders = random.choice(uapools)
    print(thisheaders)
    thisip = random.choice(ippools)
    print(thisip)
    # ProxyHandler applies a mapping entry only to requests whose URL scheme
    # matches the key. Registering 'http' alone would leave https:// requests
    # going out directly, so the proxy is registered for both schemes.
    proxy = urllib.request.ProxyHandler({'http': thisip, 'https': thisip})
    # build_opener already adds the default HTTP/HTTPS handlers.
    opener = urllib.request.build_opener(proxy)
    opener.addheaders = [('User-Agent', thisheaders)]
    urllib.request.install_opener(opener)

# Walk listing pages p_1 .. p_998 of the doctor-team index and save every
# team page linked from each listing page to disk as raw HTML.

# Patterns are compiled once up front instead of once per page.
link_re = re.compile('<a class="d-c-i-c-n-click" href="(.*?)">')
title_re = re.compile('">(.*?)</a></p>')
save_dir = "E:\\PythonStudy\\haodaifu_data_doctorinfo\\"  # must already exist

for x in range(1, 999):
    # The try sits inside the loop so that a failure on one page (for example
    # a dead proxy or an HTTP error) is reported and the crawl moves on to
    # the next page instead of stopping altogether.
    try:
        ip(uapools, ippools)  # fresh random User-Agent / proxy for this page
        print('当前为爬取第{}次。'.format(x))
        url = 'https://www.haodf.com/doctorteam/p_' + str(x) + '.htm'
        # The context manager closes the HTTP response once it has been read.
        # Undecodable bytes are dropped ('ignore').
        with urllib.request.urlopen(url) as response:
            data = response.read().decode('gbk', 'ignore')
        alllink = link_re.findall(data)
        alltitle = title_re.findall(data)
        print(len(alllink))
        print(len(alltitle))
        for i, href in enumerate(alllink):
            # Prefixing the scheme presumes the captured hrefs are
            # protocol-relative ("//host/path") - verify against the page.
            link = 'https:' + href
            # The '_' separator keeps names unique: plain concatenation maps
            # both (page 1, item 11) and (page 11, item 1) to "111.html".
            urllib.request.urlretrieve(
                link, save_dir + str(x) + '_' + str(i) + ".html")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries a status code as well.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)

 

转载自原文链接，如需删除请联系管理员。

原文链接:Python爬取好大夫网站,转载请注明来源!

0