首页 » 技术分享 » 快代理爬取免费代理

快代理爬取免费代理

 

爬取快代理网址,提取免费代理,并将可用代理保存在本地,(练习爬虫,因为很多免费代理只能用一次)
本次利用消息队列,对多个URL同时提取,注意延迟访问,

import requests
import time
import random
from lxml import etree
from fake_useragent import UserAgent
from queue import Queue
from threading import Thread


def get_ip():
    while True:
        if not q.empty():
        	# 验证IP是否可用网址
            url = 'http://httpbin.org/get'
            proxies = q.get()
            try:
                html = requests.get(url, headers=headers, proxies=proxies, timeout=10).text
                print('ip可以用')
                with open('ip.txt','a')as f:
                    f.write(str(proxies))
                    f.write('\n')
            except:
                print('ip不可用  下一个')
        else:
            break

def main():
    t_list = []
    for i in range(5):
        t = Thread(target=get_ip)
        t_list.append(t)
        t.start()

    for t in t_list:
        t.join()


ip_list = []
q = Queue()
# 爬取的是800到1000页   靠后的代理可能用的人少点,。自我安慰。。
for i in range(800 , 1000):
    url = 'https://www.kuaidaili.com/free/inha/{}'.format(i)
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    # proxies = {'http': 'http://211.159.219.225:8118', 'https': 'https://211.159.219.225:8118'}
    # html = requests.get(url, headers=headers,proxies=proxies).text
    html = requests.get(url, headers=headers).text
    # print(html)
    parse_html = etree.HTML(html)
    tr_list = parse_html.xpath('//*[@id="list"]/table/tbody/tr')
    # 延迟访问6到11秒。
    sleep = random.randint(6, 11)
    print(f'等待{sleep}秒')
    time.sleep(sleep)
    print('开始')
    for tr in tr_list[1:]:
        ip = tr.xpath('./td[1]/text()')[0]
        port = tr.xpath('./td[2]/text()')[0]
        proxies = {
            'http': f'http://{ip}:{port}',
            'https': f'https://{ip}:{port}',
        }
        print(proxies)
        # 存入队列
        q.put(proxies)
    main()

print(ip_list)






转载自原文链接, 如需删除请联系管理员。

原文链接:快代理爬取免费代理,转载请注明来源!

0