首页 » 技术分享 » python_爬取【搜狗图片】

python_爬取【搜狗图片】

 

1.利用python抓取网站上的图片,对于学习python及对网页数据分析处理很有帮助,也可以学习一些web方面的知识,我尝试使用【搜狗图片】搜索到的图片作为抓取对象,抓取【搜狗图片】主页各个标题栏的图片,以及【其他】输入图片类型的图片,使用tkinter完成了一个简单的UI界面。

2.一般抓取网页图片,需要先访问页面,然后提取源码,依次解析各个图片URL,然后直接下载即可,这些网上的教程很多,在此不再赘述。但是对于一些图片较多的页面,往往使用动态加载的方式呈现图片,也就是我们抓取的页面源码中,并没有各个图片的URL,这就需要 分析页面结构,找到页面图片真正的URL资源地址,才能完成下载。

3.例如在【搜狗图片】搜索美女,然后点进图片,查找该图片的URL地址:

【请求URL】:

【页面源码】:找不到对应的图片URL。

但是通过分析发现图片是动态加载出来的,然后参考了 Python爬取网页中的图片(搜狗图片)详解

发现【标题类】的图片都集中在如下URL:

而通过【搜索】得到得图片URL集中在:

这样,就可以很清楚的得到各个图片URL的地址,爬取图片了。

4.源码:

#-*- encoding=UTF-8 -*-
import urllib.request,socket,re,sys,os
from urllib.request import urlopen
import time
from tkinter import *
import webbrowser
from bs4 import BeautifulSoup
import requests
import json
import urllib

##############################常量区##############################
sougou_url="http://pic.sogou.com/"
###URL
download_pics_path="C:/bz2018/"
download_pics_num=10
download_success = ""
sougou_pics_tag=["pic_url","thumbUrl","bthumbUrl","ori_pic_url"]
sougou_url_pics_start="http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category="
sougou_url_pics_mid="&tag=%E5%85%A8%E9%83%A8&start=0&len="
sougou_url_pics_start_other="http://pic.sogou.com/pics?query="
sougou_url_pics_mid_other="&did=1&mode=1&start=0&len="
sougou_url_pics_stop_other="&reqType=ajax"
###title
title_key_start="a class=\"nav-tab\" href=\"/pics/"
title_key_stop="<"
###tkinter
window_name="搜狗图片下载器"
window_size="500x500"
frm_bg="white"
real_columnspan=4
label_type_str="---------------------------------------------图片类型---------------------------------------------"
##############################常量区##############################

real_url_arr = [] ###组成url集合

##############################函数区##############################
###获取网页上标题,返回标题数组
def get_title(url):
    html = urlopen(url)
    sougou_html = BeautifulSoup(html.read())
    title_key = []
    for ihtml in sougou_html:
        data1 = str(ihtml).split(title_key_start)
        if len(data1) > 1:
            for jhtml in data1:
                data2 = jhtml.split(title_key_stop)[0]
                data3 = data2.split("\">")
                if len(data3) == 2:
                    title_key.append(data3[1])
    return title_key

###获取网页图片并下载,返回下载失败个数
def get_pics(url,path):
    # 检测当前路径的有效性
    if not os.path.isdir(path):
        os.mkdir(path)
    pics_str = requests.get(url)
    pics_dict = json.loads(pics_str.text)
    pics_dict_items = pics_dict['all_items']
    i_item=0
    fail_count=0
    for item in pics_dict_items:
        fail_flag=0
        for itag in sougou_pics_tag:
            try:
                pic_url=item[itag]
                pic_title=item['title']
                if pic_title == "":
                    pic_title = str(i_item)
                    i_item = i_item + 1
                if pic_url != "":
                    urllib.request.urlretrieve(pic_url, path + pic_title + '.jpg')
                    print(pic_title+": download complete!")
                    fail_flag=1
                    break
            except:
                print("download fail!")
                continue
        if fail_flag != 1:
            fail_count=fail_count+1
    return fail_count

def get_pics_other(url,path):
    pics_str = requests.get(url)
    pics_dict = json.loads(pics_str.text)
    pics_dict_items = pics_dict['items']
    i_item=0
    fail_count=0
    for item in pics_dict_items:
        fail_flag=0
        for itag in sougou_pics_tag:
            try:
                pic_url=item[itag]
                pic_title=item['title']
                pic_title=pic_title+str(i_item)
                i_item = i_item + 1
                if pic_url != "":
                    urllib.request.urlretrieve(pic_url, path + pic_title + '.jpg')
                    print(pic_title+": download complete!")
                    fail_flag=1
                    break
            except:
                print("download fail!")
                continue
        if fail_flag != 1:
            fail_count=fail_count+1
    return fail_count

def url_get_othertype():
    global real_url_arr
    if PhotoType.get() != "":
        real_url_arr.append(PhotoType.get())
        real_url_arr = list(set(real_url_arr))

def url_get_phototype(all_type):
    global real_url_arr
    real_url_arr=[]
    url_get_othertype()
    if "其他" in all_type:
        all_type.remove("其他")
    for i in range(len(all_type)):
        if CheckType[i].get() == 1:
            real_url_arr.append(typeBtn[all_type[i]]['text'])
    real_url_arr = list(set(real_url_arr))

def other_type():
    if OtherType.get() == 1 :
        type["state"] = "normal"
    else:
        type["state"] = "disabled"
        PhotoType.set("")

def get_full_url(all_type):
    global download_pics_num
    down_result["text"] = ""
    url_get_phototype(all_type)
    if download_num_str.get() != "":
        download_pics_num = int((download_num_str.get()))
    sum = len(real_url_arr) * download_pics_num
    down_result["text"] = "准备下载: " + str(sum) + "张照片"
    fail_num = 0
    for iurl in real_url_arr:
        if iurl in photo_type:
            tmp_url=sougou_url_pics_start+iurl+sougou_url_pics_mid+str(download_pics_num)
            fail_num = fail_num + get_pics(tmp_url, download_pics_path)
        else:
            tmp_url=sougou_url_pics_start_other + iurl + sougou_url_pics_mid_other + str(download_pics_num) + sougou_url_pics_stop_other
            time.sleep(1)
            fail_num = fail_num + get_pics_other(tmp_url, download_pics_path)
    down_result["text"] ="成功下载: " + str(sum-fail_num) + "张照片"

###tkinter label占一行
def write_line(row,text="",column=0,columnspan=real_columnspan,bg=frm_bg):
    label = Label(frm, text=text, bg=bg)
    label.grid(row=row, column=column,columnspan=columnspan)
    return label

###调用网页
def callback(url=sougou_url):
    webbrowser.open_new(url)

##############################函数区##############################

##############################UI部分##########################################
root =Tk() #给窗体
root.title(window_name) #设置窗体名字
root.geometry(window_size)
root.resizable(width=False, height=False) ###固定窗体大小

frm=Frame(root,bg=frm_bg) #新建框架
frm.pack(expand = YES,fill = BOTH) #放置框架

###控制行的参数
real_row=0
###空一行
write_line(real_row)
real_row=real_row+1
###进入官网
Button(frm,text="点击进入搜狗图片官网",command=callback).grid(row=real_row,column=0,columnspan=real_columnspan,sticky=N)
real_row=real_row+1
###空一行
write_line(real_row)
real_row=real_row+1
###图片类型
write_line(real_row,label_type_str)
real_row=real_row+1
###空一行
write_line(real_row)
real_row=real_row+1

###checkbutton
photo_type=get_title(sougou_url)
photo_type.append("其他")
typeBtn={}
CheckType=[]
real_column=0
for itype in photo_type:
    if itype == "其他":
        OtherType = IntVar()
        PhotoType = StringVar()
        type = Entry(frm, textvariable=PhotoType, width=9, state='disabled')  # 添加输入框
        Checkbutton(frm, text="其他", variable=OtherType, onvalue=1, offvalue=2, command=other_type).grid(row=real_row, column=1)
        type.grid(row=real_row, column=2, columnspan=4, sticky=W, padx=40, ipadx=60)  # 放置输入框位置
    else:
        CheckType.append(IntVar())
        typeBtn[itype]=Checkbutton(frm, text=itype, variable=CheckType[-1], command=lambda: url_get_phototype(photo_type))
        typeBtn[itype].grid(row=real_row, column=real_column)
    real_column=real_column+1
    if real_column == 4:
        real_column = 0
        real_row = real_row + 1
real_row=real_row+1

###空一行
write_line(real_row)
real_row=real_row+1

###下载个数
lab1 = Label(frm,text = "下载个数:")# 添加Label
lab1.grid(row = real_row,column=0)
download_num_str = StringVar()
download_num = Entry(frm,width=10,textvariable=download_num_str)# 添加Entry
download_num.grid(row = real_row,column=1,sticky=W)
real_row=real_row+1

###空一行
write_line(real_row)
real_row=real_row+1

###get
Button(frm,text="获取照片",command=lambda: get_full_url(photo_type)).grid(row=real_row,column=0,columnspan=4,sticky=N)
real_row=real_row+1

###空一行
write_line(real_row)
real_row=real_row+1

###结果
down_result=write_line(real_row)
real_row=real_row+1

###空一行
write_line(real_row)
real_row=real_row+1

Button(frm,text="退出程序",command=root.quit).grid(row=real_row,column=0,columnspan=4,sticky=N)
real_row=real_row+1

mainloop()
##############################UI部分##########################################

转载自原文链接, 如需删除请联系管理员。

原文链接:python_爬取【搜狗图片】,转载请注明来源!

0