首页 » 技术分享 » Python——爬取赶集网招聘信息——求职信息--兼职工作信息

Python——爬取赶集网招聘信息——求职信息--兼职工作信息

 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request
from bs4 import  BeautifulSoup

# Browser User-Agent — Ganji may reject requests from the default urllib agent.
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")

# Fetch the Ganji Beijing front page once at import time and parse it.
req = request.Request("http://bj.ganji.com/")
req.add_header('User-Agent', USER_AGENT)
response = request.urlopen(req)
html = response.read().decode('UTF-8')
soup = BeautifulSoup(html, 'html.parser')
# Every <a> tag on the page; getZPUrl() later buckets them by href prefix
# ("zp" = full-time postings, "jz" = part-time, "qz" = job seekers).
# find_all is the modern name for the deprecated findAll alias.
gongzuo = soup.find_all('a')

# Category-name -> listing-URL maps, filled in by getZPUrl().
zps = {}  # 招聘: full-time job postings (href prefix "zp")
jzs = {}  # 兼职: part-time jobs (href prefix "jz")
qzs = {}  # 求职: job seekers (href prefix "qz")
#保存招聘网址
def getZPUrl():
    """Bucket every front-page anchor into the module-level category dicts.

    Each anchor whose href starts with "zp" (full-time), "jz" (part-time)
    or "qz" (job seeker) is stored as ``link text -> absolute URL``.

    Returns:
        tuple: ``(zps, jzs, qzs)`` — the three module-level dicts, also
        mutated in place (same side effect as the original code).
    """
    base = "http://bj.ganji.com/"
    # href prefix -> destination dict; replaces the original if/elif chain.
    buckets = {"zp": zps, "jz": jzs, "qz": qzs}
    for anchor in gongzuo:
        try:
            href = str(anchor['href'])
        except KeyError:
            # Anchor without an href attribute — skip it.  The original
            # bare `except: pass` silently hid every kind of error.
            continue
        bucket = buckets.get(href[:2])
        if bucket is not None:
            # The tag's own text is the category label; re-parsing the tag
            # with BeautifulSoup(str(anchor)) as before is unnecessary.
            bucket[anchor.get_text()] = base + href
    return zps, jzs, qzs

#判断页数是否到最后
# def judgeNum(soup):
#     numberPages = soup.find_all("ul", attrs={'class': 'pageLink clearfix'})
#     print(numberPages)

#解析每种类型的招聘网址
def getZPInfo():
    """Fetch the first full-time category page and print each posting's
    publish time.

    Only the first category URL is processed — the trailing ``break`` is
    kept from the original code (apparently a debugging leftover).
    """
    zps, jzs, qzs = getZPUrl()
    # Drop the "featured" aggregate links, which are not real categories.
    # pop(..., None) tolerates the key being absent, unlike the original
    # `del`, which raised KeyError whenever the site layout changed.
    zps.pop("放心企业", None)
    jzs.pop("放心兼职", None)
    print(zps)
    user_agent = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
    # zp is each category's listing URL; only page 1 is requested —
    # paging detection was never implemented (see commented-out judgeNum).
    for zp in zps.values():
        req = request.Request(zp)
        req.add_header("User-Agent", user_agent)
        response = request.urlopen(req)
        html = response.read().decode("UTF-8")
        soup = BeautifulSoup(html, 'html.parser')
        # One <dl class="con-list-zcon new-dl"> per posting on the page.
        for gongzuoInfo in soup.find_all(class_="con-list-zcon new-dl"):
            # dd[2] holds the publish-time <span>; the other fields (job
            # name, company, salary, welfare, location) were parsed by the
            # commented-out code that follows this function.
            releasetime = gongzuoInfo.find_all("dd")[2].span
            print(releasetime.string)
        break  # debug leftover: stop after the first category

    # for jz in jzs.values():
    #     print(jz)
    #     req=request.Request(jz);
    #     req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36");
    #     response=request.urlopen(req);
    #     html=response.read().decode("UTF-8");
    #     soup=BeautifulSoup(html,"html.parser");
    #     # 获取每一页的多条职位信息的html
    #     for jianzhiInfo in soup.find_all('dl',attrs={"class":"list-noimg job-list clearfix"}):
    #         #获取工作名称
    #         gongzuoName=jianzhiInfo.dt.a.string
    #         print(gongzuoName)
    #         #获取公司名称
    #         try:
    #             TranslateName=jianzhiInfo.find_all("dd")[0].a.attrs["title"];
    #         except:
    #             TranslateName=jianzhiInfo.find_all(class_="s-tit14 fl")[0].string
    #             print(TranslateName)
    #         #获取工作地点
    #         gongzuodidian=jianzhiInfo.find_all("dd",attrs={"class":"pay"});
    #         if len(gongzuodidian)==0:
    #             gongzuodidian="";
    #         else:
    #             gongzuodidian=gongzuodidian[0].string;
    #         print(gongzuodidian)
    #         #获取发布时间
    #         try:
    #             pub_time=jianzhiInfo.find_all("dd",attrs={"class":"pub-time"})[0].string;
    #         except:
    #             pub_time=""
    #         print(pub_time)
    #         pass
    #
    #     break
    # for qz in qzs.values():
    #     req=request.Request(qz);
    #     req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")
    #     response=request.urlopen(req);
    #     html=response.read().decode("UTF-8");
    #     soup=BeautifulSoup(html,"html.parser");
    #     #获取去每一页的信息数量
    #     info_num=soup.find_all("dl", attrs={"class": "list-noimg job-j-list clearfix job-new-list"})
    #     for userinfoall in soup.find_all("dl",attrs={"class":"list-noimg job-j-list clearfix job-new-list"}):
    #         userinfo_a=userinfoall.a
    #         #获取求职者图片
    #         user_img=userinfo_a.find_all("div",attrs={"class","fl head-portrait"})[0].img.attrs["src"];
    #         if str(user_img).find("http")==-1:
    #             user_img="http:"+user_img;
    #         print(user_img)
    #         #获取求职者信息
    #         userinfo=userinfo_a.dt.div.div
    #         userinfo_name=userinfo.find_all("span",attrs={"class":"name"})[0].string
    #         userinfo_sex = userinfo.find_all("span", attrs={"class": "bor-right"})[0].string
    #         userinfo_age = userinfo.find_all("span", attrs={"class": "bor-right"})[1].string
    #         #学历
    #         userinfo_edu = userinfo.find_all("span", attrs={"class": "bor-right"})[2].string
    #         #工作经验
    #         userinfo_experience = userinfo.find_all("span")[4].string
    #         #期望工作地点
    #         userinfo_workplace=userinfo_a.find_all("p",attrs={"class":"district"})[0].string.replace(" ","");
    #         print(userinfo_workplace)
    #         #期望薪资
    #         userinfo_salary = userinfo_a.find_all("p", attrs={"class": "salary"})[0].string.replace(" ", "");
    #         print(userinfo_salary)
    #         #发布时间
    #         userinfo_time = userinfo_a.find_all("div", attrs={"class": "order fr"})[0].string.replace(" ","")
    #         print(userinfo_time)
    #         pass
    #     break




# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    getZPInfo()



#使用bs4模块  urllib模块


转载自原文链接, 如需删除请联系管理员。

原文链接:Python——爬取赶集网招聘信息——求职信息--兼职工作信息,转载请注明来源!

0