#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import request
from bs4 import BeautifulSoup

BASE_URL = "http://bj.ganji.com/"
UA = ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36")

req = request.Request(BASE_URL)
req.add_header('User-Agent', UA)
response = request.urlopen(req)
html = response.read().decode('UTF-8')
soup = BeautifulSoup(html, 'html.parser')

# Grab the HTML block that holds the whole jobs section
# gongzuo = soup.find('div', {'id': 'col-2'})
# gongzuo_soup = BeautifulSoup(str(gongzuo), 'html.parser')
# Grab the HTML for the Beijing recruitment categories
# beijingzhaopin = gongzuo_soup.find("div", {"class": "category clearfix"})
# print(beijingzhaopin)

gongzuo = soup.find_all('a')
zps = {}  # recruitment (zhaopin) category URLs
jzs = {}  # part-time (jianzhi) category URLs
qzs = {}  # job-seeker (qiuzhi) category URLs

# Collect the category URLs, keyed by the link text
def getZPUrl():
    targets = {"zp": zps, "jz": jzs, "qz": qzs}
    for i in gongzuo:
        try:
            href = i['href']
        except KeyError:  # anchor without an href attribute
            continue
        prefix = href[:2]
        if prefix in targets:
            targets[prefix][i.get_text()] = BASE_URL + href
    return zps, jzs, qzs

# Check whether we have reached the last page
# def judgeNum(soup):
#     numberPages = soup.find_all("ul", attrs={'class': 'pageLink clearfix'})
#     print(numberPages)

# Fetch and parse the listing page for each category URL
def getZPInfo():
    zps, jzs, qzs = getZPUrl()
    # Drop the promotional "放心企业" / "放心兼职" links; pop() with a default
    # avoids a KeyError if the site stops serving them
    zps.pop("放心企业", None)
    jzs.pop("放心兼职", None)
    print(zps)

    # Beijing recruitment: zp is the listing URL of one job category
    for zp in zps.values():
        req = request.Request(zp)
        req.add_header("User-Agent", UA)
        response = request.urlopen(req)
        html = response.read().decode("UTF-8")
        soup = BeautifulSoup(html, 'html.parser')
        # The page count is unknown; stop when a page comes back empty.
        # Job postings in this category (first page only)
        for gongzuoInfo in soup.find_all(class_="con-list-zcon new-dl"):
            # Job title
            # print(gongzuoInfo.dt.a.string)
            # Company name
            # print(gongzuoInfo.dt.div.find_all("p")[0].string)
            # Salary
            # print(gongzuoInfo.dd.div.string)
            # Benefits: find_all("div")[1] is the second <div>; each <i> is one perk
            # welfare = []
            # for i in gongzuoInfo.dd.find_all("div")[1].find_all("i"):
            #     welfare.append(i.string)
            # print(welfare)
            # Work locations
            # Workingplaces = []
            # gongzuodidians = gongzuoInfo.find_all("dd")[1].find_all("a")
            # for i in gongzuodidians:
            #     Workingplaces.append(i.string)
            # print(Workingplaces)
            # Publication date
            releasetime = gongzuoInfo.find_all("dd")[2].span
            print(releasetime.string)
        break  # only the first category while testing

    # Part-time listings
    # for jz in jzs.values():
    #     print(jz)
    #     req = request.Request(jz)
    #     req.add_header("User-Agent", UA)
    #     response = request.urlopen(req)
    #     html = response.read().decode("UTF-8")
    #     soup = BeautifulSoup(html, "html.parser")
    #     # One <dl> per part-time posting on the page
    #     for jianzhiInfo in soup.find_all('dl', attrs={"class": "list-noimg job-list clearfix"}):
    #         # Job title
    #         gongzuoName = jianzhiInfo.dt.a.string
    #         print(gongzuoName)
    #         # Company name (fall back to the plain-text title)
    #         try:
    #             TranslateName = jianzhiInfo.find_all("dd")[0].a.attrs["title"]
    #         except (IndexError, AttributeError, KeyError):
    #             TranslateName = jianzhiInfo.find_all(class_="s-tit14 fl")[0].string
    #         print(TranslateName)
    #         # Work location ("无" = none given)
    #         gongzuodidian = jianzhiInfo.find_all("dd", attrs={"class": "pay"})
    #         if len(gongzuodidian) == 0:
    #             gongzuodidian = "无"
    #         else:
    #             gongzuodidian = gongzuodidian[0].string
    #         print(gongzuodidian)
    #         # Publication date
    #         try:
    #             pub_time = jianzhiInfo.find_all("dd", attrs={"class": "pub-time"})[0].string
    #         except IndexError:
    #             pub_time = "无"
    #         print(pub_time)
    #     break

    # Job-seeker listings
    # for qz in qzs.values():
    #     req = request.Request(qz)
    #     req.add_header("User-Agent", UA)
    #     response = request.urlopen(req)
    #     html = response.read().decode("UTF-8")
    #     soup = BeautifulSoup(html, "html.parser")
    #     # Number of job-seeker cards on this page
    #     info_num = soup.find_all("dl", attrs={"class": "list-noimg job-j-list clearfix job-new-list"})
    #     for userinfoall in soup.find_all("dl", attrs={"class": "list-noimg job-j-list clearfix job-new-list"}):
    #         userinfo_a = userinfoall.a
    #         # Profile photo (protocol-relative URLs need an "http:" prefix)
    #         user_img = userinfo_a.find_all("div", attrs={"class": "fl head-portrait"})[0].img.attrs["src"]
    #         if not user_img.startswith("http"):
    #             user_img = "http:" + user_img
    #         print(user_img)
    #         # Basic details: name, sex, age
    #         userinfo = userinfo_a.dt.div.div
    #         userinfo_name = userinfo.find_all("span", attrs={"class": "name"})[0].string
    #         userinfo_sex = userinfo.find_all("span", attrs={"class": "bor-right"})[0].string
    #         userinfo_age = userinfo.find_all("span", attrs={"class": "bor-right"})[1].string
    #         # Education
    #         userinfo_edu = userinfo.find_all("span", attrs={"class": "bor-right"})[2].string
    #         # Work experience
    #         userinfo_experience = userinfo.find_all("span")[4].string
    #         # Preferred work location
    #         userinfo_workplace = userinfo_a.find_all("p", attrs={"class": "district"})[0].string.replace(" ", "")
    #         print(userinfo_workplace)
    #         # Expected salary
    #         userinfo_salary = userinfo_a.find_all("p", attrs={"class": "salary"})[0].string.replace(" ", "")
    #         print(userinfo_salary)
    #         # Publication date
    #         userinfo_time = userinfo_a.find_all("div", attrs={"class": "order fr"})[0].string.replace(" ", "")
    #         print(userinfo_time)
    #     break

getZPInfo()
The scraper relies only on the bs4 (BeautifulSoup) and urllib modules.
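Since every request goes through urllib and the original code swallows failures with bare except clauses, here is a small fetch-helper sketch with explicit error handling; the helper name, timeout, and retry policy are my own choices, not part of the original post.

import time
from urllib import error

# Fetch helper sketch (name, timeout, and retry count are assumptions, not
# from the original post): retries transient network errors instead of
# hiding them in a bare except.
def fetch_soup(url, retries=3, delay=2.0):
    last_err = None
    for _ in range(retries):
        try:
            req = request.Request(url, headers={"User-Agent": UA})
            html = request.urlopen(req, timeout=10).read().decode("UTF-8")
            return BeautifulSoup(html, "html.parser")
        except error.URLError as e:
            last_err = e
            time.sleep(delay)  # brief back-off before retrying
    raise last_err

# e.g. soup = fetch_soup(BASE_URL)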