def get_areas_regions_urls():
    areas = ["pudong", "minhang", "baoshan", "xuhui", "putuo", "yangpu",
             "changning", "songjiang", "jiading", "huangpu", "jingan", "zhabei",
             "hongkou", "qingpu", "fengxian", "jinshan", "chongming", "shanghaizhoubian"]
    areas_regions_urls = []  # the list of tuples we will return; each tuple holds (area, region, url)
    for area in areas:
        page = getPage("https://sh.lianjia.com/ershoufang/" + area)
        # region (neighbourhood) names
        region_names = page.xpath("/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a/text()")
        # the URL paths corresponding to each region
        region_urls = page.xpath("/html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a/@href")
        for url in region_urls:  # build a tuple and append it to the target list
            # the host must match the city we scraped: "sh" (Shanghai), not "gz"
            areas_regions_urls.append((area, region_names[region_urls.index(url)],
                                       "https://sh.lianjia.com" + url))
            # print(area, region_names[region_urls.index(url)], "https://sh.lianjia.com" + url)
        # print("Region urls in Area {} have been added!".format(area))
    print("All region urls have been added")
    return areas_regions_urls
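The helper getPage is called throughout but never defined in this excerpt. Since its return value is queried with .xpath(), it presumably fetches a page and parses it into an lxml element tree. A minimal sketch under that assumption (the User-Agent header, encoding, and timeout are my guesses, not the author's code):

import requests
from lxml import etree

def getPage(url):
    """Fetch url and return an lxml element that supports .xpath().
    A sketch only: headers, timeout, and error handling are assumptions."""
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed; Lianjia may reject bare clients
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = "utf-8"
    return etree.HTML(resp.text)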
def region_spider(x):
    # x is one tuple from get_areas_regions_urls():
    # x[0] is the area, x[1] is the region, x[2] is the region's URL
    # get the number of listings
    info_num = int(getPage(x[2]).xpath("/html/body/div[4]/div[1]/div[2]/h2/span/text()")[0])
    # compute the number of result pages (each page holds at most 30 listings)
    page_num = math.ceil(info_num / 30)
    # print("{} has {} listings across {} pages".format(x[1], info_num, page_num))
    for url in [x[2] + "pg" + str(num + 1) for num in range(page_num)]:
        page = getPage(url)
        for house in page.xpath("/html/body/div[4]/div[1]/ul/li"):
            try:
                # print(house.xpath("div[1]/div[1]/a/text()")[0])
                Area = x[0]
                Region = x[1]
                info = house.xpath("div[1]/div[2]/div/text()")[0].split("|")
                # villa ("别墅") listings have a slightly different page structure
                # from ordinary listings, so we branch on the listing type
                if info[1].strip()[-2:] == "别墅":
                    Garden = house.xpath("div[1]/div[2]/div/a/text()")[0]
                    Layout = info[2]
                    Acreage = info[3].strip()
                    Direction = info[4].strip()
                    Renovation = info[5].strip()
                    Elevator = info[6].strip()
                    Price = int(house.xpath("div[1]/div[6]/div[1]/span/text()")[0])
                    BuiltYear = re.search(r"\d{4}", house.xpath("div[1]/div[3]/div/text()")[0]).group()
                    Height = re.search(r"\d层", house.xpath("div[1]/div[3]/div/text()")[0]).group()
                    Building = info[1].strip()
                else:
                    Garden = house.xpath("div[1]/div[2]/div/a/text()")[0]
                    Layout = info[1]
                    Acreage = info[2].strip()
                    Direction = info[3].strip()
                    Renovation = info[4].strip()
                    try:
                        Elevator = info[5].strip()
                    except IndexError:
                        # not every listing reports elevator info; fall back to "无数据" (no data)
                        Elevator = "无数据"
                    Price = house.xpath("div[1]/div[6]/div[1]/span/text()")[0]
                    try:
                        BuiltYear = re.search(r"\d{4}", house.xpath("div[1]/div[3]/div/text()")[0]).group()
                    except AttributeError:
                        # not every listing reports a build year; fall back to 0
                        BuiltYear = 0
                    Height = house.xpath("div[1]/div[3]/div/text()")[0][0:3]
                    try:
                        # not every listing reports a building type; fall back to "无数据" (no data)
                        Building = re.search("..楼", house.xpath("div[1]/div[3]/div/text()")[0]).group()[-2:]
                    except AttributeError:
                        Building = "无数据"
            except Exception:
                print("Error")
            else:
                # write out (and optionally print) the scraped record
                csvWrite([Area, Region, Garden, Acreage, Direction, Layout, Renovation,
                          Height, Elevator, BuiltYear, Building, Price])
                # print([Area, Region, Garden, Acreage, Direction, Layout, Renovation,
                #        Height, Elevator, BuiltYear, Building, Price])
    print("All data of Region {} in Area {} have been downloaded!".format(x[1], x[0]))
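csvWrite is the other helper the excerpt assumes but does not show. It receives one record per call, so presumably it appends a row to an output CSV. A sketch under that assumption (the filename is mine; the lock is there because region_spider runs in several pool workers at once):

import csv
import threading

_csv_lock = threading.Lock()  # guard concurrent appends from pool workers

def csvWrite(row):
    """Append one scraped record to the output file (filename is an assumption)."""
    with _csv_lock:
        with open("lianjia_sh.csv", "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(row)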
if __name__ == "__main__":
    url_list = get_areas_regions_urls()
    pool = Pool()                      # create the thread pool
    pool.map(region_spider, url_list)  # run the spider with multiple threads, one region per task
    pool.close()                       # close the pool to new tasks
    pool.join()                        # wait for all threads to finish
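The excerpt also omits the module-level imports. Judging from the names used (math, re, Pool) and the thread-pool comments above, the script presumably starts with something like this (multiprocessing.dummy gives a thread-based Pool with the same interface; a process-based multiprocessing.Pool would also work):

import math
import re
from multiprocessing.dummy import Pool  # thread-based Pool, matching the comments above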
2. Data Analysis
import pandas as pd
import pandas_profiling as pp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
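Before profiling, the scraped CSV has to be loaded. The spider wrote rows without a header line, so a sketch like the following restores the column names (the filename is an assumption; the column order matches the csvWrite call above):

columns = ["Area", "Region", "Garden", "Acreage", "Direction", "Layout",
           "Renovation", "Height", "Elevator", "BuiltYear", "Building", "Price"]
df = pd.read_csv("lianjia_sh.csv", names=columns, encoding="utf-8")
df.head()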