1 多线程抓取
import lxml
from lxml import etree
import requests
import threading
import time
# Browser-like headers sent with every HTTP request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
# Re-entrant lock used by the worker threads around their output.
rlock = threading.RLock()
def getArea(url):
    '''
    Collect the district names and district URLs linked from the seed page.

    :param url: seed page URL
    :return: dict mapping each district name to its absolute URL
    '''
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # district name -> district URL
    areaDict = {}
    for area in areaList:
        # District name is the link text
        areaName = area.xpath('./text()')[0]
        # The host is prepended to the href taken from the page
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        # e.g. the Xihu district maps to https://hz.lianjia.com/ershoufang/xihu/
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape the listings on one page and append one line per listing to
    ``<areaName>.txt``.

    :param areaName: district name, used as the output file name
    :param url: URL of the district's listing page
    :return: None
    '''
    # Without a timeout a stalled connection would block this worker forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # Single-quoted literal so the double-quoted attribute values inside the
    # XPath do not terminate the Python string.
    sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
    for house in sellList:
        # Headline text of the listing
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # href of the title link
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House details: link text followed by the div's first text node
        houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0]
                     + house.xpath('.//div[@class="houseInfo"]/text()')[0])
        # Position details: the div's first text node followed by link text
        positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0]
                        + house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
        # Total price with its unit suffix appended
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Unit (per-area) price
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        record = str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice))
        # Hold the lock so output from different worker threads is not interleaved.
        with rlock:
            print(areaName)
            # Leaving the with-block closes (and therefore flushes) the file.
            with open(areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(record + '\n')
if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed wall-clock time.
    startTime = time.perf_counter()
    print(areaDict)
    # One worker thread per district
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=gethouseInfo, args=(areaName, url))
        threadList.append(t)
        t.start()
    # Wait until every worker thread has finished
    for t in threadList:
        t.join()
    # Elapsed seconds for the whole crawl
    print(time.perf_counter() - startTime)
2 多协程抓取
import gevent
from gevent import monkey
# Apply gevent's monkey patches here, ahead of the remaining imports below.
gevent.monkey.patch_all() # some modules need this initialisation done at the very start
import lxml
from lxml import etree
import requests
import threading
import time
# Browser-like headers sent with every HTTP request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
# Re-entrant lock; not referenced by the functions visible in this script.
rlock = threading.RLock()
def getArea(url):
    '''
    Collect the district names and district URLs linked from the seed page.

    :param url: seed page URL
    :return: dict mapping each district name to its absolute URL
    '''
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # district name -> district URL
    areaDict = {}
    for area in areaList:
        # District name is the link text
        areaName = area.xpath('./text()')[0]
        # The host is prepended to the href taken from the page
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape the listings on one page and append one line per listing to
    ``./hz/<areaName>.txt``.

    :param areaName: district name, used as the output file name
    :param url: URL of the district's listing page
    :return: None
    '''
    import os

    # Without a timeout a stalled connection would block this greenlet forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # Single-quoted literal so the double-quoted attribute values inside the
    # XPath do not terminate the Python string.
    sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
    # open() does not create missing directories, so make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    for house in sellList:
        # Headline text of the listing
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # href of the title link
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House details: link text followed by the div's first text node
        houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0]
                     + house.xpath('.//div[@class="houseInfo"]/text()')[0])
        # Position details: the div's first text node followed by link text
        positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0]
                        + house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
        # Total price with its unit suffix appended
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Unit (per-area) price
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        record = str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice))
        # Leaving the with-block closes (and therefore flushes) the file.
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(record + '\n')
if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed wall-clock time.
    startTime = time.perf_counter()
    print(areaDict)
    # One greenlet per district. The monkey patching that makes the I/O
    # cooperative is applied at the top of the script, not here.
    geventList = []
    for k, v in areaDict.items():
        g = gevent.spawn(gethouseInfo, k, v)
        geventList.append(g)
    # Block until every greenlet has finished
    gevent.joinall(geventList)
    # Elapsed seconds for the whole crawl
    print(time.perf_counter() - startTime)
3 多进程抓取
import lxml
from lxml import etree
import requests
import multiprocessing
import time
# Browser-like headers sent with every HTTP request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
def getArea(url):
    '''
    Collect the district names and district URLs linked from the seed page.

    :param url: seed page URL
    :return: dict mapping each district name to its absolute URL
    '''
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # district name -> district URL
    areaDict = {}
    for area in areaList:
        # District name is the link text
        areaName = area.xpath('./text()')[0]
        # The host is prepended to the href taken from the page
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape the listings on one page and append one line per listing to
    ``./hz/<areaName>.txt``.

    :param areaName: district name, used as the output file name
    :param url: URL of the district's listing page
    :return: None
    '''
    import os

    # Without a timeout a stalled connection would block this process forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # Single-quoted literal so the double-quoted attribute values inside the
    # XPath do not terminate the Python string.
    sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
    # open() does not create missing directories, so make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    for house in sellList:
        # Headline text of the listing
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # href of the title link
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House details: link text followed by the div's first text node
        houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0]
                     + house.xpath('.//div[@class="houseInfo"]/text()')[0])
        # Position details: the div's first text node followed by link text
        positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0]
                        + house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
        # Total price with its unit suffix appended
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Unit (per-area) price
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        record = str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice))
        # Leaving the with-block closes (and therefore flushes) the file.
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(record + '\n')
if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed wall-clock time.
    startTime = time.perf_counter()
    print(areaDict)
    # One worker process per district
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=gethouseInfo, args=(areaName, url))
        processList.append(p)
        p.start()
    # Wait until every worker process has finished
    for p in processList:
        p.join()
    # Elapsed seconds for the whole crawl
    print(time.perf_counter() - startTime)
4 多线程加协程
import gevent
from gevent import monkey
# Apply gevent's monkey patches here, ahead of the remaining imports below.
gevent.monkey.patch_all()
import json
import lxml
from lxml import etree
import requests
import threading
import time
# Browser-like headers sent with every HTTP request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
# Re-entrant lock used by the workers around their output.
rlock = threading.RLock()
# Non-blocking I/O
def getArea(url):
    '''
    Collect the district names and district URLs linked from the seed page.

    :param url: seed page URL
    :return: dict mapping each district name to its absolute URL
    '''
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # district name -> district URL
    areaDict = {}
    for area in areaList:
        # District name is the link text
        areaName = area.xpath('./text()')[0]
        # The host is prepended to the href taken from the page
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape the listings on one page and append one line per listing to
    ``./hz/<areaName>.txt``.

    :param areaName: district name, used as the output file name
    :param url: URL of one listing page of that district
    :return: None
    '''
    import os

    # Without a timeout a stalled connection would block this worker forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # Single-quoted literal so the double-quoted attribute values inside the
    # XPath do not terminate the Python string.
    sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
    # open() does not create missing directories, so make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    for house in sellList:
        # Headline text of the listing
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # href of the title link
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House details: link text followed by the div's first text node
        houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0]
                     + house.xpath('.//div[@class="houseInfo"]/text()')[0])
        # Position details: the div's first text node followed by link text
        positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0]
                        + house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
        # Total price with its unit suffix appended
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Unit (per-area) price
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        record = str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice))
        # Hold the lock so output from different workers is not interleaved.
        with rlock:
            print(areaName)
            # Leaving the with-block closes (and therefore flushes) the file.
            with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
                f.write(record + '\n')
def getPageNum(areaName, url):
    '''
    Read the district's total page count and scrape every page concurrently.

    One greenlet running ``gethouseInfo`` is spawned per results page; the
    call returns once all of them have finished.

    :param areaName: district name, forwarded to ``gethouseInfo``
    :param url: district URL; page URLs are built as ``url + "pgN/"``
    :return: None
    '''
    # Without a timeout a stalled connection would block this thread forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # The pager element exposes its state as a JSON string in page-data.
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    totalPage = int(json.loads(pageData)['totalPage'])
    # Pages are numbered from 1 through totalPage inclusive.
    geventList = [
        gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % page)
        for page in range(1, totalPage + 1)
    ]
    gevent.joinall(geventList)
if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed wall-clock time.
    startTime = time.perf_counter()
    print(areaDict)
    # One worker thread per district; each thread fans out into greenlets
    # (one per results page) inside getPageNum.
    threadList = []
    for areaName, url in areaDict.items():
        t = threading.Thread(target=getPageNum, args=(areaName, url))
        threadList.append(t)
        t.start()
    # Wait until every worker thread has finished
    for t in threadList:
        t.join()
    # Elapsed seconds for the whole crawl
    print(time.perf_counter() - startTime)
5 多进程加协程
import gevent
from gevent import monkey
# Apply gevent's monkey patches here, ahead of the remaining imports below.
gevent.monkey.patch_all()
import json
import lxml
from lxml import etree
import requests
import multiprocessing
import time
# Browser-like headers sent with every HTTP request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/65.0.3325.181 Safari/537.36"
    ),
}
# Non-blocking I/O
def getArea(url):
    '''
    Collect the district names and district URLs linked from the seed page.

    :param url: seed page URL
    :return: dict mapping each district name to its absolute URL
    '''
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    areaList = mytree.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    # district name -> district URL
    areaDict = {}
    for area in areaList:
        # District name is the link text
        areaName = area.xpath('./text()')[0]
        # The host is prepended to the href taken from the page
        areaurl = "https://hz.lianjia.com" + area.xpath('./@href')[0]
        print(areaName, areaurl)
        areaDict[areaName] = areaurl
    return areaDict
def gethouseInfo(areaName, url):
    '''
    Scrape the listings on one page and append one line per listing to
    ``./hz/<areaName>.txt``.

    :param areaName: district name, used as the output file name
    :param url: URL of one listing page of that district
    :return: None
    '''
    import os

    # Without a timeout a stalled connection would block this greenlet forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # Single-quoted literal so the double-quoted attribute values inside the
    # XPath do not terminate the Python string.
    sellList = mytree.xpath('//ul[@class="sellListContent"]/li[@class="clear"]')
    # open() does not create missing directories, so make sure ./hz exists.
    os.makedirs("./hz", exist_ok=True)
    for house in sellList:
        # Headline text of the listing
        title = house.xpath('.//div[@class="title"]/a/text()')[0]
        # href of the title link
        houseurl = house.xpath('.//div[@class="title"]/a/@href')[0]
        # House details: link text followed by the div's first text node
        houseInfo = (house.xpath('.//div[@class="houseInfo"]/a/text()')[0]
                     + house.xpath('.//div[@class="houseInfo"]/text()')[0])
        # Position details: the div's first text node followed by link text
        positionInfo = (house.xpath('.//div[@class="positionInfo"]/text()')[0]
                        + house.xpath('.//div[@class="positionInfo"]/a/text()')[0])
        # Total price with its unit suffix appended
        totalPrice = house.xpath('.//div[@class="totalPrice"]/span/text()')[0] + '万'
        # Unit (per-area) price
        unitPrice = house.xpath('.//div[@class="unitPrice"]/span/text()')[0]
        record = str((title, houseInfo, houseurl, positionInfo, totalPrice, unitPrice))
        print(areaName)
        # Leaving the with-block closes (and therefore flushes) the file.
        with open("./hz/" + areaName + '.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(record + '\n')
def getPageNum(areaName, url):
    '''
    Read the district's total page count and scrape every page concurrently.

    One greenlet running ``gethouseInfo`` is spawned per results page; the
    call returns once all of them have finished.

    :param areaName: district name, forwarded to ``gethouseInfo``
    :param url: district URL; page URLs are built as ``url + "pgN/"``
    :return: None
    '''
    # Without a timeout a stalled connection would block this process forever.
    response = requests.get(url, headers=headers, timeout=10).text
    mytree = lxml.etree.HTML(response)
    # The pager element exposes its state as a JSON string in page-data.
    pageData = mytree.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
    totalPage = int(json.loads(pageData)['totalPage'])
    # Pages are numbered from 1 through totalPage inclusive.
    geventList = [
        gevent.spawn(gethouseInfo, areaName, url + "pg%d/" % page)
        for page in range(1, totalPage + 1)
    ]
    gevent.joinall(geventList)
if __name__ == '__main__':
    starUrl = "https://hz.lianjia.com/ershoufang/"
    areaDict = getArea(starUrl)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed wall-clock time.
    startTime = time.perf_counter()
    print(areaDict)
    # One worker process per district; each process fans out into greenlets
    # (one per results page) inside getPageNum.
    processList = []
    for areaName, url in areaDict.items():
        p = multiprocessing.Process(target=getPageNum, args=(areaName, url))
        processList.append(p)
        p.start()
    # Wait until every worker process has finished
    for p in processList:
        p.join()
    # Elapsed seconds for the whole crawl
    print(time.perf_counter() - startTime)