1. 线程池的应用
from multiprocessing.dummy import Pool
from multiprocessing.dummy import Pool

import pymongo
import requests
from lxml import etree

# First listing page; page N (N >= 2) lives at ".../ershoufang/pg<N>co32/".
url = "https://sz.lianjia.com/ershoufang/co32/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
# NOTE(review): this request runs at import time and its result is not used
# anywhere in the visible code; it is kept only so the module-level name
# `res` stays available. Consider removing it.
res = requests.get(url=url, headers=headers)


def _first(nodes):
    """Return the first XPath result, or None when the node is missing."""
    return nodes[0] if nodes else None


def _joined_text(nodes):
    """Strip every text fragment, drop the empty ones and concatenate the rest."""
    return ''.join(text.strip() for text in nodes if text.strip())


class Ljia():
    """Crawl listing pages with a thread pool and store the rows in MongoDB.

    Pages are fetched concurrently by ``run``; every listing found on a page
    becomes one dict, and all dicts are written to the ``lianjia`` collection
    of the ``Ljia`` database.
    """

    # Seconds to wait for a page before giving up, so that a stalled
    # connection cannot block a worker thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        }
        self.client = pymongo.MongoClient()
        # The attribute name is misspelled ("collention"); it is kept as-is
        # because external code may already reference it.
        self.collention = self.client.Ljia.lianjia

    def get_content(self, url):
        """Download one listing page and extract every listing on it.

        Args:
            url: Address of the listing page to fetch.

        Returns:
            A list with one dict per listing (keys ``title``, ``detail_url``,
            ``houseInfo``, ``totalPrice``, ``Price``, ``followInfo``). The
            list is empty when the request fails or the page has no listings.
        """
        print('正在抓取:=============%s' % url)
        try:
            html = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole pool.map() call.
            print('request failed: %s (%s)' % (url, exc))
            return []

        tree = etree.HTML(html.text)
        items = []
        for li in tree.xpath('//ul[@class="sellListContent"]/li'):
            title = _first(li.xpath(".//div[@class='title']/a/text()"))
            if title is None:
                # Presumably a non-listing <li> (e.g. an injected banner);
                # indexing [0] here used to raise IndexError.
                continue
            item = {
                'title': title,
                'detail_url': _first(
                    li.xpath(".//div[@class='houseInfo']/a/@href")),
                'houseInfo': _joined_text(
                    li.xpath(".//div[@class='houseInfo']//text()")),
                'totalPrice': _joined_text(
                    li.xpath(".//div[@class='totalPrice']//text()")),
                'Price': _first(
                    li.xpath(".//div[@class='unitPrice']/span/text()")),
                'followInfo': _joined_text(
                    li.xpath(".//div[@class='followInfo']//text()")),
            }
            print(item)
            items.append(item)
        return items

    def get_page_url(self):
        """Yield the URLs to crawl: the start page, then pages 2 through 100."""
        yield self.start_url
        for i in range(2, 101):
            yield "https://sz.lianjia.com/ershoufang/pg%sco32/" % i

    def save_data(self, item):
        """Store scraped data in MongoDB.

        Args:
            item: Either a single listing dict or an iterable of listing
                dicts (as returned by ``get_content``). Empty input is only
                reported, nothing is written.
        """
        if not item:
            print('数据不存在===========')
            return
        # Collection.insert() was removed in PyMongo 4; use the explicit
        # single/bulk variants instead.
        if isinstance(item, dict):
            self.collention.insert_one(item)
        else:
            self.collention.insert_many(list(item))

    def run(self):
        """Fetch all pages with 5 worker threads, then store the results."""
        pool = Pool(5)
        try:
            # Fetch and parse the pages in the thread pool.
            data = pool.map(self.get_content, self.get_page_url())
            # Store the per-page results in the thread pool.
            pool.map(self.save_data, data)
        finally:
            # Release the worker threads even if a task raised.
            pool.close()
            pool.join()


if __name__ == '__main__':
    lian_jia = Ljia()
    lian_jia.run()