  • Scraping Lianjia with a thread pool

    1. Applying a thread pool

    from multiprocessing.dummy import Pool  # dummy = thread-backed Pool with the multiprocessing API
    import requests
    from lxml import etree
    import pymongo
    
    class Ljia():

        def __init__(self):
            self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            }
            # MongoDB connection: database "Ljia", collection "lianjia"
            self.client = pymongo.MongoClient()
            self.collection = self.client.Ljia.lianjia
    
    
        def get_content(self, url):
            # Fetch one listing page and extract every house entry on it
            html = requests.get(url, headers=self.headers)
            tree = etree.HTML(html.text)
            li_list = tree.xpath('//ul[@class="sellListContent"]/li')

            items = []
            for li in li_list:
                item = {}
                item['title'] = li.xpath(".//div[@class='title']/a/text()")[0]
                item['detail_url'] = li.xpath(".//div[@class='houseInfo']/a/@href")[0]
                item['houseInfo'] = li.xpath(".//div[@class='houseInfo']//text()")
                item['houseInfo'] = ''.join(i.strip() for i in item['houseInfo'] if i.strip())
                item['totalPrice'] = li.xpath(".//div[@class='totalPrice']//text()")
                item['totalPrice'] = ''.join(i.strip() for i in item['totalPrice'] if i.strip())
                item['Price'] = li.xpath(".//div[@class='unitPrice']/span/text()")[0]
                item['followInfo'] = li.xpath(".//div[@class='followInfo']//text()")
                item['followInfo'] = ''.join(i.strip() for i in item['followInfo'] if i.strip())

                print(item)
                items.append(item)

            # Return all items on this page, not just the first one
            return items
    
    
    
        def get_page_url(self):
            # Generate the URLs to crawl: the start page plus pages 2-100
            yield self.start_url

            for i in range(2, 101):
                url = "https://sz.lianjia.com/ershoufang/pg%sco32/" % i
                print('Crawling: =============%s' % url)
                yield url
    
        def save_data(self, items):
            # Save one page's items to MongoDB
            if items:
                self.collection.insert_many(items)
            else:
                print('No data for this page ===========')
    
    
    
        def run(self):
            pool = Pool(5)  # 5 worker threads
            # Fetch the pages concurrently with the thread pool
            data = pool.map(self.get_content, self.get_page_url())
            # Store the results concurrently with the same pool
            pool.map(self.save_data, data)
            pool.close()
            pool.join()
    
    
    
    
    if __name__ == '__main__':
        lian_jia = Ljia()
        lian_jia.run()
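
    For reference, the core pattern above relies on `multiprocessing.dummy.Pool`, which exposes the same API as `multiprocessing.Pool` but runs the work in threads, a good fit for I/O-bound jobs such as HTTP requests. Below is a minimal, self-contained sketch of that pattern; the URLs and the `fetch` worker are hypothetical placeholders, not part of the Lianjia scraper.

    from multiprocessing.dummy import Pool  # thread-backed Pool, same interface as multiprocessing.Pool
    import requests

    # Hypothetical example URLs, only to illustrate the map() pattern
    urls = ["https://example.com/page/%d" % i for i in range(1, 6)]

    def fetch(url):
        # Each call runs in one of the pool's worker threads
        resp = requests.get(url, timeout=10)
        return url, resp.status_code

    if __name__ == '__main__':
        pool = Pool(5)                    # 5 worker threads
        results = pool.map(fetch, urls)   # blocks until every URL has been fetched
        pool.close()
        pool.join()
        for url, status in results:
            print(url, status)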
  • Original article: https://www.cnblogs.com/knighterrant/p/10800431.html