zoukankan      html  css  js  c++  java
  • 线程池抓取lianjia

    1. 线程池 的应用

    # Imports grouped once at the top (the duplicate Pool import and the
    # unused import-time network request `res = requests.get(...)` were removed:
    # fetching a URL as a module-level side effect is never desirable).
    from multiprocessing.dummy import Pool

    import pymongo
    import requests
    from lxml import etree

    # Listing URL for Shenzhen second-hand homes, kept for reference;
    # the scraper class below defines its own start_url.
    url = "https://sz.lianjia.com/ershoufang/co32/"
    # url = "https://sz.lianjia.com/ershoufang/pg2co32/"

    # Default browser-like headers so the site does not reject the crawler.
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    
    class Ljia():
        """Scrape Lianjia (Shenzhen) second-hand housing listings with a
        thread pool and store the parsed records in MongoDB.

        Pipeline: get_page_url() yields page URLs -> get_content() parses
        each page into a list of item dicts -> save_data() bulk-inserts them.
        """

        def __init__(self):
            # First results page; pages 2+ follow the pgNco32 pattern.
            self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
            self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
                }
            # Local MongoDB, database "Ljia", collection "lianjia".
            # NOTE(review): attribute name "collention" (sic) kept for
            # backward compatibility with any external users.
            self.client = pymongo.MongoClient()
            self.collention = self.client.Ljia.lianjia

        @staticmethod
        def _first(nodes, default=''):
            """Return the first xpath match, or *default* when the match is empty.

            Guards against IndexError on pages whose layout differs slightly.
            """
            return nodes[0] if nodes else default

        @staticmethod
        def _joined(texts):
            """Strip every text fragment and join the non-empty ones."""
            return ''.join(t.strip() for t in texts if t.strip())

        def get_content(self, url):
            """Fetch one listing page and return a list of parsed item dicts.

            BUGFIX: the original returned from inside the loop, so only the
            first listing of every page was kept; now all listings are
            collected and returned together.
            """
            # timeout keeps a stuck request from hanging a pool worker forever
            html = requests.get(url, headers=self.headers, timeout=10)
            tree = etree.HTML(html.text)
            li_list = tree.xpath('//ul[@class="sellListContent"]/li')

            items = []
            for li in li_list:
                item = {}
                item['title'] = self._first(li.xpath(".//div[@class='title']/a/text()"))
                item['detail_url'] = self._first(li.xpath(".//div[@class='houseInfo']/a/@href"))
                item['houseInfo'] = self._joined(li.xpath(".//div[@class='houseInfo']//text()"))
                item['totalPrice'] = self._joined(li.xpath(".//div[@class='totalPrice']//text()"))
                item['Price'] = self._first(li.xpath(".//div[@class='unitPrice']/span/text()"))
                item['followInfo'] = self._joined(li.xpath(".//div[@class='followInfo']//text()"))
                print(item)
                items.append(item)
            return items

        def get_page_url(self):
            """Yield every page URL to crawl: the start page, then pages 2-100."""
            yield self.start_url
            for i in range(2, 101):
                url = "https://sz.lianjia.com/ershoufang/pg%sco32/" % i
                print('正在抓取:=============%s' % url)
                yield url

        def save_data(self, items):
            """Bulk-insert one page's parsed items into MongoDB.

            Uses insert_many (Collection.insert was deprecated and removed
            in pymongo 4.x).
            """
            if items:
                self.collention.insert_many(items)
            else:
                print('数据不存在===========')

        def run(self):
            """Crawl all pages with a 5-worker thread pool, then store results."""
            pool = Pool(5)
            try:
                # First map: fetch+parse every page; second map: persist them.
                data = pool.map(self.get_content, self.get_page_url())
                pool.map(self.save_data, data)
            finally:
                # Original leaked the pool; release worker threads explicitly.
                pool.close()
                pool.join()
    
    
    
    
    if __name__ == '__main__':
        # Entry point: build the scraper and start the threaded crawl.
        Ljia().run()
  • 相关阅读:
    Enhancing State-of-the-art Classifiers with API Semantics to Detect Evolved Android Malware论文阅读笔记
    this和super的总结
    软件工程结对WordCount项目
    软工个人作业 数独
    问题
    自我介绍
    shuduku
    access to DeepLearning
    自我介绍
    学习软工基目标
  • 原文地址:https://www.cnblogs.com/knighterrant/p/10800431.html
Copyright © 2011-2022 走看看