  • Scraping lianjia with a thread pool

    1. Applying a thread pool

    import pymongo
    import requests
    from lxml import etree
    from multiprocessing.dummy import Pool  # dummy = thread pool, same API as the process pool


    class Ljia():

        def __init__(self):
            self.start_url = "https://sz.lianjia.com/ershoufang/co32/"
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
            }
            self.client = pymongo.MongoClient()
            self.collection = self.client.Ljia.lianjia

        def get_content(self, url):
            # Fetch one listing page and extract every listing on it
            html = requests.get(url, headers=self.headers)
            tree = etree.HTML(html.text)
            li_list = tree.xpath('//ul[@class="sellListContent"]/li')

            items = []
            for li in li_list:
                item = {}
                item['title'] = li.xpath(".//div[@class='title']/a/text()")[0]
                item['detail_url'] = li.xpath(".//div[@class='houseInfo']/a/@href")[0]
                item['houseInfo'] = li.xpath(".//div[@class='houseInfo']//text()")
                item['houseInfo'] = ''.join(i.strip() for i in item['houseInfo'] if i.strip())
                item['totalPrice'] = li.xpath(".//div[@class='totalPrice']//text()")
                item['totalPrice'] = ''.join(i.strip() for i in item['totalPrice'] if i.strip())
                item['Price'] = li.xpath(".//div[@class='unitPrice']/span/text()")[0]
                item['followInfo'] = li.xpath(".//div[@class='followInfo']//text()")
                item['followInfo'] = ''.join(i.strip() for i in item['followInfo'] if i.strip())
                print(item)
                items.append(item)

            # Return all listings of this page (returning inside the loop would keep only the first one)
            return items

        def get_page_url(self):
            # Generate the URLs to scrape: the start page plus pages 2 to 100
            yield self.start_url
            for i in range(2, 101):
                url = "https://sz.lianjia.com/ershoufang/pg%sco32/" % i
                print('Scraping: =============%s' % url)
                yield url

        def save_data(self, items):
            # Store the listings of one page in MongoDB
            if items:
                self.collection.insert_many(items)
            else:
                print('No data to save ===========')

        def run(self):
            pool = Pool(5)
            # Scrape the pages with the thread pool
            data = pool.map(self.get_content, self.get_page_url())
            # Store the results with the thread pool
            pool.map(self.save_data, data)


    if __name__ == '__main__':

        lian_jia = Ljia()

        lian_jia.run()
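
    The whole crawler rests on one pattern: `multiprocessing.dummy.Pool` exposes the same `map` API as the process pool but runs the calls in worker threads, which suits I/O-bound work like HTTP requests. Below is a minimal, self-contained sketch of just that pattern; the example URLs and the `fetch` helper are illustrative placeholders, not part of the crawler above.

    from multiprocessing.dummy import Pool  # thread-based pool
    import requests

    # Placeholder URLs, only to demonstrate the pattern
    urls = ["https://example.com/page/%d" % i for i in range(1, 6)]

    def fetch(url):
        # Each call runs in one of the pool's worker threads
        resp = requests.get(url, timeout=10)
        return url, resp.status_code

    pool = Pool(5)                     # 5 worker threads
    results = pool.map(fetch, urls)    # blocks until every URL has been fetched
    pool.close()
    pool.join()

    for url, status in results:
        print(url, status)

    `pool.map` returns the results in input order once all tasks finish, which is why `run()` above can feed the output of the scraping pass straight into a second `pool.map` call for storage.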