zoukankan      html  css  js  c++  java
  • 广州楼盘抓取分析-分析问题

    上文其实还是有不少问题的。

    1.顺序执行,效率比较慢;2.不能断点执行。

    那么,解决办法是什么呢?

    对于问题1(顺序执行、效率较慢),可以采用生产者-消费者模式来改写,代码如下:

    # -*- coding: utf-8 -*-
    #######################################################################
    # Copyright (C) 2005-2016 UC Mobile Limited. All Rights Reserved
    # File          : first_sale_spider.py
    #
    # Creation      : 2016/2/23 19:41
    # Author        : shufeng.lsf@ucweb.com
    #######################################################################
    import random
    from threading import Thread
    
    import requests
    import re
    
    import time
    from pyquery import PyQuery as pq
    from Queue import Queue
    import MySQLdb
    import uniout
    import sys
    # Python 2 only: setdefaultencoding is not available on the initially
    # imported sys module, so sys is reloaded to get it back; implicit
    # str/unicode conversions are then switched to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Not referenced by any of the code shown in this file.
    community_list = []
    
    # MySQL connection settings; main() passes them to DBOperate.
    HOST = "127.0.0.1"
    USER = "root"
    PASSWD = ""
    DB = "house_analysis"
    PORT = 3306
    
    # Hand-off queue shared by UrlProducer (put) and GetHouseInfo (get).
    # It is bounded to 10 pending URLs, so put() blocks while it is full.
    queue = Queue(10)
    
    class DBOperate(object):
        """Small wrapper around one MySQLdb connection and its cursor.

        The connection is opened in the constructor and closed when the
        object is garbage-collected.
        """

        def __init__(self, host, user, passwd, db, port, charset="utf8"):
            """Open the connection and create a cursor.

            :param host: MySQL server host name or address.
            :param user: login name.
            :param passwd: login password.
            :param db: database (schema) to use.
            :param port: TCP port of the server.
            :param charset: connection character set; previously this
                argument was accepted but ignored in favour of a hard-coded
                "utf8".
            """
            self.host = host
            self.user = user
            self.passwd = passwd
            self.db = db
            self.port = port
            self.charset = charset
            self.conn = MySQLdb.connect(self.host, self.user, self.passwd, self.db, self.port, charset=charset)
            self.cur = self.conn.cursor()

        def insertSql(self, sql, params=None):
            """Execute one statement and commit it.

            :param sql: statement text. When *params* is given it should use
                the driver's ``%s`` placeholders instead of inlined values.
            :param params: optional sequence (or mapping) of values bound by
                the driver, which takes care of quoting and escaping.
            """
            if params is None:
                # Unchanged legacy path: the caller supplies finished SQL.
                self.cur.execute(sql)
            else:
                self.cur.execute(sql, params)
            self.conn.commit()

        def __del__(self):
            """Close the cursor and the connection if they were ever created."""
            # getattr guards against a half-built instance: if connect() or
            # cursor() raised in __init__, the attributes do not exist and the
            # finalizer must not raise AttributeError on top of that.
            cur = getattr(self, 'cur', None)
            if cur is not None:
                cur.close()
            conn = getattr(self, 'conn', None)
            if conn is not None:
                conn.close()
    
    
    def requestByGet(url, timeout=10):
        """Fetch *url* with an HTTP GET and return the raw response body.

        :param url: address to download.
        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block the calling thread
            indefinitely.
        :returns: the response body as returned by ``Response.content``.
        :raises requests.RequestException: on connection problems, timeouts
            or an HTTP error status.
        """
        r = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of handing an error page to the
        # regex-based parsers, which would silently find nothing in it.
        r.raise_for_status()
        return r.content
    
    def getNextPage(content):
        """Return the target of the "next page" pager link in *content*.

        :param content: HTML of a listing page.
        :returns: the ``href`` value of the first anchor carrying the
            ``next-page next-link`` class, or ``''`` when there is none.
        """
        # The capture is limited to non-quote characters. The former ``.+?``
        # could start at an earlier <a href="..."> on the same line and
        # swallow everything up to the pager link, yielding a bogus URL.
        m = re.search(r'<a href="([^"]+)" class="next-page next-link">下一页</a>', content)
        return m.group(1) if m else ''
    
    def getCommunityList(content):
        """Extract the detail-page URLs listed on one listing page.

        :param content: HTML of a listing page.
        :returns: list of URLs found in ``data-link`` attributes, in page
            order; an empty list when the page contains none.
        """
        # ``\d+`` restores the numeric id match (the pattern previously read
        # ``d+?``, i.e. literal letters "d"), and the dots are escaped so they
        # only match real dots.
        community_urls = re.findall(
            r'data-link="(http://gz\.fang\.anjuke\.com/loupan/\d+\.html)"', content)
        # Single-argument form: prints the same line as before under Python 2
        # and also parses under Python 3.
        print("正在采集... %s" % (community_urls,))
        # Always return the list: the previous implicit None on "no match"
        # broke callers that iterate over the result.
        return community_urls
    
    
    def getHouseInfo(url):
        """Scrape one detail page and return its fields as a dict.

        :param url: detail-page URL; it is handed straight to ``pq``
            (presumably PyQuery downloads the page itself -- confirm against
            the pyquery version in use).
        :returns: dict with the keys ``name``, ``area``, ``location``,
            ``detail_location``, ``house_style`` and ``price``. ``area`` and
            ``location`` are ``''`` when the address text does not contain
            the expected ``area-zone`` part.
        """
        p = pq(url)

        def text_of(selector):
            # NOTE(review): some pyquery versions appear to return None from
            # .text() when nothing matches; normalise that to '' so the
            # string methods below cannot fail.
            return p(selector).text() or ''

        name = text_of('h1').strip()
        style = text_of('.house-item').split(",")[0].strip()
        price = text_of('.sp-price').strip()

        location = re.split('[ | ]', text_of('.lpAddr-text'))
        # The last piece is the street address; the one before it is expected
        # to look like "area-zone".
        detail_location = location[-1].strip()
        area = zone = ''
        if len(location) >= 2:
            district = location[-2].split('-')
            area = district[0].strip()
            if len(district) > 1:
                zone = district[1].strip()

        return {
            "name": name,
            "area": area,
            "location": zone,
            "detail_location": detail_location,
            "house_style": style,
            "price": price
        }
    
    
    def detailPageHandler(cur, detail_url):
        result = getHouseInfo(detail_url)
        print "result:",result
        cur.insertSql("insert into first_sale (name,area,location,detail_location,house_style,price) VALUES('%s','%s','%s','%s','%s','%s')" % (
            result['name'],
            result['area'],
            result['location'],
            result['detail_location'],
            result['house_style'],
            result['price']
        ))
    
    class UrlProducer(Thread):
        def __init__(self, start_url):
            Thread.__init__(self)
            self.start_url = start_url
    
        def run(self):
            global queue
            while True:
                content = requestByGet(self.start_url)
                next_url = getNextPage(content)
                community_urls = getCommunityList(content)
                for url in community_urls:
                    queue.put(url)
                    time.sleep(random.random())
                    print "进入队列的url:",url
                if next_url != '':
                    self.start_url = next_url
                    continue
                else:
                    break
    
    class GetHouseInfo(Thread):
        def __init__(self, cur):
            Thread.__init__(self)
            self.cur = cur
    
        def run(self):
            global queue
            while True:
                url = queue.get()
                detailPageHandler(self.cur, url)
                queue.task_done()
                time.sleep(random.random())
                print "处理完毕的url:", url
    
    
    def main():
        """Run the crawl: one producer thread, one consumer thread.

        Blocks until the producer has walked all listing pages and every
        queued URL has been marked done, then returns so the process can exit.
        """
        cur = DBOperate(host=HOST, user=USER, passwd=PASSWD, db=DB, port=PORT)
        producer = UrlProducer("http://gz.fang.anjuke.com/loupan/?from=navigation")
        consumer = GetHouseInfo(cur)
        # The consumer loops forever on queue.get(). As a non-daemon thread it
        # kept the interpreter alive after the crawl had finished, so the
        # script never terminated.
        consumer.setDaemon(True)
        producer.start()
        consumer.start()
        # Wait until all URLs have been produced, then until each one has been
        # acknowledged with task_done().
        producer.join()
        queue.join()
        # Short grace period so the consumer can finish whatever it does after
        # task_done() (a sub-second sleep and a log line) before shutdown.
        consumer.join(1.0)


    if __name__ == '__main__':
        main()
    

    对于问题2(不能断点执行),可以在捕获到异常时将当前正在处理的url保存到文件中,下次运行时直接从该文件中读取url并继续执行即可。

  • 相关阅读:
    django的命令, 配置,以及django使用mysql的流程
    vue中局部组件的使用
    Chapter14【Collection、泛型】
    泛型
    集合遍历的方式(迭代器和增强for)
    Collection集合
    集合
    数组
    包装类
    基本类型与字符串之间的转换
  • 原文地址:https://www.cnblogs.com/alexkn/p/5225744.html
Copyright © 2011-2022 走看看