  • 爬取58二手数据.py (scraping 58.com second-hand listings)

    # Module 1: scrape all channel links
    from bs4 import BeautifulSoup
    import requests
    
    
    start_url = 'http://bj.58.com/sale.shtml'
    url_host = 'http://bj.58.com'
    
    def get_index_url(url):
        # pull every second-level channel link off the sale index page
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('ul.ym-submnu > li > b > a')
        for link in links:
            page_url = url_host + link.get('href')
            print(page_url)
    
    get_index_url(start_url)
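    
    # Note: Module 3 below imports channel_list from channel_extact, but the
    # function above only prints the links. A minimal sketch (an assumption,
    # not the original code) that collects them into the newline-separated
    # string that channel_list.split() later expects:
    def get_channel_list(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        links = soup.select('ul.ym-submnu > li > b > a')
        return '\n'.join(url_host + link.get('href') for link in links)
    
    # channel_list = get_channel_list(start_url)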
    
    
    # Module 2: scrape all item links and detail data
    
    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost', 27017)
    ceshi = client['ceshi']
    url_list = ceshi['url_list4']
    item_info = ceshi['item_info4']
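    
    # Optional hardening (an assumption, not in the original): a unique index
    # on 'url' makes MongoDB reject duplicate listing URLs across repeated
    # runs, raising DuplicateKeyError instead of silently duplicating data.
    # url_list.create_index('url', unique=True)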
    
    
    
    # the name on the left is the Python object; the bracketed string is the
    # name used inside the database
    # spider 1
    def get_links_from(channel, pages):
        # stop condition: a list page without td.t means we are past the last page
        list_view = '{}/pn{}/'.format(channel, pages)
        wb_data = requests.get(list_view)
        time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('td', 't'):  # a td with class 't' marks a listings table
            for link in soup.select('td.t a.t'):
                item_link = link.get('href').split('?')[0]
                if item_link != 'http://jump.zhineng.58.com/jump':
                    url_list.insert_one({'url': item_link})
                    print(item_link)
                # return urls
        else:
            # it's the last page!
            pass
    
    # spider 2
    def get_item_info(url):
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if url.startswith('http://zhuanzhuan.58.com/'):  # zhuanzhuan pages use a different layout
            data = {
                'title': soup.title.text,
                'price': soup.select('span.price_now')[0].text,
                'area': soup.select('div.palce_li > span > i')[0].text,
                'url': url,
            }
            item_info.insert_one(data)
        else:
            data = {
                'title': soup.title.text,
                'price': soup.select('span.price.c_f50')[0].text,
                'area': soup.select('div.su_con > a')[0].get_text(),
                'sale_man': soup.select('ul.vcard > li > a')[0].text,
                'url': url,
            }
            item_info.insert_one(data)
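    
    # The bare [0] indexing above raises IndexError when a listing is gone or
    # the page layout changes, and requests can raise on network errors. A
    # hedged wrapper (the name is an assumption) that logs and skips such pages:
    def get_item_info_safe(url):
        try:
            get_item_info(url)
        except (IndexError, requests.RequestException) as e:
            print('skipping', url, e)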
    
    
    # Module 3: main file, run this to start the crawl
    from multiprocessing import Pool
    # Modules 1 and 2 above are presumably saved as channel_extact.py and
    # pages_parsing.py, matching these imports
    from pages_parsing import get_item_info, url_list, item_info, get_links_from
    from channel_extact import channel_list
    
    
    # diff the two collections so a restarted crawl only fetches URLs whose
    # details have not been scraped yet
    item_url = (item['url'] for item in url_list.find())
    index_urls0 = (item['url'] for item in item_info.find())
    x = set(item_url)
    y = set(index_urls0)
    rest_of_urls = x - y
    
    def get_all_links_from(channel):
        # walk list pages 1-99 of one channel; pages past the end are skipped
        # by the td.t check inside get_links_from
        for i in range(1, 100):
            get_links_from(channel, i)
        return rest_of_urls
    
    if __name__ == '__main__':
        pool = Pool()
        # pool = Pool(processes=6)
        # pool.map(get_all_links_from, channel_list.split())
        pool.map(get_item_info, rest_of_urls)
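    
        # Typical two-phase run (inferred from the commented-out line above):
        # first map get_all_links_from over channel_list.split() to fill
        # url_list, then rerun with get_item_info over rest_of_urls.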
    
    # count = 0
    # for url in rest_of_urls:
    #     print(url)
    #     count += 1
    # print(count)
    
    # Module 4: monitor the data flow
    import time
    from pages_parsing import url_list
    
    while True:
        print(url_list.count_documents({}))
        time.sleep(5)
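    
    # A variant (an assumption, not in the original) that watches both ends of
    # the pipeline: URLs collected versus details scraped.
    # from pages_parsing import item_info
    # while True:
    #     print(url_list.count_documents({}), item_info.count_documents({}))
    #     time.sleep(5)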
  • Original post: https://www.cnblogs.com/dws-love-jfl-1314/p/6045670.html