zoukankan      html  css  js  c++  java
  • 爬取赶集网二手物品下所有物品的信息

    爬取赶集网二手物品下所有物品的信息。
    大致思路:
    1、爬取频道页url;
    2、根据频道页url爬取商品详情页url,写入mongodb的url_list表;
    3、从url_list表读取url,爬取商品信息,写入mongodb的p_info表。

    分成3个py文件:
    1、channel_url.py,获取频道页url;
    2、page_parse.py,主要是2个爬虫函数,分别完成2个数据表的写入;
    3、main.py,主程序文件,也就是本文件,开启多进程,完成整个工作

    最后顺利完成任务,感觉现在赶集真不行了,没多少数据。
    channel_url.py文件:
    import requests
    from bs4 import BeautifulSoup
    
    # Entry page passed to get_channel_url() (see the commented-out call below);
    # presumably the Beijing second-hand goods index -- confirm against the site.
    start_url = 'http://bj.ganji.com/wu/'
    # Prefix that get_channel_url() prepends to each href to build a channel URL.
    url_host = 'http://bj.ganji.com'
    
    
    def get_channel_url(url):
        """Collect the channel URLs linked from the page at ``url``.

        Downloads the page, selects every ``<a>`` that is a direct child of a
        ``<dt>`` inside ``div.content``, prefixes each ``href`` with
        ``url_host``, prints the resulting URL and gathers it.

        Args:
            url: Address of the page that lists the channels.

        Returns:
            A list of channel URL strings, in document order.
        """
        html = requests.get(url).text
        page = BeautifulSoup(html,'lxml')
        found = []
        for anchor in page.select('div.content dt>a'):
            full_url = url_host + anchor.get('href')
            print(full_url)
            found.append(full_url)
        return found
    
    # channel_urls = get_channel_url(start_url)
    # print('len(channel_urls):',len(channel_urls))
    
    # Saved output of an earlier get_channel_url() run, kept here so the function
    # does not have to be executed again. It is a single whitespace-separated
    # string; consumers obtain the individual URLs with channel_urls.split().
    channel_urls = '''
        http://bj.ganji.com/jiaju/
        http://bj.ganji.com/rirongbaihuo/
        http://bj.ganji.com/shouji/
        http://bj.ganji.com/bangong/
        http://bj.ganji.com/nongyongpin/
        http://bj.ganji.com/jiadian/
        http://bj.ganji.com/ershoubijibendiannao/
        http://bj.ganji.com/ruanjiantushu/
        http://bj.ganji.com/yingyouyunfu/
        http://bj.ganji.com/diannao/
        http://bj.ganji.com/xianzhilipin/
        http://bj.ganji.com/fushixiaobaxuemao/
        http://bj.ganji.com/meironghuazhuang/
        http://bj.ganji.com/shuma/
        http://bj.ganji.com/laonianyongpin/
        http://bj.ganji.com/xuniwupin/
        http://bj.ganji.com/qitawupin/
        http://bj.ganji.com/ershoufree/
        http://bj.ganji.com/wupinjiaohuan/
    '''
    

      

    page_parse.py文件:
    import requests
    from bs4 import BeautifulSoup
    from time import sleep
    from pymongo import MongoClient
    
    # Connection to the MongoDB server on localhost:27017, created at import time.
    client = MongoClient('localhost',27017)
    # Database that holds both collections used by the crawler.
    ganji = client['ganji']
    # Collection of product detail-page URLs, written by get_product_url().
    url_list = ganji['url_list']
    # Collection of parsed product records, written by get_product_info().
    p_info = ganji['p_info']
    
    # 给定频道url,爬取此频道下所有商品的url,打印并写入mongo数据库
    def get_product_url(url):
        """Walk every listing page of a channel and store the product links.

        Starting from ``url``, each page is fetched and parsed; the ``href`` of
        every ``a.ft-tit`` element is inserted into ``url_list`` as
        ``{'p_url': href}`` and printed. While the page contains an ``a.next``
        element, the next page is requested at ``<url>o<N>/`` with N starting
        at 2. There is a two-second pause after every request.

        Args:
            url: Channel URL; page addresses are built by appending to it.
        """
        page_num = 1
        page_url = url
        has_next = True
        while has_next:
            html = requests.get(page_url).text
            print('正在get网页:',page_url)
            sleep(2)
            soup = BeautifulSoup(html,'lxml')
            links = soup.select('a.ft-tit')
            print('len(eles):',len(links))
            for link in links:
                href = link.get('href')
                url_list.insert_one({'p_url':href})
                print(href)
            # Keep going only while the page advertises a "next" link.
            has_next = bool(soup.select('a.next'))
            if has_next:
                page_num += 1
                page_url = f'{url}o{page_num}/'
    
    # 给定商品详情页url,爬取商品具体信息,打印并写入mongo数据库
    def get_product_info(url):
        """Scrape one product detail page and store its fields in MongoDB.

        Fetches ``url``, pauses two seconds, and parses the page. If the page
        contains a ``<p>`` with the "listing was just deleted" notice, only the
        notice is printed and nothing is stored. Otherwise title, breadcrumb
        category, date, price and address are extracted, inserted into
        ``p_info`` together with ``url``, and printed.

        Args:
            url: Address of a product detail page.

        Returns:
            None. Each extracted field is ``None`` when its selector matches
            nothing on the page.
        """
        raw_page = requests.get(url).text
        sleep(2)  # pause after each request (presumably to throttle crawling)
        soup = BeautifulSoup(raw_page,'lxml')

        # A removed listing shows this notice instead of the product details.
        if soup.select("p:contains('信息刚被删除~')"):
            print('信息刚被删除~')
            return

        title_el = soup.select_one('h1.title-name')
        crumbs_el = soup.select_one('div.crumbs.routes.clearfix')
        date_el = soup.select_one('i.pr-5')
        price_el = soup.select_one('i.f22.fc-orange.f-type')
        address_el = soup.select_one('ul.det-infor>li:nth-child(2)>a')

        title = title_el.get_text() if title_el else None
        category = list(crumbs_el.stripped_strings) if crumbs_el else None
        # NOTE(review): the published source had split('\') here, which is a
        # syntax error; the separator was lost. A non-breaking space ('\xa0')
        # is the most likely original -- confirm against the live page markup.
        date = date_el.get_text().split('\xa0')[0].strip() if date_el else None
        price = price_el.get_text() if price_el else None
        address = address_el.get_text() if address_el else None

        p_dict = {'title':title,'category':category,'date':date,'price':price,'address':address,'url':url}
        p_info.insert_one(p_dict)
        print(p_dict)
    

      

    main.py文件:
    from channel_url import channel_urls     # 从channel_url.py导入某变量,会把channel_url.py都执行一遍,但变量只在模块内部保留
    from page_parse import get_product_url, get_product_info, url_list   # 需要导入url_list
    from multiprocessing import Pool
    from datetime import datetime
    
    # 从mongodb中读取商品url,返回所有商品的url
    def read_all_p_urls():
        """Return the ``p_url`` value of every document in ``url_list``.

        Returns:
            A list with one entry per document, in the order ``find()`` yields
            them.
        """
        return [doc['p_url'] for doc in url_list.find()]
    
    
    if __name__ == '__main__':
        # Entry point: crawl product URLs for every channel, then crawl the
        # details of every stored product URL, and report the elapsed time.
        start_time = datetime.now()

        # Author's notes: calling get_product_url() in a plain loop over
        # channel_urls.split() also works but took several times longer, and
        # Pool(processes=4) took about as long as the default Pool().
        #
        # The pool is used as a context manager so its worker processes are
        # always released; both map() calls block until all work is finished,
        # so nothing is still running when the pool is torn down on exit.
        with Pool() as pool:
            # Phase 1: channel URL -> product URLs, written to MongoDB.
            pool.map(get_product_url,channel_urls.split())

            # Phase 2: product URL -> product details, written to MongoDB.
            # This phase can also be run separately from phase 1.
            pool.map(get_product_info,read_all_p_urls())

        end_time = datetime.now()
        during = end_time - start_time
        print('总共耗时:',during)
    

      

  • 相关阅读:
    Python全栈开发之6、面向对象
    Python全栈开发之5、模块
    kvm恢复和删除快照
    virsh命令和虚拟机克隆
    Python全栈开发之4、迭代器、生成器、装饰器
    索引
    字段属性--唯一键
    安装虚拟机
    字段属性--自增长
    centos7安装kvm
  • 原文地址:https://www.cnblogs.com/djlbolgs/p/12539821.html
Copyright © 2011-2022 走看看