  • 赶集网二手数据.py (Ganji.com second-hand data scraper)

    # Fetch the links of all second-hand channels
    import requests
    from bs4 import BeautifulSoup

    start_url = 'http://bj.ganji.com/wu/'
    url_host = 'http://bj.ganji.com'
    page_url = []

    def get_index_url(url):
        wb_data = requests.get(url)
        if wb_data.status_code == 200:
            soup = BeautifulSoup(wb_data.text, 'lxml')
            # Each channel sits in the category list: dl.fenlei > dt > a
            links = soup.select('dl.fenlei > dt > a')
            for link in links:
                channel_url = url_host + link.get('href')
                page_url.append(channel_url)
            return page_url

    get_index_url(start_url)
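
    A quick sanity check before wiring this into the next stage (a minimal sketch; the exact count and URLs depend on what the page returns at crawl time):

    # Print how many channel links were collected, plus a few samples.
    print(len(page_url), 'channels found')
    for url in page_url[:3]:
        print(url)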
    
    
    
    # Fetch the item links inside every channel
    from bs4 import BeautifulSoup
    from multiprocessing import Pool
    from channel_exciting import page_url
    import requests
    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']
    url_list = ganji['url_list']
    item_info = ganji['item_info']

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
        'Connection': 'keep-alive'
    }

    def get_links_from(channel, page):
        # Channel pages are paginated like http://bj.ganji.com/jiaju/o3/
        list_url = '{}o{}'.format(channel, page)
        wb_data = requests.get(list_url, headers=headers)
        if wb_data.status_code != 200:
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('td a'):
            # Drop the query string so the same listing is stored only once
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)

    def get_all_links(channel):
        for i in range(1, 100):
            get_links_from(channel, i)

    if __name__ == '__main__':
        pool = Pool()
        pool.map(get_all_links, page_url)
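
    Re-running the crawler inserts the same links again. A minimal guard, assuming the url_list collection above, is a unique index on 'url' so duplicate inserts are rejected:

    import pymongo
    from pymongo.errors import DuplicateKeyError

    # One-time setup: MongoDB rejects a second document with the same 'url'.
    url_list.create_index([('url', pymongo.ASCENDING)], unique=True)

    def save_link(item_link):
        try:
            url_list.insert_one({'url': item_link})
        except DuplicateKeyError:
            pass  # listing already stored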
    
    
    # Fetch the data behind every item link
    from multiprocessing import Pool
    from page_parsing import url_list
    from bs4 import BeautifulSoup
    import requests
    import pymongo
    import time

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']
    item_info = ganji['item_info']

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36',
        'Connection': 'keep-alive'
    }

    def get_items_info(url):
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        try:
            # Selectors follow the detail-page markup at the time of writing
            data = {
                'title': soup.title.text,
                'price': soup.select('span.price_now > i')[0].text,
                'area': soup.select('div.palce_li > span > i')[0].text,
                'url': url
            }
            item_info.insert_one(data)
        except IndexError:
            # Page layout differs (e.g. the listing was taken down); skip it
            pass
        else:
            print(data)
            time.sleep(2)

    if __name__ == '__main__':
        urls = [item['url'] for item in url_list.find()]
        pool = Pool()
        pool.map(get_items_info, urls)
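
    While the pool runs, progress can be watched from a second Python shell. A sketch assuming the same MongoDB collections (count_documents needs pymongo 3.7+):

    import time
    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']

    # Every 5 seconds, report parsed items versus collected links.
    while True:
        done = ganji['item_info'].count_documents({})
        total = ganji['url_list'].count_documents({})
        print('{}/{} items parsed'.format(done, total))
        time.sleep(5)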
  • Original source: https://www.cnblogs.com/dws-love-jfl-1314/p/6057691.html