zoukankan      html  css  js  c++  java
  • 实例学习——爬取简书网用户动态

    开发环境:(Windows)eclipse+pydev+MongoDB

    爬取网址:简书网

    lxml解析:

    # _*_ coding:utf-8 _*_
    import requests
    from lxml import etree
    import pymongo
    
    # Connect to a local MongoDB instance; scraped timeline entries are
    # written into the 'timeline' collection of the 'mydb' database.
    client = pymongo.MongoClient('localhost',27017)
    mydb = client['mydb']
    timeline = mydb['timeline']
    
    def get_time_info(url, page):
        """Scrape one page of a Jianshu user's timeline, store each note's
        date and type in MongoDB, then recurse into the next page.

        url  -- timeline URL (first call: the bare /users/<id>/timeline URL)
        page -- current page counter; incremented before building the next URL
        """
        # The user id is the 5th path segment of the URL:
        # ('https:', '', 'www.jianshu.com', 'users', '<id>', ...).
        user_id = url.split('/')[4]
        # BUG FIX: the original used `if url.find('page='):`.  str.find()
        # returns -1 (which is truthy!) when the substring is absent, so that
        # check passed on every call anyway.  The effective behavior was an
        # unconditional increment, made explicit here.
        page = page + 1
        html = requests.get(url)
        selector = etree.HTML(html.text)
        infos = selector.xpath('//ul[@class="note-list"]/li')
        
        for info in infos:
            note_date = info.xpath('div/div/div/span/@data-datetime')[0]
            # Renamed from `type`, which shadowed the builtin of the same name.
            note_type = info.xpath('div/div/div/span/@data-type')[0]
            timeline.insert_one({'date': note_date, 'type': note_type})
            
        # Each <li> carries an id like 'feed-<max_id>'; the last one on the
        # page drives the next request (Jianshu paginates by max_id + page).
        id_infos = selector.xpath('//ul[@class="note-list"]/li/@id')
        if len(id_infos) > 1:
            feed_id = id_infos[-1]
            max_id = feed_id.split('-')[1]
            next_url = 'https://www.jianshu.com/users/%s/timeline?max_id=%s&page=%s' % (user_id, max_id, page)
            get_time_info(next_url, page)
            
    if __name__ =='__main__':
        # Entry point: start crawling from the user's first timeline page.
        get_time_info('https://www.jianshu.com/users/9104ebf5e177/timeline', 1)
    

    bs4解析:

    import requests
    from bs4 import BeautifulSoup
    
    # Browser-like User-Agent so the site does not reject the requests.
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}
    
    ## Print the data-type and data-datetime of every note on one timeline page.
    def get_info(url):
        """Request *url* and print (data_type, datetime) for each note entry."""
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "lxml")
        for item in soup.select("ul.note-list li"):
            # One lookup instead of two identical select(...)[0] calls.
            span = item.select("div.author span")[0]
            print(span.get("data-type"), span.get("data-datetime"))
    
    ## Extract the max_id used for pagination from a timeline page.
    def get_id(url):
        """Fetch *url* and return (as int) the number trailing the id
        attribute of the page's last note <li> (ids look like 'feed-<n>')."""
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "lxml")
        last_item = soup.select("ul.note-list li")[-1]
        return int(last_item.get("id").split("-")[1])
    
    if __name__ == "__main__":
        start_url = "https://www.jianshu.com/users/9104ebf5e177/timeline"
        get_info(start_url)
        max_id = get_id(start_url) + 1
        
        #利用循环代替递归函数。
        for page in range(2,11):        
            next_url = "https://www.jianshu.com/users/9104ebf5e177/timeline?max_id={}&page={}".format(max_id, page)
            get_info(next_url)
            max_id = get_id(next_url) + 1
    

    bs4解析方法采自:简书网

  • 相关阅读:
    javascript中replace()
    防止IE6出现BUG的十种常见解决方法
    IE6 重复字符的bug
    IE6 BUG大全
    display:inline
    JavaScript 图片上传预览效果
    用一行代码让w3wp进程崩溃,如何查找w3wp进程崩溃的原因
    近期学习任务
    气死我的存储过程和用户定义函数
    Damn,China Mobile!!!!
  • 原文地址:https://www.cnblogs.com/junecode/p/11550903.html
Copyright © 2011-2022 走看看