  • Fetching Douban diary entries with Python

    I recently watched Four Springs (《四个春天》) and got hooked on 饭叔's Douban diary. I wanted to grab every entry, so I put together the quick script below.

    import os
    import urllib.request
    
    from bs4 import BeautifulSoup
    
    
    def get_html(url):
        """Fetch a page and return its main content <div>."""
        web = urllib.request.urlopen(url)
        soup = BeautifulSoup(web, "html.parser")
        data = soup.find("div", id="content")
        return data
    
    
    def get_diary(data, path):
        """Collect the diary links on one listing page and append them to a file."""
        data = data.find_all("div", class_="note-header-container")
        for link in data:
            diary_url = link.find("div", class_="rr").find("a").get("href")
            with open(path, "a+", encoding="UTF-8") as f:
                f.write(diary_url + "\n")
    
    
    def get_num(url):
        """Read the paginator and return the largest page number."""
        html_data = get_html(url)
        paginator_data = html_data.find("div", class_="paginator")
        page_num = []
        for link in paginator_data.find_all("a"):
            page_num.append(link.get_text())
        # The last link is "后页>" (next page), so the largest page number
        # is the second-to-last entry.
        return "".join(page_num[-2:-1])
    
    
    def get_diary_data(url, path):
        """Fetch one diary entry and save it as a txt file."""
        data = get_html(url)
        title = data.find("h1").get_text().strip()
        file_name = path + "/" + title + ".txt"
        note_data = data.find("div", id="link-report")
        with open(file_name, "a+", encoding="UTF-8") as f:
            f.write(title + "\n")
            # stripped_strings yields the entry's text fragments; write one per line.
            for node_line in note_data.stripped_strings:
                f.write(node_line + "\n")
    
    
    if __name__ == '__main__':
        url = 'https://www.douban.com/people/luqy/notes'
        path = "d://陆导"
        diary_url_path = path + "/" + "diary_url.txt"
        os.makedirs(path, exist_ok=True)  # make sure the output folder exists
        # Each listing page shows 10 entries; loop over every page
        # reported by the paginator.
        page_num = int(get_num(url))
        for i in range(page_num):
            url1 = url + "?start=%d&type=note" % (i * 10)
            get_diary(get_html(url1), diary_url_path)
    
        with open(diary_url_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    # Strip the trailing newline before using the line as a URL.
                    get_diary_data(line.strip(), path)
                except Exception as e:
                    print(e)
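
    One caveat with the script as written: diary titles become filenames directly, and a title containing a character Windows forbids in filenames (\ / : * ? " < > |) will make the open() call in get_diary_data fail. Below is a minimal sanitizer sketch; the helper name safe_filename and the underscore replacement are my assumptions, not part of the original post:

    import re
    
    def safe_filename(title):
        # Replace characters that are illegal in Windows filenames
        # with an underscore (an arbitrary choice).
        return re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    
    # Hypothetical usage inside get_diary_data:
    # file_name = path + "/" + safe_filename(title) + ".txt"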
    

    There is still one problem:

    1. Scrape too many times in a row and your IP address gets banned; a possible mitigation is sketched below.
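
    Since the banning appears to be rate-based, a common mitigation is to pause between requests and to send a browser-like User-Agent header instead of urllib's default. Here is a minimal sketch of a drop-in replacement for the urlopen call in get_html; the function name open_politely, the 2-second delay, and the User-Agent string are my assumptions, not part of the original script:

    import time
    import urllib.request
    
    def open_politely(url, delay=2.0):
        # Sleep before every request so the site sees fewer hits per minute
        # (the 2-second default is an arbitrary starting point).
        time.sleep(delay)
        # Send a browser-like User-Agent; the exact string is an assumption.
        req = urllib.request.Request(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        )
        return urllib.request.urlopen(req)

    Swapping urllib.request.urlopen(url) for open_politely(url) inside get_html leaves the rest of the script unchanged; for heavier crawling you would also rotate proxies, which this sketch does not attempt.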

    Crawl results: (screenshot from the original post, not preserved here)


  • Original post: https://www.cnblogs.com/mrwuzs/p/11273244.html