zoukankan      html  css  js  c++  java
  • [Python] 爬取博客园博主标题 网络爬虫 2020.2.8

    爬取积分榜前3000名博主前20页的博客标题,根据左下角的博客列表页面进行爬取。

    代码如下:

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import io
     4 import re
     5 
     6 url=""
     7 
     8 #写入内容
     9 def Content(url):
    10     try:
    11         kv = {'user-agent': 'Mozilla/5.0'}
    12         r = requests.get(url, headers=kv)
    13         r.encoding = r.apparent_encoding
    14         demo = r.text
    15         soup = BeautifulSoup(demo, "html.parser")
    16         print(url)
    17         for a in soup.find_all("a",{"class":"postTitle2"}):
    18             print(str(a.string).rstrip().lstrip().replace("[置顶]",""))
    19             Content_write(str(a.string).rstrip().lstrip().replace("[置顶]",""))
    20     except:
    21         print("没有数据了!")
    22 
    23 #读取博主主页链接
    24 def Href():
    25     try:
    26         kv = {'user-agent': 'Mozilla/5.0'}
    27         r = requests.get("https://www.cnblogs.com/AllBloggers.aspx", headers=kv)
    28         r.encoding = r.apparent_encoding
    29         demo = r.text
    30         soup = BeautifulSoup(demo, "html.parser")
    31         print(url)
    32         text = ""
    33         for t in soup.find_all("td"):
    34             if t.find("a") is not None:
    35                 print(t.find("a").attrs['href'])
    36                 write(t.find("a").attrs['href'])
    37     except:
    38         print("没有数据了")
    39 
    40 #写入链接
    41 def write(contents):
    42     f=open('E://bloghref.txt','a+',encoding='utf-8')
    43     f.write(contents+"\n")
    44     print('写入成功!')
    45     f.close()
    46 
    47 #写入内容
    48 def Content_write(contents):
    49     f=open('E://blogcontent.txt','a+',encoding='utf-8')
    50     f.write(contents+"\n")
    51     print('写入成功!')
    52     f.close()
    53 
    54 #循环写入
    55 def write_all():
    56     try:
    57         f=open('E://bloghref.txt','r+',encoding='utf-8')
    58         for line in f:
    59             line=line.rstrip("\n")
    60             for i in range(1,20):
    61                 url=line+"default.html?page="+str(i)
    62                 Content(url)
    63     except:
    64         print("超出页数!")
    65 if __name__=="__main__":
    66     Href()
    67     write_all()
    68     #
    69     #Content("https://www.cnblogs.com/#p3")
    70     #Content("https://www.cnblogs.com/Terrylee/")
    import requests
    from bs4 import BeautifulSoup
    import io
    import re

    # Module-level URL placeholder, initialised to the empty string.
    url=""

    # Fetch one blog list page and save every post title found on it.
    def Content(url):
        """Scrape the post titles from one blog list page and store them.

        Downloads ``url``, prints it, then for every ``<a class="postTitle2">``
        element prints the cleaned title and appends it via ``Content_write``.
        The "[置顶]" (pinned) marker is removed from each title.

        Args:
            url: Address of the blog list page to fetch.

        Returns:
            None. Network and file errors are reported with a printed message
            instead of being raised, so a caller looping over many pages can
            keep going.
        """
        headers = {'user-agent': 'Mozilla/5.0'}
        try:
            # A timeout keeps one unresponsive blog from stalling the crawl.
            r = requests.get(url, headers=headers, timeout=10)
            # Treat HTTP error pages (e.g. 404) as "no data" rather than
            # parsing them.
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            print(url)
            for a in soup.find_all("a", {"class": "postTitle2"}):
                # get_text() also covers anchors with nested tags, where
                # ``a.string`` is None and str() would produce the text "None".
                title = a.get_text().replace("[置顶]", "").strip()
                if not title:
                    continue
                print(title)
                Content_write(title)
        except (requests.RequestException, OSError):
            print("没有数据了!")

    # Read the bloggers' home-page links.
    def Href():
        """Collect blogger home-page links from the AllBloggers listing.

        Downloads https://www.cnblogs.com/AllBloggers.aspx and, for every
        ``<td>`` whose first ``<a>`` carries an ``href``, prints that link and
        appends it to the link file via ``write``.

        Returns:
            None. Network and file errors are reported with a printed message
            instead of being raised.
        """
        index_url = "https://www.cnblogs.com/AllBloggers.aspx"
        headers = {'user-agent': 'Mozilla/5.0'}
        try:
            # A timeout prevents the script from hanging on a stalled request.
            r = requests.get(index_url, headers=headers, timeout=10)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, "html.parser")
            # Print the page actually fetched (the module-level ``url`` is
            # just an empty string).
            print(index_url)
            for td in soup.find_all("td"):
                a = td.find("a")
                # Skip cells without a link, and anchors without an href,
                # instead of letting a KeyError abort the whole loop.
                href = a.get("href") if a is not None else None
                if not href:
                    continue
                print(href)
                write(href)
        except (requests.RequestException, OSError):
            print("没有数据了")

    # Append one blogger link to the link file.
    def write(contents, path='E://bloghref.txt'):
        """Append ``contents`` as one line to the blogger-link file.

        Each entry ends with a newline so the file can be read back line by
        line.

        Args:
            contents: Text (a blogger home-page link) to append.
            path: File to append to; defaults to ``E://bloghref.txt``.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        # ``with`` closes the file even when the write fails.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(contents + "\n")
        print('写入成功!')

    # Append one post title to the content file.
    def Content_write(contents, path='E://blogcontent.txt'):
        """Append ``contents`` as one line to the blog-title file.

        Each title ends with a newline so the output holds one title per line.

        Args:
            contents: Text (a post title) to append.
            path: File to append to; defaults to ``E://blogcontent.txt``.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        # ``with`` closes the file even when the write fails.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(contents + "\n")
        print('写入成功!')

    # Crawl every saved blogger link, page by page.
    def write_all(path='E://bloghref.txt', pages=20):
        """Crawl the list pages of every blogger recorded in the link file.

        For each link in ``path``, calls ``Content`` on
        ``<link>default.html?page=<i>`` for ``i`` from 1 to ``pages``
        inclusive.

        Args:
            path: File holding the blogger home-page links; defaults to
                ``E://bloghref.txt``.
            pages: Number of list pages to fetch per blogger (default 20).

        Returns:
            None. A failure to open or read the link file is reported with a
            printed message instead of being raised.
        """
        try:
            # Read-only access is enough; ``with`` guarantees the handle is
            # closed.
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    # split() removes the line terminator, skips blank lines,
                    # and also copes with a link file whose entries are
                    # separated by spaces rather than newlines.
                    for base in line.split():
                        if not base.endswith("/"):
                            base += "/"
                        # range end is exclusive, so add 1 to include the
                        # last page.
                        for i in range(1, pages + 1):
                            Content(base + "default.html?page=" + str(i))
        except OSError:
            print("超出页数!")
    if __name__ == "__main__":
        # Href() fills E://bloghref.txt in append mode, so calling it again
        # would add the links a second time.
        # NOTE(review): presumably disabled because the link file already
        # exists from an earlier run; enable it for the first crawl.
        # Href()
        write_all()
        # A single page can also be crawled directly, e.g.
        # Content("https://www.cnblogs.com/Terrylee/")

  • 相关阅读:
    项目经理所需要具备的素质
    项目经理的个人修养
    项目拖期怎么办
    创业起步的十大准备步骤
    注册(创办)公司的手续过程
    如何让女人每天更快乐
    全面比较:中美两国百姓的生活成本
    ADO.NET级别的事物
    respondsToSelector的使用
    用js在两个页面之间传值
  • 原文地址:https://www.cnblogs.com/zlc364624/p/12285623.html
Copyright © 2011-2022 走看看