Crawl the blog post titles from the first 20 pages of each of the top 3000 bloggers on the points leaderboard. The crawl works from each blogger's blog list page (the link at the bottom-left of their homepage).
The code is as follows:
import requests
from bs4 import BeautifulSoup

# Fetch one blog-list page and save every post title on it
def Content(url):
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        print(url)
        for a in soup.find_all("a", {"class": "postTitle2"}):
            # strip whitespace and drop the "[置顶]" (pinned) prefix
            title = str(a.string).strip().replace("[置顶]", "")
            print(title)
            Content_write(title)
    except Exception:
        print("No more data!")

# Collect the homepage link of every blogger on the leaderboard page
def Href():
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get("https://www.cnblogs.com/AllBloggers.aspx", headers=kv)
        r.encoding = r.apparent_encoding
        demo = r.text
        soup = BeautifulSoup(demo, "html.parser")
        for t in soup.find_all("td"):
            if t.find("a") is not None:
                print(t.find("a").attrs['href'])
                write(t.find("a").attrs['href'])
    except Exception:
        print("No more data")

# Append one blogger homepage link per line
def write(contents):
    with open('E://bloghref.txt', 'a+', encoding='utf-8') as f:
        f.write(contents + "\n")
        print('Write succeeded!')

# Append one post title per line
def Content_write(contents):
    with open('E://blogcontent.txt', 'a+', encoding='utf-8') as f:
        f.write(contents + "\n")
        print('Write succeeded!')

# For every saved blogger link, crawl the first 20 list pages
def write_all():
    try:
        with open('E://bloghref.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.rstrip("\n")
                for i in range(1, 21):
                    url = line + "default.html?page=" + str(i)
                    Content(url)
    except Exception:
        print("Ran out of pages!")

if __name__ == "__main__":
    Href()        # collect blogger homepage links first
    write_all()   # then crawl each blogger's list pages
    # Content("https://www.cnblogs.com/#p3")
    # Content("https://www.cnblogs.com/Terrylee/")