zoukankan html css js c++ java

python爬微信文章

 1 #http://weixin.sogou.com
 2 import re
 3 import urllib.request
 4 import time
 5 import urllib.error
 6 
 7 #自定义函数，功能为使用代理服务器爬一个网址
 8 def use_proxy(proxy_addr,url):
 9     try:
10         #伪装浏览器报头
11         print(str("伪装成浏览器"))
12         req=urllib.request.Request(url)
13         req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36')
14         #代理服务器
15         print(str("代理服务器"))
16         proxy=urllib.request.ProxyHandler({'http':proxy_addr})
17         #设置代理服务器opener对象
18         print(str("设置opener对象"))
19         opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
20         #设置全局变量
21         print(str("全局变量"))
22         urllib.request.install_opener(opener)
23         print(str("读数据"))
24         data=urllib.request.urlopen(req).read()
25         print(str("读取成功"))
26         return data
27     except urllib.error.URLError as e:
28         if hasattr(e,"code"):
29             print(e.code)
30         if hasattr(e,"reason"):
31             print(e.reason)
32         #若为URLError异常，延时10秒执行
33         time.sleep(10)
34     except Exception as e:
35         print("exception:"+str(e))
36         #若为Exception异常，延时1秒执行
37         time.sleep(1)
38         
39     
# Search keyword.
key = "Python"
# Proxy server to use; it may stop working, so replace it when it does.
proxy = "114.113.126.86:80"

# Percent-encode the keyword once. Encoding it again on every page would
# double-encode any keyword that contains escaped characters.
quoted_key = urllib.request.quote(key)
# Captures the href value of every anchor tag; compiled once for all pages.
link_pattern = re.compile('<a href="(.*?)"', re.S)

# Crawl result pages 1 and 2.
for i in range(1, 3):
    print("进入链接")
    thispageurl = "http://weixin.sogou.com/weixin?query=" + quoted_key + "&type=2&page=" + str(i)
    thispagedata = use_proxy(proxy, thispageurl)
    # use_proxy returns None on failure. Decode the real bytes rather than
    # matching against their repr. NOTE(review): assumes the page is UTF-8;
    # undecodable bytes are replaced — confirm against the site.
    pagetext = thispagedata.decode("utf-8", errors="replace") if thispagedata else ""
    print(len(pagetext))
    rs1 = link_pattern.findall(pagetext)
    if not rs1:
        print("此次（" + str(i) + "页）没成功")
        continue
    for j, thisurl in enumerate(rs1):
        # Undo the HTML escaping of "&" inside the attribute value.
        thisurl = thisurl.replace("&amp;", "&")
        # The output directory must already exist; otherwise open() fails
        # and the failure is reported below.
        file = "F:/python/python爬虫/weixin/" + str(i) + "页第" + str(j) + "篇文章.html"
        thisdata = use_proxy(proxy, thisurl)
        if thisdata is None:
            # Nothing was downloaded: report it without creating an empty file.
            print("第" + str(i) + "页第" + str(j) + "篇文章失败")
            continue
        try:
            # "with" closes the file even when the write fails.
            with open(file, "wb") as fh:
                fh.write(thisdata)
            print("第" + str(i) + "页第" + str(j) + "篇文章成功")
        except Exception as e:
            print(e)
            print("第" + str(i) + "页第" + str(j) + "篇文章失败")

本代码爬取前两页微信文章

注意：

1.代理服务器应及时更换，防止网站屏蔽

2.伪装浏览器类型：Chrome

查看全文

相关阅读:
网页链接在什么时候进行跳转到哪里?
word中那些重要但是被人忽略的快捷键和长word文档的跳转
 如何在editplus中配置ctags?
winsow xp不能安装软件, 提示"中断" 是因为设置了软件限制策略
 firefox的plugin-container.exe进程如何关闭?
thinkphp的url地址区分大小写?
thinkphp单入口和多入口的访问方法
 什么时候使用tab键来对齐代码和代码的风格
 该不该用inline-block取代float? inline和float的区别?
cad中关于点样式点的绘制

原文地址：https://www.cnblogs.com/du1269038969/p/9670502.html