zoukankan      html  css  js  c++  java
  • 爬取微信文章代码

     1 import re
     2 import urllib.request
     3 import time
     4 import urllib.error
     5 def use_proxy(proxy_addr,url):
     6     try:
     7       req=urllib.request.Request(url)
     8       req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
     9       proxy=urllib.request.ProxyHandler({'http':proxy_addr})
    10       opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    11       urllib.request.install_opener(opener)
    12       data=urllib.request.urlopen(req).read()
    13       return data
    14     except urllib.error.URLError as e:
    15         if(hasattr(e,"code")):
    16             print(e.code)
    17         if(hasattr(e,"reason")):
    18             print(e.reason)
    19         time.sleep(10)
    20     except Exception as e:
    21         print("exception:"+str(e))
    22         time.sleep(1)
    23 
    24 key="Python"
    25 proxy="127.0.0.1:8888"
    26 for i in range(0,10):
    27     key=urllib.request.quote(key)
    28     thispageurl="http://weixin.sogou.com/weixin?query="+key+"&_sug_type_=&sut=1777&lkt=7%2C1519106265525%2C1519106267321&s_from=input&_sug_=y&type=2&sst0=1519106267427&page="+str(i)+"&ie=utf8&w=01019900&dr=1"
    29     thispagedata=use_proxy(proxy,thispageurl)
    30     print(len(str(thispagedata)))
    31     pat='<a target="_blank" href="(.*?)"'
    32     rs=re.compile(pat,re.S).findall(str(thispagedata))
    33     if(len(rs)==0):
    34         print("第("+str(i)+")页没成功")
    35         continue
    36     for j in range(0,len(rs)):
    37         thisurl=rs[j]
    38         thisurl=thisurl.replace("amp;","")
    39         file="d:/111"+str(i)+str(j)+".html"
    40         thisdata=use_proxy(proxy,thisurl)
    41         try:
    42             fh=open(file,"wb")
    43             fh.write(thisdata)
    44             fh.close()
    45             print(""+str(i)+str(j)+"篇文章成功")
    46         except Exception as e:
    47             print(e)
    48             print(""+str(i)+str(j)+"篇文章不成功")

    使用 Python 3.5 爬取 weixin.sogou.com 上的微信文章:请求头(User-Agent)伪装为火狐浏览器,并通过本地代理(127.0.0.1:8888)发送请求,代码如上。

  • 相关阅读:
    BZOJ 2002 [Hnoi2010]Bounce 弹飞绵羊(分块)
    BZOJ 4241 历史研究(分块)
    BZOJ 3110 [Zjoi2013]K大数查询(整体二分)
    hdu 5412 CRB and Queries(整体二分)
    POJ2104 K-th Number(整体二分)
    luogu P3157 [CQOI2011]动态逆序对(CDQ分治)
    陌上开花(CDQ分治)
    BZOJ 1176[Balkan2007]Mokia(CDQ分治)
    BZOJ 3626 LCA(离线+树链剖分+差分)
    bzoj1592 Making the Grade
  • 原文地址:https://www.cnblogs.com/xxp17457741/p/8455298.html
Copyright © 2011-2022 走看看