zoukankan      html  css  js  c++  java
  • 微信爬取

    import re
    import urllib.request
    import time
    import urllib.error
    def wei(url, duan, timeout=30):
        """Download ``url`` through the HTTP proxy ``duan`` and return its body.

        Args:
            url: Address to fetch.
            duan: Proxy given as ``"host:port"``; it is registered for the
                ``http`` scheme only.
            timeout: Seconds to wait on a blocking network operation before
                giving up, so an unresponsive proxy cannot stall the crawl.

        Returns:
            The raw response body as ``bytes``, or ``None`` when the request
            failed. Failures are printed, followed by a pause (10 s for
            ``URLError``, 1 s for anything else) before returning.
        """
        try:
            request = urllib.request.Request(url)
            request.add_header("User-Agent","Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Mobile Safari/537.36")
            proxy_handler = urllib.request.ProxyHandler({'http': duan})
            opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
            # Still installed globally, as before, so later plain urlopen()
            # calls elsewhere keep going through the same proxy.
            urllib.request.install_opener(opener)
            # The context manager closes the connection even if read() fails.
            with opener.open(request, timeout=timeout) as response:
                date = response.read()
            print(len(date))
            return date
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            # Back off for a while before the caller tries the next request.
            time.sleep(10)
        except Exception as t:
            print(str(t))
            time.sleep(1)
        # Make the failure result explicit instead of falling off the end.
        return None

    # Proxy ("host:port") that every request is routed through.
    duan = "121.231.226.12:6666"

    # Free proxies are generally unreliable, so the crawl is often incomplete.
    key = "Python"
    # Target of every anchor tag. The closing quote is required: without it the
    # lazy group has nothing to stop at and always matches the empty string.
    zheng = '<a href="(.*?)"'
    link_pattern = re.compile(zheng)
    for i in range(0, 10):
        try:
            key1 = urllib.request.quote(key)
            url = "http://weixin.sogou.com/weixin?query="+key1+"&_sug_type_=&sut=10977&lkt=7%2C1527054607490%2C1527054613464&s_from=input&_sug_=y&type=2&sst0=1527054613567&page="+str(i+1)+"&ie=utf8&w=01019900&dr=1"
            shi = wei(url, duan)
            print(shi)
            # wei() returns None when the search page could not be downloaded.
            if shi is None:
                print('此'+str(i)+'爬取未成功')
                continue
            # Decode the bytes rather than taking their repr, so the pattern
            # runs on the page text itself.
            long = link_pattern.findall(shi.decode("utf-8", "ignore"))
            if not long:
                print('此'+str(i)+'爬取未成功')
                continue
            for j, rom in enumerate(long):
                # Links inside HTML attributes carry "&amp;"; restore "&".
                rom = rom.replace("amp;", "")
                # Include the page number so later pages do not overwrite the
                # files written for earlier pages.
                ong = "D:/html/"+str(i)+"_"+str(j)+".txt"
                # Fetch the extracted link itself, not the search page again.
                page = wei(rom, duan)
                if page is None:
                    # wei() has already reported the failure.
                    continue
                try:
                    # wei() returns bytes, so the file must be opened in
                    # binary mode.
                    with open(ong, "wb") as ce:
                        ce.write(page)
                except Exception as e:
                    print(str(e))
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        except Exception as t:
            print(str(t))

  • 相关阅读:
    cross-domain
    【转】React、Vue访问NotFound
    Flutter环境配置
    antd遇见的坑
    npm源设置
    React中的生命周期函数
    【备忘】javascript原型、Function、eval、闭包、json处理、类、arguments不定
    ADB获取手机信息
    selenium操作
    操作execl
  • 原文地址:https://www.cnblogs.com/chunqing/p/9079153.html
Copyright © 2011-2022 走看看