  • Python crawler learning (1)

    The first crawler:

    from urllib import request

    # send a GET request and decode the response body
    response = request.urlopen('http://www.baidu.com')
    html = response.read().decode()
    print(html)
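
    The response object also exposes the HTTP status and headers, which are worth checking before decoding. A minimal sketch (the utf-8 fallback is an assumption, not something the original code does):

    from urllib import request

    response = request.urlopen('http://www.baidu.com')
    print(response.status)                        # HTTP status code, e.g. 200
    print(response.getheader('Content-Type'))     # e.g. text/html;charset=utf-8
    # decode with the charset the server reports, falling back to utf-8
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset)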
        

    Spoofing a browser User-Agent:

    
    

    from urllib import request
    import re

    url = r"http://www.baidu.com/"
    # pretend to be a mobile browser by sending a custom User-Agent header
    header = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16A366 MicroMessenger/6.7.3(0x16070321) NetType/WIFI Language/zh_CN"}
    req = request.Request(url, headers=header)
    response = request.urlopen(req).read().decode()   # fetch and decode the page
    pat = r"<title>(.*?)</title>"                      # regex that captures the page title
    data = re.findall(pat, response)
    print(data)
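
    The User-Agent can also be attached with Request.add_header instead of the headers= argument; a small sketch of the same request (the shortened User-Agent string is illustrative only):

    from urllib import request

    req = request.Request(r"http://www.baidu.com/")
    # add_header() sets one header at a time on an existing Request
    req.add_header("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X)")
    response = request.urlopen(req).read().decode()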

     

     Setting a proxy:

    from urllib import request
    import random

    # list of candidate proxies (all entries here happen to be the same address)
    proxylist = [
        {"http": "120.83.109.103:9999"},
        {"http": "120.83.109.103:9999"},
        {"http": "120.83.109.103:9999"},
        {"http": "120.83.109.103:9999"},
    ]
    proxy = random.choice(proxylist)
    print(proxy)

    # build the proxy handler object
    proxyHandler = request.ProxyHandler(proxy)

    # build an opener that routes requests through the proxy
    opener = request.build_opener(proxyHandler)
    req = request.Request("http://www.baidu.com")
    res = opener.open(req)
    print(res.read().decode())
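
    If every later urlopen() call should also go through the proxy, the opener can be installed globally. A short sketch, reusing the proxy dictionary chosen above:

    # optional: install the proxy opener so plain urlopen() uses it too
    request.install_opener(request.build_opener(request.ProxyHandler(proxy)))
    res = request.urlopen("http://www.baidu.com")   # now routed through the proxy
    print(res.read().decode())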

    Crawling Tieba pages:

    from urllib import request
    import urllib
    import time

    # send a request and return the raw response body
    def loadPage(fullurl, filename):
        print("Downloading:", filename)
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
        req = request.Request(fullurl, headers=header)
        res = request.urlopen(req).read()
        return res


    # write the downloaded page to a local file
    def writePage(html, filename):
        print("Saving:", filename)
        with open(filename, "wb") as f:
            f.write(html)
            print(".....")


    # build the URL for each page and crawl it
    def tiebaSpider(url, begin, end):
        for page in range(begin, end + 1):
            pn = (page - 1) * 50                 # Tieba paginates in steps of 50 posts
            fullurl = url + "&pn=" + str(pn)
            print(fullurl)
            filename = "C:/page_" + str(page) + ".html"
            html = loadPage(fullurl, filename)   # crawl the page
            writePage(html, filename)            # save it locally


    if __name__ == '__main__':
        kw = input("Enter the Tieba name: ")
        begin = int(input("Enter the start page: "))
        end = int(input("Enter the end page: "))
        url = "http://tieba.baidu.com/f?"
        key = urllib.parse.urlencode({"kw": kw})
        url = url + key
        tiebaSpider(url, begin, end)
        time.sleep(6)
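
    The output path above is hard-coded to the drive root; a sketch of building it from a hypothetical save_dir inside tiebaSpider's loop instead (save_dir is an assumption, not part of the original script):

    import os

    save_dir = "tieba_pages"                 # hypothetical output directory
    os.makedirs(save_dir, exist_ok=True)     # create it if it does not exist
    filename = os.path.join(save_dir, "page_" + str(page) + ".html")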

    GET request:

    from urllib import request
    import urllib

    http_handler = request.HTTPHandler()
    opener = request.build_opener(http_handler)

    # URL-encode the query parameter so non-ASCII text is safe to send
    wd = {"wd": "北京"}
    wdd = urllib.parse.urlencode(wd)
    print(wdd)

    url = "http://www.baidu.com/s?"
    url = url + wdd
    print(url)

    req = request.Request(url)
    res = opener.open(req)
    print(res.read().decode())
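
    urlencode also handles several parameters at once and percent-encodes non-ASCII values; a short sketch (the pn parameter is illustrative only):

    import urllib.parse

    params = {"wd": "北京", "pn": "10"}
    query = urllib.parse.urlencode(params)    # percent-encodes every key/value pair
    print(query)                              # e.g. wd=%E5%8C%97%E4%BA%AC&pn=10
    print("http://www.baidu.com/s?" + query)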

    POST request:

    from urllib import request
    import urllib
    import re

    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # the browser posts to translate_o, but the "_o" suffix has to be removed for this to work
    url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
    url1 = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
    key = "中国"
    formdata = {
        "i": key,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "15651639810027",
        "sign": "7688b75dad2fed75aea5924b1f8ee127",
        "ts": "1565163981002",
        "bv": "62188471f020213764ab67d1893204f7",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTlME"
    }
    # POST data must be a URL-encoded byte string
    data = urllib.parse.urlencode(formdata).encode("utf-8")
    req = request.Request(url=url1, data=data, headers=header)
    resp = request.urlopen(req).read().decode()
    pat = r'"tgt":"(.*?)"}]]}'
    result = re.findall(pat, resp)
    print(result[0])
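
    Because the interface is asked for doctype=json, the reply can also be parsed with the json module instead of a regex. A sketch, assuming the response keeps the translateResult layout that the regex above relies on:

    import json

    # assumed shape: {"translateResult": [[{"src": "...", "tgt": "..."}]], ...}
    obj = json.loads(resp)
    print(obj["translateResult"][0][0]["tgt"])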

    Exception handling:

    from urllib import request

    # the first URL is missing a scheme and the third domain is invalid,
    # so those requests are expected to raise exceptions
    list1 = ["www.baidu.com",
             "http://www.baidu.com",
             "http://www.baidu.cm0",
             "http://www.baidu.com",
             "http://www.baidu.com"]

    i = 0
    for url in list1:
        i = i + 1
        try:
            request.urlopen(url)
        except Exception as e:
            print(e)
        print("Request", i, "finished")