  • Web Scraping Basics: A Small Tieba Example


    import urllib.request
    import urllib.parse
    import random

    # target address
    url = "http://tieba.baidu.com/f"

    # fake client HTTP request headers (only the UA string itself; the
    # "User-Agent: " prefix must not be in the value, since add_header()
    # already supplies the header name)
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    ]
    # pick one at random to use as the request header
    user_agent = random.choice(ua_list)

    def doWrite(html, f_name):
        """
        :param html: data received from the response
        :param f_name: file name used to save the page
        """
        with open(f_name, "w", encoding="utf8") as f:
            f.write(html)
            print(">" * 30)

    def loadPage(f_url, f_name):
        # request the target address
        request = urllib.request.Request(f_url)
        # set the HTTP request header
        request.add_header("User-Agent", user_agent)
        # get the response data
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # download and save
        print("About to write data....")
        doWrite(html, f_name)

    def doCode(url, kwd):
        '''
        URL-encode the search keyword
        '''
        kw = {"kw": kwd}
        kw = urllib.parse.urlencode(kw)
        # append the keyword to the base URL
        full_url = url + "?" + kw
        return full_url

    def doUrl(url, start, end):
        '''
        Build the URL for each page and download it
        '''
        for pages in range(start, end + 1):
            page = (pages - 1) * 50
            f_url = url + "&pn=" + str(page)
            f_name = str(pages) + ".html"
            print("Loading page {0}...".format(pages))
            loadPage(f_url, f_name)
        print("Download finished, thanks for using!")

    if __name__ == '__main__':
        tb_name = input("Enter the name of the Tieba forum to visit: ")
        starPage = int(input("Enter the start page: "))
        endPage = int(input("Enter the end page: "))

        full_url = doCode(url, tb_name)
        doUrl(full_url, starPage, endPage)
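
    To make the URL construction in doCode/doUrl concrete, here is a minimal sketch (same base URL; the keyword "python" is just an example) that prints the page URLs the script would request:

    import urllib.parse

    url = "http://tieba.baidu.com/f"
    kw = urllib.parse.urlencode({"kw": "python"})   # non-ASCII keywords get percent-encoded the same way
    full_url = url + "?" + kw

    # Tieba paginates in steps of 50: page 1 -> pn=0, page 2 -> pn=50, ...
    for pages in range(1, 4):
        print(full_url + "&pn=" + str((pages - 1) * 50))
    # http://tieba.baidu.com/f?kw=python&pn=0
    # http://tieba.baidu.com/f?kw=python&pn=50
    # http://tieba.baidu.com/f?kw=python&pn=100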

     The pitfalls in web scraping never end: version differences between Python 2 and Python 3 cause discrepancies all over the place.

    For example, this time, when simply fetching page data with Python 3, a TypeError was thrown: the POST data must be bytes or a file object, not str. The same code runs fine under Python 2. Nothing to say, really...
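
    A minimal sketch of the difference, reusing the same form data as the code below (only the request object is built, nothing is actually sent):

    import urllib.parse
    import urllib.request

    formdata = {"limit": "20", "start": "0"}

    data_str = urllib.parse.urlencode(formdata)    # 'limit=20&start=0', a str
    data_bytes = data_str.encode("utf-8")          # b'limit=20&start=0', bytes

    # In Python 3, urlopen() only accepts bytes (or a file object) as POST data:
    # passing data_str and then calling urlopen() raises
    # "TypeError: POST data should be bytes ... not str"; Python 2 accepted the str.
    request = urllib.request.Request("https://movie.douban.com/j/chart/top_list", data=data_bytes)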

    On to the code:

    # https://movie.douban.com/j/chart/top_list?type=13&interval_id=100%3A90&action=

    import urllib.request
    import urllib.parse
    import random

    url = "https://movie.douban.com/j/chart/top_list?type=13&interval_id=100%3A90&action="

    # only the UA string itself, without the "User-Agent: " prefix
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    ]
    user_agent = random.choice(ua_list)

    formdata = {
        "limit": "20",
        "start": "0"
    }
    # Python 3: POST data must be bytes, hence the .encode("utf-8")
    data = urllib.parse.urlencode(formdata).encode("utf-8")

    request = urllib.request.Request(url, data=data)
    request.add_header("User-Agent", user_agent)

    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")

    print(html)

    with open("movie.txt", "a", encoding="utf8") as f:
        f.write(html)
        print("Download finished!!!")

     The highlighted part, encoding the form data to bytes with .encode("utf-8") before passing it as POST data, must not be omitted; otherwise the request fails immediately with the TypeError above.
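
    Since this endpoint returns JSON rather than an HTML page, here is a short follow-up sketch that parses the response (field names such as "title" and "score" are assumptions about Douban's payload and may differ):

    import json

    # html is the decoded response string obtained from the request above
    movies = json.loads(html)
    print("got", len(movies), "entries")
    for m in movies[:5]:
        print(m.get("title"), m.get("score"))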

    Using a proxy server to access Baidu:

    import urllib
    import urllib.request

    # whether to enable the proxy
    proxyswitch = False

    if proxyswitch:
        # if enabled, access through the specified proxy server IP
        httpproxy_handler = urllib.request.ProxyHandler({"http": "110.52.235.14:9999"})
        opener = urllib.request.build_opener(httpproxy_handler)
    else:
        # otherwise use an empty proxy mapping; the dict must not be omitted
        nullproxy_handler = urllib.request.ProxyHandler({})
        opener = urllib.request.build_opener(nullproxy_handler)

    # install a global opener object, so later requests can just use urlopen()
    urllib.request.install_opener(opener)

    request = urllib.request.Request("http://www.baidu.com/")

    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    print(html)

     To use a private (authenticated) proxy, add the authorized username and password, in the following format:

    httpproxy_handler = urllib.request.ProxyHandler({"http": "username:password@110.52.235.14:9999"})
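
    An alternative sketch using ProxyBasicAuthHandler instead of embedding the credentials in the proxy URL (the proxy address, username and password below are placeholders):

    import urllib.request

    proxy_addr = "110.52.235.14:9999"

    # register the proxy credentials with a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, proxy_addr, "username", "password")

    proxy_handler = urllib.request.ProxyHandler({"http": proxy_addr})
    proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)

    opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
    urllib.request.install_opener(opener)

    response = urllib.request.urlopen("http://www.baidu.com/")
    print(response.read().decode("utf-8")[:200])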


  • Original article: https://www.cnblogs.com/wen-kang/p/10422179.html