  • A hand-rolled crawler for Taobao's notebook section (Python 3)

    1. This crawler reuses the Url_ProxyHelper class wrapped up earlier; its source code is as follows:

        import urllib.request as ur

        class Url_ProxyHelper:
            def __init__(self, url, proxy_add, savepath=None):
                self.url = url
                self.proxy_add = proxy_add
                self.req = None
                self.proxy = None
                self.opener = None
                self.info = None
                self.save_path = savepath

            # Set up the request headers and the proxy
            def set_UrlAndProxy(self):
                # Add a User-Agent header
                self.req = ur.Request(self.url)
                self.req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

                # Configure the proxy server
                self.proxy = ur.ProxyHandler({'http': self.proxy_add})
                self.opener = ur.build_opener(self.proxy, ur.HTTPHandler)
                ur.install_opener(self.opener)
                return self.req

            # Save the response to a file
            def save_InFile(self):
                self.req = self.set_UrlAndProxy()
                self.info = ur.urlopen(self.req).read()
                with open(self.save_path, 'wb') as f:
                    f.write(self.info)

            # Return the response body as a string
            def feedbak_info(self):
                self.req = self.set_UrlAndProxy()
                self.info = ur.urlopen(self.req).read().decode('utf-8')  # decode() converts bytes to str, needed especially for Chinese text
                return str(self.info)
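
    A minimal usage sketch of the class (the target URL and proxy address below are placeholders, assumed reachable; substitute a live proxy before running):

        from Url_ProxyHelper import Url_ProxyHelper

        # Hypothetical target and proxy, for illustration only
        helper = Url_ProxyHelper("http://www.example.com/", "127.0.0.1:8080")
        html = helper.feedbak_info()  # fetch through the proxy, returned as str
        print(html[:200])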

    2. The crawler source code:

        import urllib.request as ur
        import urllib.error as ue
        from Url_ProxyHelper import Url_ProxyHelper
        import re

        # Target URL. quote() percent-encodes the Chinese characters, which would
        # otherwise cause parsing problems; '&' must be listed in safe= so the
        # query separators are not encoded to %26
        url = ur.quote("https://s.taobao.com/list?q=平板电脑&q=平板电脑&s=", safe='/:?=&', encoding='utf-8')
        # Directory where the downloaded images are stored
        save_path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo2/images/"
        # Proxy server IP
        proxy_add = "218.73.139.196:808"

        def craw(url, save_path, proxy_add, page):
            # Taobao's 's' parameter is the item offset: 48 items per page
            url = url + str((page - 1) * 48)
            # Fetch the page through the Url_ProxyHelper wrapper class
            uph = Url_ProxyHelper(url, proxy_add)
            infos = uph.feedbak_info()
            # Regular expression: in general, pull a concrete sample out of the
            # page first, then generalize it into a pattern
            pattern = r'"(pic_url)":"(.+?\.jpg)'
            infos = re.compile(pattern=pattern).findall(infos)
            x = 1
            for info in infos:
                image_name = save_path + str(page) + "_" + str(x) + ".jpg"
                image_url = "http:" + info[1]
                try:
                    ur.urlretrieve(image_url, filename=image_name)
                except ue.HTTPError as e:
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                except ue.URLError as e:
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                x += 1

        # Crawl only the first page
        craw(url, save_path, proxy_add, 1)
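
    To see what the two key pieces do, here is a small sketch; the pic_url fragment is made up for illustration, not real Taobao data:

        import urllib.request as ur
        import re

        # quote() leaves '&' alone only because it is listed in safe=;
        # otherwise it would become %26 and break the query string
        print(ur.quote("https://s.taobao.com/list?q=平板电脑&s=", safe='/:?=&'))
        # https://s.taobao.com/list?q=%E5%B9%B3%E6%9D%BF%E7%94%B5%E8%84%91&s=

        # What the pic_url pattern extracts from the page's embedded JSON
        sample = '"pic_url":"//g-search1.alicdn.com/img/i4/abc123.jpg"'
        print(re.findall(r'"(pic_url)":"(.+?\.jpg)', sample))
        # [('pic_url', '//g-search1.alicdn.com/img/i4/abc123.jpg')]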
  • Original post: https://www.cnblogs.com/xiaomingzaixian/p/7110868.html