  • I wrote a small crawler; why do I always get errors as soon as I use a proxy IP?

    import urllib.request
    import re
    import os
    import random
    import threading

    def url_open(url):
        # The commented-out proxy block below is the part that never runs
        # correctly; the proxy IPs came from a free online proxy list.
        #ips = ['117.136.234.12:80', '218.189.26.20:8080',
        #       '202.194.101.150:80', '180.166.112.47:8888']
        #proxy = urllib.request.ProxyHandler({'http': random.choice(ips)})
        #opener = urllib.request.build_opener(proxy)
        #opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36')]
        #urllib.request.install_opener(opener)

        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        urlobject = urllib.request.urlopen(req)
        response = urlobject.read()
        return response

    def find_page(html):
        # The current page number appears on the page as e.g. [1333].
        s2 = r'\[\d{4}\]'
        m = re.search(s2, html)
        page = m.group()
        return page

    def find_page_link(html):
        # Image links look like http://wwN.sinaimg.cn/mw600/xxxx.jpg
        s = r'http://ww[0-9]\.sinaimg\.cn/mw600/\w+\.jpg'
        m = re.findall(s, html)
        return m

    def save_page(jpg):
        for file in jpg:
            data = url_open(file)
            name = "E:\\作业\\j_d\\" + file.split('/')[-1]
            with open(name, 'wb') as f:
                f.write(data)

    def down_jpg(dir_name='E:\\作业\\j_d', page=10, pages=10):
        #os.mkdir(dir_name)
        os.chdir(dir_name)
        # Alternatively, detect the current page number automatically:
        #red = url_open('http://jandan.net/ooxx').decode('utf-8')
        #page = int(find_page(red)[1:-1])
        for i in range(pages):
            page += 1
            url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
            print(url)
            data = url_open(url).decode('utf-8')
            page_list = find_page_link(data)
            save_page(page_list)

    if __name__ == '__main__':
        p = threading.Thread(target=down_jpg, args=('E:\\作业\\j_d', 1555, 10))
        c = threading.Thread(target=down_jpg, args=('E:\\作业\\j_d', 1024, 10))
        #down_jpg()
        p.start()
        c.start()

        p.join()
        c.join()
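On the proxy question itself: addresses copied from free proxy lists are very often already dead, refuse connections, or time out, so urlopen raises URLError/HTTPError even when the surrounding code is correct. Below is a minimal sketch of what url_open could look like with proxy support plus basic error handling; the PROXIES list is a placeholder (taken from the IPs in the original code) and the fall-back-to-direct-request behaviour is my assumption, not part of the original script.

    import random
    import socket
    import urllib.error
    import urllib.request

    # Placeholder list; free proxies like these go stale within hours.
    PROXIES = ['117.136.234.12:80', '218.189.26.20:8080']

    def url_open(url, timeout=10):
        """Fetch url through a random proxy, falling back to a direct request."""
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

        # Build a per-request opener instead of calling install_opener(),
        # so one dead proxy does not become the process-wide default.
        proxy = 'http://' + random.choice(PROXIES)
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': proxy}))
        try:
            return opener.open(req, timeout=timeout).read()
        except (urllib.error.URLError, socket.timeout) as exc:
            print('proxy %s failed (%s), retrying without proxy' % (proxy, exc))
            return urllib.request.urlopen(req, timeout=timeout).read()

Since install_opener() changes the proxy setting for the whole process, building a per-request opener also keeps the two downloader threads from overwriting each other's configuration.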
  • Original post: https://www.cnblogs.com/nethk/p/4825131.html