zoukankan      html  css  js  c++  java
  • GET请求/百度贴吧 有bug

     1 #  -*- coding:utf-8 -*-
     2 import urllib, urllib2
     3 import re
     4 import sys
     5 
     6 class Cuzz():
     7     """这是一个类"""
     8     def __init__(self, url, header, start_page, end_page):
     9         self.url = url 
    10         self.header = header
    11         self.start_page = start_page
    12         self.end_page = end_page
    13 
    14     def deal_url(self):
    15         """处理url"""
    16         for i in range(self.start_page, self.end_page+1):
    17             num = 50*(i-1)
    18             url = self.url+str(num)
    19             request = urllib2.Request(url, headers=self.header)
    20             response = urllib2.urlopen(request)
    21             htmltext = response.read()
    22             self.load_images(htmltext)
    23 
    24 
    25     def load_images(self, htmltext):
    26         """下载图片"""
    27         # 找出这一页的所有帖子类似这样的/p/1111111111
    28         pattern = re.findall(r"/p/d+", htmltext)        
    29         for temp in pattern:
    30             url = "http://tieba.baidu.com" + temp
    31             request = urllib2.Request(url, headers=self.header)
    32             response = urllib2.urlopen(request)
    33             htmltext1 = response.read()
    34             self.save_images(htmltext1)
    35 
    36 
    37     def save_images(self, htmltext1):
    38         """保存到本地"""
    39         image_links = re.findall(r"https://imgsa.baidu.+?.jpg",htmltext1)    
    40         for url in image_links:
    41             request = urllib2.Request(url, headers=self.header)
    42             response = urllib2.urlopen(request)
    43             htmltext1 = response.read()
    44             print htmltext1
    45             with open("./images/"+str(url[-10:-1]), "w") as f:
    46                 f.write(htmltext1)
    47 
    48 def main():
    49     """控制"""
    50     header = {"User-Agent":"Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1"}
    51     title = raw_input("请输入您要下载的贴吧:")
    52     keyword = {"kw":title} # 是一个字典的格式,转换后"kw=%34dgfdg%fg
    53     
    54     keyword = urllib.urlencode(keyword) # 有中文需要转成url的格式
    55     
    56     url = "https://tieba.baidu.com/f?" + keyword + "&ie=utf-8&pn="
    57 
    58     start_page = int(raw_input("请输入起始页面:"))
    59     end_page = int(raw_input("请输入截止页面:"))
    60     
    61     
    62     cuzz = Cuzz(url, header, start_page, end_page)
    63     cuzz.deal_url() 
  • 相关阅读:
    运行时动态的创建form的方法
    用X++代码来动态的改变表的属性
    使用WinAPI类来查找文件
    用循环得到表中所有的字段
    用X++建立和调用报表(Report)
    JAVA 保留字
    Cygwin使用
    系统程序员成长计划-算法与容器(三) (上)
    系统程序员成长计划工程管理(二)
    系统程序员成长计划-算法与容器(三) (下)
  • 原文地址:https://www.cnblogs.com/cuzz/p/7643261.html
Copyright © 2011-2022 走看看