  • Python WeChat article crawler example

    Single-threaded version:

    import urllib.request
    import urllib.parse
    import urllib.error
    import re, time

    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

    list_url = []


    ### Fetch the page content of a URL (optionally through a proxy)
    def use_proxy(url):
        try:
            # proxy = urllib.request.ProxyHandler({'http': proxy_addr})    ## proxy version
            # opener = urllib.request.build_opener(proxy)
            # urllib.request.install_opener(opener)
            headers = ("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
            opener = urllib.request.build_opener()
            opener.addheaders = [headers]
            urllib.request.install_opener(opener)
            data = urllib.request.urlopen(url).read().decode('utf-8')
            return data
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception: " + str(e))
            time.sleep(1)


    ## Collect the article URLs to crawl from the Sogou WeChat search results
    def get_url(key, pagestart, pageend):
        try:
            keycode = urllib.parse.quote(key)

            for page in range(pagestart, pageend + 1):
                ## type=2 searches articles; page through the result pages
                url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (
                    keycode, page)
                data1 = use_proxy(url)
                listurl_pattern = '<h3>.*?("http://.*?)</h3>'
                result = re.compile(listurl_pattern, re.S).findall(data1)
                for i in range(len(result)):
                    res = result[i].replace("amp;", "").split(" ")[0].replace('"', "")
                    list_url.append(res)
            return list_url
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception:", e)


    ## Fetch every collected URL, extract title and body, and append them to an HTML file
    def get_url_content(list_url):
        fh1 = open(r"D:\python-script\1.html", 'wb')
        html1 = '''<!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>WeChat articles</title></head>
    <body>'''
        fh1.write(html1.encode("utf-8"))
        fh1.close()
        fh = open(r"D:\python-script\1.html", 'ab')
        for url in list_url:
            data_content = use_proxy(url)
            title_pattern = '<h2.*>.*?</h2>'
            result_title = re.compile(title_pattern, re.S).findall(data_content)
            ## article title (str)
            res_title = result_title[0].replace('<h2 class="rich_media_title" id="activity-name">', "").replace("</h2>", "").strip()

            ## article body
            content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
            content = re.compile(content_pattern, re.S).findall(data_content)

            try:
                fh.write(res_title.encode("utf-8"))
                for i in content:
                    fh.write(i.strip().encode("utf-8"))
            except UnicodeEncodeError as e:
                continue

        fh.write("</body></html>".encode("utf-8"))
        fh.close()


    if __name__ == '__main__':
        pagestart = 1
        pageend = 2
        key = "人工智能"
        get_url(key, pagestart, pageend)
        get_url_content(list_url)
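    Note: the proxy branch inside use_proxy is left commented out above. If requests need to go
    through an HTTP proxy, a minimal sketch looks like the following (proxy_addr is a hypothetical
    placeholder, not a working proxy; substitute a real host:port):

    import urllib.request

    def use_proxy(url, proxy_addr="127.0.0.1:8888"):   # proxy_addr is a placeholder
        # Route the request through the given HTTP proxy and return the decoded page.
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy)
        opener.addheaders = [("User-Agent", "Mozilla/5.0")]
        urllib.request.install_opener(opener)
        return urllib.request.urlopen(url, timeout=10).read().decode('utf-8')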

    Multi-threaded version:

    import urllib.request
    import urllib.parse
    import urllib.error
    import re, time
    import sys
    import queue
    import threading

    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

    urlque = queue.Queue()
    list_url = []

    ### Fetch the page content of a URL (optionally through a proxy)
    def use_proxy(url):
        try:
            # proxy = urllib.request.ProxyHandler({'http': proxy_addr})
            # opener = urllib.request.build_opener(proxy)
            # urllib.request.install_opener(opener)
            headers = ("User-Agent",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
            opener = urllib.request.build_opener()
            opener.addheaders = [headers]
            urllib.request.install_opener(opener)
            data = urllib.request.urlopen(url).read().decode('utf-8')
            return data
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            elif hasattr(e, "reason"):
                print(e.reason)
        except Exception as e:
            print("exception: " + str(e))
            time.sleep(1)


    ### Producer thread: collect the article URLs and push them onto the queue
    class get_url(threading.Thread):
        def __init__(self, key, pagestart, pageend, urlque):
            threading.Thread.__init__(self)
            self.pagestart = pagestart
            self.pageend = pageend
            self.key = key
            self.urlque = urlque

        def run(self):
            try:
                keycode = urllib.parse.quote(self.key)

                for page in range(self.pagestart, self.pageend + 1):
                    ## type=2 searches articles; page through the result pages
                    url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (keycode, page)
                    data = use_proxy(url)
                    listurl_pattern = '<h3>.*?("http://.*?)</h3>'
                    result = re.compile(listurl_pattern, re.S).findall(data)
                    print(result)
                    if len(result) == 0:
                        print("no usable url found")
                        sys.exit()
                    for i in range(len(result)):
                        res = result[i].replace("amp;", "").split(" ")[0].replace('"', "")
                        self.urlque.put(res)            ## push the article URL onto the queue

            except urllib.error.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                elif hasattr(e, "reason"):
                    print(e.reason)
            except Exception as e:
                print("exception:", e)

    ## Consumer thread: fetch the content of each queued URL and write it to an HTML file
    class get_url_content(threading.Thread):
        def __init__(self, urlque):
            threading.Thread.__init__(self)
            self.urlque = urlque

        def run(self):
            fh1 = open(r"D:\python-script\1.html", 'wb')
            html1 = '''<!DOCTYPE html>
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>WeChat articles</title></head>
    <body>'''
            fh1.write(html1.encode("utf-8"))
            fh1.close()
            fh = open(r"D:\python-script\1.html", 'ab')
            while True:
                try:
                    url = self.urlque.get()
                    data_content = use_proxy(url)

                    title_pattern = '<h2.*>.*?</h2>'
                    result_title = re.compile(title_pattern, re.S).findall(data_content)
                    ## article title
                    res_title = result_title[0].replace('<h2 class="rich_media_title" id="activity-name">', "").replace("</h2>", "").strip()

                    ## article body
                    content_pattern = 'id="js_content">(.*?)<div class="rich_media_tool" id="js_sg_bar">'
                    content = re.compile(content_pattern, re.S).findall(data_content)

                    fh.write(res_title.encode("utf-8"))
                    for i in content:
                        fh.write(i.strip().encode("utf-8"))
                except UnicodeEncodeError as e:
                    continue
                finally:
                    self.urlque.task_done()    ## mark this queue item as processed


    ## Control thread: report progress and exit once the queue has been drained
    class contrl(threading.Thread):
        def __init__(self, urlqueue):
            threading.Thread.__init__(self)
            self.urlqueue = urlqueue

        def run(self):
            while True:
                print("program is running")
                time.sleep(1)    ## avoid busy-looping
                if self.urlqueue.empty():
                    time.sleep(3)
                    print("program finished")
                    exit()


    if __name__ == '__main__':
        pagestart = 1
        pageend = 2
        key = "人工智能"

        url_thread = get_url(key, pagestart, pageend, urlque)
        url_thread.start()

        content_thread = get_url_content(urlque)
        content_thread.start()

        control_thread = contrl(urlque)
        control_thread.start()
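
    Note: the contrl thread above polls urlque.empty() to decide when the crawl is done. An
    alternative sketch (not from the original post) uses queue.Queue.join() plus a sentinel
    value, so the consumer exits cleanly once every queued URL has been processed:

    import queue
    import threading

    q = queue.Queue()

    def consumer():
        while True:
            item = q.get()
            if item is None:             # sentinel: no more URLs will arrive
                q.task_done()
                break
            print("processing", item)    # replace with real work, e.g. use_proxy(item)
            q.task_done()

    t = threading.Thread(target=consumer)
    t.start()
    for u in ["http://example.com/a", "http://example.com/b"]:   # placeholder URLs
        q.put(u)
    q.put(None)    # tell the consumer to stop
    q.join()       # block until every queued item has been marked done
    t.join()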
    

      

  • Original article: https://www.cnblogs.com/FRESHMANS/p/8125594.html