zoukankan      html  css  js  c++  java
  • python3爬虫 -----爬取百思不得姐信息-------http://www.budejie.com/

     1 # -*- coding:utf-8 -*-
     2 # author:zxy
     3 # Date:2018-10-21
     4 
     5 import request
     6 from lxml import etree
     7 import threading
     8 from queue import Queue
     9 import csv
    10 import requests
    11 
    12 class Produce(threading.Thread):
    13     headers = {
    14         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    15                       'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    16         'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
    17                   ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    18     }
    19     def __init__(self,page_queue,joke_queue,*args,**kwargs):
    20         super(Produce, self).__init__(*args,**kwargs)
    21         self.base_domain="http://www.budejie.com"
    22         self.page_queue = page_queue
    23         self.joke_queue = joke_queue
    24     def run(self):
    25         while True:
    26             if self.page_queue.empty():
    27                 break
    28             url=self.page_queue.get()
    29             self.parse_url(url)
    30 
    31     def parse_url(self,url):
    32         reponse=requests.get(url,headers=self.headers)
    33         text=reponse.text
    34         html=etree.HTML(text)
    35         descs=html.xpath("//div[@class='j-r-list-c-desc']")
    36         for desc in descs:
    37             jokes=desc.xpath(".//text()")
    38             joke="
    ".join(jokes).strip()
    39             link=self.base_domain+desc.xpath(".//a/@href")[0]
    40             self.joke_queue.put((joke,link))
    41         print("="*30+"第%s页下载完成!"%url.split('/')[-1]+"="*30)
    42 
    43 
    44 class Consumer(threading.Thread):
    45     headers = {
    46         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    47                       'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    48         'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
    49                   ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    50     }
    51     def __init__(self,joke_queue,write,gLock,*args,**kwargs):
    52         super(Consumer, self).__init__(*args,**kwargs)
    53         self.joke_queue=joke_queue
    54         self.write=write
    55         self.gLock=gLock
    56 
    57     def run(self):
    58         while True:
    59             try:
    60                 joke_info=self.joke_queue.get(timeout=40)
    61                 joke,link=joke_info
    62                 self.gLock.acquire()
    63                 self.write.writerow((joke,link))
    64                 self.gLock.release()
    65             except:
    66                 break
    67 
    68 
    69 def main():
    70     page_queue=Queue(100)
    71     joke_queue=Queue(1000)
    72     gLock=threading.Lock()
    73     fp=open('baisibudejie.csv','a',newline='',encoding='utf-8')
    74     writer=csv.writer(fp)
    75     writer.writerow(('content','link'))
    76 
    77     for x in range(1,11):
    78         url="http://www.budejie.com/%d"%x
    79         page_queue.put(url)
    80 
    81     for x  in range(5):
    82         t=Produce(page_queue,joke_queue)
    83         t.start()
    84 
    85     for x in range(3):
    86         t=Consumer(joke_queue,writer,gLock)
    87         t.start()
    88 
    89 
    90 if __name__ == '__main__':
    91     main()
  • 相关阅读:
    vue-cli和webpack中本地静态图片的路径问题解决方案
    vue-cli 中的配置分析
    webpack中manifest.js vendor.js app.js 三者的区别
    css 中可以继承的属性
    有关正则表达式的Js方法(replace)
    css 常用的几种垂直居中(包括图片)
    如何在Vue中建立全局引用或者全局命令
    删除数组中多个不连续的数组元素的正确姿势
    数据库
    代码片段
  • 原文地址:https://www.cnblogs.com/z-712/p/9824940.html
Copyright © 2011-2022 走看看