zoukankan      html  css  js  c++  java
  • 3.21

    转载一个养老院的爬虫项目:(为毕设数据做铺垫的)

import csv
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree

    parse_count = 1
    crawl_fail_list = []
    parse_fail_list = []
    #http://www.yanglao.com.cn/resthome_2
    class crawl_thread(threading.Thread):

    def __init__(self, name, page_queue, data_queue):
    super().__init__()
    self.name = name
    self.page_queue = page_queue
    self.data_queue =data_queue

    def run(self):
    global crawl_fail_list
    print("*********%s开始************" % self.name)
    while 1:
    #如果page-queue空就终止线程
    if self.page_queue.empty():
    break
    #从页码池获取数据,拼接url
    try:
    page = self.page_queue.get()
    url = 'http://www.yanglao.com.cn/resthome_' + str(page)
    headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    #发送请求,获得响应
    r = requests.get(url, headers=headers)
    #将响应放入数据队列
    self.data_queue.put(r.text)
    print('%s:第%s页爬取完成' % (self.name, page))
    time.sleep(0.3)
    except Exception as e:
    print(e)
    crawl_fail_list.appen(page)
    print("*********%s结束************" % self.name)

    class parse_thread(threading.Thread):

    def __init__(self, name, data_queue, suo, writer):
    super().__init__()
    self.name = name
    self.data_queue = data_queue
    self.suo = suo
    self.writer = writer

    def run(self):
    global parse_count
    global parse_fail_list
    print("*********%s开始************" % self.name)
    while 1:
    #从数据队列获得数据,如果超过30s没有新数据就终止
    try:
    content = self.data_queue.get(True,15)
    except:
    break
    #解析数据
    try:
    tree = etree.HTML(content)
    li_list = tree.xpath('//li[@class="rest-item"]')
    for li in li_list:
    name = li.xpath('.//h4/a/text()')[0]
    location = li.xpath('.//ul/li[1]/text()')[0].replace('地址:','')
    beds = li.xpath('.//ul/li[2]/text()')[0].replace('床位数:','').replace('张','')
    money = li.xpath('.//ul/li[3]/text()')[0].replace('收费区间:','')
    lt = [name, location, beds, money]
    #上锁写csv
    self.suo.acquire()
    self.writer.writerow(lt)
    self.suo.release()
    print("%s:第%s页写入完成" %(self.name, parse_count))
    #如果解析失败就抛出错误,继续循环
    except Exception as e:
    print(e)
    parse_fail_list.append(parse_count)
    parse_count += 1
    print("*********%s结束************" % self.name)

    ##################################################################
    def create_queue():
    #创建页码队列
    page_queue = Queue()
    #总页数
    for page in range(1, 1676):
    page_queue.put(page)
    #创建数据队列
    data_queue = Queue()
    return page_queue, data_queue

    def create_crawl_list(page_queue, data_queue):
    crawl_list = []
    name_list = ['爬虫1号', '爬虫2号']
    for name in name_list:
    crawl = crawl_thread(name, page_queue, data_queue)
    crawl_list.append(crawl)
    return crawl_list

    def create_parse_list(data_queue, suo, writer):
    parse_list = []
    name_list = ['解析1号', '解析2号']
    for name in name_list:
    parse = parse_thread(name, data_queue, suo, writer)
    parse_list.append(parse)
    return parse_list

    ###################################################
    def main():
    #创建队列
    page_queue, data_queue = create_queue()
    #创建锁
    suo = threading.Lock()
    #打开文件,创建writer
    f = open('养老院数据_全.csv', 'a', encoding='utf8', newline='')
    writer = csv.writer(f)
    #创建爬虫队列和解析队列
    crawl_list = create_crawl_list(page_queue, data_queue)
    parse_list = create_parse_list(data_queue, suo, writer)
    print(crawl_list, parse_list)
    #启动爬虫
    for crawl in crawl_list:
    crawl.start()
    for parse in parse_list:
    parse.start()
    #确保主线程最后关闭
    for crawl in crawl_list:
    crawl.join()
    for parse in parse_list:
    parse.join()
    #收尾
    f.close()
    print('所有线程关闭,程序结束!!!')
    print(crawl_fail_list)
    print(parse_fail_list)


    if __name__ == '__main__':
    main()

  • 相关阅读:
    web2.0网站如何设计UE/UI
    SQL查询入门(中篇)
    跟我学做c#皮肤美化(三)
    王通:SEO成功的秘密
    26个Jquery使用小技巧(jQuery tips, tricks & solutions)
    跟我学做c#皮肤美化(五)
    js iframe子父页面读取方式
    我的新网站上线了历史五千年www.lswqn.com在诵读经典中传承中华文明
    jQuery获取Radio,CheckBox选择的Value值
    C#仿QQ皮肤-主窗体MainForm和Main的实现
  • 原文地址:https://www.cnblogs.com/maxin123/p/12536922.html
Copyright © 2011-2022 走看看