  • Lvmama (驴妈妈旅游) travel crawler

    Overview page: scraping links

    The first script walks a fixed list of channel/search pages, regex-matches the product detail links out of each results page, and stores every link in the gly table keyed by its MD5 hash and tagged '0' (unparsed).

      import requests
      import re
      import pymysql
      import hashlib
      import datetime


      class Demo(object):
          def __init__(self):
              # MySQL connection settings
              self.host = '127.0.0.1'
              self.db = 'app_mark'
              self.user = 'root'
              self.passwd = '123456'
              self.charset = 'utf8mb4'
              self.headers = {
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
              }
              self.url = 'http://www.lvmama.com/'
              # Channel/search result pages to crawl; each keyword parameter is URL-encoded Chinese
              self.channel_link = [
                  'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # Islands
                  'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia
                  'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong/Macau/Taiwan
                  'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
                  'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
                  'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # Vietnam
                  'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # France/Switzerland/Italy/Germany
                  'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
                  'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
                  'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe
                  'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
                  'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
                  'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
                  'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
                  'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
                  'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p2
                  'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p3
                  'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p4
                  'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
                  'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
                  'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
                  'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
                  'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
                  'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
                  'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
                  'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest China
                  'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
                  'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
                  'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
                  'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
                  'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
              ]
              # Channel display names, index-aligned with channel_link
              # (kept in Chinese: they are runtime data, printed alongside each scraped link)
              self.channel_name = [
                  '海岛',
                  '东南亚',
                  '中国港澳台',
                  '迪拜',
                  '俄罗斯',
                  '越南',
                  '法瑞意德',
                  '巴厘岛',
                  '日本',
                  '欧洲',
                  '新加坡',
                  '香港',
                  '澳洲',
                  '泰国',
                  '三亚',
                  '三亚p2',
                  '三亚p3',
                  '三亚p4',
                  '厦门',
                  '广东',
                  '云南',
                  '上海',
                  '西安',
                  '成都',
                  '吉林',
                  '西北',
                  '北京',
                  '山东',
                  '山西',
                  '河北',
                  '辽宁',
              ]

          def get_html(self, url):
              response = requests.get(url, headers=self.headers)
              # Guess the encoding from the body to avoid mojibake
              response.encoding = response.apparent_encoding
              html = response.text
              return html

          def get_data(self):
              # Home-page scraping (kept commented out for reference)
              # html = self.get_html(self.url)
              # datas = re.findall('<li data-mmurl=.*?<div class="footLink">', html, re.S)[0]
              # lis = re.findall('(<li data-mmurl=.*?</li>)', datas, re.S)
              # for li in lis:
              #     # detail_url = re.findall('<li data-mmurl="(.*?)"', li, re.S)  # detail-page app link
              #     detail_url = re.findall('href="(.*?)"', li, re.S)[0]  # detail-page web link
              #     self.save_data(detail_url)
              # print(datas)

              # Channel scraping: regex-match the product links on each channel page
              for index, channel in enumerate(self.channel_link):
                  html = self.get_html(channel)
                  divs = re.findall('<div class="product-left".*<div class="paging orangestyle"', html, re.S)[0]
                  divs = re.findall('<div class="product-section">.*?</div>', divs, re.S)
                  for div in divs:
                      print(self.channel_name[index])
                      url = re.findall('<a href="(.*?)"', div, re.S)[0]
                      self.save_data(url)

          def save_data(self, url):
              print(url)
              # MD5 of the link doubles as a dedup key
              hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
              sitename = '驴妈妈旅游'  # kept in Chinese: the detail-page stage filters on this value
              lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
              tag = '0'  # '0' = not yet parsed by the detail-page stage
              list_sql = [url, hkey, tag, sitename, lasttime]
              con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
              cur = con.cursor()
              sql = 'insert into gly(link, hkey, tag, sitename, lasttime) values (%s, %s, %s, %s, %s)'
              try:
                  cur.execute(sql, list_sql)
                  print('insert success')
              except Exception as e:
                  con.rollback()
                  print('error~', e)
              else:
                  con.commit()
              cur.close()
              con.close()


      if __name__ == '__main__':
          demo = Demo()
          demo.get_data()
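
    The post never shows the DDL for the gly table that save_data() inserts into. Below is a minimal sketch of a schema consistent with that INSERT; the column types and the unique key on hkey are assumptions (a unique key would explain why re-crawled duplicate links fail the INSERT and get rolled back):

      import pymysql

      # Hypothetical schema for the gly link queue; not part of the original post.
      DDL = """
      CREATE TABLE IF NOT EXISTS gly (
          link     VARCHAR(512) NOT NULL,   -- product detail-page URL
          hkey     CHAR(32)     NOT NULL,   -- MD5 of link, used as a dedup key
          tag      CHAR(1)      NOT NULL,   -- '0' = unparsed, '1' = parsed
          sitename VARCHAR(64)  NOT NULL,   -- e.g. '驴妈妈旅游'
          lasttime DATETIME     NOT NULL,
          UNIQUE KEY uq_hkey (hkey)         -- assumed: rejects duplicate links
      ) DEFAULT CHARSET = utf8mb4;
      """

      con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root',
                            passwd='123456', charset='utf8mb4')
      with con.cursor() as cur:
          cur.execute(DDL)
      con.commit()
      con.close()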

    Detail page: parsing fields

    The second script pulls this site's unparsed links out of the gly table, fetches each detail page, and regex-matches the title, price, praise rate, departure and destination cities, trip length, and product type into the lvmama table, driving the parsing with a 20-worker thread pool.

      import pymysql
      import re
      import requests
      from multiprocessing.dummy import Pool as ThreadPool
      import datetime


      class XLY(object):
          def __init__(self):
              self.host = '127.0.0.1'
              self.db = 'app_mark'
              self.user = 'root'
              self.passwd = '123456'
              self.charset = 'utf8mb4'
              self.headers = {
                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
              }
              self.start = datetime.datetime.now()  # for the elapsed-time print in __main__

          def get_data(self):
              # Select this site's unparsed links (tag="0") from the gly table,
              # then flip them to tag="1" so reruns skip them
              con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
              cur = con.cursor()
              sql = 'select link from gly where tag = "0" and sitename="驴妈妈旅游"'
              after_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"'
              try:
                  cur.execute(sql)
                  results = cur.fetchall()
                  cur.execute(after_sql)
              except Exception as e:
                  con.rollback()
                  results = None
                  print('error~', e)
              else:
                  con.commit()
              cur.close()
              con.close()
              return results

          def parse_data(self, url):
              # Regex-match each field from the detail page
              print(url)
              url = url[0]  # each row from fetchall() is a 1-tuple
              # Product id: last path segment, minus any query string
              id = url.split('/')[-1]
              id = re.sub(r'\?.*', '', id)
              response = requests.get(url, headers=self.headers)
              html = response.text
              if 'scenic' not in url and 'hotel' not in url:
                  # Skip hotel and scenic-spot pages
                  # Title (two page templates)
                  title = re.findall('<h.*?tit">(.*?)</h.*?>', html, re.S)
                  if title:
                      title = title[0]
                      # Strip newlines, &nbsp;, the 自营 (self-operated) badge, and any tags
                      title = re.sub(r'\r|\n|&nbsp;|自营|<[\s\S]*?>', '', title)
                      title = title.strip()
                  else:
                      title = re.findall('<p class="nchtitle">(.*?)</p>', html, re.S)
                      if title:
                          title = title[0]
                          title = re.sub(r'\r|\n|&nbsp;|自营|<[\s\S]*?>', '', title)
                          title = title.strip()
                  # Price (four page templates, tried in order)
                  price = re.findall(r'<dfn.*?>(\d+)</dfn>', html, re.S)
                  if price:
                      price = price[0]
                  else:
                      price = re.findall(r'<span class="product_price">.*?(\d+).*?</span>', html, re.S)
                      if price:
                          price = price[0]
                      else:
                          price = re.findall(r'¥<em>(\d+)</em>', html, re.S)
                          if price:
                              price = price[0]
                          else:
                              price = re.findall(r'<span class="product-price-value">.*?(\d+).*?</span>', html, re.S)
                              if price:
                                  price = price[0]
                              else:
                                  price = None
                  # Praise (positive-review) rate, again across several templates
                  praise = re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)</span>[\s\S]*?</p>', html, re.S)
                  if praise:
                      praise = praise[0]
                      praise = re.sub('<.*?>', '', praise)
                      praise = praise.strip()
                  else:
                      praise = re.findall(r'<a href="#pro_comment".*?<span>([\s\S]*?)</span>', html, re.S)
                      if praise:
                          praise = praise[0]
                      else:
                          praise = re.findall(r'<span class="c_f60">([\s\S]*?)</span>', html, re.S)
                          if praise:
                              praise = praise[0]
                              praise = praise.strip()
                          else:
                              praise = re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)<small>%</small>[\s\S]*?</span>', html, re.S)
                              if praise:
                                  praise = praise[0]
                                  praise = praise.strip()
                              else:
                                  praise = re.findall(r'<span class="val">([\s\S]*?)</span>', html, re.S)
                                  if praise:
                                      praise = praise[0]
                  if praise:
                      if '%' in praise:
                          praise = re.sub('%', '', praise)
                      praise = float(praise)
                      if praise > 100:
                          praise = None
                          print('praise rate parse error')
                  else:
                      praise = None
                  # Departure city, destination, and trip length share one info block
                  starting_city = re.findall(r'<dl class="info-city">[\s\S]*?出发城市[\s\S]*?<ii>([\s\S]*?)</ii></dd>', html, re.S)
                  target_city = re.findall(r'<dt>目的地[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
                  if starting_city:
                      starting_city = starting_city[0]
                      starting_city = re.sub('<.*?>', '', starting_city)
                      # Destination
                      target_city = target_city[0]
                      target_city = re.sub('<.*?>', '', target_city)
                      # Trip length in days
                      days_spent = re.findall(r'<dt>出游天数[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)[0]
                      days_spent = re.sub('<.*?>', '', days_spent)
                  else:
                      starting_city = target_city = days_spent = None
                  # Product type (three templates)
                  type_ = re.findall(r'<i class="t-category">([\s\S]*?)</i>', html, re.S)
                  if type_:
                      type_ = type_[0]
                  else:
                      type_ = re.findall(r'<span class="product_top_type product_type_zyx">([\s\S]*?)</span>', html, re.S)
                      if type_:
                          type_ = type_[0]
                      else:
                          type_ = re.findall(r'<span class="dpn_group">([\s\S]*?)</span>', html, re.S)
                          if type_:
                              type_ = type_[0]
                          else:
                              type_ = None
                  list_data = [id, title, price, praise, starting_city, target_city, days_spent, type_, url]
                  self.save_data(list_data)

          def save_data(self, list_data):
              # Write one parsed product row into the lvmama table
              con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
              cur = con.cursor()
              sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
              try:
                  cur.execute(sql, list_data)
                  print('insert success')
              except Exception as e:
                  con.rollback()
                  print('error~', e)
              else:
                  con.commit()
              cur.close()
              con.close()


      if __name__ == '__main__':
          xly = XLY()
          urls = xly.get_data()
          if urls:
              # Parse with a 20-worker thread pool
              pool = ThreadPool(20)
              pool.map(xly.parse_data, urls)
              pool.close()
              pool.join()
          end = datetime.datetime.now()
          print('elapsed:', end - xly.start)
          # Single-threaded alternative, handy for debugging:
          # for url in urls:
          #     xly.parse_data(url)
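
    The price, praise, and type lookups above all repeat the same try-patterns-in-order idea by hand. A small helper (hypothetical, not in the original post) would flatten those nested fallback chains without changing behavior:

      import re

      def first_match(patterns, html, default=None):
          """Return the first capture of the first pattern that matches, else default."""
          for pat in patterns:
              found = re.findall(pat, html, re.S)
              if found:
                  return found[0]
          return default

      # e.g. the four price templates from parse_data() collapse to:
      # price = first_match([
      #     r'<dfn.*?>(\d+)</dfn>',
      #     r'<span class="product_price">.*?(\d+).*?</span>',
      #     r'¥<em>(\d+)</em>',
      #     r'<span class="product-price-value">.*?(\d+).*?</span>',
      # ], html)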