概览页抓取链接
import datetime
import hashlib
import re

import pymysql
import requests


class Demo(object):
    """Collect product detail links from lvmama.com channel listing pages.

    Each URL in ``channel_link`` is downloaded, the product sections are cut
    out of the listing markup with regular expressions, and the first link of
    every section is inserted into the MySQL table ``gly`` with ``tag='0'``.
    """

    # Seconds to wait for the server before giving up on a request, so a
    # single stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Everything between the product list container and the pager.
    _LISTING_RE = re.compile(
        r'<div class="product-left".*<div class="paging orangestyle"', re.S)
    # One product card inside the listing.
    _SECTION_RE = re.compile(r'<div class="product-section">.*?</div>', re.S)
    # First hyperlink inside a product card.
    _LINK_RE = re.compile(r'<a href="(.*?)"', re.S)

    def __init__(self):
        # MySQL connection settings.
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        }
        # Site home page; not used by the channel crawl in get_data().
        self.url = 'http://www.lvmama.com/'
        # Listing pages to crawl; channel_name[i] labels channel_link[i].
        self.channel_link = [
            'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # islands
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong / Macau / Taiwan
            'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
            'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
            'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # Vietnam
            'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # France / Switzerland / Italy / Germany
            'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe
            'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
            'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya, page 2
            'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya, page 3
            'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya, page 4
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
            'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest China
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
            'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
            'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
        ]
        self.channel_name = [
            '海岛',
            '东南亚',
            '中国港澳台',
            '迪拜',
            '俄罗斯',
            '越南',
            '法瑞意德',
            '巴厘岛',
            '日本',
            '欧洲',
            '新加坡',
            '香港',
            '澳洲',
            '泰国',
            '三亚',
            '三亚p2',
            '三亚p3',
            '三亚p4',
            '厦门',
            '广东',
            '云南',
            '上海',
            '西安',
            '成都',
            '吉林',
            '西北',
            '北京',
            '山东',
            '山西',
            '河北',
            '辽宁',
        ]

    def get_html(self, url):
        """Download ``url`` and return its body decoded as text.

        The response encoding is replaced by the encoding detected from the
        content before the text is decoded.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.encoding = response.apparent_encoding
        return response.text

    def get_data(self):
        """Crawl every channel listing page and store the product links.

        A channel whose page cannot be downloaded, or whose markup does not
        contain the expected product list, is reported and skipped so that
        the remaining channels are still processed.
        """
        for name, channel in zip(self.channel_name, self.channel_link):
            try:
                html = self.get_html(channel)
            except requests.RequestException as e:
                print('error~', e)
                continue
            listing = self._LISTING_RE.search(html)
            if listing is None:
                # Layout changed or the request was blocked: nothing to parse.
                print('error~', 'no product list found:', channel)
                continue
            for section in self._SECTION_RE.findall(listing.group(0)):
                link = self._LINK_RE.search(section)
                if link is None:
                    # Product card without a hyperlink; nothing to store.
                    continue
                print(name)
                self.save_data(link.group(1))

    def save_data(self, url):
        """Insert one detail-page link into the ``gly`` table.

        The row holds the link, its MD5 hex digest, the tag ``'0'``, the site
        name and the current local timestamp. Database errors are printed and
        the transaction is rolled back; they are not re-raised.
        """
        print(url)
        hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
        sitename = '驴妈妈旅游'
        lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tag = '0'
        row = [url, hkey, tag, sitename, lasttime]
        sql = 'insert into gly(link, hkey, tag, sitename, lasttime) values (%s, %s, %s, %s, %s)'
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            try:
                cur.execute(sql, row)
                con.commit()
                print('insert success')
            except pymysql.MySQLError as e:
                con.rollback()
                print('error~', e)
            finally:
                cur.close()
        finally:
            # Always release the connection, even if rollback itself fails.
            con.close()


if __name__ == '__main__':
    demo = Demo()
    demo.get_data()
细览页解析字段
import datetime
import re
from multiprocessing.dummy import Pool as ThreadPool

import pymysql
import requests


# Any HTML tag.
_TAG_RE = re.compile(r'<.*?>')

# Noise removed from titles: whitespace, the "自营" badge and any markup.
# NOTE(review): the published pattern listed three whitespace-looking
# alternatives that cannot be told apart any more; \s is used as the
# superset of them - confirm against real pages.
_TITLE_NOISE_RE = re.compile(r'\s|自营|<[\s\S]*?>')

# Candidate patterns per field, tried in order; the first hit wins.
_TITLE_PATTERNS = [re.compile(p, re.S) for p in (
    r'<h.*?tit">(.*?)</h.*?>',
    r'<p class="nchtitle">(.*?)</p>',
)]
_PRICE_PATTERNS = [re.compile(p, re.S) for p in (
    r'<dfn.*?>(\d+)</dfn>',
    r'<span class="product_price">.*?(\d+).*?</span>',
    r'¥<em>(\d+)</em>',
    r'<span class="product-price-value">.*?(\d+).*?</span>',
)]
_PRAISE_PATTERNS = [re.compile(p, re.S) for p in (
    r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)</span>[\s\S]*?</p>',
    r'<a href="#pro_comment".*?<span>([\s\S]*?)</span>',
    r'<span class="c_f60">([\s\S]*?)</span>',
    r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)<small>%</small>[\s\S]*?</span>',
    r'<span class="val">([\s\S]*?)</span>',
)]
_TYPE_PATTERNS = [re.compile(p, re.S) for p in (
    r'<i class="t-category">([\s\S]*?)</i>',
    r'<span class="product_top_type product_type_zyx">([\s\S]*?)</span>',
    r'<span class="dpn_group">([\s\S]*?)</span>',
)]
_STARTING_CITY_RE = re.compile(
    r'<dl class="info-city">[\s\S]*?出发城市[\s\S]*?<ii>([\s\S]*?)</ii></dd>',
    re.S)
_TARGET_CITY_RE = re.compile(r'<dt>目的地[\s\S]*?<dd>([\s\S]*?)</dd>', re.S)
_DAYS_SPENT_RE = re.compile(r'<dt>出游天数[\s\S]*?<dd>([\s\S]*?)</dd>', re.S)


def _first_group(patterns, html):
    """Return group 1 of the first pattern that matches, else None."""
    for pattern in patterns:
        match = pattern.search(html)
        if match is not None:
            return match.group(1)
    return None


def _parse_title(html):
    """Return the product title with noise removed, or None if not found."""
    raw = _first_group(_TITLE_PATTERNS, html)
    if raw is None:
        return None
    return _TITLE_NOISE_RE.sub('', raw).strip()


def _parse_praise(html):
    """Return the positive-review rate as a float in [.., 100], or None.

    Text that is not numeric, or a value above 100, is reported as a
    scraping error and yields None.
    """
    raw = _first_group(_PRAISE_PATTERNS, html)
    if raw is None:
        return None
    text = _TAG_RE.sub('', raw).replace('%', '').strip()
    if not text:
        return None
    try:
        value = float(text)
    except ValueError:
        print('好评率抓取错误')
        return None
    if value > 100:
        print('好评率抓取错误')
        return None
    return value


def _parse_trip(html):
    """Return (starting_city, target_city, days_spent) with tags removed.

    When the page has no departure-city block all three values are None;
    otherwise a missing destination or duration is None on its own.
    """
    starting_city = _first_group([_STARTING_CITY_RE], html)
    if starting_city is None:
        return None, None, None
    target_city = _first_group([_TARGET_CITY_RE], html)
    days_spent = _first_group([_DAYS_SPENT_RE], html)
    return (
        _TAG_RE.sub('', starting_city),
        None if target_city is None else _TAG_RE.sub('', target_city),
        None if days_spent is None else _TAG_RE.sub('', days_spent),
    )


class XLY(object):
    """Parse lvmama.com product detail pages into the ``lvmama`` table.

    Links are read from the ``gly`` table, each page is downloaded and its
    fields are extracted with regular expressions.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # MySQL connection settings.
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        # Creation time; the script entry point uses it to report elapsed time.
        self.start = datetime.datetime.now()

    def get_data(self):
        """Fetch the unprocessed links from ``gly`` and mark them processed.

        Rows with ``tag = "0"`` for this site are selected and then updated
        to ``tag = "1"`` in the same transaction.

        Returns:
            The rows returned by the cursor (each row holds one link), or
            None when a database error occurred.
        """
        # Select exactly the rows that the update below marks as processed.
        select_sql = 'select link from gly where tag = "0" and sitename="驴妈妈旅游"'
        mark_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"'
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            try:
                cur.execute(select_sql)
                results = cur.fetchall()
                cur.execute(mark_sql)
                con.commit()
            except pymysql.MySQLError as e:
                con.rollback()
                results = None
                print('error~', e)
            finally:
                cur.close()
        finally:
            con.close()
        return results

    def parse_data(self, url):
        """Download one detail page, extract its fields and store them.

        Args:
            url: a row as returned by ``get_data`` (the link is its first
                element) or the link itself as a string.

        Links containing ``scenic`` or ``hotel`` are skipped. Download
        failures are printed and the link is skipped, so that one bad page
        does not abort a whole ``pool.map`` run.
        """
        print(url)
        if not isinstance(url, str):
            url = url[0]
        if 'scenic' in url or 'hotel' in url:
            # Hotels and scenic spots are excluded; no need to download them.
            return
        # Product id: last path segment without the query string.
        product_id = url.split('/')[-1].split('?', 1)[0]
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print('error~', e)
            return
        html = response.text

        title = _parse_title(html)
        price = _first_group(_PRICE_PATTERNS, html)
        praise = _parse_praise(html)
        starting_city, target_city, days_spent = _parse_trip(html)
        type_ = _first_group(_TYPE_PATTERNS, html)

        list_data = [product_id, title, price, praise, starting_city,
                     target_city, days_spent, type_, url]
        self.save_data(list_data)

    def save_data(self, list_data):
        """Insert one parsed product into the ``lvmama`` table.

        Args:
            list_data: nine values in column order - id_num, title, price,
                praise, starting_city, target_city, days_spent, type_, link.

        Database errors are printed and rolled back; they are not re-raised.
        """
        sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        try:
            cur = con.cursor()
            try:
                cur.execute(sql, list_data)
                con.commit()
                print('insert success')
            except pymysql.MySQLError as e:
                con.rollback()
                print('error~', e)
            finally:
                cur.close()
        finally:
            # Always release the connection, even if rollback itself fails.
            con.close()


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_data()
    if urls:
        # Pages are fetched concurrently; the work is network-bound.
        pool = ThreadPool(20)
        pool.map(xly.parse_data, urls)
        pool.close()
        pool.join()
        end = datetime.datetime.now()
        print('耗时:', (end - xly.start))