zoukankan html css js c++ java
榛果美团登录爬虫 requests session

美团旗下所有产品的登录都是通过重定向来完成的，
因此可以利用 requests 的 session（会话）来保持登录状态；
当然也可以每次请求都手动带上它的 cookie 来模拟登录。
我用的代理是阿布云代理，你们也可以选择别的代理。
这次爬取的是美团旗下的榛果民宿。
  1 import requests
  2 from urllib.parse import urlencode
  3 import json
  4 import time, datetime
  5 import logging
  6 from lxml import etree
  7 import pymysql
  8 from pymysql.err import IntegrityError
  9 
# Proxy endpoints passed to every request made by session_get/session_post.
# NOTE(review): the URLs have no scheme and nothing before the "@" —
# presumably the credentials were redacted for publication; confirm and fill
# them in before running.
proxies_ = {
    'http': '@http-dyn.abuyun.com:9020',
    'https': '@http-dyn.abuyun.com:9020',
}
# Default request headers. This dict is shared module-wide and is mutated in
# place by redirect_login() and crawl_order(), which add x-csrf-token (and,
# in crawl_order, Accept / Content-Type) to it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52'
}
# A single session shared by all requests so that the login state obtained
# during the login flow is carried over to the crawl requests.
session = requests.Session()
 18 
 19 
 20 def session_get(url, header=headers, tab=12):
 21     if tab == 0:
 22         return False
 23     try:
 24         response = session.get(url, headers=header, proxies=proxies_)
 25         time.sleep(2)
 26         return response if response.status_code == 200 else session_get(url, header, tab - 1)
 27     except Exception as e:
 28         if tab == 1:
 29             logging.exception(e)
 30         return session_get(url, header, tab - 1)
 31 
 32 
 33 def session_post(url, header=headers, data=None, tab=12):
 34     if tab == 0:
 35         return False
 36     try:
 37         response = session.post(url, headers=header, data=data, proxies=proxies_)
 38         time.sleep(2)
 39         return response if response.status_code == 200 else session_post(url, header, data, tab - 1)
 40     except Exception as e:
 41         if tab == 1:
 42             logging.exception(e)
 43         return session_post(url, header, data, tab - 1)
 44 
 45 
 46 def get_node_text(node, xpath):
 47     """
 48     通过节点和xpath来获取节点需要的内容
 49     :param node:
 50     :param xpath:
 51     :return:
 52     """
 53     try:
 54         if xpath == "string(.)": return node.xpath('string(.)').strip()
 55         if len(node.xpath(xpath)) > 0:
 56             return node.xpath(xpath)[0].strip() if isinstance(node.xpath(xpath)[0], str) else node.xpath(xpath)[0]
 57         return ""
 58     except:
 59         logging.exception('获取xpath %s 出错' % (xpath))
 60         return None
 61 
 62 
 63 def get_youjia_tpp_conn():
 64     """
 65     获取井队数据库连接
 66     :return:
 67     """
 68     return pymysql.connect(host='host', user='user', passwd='passwd', db='db', port=3306,
 69                            charset='utf8')
 70 
 71 
 72 def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
 73     """
 74     非json类型数据存储数据库
 75     :param data_json:
 76     :param t_name:
 77     :param l_name:
 78     :return:
 79     """
 80     now_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
 81     data_list = []
 82     insert_sql = "INSERT INTO " + l_name + "." + t_name + " ("
 83     update_sql = "UPDATE " + l_name + "." + t_name + " SET "
 84     for key in data_json:
 85         update_sql += str(key) + "=%s , "
 86         if str(key) == "id":
 87             id_key = data_json[key]
 88         insert_sql += str(key) + ","
 89     update_sql += "modify_time = '" + str(now_time) + "' where id = '" + str(id_key) + "'"
 90     insert_sql = insert_sql[:-1]
 91     insert_sql += ")VALUES("
 92     for key in data_json:
 93         insert_sql += "%s,"
 94         data_list.append(str(data_json[key]))
 95     insert_sql = insert_sql[:-1]
 96     insert_sql += ");"
 97     # print(update_sql)
 98     # print(insert_sql)
 99     with get_youjia_tpp_conn() as conn:
100         try:
101             print("storage_database_text  insert_sql : ", t_name)
102             conn.execute(insert_sql, tuple(data_list))
103         except IntegrityError:
104             print("storage_database_text  update_sql : ", t_name)
105             conn.execute(update_sql, tuple(data_list))
106         except Exception as msg:
107             logging.exception(msg)
108 
109 
def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
    """Insert-or-update a single JSON column of one row.

    An INSERT of ``(id, <j_name>)`` is attempted first; when it fails with
    ``IntegrityError`` the JSON column and ``modify_time`` of the row with
    that ``id`` are updated instead.

    :param id_: value of the ``id`` column.
    :param data_json: the JSON text to store.
    :param j_name: name of the column that holds the JSON.
    :param t_name: table name.
    :param l_name: database (schema) name.
    :return: None. Database errors are logged, not raised.
    """
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Identifiers cannot be bound as parameters, so they are concatenated and
    # must come from trusted code; the values themselves are bound with %s.
    table = l_name + "." + t_name
    insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
    update_sql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"

    conn = get_youjia_tpp_conn()
    try:
        # Use an explicit cursor and commit: what ``with connection`` yields
        # differs between PyMySQL releases, a cursor does not.
        with conn.cursor() as cursor:
            try:
                print("storage_database_json  insert_sql : ", t_name)
                cursor.execute(insert_sql, (id_, data_json))
            except IntegrityError:
                print("storage_database_json  update_sql : ", t_name)
                cursor.execute(update_sql, (data_json, now_time, id_))
        conn.commit()
    except Exception as msg:
        logging.exception(msg)
    finally:
        conn.close()
133 
134 
def pre_login():
    """Fetch the passport login page for the ``phoenix`` service.

    :return: the page HTML, or ``None`` when the page could not be fetched.
    """
    param = {
        # 'uuid': 'e8514dbe200b4fde9393.1532912269.1.0.0',
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(param)
    response = session_get(url=url, header=headers, tab=5)
    # session_get returns False once its retries are exhausted, so the
    # response must be checked before any attribute is read from it.
    if not response:
        print('预登陆出错')
        return None
    print("pre_login 成功")
    return response.text
152 
153 
def parse_param(html):
    """Extract the hidden login-form fields from the pre-login page.

    :param html: HTML text of the login page.
    :return: the tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``,
        or ``None`` when any field is missing or the HTML cannot be parsed
        (the error is logged with its traceback).
    """
    try:
        tree = etree.HTML(html)
        csrf = tree.xpath('//input[@name="csrf"]/@value')[0]
        origin = tree.xpath('//input[@name="origin"]/@value')[0]
        fingerprint = tree.xpath('//input[@name="fingerprint"]/@value')[0]
        uuid = tree.xpath('//i[@class="form-uuid"]/text()')[0]
        # The captcha field's inline style with the "display:" prefix removed.
        captcha_style = tree.xpath(
            '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style')[0]
        need_captcha = captcha_style.replace("display:", "")
        return (csrf, uuid, need_captcha, origin, fingerprint)
    except Exception:
        # Not a bare ``except``: Ctrl-C / SystemExit must still propagate.
        logging.exception('解析csrf,uuid,need_captcha出错')
        return None
166 
167 
def formal_login(username, password, param,
                 captcha_path='C:/Users/admin/Desktop/image/zg.jpg'):
    """Submit the login form, asking the operator to solve the captcha.

    The captcha image is always downloaded and saved to ``captcha_path``;
    the operator then types the code at the prompt.

    :param username: account name, sent as the ``email`` form field.
    :param password: account password.
    :param param: the tuple returned by ``parse_param``:
        ``(csrf, uuid, need_captcha, origin, fingerprint)``.
        ``need_captcha`` is not consulted.
    :param captcha_path: file the captcha image is written to.
    :return: HTML of the login response, or ``None`` when the captcha or the
        login request failed.
    """
    csrf, uuid = param[0], param[1]
    origin, fingerprint = param[3], param[4]

    captcha_url = 'https://passport.meituan.com/account/captcha?' + urlencode({'uuid': uuid})
    print(captcha_url)
    image_resp = session_get(captcha_url)
    # session_get returns False once its retries are exhausted.
    if not image_resp:
        print('登录出错')
        return None
    with open(captcha_path, 'wb') as file:
        file.write(image_resp.content)
    captcha = input('需要验证码:')

    url_param = {
        'uuid': uuid,
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    postdata = {
        'email': username,
        'password': password,
        'captcha': captcha,
        'origin': origin,
        'fingerprint': fingerprint,
        'csrf': csrf,
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(url_param)
    response = session_post(url, data=postdata, header=headers)
    if not response:
        print('登录出错')
        return None
    print("登陆成功！")
    return response.text
208 
209 
def parse_token(html):
    """Extract the redirect form fields from the login response page.

    :param html: HTML text returned by the login POST.
    :return: a dict with the keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``, or ``None`` when a field is
        missing or the HTML cannot be parsed (the error is logged).
    """
    try:
        tree = etree.HTML(html)
        fields = {
            "action_url": tree.xpath('//form[@class="J-form mainbox__content"]/@action')[0],
        }
        for name in ("token", "expire", "isdialog", "autologin"):
            fields[name] = tree.xpath('//input[@name="%s"]/@value' % name)[0]
        fields["csrf"] = tree.xpath('//*[@id="csrf"]/text()')[0]
        return fields
    except Exception:
        # Not a bare ``except``: Ctrl-C / SystemExit must still propagate.
        logging.exception('解析token出错')
        return None
227 
228 
def redirect_login(token_json):
    """Post the login token to the redirect URL to finish the login.

    Also stores the CSRF token in the module-level ``headers`` under
    ``x-csrf-token``, so later requests that use ``headers`` carry it.

    :param token_json: the dict returned by ``parse_token``:
        ``{"action_url", "token", "expire", "isdialog", "autologin", "csrf"}``.
    :return: None.
    """
    postdata = {
        'token': token_json['token'],
        'expire': token_json['expire'],
        'isdialog': token_json['isdialog'],
        'autologin': token_json['autologin'],
        'logintype': 'normal',
    }
    headers['x-csrf-token'] = token_json['csrf']
    trust_response = session_post(token_json['action_url'], data=postdata, header=headers)
    # session_post returns False once its retries are exhausted; report
    # success only when the redirect request really went through.
    if trust_response:
        print("重定向成功！！")
    else:
        print('重定向出错')
251 
252 
def test():
    """Manual smoke test: fetch a profile page and print what came back.

    :return: None.
    """
    time.sleep(5)
    url = 'http://maoyan.com/profile'
    response = session_get(url, header=headers)
    # session_get returns False once its retries are exhausted.
    if not response:
        print('测试出错')
        return
    print(response.status_code)
    print(response.text)
263 
264 
def crawl_order(account_id, token, page_no=1, page_size=20):
    """Crawl the host's orders page by page and store each order.

    For every page the orders HTML page is fetched to obtain a CSRF token,
    then the order-search API is queried. Each order is stored as JSON plus
    a row linking it to ``account_id``. Paging continues while a page comes
    back full.

    Note: this sets ``x-csrf-token``, ``Accept`` and ``Content-Type`` on the
    module-level ``headers``.

    :param account_id: account identifier stored with every order.
    :param token: login token (not used by this function).
    :param page_no: first page to fetch (1-based).
    :param page_size: number of orders requested per page.
    :return: None.
    """
    orders_url = "https://www.zhenguo.com/host/orders/"
    query_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

    # Iterate instead of recursing: the recursive call used to pass the next
    # page number in the ``token`` position, so the page never advanced.
    while True:
        response = session_get(orders_url, header=headers)
        if not response:
            logging.error("crawl_order: could not load %s", orders_url)
            return
        print(response.status_code)
        html = etree.HTML(response.text)
        csrf = html.xpath('//meta[@name="csrf-token"]/@content')[0]
        headers['x-csrf-token'] = csrf
        print(csrf)

        payload = {'pageNow': page_no, 'pageSize': page_size, 'orderStatusType': 9}
        headers['Accept'] = "application/json"
        headers['Content-Type'] = "application/json"
        query_response = session_post(query_url, data=json.dumps(payload), header=headers)
        if not query_response:
            logging.error("crawl_order: order query failed for page %s", page_no)
            return
        query_list = query_response.json()['data']['list']
        print(len(query_list))

        for order_json in query_list:
            order_id = order_json['orderId']
            storage_database_json(order_id, json.dumps(order_json), 'order', 'zhenguo_order')
            storage_database_text({"id": order_id, 'account_id': account_id}, 'zhenguo_order')

        # A short (or empty) page means there is nothing further to fetch.
        if not query_list or len(query_list) < page_size:
            return
        page_no += 1
288 
289 
def house_detail(list_json):
    """Scrape a room's detail page and add the parsed fields to ``list_json``.

    :param list_json: dict describing the room; must contain ``id``. It is
        updated in place with ``room_type``, ``house_wear``, ``room_area``,
        ``unsubscribe`` and, when present on the page, ``room_count``,
        ``comment_count``, ``rep_rate``, ``rep_length``, ``less_day`` and
        ``more_day``.
    :return: the same ``list_json`` dict (unchanged when the page could not
        be fetched).
    """
    room_id = list_json["id"]
    room_url = "https://www.zhenguo.com/housing/%s" % room_id
    room_response = session_get(room_url)
    if not room_response:
        return list_json

    html = etree.HTML(room_response.text)
    # Every field lives under this container of the detail page.
    base = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'

    # The three summary spans, in page order.
    for position, field in enumerate(("room_type", "house_wear", "room_area"), start=1):
        list_json[field] = get_node_text(
            html, '%s/section[1]/div[2]/span[%d]/text()' % (base, position))

    # Statistics list: the label shown on the page -> key stored in list_json.
    stat_fields = {
        "房源": "room_count",
        "评价": "comment_count",
        "咨询回复率": "rep_rate",
        "咨询回复时长": "rep_length",
    }
    for node in html.xpath(base + '/section[2]/ul/li'):
        label = get_node_text(node, './div[1]/text()')
        value = get_node_text(node, './div[2]/text()')
        if label in stat_fields:
            list_json[stat_fields[label]] = value

    # get_node_text returns None on error, so fall back to "" before splitting.
    reserve_text = get_node_text(html, base + '/section[8]/ul[1]/li[2]/text()') or ""
    reserve = reserve_text.split("，")
    if len(reserve) > 1:
        list_json["less_day"] = reserve[0].replace("最少预订", "").replace("天", "").strip()
        list_json["more_day"] = reserve[1].replace("最多预订", "").replace("天", "").strip()

    list_json["unsubscribe"] = get_node_text(html, base + '/section[8]/ul[2]/li/text()')
    return list_json
337 
338 
def crawl_room(account_id, token):
    """Crawl the host's room list and store each room's details and comments.

    :param account_id: account identifier stored with every room.
    :param token: login token (not used by this function).
    :return: None.
    """
    comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
    room_list_url = "https://www.zhenguo.com/house/list/"
    room_response = session_get(url=room_list_url, header=headers)
    # Without the list page there is nothing to iterate over; bail out
    # instead of reaching the loop with ``html`` unbound.
    if not room_response:
        logging.error("crawl_room: could not load %s", room_list_url)
        return

    html = etree.HTML(room_response.text)
    for node in html.xpath('//div[@class="houseCard__block"]'):
        title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')  # title
        # get_node_text returns None on error, so fall back to "" first.
        price = (get_node_text(node, './div[@class="houseCard__addLine clearfix"]'
                                     '/span[1]/span[@class="houseCard__price"]/text()')
                 or "").replace("¥", "")  # price
        state = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]/'
                                    'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')  # status
        room_id = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]'
                                      '/div[1]/@data-product-id')  # room id
        print(account_id, title, price, state, room_id)
        list_json = {"account_id": account_id, "title": title,
                     "price": price, "state": state, "id": room_id, "room_id": room_id}

        house_json = house_detail(list_json)
        response = session_get(url=comment_url % room_id)
        if response:
            print(response.text)
            storage_database_json(room_id, json.dumps(response.json()), "comment", "zhenguo_room_info",
                                  l_name="youjia_tpp")
        storage_database_text(house_json, 'zhenguo_room_info')
365 
366 
def crawl_room_list(account_id, token):
    """Fetch the online room list from the mobile-app API and print it.

    For every room the product-quota endpoint is queried as well and its
    ``data`` payload is printed.

    :param account_id: account identifier (not used by this function).
    :param token: login token, sent as the ``token`` cookie.
    :return: None.
    """
    app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                  "Cookie": "token=" + token}
    list_url = "https://iphx.meituan.com/ds/product/online/list"
    list_resp = session_get(url=list_url, header=app_header)
    if not list_resp:
        return

    for room_json in list_resp.json()['data']['list']:
        room_id = room_json['productId']
        product_quota_url = "https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/" + str(room_id)
        product_quota_resp = session_get(url=product_quota_url, header=app_header)
        print(room_json)
        # session_get returns False once its retries are exhausted; skip the
        # quota of that room rather than aborting the whole listing.
        if product_quota_resp:
            print(product_quota_resp.json()['data'])
382 
383 
384 
def crawl(account_id, token):
    """Run the crawlers once the logged-in session is ready.

    Only the mobile-app room listing is crawled; the web room crawler and
    the order crawler are left disabled below.

    :param account_id: account identifier passed on to the crawler.
    :param token: login token passed on to the crawler.
    :return: None.
    """
    crawl_room_list(account_id=account_id, token=token)  # mobile-app data

    # crawl_room(account_id, token)  # room crawler
    # crawl_order(account_id, token)  # order crawler
394 
395 
def login(username, password):
    """Run the whole login flow and return the login token.

    Steps: fetch the login page, parse its hidden form fields, submit the
    credentials (with captcha), parse the token page, then post the token to
    the redirect URL.

    :param username: account name.
    :param password: account password.
    :return: the ``token`` value parsed from the login response.
    :raises RuntimeError: when a step of the flow yields no usable result.
    """
    html_pre_login = pre_login()
    param = parse_param(html_pre_login)
    print("param: ", param)
    # Fail fast with a clear message instead of a TypeError on None later on.
    if param is None:
        raise RuntimeError("login failed: could not parse the login page parameters")

    html_login = formal_login(username, password, param)
    # print(html_login)
    token_json = parse_token(html_login)
    print("token_json: ", token_json)
    if token_json is None:
        raise RuntimeError("login failed: could not parse the login token")

    redirect_login(token_json)
    return token_json['token']
406 
407 
if __name__ == '__main__':
    # Placeholder credentials — replace with a real account before running.
    username = 'username'
    password = 'username'
    # Log in first; the returned token is what crawl_room_list sends as the
    # "token" cookie.
    token = login(username, password)
    # 1 is the account_id handed through to the crawlers.
    crawl(1, token)
查看全文
相关阅读:
Pupet自动化管理环境部署记录
 Puppet常识梳理
 手动编写的几个简单的puppet管理配置
 Centos下部署DRBD+NFS+Keepalived高可用环境记录
 DRBD详细解说及配置过程记录
 kvm虚拟化管理平台WebVirtMgr部署-完整记录(2)
kvm虚拟化管理平台WebVirtMgr部署-完整记录(1)
kvm虚拟化管理平台WebVirtMgr部署-完整记录(0)
zabbix监控-基本原理介绍
 OpenStack构架知识梳理
原文地址：https://www.cnblogs.com/bianzhiwei/p/9517282.html
榛果 美团 登录 爬虫 requests session

榛果美团登录爬虫 requests session