zoukankan      html  css  js  c++  java
  • 榛果 美团 登录 爬虫 requests session

    美团旗下所有产品的登录都采用重定向的方式来完成

    即利用 session 会话来解决登录问题

    当然也可以每次都模拟它的 cookie 来进行登录

    我用的代理是阿布云代理,你们也可以选择别的代理

    这次爬取的是美团旗下的榛果民宿

      1 import requests
      2 from urllib.parse import urlencode
      3 import json
      4 import time, datetime
      5 import logging
      6 from lxml import etree
      7 import pymysql
      8 from pymysql.err import IntegrityError
      9 
     # Proxy endpoints passed to every request made through the shared session.
     # NOTE(review): the URLs have no scheme and nothing before the "@" --
     # presumably the credentials were redacted for publication; confirm and
     # fill them in before running.
     proxies_ = {
         'http': '@http-dyn.abuyun.com:9020',
         'https': '@http-dyn.abuyun.com:9020',
     }
     # Default request headers. This dict is shared and mutated in place by
     # functions further down (e.g. redirect_login adds 'x-csrf-token').
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52'
     }
     # Single session object used by session_get / session_post for all requests.
     session = requests.Session()
     18 
     19 
     def session_get(url, header=headers, tab=12, timeout=30):
         """
         GET ``url`` through the shared session and proxy, retrying on failure.

         :param url: address to request
         :param header: request headers; defaults to the module-level ``headers``
         :param tab: maximum number of attempts
         :param timeout: seconds to wait for the server on each attempt, so a
             stalled proxy cannot block the crawler indefinitely
         :return: the first response answered with HTTP 200, or ``False`` once
             every attempt has failed
         """
         for remaining in range(tab, 0, -1):
             try:
                 response = session.get(url, headers=header, proxies=proxies_, timeout=timeout)
             except Exception as e:
                 # Only the final failure is logged, to keep retry noise down.
                 if remaining == 1:
                     logging.exception(e)
                 continue
             # Throttle after every completed request, successful or not.
             time.sleep(2)
             if response.status_code == 200:
                 return response
         return False
     31 
     32 
     def session_post(url, header=headers, data=None, tab=12, timeout=30):
         """
         POST ``data`` to ``url`` through the shared session and proxy, retrying on failure.

         :param url: address to request
         :param header: request headers; defaults to the module-level ``headers``
         :param data: request body handed to ``session.post``
         :param tab: maximum number of attempts
         :param timeout: seconds to wait for the server on each attempt, so a
             stalled proxy cannot block the crawler indefinitely
         :return: the first response answered with HTTP 200, or ``False`` once
             every attempt has failed
         """
         for remaining in range(tab, 0, -1):
             try:
                 response = session.post(url, headers=header, data=data, proxies=proxies_,
                                         timeout=timeout)
             except Exception as e:
                 # Only the final failure is logged, to keep retry noise down.
                 if remaining == 1:
                     logging.exception(e)
                 continue
             # Throttle after every completed request, successful or not.
             time.sleep(2)
             if response.status_code == 200:
                 return response
         return False
     44 
     45 
     def get_node_text(node, xpath):
         """
         Evaluate ``xpath`` on ``node`` and return the first result.

         :param node: object exposing an ``xpath(expression)`` method
         :param xpath: XPath expression to evaluate
         :return: the stripped string when the expression yields a string or its
             first match is a string; the first match unchanged when it is not a
             string; ``""`` when nothing matches; ``None`` when evaluation fails
         """
         try:
             # Evaluate once and reuse the result instead of re-running the query.
             result = node.xpath(xpath)
             # String-valued expressions (e.g. "string(.)") return a plain string;
             # indexing it would yield only its first character.
             if isinstance(result, str):
                 return result.strip()
             if len(result) > 0:
                 first = result[0]
                 return first.strip() if isinstance(first, str) else first
             return ""
         except Exception:
             logging.exception('获取xpath %s 出错' % (xpath))
             return None
     61 
     62 
     def get_youjia_tpp_conn():
         """
         Open a new database connection with the crawler's fixed settings.

         :return: the connection object returned by ``pymysql.connect``
         """
         connect_kwargs = {
             'host': 'host',
             'user': 'user',
             'passwd': 'passwd',
             'db': 'db',
             'port': 3306,
             'charset': 'utf8',
         }
         return pymysql.connect(**connect_kwargs)
     70 
     71 
     def _build_text_statements(data_json, t_name, l_name, now_time):
         """Build the INSERT and UPDATE statements plus their argument tuples for one flat record."""
         if "id" not in data_json:
             # The UPDATE fallback is keyed on id, so a record without one cannot be stored.
             raise ValueError("data_json must contain an 'id' key")
         table = l_name + "." + t_name
         # Back-quote column names so reserved words are accepted as identifiers.
         columns = ["`%s`" % key for key in data_json]
         values = [str(data_json[key]) for key in data_json]
         insert_sql = "INSERT INTO " + table + " (" + ",".join(columns) + ")VALUES(" \
                      + ",".join(["%s"] * len(columns)) + ");"
         assignments = " , ".join(column + "=%s" for column in columns)
         # Timestamp and id are bound as parameters rather than spliced into the SQL text.
         update_sql = "UPDATE " + table + " SET " + assignments + " , modify_time = %s WHERE id = %s"
         update_args = tuple(values + [now_time, str(data_json["id"])])
         return insert_sql, tuple(values), update_sql, update_args


     def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
         """
         Store a flat (non-JSON) record: try an INSERT, fall back to an UPDATE on IntegrityError.

         Every value is converted with ``str`` before it is sent to the database.

         :param data_json: mapping of column name to value; must contain ``"id"``
         :param t_name: table name
         :param l_name: database (schema) name
         :return: None
         :raises ValueError: when ``data_json`` has no ``"id"`` key
         """
         now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         insert_sql, insert_args, update_sql, update_args = _build_text_statements(
             data_json, t_name, l_name, now_time)
         conn = get_youjia_tpp_conn()
         try:
             # An explicit cursor and commit do not depend on what ``with connection``
             # yields, which differs between PyMySQL releases.
             with conn.cursor() as cursor:
                 try:
                     print("storage_database_text  insert_sql : ", t_name)
                     cursor.execute(insert_sql, insert_args)
                 except IntegrityError:
                     print("storage_database_text  update_sql : ", t_name)
                     cursor.execute(update_sql, update_args)
             conn.commit()
         except Exception as msg:
             logging.exception(msg)
         finally:
             conn.close()
    108 
    109 
    def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
        """
        Store JSON text in one column: try an INSERT, fall back to an UPDATE on IntegrityError.

        :param id_: value of the ``id`` column
        :param data_json: JSON text to store
        :param j_name: name of the column that receives the JSON
        :param t_name: table name
        :param l_name: database (schema) name
        :return: None
        """
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        table = l_name + "." + t_name
        insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
        update_sql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"
        conn = get_youjia_tpp_conn()
        try:
            # An explicit cursor and commit do not depend on what ``with connection``
            # yields, which differs between PyMySQL releases.
            with conn.cursor() as cursor:
                try:
                    print("storage_database_json  insert_sql : ", t_name)
                    cursor.execute(insert_sql, (id_, data_json))
                except IntegrityError:
                    print("storage_database_json  update_sql : ", t_name)
                    cursor.execute(update_sql, (data_json, now_time, id_))
            conn.commit()
        except Exception as msg:
            logging.exception(msg)
        finally:
            conn.close()
    133 
    134 
    def pre_login():
        """
        Fetch the Meituan passport login page through the shared session.

        :return: HTML text of the login page, or ``None`` when it could not be fetched
        """
        param = {
            'service': 'phoenix',
            'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
        }
        url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(param)
        response = session_get(url=url, header=headers, tab=5)
        # session_get returns False once its retries are exhausted, so test the
        # result before touching any response attribute.
        if not response:
            print('预登陆出错')
            return None
        print("pre_login 成功")
        return response.text
    152 
    153 
    def parse_param(html):
        """
        Extract the login-form fields from the pre-login page.

        :param html: HTML text of the login page
        :return: tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``, or
            ``None`` when the page cannot be parsed or a field is missing
        """
        try:
            tree = etree.HTML(html)
            csrf = tree.xpath('//input[@name="csrf"]/@value')[0]
            origin = tree.xpath('//input[@name="origin"]/@value')[0]
            fingerprint = tree.xpath('//input[@name="fingerprint"]/@value')[0]
            uuid = tree.xpath('//i[@class="form-uuid"]/text()')[0]
            captcha_style = tree.xpath(
                '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style')[0]
            # What remains of the style attribute after dropping the "display:" prefix.
            need_captcha = captcha_style.replace("display:", "")
            return (csrf, uuid, need_captcha, origin, fingerprint)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit still propagate.
            print('解析csrf,uuid,need_captcha出错')
            return None
    166 
    167 
    def formal_login(username, password, param, captcha_path='C:/Users/admin/Desktop/image/zg.jpg'):
        """
        Download the captcha, ask the operator to type it in, and submit the login form.

        :param username: account name, sent as the ``email`` form field
        :param password: account password
        :param param: tuple produced by ``parse_param``:
            ``(csrf, uuid, need_captcha, origin, fingerprint)``
        :param captcha_path: file the captcha image is written to so it can be viewed
        :return: HTML text of the login response, or ``None`` when a request fails
            or ``param`` is missing
        """
        if not param:
            print('登录出错')
            return None
        csrf = param[0]
        uuid = param[1]
        origin, fingerprint = param[3], param[4]

        # The captcha is always requested; param[2] (need_captcha) is not consulted.
        captcha_url = 'https://passport.meituan.com/account/captcha?' + urlencode({'uuid': uuid})
        print(captcha_url)
        image_resp = session_get(captcha_url)
        # session_get returns False once its retries are exhausted.
        if not image_resp:
            print('登录出错')
            return None
        with open(captcha_path, 'wb') as file:
            file.write(image_resp.content)
        captcha = input('需要验证码:')

        url_param = {
            'uuid': uuid,
            'service': 'phoenix',
            'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
        }
        postdata = {
            'email': username,
            'password': password,
            'captcha': captcha,
            'origin': origin,
            'fingerprint': fingerprint,
            'csrf': csrf
        }
        url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(url_param)
        response = session_post(url, data=postdata, header=headers)
        if not response:
            print('登录出错')
            return None
        print("登陆成功!")
        return response.text
    208 
    209 
    def parse_token(html):
        """
        Extract the redirect-form fields from the login response page.

        :param html: HTML text returned by the login POST
        :return: dict with keys ``action_url``, ``token``, ``expire``, ``isdialog``,
            ``autologin`` and ``csrf``, or ``None`` when the page cannot be parsed
            or a field is missing
        """
        try:
            tree = etree.HTML(html)
            token_json = {
                "action_url": tree.xpath('//form[@class="J-form mainbox__content"]/@action')[0],
            }
            # The four hidden inputs share one lookup pattern.
            for field in ("token", "expire", "isdialog", "autologin"):
                token_json[field] = tree.xpath('//input[@name="%s"]/@value' % field)[0]
            token_json["csrf"] = tree.xpath('//*[@id="csrf"]/text()')[0]
            return token_json
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit still propagate.
            logging.exception('解析token出错')
            return None
    227 
    228 
    def redirect_login(token_json):
        """
        Submit the token form from the login response to finish the redirect step.

        Also stores the csrf value in the shared ``headers`` dict under
        ``'x-csrf-token'``.

        :param token_json: dict produced by ``parse_token`` with keys ``action_url``,
            ``token``, ``expire``, ``isdialog``, ``autologin`` and ``csrf``
        :return: ``True`` when the POST was answered successfully, ``False`` otherwise
        """
        postdata = {
            'token': token_json['token'],
            'expire': token_json['expire'],
            'isdialog': token_json['isdialog'],
            'autologin': token_json['autologin'],
            'logintype': 'normal'
        }
        headers['x-csrf-token'] = token_json['csrf']
        trust_response = session_post(token_json['action_url'], data=postdata, header=headers)
        # session_post returns False once its retries are exhausted, so only
        # report success for a real response.
        if not trust_response:
            print('重定向出错')
            return False
        print("重定向成功!!")
        return True
    251 
    252 
    def test():
        """
        Smoke check: fetch a profile page with the current session and print the result.

        :return: None
        """
        time.sleep(5)
        url = 'http://maoyan.com/profile'
        response = session_get(url, header=headers)
        # session_get returns False once its retries are exhausted.
        if not response:
            print('测试出错')
            return
        print(response.status_code)
        print(response.text)
    263 
    264 
    def crawl_order(account_id, token, page_no=1, page_size=20):
        """
        Fetch the host's orders page by page and store every order.

        Each order is stored twice in ``zhenguo_order``: its JSON in the ``order``
        column, then the owning ``account_id``.

        :param account_id: account identifier stored with every order
        :param token: login token; not used by this function
        :param page_no: first page to fetch
        :param page_size: number of orders requested per page
        :return: None
        """
        orders_url = "https://www.zhenguo.com/host/orders/"
        response = session_get(orders_url, header=headers)
        # session_get returns False once its retries are exhausted.
        if not response:
            logging.error("crawl_order: could not fetch %s", orders_url)
            return
        print(response.status_code)
        html = etree.HTML(response.text)
        csrf_values = html.xpath('//meta[@name="csrf-token"]/@content')
        if not csrf_values:
            logging.error("crawl_order: csrf-token meta tag not found")
            return
        csrf = csrf_values[0]
        headers['x-csrf-token'] = csrf
        print(csrf)

        # The JSON content negotiation applies to the API call only, so it is set
        # on a copy instead of the shared headers dict.
        json_headers = dict(headers)
        json_headers['Accept'] = "application/json"
        json_headers['Content-Type'] = "application/json"
        query_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

        # Loop over pages. The previous recursive call passed the next page number
        # into the ``token`` slot, so it kept requesting the same page.
        while True:
            order_by_type = {'pageNow': page_no, 'pageSize': page_size, 'orderStatusType': 9}
            query_response = session_post(query_url, data=json.dumps(order_by_type), header=json_headers)
            if not query_response:
                logging.error("crawl_order: could not fetch page %s", page_no)
                return
            query_list = query_response.json()['data']['list']
            print(len(query_list))
            for order_json in query_list:
                order_id = order_json['orderId']
                storage_database_json(order_id, json.dumps(order_json), 'order', 'zhenguo_order')
                storage_database_text({"id": order_id, 'account_id': account_id}, 'zhenguo_order')
            # A short page means there is nothing further to fetch.
            if len(query_list) != page_size:
                return
            page_no += 1
    288 
    289 
    def house_detail(list_json):
        """
        Fetch a room's detail page and add the parsed fields to ``list_json``.

        :param list_json: dict describing the room; must contain ``"id"``.
            It is updated in place.
        :return: the same ``list_json`` dict, unchanged when the page cannot be fetched
        """
        room_id = list_json["id"]
        room_url = "https://www.zhenguo.com/housing/%s" % room_id
        room_response = session_get(room_url)
        if not room_response:
            return list_json

        html = etree.HTML(room_response.text)
        # Every field lives under the same container; keep the long prefix in one place.
        base = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'

        for key, position in (("room_type", 1), ("house_wear", 2), ("room_area", 3)):
            list_json[key] = get_node_text(html, '%s/section[1]/div[2]/span[%d]/text()' % (base, position))

        # Label shown on the page -> key stored in list_json.
        label_to_key = {
            "房源": "room_count",
            "评价": "comment_count",
            "咨询回复率": "rep_rate",
            "咨询回复时长": "rep_length",
        }
        for node in html.xpath(base + '/section[2]/ul/li'):
            label = get_node_text(node, './div[1]/text()')
            if label in label_to_key:
                list_json[label_to_key[label]] = get_node_text(node, './div[2]/text()')

        reserve = get_node_text(html, base + '/section[8]/ul[1]/li[2]/text()')
        # NOTE(review): the original split on an empty-string separator, which always
        # raises ValueError; the intended separator literal appears to have been lost.
        # Splitting on the "最多预订" label instead -- confirm against a live page.
        if isinstance(reserve, str) and "最多预订" in reserve:
            least, _, most = reserve.partition("最多预订")
            trim = " \t\r\n\u3000,,;;、"
            list_json["less_day"] = least.replace("最少预订", "").strip(trim)
            list_json["more_day"] = most.strip(trim)

        list_json["unsubscribe"] = get_node_text(html, base + '/section[8]/ul[2]/li/text()')
        return list_json
    337 
    338 
    def crawl_room(account_id, token):
        """
        Crawl the host's room list page and store each room's details and comments.

        :param account_id: account identifier stored with every room
        :param token: login token; not used by this function
        :return: None
        """
        comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
        room_list_url = "https://www.zhenguo.com/house/list/"
        room_response = session_get(url=room_list_url, header=headers)
        # Without the list page there is nothing to iterate; previously ``html``
        # was left undefined here and the loop raised NameError.
        if not room_response:
            logging.error("crawl_room: could not fetch %s", room_list_url)
            return
        html = etree.HTML(room_response.text)
        for node in html.xpath('//div[@class="houseCard__block"]'):
            title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')  # title
            # get_node_text returns None on an XPath error; fall back to "" before replace().
            price = (get_node_text(node, './div[@class="houseCard__addLine clearfix"]'
                                         '/span[1]/span[@class="houseCard__price"]/text()') or "").replace("¥", "")  # price
            state = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]/'
                                        'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')  # status
            room_id = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]'
                                          '/div[1]/@data-product-id')  # room id
            print(account_id, title, price, state, room_id)
            list_json = {"account_id": account_id, "title": title,
                         "price": price, "state": state, "id": room_id, "room_id": room_id}

            house_json = house_detail(list_json)
            response = session_get(url=comment_url % room_id)
            if response:
                print(response.text)
                storage_database_json(room_id, json.dumps(response.json()), "comment", "zhenguo_room_info",
                                      l_name="youjia_tpp")
            storage_database_text(house_json, 'zhenguo_room_info')
    365 
    366 
    def crawl_room_list(account_id, token):
        """
        List the online rooms through the mobile-app endpoint and print each room with its quota data.

        :param account_id: account identifier; not used by this function
        :param token: login token, sent as the ``token`` cookie
        :return: None
        """
        app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                    "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                    "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                      "Cookie": "token=" + token}
        list_url = "https://iphx.meituan.com/ds/product/online/list"
        list_resp = session_get(url=list_url, header=app_header)
        if not list_resp:
            return
        for room_json in list_resp.json()['data']['list']:
            room_id = room_json['productId']
            product_quota_url = "https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/" + str(room_id)
            product_quota_resp = session_get(url=product_quota_url, header=app_header)
            print(room_json)
            # session_get returns False once its retries are exhausted; skip the
            # quota output for this room instead of crashing on ``.json()``.
            if not product_quota_resp:
                continue
            print(product_quota_resp.json()['data'])
    382 
    383 
    384 
    def crawl(account_id, token):
        """
        Run the crawl step that is currently enabled; call this after login has succeeded.

        :param account_id: account identifier passed on to the crawl step
        :param token: login token passed on to the crawl step
        :return: None
        """
        # Mobile-app listing is the only active step.
        crawl_room_list(account_id=account_id, token=token)

        # crawl_room(account_id, token)  # room pages
        # crawl_order(account_id, token)  # orders
    394 
    395 
    def login(username, password):
        """
        Run the whole login flow: pre-login page, form submission, token redirect.

        :param username: account name
        :param password: account password
        :return: the ``token`` value parsed from the login response
        :raises RuntimeError: when any stage returns no result, naming that stage
        """
        html_pre_login = pre_login()
        if html_pre_login is None:
            raise RuntimeError("login failed: pre-login page could not be fetched")
        param = parse_param(html_pre_login)
        print("param: ", param)
        if param is None:
            raise RuntimeError("login failed: login form fields could not be parsed")
        html_login = formal_login(username, password, param)
        if html_login is None:
            raise RuntimeError("login failed: login request was not answered")
        token_json = parse_token(html_login)
        print("token_json: ", token_json)
        if token_json is None:
            raise RuntimeError("login failed: token form could not be parsed")
        redirect_login(token_json)
        return token_json['token']
    406 
    407 
    if __name__ == '__main__':
        # Placeholder credentials.
        # NOTE(review): the password literal repeats the user name -- presumably
        # scrubbed for publication; supply real values before running.
        username = 'username'
        password = 'username'
        token = login(username, password)
        # The first argument is the account_id passed on to the crawl functions.
        crawl(1, token)
  • 相关阅读:
    Puppet自动化管理环境部署记录
    Puppet常识梳理
    手动编写的几个简单的puppet管理配置
    Centos下部署DRBD+NFS+Keepalived高可用环境记录
    DRBD详细解说及配置过程记录
    kvm虚拟化管理平台WebVirtMgr部署-完整记录(2)
    kvm虚拟化管理平台WebVirtMgr部署-完整记录(1)
    kvm虚拟化管理平台WebVirtMgr部署-完整记录(0)
    zabbix监控-基本原理介绍
    OpenStack构架知识梳理
  • 原文地址:https://www.cnblogs.com/bianzhiwei/p/9517282.html
Copyright © 2011-2022 走看看