zoukankan      html  css  js  c++  java
  • 爬取百度信用(转)

    转自:https://blog.csdn.net/weixin_42812527/article/details/86682167

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #author tom
    
    import re
    import time
    import execjs
    import requests
    from chardet import *
    from scrapy import Selector
    
    
    class BaiDuQiYeXinYong:
        """Fetch a company's basic-information payload from xin.baidu.com.

        The flow is: search for the company name, follow the first result to
        its detail page, compute the ``tot`` request token by running the
        page's own ``mix`` JavaScript function, then call the ``basicAjax``
        endpoint with that token.
        """

        # Seconds to wait on each HTTP request before giving up, so a stalled
        # connection cannot hang the script forever.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            # Interactive prompt (shown in Chinese) for the company name.
            self.key_world = input('请输入想要查询的公司名:')

        def index(self):
            """Search for the company and locate its detail page.

            :return: tuple ``(pid, details_href)`` where ``pid`` is the value
                of the ``pid`` query parameter in the first result link and
                ``details_href`` is the absolute URL of the detail page.
            :raises ValueError: if the result page contains no result link or
                the link carries no ``pid`` parameter.
            """
            index_url = 'https://xin.baidu.com/s'
            index_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Referer': 'https://xin.baidu.com/',
            }
            params = {
                'q': self.key_world,
                't': '0'
            }
            raw_page = requests.get(
                url=index_url,
                headers=index_headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            ).content
            # chardet reports ``None`` when it cannot guess; fall back to
            # UTF-8 instead of crashing in ``bytes.decode(None)``.
            encoding = detect(raw_page).get('encoding') or 'utf-8'
            page_text = raw_page.decode(encoding)
            relative_href = Selector(text=page_text).xpath(
                '//a[@class="zx-list-item-url"]/@href'
            ).extract_first()
            if relative_href is None:
                raise ValueError('no search result link found on the result page')
            pid_match = re.search(r'pid=(.*)', relative_href)
            if pid_match is None:
                raise ValueError('result link has no pid parameter: %r' % relative_href)
            return pid_match.group(1), 'https://xin.baidu.com' + relative_href

        @staticmethod
        def _extract_token_parts(page_text):
            """Extract the ``tk`` value and the ``mix`` function source.

            The detail page reads ``tk`` via
            ``document.getElementById('<id>').getAttribute('<attr>')``; the
            attribute name found there is then used to locate the actual
            value in the markup.

            :param page_text: HTML text of the detail page.
            :return: tuple ``(tk, mix_source)``.
            :raises ValueError: if any of the expected fragments is missing.
            """
            # Parentheses and dots are escaped so they match the literal
            # JavaScript text instead of acting as regex metacharacters.
            getter = re.search(
                r"tk = document\.getElementById\('(.*?)'\)\.getAttribute\('(.*?)'\)",
                page_text,
            )
            if getter is None:
                raise ValueError('tk lookup expression not found in detail page')
            attribute = getter.group(2)
            # Escape the attribute name: it is page-supplied text, not a pattern.
            tk_match = re.search(re.escape(attribute) + r'="(.*?)">', page_text)
            if tk_match is None:
                raise ValueError('attribute %r holding tk not found' % attribute)
            # Capture the mix function up to the "(function" that follows it
            # on the same line ("." does not cross newlines here).
            mix_match = re.search(r'(function mix.*)\(function', page_text)
            if mix_match is None:
                raise ValueError('mix function not found in detail page')
            return tk_match.group(1), mix_match.group(1)

        def details(self, pid, details_href):
            """Fetch the basic-information response for a company.

            :param pid: value of the ``pid`` parameter returned by ``index``.
            :param details_href: absolute URL of the company detail page.
            :return: raw response body (bytes) of the ``basicAjax`` request.
            :raises ValueError: if the detail page lacks the ``baiducode``
                element or the token fragments.
            """
            details_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            }
            details_response = requests.get(
                url=details_href,
                headers=details_headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            page_text = details_response.text
            bid = Selector(text=page_text).xpath(
                '//span[@id="baiducode"]//text()'
            ).extract_first()
            if bid is None:
                raise ValueError('baiducode element not found in detail page')
            tk, mix_source = self._extract_token_parts(page_text)
            # Run the page's own mix(tk, bid) to derive the "tot" parameter.
            tot = execjs.compile(mix_source).call('mix', tk, bid)
            search_time = int(time.time() * 1000)  # millisecond timestamp
            info_url = "https://xin.baidu.com/detail/basicAjax?"
            params = {
                'pid': pid,
                'tot': tot,
                '_': search_time,
            }
            response = requests.get(
                info_url,
                headers=details_headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            return response.content

        def run(self):
            """Run the whole lookup and print the decoded response."""
            pid, details_href = self.index()
            content_info = self.details(pid, details_href)
            # The body carries \uXXXX escapes; decode them for readable output.
            print(content_info.decode('unicode_escape'))
    
    
    if __name__ == '__main__':
        # Script entry point: prompt for a company name, then run the lookup.
        BaiDuQiYeXinYong().run()
  • 相关阅读:
    CruiseControl.NET与TFS结合的配置文件
    环信Restfull API dotnetSDK
    NAnt0.92版本首次在windows 8.1的机子上运行报错的问题解决
    asp.net接收ajax请求参数时为空的现象
    对接微信红包时:CA证书出错,请登录微信支付商户平台下载证书
    在打开vs解决方案时,怎样让所有打开的项目自动折叠
    使用Chrome或Fiddler抓取WebSocket包
    SVN使用教程
    禁用Resharper长代码自动换行的解决办法
    SQLServer日期格式化
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10826718.html
Copyright © 2011-2022 走看看