zoukankan      html  css  js  c++  java
  • 【Python requests多页面爬取案例】

    "```python
    import requests
    from fake_useragent import UserAgent # 随机ua库

    class Boring:
        """Crawl company IDs and company details from the ``portalAction.do`` endpoints.

        Instantiating the class runs the whole crawl: it collects the company IDs
        of every page in ``page_scope``, fetches the detail record of each ID and
        prints the collected ``businessPerson`` / ``epsName`` pairs.
        """

        # Seconds to wait for the server before a request is aborted; without a
        # timeout ``requests`` can block indefinitely on an unresponsive host.
        REQUEST_TIMEOUT = 10

        # Lazily created ``UserAgent`` generator, reused by every header lookup.
        _ua = None

        def __init__(self, page_scope=(4, 7)):
            """
            :param page_scope: inclusive ``(first_page, last_page)`` range to crawl
            """
            self.page_scope = page_scope
            self.all_id = self.get_all_company_id()
            self.enterprise_info = self.get_all_company_info()
            self.show_enterprise_info()

        @property
        def firefox_ua(self):
            """Return request headers carrying a random Firefox User-Agent."""
            if self._ua is None:
                # Build the generator once per instance instead of once per request.
                self._ua = UserAgent(use_cache_server=False)
            # ``.Firefox`` yields a randomly chosen Firefox User-Agent string.
            return {'User-Agent': self._ua.Firefox}

        def get_all_company_id(self):
            """
            Collect the company IDs listed on every page in ``self.page_scope``.

            :return: dict mapping each page number to the list of ``ID`` values
                found in that page's ``list`` entry
            """
            url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'  # see figure 1 of the article
            all_id = {}
            for page in range(self.page_scope[0], self.page_scope[1] + 1):
                response = requests.post(
                    url,
                    data=self.post_data(page),
                    headers=self.firefox_ua,
                    timeout=self.REQUEST_TIMEOUT,
                )
                all_id[page] = [company['ID'] for company in response.json()['list']]
            return all_id

        def get_all_company_info(self):
            """
            Fetch the detail record of every company ID in ``self.all_id``.

            Only the person in charge (``businessPerson``) and the company name
            (``epsName``) are kept.

            :return: dict mapping ``businessPerson`` to ``epsName``
            """
            url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'  # see figure 3 of the article
            enterprise_info = {}
            for company_ids in self.all_id.values():
                for company_id in company_ids:
                    response = requests.post(
                        url,
                        data={'id': company_id},  # form payload, see figure 4 of the article
                        headers=self.firefox_ua,
                        timeout=self.REQUEST_TIMEOUT,
                    )
                    # Skip anything that is not the expected JSON answer; ``.get``
                    # avoids a KeyError when the header is missing altogether.
                    if response.headers.get('Content-Type') != 'application/json;charset=UTF-8':
                        continue
                    json_text = response.json()
                    # setdefault keeps the first company seen for a given person.
                    enterprise_info.setdefault(json_text.get('businessPerson'), json_text.get('epsName'))
            return enterprise_info

        def show_enterprise_info(self):
            """Print one ``businessPerson epsName`` line per collected company."""
            for person, company_name in self.enterprise_info.items():
                print(person, company_name)

        def post_data(self, page):
            """
            Build the form submitted when requesting one page of the company list.

            :param page: page number to request
            :return: dict of form fields
            """
            return {
                'on': 'true',
                'page': page,
                'pageSize': '15',
                'productName': '',
                'conditionType': '1',
                'applyname': '',
                'applysn': '',
            }  # see figure 2 of the article
    

    # Go: instantiating Boring runs the crawl, since __init__ collects the IDs,
    # fetches the company details and prints them.

    Boring()

    
    
    "
  • 相关阅读:
    Class.forName和ClassLoader.loadClass的区别
    数据库连接池优化配置(druid,dbcp,c3p0)
    MySQL在默认事务下各SQL语句使用的锁分析
    ArrayList vs LinkedList 空间占用
    MySQL锁详解
    利用ConcurrentHashMap来实现一个ConcurrentHashSet
    list与Set、Map区别及适用场景
    实现一个原子的正整数类:AtomicPositiveInteger
    mysql如何处理亿级数据,第一个阶段——优化SQL语句
    java性能优化
  • 原文地址:https://www.cnblogs.com/zyk01/p/11376466.html
Copyright © 2011-2022 走看看