zoukankan      html  css  js  c++  java
  • 抓取赶集app数据

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import json
    import time

    import requests
    
    # Single endpoint that every request in this script is POSTed to.
    url = "https://app.ganji.com/datashare/"

    # HTTP headers sent with each request. The values imitate the iOS client;
    # "interface" names the remote call to perform (here: the listing search).
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "userid": "C1ED10776D9B6108D8FEFEE4EA53058A",
        "model": "Generic/iphone",
        "customerid": "705",
        "clientagent": "iPhone 6S Plus#414*736#11.0.3",
        "versionid": "8.3.0",
        "os": "ios",
        "net": "wifi",
        "dv": "iPhone 6S Plus",
        "interface": "SearchPostsByJson3",
        "accept-language": "zh-cn",
    }
    
    def req(url, headers, data):
        """POST ``data`` to ``url`` and return the decoded JSON response.

        The request is sent with the given ``headers`` and a 5 second timeout.
        Any exception raised while sending the request or decoding the body is
        printed together with the URL and swallowed.

        Returns:
            The parsed JSON body, or ``None`` when the request or the JSON
            decoding failed.
        """
        try:
            response = requests.post(url, headers=headers, data=data, timeout=5)
            return response.json()
        except Exception as err:
            # Best-effort: report the failure and let the caller see None.
            print("requests error: ", err, "requests url: ", url)
        return None
    
    def get_ganji_list_data():
        """Request the first page of the listing search and return its JSON.

        The form body asks for page 1 with 20 posts per page, sorted by
        ``post_at`` descending, filtered by a URL-encoded title keyword.

        Returns:
            Whatever ``req`` returns: the parsed JSON response, or ``None``
            when the request failed.
        """
        # Fetch the listing data.
        payload = 't=-576747455&&showType=0&showtype=0&jsonArgs={"pageSize":20,"cityScriptIndex":2300,"majorCategoryScriptIndex":7,"queryFilters":[],"categoryId":7,"andKeywords":[{"name":"title","value":"%E5%95%86%E9%93%BA%E5%87%BA%E5%94%AE"}],"customerId":"705","sortKeywords":[{"field":"post_at","sort":"desc"}],"pageIndex":1}'
        return req(url, headers, payload)
    
    def get_article_data():
        """Fetch the listing, then fetch and collect the details of every post.

        For each post in the listing response the title and ``d_sign`` are
        printed and a detail request is sent with the post's ``puid`` placed in
        the request headers. Posts whose detail request fails, or whose response
        ``status`` is not 0, are skipped. The function pauses 2 seconds after
        every post.

        Returns:
            A list with one dict per successfully fetched post, holding the
            keys ``price``, ``price_unit``, ``title``, ``city``,
            ``description``, ``district_name``, ``street_name``, ``latlng`` and
            ``id``. The list is empty when the listing request itself failed.
        """
        ganji_data = get_ganji_list_data()
        if ganji_data is None:
            return []

        data_list = ganji_data["posts"]
        print("count: ", ganji_data["total"])

        articles = []
        for post in data_list:
            title, d_sign, puid = post["title"], post["d_sign"], post["puid"]
            print(title, d_sign)
            data_article = "d_sign={0}&cityId=176&post_type_for_maidian=5&categoryId=7&spfy=0".format(d_sign)
            # Fetch the details by puid; the puid must be sent in the headers.
            # Work on a copy so the shared module-level headers keep the
            # listing interface for any later listing request.
            detail_headers = dict(headers)
            detail_headers["interface"] = "GetPostByPuid"
            detail_headers["puid"] = puid
            content_data = req(url, detail_headers, data_article)
            # req() returns None on failure (it has already printed the error).
            if content_data is not None and content_data["status"] == 0:
                data = content_data["data"]
                articles.append({
                    "price": data["price"]["v"],
                    "price_unit": data["price"]["u"],
                    "title": data["title"],
                    "city": data["city"],
                    "description": data["description"],
                    "district_name": data["district_name"],
                    "street_name": data["street_name"],
                    "latlng": data["latlng"],
                    "id": data["id"],
                })

            # Pause between detail requests.
            time.sleep(2)

        return articles
    

    header 里的东西真多,最终测试只需要这几种,累死宝宝了。

     教程仅供技术研究学习使用,若有侵权,请联系本人删除。

  • 相关阅读:
    考研系列 HDU2241之早起看书 三分
    考研系列 HDU2242之空调教室 tarjan
    HDU5880 Family View ac自动机第二题
    HDU2222 Keywords Search ac自动机第一题
    hiho1514 偶像的条件 lower_bound
    HDU1800 hash+去前导0
    阿里云数据库自研产品亮相国际顶级会议ICDE 推动云原生数据库成为行业标准
    MaxCompute 图计算开发指南
    MaxCompute Mars开发指南
    基于MaxCompute的数仓数据质量管理
  • 原文地址:https://www.cnblogs.com/dockers/p/7811514.html
Copyright © 2011-2022 走看看