zoukankan      html  css  js  c++  java
  • 爬取加载页面数据

    '''
    @author:zl
    @contact:
    @site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
    '''
    # _*_ coding:utf-8 _*_
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    from pymongo import MongoClient
    import xlwt
    import json
    # Browser-impersonating request headers.
    # NOTE(review): this dict is never passed to requests.get() below, and its
    # 'Host' entry targets search.51job.com while the request actually goes to
    # movie.douban.com — confirm whether this is dead code.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'Connection': 'keep-alive',
        'Host': "search.51job.com",
    }
    # Fetch the raw JSON payload
    def get_content():
        """Fetch the Douban movie chart JSON and return the decoded payload.

        Returns:
            The parsed JSON body (a list of per-movie dicts) from the
            top_list endpoint.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if the server does not answer within 10s.
        """
        post_param = {'action': '', 'start': 0, 'limit': 300}
        # Fixed: the original passed verify=False, silently disabling TLS
        # certificate verification; the endpoint serves a valid certificate,
        # so verification is re-enabled. A timeout is added so the script
        # cannot hang forever, and HTTP errors are surfaced instead of being
        # parsed as JSON.
        html = requests.get(
            "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90",
            params=post_param,
            timeout=10,
        )
        html.raise_for_status()
        return html.json()
    # Extract the wanted fields
    def get(jsondata):
        """Project each movie record down to the twelve exported columns.

        Args:
            jsondata: iterable of per-movie dicts as returned by the Douban
                top_list endpoint.

        Returns:
            A new list of dicts holding only the exported fields, in the
            column order used by the Excel header row.

        Raises:
            KeyError: if a record is missing one of the expected fields.
        """
        # Fixed: the original shadowed the builtin ``list`` and repeated every
        # field name twice in a hand-written dict literal.
        fields = (
            'rank', 'cover_url', 'id', 'types', 'regions', 'title',
            'url', 'release_date', 'actor_count', 'vote_count', 'score',
            'actors',
        )
        return [{key: record[key] for key in fields} for record in jsondata]
    # Write the scraped content to Excel
    def excel_write(items):
        """Write each scraped record into the module-level worksheet ``ws``.

        Each record lands on the row given by its 'rank' field, with one
        dict value per column in the dict's key order. Progress is echoed
        to stdout for debugging.
        """
        for item in items:  # one dict per scraped record
            for j, key in enumerate(item):
                print(item[key])
                print("j:", j)
                index = item['rank']
                print("index:", index)
                ws.write(index, j, item[key])  # row, column, value
    if __name__ == '__main__':
        newTable = "test2.xls"  # output workbook file name
        wb = xlwt.Workbook(encoding='utf-8')  # workbook with explicit encoding
        ws = wb.add_sheet('sheet1', cell_overwrite_ok=True)  # target sheet
        # Column headings for the first (bold) row of the sheet.
        headData = [
            'rank', 'cover_url', 'id', 'types', 'regions', 'title', 'url',
            'release_date', 'actor_count', 'vote_count', 'score', 'actors',
        ]
        for colnum, heading in enumerate(headData):
            ws.write(0, colnum, heading, xlwt.easyxf('font: bold on'))
        # Fetch the JSON, project the fields, dump them, then save the file.
        excel_write(get(get_content()))
        wb.save(newTable)
    

     # Parse JSON

    import requests

    response = requests.get('http://httpbin.org/get')

    import json

    # The long way round: decode the body text ourselves.
    res1 = json.loads(response.text)

    # Too cumbersome — requests can decode the body for us.
    res2 = response.json()

    # Both approaches yield the same parsed object.
    print(res1 == res2) #True

    '''
    @author:zl
    @contact:
    @site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
    '''
    # _*_ coding:utf-8 _*_
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    from pymongo import MongoClient
    import xlwt
    import json
    # Browser-impersonating request headers.
    # NOTE(review): this dict is never passed to requests.get() below, and its
    # 'Host' entry targets search.51job.com while the request actually goes to
    # movie.douban.com — confirm whether this is dead code.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" ,
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'Connection': 'keep-alive',
        'Host': "search.51job.com",
    
    }
    # Fetch the raw JSON payload
    def get_content():
        """Fetch the Douban movie chart JSON and return the decoded payload.

        Returns the parsed JSON body (a list of per-movie dicts).

        NOTE(review): verify=False disables TLS certificate verification for
        this request — confirm this is intentional.
        """
        post_param = {'action':'','start': 0,'limit':300}
        html = requests.get("https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90", params=post_param, verify=False)
        #jsondata = html.content.decode(encoding='utf-8')
        jsondata=html.json()
        return jsondata
    # Extract the wanted fields
    def get(jsondata):
        """Project each movie record down to the twelve exported columns.

        Args:
            jsondata: iterable of per-movie dicts from the chart endpoint.

        Returns:
            A new list of dicts holding only the exported fields.
        """
        #jsondata=json.loads(jsondata)
        # NOTE(review): ``list`` shadows the builtin of the same name.
        list = []
        for i in jsondata:
            item ={
                'rank':i['rank'],
                'cover_url':i['cover_url'],
                'id':i['id'],
                'types':i['types'],
                'regions':i['regions'],
                'title':i['title'],
                'url':i['url'],
                'release_date':i['release_date'],
                'actor_count':i['actor_count'],
                'vote_count':i['vote_count'],
                'score':i['score'],
                'actors':i['actors'],
            }
            list.append(item)
        return list
    # Write the scraped content to Excel
    def excel_write(items):
        """Write each scraped record into the module-level worksheet ``ws``.

        The row index comes from the record's 'rank' field; columns follow
        the dict's key order. Progress is echoed to stdout.
        """
        for item in items: # one record (original comment said "job info" — apparent leftover from a 51job scraper)
            j=0  # column counter
            for i in item:
                print(item[i])
                print("j:",j)
                index=item['rank']  # row index = record rank
                print("index:",index)
                ws.write(index, j, item[i])  # row, column, data
                j += 1
    if __name__ == '__main__':
        newTable = "test2.xls"  # output workbook file name
        wb = xlwt.Workbook(encoding='utf-8')  # create the Excel workbook with explicit encoding
        ws = wb.add_sheet('sheet1',cell_overwrite_ok=True)  # create the sheet (overwrites allowed)
        headData = ['rank', 'cover_url', 'id','types','regions','title','url','release_date','actor_count','vote_count','score','actors']  # header-row labels
        # Write the bold header row, then fetch, project and dump the records.
        for colnum in range(0,12):
            ws.write(0,colnum,headData[colnum],xlwt.easyxf('font: bold on'))
        excel_write(get(get_content()))
        wb.save(newTable)
    
  • 相关阅读:
    java基础5 (一维)数组和二维数组
    Java 内部类
    Java 抽象类和接口
    Java 多态
    Java 继承
    Java 包(package)
    Java String类和StringBuffer类
    Java 数组
    Java 封装与类
    Java 概述和编程基础
  • 原文地址:https://www.cnblogs.com/zhanglin123/p/9205659.html
Copyright © 2011-2022 走看看