zoukankan      html  css  js  c++  java
  • Python大数据:外部数据获取(网页抓取)

    import urllib2 as url
    import cookielib,StringIO,gzip,json
    import pandas as pd
    import numpy as np
    
    #定义一个通用函数,用于抓取指定商品的指定页评论
    def GetPage(link, page):
        """Download ``link`` and return the listing fragment of its HTML.

        The request is sent with browser-like headers.  The response body is
        gunzipped when the server reports gzip encoding, then cut down to the
        text between the ``<ul class="Sec_lul01">`` marker (inclusive) and the
        ``<div class="Sec_lright01">`` marker (exclusive).  Newlines are
        removed and each pair of spaces is replaced by a single space.

        Args:
            link: URL of the page to download.
            page: Page number requested by the caller.  NOTE(review): this
                value is accepted but never used to build the request, so
                every call fetches ``link`` as given -- confirm the site's
                pagination scheme before wiring it in.

        Returns:
            The trimmed HTML fragment as a byte string.

        Raises:
            ValueError: If either marker is missing from the downloaded page.
        """
        # Browser-like request headers.  Only gzip is advertised because it
        # is the only content encoding this function can decode.
        headers = [
            ("Cookie", "ykjjdc=jjcc=e94cc85e72c94e55a098c78e19d979e4&jjcs=1&jjst=0; UM_distinctid=1609c238cf0111-0e3a4ab84d1fdf-6b1b1279-13c680-1609c238cf164f; CNZZDATA4396285=cnzz_eid%3D1644510205-1514443813-%26ntime%3D1514443813; Hm_lvt_f38eafa6ecbff460f93b98423ef80584=1514448064; Hm_lpvt_f38eafa6ecbff460f93b98423ef80584=1514448087; Hm_lvt_06b2a1ee40cb8f7fbd2546dfc4bfaa8c=1514448064; Hm_lpvt_06b2a1ee40cb8f7fbd2546dfc4bfaa8c=1514448087"),
            ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36"),
            ("Upgrade-Insecure-Requests", "1"),
            ("Accept", "*/*"),
            ("Accept-Encoding", "gzip"),
            ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"),
            ("Cache-Control", "no-cache"),
            ("Connection", "keep-alive"),
            ("Pragma", "no-cache"),
        ]
        req = url.Request(link)
        for name, value in headers:
            req.add_header(name, value)

        # Send the request and read the raw body; always release the
        # connection, even if reading fails.
        f = url.urlopen(req)
        try:
            s = f.read()
            encoding = f.info().get("Content-Encoding", "")
        finally:
            f.close()

        # Decompress only when the server actually compressed the body;
        # gunzipping a plain response would raise an error.
        if "gzip" in (encoding or "").lower():
            content = gzip.GzipFile(fileobj=StringIO.StringIO(s)).read()
        else:
            content = s

        # Keep only the listing part of the page.
        startPos = content.index('<ul class="Sec_lul01">')
        endPos = content.index('<div class="Sec_lright01">')
        content = content[startPos:endPos]

        # Drop line breaks and squeeze double spaces.
        # NOTE(review): the newline literal was reconstructed from a garbled
        # paste -- confirm whether "\r" should be stripped as well.
        content = content.replace("\n", "").replace("  ", " ")

        return content
    
    # Fetch page 1 of the second-hand housing listing and print the fragment.
    listing_html = GetPage("http://www.jjw.com/ershoufang", 1)
    print(listing_html)
  • 相关阅读:
    多线程
    集合与文件操作
    Net基础复习
    form表单
    html的常用标签和属性
    C#泛型与linq
    2020 年度总结 & OI 生涯感想——当年酒狂自负
    TODO-List
    Attention Points
    THUWC2020 游记
  • 原文地址:https://www.cnblogs.com/blackice/p/8612933.html
Copyright © 2011-2022 走看看