zoukankan      html  css  js  c++  java
  • 实战 7 淘宝商品信息定向爬虫

    import requests
    import re
    def getHTMLText(url):
        """Download ``url`` and return the response body as text.

        Taobao uses an anti-crawler mechanism, so the request sends a
        browser-like user agent together with a cookie extracted from a real
        browser session to make the site treat it as a user-driven request.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded page text, or "" when the request fails (connection
            problem, timeout after 30 seconds, or an HTTP error status).
        """
        # NOTE(review): this is a captured personal session cookie; it presumably
        # expires and should be loaded from configuration instead of being
        # committed with the source -- confirm and replace before reuse.
        headers = {
            "user-agent": "Mozilla/5.0",
            "cookie": "miid=1612134452349690119; cna=THqIFiCoTDcCAasjUtS73iNL; t=bbc9a140acd8d518326e1a1d7c9d659d; cookie2=12dbf17b95b0b5e287db790e3c6202f1; v=0; _tb_token_=5183ef37b54d5; _samesite_flag_=true; sgcookie=ExY97bTo2Ovq1IjpIjgji; uc3=id2=UNDUK%2FS2voKDvw%3D%3D&lg2=URm48syIIVrSKA%3D%3D&nk2=AHY2D185rXA%3D&vt3=F8dBxGZuEXJXsog%2BdQI%3D; csg=e8b87b29; lgc=cltt%5Cu5C0F%5Cu9648; dnk=cltt%5Cu5C0F%5Cu9648; skt=bee4d9ccfaf7138b; existShop=MTU4OTg5MzczMw%3D%3D; uc4=id4=0%40UgckEyzZMpFaBzLNri18B0sDs8OZ&nk4=0%40AhhLsGvGLncPumlBqdyreeIqcw%3D%3D; tracknick=cltt%5Cu5C0F%5Cu9648; _cc_=VFC%2FuZ9ajQ%3D%3D; tfstk=cekOBvaPOeYg3-iaaxd3Gve3pwxlaUSToGaAHnubV71Vuh6c3s2jEYFTtCZVWJKd.; mt=ci=64_1; thw=cn; enc=Z2tsLVHv7rciprJdoPFfnnZyK95pCm8ewfzNojqFEtdzPKqxI0juRoRMkxETY%2BWbVCs%2BL%2Boj2XUdNPU0o9010w%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=cdba13bd71a70bac8da9f7717cc536ba_1590040171929; _m_h5_tk_enc=ad9a7ae2e64d058179abbd852424c9a7; uc1=cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&pas=0&cookie14=UoTV7NZUno0ZKw%3D%3D&cookie21=Vq8l%2BKCLjhS4UhJVbhgU&existShop=false; JSESSIONID=A7629614520E309C033FC2F553C818F1; l=eBSghlFHQZ0UoZ_9BOfZnurza77OsIRYnuPzaNbMiOCP_y1p5_wcWZASS9T9CnGVh6qBR3PBVv7HBeYBqnY4n5U62j-la1Dmn; isg=BL6-xLp5bKy6I7j39z6KK6j6D9QA_4J5JGvLwWjHJYH8C17l0I1YiZPph9fHCXqR"
            }
        try:
            r=requests.get(url,timeout=30,headers=headers)
            r.raise_for_status()
            # Decode with the encoding guessed from the body content.
            r.encoding=r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only network/HTTP failures are turned into ""; programming errors
            # and KeyboardInterrupt are no longer silently swallowed.
            return ""
    def parsePage(ilt,html):
        """Append every [price, title] pair found in ``html`` to ``ilt``.

        The page text is scanned for ``"view_price":"<digits and dots>"`` and
        ``"raw_title":"<text>"`` fields; the n-th price is paired with the n-th
        title. If the two fields occur a different number of times, the surplus
        entries are ignored.

        Args:
            ilt: List that receives one ``[price, title]`` entry per item. Both
                values are strings.
            html: Page text to scan.

        Returns:
            None normally; "" when ``html`` is not a string (kept so callers
            that checked the old failure signal keep working).
        """
        import json  # local import: only used to decode escaped titles safely

        try:
            # \d (not a literal "d") so that prices such as 12.03 or 1000 match.
            prices=re.findall(r'"view_price":"([\d.]*)"',html)
            # Non-greedy so each match stops at the closing quote of its own field.
            titles=re.findall(r'"raw_title":"(.*?)"',html)
        except TypeError:
            return ""
        for price,raw_title in zip(prices,titles):
            try:
                # Decode escape sequences without eval()-ing scraped text.
                title=json.loads('"'+raw_title+'"')
            except ValueError:
                title=raw_title
            ilt.append([price,title])
    def printGoodsList(ilt):
        """Print the collected goods to stdout as a numbered table.

        A header row is printed first, then one row per entry of ``ilt`` showing
        a running number starting at 1, the entry's first element (price) and
        its second element (title).
        """
        row_format="{:4}	{:8}	{:16}"
        header=row_format.format("序号","价格","商品名称")
        print(header)
        for index,item in enumerate(ilt,start=1):
            price,title=item[0],item[1]
            print(row_format.format(index,price,title))
    def main(goods="背包",depth=3):
        """Crawl Taobao search result pages for ``goods`` and print a price table.

        Args:
            goods: Search keyword appended to the search URL. Defaults to the
                keyword the script originally hard-coded.
            depth: Number of result pages to request.
        """
        start_url='https://s.taobao.com/search?q='+goods
        # Step added to the "s" query parameter for each successive page.
        page_step=44
        infoList=[]
        for page in range(depth):
            url=start_url+"&s="+str(page_step*page)
            try:
                html=getHTMLText(url)
                parsePage(infoList,html)
            except Exception:
                # Best effort: a failing page is skipped. Unlike a bare
                # ``except``, Ctrl-C and SystemExit still stop the crawl.
                continue
        printGoodsList(infoList)
    # Entry point: starts the crawl as soon as this file is executed (or imported).
    main()

    cookie

     爬取的结果

  • 相关阅读:
    vc++ 错误 CreateWindow
    设置某个页面的编码方式
    部署Struts2框架
    oracle之约束
    java web开发(三)、Ajax技术
    vs2008 安装出错 Microsoft Visual Studio Web 组件安装失败
    java web开发(二)、Servlet技术
    WLS Zip Distribution for Oracle WebLogic Server 12.1.1.0安装
    MyEclipse配置Weblogic服务器
    js 未结束的字符串常量错误解决方法
  • 原文地址:https://www.cnblogs.com/tingtin/p/12929914.html
Copyright © 2011-2022 走看看