zoukankan      html  css  js  c++  java
  • python 爬虫实例

    In [5]:
    import urllib.request
    In [6]:
    # Build a Request object for the target thread page.
    url = "https://tieba.baidu.com/p/6310762577"
    request = urllib.request.Request(url)
    # Open the URL; the context manager closes the connection once the body
    # has been read, even if read() raises.
    with urllib.request.urlopen(request) as response:
        # read() returns the raw response body as bytes.
        html = response.read()
    # Decode the bytes as UTF-8 text.
    html = html.decode("utf-8")
    html

    Out[6]:

    '
    <!DOCTYPE html><!--STATUS OK--><html><head><meta name="keywords" content="百度贴吧,美图骑单,
    车的"/><meta name="description" content="骑单车的人..骑单车的人" /><meta charset="UTF-8"><meta
    furl="tieba.baidu.com/f?kw=%E7%BE%8E%E5%9B%BE&ie=utf-8" fname="美图"><meta http-equiv="X-UA-
    Compatible" content="IE=edge,chrome=1"><meta name="baidu-site-verification" content="jpBCrwX689"
    /><link rel="search" type="application/opensearchdescription+xml" href="/tb/cms/content-search.xml"
    title="百度贴吧" /><title>骑单车的人_美图吧_百度贴吧</title><script type="text/javascript">void function(t,e,n,a,o,i,r)
    {t.alogObjectName=o,t[o]=t[o]||function(){(t[o].q=t[o].q||[]).push(arguments)},t[o].l=t[o].l||+new Date,a="https:
    "===t.location.protocol?"https://fex.bdstatic.com"+a:"http://fex.bdstatic.com"+a;var c=!0;if(t.alogObjectConfig&&t.
    alogObjectConfig.sample){var s=Math.random();t.alogObjectConfig.rand=s,s>t.alogObjectConfig.sample&&(c=!1)}c&&(i=e.
    createElement(n),i.async=!0,i.src=a+"?v="+~(new Date/864e5)+~(new Date/864e5),r=e.getElementsByTagName(n)[0],r.parentNode.
    insertBefore(i,r))}(window,document,"script","/hunter/alog/alog.min.js","alog"),void function(){function t(){}window.PDC=
    {mark:function(t,e){alog("speed.set",t,e||+new Date),alog.fire&&alog.fire("mark")},init:function(t){alog("speed.set","options",t)}

    用正则表达式解析内容

    In [7]:
    #导入正则表达式包
    import re
    In [8]:
    # Compile a pattern that matches src="<anything>.jpg" followed by " size"
    # and captures the URL inside the quotes.  It is a raw string with the dot
    # before "jpg" escaped so only a literal "." matches, and it is not stored
    # in a variable named `str`, which would shadow the builtin type.
    imger = re.compile(r'src="(.+?\.jpg)" size')
    # Collect every captured image URL from the page text.
    imglist = imger.findall(html)
    imglist
    Out[8]:
    ['https://imgsa.baidu.com/forum/w%3D580/sign=4b966626e4c4b7453494b71efffd1e78/aeea0b55b319ebc4f68696968d26cffc1f17161a.jpg',
     'https://imgsa.baidu.com/forum/w%3D580/sign=4172a3f9a8efce1bea2bc8c29f50f3e8/30ca91ef76c6a7ef1f864829f2faaf51f2de667d.jpg',
     'https://imgsa.baidu.com/forum/w%3D580/sign=f1a090968d26cffc692abfba89004a7d/3c1fb64543a982268919cd288582b9014890ebc5.jpg']
    In [9]:
    #把imglist保存到目录下
    import  os
    import time
    # Directory that receives the downloaded images; create it if it is missing.
    save_dir = "C:/Users/1/Desktop"
    os.makedirs(save_dir, exist_ok=True)
    # Number the files 1, 2, 3, ... in the order the URLs were found.
    for i, img in enumerate(imglist, start=1):
        time.sleep(1)  # pause one second before fetching the next image
        urllib.request.urlretrieve(img, "{}/{}.jpg".format(save_dir, i))
    print("爬虫结束")
    爬虫结束  只上传一张0.0
    
    
     

    get方法抓取

    In [10]:
    import urllib.request
    In [25]:
    # On Taobao, switch the browser's emulated device to iPhone 6 Plus and use
    # F12 to find this suggestion request among the JS requests.
    url="https://suggest.taobao.com/sug?q=python+%E7%88%AC%E8%99%AB&code=utf-8&area=c2c&nick=&sid=null&callback=jsonp157311230486278746"
    # Open the URL; the context manager closes the connection after the body
    # has been read and decoded.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode("utf8")
    html
    Out[25]:
    '
    jsonp157311230486278746({"result":[["python爬虫<b>书籍<\/b>","690.2666666666667"],["python<b>3网络<\/b>爬虫<b>开发实战<\/b>","396.02"],["python<b>3网络<\/b>爬虫<b>开发<\/b>","429.95348837209303"],["python爬虫<b>实战<\/b>","1046.909090909091"],["python爬虫<b>课程网课<\/b>","107.2"],["python<b>3网络<\/b>爬虫","473.5"],["python爬虫<b>源代码<\/b>","110.9375"],["python爬虫<b>项目<\/b>","1116.8333333333333"],["python<b>网络<\/b>爬虫<b>实战<\/b>","658.6"],["python爬虫<b>入门<\/b>","1084.969696969697"]]})'
    In [42]:
    # Convert the JSON text into a Python dict.
    import json
    # The response is JSONP, i.e. callback({...}).  Keep only the text between
    # the first "(" and the last ")" so json.loads receives plain JSON.
    a = html[html.index("(") + 1:html.rindex(")")]
    dic = json.loads(a)
    dic
    # Note: JSON strings must be delimited by double quotes; text that uses
    # single quotes as string delimiters fails to parse.

    Out[42]:

    {'result': [['python爬虫<b>书籍</b>', '690.2666666666667'],
      ['python<b>3网络</b>爬虫<b>开发实战</b>', '396.02'],
      ['python<b>3网络</b>爬虫<b>开发</b>', '429.95348837209303'],
      ['python爬虫<b>实战</b>', '1046.909090909091'],
      ['python爬虫<b>课程网课</b>', '107.2'],
      ['python<b>3网络</b>爬虫', '473.5'],
      ['python爬虫<b>源代码</b>', '110.9375'],
      ['python爬虫<b>项目</b>', '1116.8333333333333'],
      ['python<b>网络</b>爬虫<b>实战</b>', '658.6'],
      ['python爬虫<b>入门</b>', '1084.969696969697']]}
    In [41]:
    # Strip the JSONP wrapper callback(...) by locating the parentheses rather
    # than hard-coding offsets, which change with the callback name and the
    # payload length.
    a = html[html.index("(") + 1:html.rindex(")")]
    In [54]:
    # Print each suggestion title with the <b>...</b> highlight markup removed.
    for entry in dic["result"]:
        suggestion = entry[0]
        # str.replace substitutes every occurrence; opening tags go first,
        # then closing tags.
        for tag in ("<b>", "</b>"):
            suggestion = suggestion.replace(tag, "")
        print(suggestion)
    python爬虫书籍
    python3网络爬虫开发实战
    python3网络爬虫开发
    python爬虫实战
    python爬虫课程网课
    python3网络爬虫
    python爬虫源代码
    python爬虫项目
    python网络爬虫实战
    python爬虫入门
    
    In [45]:
    # The standard library can percent-encode text for use in a URL.
    import urllib.parse
    # Bind the result to `encoded` rather than `str`, so the builtin str type
    # stays usable in later cells.
    encoded = urllib.parse.quote("牛仔衣")
    encoded
    Out[45]:
    '%E7%89%9B%E4%BB%94%E8%A1%A3'
    In [56]:
    import requests
    import json
    keyword = '连衣裙'
    # No manual URL encoding is needed: the query is passed via params, so
    # requests percent-encodes it and keywords containing "&", "#" or spaces
    # cannot corrupt the query string.
    url = "https://suggest.taobao.com/sug"
    params = {"q": keyword, "code": "utf-8", "area": "c2c", "nick": "", "sid": "null"}
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, params=params, timeout=10)
    response.raise_for_status()  # fail on an HTTP error instead of a JSON parse error
    html = response.text  # response body as text
    dic = json.loads(html)
    for item in dic['result']:
        print(item[0])
    连衣裙女秋冬
    连衣裙2019新款秋
    连衣裙女
    连衣裙2019秋款新
    连衣裙夏
    连衣裙女春秋
    连衣裙长款秋冬
    连衣裙收腰显瘦 气质
    连衣裙两件套秋冬
    连衣裙长裙女秋冬
    #搜索连衣裙二级菜单
    import requests
    import json
    # Helper that removes the special markup from a returned string.
    def replace_str(html):
        """Return *html* with every ``<b>`` and then every ``</b>`` removed."""
        return html.replace('<b>', '').replace('</b>', '')
    
    keyword = '连衣裙'
    url = "https://suggest.taobao.com/sug"
    # Fixed query-string fields; "q" is filled in per request.  Passing them via
    # params lets requests percent-encode suggestions that contain spaces or "&".
    base_params = {"code": "utf-8", "area": "c2c", "nick": "", "sid": "null"}
    response = requests.get(url, params={"q": keyword, **base_params}, timeout=10)
    response.raise_for_status()
    dic = json.loads(response.text)
    for item in dic['result']:
        print(item[0])
        # Second level: request suggestions for each first-level suggestion.
        response2 = requests.get(url, params={"q": item[0], **base_params}, timeout=10)
        response2.raise_for_status()
        dic2 = json.loads(response2.text)
        for item2 in dic2['result']:
            # Strip the <b> markup after decoding: in the raw JSON text the
            # closing tag is escaped as <\/b>, which replace_str does not match.
            print('\t' + replace_str(item2[0]))
    连衣裙2018款新款
    	连衣裙2018款新款女
    	连衣裙2018款新款女 雪纺
    	连衣裙2018夏新款女 中长款 修身
    	裙子女夏2018新款 中长款连衣裙
    	裙子夏女2018新款 款连衣裙 气质
    	连衣裙夏女2018新款 中长款 气质
    	女裙子2018新款 中长款连衣裙
    	女夏2018新款连衣裙长  款35岁
    	女装2018新款中长连衣裙 中长款
    	连衣裙女夏2018新款拉链款
    连衣裙夏季新款
    	连衣裙夏季新款女 2018 气质 a字
    	连衣裙夏季新款 雪纺 2018
    	连衣裙夏季新款 宽松 韩版 2018
    	连衣裙夏季新款女  名媛
    	连衣裙夏季新款 2018韩版孕妇装
    	连衣裙夏季新款2018显瘦超仙
    	连衣裙夏季新款女装欧美中裙
    	连衣裙夏季新款女欧洲站2018
    	连衣裙夏季新款女韩版 棉麻
    	连衣裙夏季新款 时尚气质

    反爬虫机制

    加入user-agent,冒充浏览器 user-agent可在浏览器中F12 找到,且必须以字典格式传入

    # Add a user-agent header to pose as a browser.  The value can be copied
    # from the browser's F12 developer tools and must be passed as a dict.
    header={"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
    keyword = '羽绒服'
    # Pass the query via params so requests percent-encodes the keyword.
    url = "https://suggest.taobao.com/sug"
    params = {"q": keyword, "code": "utf-8", "area": "c2c", "nick": "", "sid": "null"}
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, params=params, headers=header, timeout=10)
    response.raise_for_status()  # fail on an HTTP error instead of a JSON parse error
    dic = json.loads(response.text)
    for item in dic['result']:
        print(item[0])
    羽绒服女中长款 修身显瘦
    羽绒服女轻薄款
    羽绒服中长款女
    羽绒服女长款2017新款 韩版 潮
    羽绒服 枣红色
    羽绒服女收腰
    羽绒服男轻薄
    羽绒服女长款 冬季 中长款
    羽绒服套装
    羽绒服女长款收腰过膝

    设置代理IP

    # Route the request through a proxy.  requests looks proxies up by the
    # lowercase URL scheme, so the keys must be "http" / "https"; the target URL
    # is https, so the "https" entry is the one that applies here.
    # The address is an example found by searching Baidu for proxy IPs;
    # replace it with a proxy that is currently alive.
    proxy = "http://122.114.31.177:808"
    proxies = {"http": proxy, "https": proxy}
    header={"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
    keyword = '西装'
    # Pass the query via params so requests percent-encodes the keyword.
    url = "https://suggest.taobao.com/sug"
    params = {"q": keyword, "code": "utf-8", "area": "c2c", "nick": "", "sid": "null"}
    # A timeout matters even more with a proxy, which may be slow or dead.
    response = requests.get(url, params=params, headers=header, proxies=proxies, timeout=10)
    response.raise_for_status()  # fail on an HTTP error instead of a JSON parse error
    dic = json.loads(response.text)
    for item in dic['result']:
        print(item[0])
    西装男套装 青少年
    西装短裤套装女
    西装 修身连衣裙
    西装领长袖女
    西装 短袖
    西装v领 连衣裙 收腰
    西装喇叭裤高腰
    西装裤 夏 女
    西装春秋女
    西装热裤女
    # 把数据写到文件中
    #搜索连衣裙二级菜单
    import requests
    import json
    import pandas as pd
    # Helper that removes the special markup from a returned string.
    def replace_str(html):
        """Strip the ``<b>`` / ``</b>`` tags from *html* and return the result."""
        cleaned = html
        # Opening tags are removed first, then closing tags.
        for tag in ('<b>', '</b>'):
            cleaned = cleaned.replace(tag, '')
        return cleaned
    
    keyword = '连衣裙'
    # requests looks proxies up by the lowercase URL scheme, and the target URL
    # is https, so an "https" entry is required for the proxy to be used.
    # The address is an example; replace it with a proxy that is currently alive.
    proxy = "http://122.114.31.177:808"
    proxies = {"http": proxy, "https": proxy}
    header={"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
    url = "https://suggest.taobao.com/sug"
    # Fixed query-string fields; "q" is filled in per request.  Passing them via
    # params lets requests percent-encode suggestions that contain spaces or "&".
    base_params = {"code": "utf-8", "area": "c2c", "nick": "", "sid": "null"}
    response = requests.get(url, params={"q": keyword, **base_params},
                            headers=header, proxies=proxies, timeout=10)
    response.raise_for_status()
    dic = json.loads(response.text)
    # Rows of [title, level]: level 1 for a direct suggestion, level 2 for a
    # suggestion obtained by querying a level-1 suggestion.
    lst=[]
    for item in dic['result']:
        lst.append([item[0],1])
        response2 = requests.get(url, params={"q": item[0], **base_params},
                                 headers=header, proxies=proxies, timeout=10)
        response2.raise_for_status()
        dic2 = json.loads(response2.text)
        for item2 in dic2['result']:
            # Strip the <b> markup after decoding: in the raw JSON text the
            # closing tag is escaped as <\/b>, which replace_str does not match.
            lst.append([replace_str(item2[0]),2])
    # Write the rows to a CSV file with a header row and no index column.
    data = pd.DataFrame(lst,columns=['title','level'])
    data.to_csv('./lyq.csv',index=False,header=True)
    print('运行结束')
    # 写一个爬虫—Post 方式抓取有道翻译数据
    import requests
    import json
    
    def translate(word):
        """Translate *word* via the Youdao web endpoint; print and return the result.

        Sends a form-encoded POST to fanyi.youdao.com and reads
        ``translateResult[0][0]['tgt']`` from the JSON reply.

        Args:
            word: Text to translate; sent as the ``i`` form field.

        Returns:
            The translated text (the same value that is printed).

        Raises:
            requests.RequestException: On a network failure, a timeout, or an
                HTTP error status.
            ValueError: If the reply body is not valid JSON.
            KeyError, IndexError: If the reply lacks the expected structure.
        """
        url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
        # Headers copied from a browser session.  Content-Length is deliberately
        # not set here: it depends on the encoded form body, which varies with
        # `word`, and requests computes it from the body it actually sends.
        header={
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection":"keep-alive",
        "Content-Type": "application/x-www-form-urlencoded;",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-2118096325@10.168.8.63; OUTFOX_SEARCH_USER_ID_NCOO=529585232.72911924; fanyi-ad-id=44547; fanyi-ad-closed=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abcm1o-ND2W6Y9HCq9vqw; _ntes_nnid=a7d939a02cd5c3865942d2a2051b410e,1529376772962; ___rl__test__cookies=1529398225385",
        "Host": "fanyi.youdao.com",
        "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
        "X-Requested-With":"XMLHttpRequest"
        }
        # Form fields of the POST body.
        # NOTE(review): salt and sign are fixed values captured from one browser
        # request; presumably this endpoint does not validate them — confirm.
        payload = {
        "i": word,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "1529398225392",
        "sign": "bf09bc9795dfc7863516162c961fd97e",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CLICKBUTTION",
        "typoResult":"false"
        }
        # A timeout keeps the call from hanging forever on an unresponsive server.
        response = requests.post(url,data=payload,headers=header,timeout=10)
        response.raise_for_status()  # fail on an HTTP error instead of a JSON parse error
        dic = json.loads(response.text)
        result = dic['translateResult'][0][0]['tgt']
        print(result)
        return result
    if __name__=='__main__':
        translate('中国')
    China
     
     
     
     
  • 相关阅读:
    Tyvj 1729 文艺平衡树
    送花
    Tyvj 1728 普通平衡树
    [NOI2004]郁闷的出纳员
    [HNOI2004]宠物收养所
    [HNOI2002]营业额统计
    [NOIP2012] 借教室
    无聊的数列
    忠诚
    XOR的艺术
  • 原文地址:https://www.cnblogs.com/Koi504330/p/11909334.html
Copyright © 2011-2022 走看看