zoukankan      html  css  js  c++  java
  • 简单反爬虫代码

    import urllib.request

    # --- Basic GET request (no custom headers) -----------------------------
    # urlopen() returns an http.client.HTTPResponse object.
    res = urllib.request.urlopen('http://www.baidu.com/')
    print(type(res))

    # Get the HTTP status code
    # print(res.getcode())

    # Get the final (possibly redirected) URL
    # print(res.geturl())

    # Get the response headers
    # print(res.getheaders())

    # read() returns the raw body as bytes, so it must be decoded to text.
    # encode: str -> bytes; decode: bytes -> str (utf-8 / gbk / gb2312 ...)
    # print(res.read().decode('utf-8'))

    # Save the page to a local file:
    # with open("baidu.html", "w", encoding="utf-8") as f:
    #     f.write(res.read().decode("utf-8"))

    # Download a page straight to disk:
    # url = 'http://www.baidu.com/'
    # urllib.request.urlretrieve(url=url, filename='baidu.html')

    # Download an image (the file name is the last path segment, so use
    # [-1]; the original's [-1-2] picked a directory segment instead):
    # img_url = "http://b-ssl.duitang.com/uploads/item/201601/28/20160128084015_z3cUP.jpeg"
    # name = img_url.rsplit("/")[-1]
    # urllib.request.urlretrieve(url=img_url, filename=name)

    # Download a video the same way:
    # video_url = 'http://v6-default.ixigua.com/741142c2612117615b8343d7a6c12643/5cece6c6/video/m/2204a5c8c6f50be412db774c6d688b6bade1162097c100008550afaf2a34/?rc=M29wdjxrNHFxbTMzaTczM0ApQHRAbzQ5NTM7MzQzMzY3NDUzNDVvQGg1dilAZzN3KUBmM3UpZHNyZ3lrdXJneXJseHdmNzZAMnJzYW5eLV5hXy0tYS0vc3MtbyNvIzI0MC8vMS0uNDAxNTI2LTojbyM6YS1vIzpgLXAjOmB2aVxiZitgXmJmK15xbDojMy5e&vfrom=xgplayer'
    # urllib.request.urlretrieve(url=video_url, filename="toutiao.mp4")

    res.close()  # release the socket before rebinding the name below

    # --- GET request with a spoofed User-Agent (simple anti-anti-crawler) --
    url = 'http://www.baidu.com/'
    # Build a browser-like request header
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }

    # A Request object is needed to attach custom headers.
    req = urllib.request.Request(url=url, headers=headers)
    # Send the request and obtain the response
    res = urllib.request.urlopen(req)
    # Report the status code
    print(res.getcode())
    res.close()



    # Import urllib.parse explicitly instead of relying on `import urllib`
    # plus urllib.request's internal imports to expose the submodule.
    import urllib.parse
    import urllib.request

    # Base search URL; the query string is appended below.
    url1 = 'http://www.baidu.com/s?'
    name = input('请输入要查询的内容:')
    source = {
        'wd': name
    }
    # urlencode() percent-encodes the dict into a 'wd=...' query string.
    url2 = urllib.parse.urlencode(source)
    url3 = url1 + url2
    print(url3)

    res = urllib.request.urlopen(url3)
    print(res.getcode())
    res.close()  # don't leak the socket


    # Example of an encoded query:
    # http://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
    # %e7%be%8e%e5%a5%b3



    百度贴吧
    # Query Baidu Tieba result pages for a forum name over several pages.
    import urllib.parse
    import urllib.request

    # Fixed base URL
    url1 = 'http://tieba.baidu.com/f?'

    name = input('请输入要查询的贴吧名称:')
    pge = int(input('请输入要查询的页数:'))
    # URL keyword parameter
    source = {
        'kw': name
    }
    # Percent-encode the keyword
    url2 = urllib.parse.urlencode(source)

    # Base URL without the paging parameter
    url3 = url1 + url2
    # NOTE: the loop body below was de-indented in the original, which is a
    # syntax error; each page request belongs inside the loop.
    for page in range(1, pge + 1):
        # Tieba pages in steps of 50 posts: pn = 0, 50, 100, ...
        pn = (page - 1) * 50
        full_url = url3 + "&pn=%s" % pn
        # Send the request
        res = urllib.request.urlopen(full_url)
        # Report the status code
        print(res.getcode())
        res.close()



    # import urllib
    # # import urllib.request
    # #
    # # url='https://tieba.baidu.com/f?'

    # # name = input('请输入要查询的贴吧:')
    # # page = int(input('请输入要查询的页数:'))
    # #
    # # headers = {
    # # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    # # "Host": "www.baidu.com",
    # # }
    # # source = {
    # # 'kw':name
    # # }
    # # new_url = urllib.parse.urlencode(source)
    # # url2 = url + new_url
    # #
    # # for i in range(1,page+1):
    # # pn = (i-1)*50
    # # full_url = url2 + "&pn=%s"%pn
    # # #构造请求
    # # req = urllib.request.Request(url=full_url,headers=headers)
    # # #获取响应
    # # res = urllib.request.urlopen(req)
    # # print(res.getcode())
    # # # urllib.request.urlretrieve(url=full_url,filename='第%s页.html'%i)



    import urllib.parse
    import urllib.request

    # kw: forum name to search
    # pn: page offset in steps of 50, starting at 0
    url = "https://tieba.baidu.com/f?"
    name = input("请输入你要搜索的贴吧:")
    page = int(input("请输入要爬取的页数:"))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Host": "tieba.baidu.com"
    }
    source = {
        "kw": name
    }
    url1 = urllib.parse.urlencode(source)
    url2 = url + url1
    # NOTE: the loop body below was de-indented in the original (syntax
    # error); the per-page request/download belongs inside the loop.
    for i in range(1, page + 1):
        pn = (i - 1) * 50
        full_url = url2 + "&pn=%s" % pn
        # Probe the URL with spoofed headers ...
        req = urllib.request.Request(url=full_url, headers=headers)
        res = urllib.request.urlopen(req)
        res.close()
        # print(res.getcode())
        # ... then save each result page to a local HTML file.
        urllib.request.urlretrieve(url=full_url, filename="第%s页.html" % i)

    百度翻译
    from urllib import request, parse
    import json
    # Baidu Translate suggestion endpoint (expects a POST).
    url = "https://fanyi.baidu.com/sug"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    name = input("请输入要翻译的内容:")
    # Form data carried by the POST body
    form = {
        "kw": name
    }
    # urlencode() yields a str; POST data must be bytes.
    forms = parse.urlencode(form)
    # A Request with a data= argument is sent as POST instead of GET.
    req = request.Request(url=url, data=forms.encode("utf-8"), headers=headers)
    # Send the request
    res = request.urlopen(req)
    # Read and decode the response body
    content = res.read().decode("utf-8")
    res.close()
    # print(content)
    # Parse the JSON string into a dict.
    res = json.loads(content)
    print(res)
    # Guard against an empty suggestion list (unknown word) instead of
    # crashing with IndexError as the original did.
    if res.get('data'):
        print(res['data'][0]['v'])




    豆瓣电影
    from urllib import request, parse
    import json
    # Douban movie chart API; the URL already carries start/limit and the
    # POSTed form repeats them.
    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20"

    headers = {
        "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    }
    form = {
        "start": "40",
        "limit": "20"
    }
    forms = parse.urlencode(form)
    req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
    response = request.urlopen(req)
    content = response.read().decode("utf-8")
    response.close()
    # The endpoint returns a JSON array of movie dicts.
    res = json.loads(content)
    print(res)
    # NOTE: the loop body below was de-indented in the original (syntax
    # error); the field extraction belongs inside the loop.
    for i in res:
        title = i['title']
        actors = i['actors']
        print(title)
        print(actors)






    # from urllib import request,parse
    # import json
    #
    # url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action="
    #
    # headers = {
    # "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    # }
    #
    #
    # form = {
    # "start":"10",
    # "limit":"40"
    # }
    #
    # forms = parse.urlencode(form)
    #
    # req = request.Request(url=url,data = bytes(forms,encoding="utf-8"),headers=headers)
    #
    # response = request.urlopen(req)
    #
    # conten = response.read().decode("utf-8")
    #
    # res = json.loads(conten)
    #
    # # print(res)
    #
    # for var in res:
    # title = var["title"]
    # actors = var["actors"]
    #
    # print(title)
    # print(actors)



    模拟登陆
    from urllib import request
    # Simulated login: replay a logged-in browser session's cookie in the
    # request headers instead of performing the login flow.
    url = "https://user.qzone.qq.com/2862346891"
    headers = {
        "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
        "cookie": "tvfe_boss_uuid=58a76f49bc6ae610; pgv_pvid=7834310198; _qpsvr_localtk=0.6362081929899539; pgv_pvi=9847598080; pgv_si=s2316292096; pgv_info=ssid=s461279900; uin=o2862346891; skey=@VHWWL050N; ptisp=ctc; RK=hRqQfr4Nwh; ptcz=fc6eb6560e4aa5836aa5f0ac644eede30edc9c95840ea9126bb7462bcc1e13c6; p_uin=o2862346891; pt4_token=uvlbuVIP21ETttWCMi*1wnd2sjHOOsnfKyW3A0vMbqI_; p_skey=ynMqbG7hjPmWuwpOwlccdBS*I36xq9z3jBf83sJq7Ik_; Loading=Yes; qz_screen=1280x720; 2862346891_todaycount=0; 2862346891_totalcount=29429; QZ_FE_WEBP_SUPPORT=1; __Q_w_s_hat_seed=1; rv2=80336DBCB68C00892BB825EA0FACDE3AD70C89C10A1903D221; property20=63A613D7498074A6D1571F37A4941C048B9F458DA01089D6A40BF0D8EB5D330F6A4905911FFF9094; cpu_performance_v8=11; v6uin=2862346891|qzone_player"
    }
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req)
    content = response.read().decode("utf-8")
    response.close()
    # Save the logged-in page locally.  NOTE: the write below was
    # de-indented out of the with-block in the original (syntax error).
    with open("qq.html", "w", encoding="utf-8") as f:
        f.write(content)


    肯德基店铺位置
    from urllib import request, parse
    import json
    # KFC store-locator API: POST a city-name query, get JSON back.
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    size = int(input("请输入要查询多少条数据:"))
    # POST form: city name, page index, and requested page size.
    form = {
        "cname": "北京",
        "pid": "",
        "pageIndex": 1,
        "pageSize": size,
    }
    forms = parse.urlencode(form)
    req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
    response = request.urlopen(req)
    result = response.read().decode("utf-8")
    response.close()
    res = json.loads(result)
    print(res)
    # 'Table1' holds the store records.  NOTE: the loop body below was
    # de-indented in the original (syntax error); it belongs in the loop.
    for i in res['Table1']:
        addressDetail = i["addressDetail"]
        storeName = i["storeName"]
        print("addressDetail:", addressDetail)
        print("storeName:", storeName)












  • 相关阅读:
    三数之和
    罗马数字与整数
    Oracle 开启或关闭归档
    Oracle RMAN scripts to delete archivelog
    Oracle check TBS usage
    Oracle kill locked sessions
    场景9 深入RAC运行原理
    场景7 Data Guard
    场景4 Data Warehouse Management 数据仓库
    场景5 Performance Management
  • 原文地址:https://www.cnblogs.com/wyf2019/p/10946334.html
Copyright © 2011-2022 走看看